From 55aebeb926b6f93a540328e7ac770ef536b09b77 Mon Sep 17 00:00:00 2001 From: Daniel Baluta Date: Mon, 10 Nov 2014 14:45:30 +0200 Subject: iio: core: Introduce IIO_ACTIVITY channel This channel will be used for exposing information about activity composite sensors. Activities supported so far: * running * jogging * walking * still THRESHOLD event is used to signal a change in the activity state. We associate a confidence interval for each activity expressed as a percentage from 0 to 100. * 0, means the sensor IS NOT reporting that activity. * 100, means the sensor IS reporting that activity. Users of this interface have two possible means to gather information about the ongoing activities. 1. Event based, via event file descriptor * sensor may report an event when ENTERING an activity or LEAVING an activity based on a threshold value. * drivers will wake up applications waiting data on the event fd 2. Polling, by reading the sysfs associated attribute files: * /sys/bus/iio/devices/iio:device0/in_activity_running_input expressed as percentage confidence value from 0 to 100. This will offer an interface for Android significant motion composite sensor defined here: http://source.android.com/devices/sensors/composite_sensors.html Activities listed above are supported by Freescale's MMA9553 sensor: http://freescale.com/files/sensors/doc/ref_manual/MMA9553LSWRM.pdf Signed-off-by: Irina Tirdea Signed-off-by: Daniel Baluta Signed-off-by: Jonathan Cameron --- include/linux/iio/types.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iio/types.h b/include/linux/iio/types.h index 4a2af8adf874..b3a241d53b54 100644 --- a/include/linux/iio/types.h +++ b/include/linux/iio/types.h @@ -30,6 +30,7 @@ enum iio_chan_type { IIO_CCT, IIO_PRESSURE, IIO_HUMIDITYRELATIVE, + IIO_ACTIVITY, }; enum iio_modifier { @@ -59,7 +60,11 @@ enum iio_modifier { IIO_MOD_NORTH_MAGN, IIO_MOD_NORTH_TRUE, IIO_MOD_NORTH_MAGN_TILT_COMP, - IIO_MOD_NORTH_TRUE_TILT_COMP + IIO_MOD_NORTH_TRUE_TILT_COMP, + IIO_MOD_RUNNING, + IIO_MOD_JOGGING, + IIO_MOD_WALKING, + IIO_MOD_STILL, }; enum iio_event_type { -- cgit v1.2.3 From 1843c2f3def16740eb6d129a9790c32dd21aa5ea Mon Sep 17 00:00:00 2001 From: Irina Tirdea Date: Mon, 10 Nov 2014 14:45:31 +0200 Subject: iio: core: Introduce IIO_EV_DIR_NONE For some events (e.g.: step detector) a direction does not make sense. Add IIO_EV_DIR_NONE to be used with such events and generate sysfs event attributes that do not contain direction. 
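As a rough sketch of the intended use (hypothetical driver code, not part of this patch; a dedicated step channel type only arrives later in this series), such an event could be declared as:

/*
 * Step detected: neither rising nor falling applies, so no
 * direction string ends up in the sysfs attribute name.
 */
static const struct iio_event_spec example_step_event = {
	.type = IIO_EV_TYPE_THRESH,
	.dir = IIO_EV_DIR_NONE,
	.mask_separate = BIT(IIO_EV_INFO_ENABLE),
};

With IIO_EV_DIR_NONE the core then generates e.g. "in_steps_thresh_en" rather than "in_steps_thresh_rising_en".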
Signed-off-by: Irina Tirdea Signed-off-by: Daniel Baluta Signed-off-by: Jonathan Cameron --- drivers/iio/industrialio-event.c | 12 +++++++++--- include/linux/iio/types.h | 1 + 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/iio/industrialio-event.c b/drivers/iio/industrialio-event.c index 0c1e37e3120a..1290290adcb5 100644 --- a/drivers/iio/industrialio-event.c +++ b/drivers/iio/industrialio-event.c @@ -327,9 +327,15 @@ static int iio_device_add_event(struct iio_dev *indio_dev, for_each_set_bit(i, mask, sizeof(*mask)*8) { if (i >= ARRAY_SIZE(iio_ev_info_text)) return -EINVAL; - postfix = kasprintf(GFP_KERNEL, "%s_%s_%s", - iio_ev_type_text[type], iio_ev_dir_text[dir], - iio_ev_info_text[i]); + if (dir != IIO_EV_DIR_NONE) + postfix = kasprintf(GFP_KERNEL, "%s_%s_%s", + iio_ev_type_text[type], + iio_ev_dir_text[dir], + iio_ev_info_text[i]); + else + postfix = kasprintf(GFP_KERNEL, "%s_%s", + iio_ev_type_text[type], + iio_ev_info_text[i]); if (postfix == NULL) return -ENOMEM; diff --git a/include/linux/iio/types.h b/include/linux/iio/types.h index b3a241d53b54..52cb5329407b 100644 --- a/include/linux/iio/types.h +++ b/include/linux/iio/types.h @@ -86,6 +86,7 @@ enum iio_event_direction { IIO_EV_DIR_EITHER, IIO_EV_DIR_RISING, IIO_EV_DIR_FALLING, + IIO_EV_DIR_NONE, }; #define IIO_VAL_INT 1 -- cgit v1.2.3 From a88bfe78583026eb9f21d4014ba481b22b66cee3 Mon Sep 17 00:00:00 2001 From: Irina Tirdea Date: Mon, 10 Nov 2014 14:45:32 +0200 Subject: iio: core: Introduce STEPS channel, ENABLE mask and INSTANCE event These changes are needed to support the functionality of a pedometer. A pedometer has two basic functionalities: step counter and step detector. The step counter needs to be enabled and then it will count the steps in its hardware register. Whenever the application needs to check the step count, it will read the step counter register. To support the step counter a new channel type STEPS is added. Since the pedometer needs to be enabled first so that the hardware can count and store the steps, we need a specific ENABLE channel info mask. The step detector will generate an interrupt each time a step is detected. To support this functionality we add a new event type INSTANCE. For more information on the Android requirements for step counter and step detector see: http://source.android.com/devices/sensors/composite_sensors.html#counter and http://source.android.com/devices/sensors/composite_sensors.html#detector. A device that has the pedometer functionality this interface needs to support is Freescale's MMA9553L: http://www.freescale.com/files/sensors/doc/ref_manual/MMA9553LSWRM.pdf Signed-off-by: Irina Tirdea Signed-off-by: Daniel Baluta Signed-off-by: Jonathan Cameron --- Documentation/ABI/testing/sysfs-bus-iio | 22 ++++++++++++++++++++++ drivers/iio/industrialio-core.c | 2 ++ drivers/iio/industrialio-event.c | 1 + include/linux/iio/iio.h | 1 + include/linux/iio/types.h | 2 ++ 5 files changed, 28 insertions(+) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio index 7bf49ad8fd82..c60b0a1af839 100644 --- a/Documentation/ABI/testing/sysfs-bus-iio +++ b/Documentation/ABI/testing/sysfs-bus-iio @@ -856,6 +856,13 @@ Description: number or direction is not specified, applies to all channels of this type. +What: /sys/.../events/in_steps_instance_en +KernelVersion: 3.19 +Contact: linux-iio@vger.kernel.org +Description: + Enables or disables step detection. 
Each time the user takes a step an + event of this type will be generated. + What: /sys/bus/iio/devices/iio:deviceX/trigger/current_trigger KernelVersion: 2.6.35 Contact: linux-iio@vger.kernel.org @@ -1095,3 +1102,18 @@ Description: after application of scale and offset. If no offset or scale is present, output should be considered as processed with the unit in milliamps. + +What: /sys/.../iio:deviceX/in_steps_en +KernelVersion: 3.19 +Contact: linux-iio@vger.kernel.org +Description: + Activates the step counter. After activation, the number of steps + taken by the user will be counted in hardware and exported through + in_steps_input. + +What: /sys/.../iio:deviceX/in_steps_input +KernelVersion: 3.19 +Contact: linux-iio@vger.kernel.org +Description: + This attribute is used to read the number of steps taken by the user + since the last reboot while activated. diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index e453ef9e0c36..1e060f390b08 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -71,6 +71,7 @@ static const char * const iio_chan_type_name_spec[] = { [IIO_PRESSURE] = "pressure", [IIO_HUMIDITYRELATIVE] = "humidityrelative", [IIO_ACTIVITY] = "activity", + [IIO_STEPS] = "steps", }; static const char * const iio_modifier_names[] = { @@ -118,6 +119,7 @@ static const char * const iio_chan_info_postfix[] = { [IIO_CHAN_INFO_HARDWAREGAIN] = "hardwaregain", [IIO_CHAN_INFO_HYSTERESIS] = "hysteresis", [IIO_CHAN_INFO_INT_TIME] = "integration_time", + [IIO_CHAN_INFO_ENABLE] = "en", }; /** diff --git a/drivers/iio/industrialio-event.c b/drivers/iio/industrialio-event.c index 1290290adcb5..3f5cee0295c5 100644 --- a/drivers/iio/industrialio-event.c +++ b/drivers/iio/industrialio-event.c @@ -197,6 +197,7 @@ static const char * const iio_ev_type_text[] = { [IIO_EV_TYPE_ROC] = "roc", [IIO_EV_TYPE_THRESH_ADAPTIVE] = "thresh_adaptive", [IIO_EV_TYPE_MAG_ADAPTIVE] = "mag_adaptive", + [IIO_EV_TYPE_INSTANCE] = "instance", }; static const char * const iio_ev_dir_text[] = { diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index 3642ce7ef512..f45a400a5e3e 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -38,6 +38,7 @@ enum iio_chan_info_enum { IIO_CHAN_INFO_HARDWAREGAIN, IIO_CHAN_INFO_HYSTERESIS, IIO_CHAN_INFO_INT_TIME, + IIO_CHAN_INFO_ENABLE, }; enum iio_shared_by { diff --git a/include/linux/iio/types.h b/include/linux/iio/types.h index 52cb5329407b..904dcbbf0e6f 100644 --- a/include/linux/iio/types.h +++ b/include/linux/iio/types.h @@ -31,6 +31,7 @@ enum iio_chan_type { IIO_PRESSURE, IIO_HUMIDITYRELATIVE, IIO_ACTIVITY, + IIO_STEPS, }; enum iio_modifier { @@ -73,6 +74,7 @@ enum iio_event_type { IIO_EV_TYPE_ROC, IIO_EV_TYPE_THRESH_ADAPTIVE, IIO_EV_TYPE_MAG_ADAPTIVE, + IIO_EV_TYPE_INSTANCE, }; enum iio_event_info { -- cgit v1.2.3 From bcdf28fb1b8badf3cdba18d349f6251057e36a45 Mon Sep 17 00:00:00 2001 From: Irina Tirdea Date: Mon, 10 Nov 2014 14:45:33 +0200 Subject: iio: core: Introduce IIO_CHAN_INFO_CALIBHEIGHT Some devices need the height of the user to compute various parameters. One of this devices is Freescale's MMA9553L (http://www.freescale.com/files/sensors/doc/ref_manual/MMA9553LSWRM.pdf) that needs the height of the user to compute the stride length which is used further to determine distance, speed and activity type. 
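Taken together with the two previous patches, a pedometer driver can expose the counter, its enable switch and the calibration height through a single STEPS channel. A minimal sketch (the foo_* helpers are hypothetical, not taken from the MMA9553 driver):

static const struct iio_chan_spec pedometer_channels[] = {
	{
		.type = IIO_STEPS,
		/* yields in_steps_input, in_steps_en, in_steps_calibheight */
		.info_mask_separate = BIT(IIO_CHAN_INFO_PROCESSED) |
				      BIT(IIO_CHAN_INFO_ENABLE) |
				      BIT(IIO_CHAN_INFO_CALIBHEIGHT),
	},
};

static int pedometer_write_raw(struct iio_dev *indio_dev,
			       struct iio_chan_spec const *chan,
			       int val, int val2, long mask)
{
	switch (mask) {
	case IIO_CHAN_INFO_ENABLE:
		/* 1 starts the hardware step counter, 0 stops it */
		return foo_set_counting(indio_dev, val);
	case IIO_CHAN_INFO_CALIBHEIGHT:
		/* user height in centimeters, feeds the stride-length estimate */
		return foo_set_height(indio_dev, val);
	default:
		return -EINVAL;
	}
}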
Signed-off-by: Irina Tirdea Signed-off-by: Daniel Baluta Signed-off-by: Jonathan Cameron --- Documentation/ABI/testing/sysfs-bus-iio | 8 ++++++++ drivers/iio/industrialio-core.c | 1 + include/linux/iio/iio.h | 1 + 3 files changed, 10 insertions(+) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio index c60b0a1af839..4a9e29a3c2dc 100644 --- a/Documentation/ABI/testing/sysfs-bus-iio +++ b/Documentation/ABI/testing/sysfs-bus-iio @@ -323,6 +323,14 @@ Description: production inaccuracies). If shared across all channels, _calibscale is used. +What: /sys/bus/iio/devices/iio:deviceX/in_steps_calibheight +KernelVersion: 3.19 +Contact: linux-iio@vger.kernel.org +Description: + Height of the user (in centimeters) used by some pedometers + to compute the stride length, distance, speed and activity + type. + What: /sys/bus/iio/devices/iio:deviceX/in_accel_scale_available What: /sys/.../iio:deviceX/in_voltageX_scale_available What: /sys/.../iio:deviceX/in_voltage-voltage_scale_available diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index 1e060f390b08..45bb3a43afac 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -120,6 +120,7 @@ static const char * const iio_chan_info_postfix[] = { [IIO_CHAN_INFO_HYSTERESIS] = "hysteresis", [IIO_CHAN_INFO_INT_TIME] = "integration_time", [IIO_CHAN_INFO_ENABLE] = "en", + [IIO_CHAN_INFO_CALIBHEIGHT] = "calibheight", }; /** diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index f45a400a5e3e..878d861b0610 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -39,6 +39,7 @@ enum iio_chan_info_enum { IIO_CHAN_INFO_HYSTERESIS, IIO_CHAN_INFO_INT_TIME, IIO_CHAN_INFO_ENABLE, + IIO_CHAN_INFO_CALIBHEIGHT, }; enum iio_shared_by { -- cgit v1.2.3 From 0461a4149836c792d186027c8c859637a4cfb11a Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 9 Dec 2014 21:38:05 +0000 Subject: spi: Pump transfers inside calling context for spi_sync() If we are using the standard SPI message pump (which all drivers should be transitioning over to) then special case the message enqueue and instead of starting the worker thread to push messages to the hardware do so in the context of the caller if the controller is idle. This avoids a context switch in the common case where the controller has a single user in a single thread, for short PIO transfers there may be no need to context switch away from the calling context to complete the transfer. The code is a bit more complex than is desirable in part due to the need to handle drivers not using the standard queue and in part due to handling the various combinations of bus locking and asynchronous submission in interrupt context. It is still suboptimal since it will still wake the message pump for each transfer in order to schedule idling of the hardware and if multiple contexts are using the controller simultaneously a caller may end up pumping a message for some random other thread rather than for itself, and if the thread ends up deferring due to another context idling the hardware then it will just busy wait. It can, however, have the benefit of aggregating power up and down of the hardware when a caller performs a series of transfers back to back without any need for the use of spi_async(). 
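No consumer-side change is needed to benefit. A short synchronous transfer such as the sketch below (hypothetical register-read helper) can now complete entirely in the calling thread when the queue is idle:

static int example_read_reg(struct spi_device *spi, u8 reg, u8 *val)
{
	struct spi_transfer xfers[] = {
		{ .tx_buf = &reg, .len = 1, },	/* command byte out */
		{ .rx_buf = val,  .len = 1, },	/* register value back */
	};

	/*
	 * spi_sync() underneath: the message is queued and, if no other
	 * message is in flight, pumped here rather than in the kthread.
	 */
	return spi_sync_transfer(spi, xfers, ARRAY_SIZE(xfers));
}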
Signed-off-by: Mark Brown --- drivers/spi/spi.c | 66 +++++++++++++++++++++++++++++++++++++++++++------ include/linux/spi/spi.h | 2 ++ 2 files changed, 60 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 0bc752d17be5..e1bf2579b9c0 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -882,6 +882,9 @@ EXPORT_SYMBOL_GPL(spi_finalize_current_transfer); * needs processing and if so call out to the driver to initialize hardware * and transfer each message. * + * Note that it is called both from the kthread itself and also from + * inside spi_sync(); the queue extraction handling at the top of the + * function should deal with this safely. */ static void spi_pump_messages(struct kthread_work *work) { @@ -900,6 +903,13 @@ static void spi_pump_messages(struct kthread_work *work) return; } + /* If another context is idling the device then defer */ + if (master->idling) { + queue_kthread_work(&master->kworker, &master->pump_messages); + spin_unlock_irqrestore(&master->queue_lock, flags); + return; + } + /* Check if the queue is idle */ if (list_empty(&master->queue) || !master->running) { if (!master->busy) { @@ -907,7 +917,9 @@ static void spi_pump_messages(struct kthread_work *work) return; } master->busy = false; + master->idling = true; spin_unlock_irqrestore(&master->queue_lock, flags); + kfree(master->dummy_rx); master->dummy_rx = NULL; kfree(master->dummy_tx); @@ -921,6 +933,10 @@ static void spi_pump_messages(struct kthread_work *work) pm_runtime_put_autosuspend(master->dev.parent); } trace_spi_master_idle(master); + + spin_lock_irqsave(&master->queue_lock, flags); + master->idling = false; + spin_unlock_irqrestore(&master->queue_lock, flags); return; } @@ -1161,12 +1177,9 @@ static int spi_destroy_queue(struct spi_master *master) return 0; } -/** - * spi_queued_transfer - transfer function for queued transfers - * @spi: spi device which is requesting transfer - * @msg: spi message which is to handled is queued to driver queue - */ -static int spi_queued_transfer(struct spi_device *spi, struct spi_message *msg) +static int __spi_queued_transfer(struct spi_device *spi, + struct spi_message *msg, + bool need_pump) { struct spi_master *master = spi->master; unsigned long flags; @@ -1181,13 +1194,23 @@ static int spi_queued_transfer(struct spi_device *spi, struct spi_message *msg) msg->status = -EINPROGRESS; list_add_tail(&msg->queue, &master->queue); - if (!master->busy) + if (!master->busy && need_pump) queue_kthread_work(&master->kworker, &master->pump_messages); spin_unlock_irqrestore(&master->queue_lock, flags); return 0; } +/** + * spi_queued_transfer - transfer function for queued transfers + * @spi: spi device which is requesting transfer + * @msg: spi message which is to handled is queued to driver queue + */ +static int spi_queued_transfer(struct spi_device *spi, struct spi_message *msg) +{ + return __spi_queued_transfer(spi, msg, true); +} + static int spi_master_initialize_queue(struct spi_master *master) { int ret; @@ -2102,19 +2125,46 @@ static int __spi_sync(struct spi_device *spi, struct spi_message *message, DECLARE_COMPLETION_ONSTACK(done); int status; struct spi_master *master = spi->master; + unsigned long flags; + + status = __spi_validate(spi, message); + if (status != 0) + return status; message->complete = spi_complete; message->context = &done; + message->spi = spi; if (!bus_locked) mutex_lock(&master->bus_lock_mutex); - status = spi_async_locked(spi, message); + /* If we're not using the legacy 
transfer method then we will + * try to transfer in the calling context so special case. + * This code would be less tricky if we could remove the + * support for driver implemented message queues. + */ + if (master->transfer == spi_queued_transfer) { + spin_lock_irqsave(&master->bus_lock_spinlock, flags); + + trace_spi_message_submit(message); + + status = __spi_queued_transfer(spi, message, false); + + spin_unlock_irqrestore(&master->bus_lock_spinlock, flags); + } else { + status = spi_async_locked(spi, message); + } if (!bus_locked) mutex_unlock(&master->bus_lock_mutex); if (status == 0) { + /* Push out the messages in the calling context if we + * can. + */ + if (master->transfer == spi_queued_transfer) + spi_pump_messages(&master->pump_messages); + wait_for_completion(&done); status = message->status; } diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index a6ef2a8e6de4..4e6db75e9469 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -260,6 +260,7 @@ static inline void spi_unregister_driver(struct spi_driver *sdrv) * @pump_messages: work struct for scheduling work to the message pump * @queue_lock: spinlock to syncronise access to message queue * @queue: message queue + * @idling: the device is entering idle state * @cur_msg: the currently in-flight message * @cur_msg_prepared: spi_prepare_message was called for the currently * in-flight message @@ -425,6 +426,7 @@ struct spi_master { spinlock_t queue_lock; struct list_head queue; struct spi_message *cur_msg; + bool idling; bool busy; bool running; bool rt; -- cgit v1.2.3 From f9380e7123863a4cb0627d940533be954a0a15df Mon Sep 17 00:00:00 2001 From: Dmitry Eremin-Solenikov Date: Thu, 27 Nov 2014 01:42:45 +0300 Subject: iio: inkern: add iio_write_channel_raw Introduce API for easy in-kernel setting of DAC values. Signed-off-by: Dmitry Eremin-Solenikov Signed-off-by: Jonathan Cameron --- drivers/iio/inkern.c | 25 +++++++++++++++++++++++++ include/linux/iio/consumer.h | 10 ++++++++++ 2 files changed, 35 insertions(+) (limited to 'include/linux') diff --git a/drivers/iio/inkern.c b/drivers/iio/inkern.c index 866fe904cba2..21655fd1465c 100644 --- a/drivers/iio/inkern.c +++ b/drivers/iio/inkern.c @@ -631,3 +631,28 @@ err_unlock: return ret; } EXPORT_SYMBOL_GPL(iio_get_channel_type); + +static int iio_channel_write(struct iio_channel *chan, int val, int val2, + enum iio_chan_info_enum info) +{ + return chan->indio_dev->info->write_raw(chan->indio_dev, + chan->channel, val, val2, info); +} + +int iio_write_channel_raw(struct iio_channel *chan, int val) +{ + int ret; + + mutex_lock(&chan->indio_dev->info_exist_lock); + if (chan->indio_dev->info == NULL) { + ret = -ENODEV; + goto err_unlock; + } + + ret = iio_channel_write(chan, val, 0, IIO_CHAN_INFO_RAW); +err_unlock: + mutex_unlock(&chan->indio_dev->info_exist_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(iio_write_channel_raw); diff --git a/include/linux/iio/consumer.h b/include/linux/iio/consumer.h index 651f9a0e2765..6f64624f329b 100644 --- a/include/linux/iio/consumer.h +++ b/include/linux/iio/consumer.h @@ -150,6 +150,16 @@ int iio_read_channel_average_raw(struct iio_channel *chan, int *val); */ int iio_read_channel_processed(struct iio_channel *chan, int *val); +/** + * iio_write_channel_raw() - write to a given channel + * @chan: The channel being queried. + * @val: Value being written. + * + * Note raw writes to iio channels are in dac counts and hence + * scale will need to be applied if standard units required. 
+ */ +int iio_write_channel_raw(struct iio_channel *chan, int val); + /** * iio_get_channel_type() - get the type of a channel * @channel: The channel being queried. -- cgit v1.2.3 From 217a5cf0a1100264ced523e437e2e22c442dca7c Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Wed, 26 Nov 2014 18:55:09 +0100 Subject: iio: Unexport iio_scan_mask_set() Individual drivers should not be messing with the scan mask that contains the list of enabled channels. This is something that is supposed to be managed by the core. Now that the last few drivers that used it to configure a default scan mask have been updated to not do this anymore we can unexport the function. Note, this patch also requires moving a few functions around so they are all declared before the first internal user. Signed-off-by: Lars-Peter Clausen Reviewed-by: Daniel Baluta Signed-off-by: Jonathan Cameron --- drivers/iio/industrialio-buffer.c | 149 +++++++++++++++++++------------------- include/linux/iio/buffer.h | 9 --- 2 files changed, 74 insertions(+), 84 deletions(-) (limited to 'include/linux') diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c index f971f79103ec..f667e4e7ea6d 100644 --- a/drivers/iio/industrialio-buffer.c +++ b/drivers/iio/industrialio-buffer.c @@ -178,6 +178,80 @@ static ssize_t iio_scan_el_show(struct device *dev, return sprintf(buf, "%d\n", ret); } +/* Note NULL used as error indicator as it doesn't make sense. */ +static const unsigned long *iio_scan_mask_match(const unsigned long *av_masks, + unsigned int masklength, + const unsigned long *mask) +{ + if (bitmap_empty(mask, masklength)) + return NULL; + while (*av_masks) { + if (bitmap_subset(mask, av_masks, masklength)) + return av_masks; + av_masks += BITS_TO_LONGS(masklength); + } + return NULL; +} + +static bool iio_validate_scan_mask(struct iio_dev *indio_dev, + const unsigned long *mask) +{ + if (!indio_dev->setup_ops->validate_scan_mask) + return true; + + return indio_dev->setup_ops->validate_scan_mask(indio_dev, mask); +} + +/** + * iio_scan_mask_set() - set particular bit in the scan mask + * @indio_dev: the iio device + * @buffer: the buffer whose scan mask we are interested in + * @bit: the bit to be set. + * + * Note that at this point we have no way of knowing what other + * buffers might request, hence this code only verifies that the + * individual buffers request is plausible. 
+ */ +static int iio_scan_mask_set(struct iio_dev *indio_dev, + struct iio_buffer *buffer, int bit) +{ + const unsigned long *mask; + unsigned long *trialmask; + + trialmask = kmalloc(sizeof(*trialmask)* + BITS_TO_LONGS(indio_dev->masklength), + GFP_KERNEL); + + if (trialmask == NULL) + return -ENOMEM; + if (!indio_dev->masklength) { + WARN_ON("Trying to set scanmask prior to registering buffer\n"); + goto err_invalid_mask; + } + bitmap_copy(trialmask, buffer->scan_mask, indio_dev->masklength); + set_bit(bit, trialmask); + + if (!iio_validate_scan_mask(indio_dev, trialmask)) + goto err_invalid_mask; + + if (indio_dev->available_scan_masks) { + mask = iio_scan_mask_match(indio_dev->available_scan_masks, + indio_dev->masklength, + trialmask); + if (!mask) + goto err_invalid_mask; + } + bitmap_copy(buffer->scan_mask, trialmask, indio_dev->masklength); + + kfree(trialmask); + + return 0; + +err_invalid_mask: + kfree(trialmask); + return -EINVAL; +} + static int iio_scan_mask_clear(struct iio_buffer *buffer, int bit) { clear_bit(bit, buffer->scan_mask); @@ -455,21 +529,6 @@ ssize_t iio_buffer_show_enable(struct device *dev, } EXPORT_SYMBOL(iio_buffer_show_enable); -/* Note NULL used as error indicator as it doesn't make sense. */ -static const unsigned long *iio_scan_mask_match(const unsigned long *av_masks, - unsigned int masklength, - const unsigned long *mask) -{ - if (bitmap_empty(mask, masklength)) - return NULL; - while (*av_masks) { - if (bitmap_subset(mask, av_masks, masklength)) - return av_masks; - av_masks += BITS_TO_LONGS(masklength); - } - return NULL; -} - static int iio_compute_scan_bytes(struct iio_dev *indio_dev, const unsigned long *mask, bool timestamp) { @@ -808,66 +867,6 @@ bool iio_validate_scan_mask_onehot(struct iio_dev *indio_dev, } EXPORT_SYMBOL_GPL(iio_validate_scan_mask_onehot); -static bool iio_validate_scan_mask(struct iio_dev *indio_dev, - const unsigned long *mask) -{ - if (!indio_dev->setup_ops->validate_scan_mask) - return true; - - return indio_dev->setup_ops->validate_scan_mask(indio_dev, mask); -} - -/** - * iio_scan_mask_set() - set particular bit in the scan mask - * @indio_dev: the iio device - * @buffer: the buffer whose scan mask we are interested in - * @bit: the bit to be set. - * - * Note that at this point we have no way of knowing what other - * buffers might request, hence this code only verifies that the - * individual buffers request is plausible. 
- */ -int iio_scan_mask_set(struct iio_dev *indio_dev, - struct iio_buffer *buffer, int bit) -{ - const unsigned long *mask; - unsigned long *trialmask; - - trialmask = kmalloc(sizeof(*trialmask)* - BITS_TO_LONGS(indio_dev->masklength), - GFP_KERNEL); - - if (trialmask == NULL) - return -ENOMEM; - if (!indio_dev->masklength) { - WARN_ON("Trying to set scanmask prior to registering buffer\n"); - goto err_invalid_mask; - } - bitmap_copy(trialmask, buffer->scan_mask, indio_dev->masklength); - set_bit(bit, trialmask); - - if (!iio_validate_scan_mask(indio_dev, trialmask)) - goto err_invalid_mask; - - if (indio_dev->available_scan_masks) { - mask = iio_scan_mask_match(indio_dev->available_scan_masks, - indio_dev->masklength, - trialmask); - if (!mask) - goto err_invalid_mask; - } - bitmap_copy(buffer->scan_mask, trialmask, indio_dev->masklength); - - kfree(trialmask); - - return 0; - -err_invalid_mask: - kfree(trialmask); - return -EINVAL; -} -EXPORT_SYMBOL_GPL(iio_scan_mask_set); - int iio_scan_mask_query(struct iio_dev *indio_dev, struct iio_buffer *buffer, int bit) { diff --git a/include/linux/iio/buffer.h b/include/linux/iio/buffer.h index 519392763393..8c8ce611949c 100644 --- a/include/linux/iio/buffer.h +++ b/include/linux/iio/buffer.h @@ -116,15 +116,6 @@ void iio_buffer_init(struct iio_buffer *buffer); int iio_scan_mask_query(struct iio_dev *indio_dev, struct iio_buffer *buffer, int bit); -/** - * iio_scan_mask_set() - set particular bit in the scan mask - * @indio_dev IIO device structure - * @buffer: the buffer whose scan mask we are interested in - * @bit: the bit to be set. - **/ -int iio_scan_mask_set(struct iio_dev *indio_dev, - struct iio_buffer *buffer, int bit); - /** * iio_push_to_buffers() - push to a registered buffer. * @indio_dev: iio_dev structure for device. -- cgit v1.2.3 From 3e1b6c95b990c93f4aa3b17e9f66221e2fa44bee Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Wed, 26 Nov 2014 18:55:12 +0100 Subject: iio: Move buffer registration to the core Originally device and buffer registration were kept as separate operations in IIO to allow to register two distinct sets of channels for buffered and non-buffered operations. This has since already been further restricted and the channel set registered for the buffer needs to be a subset of the channel set registered for the device. Additionally the possibility to not have a raw (or processed) attribute for a channel which was registered for the device was added a while ago. This means it is possible to not register any device level attributes for a channel even if it is registered for the device. Also if a channel's scan_index is set to -1 and the channel is registered for the buffer it is ignored. So in summary it means it is possible to register the same channel array for both the device and the buffer yet still end up with distinctive sets of channels for both of them. This makes the argument for having to have to manually register the channels for both the device and the buffer invalid. Considering that the vast majority of all drivers want to register the same set of channels for both the buffer and the device it makes sense to move the buffer registration into the core to avoid some boiler-plate code in the device driver setup path. 
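For a typical driver the setup path shrinks accordingly; a sketch with hypothetical foo_* names (teardown omitted):

static int foo_probe(struct spi_device *spi)
{
	struct iio_dev *indio_dev;
	int ret;

	indio_dev = devm_iio_device_alloc(&spi->dev, sizeof(struct foo_state));
	if (!indio_dev)
		return -ENOMEM;

	indio_dev->channels = foo_channels;
	indio_dev->num_channels = ARRAY_SIZE(foo_channels);

	ret = iio_triggered_buffer_setup(indio_dev, NULL,
					 foo_trigger_handler, NULL);
	if (ret)
		return ret;

	/*
	 * No iio_buffer_register() call here any more: the core registers
	 * indio_dev->channels for the buffer inside iio_device_register().
	 */
	return iio_device_register(indio_dev);
}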
Signed-off-by: Lars-Peter Clausen Signed-off-by: Jonathan Cameron --- drivers/iio/adc/ti_am335x_adc.c | 9 --------- drivers/iio/iio_core.h | 9 +++++++++ drivers/iio/industrialio-buffer.c | 18 ++++++++++------- drivers/iio/industrialio-core.c | 14 ++++++++++++- drivers/iio/industrialio-triggered-buffer.c | 11 +---------- drivers/staging/iio/accel/lis3l02dq_core.c | 13 +------------ drivers/staging/iio/accel/sca3000_core.c | 10 +--------- drivers/staging/iio/iio_simple_dummy_buffer.c | 8 -------- drivers/staging/iio/impedance-analyzer/ad5933.c | 12 ++---------- drivers/staging/iio/meter/ade7758.h | 1 - drivers/staging/iio/meter/ade7758_core.c | 15 ++------------ drivers/staging/iio/meter/ade7758_ring.c | 5 ----- include/linux/iio/buffer.h | 26 ------------------------- 13 files changed, 40 insertions(+), 111 deletions(-) (limited to 'include/linux') diff --git a/drivers/iio/adc/ti_am335x_adc.c b/drivers/iio/adc/ti_am335x_adc.c index b730864731e8..d550ac7d2365 100644 --- a/drivers/iio/adc/ti_am335x_adc.c +++ b/drivers/iio/adc/ti_am335x_adc.c @@ -264,16 +264,8 @@ static int tiadc_iio_buffered_hardware_setup(struct iio_dev *indio_dev, indio_dev->setup_ops = setup_ops; indio_dev->modes |= INDIO_BUFFER_HARDWARE; - ret = iio_buffer_register(indio_dev, - indio_dev->channels, - indio_dev->num_channels); - if (ret) - goto error_free_irq; - return 0; -error_free_irq: - free_irq(irq, indio_dev); error_kfifo_free: iio_kfifo_free(indio_dev->buffer); return ret; @@ -285,7 +277,6 @@ static void tiadc_iio_buffered_hardware_remove(struct iio_dev *indio_dev) free_irq(adc_dev->mfd_tscadc->irq, indio_dev); iio_kfifo_free(indio_dev->buffer); - iio_buffer_unregister(indio_dev); } diff --git a/drivers/iio/iio_core.h b/drivers/iio/iio_core.h index 5f0ea77fe717..359883525ab7 100644 --- a/drivers/iio/iio_core.h +++ b/drivers/iio/iio_core.h @@ -48,6 +48,8 @@ unsigned int iio_buffer_poll(struct file *filp, ssize_t iio_buffer_read_first_n_outer(struct file *filp, char __user *buf, size_t n, loff_t *f_ps); +int iio_buffer_alloc_sysfs_and_mask(struct iio_dev *indio_dev); +void iio_buffer_free_sysfs_and_mask(struct iio_dev *indio_dev); #define iio_buffer_poll_addr (&iio_buffer_poll) #define iio_buffer_read_first_n_outer_addr (&iio_buffer_read_first_n_outer) @@ -60,6 +62,13 @@ void iio_buffer_wakeup_poll(struct iio_dev *indio_dev); #define iio_buffer_poll_addr NULL #define iio_buffer_read_first_n_outer_addr NULL +static inline int iio_buffer_alloc_sysfs_and_mask(struct iio_dev *indio_dev) +{ + return 0; +} + +static inline void iio_buffer_free_sysfs_and_mask(struct iio_dev *indio_dev) {} + static inline void iio_disable_all_buffers(struct iio_dev *indio_dev) {} static inline void iio_buffer_wakeup_poll(struct iio_dev *indio_dev) {} diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c index f667e4e7ea6d..8bb3e64eaf2c 100644 --- a/drivers/iio/industrialio-buffer.c +++ b/drivers/iio/industrialio-buffer.c @@ -385,14 +385,16 @@ static int iio_buffer_add_channel_sysfs(struct iio_dev *indio_dev, static const char * const iio_scan_elements_group_name = "scan_elements"; -int iio_buffer_register(struct iio_dev *indio_dev, - const struct iio_chan_spec *channels, - int num_channels) +int iio_buffer_alloc_sysfs_and_mask(struct iio_dev *indio_dev) { struct iio_dev_attr *p; struct attribute **attr; struct iio_buffer *buffer = indio_dev->buffer; int ret, i, attrn, attrcount, attrcount_orig = 0; + const struct iio_chan_spec *channels; + + if (!buffer) + return 0; if (buffer->attrs) 
indio_dev->groups[indio_dev->groupcounter++] = buffer->attrs; @@ -404,9 +406,10 @@ int iio_buffer_register(struct iio_dev *indio_dev, } attrcount = attrcount_orig; INIT_LIST_HEAD(&buffer->scan_el_dev_attr_list); + channels = indio_dev->channels; if (channels) { /* new magic */ - for (i = 0; i < num_channels; i++) { + for (i = 0; i < indio_dev->num_channels; i++) { if (channels[i].scan_index < 0) continue; @@ -463,15 +466,16 @@ error_cleanup_dynamic: return ret; } -EXPORT_SYMBOL(iio_buffer_register); -void iio_buffer_unregister(struct iio_dev *indio_dev) +void iio_buffer_free_sysfs_and_mask(struct iio_dev *indio_dev) { + if (!indio_dev->buffer) + return; + kfree(indio_dev->buffer->scan_mask); kfree(indio_dev->buffer->scan_el_group.attrs); iio_free_chan_devattr_list(&indio_dev->buffer->scan_el_dev_attr_list); } -EXPORT_SYMBOL(iio_buffer_unregister); ssize_t iio_buffer_read_length(struct device *dev, struct device_attribute *attr, diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index 45bb3a43afac..ee442ee482ab 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -1158,11 +1158,19 @@ int iio_device_register(struct iio_dev *indio_dev) "Failed to register debugfs interfaces\n"); return ret; } + + ret = iio_buffer_alloc_sysfs_and_mask(indio_dev); + if (ret) { + dev_err(indio_dev->dev.parent, + "Failed to create buffer sysfs interfaces\n"); + goto error_unreg_debugfs; + } + ret = iio_device_register_sysfs(indio_dev); if (ret) { dev_err(indio_dev->dev.parent, "Failed to register sysfs interfaces\n"); - goto error_unreg_debugfs; + goto error_buffer_free_sysfs; } ret = iio_device_register_eventset(indio_dev); if (ret) { @@ -1195,6 +1203,8 @@ error_unreg_eventset: iio_device_unregister_eventset(indio_dev); error_free_sysfs: iio_device_unregister_sysfs(indio_dev); +error_buffer_free_sysfs: + iio_buffer_free_sysfs_and_mask(indio_dev); error_unreg_debugfs: iio_device_unregister_debugfs(indio_dev); return ret; @@ -1223,6 +1233,8 @@ void iio_device_unregister(struct iio_dev *indio_dev) iio_buffer_wakeup_poll(indio_dev); mutex_unlock(&indio_dev->info_exist_lock); + + iio_buffer_free_sysfs_and_mask(indio_dev); } EXPORT_SYMBOL(iio_device_unregister); diff --git a/drivers/iio/industrialio-triggered-buffer.c b/drivers/iio/industrialio-triggered-buffer.c index d6f54930b34a..61a5d0404edf 100644 --- a/drivers/iio/industrialio-triggered-buffer.c +++ b/drivers/iio/industrialio-triggered-buffer.c @@ -32,7 +32,7 @@ static const struct iio_buffer_setup_ops iio_triggered_buffer_setup_ops = { * * This function combines some common tasks which will normally be performed * when setting up a triggered buffer. It will allocate the buffer and the - * pollfunc, as well as register the buffer with the IIO core. + * pollfunc. * * Before calling this function the indio_dev structure should already be * completely initialized, but not yet registered. 
In practice this means that @@ -78,16 +78,8 @@ int iio_triggered_buffer_setup(struct iio_dev *indio_dev, /* Flag that polled ring buffering is possible */ indio_dev->modes |= INDIO_BUFFER_TRIGGERED; - ret = iio_buffer_register(indio_dev, - indio_dev->channels, - indio_dev->num_channels); - if (ret) - goto error_dealloc_pollfunc; - return 0; -error_dealloc_pollfunc: - iio_dealloc_pollfunc(indio_dev->pollfunc); error_kfifo_free: iio_kfifo_free(indio_dev->buffer); error_ret: @@ -101,7 +93,6 @@ EXPORT_SYMBOL(iio_triggered_buffer_setup); */ void iio_triggered_buffer_cleanup(struct iio_dev *indio_dev) { - iio_buffer_unregister(indio_dev); iio_dealloc_pollfunc(indio_dev->pollfunc); iio_kfifo_free(indio_dev->buffer); } diff --git a/drivers/staging/iio/accel/lis3l02dq_core.c b/drivers/staging/iio/accel/lis3l02dq_core.c index f5e145caffa9..b78c9c5d5588 100644 --- a/drivers/staging/iio/accel/lis3l02dq_core.c +++ b/drivers/staging/iio/accel/lis3l02dq_core.c @@ -716,14 +716,6 @@ static int lis3l02dq_probe(struct spi_device *spi) if (ret) return ret; - ret = iio_buffer_register(indio_dev, - lis3l02dq_channels, - ARRAY_SIZE(lis3l02dq_channels)); - if (ret) { - dev_err(&spi->dev, "failed to initialize the buffer\n"); - goto error_unreg_buffer_funcs; - } - if (spi->irq) { ret = request_threaded_irq(st->us->irq, &lis3l02dq_th, @@ -732,7 +724,7 @@ static int lis3l02dq_probe(struct spi_device *spi) "lis3l02dq", indio_dev); if (ret) - goto error_uninitialize_buffer; + goto error_unreg_buffer_funcs; ret = lis3l02dq_probe_trigger(indio_dev); if (ret) @@ -756,8 +748,6 @@ error_remove_trigger: error_free_interrupt: if (spi->irq) free_irq(st->us->irq, indio_dev); -error_uninitialize_buffer: - iio_buffer_unregister(indio_dev); error_unreg_buffer_funcs: lis3l02dq_unconfigure_buffer(indio_dev); return ret; @@ -804,7 +794,6 @@ static int lis3l02dq_remove(struct spi_device *spi) free_irq(st->us->irq, indio_dev); lis3l02dq_remove_trigger(indio_dev); - iio_buffer_unregister(indio_dev); lis3l02dq_unconfigure_buffer(indio_dev); return 0; diff --git a/drivers/staging/iio/accel/sca3000_core.c b/drivers/staging/iio/accel/sca3000_core.c index aef8c916e07b..9cd04c7147c8 100644 --- a/drivers/staging/iio/accel/sca3000_core.c +++ b/drivers/staging/iio/accel/sca3000_core.c @@ -1156,11 +1156,6 @@ static int sca3000_probe(struct spi_device *spi) if (ret < 0) return ret; - ret = iio_buffer_register(indio_dev, indio_dev->channels, - indio_dev->num_channels); - if (ret < 0) - goto error_unregister_dev; - if (spi->irq) { ret = request_threaded_irq(spi->irq, NULL, @@ -1169,7 +1164,7 @@ static int sca3000_probe(struct spi_device *spi) "sca3000", indio_dev); if (ret) - goto error_unregister_ring; + goto error_unregister_dev; } sca3000_register_ring_funcs(indio_dev); ret = sca3000_clean_setup(st); @@ -1180,8 +1175,6 @@ static int sca3000_probe(struct spi_device *spi) error_free_irq: if (spi->irq) free_irq(spi->irq, indio_dev); -error_unregister_ring: - iio_buffer_unregister(indio_dev); error_unregister_dev: iio_device_unregister(indio_dev); return ret; @@ -1215,7 +1208,6 @@ static int sca3000_remove(struct spi_device *spi) if (spi->irq) free_irq(spi->irq, indio_dev); iio_device_unregister(indio_dev); - iio_buffer_unregister(indio_dev); sca3000_unconfigure_ring(indio_dev); return 0; diff --git a/drivers/staging/iio/iio_simple_dummy_buffer.c b/drivers/staging/iio/iio_simple_dummy_buffer.c index 35d60d5045dc..a2d72c102119 100644 --- a/drivers/staging/iio/iio_simple_dummy_buffer.c +++ b/drivers/staging/iio/iio_simple_dummy_buffer.c @@ -172,15 
+172,8 @@ int iio_simple_dummy_configure_buffer(struct iio_dev *indio_dev) */ indio_dev->modes |= INDIO_BUFFER_TRIGGERED; - ret = iio_buffer_register(indio_dev, indio_dev->channels, - indio_dev->num_channels); - if (ret) - goto error_dealloc_pollfunc; - return 0; -error_dealloc_pollfunc: - iio_dealloc_pollfunc(indio_dev->pollfunc); error_free_buffer: iio_kfifo_free(indio_dev->buffer); error_ret: @@ -194,7 +187,6 @@ error_ret: */ void iio_simple_dummy_unconfigure_buffer(struct iio_dev *indio_dev) { - iio_buffer_unregister(indio_dev); iio_dealloc_pollfunc(indio_dev->pollfunc); iio_kfifo_free(indio_dev->buffer); } diff --git a/drivers/staging/iio/impedance-analyzer/ad5933.c b/drivers/staging/iio/impedance-analyzer/ad5933.c index aa6a368506cd..c50b1380b9aa 100644 --- a/drivers/staging/iio/impedance-analyzer/ad5933.c +++ b/drivers/staging/iio/impedance-analyzer/ad5933.c @@ -752,23 +752,16 @@ static int ad5933_probe(struct i2c_client *client, if (ret) goto error_disable_reg; - ret = iio_buffer_register(indio_dev, ad5933_channels, - ARRAY_SIZE(ad5933_channels)); - if (ret) - goto error_unreg_ring; - ret = ad5933_setup(st); if (ret) - goto error_uninitialize_ring; + goto error_unreg_ring; ret = iio_device_register(indio_dev); if (ret) - goto error_uninitialize_ring; + goto error_unreg_ring; return 0; -error_uninitialize_ring: - iio_buffer_unregister(indio_dev); error_unreg_ring: iio_kfifo_free(indio_dev->buffer); error_disable_reg: @@ -784,7 +777,6 @@ static int ad5933_remove(struct i2c_client *client) struct ad5933_state *st = iio_priv(indio_dev); iio_device_unregister(indio_dev); - iio_buffer_unregister(indio_dev); iio_kfifo_free(indio_dev->buffer); if (!IS_ERR(st->reg)) regulator_disable(st->reg); diff --git a/drivers/staging/iio/meter/ade7758.h b/drivers/staging/iio/meter/ade7758.h index 07318203a836..762d7dc0e6e2 100644 --- a/drivers/staging/iio/meter/ade7758.h +++ b/drivers/staging/iio/meter/ade7758.h @@ -146,7 +146,6 @@ ssize_t ade7758_read_data_from_ring(struct device *dev, int ade7758_configure_ring(struct iio_dev *indio_dev); void ade7758_unconfigure_ring(struct iio_dev *indio_dev); -void ade7758_uninitialize_ring(struct iio_dev *indio_dev); int ade7758_set_irq(struct device *dev, bool enable); int ade7758_spi_write_reg_8(struct device *dev, diff --git a/drivers/staging/iio/meter/ade7758_core.c b/drivers/staging/iio/meter/ade7758_core.c index abc60067cd72..6e8b01104a68 100644 --- a/drivers/staging/iio/meter/ade7758_core.c +++ b/drivers/staging/iio/meter/ade7758_core.c @@ -885,23 +885,15 @@ static int ade7758_probe(struct spi_device *spi) if (ret) goto error_free_tx; - ret = iio_buffer_register(indio_dev, - &ade7758_channels[0], - ARRAY_SIZE(ade7758_channels)); - if (ret) { - dev_err(&spi->dev, "failed to initialize the ring\n"); - goto error_unreg_ring_funcs; - } - /* Get the device into a sane initial state */ ret = ade7758_initial_setup(indio_dev); if (ret) - goto error_uninitialize_ring; + goto error_unreg_ring_funcs; if (spi->irq) { ret = ade7758_probe_trigger(indio_dev); if (ret) - goto error_uninitialize_ring; + goto error_unreg_ring_funcs; } ret = iio_device_register(indio_dev); @@ -913,8 +905,6 @@ static int ade7758_probe(struct spi_device *spi) error_remove_trigger: if (spi->irq) ade7758_remove_trigger(indio_dev); -error_uninitialize_ring: - ade7758_uninitialize_ring(indio_dev); error_unreg_ring_funcs: ade7758_unconfigure_ring(indio_dev); error_free_tx: @@ -932,7 +922,6 @@ static int ade7758_remove(struct spi_device *spi) iio_device_unregister(indio_dev); 
ade7758_stop_device(&indio_dev->dev); ade7758_remove_trigger(indio_dev); - ade7758_uninitialize_ring(indio_dev); ade7758_unconfigure_ring(indio_dev); kfree(st->tx); kfree(st->rx); diff --git a/drivers/staging/iio/meter/ade7758_ring.c b/drivers/staging/iio/meter/ade7758_ring.c index c0accf8cce93..27c3ed6ca468 100644 --- a/drivers/staging/iio/meter/ade7758_ring.c +++ b/drivers/staging/iio/meter/ade7758_ring.c @@ -181,8 +181,3 @@ error_iio_kfifo_free: iio_kfifo_free(indio_dev->buffer); return ret; } - -void ade7758_uninitialize_ring(struct iio_dev *indio_dev) -{ - iio_buffer_unregister(indio_dev); -} diff --git a/include/linux/iio/buffer.h b/include/linux/iio/buffer.h index 8c8ce611949c..b0e006c3db43 100644 --- a/include/linux/iio/buffer.h +++ b/include/linux/iio/buffer.h @@ -150,22 +150,6 @@ static inline int iio_push_to_buffers_with_timestamp(struct iio_dev *indio_dev, int iio_update_demux(struct iio_dev *indio_dev); -/** - * iio_buffer_register() - register the buffer with IIO core - * @indio_dev: device with the buffer to be registered - * @channels: the channel descriptions used to construct buffer - * @num_channels: the number of channels - **/ -int iio_buffer_register(struct iio_dev *indio_dev, - const struct iio_chan_spec *channels, - int num_channels); - -/** - * iio_buffer_unregister() - unregister the buffer from IIO core - * @indio_dev: the device with the buffer to be unregistered - **/ -void iio_buffer_unregister(struct iio_dev *indio_dev); - /** * iio_buffer_read_length() - attr func to get number of datums in the buffer **/ @@ -223,16 +207,6 @@ static inline void iio_device_attach_buffer(struct iio_dev *indio_dev, #else /* CONFIG_IIO_BUFFER */ -static inline int iio_buffer_register(struct iio_dev *indio_dev, - const struct iio_chan_spec *channels, - int num_channels) -{ - return 0; -} - -static inline void iio_buffer_unregister(struct iio_dev *indio_dev) -{} - static inline void iio_buffer_get(struct iio_buffer *buffer) {} static inline void iio_buffer_put(struct iio_buffer *buffer) {} -- cgit v1.2.3 From 616dde2a1ea3df9398b1fcc7d6d6516c5fab6183 Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Wed, 26 Nov 2014 18:55:13 +0100 Subject: iio: Remove get_bytes_per_datum() from iio_buffer_access_funcs There haven't been any users of the get_bytes_per_datum() callback for a while. The core assumes that the number of bytes per datum can be calculated based on the enabled channels and the storage size of the channel and iio_compute_scan_bytes() is used to compute this number. So remove the callback. 
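The computation the core performs instead is roughly the following (simplified from iio_compute_scan_bytes(); the real function also handles repeated scan elements):

static int example_scan_bytes(struct iio_dev *indio_dev,
			      const unsigned long *mask, bool timestamp)
{
	const struct iio_chan_spec *ch;
	int bytes = 0, length, i;

	for_each_set_bit(i, mask, indio_dev->masklength) {
		ch = iio_find_channel_from_si(indio_dev, i);
		length = ch->scan_type.storagebits / 8;
		/* each scan element is aligned to its own size */
		bytes = ALIGN(bytes, length);
		bytes += length;
	}
	if (timestamp) {
		length = sizeof(s64);
		bytes = ALIGN(bytes, length);
		bytes += length;
	}
	return bytes;
}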
Signed-off-by: Lars-Peter Clausen Signed-off-by: Jonathan Cameron --- drivers/iio/kfifo_buf.c | 6 ------ drivers/staging/iio/Documentation/ring.txt | 4 ++-- drivers/staging/iio/accel/sca3000_ring.c | 7 ------- include/linux/iio/buffer.h | 2 -- 4 files changed, 2 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/iio/kfifo_buf.c b/drivers/iio/kfifo_buf.c index 7134e8ada09a..1258b4e0a722 100644 --- a/drivers/iio/kfifo_buf.c +++ b/drivers/iio/kfifo_buf.c @@ -66,11 +66,6 @@ static struct attribute_group iio_kfifo_attribute_group = { .name = "buffer", }; -static int iio_get_bytes_per_datum_kfifo(struct iio_buffer *r) -{ - return r->bytes_per_datum; -} - static int iio_mark_update_needed_kfifo(struct iio_buffer *r) { struct iio_kfifo *kf = iio_to_kfifo(r); @@ -159,7 +154,6 @@ static const struct iio_buffer_access_funcs kfifo_access_funcs = { .read_first_n = &iio_read_first_n_kfifo, .data_available = iio_kfifo_buf_data_available, .request_update = &iio_request_update_kfifo, - .get_bytes_per_datum = &iio_get_bytes_per_datum_kfifo, .set_bytes_per_datum = &iio_set_bytes_per_datum_kfifo, .get_length = &iio_get_length_kfifo, .set_length = &iio_set_length_kfifo, diff --git a/drivers/staging/iio/Documentation/ring.txt b/drivers/staging/iio/Documentation/ring.txt index e1da43381d0e..434d63a6123a 100644 --- a/drivers/staging/iio/Documentation/ring.txt +++ b/drivers/staging/iio/Documentation/ring.txt @@ -39,8 +39,8 @@ request_update If parameters have changed that require reinitialization or configuration of the buffer this will trigger it. -get_bytes_per_datum, set_bytes_per_datum - Get/set the number of bytes for a complete scan. (All samples + timestamp) +set_bytes_per_datum + Set the number of bytes for a complete scan. (All samples + timestamp) get_length / set_length Get/set the number of complete scans that may be held by the buffer. diff --git a/drivers/staging/iio/accel/sca3000_ring.c b/drivers/staging/iio/accel/sca3000_ring.c index 157827651bfa..aa0e5d85bfe2 100644 --- a/drivers/staging/iio/accel/sca3000_ring.c +++ b/drivers/staging/iio/accel/sca3000_ring.c @@ -135,12 +135,6 @@ static int sca3000_ring_get_length(struct iio_buffer *r) return 64; } -/* only valid if resolution is kept at 11bits */ -static int sca3000_ring_get_bytes_per_datum(struct iio_buffer *r) -{ - return 6; -} - static bool sca3000_ring_buf_data_available(struct iio_buffer *r) { return r->stufftoread; @@ -278,7 +272,6 @@ static void sca3000_ring_release(struct iio_buffer *r) static const struct iio_buffer_access_funcs sca3000_ring_access_funcs = { .read_first_n = &sca3000_read_first_n_hw_rb, .get_length = &sca3000_ring_get_length, - .get_bytes_per_datum = &sca3000_ring_get_bytes_per_datum, .data_available = sca3000_ring_buf_data_available, .release = sca3000_ring_release, }; diff --git a/include/linux/iio/buffer.h b/include/linux/iio/buffer.h index b0e006c3db43..79cdb3d1b1bc 100644 --- a/include/linux/iio/buffer.h +++ b/include/linux/iio/buffer.h @@ -25,7 +25,6 @@ struct iio_buffer; * available. * @request_update: if a parameter change has been marked, update underlying * storage. 
- * @get_bytes_per_datum:get current bytes per datum * @set_bytes_per_datum:set number of bytes per datum * @get_length: get number of datums in buffer * @set_length: set number of datums in buffer @@ -49,7 +48,6 @@ struct iio_buffer_access_funcs { int (*request_update)(struct iio_buffer *buffer); - int (*get_bytes_per_datum)(struct iio_buffer *buffer); int (*set_bytes_per_datum)(struct iio_buffer *buffer, size_t bpd); int (*get_length)(struct iio_buffer *buffer); int (*set_length)(struct iio_buffer *buffer, int length); -- cgit v1.2.3 From 08e7e0adaa17205f86894157d86c4bee3c714330 Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Wed, 26 Nov 2014 18:55:15 +0100 Subject: iio: buffer: Allocate standard attributes in the core All buffers want at least the length and the enable attribute. Move the creation of those attributes to the core instead of having to do this in each individual buffer implementation. This allows us to get rid of some boiler-plate code. Signed-off-by: Lars-Peter Clausen Signed-off-by: Jonathan Cameron --- drivers/iio/industrialio-buffer.c | 59 ++++++++++++++++++++++---------- drivers/iio/kfifo_buf.c | 15 -------- drivers/staging/iio/accel/sca3000_ring.c | 14 ++------ include/linux/iio/buffer.h | 37 ++------------------ 4 files changed, 45 insertions(+), 80 deletions(-) (limited to 'include/linux') diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c index 8cd89eb269c5..ba89357fc096 100644 --- a/drivers/iio/industrialio-buffer.c +++ b/drivers/iio/industrialio-buffer.c @@ -383,9 +383,9 @@ static int iio_buffer_add_channel_sysfs(struct iio_dev *indio_dev, return ret; } -ssize_t iio_buffer_read_length(struct device *dev, - struct device_attribute *attr, - char *buf) +static ssize_t iio_buffer_read_length(struct device *dev, + struct device_attribute *attr, + char *buf) { struct iio_dev *indio_dev = dev_to_iio_dev(dev); struct iio_buffer *buffer = indio_dev->buffer; @@ -396,12 +396,10 @@ ssize_t iio_buffer_read_length(struct device *dev, return 0; } -EXPORT_SYMBOL(iio_buffer_read_length); -ssize_t iio_buffer_write_length(struct device *dev, - struct device_attribute *attr, - const char *buf, - size_t len) +static ssize_t iio_buffer_write_length(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) { struct iio_dev *indio_dev = dev_to_iio_dev(dev); struct iio_buffer *buffer = indio_dev->buffer; @@ -428,16 +426,14 @@ ssize_t iio_buffer_write_length(struct device *dev, return ret ? ret : len; } -EXPORT_SYMBOL(iio_buffer_write_length); -ssize_t iio_buffer_show_enable(struct device *dev, - struct device_attribute *attr, - char *buf) +static ssize_t iio_buffer_show_enable(struct device *dev, + struct device_attribute *attr, + char *buf) { struct iio_dev *indio_dev = dev_to_iio_dev(dev); return sprintf(buf, "%d\n", iio_buffer_is_active(indio_dev->buffer)); } -EXPORT_SYMBOL(iio_buffer_show_enable); static int iio_compute_scan_bytes(struct iio_dev *indio_dev, const unsigned long *mask, bool timestamp) @@ -724,10 +720,10 @@ out_unlock: } EXPORT_SYMBOL_GPL(iio_update_buffers); -ssize_t iio_buffer_store_enable(struct device *dev, - struct device_attribute *attr, - const char *buf, - size_t len) +static ssize_t iio_buffer_store_enable(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t len) { int ret; bool requested_state; @@ -759,10 +755,14 @@ done: mutex_unlock(&indio_dev->mlock); return (ret < 0) ? 
ret : len; } -EXPORT_SYMBOL(iio_buffer_store_enable); static const char * const iio_scan_elements_group_name = "scan_elements"; +static DEVICE_ATTR(length, S_IRUGO | S_IWUSR, iio_buffer_read_length, + iio_buffer_write_length); +static DEVICE_ATTR(enable, S_IRUGO | S_IWUSR, + iio_buffer_show_enable, iio_buffer_store_enable); + int iio_buffer_alloc_sysfs_and_mask(struct iio_dev *indio_dev) { struct iio_dev_attr *p; @@ -774,6 +774,27 @@ int iio_buffer_alloc_sysfs_and_mask(struct iio_dev *indio_dev) if (!buffer) return 0; + attrcount = 0; + if (buffer->attrs) { + while (buffer->attrs[attrcount] != NULL) + attrcount++; + } + + buffer->buffer_group.name = "buffer"; + buffer->buffer_group.attrs = kcalloc(attrcount + 3, + sizeof(*buffer->buffer_group.attrs), GFP_KERNEL); + if (!buffer->buffer_group.attrs) + return -ENOMEM; + + buffer->buffer_group.attrs[0] = &dev_attr_length.attr; + buffer->buffer_group.attrs[1] = &dev_attr_enable.attr; + if (buffer->attrs) + memcpy(&buffer->buffer_group.attrs[2], buffer->attrs, + sizeof(*&buffer->buffer_group.attrs) * (attrcount - 2)); + buffer->buffer_group.attrs[attrcount+2] = NULL; + + indio_dev->groups[indio_dev->groupcounter++] = &buffer->buffer_group; + if (buffer->scan_el_attrs != NULL) { attr = buffer->scan_el_attrs->attrs; while (*attr++ != NULL) @@ -838,6 +859,7 @@ error_free_scan_mask: kfree(buffer->scan_mask); error_cleanup_dynamic: iio_free_chan_devattr_list(&buffer->scan_el_dev_attr_list); + kfree(indio_dev->buffer->buffer_group.attrs); return ret; } @@ -848,6 +870,7 @@ void iio_buffer_free_sysfs_and_mask(struct iio_dev *indio_dev) return; kfree(indio_dev->buffer->scan_mask); + kfree(indio_dev->buffer->buffer_group.attrs); kfree(indio_dev->buffer->scan_el_group.attrs); iio_free_chan_devattr_list(&indio_dev->buffer->scan_el_dev_attr_list); } diff --git a/drivers/iio/kfifo_buf.c b/drivers/iio/kfifo_buf.c index 1258b4e0a722..3b0a3bc4f0ad 100644 --- a/drivers/iio/kfifo_buf.c +++ b/drivers/iio/kfifo_buf.c @@ -52,20 +52,6 @@ static int iio_get_length_kfifo(struct iio_buffer *r) return r->length; } -static IIO_BUFFER_ENABLE_ATTR; -static IIO_BUFFER_LENGTH_ATTR; - -static struct attribute *iio_kfifo_attributes[] = { - &dev_attr_length.attr, - &dev_attr_enable.attr, - NULL, -}; - -static struct attribute_group iio_kfifo_attribute_group = { - .attrs = iio_kfifo_attributes, - .name = "buffer", -}; - static int iio_mark_update_needed_kfifo(struct iio_buffer *r) { struct iio_kfifo *kf = iio_to_kfifo(r); @@ -169,7 +155,6 @@ struct iio_buffer *iio_kfifo_allocate(struct iio_dev *indio_dev) return NULL; kf->update_needed = true; iio_buffer_init(&kf->buffer); - kf->buffer.attrs = &iio_kfifo_attribute_group; kf->buffer.access = &kfifo_access_funcs; kf->buffer.length = 2; mutex_init(&kf->user_lock); diff --git a/drivers/staging/iio/accel/sca3000_ring.c b/drivers/staging/iio/accel/sca3000_ring.c index aa0e5d85bfe2..f2f260e01550 100644 --- a/drivers/staging/iio/accel/sca3000_ring.c +++ b/drivers/staging/iio/accel/sca3000_ring.c @@ -140,9 +140,6 @@ static bool sca3000_ring_buf_data_available(struct iio_buffer *r) return r->stufftoread; } -static IIO_BUFFER_ENABLE_ATTR; -static IIO_BUFFER_LENGTH_ATTR; - /** * sca3000_query_ring_int() is the hardware ring status interrupt enabled **/ @@ -232,20 +229,13 @@ static IIO_DEVICE_ATTR(in_accel_scale, * only apply to the ring buffer. At all times full rate and accuracy * is available via direct reading from registers. 
*/ -static struct attribute *sca3000_ring_attributes[] = { - &dev_attr_length.attr, - &dev_attr_enable.attr, +static const struct attribute *sca3000_ring_attributes[] = { &iio_dev_attr_50_percent.dev_attr.attr, &iio_dev_attr_75_percent.dev_attr.attr, &iio_dev_attr_in_accel_scale.dev_attr.attr, NULL, }; -static struct attribute_group sca3000_ring_attr = { - .attrs = sca3000_ring_attributes, - .name = "buffer", -}; - static struct iio_buffer *sca3000_rb_allocate(struct iio_dev *indio_dev) { struct iio_buffer *buf; @@ -258,7 +248,7 @@ static struct iio_buffer *sca3000_rb_allocate(struct iio_dev *indio_dev) ring->private = indio_dev; buf = &ring->buf; buf->stufftoread = 0; - buf->attrs = &sca3000_ring_attr; + buf->attrs = sca3000_ring_attributes; iio_buffer_init(buf); return buf; diff --git a/include/linux/iio/buffer.h b/include/linux/iio/buffer.h index 79cdb3d1b1bc..16b7663036f2 100644 --- a/include/linux/iio/buffer.h +++ b/include/linux/iio/buffer.h @@ -83,10 +83,11 @@ struct iio_buffer { bool scan_timestamp; const struct iio_buffer_access_funcs *access; struct list_head scan_el_dev_attr_list; + struct attribute_group buffer_group; struct attribute_group scan_el_group; wait_queue_head_t pollq; bool stufftoread; - const struct attribute_group *attrs; + const struct attribute **attrs; struct list_head demux_list; void *demux_bounce; struct list_head buffer_list; @@ -148,40 +149,6 @@ static inline int iio_push_to_buffers_with_timestamp(struct iio_dev *indio_dev, int iio_update_demux(struct iio_dev *indio_dev); -/** - * iio_buffer_read_length() - attr func to get number of datums in the buffer - **/ -ssize_t iio_buffer_read_length(struct device *dev, - struct device_attribute *attr, - char *buf); -/** - * iio_buffer_write_length() - attr func to set number of datums in the buffer - **/ -ssize_t iio_buffer_write_length(struct device *dev, - struct device_attribute *attr, - const char *buf, - size_t len); -/** - * iio_buffer_store_enable() - attr to turn the buffer on - **/ -ssize_t iio_buffer_store_enable(struct device *dev, - struct device_attribute *attr, - const char *buf, - size_t len); -/** - * iio_buffer_show_enable() - attr to see if the buffer is on - **/ -ssize_t iio_buffer_show_enable(struct device *dev, - struct device_attribute *attr, - char *buf); -#define IIO_BUFFER_LENGTH_ATTR DEVICE_ATTR(length, S_IRUGO | S_IWUSR, \ - iio_buffer_read_length, \ - iio_buffer_write_length) - -#define IIO_BUFFER_ENABLE_ATTR DEVICE_ATTR(enable, S_IRUGO | S_IWUSR, \ - iio_buffer_show_enable, \ - iio_buffer_store_enable) - bool iio_validate_scan_mask_onehot(struct iio_dev *indio_dev, const unsigned long *mask); -- cgit v1.2.3 From 374956600ecbedf5ca29c76bde114160eb805091 Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Wed, 26 Nov 2014 18:55:17 +0100 Subject: iio: buffer: Drop get_length callback We already do have the length field in the struct iio_buffer which is expected to be in sync with the current size of the buffer. And currently all implementations of the get_length callback either return this field or a constant number. This patch removes the get_length callback and replaces all occurrences in the IIO core with directly accessing the length field of the buffer. 
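A buffer implementation now only has to keep the length field coherent from its set_length() callback, roughly in the style of the kfifo buffer (example_mark_update_needed() stands in for a flag-setter like iio_mark_update_needed_kfifo() seen earlier in this series):

static int example_set_length(struct iio_buffer *r, int length)
{
	if (length < 2)			/* keep a sane minimum */
		length = 2;
	if (r->length != length) {
		r->length = length;
		/* storage gets resized on the next request_update() */
		example_mark_update_needed(r);
	}
	return 0;
}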
Signed-off-by: Lars-Peter Clausen Signed-off-by: Jonathan Cameron --- drivers/iio/industrialio-buffer.c | 11 +++-------- drivers/iio/kfifo_buf.c | 6 ------ drivers/staging/iio/Documentation/ring.txt | 4 ++-- drivers/staging/iio/accel/sca3000_ring.c | 8 +------- include/linux/iio/buffer.h | 2 -- 5 files changed, 6 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c index 4ca4c0a09923..2bd8d399f2ec 100644 --- a/drivers/iio/industrialio-buffer.c +++ b/drivers/iio/industrialio-buffer.c @@ -390,11 +390,7 @@ static ssize_t iio_buffer_read_length(struct device *dev, struct iio_dev *indio_dev = dev_to_iio_dev(dev); struct iio_buffer *buffer = indio_dev->buffer; - if (buffer->access->get_length) - return sprintf(buf, "%d\n", - buffer->access->get_length(buffer)); - - return 0; + return sprintf(buf, "%d\n", buffer->length); } static ssize_t iio_buffer_write_length(struct device *dev, @@ -410,9 +406,8 @@ static ssize_t iio_buffer_write_length(struct device *dev, if (ret) return ret; - if (buffer->access->get_length) - if (val == buffer->access->get_length(buffer)) - return len; + if (val == buffer->length) + return len; mutex_lock(&indio_dev->mlock); if (iio_buffer_is_active(indio_dev->buffer)) { diff --git a/drivers/iio/kfifo_buf.c b/drivers/iio/kfifo_buf.c index 3b0a3bc4f0ad..b20a9cfbc8ed 100644 --- a/drivers/iio/kfifo_buf.c +++ b/drivers/iio/kfifo_buf.c @@ -47,11 +47,6 @@ static int iio_request_update_kfifo(struct iio_buffer *r) return ret; } -static int iio_get_length_kfifo(struct iio_buffer *r) -{ - return r->length; -} - static int iio_mark_update_needed_kfifo(struct iio_buffer *r) { struct iio_kfifo *kf = iio_to_kfifo(r); @@ -141,7 +136,6 @@ static const struct iio_buffer_access_funcs kfifo_access_funcs = { .data_available = iio_kfifo_buf_data_available, .request_update = &iio_request_update_kfifo, .set_bytes_per_datum = &iio_set_bytes_per_datum_kfifo, - .get_length = &iio_get_length_kfifo, .set_length = &iio_set_length_kfifo, .release = &iio_kfifo_buffer_release, }; diff --git a/drivers/staging/iio/Documentation/ring.txt b/drivers/staging/iio/Documentation/ring.txt index 434d63a6123a..18718fcaf259 100644 --- a/drivers/staging/iio/Documentation/ring.txt +++ b/drivers/staging/iio/Documentation/ring.txt @@ -42,6 +42,6 @@ request_update set_bytes_per_datum Set the number of bytes for a complete scan. (All samples + timestamp) -get_length / set_length - Get/set the number of complete scans that may be held by the buffer. +set_length + Set the number of complete scans that may be held by the buffer. diff --git a/drivers/staging/iio/accel/sca3000_ring.c b/drivers/staging/iio/accel/sca3000_ring.c index f2f260e01550..f76a26885808 100644 --- a/drivers/staging/iio/accel/sca3000_ring.c +++ b/drivers/staging/iio/accel/sca3000_ring.c @@ -129,12 +129,6 @@ error_ret: return ret ? 
ret : num_read; } -/* This is only valid with all 3 elements enabled */ -static int sca3000_ring_get_length(struct iio_buffer *r) -{ - return 64; -} - static bool sca3000_ring_buf_data_available(struct iio_buffer *r) { return r->stufftoread; @@ -248,6 +242,7 @@ static struct iio_buffer *sca3000_rb_allocate(struct iio_dev *indio_dev) ring->private = indio_dev; buf = &ring->buf; buf->stufftoread = 0; + buf->length = 64; buf->attrs = sca3000_ring_attributes; iio_buffer_init(buf); @@ -261,7 +256,6 @@ static void sca3000_ring_release(struct iio_buffer *r) static const struct iio_buffer_access_funcs sca3000_ring_access_funcs = { .read_first_n = &sca3000_read_first_n_hw_rb, - .get_length = &sca3000_ring_get_length, .data_available = sca3000_ring_buf_data_available, .release = sca3000_ring_release, }; diff --git a/include/linux/iio/buffer.h b/include/linux/iio/buffer.h index 16b7663036f2..b65850a41127 100644 --- a/include/linux/iio/buffer.h +++ b/include/linux/iio/buffer.h @@ -26,7 +26,6 @@ struct iio_buffer; * @request_update: if a parameter change has been marked, update underlying * storage. * @set_bytes_per_datum:set number of bytes per datum - * @get_length: get number of datums in buffer * @set_length: set number of datums in buffer * @release: called when the last reference to the buffer is dropped, * should free all resources allocated by the buffer. @@ -49,7 +48,6 @@ struct iio_buffer_access_funcs { int (*request_update)(struct iio_buffer *buffer); int (*set_bytes_per_datum)(struct iio_buffer *buffer, size_t bpd); - int (*get_length)(struct iio_buffer *buffer); int (*set_length)(struct iio_buffer *buffer, int length); void (*release)(struct iio_buffer *buffer); -- cgit v1.2.3 From 09546a30632fd35996373146657d5a0296fd37ca Mon Sep 17 00:00:00 2001 From: "Ivan T. Ivanov" Date: Tue, 23 Sep 2014 15:51:42 +0300 Subject: iio: consumer.h: Fix scale factor in function comment 1 millivolt is equal to 1000000 nanovolts. Signed-off-by: Ivan T. Ivanov Signed-off-by: Jonathan Cameron --- include/linux/iio/consumer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iio/consumer.h b/include/linux/iio/consumer.h index 6f64624f329b..26fb8f6342bb 100644 --- a/include/linux/iio/consumer.h +++ b/include/linux/iio/consumer.h @@ -201,7 +201,7 @@ int iio_read_channel_scale(struct iio_channel *chan, int *val, * The scale factor allows one to increase the precision of the returned value. For * a scale factor of 1 the function will return the result in the normal IIO * unit for the channel type. E.g. millivolt for voltage channels, if you want - * nanovolts instead pass 1000 as the scale factor. + * nanovolts instead pass 1000000 as the scale factor. */ int iio_convert_raw_to_processed(struct iio_channel *chan, int raw, int *processed, unsigned int scale); -- cgit v1.2.3 From e39f2d5956999c05c85814787a113ffadbcd4b26 Mon Sep 17 00:00:00 2001 From: Andrew Duggan Date: Fri, 12 Dec 2014 10:17:26 -0800 Subject: HID: rmi: Scan the report descriptor to determine if the device is suitable for the hid-rmi driver On composite HID devices there may be multiple HID devices on separate interfaces, but hid-rmi should only bind to the touchpad. The previous version simply checked that the interface protocol was set to mouse. Unfortunately, it is not always the case that the touchpad has the mouse interface protocol set.
This patch takes a different approach and scans the report descriptor looking for the Generic Desktop Pointer usage and the Vendor Specific Top Level Collection needed by the hid-rmi driver to interface with the device. Signed-off-by: Andrew Duggan Reviewed-by: Benjamin Tissoires Signed-off-by: Jiri Kosina --- drivers/hid/hid-core.c | 22 +++++++++++++++++----- include/linux/hid.h | 4 +++- 2 files changed, 20 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index c3d0ac1a0988..81665b4f2258 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -698,6 +698,7 @@ static void hid_scan_feature_usage(struct hid_parser *parser, u32 usage) static void hid_scan_collection(struct hid_parser *parser, unsigned type) { struct hid_device *hid = parser->device; + int i; if (((parser->global.usage_page << 16) == HID_UP_SENSOR) && type == HID_COLLECTION_PHYSICAL) @@ -707,6 +708,14 @@ static void hid_scan_collection(struct hid_parser *parser, unsigned type) hid->product == USB_DEVICE_ID_MS_TYPE_COVER_3 && hid->group == HID_GROUP_MULTITOUCH) hid->group = HID_GROUP_GENERIC; + + if ((parser->global.usage_page << 16) == HID_UP_GENDESK) + for (i = 0; i < parser->local.usage_index; i++) + if (parser->local.usage[i] == HID_GD_POINTER) + parser->scan_flags |= HID_SCAN_FLAG_GD_POINTER; + + if ((parser->global.usage_page << 16) >= HID_UP_MSVENDOR) + parser->scan_flags |= HID_SCAN_FLAG_VENDOR_SPECIFIC; } static int hid_scan_main(struct hid_parser *parser, struct hid_item *item) @@ -792,11 +801,14 @@ static int hid_scan_report(struct hid_device *hid) hid->group = HID_GROUP_WACOM; break; case USB_VENDOR_ID_SYNAPTICS: - if ((hid->group == HID_GROUP_GENERIC) && - (hid->bus != BUS_USB || hid->type == HID_TYPE_USBMOUSE)) - /* hid-rmi should only bind to the mouse interface of - * composite USB devices */ - hid->group = HID_GROUP_RMI; + if (hid->group == HID_GROUP_GENERIC) + if ((parser->scan_flags & HID_SCAN_FLAG_VENDOR_SPECIFIC) + && (parser->scan_flags & HID_SCAN_FLAG_GD_POINTER)) + /* + * hid-rmi should take care of them, + * not hid-generic + */ + hid->group = HID_GROUP_RMI; break; } diff --git a/include/linux/hid.h b/include/linux/hid.h index 06c4607744f6..efc7787a41a8 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -574,7 +574,9 @@ static inline void hid_set_drvdata(struct hid_device *hdev, void *data) #define HID_GLOBAL_STACK_SIZE 4 #define HID_COLLECTION_STACK_SIZE 4 -#define HID_SCAN_FLAG_MT_WIN_8 0x00000001 +#define HID_SCAN_FLAG_MT_WIN_8 BIT(0) +#define HID_SCAN_FLAG_VENDOR_SPECIFIC BIT(1) +#define HID_SCAN_FLAG_GD_POINTER BIT(2) struct hid_parser { struct hid_global global; -- cgit v1.2.3 From d64cb71bede87dbca60d586a7bb4cef87fbe2731 Mon Sep 17 00:00:00 2001 From: Jaewon Kim Date: Wed, 17 Dec 2014 10:31:08 -0800 Subject: Input: add regulator haptic driver This change adds support for a haptic driver controlled by the voltage of a regulator. Userspace can control the device via the Force Feedback interface of the input framework.
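For illustration only, a minimal userspace sketch of exercising such a device through the force feedback interface; the event node path is hypothetical, and the magnitude is what the driver maps onto the regulator voltage range:

    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/input.h>

    static int rumble_once(const char *path)   /* e.g. a hypothetical /dev/input/eventN */
    {
            struct ff_effect effect;
            struct input_event play;
            int fd = open(path, O_RDWR);

            if (fd < 0)
                    return -1;

            memset(&effect, 0, sizeof(effect));
            effect.type = FF_RUMBLE;
            effect.id = -1;                            /* let the kernel assign an id */
            effect.u.rumble.strong_magnitude = 0x8000; /* mapped onto the voltage range */
            effect.replay.length = 1000;               /* milliseconds */
            if (ioctl(fd, EVIOCSFF, &effect) < 0) {
                    close(fd);
                    return -1;
            }

            memset(&play, 0, sizeof(play));
            play.type = EV_FF;
            play.code = effect.id;
            play.value = 1;                            /* 1 = start, 0 = stop */
            if (write(fd, &play, sizeof(play)) < 0) {
                    close(fd);
                    return -1;
            }

            usleep(1000 * 1000); /* keep the fd open while the effect plays */
            close(fd);           /* closing the device removes uploaded effects */
            return 0;
    }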
Signed-off-by: Jaewon Kim Signed-off-by: Hyunhee Kim Acked-by: Kyungmin Park Tested-by: Chanwoo Choi Reviewed-by: Chanwoo Choi Reviewed-by: Pankaj Dubey Signed-off-by: Dmitry Torokhov --- .../devicetree/bindings/input/regulator-haptic.txt | 21 ++ drivers/input/misc/Kconfig | 12 + drivers/input/misc/Makefile | 1 + drivers/input/misc/regulator-haptic.c | 279 +++++++++++++++++++++ include/linux/platform_data/regulator-haptic.h | 29 +++ 5 files changed, 342 insertions(+) create mode 100644 Documentation/devicetree/bindings/input/regulator-haptic.txt create mode 100644 drivers/input/misc/regulator-haptic.c create mode 100644 include/linux/platform_data/regulator-haptic.h (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/input/regulator-haptic.txt b/Documentation/devicetree/bindings/input/regulator-haptic.txt new file mode 100644 index 000000000000..3ed1c7eb2f97 --- /dev/null +++ b/Documentation/devicetree/bindings/input/regulator-haptic.txt @@ -0,0 +1,21 @@ +* Regulator Haptic Device Tree Bindings + +Required Properties: + - compatible : Should be "regulator-haptic" + - haptic-supply : Power supply to the haptic motor. + [*] refer to Documentation/devicetree/bindings/regulator/regulator.txt + + - max-microvolt : The maximum voltage value supplied to the haptic motor. + [The unit of the voltage is microvolts] + + - min-microvolt : The minimum voltage value supplied to the haptic motor. + [The unit of the voltage is microvolts] + +Example: + + haptics { + compatible = "regulator-haptic"; + haptic-supply = <&motor_regulator>; + max-microvolt = <2700000>; + min-microvolt = <1100000>; + }; diff --git a/drivers/input/misc/Kconfig b/drivers/input/misc/Kconfig index 23297ab6163f..1da0a20c42ea 100644 --- a/drivers/input/misc/Kconfig +++ b/drivers/input/misc/Kconfig @@ -394,6 +394,18 @@ config INPUT_CM109 To compile this driver as a module, choose M here: the module will be called cm109. +config INPUT_REGULATOR_HAPTIC + tristate "Regulator haptics support" + depends on REGULATOR + select INPUT_FF_MEMLESS + help + This option enables device driver support for a haptic device controlled + by a regulator. This driver supports the ff-memless interface + from the input framework. + + To compile this driver as a module, choose M here: the + module will be called regulator-haptic. + config INPUT_RETU_PWRBUTTON tristate "Retu Power button Driver" depends on MFD_RETU diff --git a/drivers/input/misc/Makefile b/drivers/input/misc/Makefile index 19c760361f80..1f135af4af01 100644 --- a/drivers/input/misc/Makefile +++ b/drivers/input/misc/Makefile @@ -53,6 +53,7 @@ obj-$(CONFIG_INPUT_PMIC8XXX_PWRKEY) += pmic8xxx-pwrkey.o obj-$(CONFIG_INPUT_POWERMATE) += powermate.o obj-$(CONFIG_INPUT_PWM_BEEPER) += pwm-beeper.o obj-$(CONFIG_INPUT_RB532_BUTTON) += rb532_button.o +obj-$(CONFIG_INPUT_REGULATOR_HAPTIC) += regulator-haptic.o obj-$(CONFIG_INPUT_RETU_PWRBUTTON) += retu-pwrbutton.o obj-$(CONFIG_INPUT_GPIO_ROTARY_ENCODER) += rotary_encoder.o obj-$(CONFIG_INPUT_SGI_BTNS) += sgi_btns.o diff --git a/drivers/input/misc/regulator-haptic.c b/drivers/input/misc/regulator-haptic.c new file mode 100644 index 000000000000..942622189bee --- /dev/null +++ b/drivers/input/misc/regulator-haptic.c @@ -0,0 +1,279 @@ +/* + * Regulator haptic driver + * + * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Author: Jaewon Kim + * Author: Hyunhee Kim + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include + +#define MAX_MAGNITUDE_SHIFT 16 + +struct regulator_haptic { + struct device *dev; + struct input_dev *input_dev; + struct regulator *regulator; + + struct work_struct work; + struct mutex mutex; + + bool active; + bool suspended; + + unsigned int max_volt; + unsigned int min_volt; + unsigned int magnitude; +}; + +static int regulator_haptic_toggle(struct regulator_haptic *haptic, bool on) +{ + int error; + + if (haptic->active != on) { + + error = on ? regulator_enable(haptic->regulator) : + regulator_disable(haptic->regulator); + if (error) { + dev_err(haptic->dev, + "failed to switch regulator %s: %d\n", + on ? "on" : "off", error); + return error; + } + + haptic->active = on; + } + + return 0; +} + +static int regulator_haptic_set_voltage(struct regulator_haptic *haptic, + unsigned int magnitude) +{ + u64 volt_mag_multi; + unsigned int intensity; + int error; + + volt_mag_multi = (u64)(haptic->max_volt - haptic->min_volt) * magnitude; + intensity = (unsigned int)(volt_mag_multi >> MAX_MAGNITUDE_SHIFT); + + error = regulator_set_voltage(haptic->regulator, + intensity + haptic->min_volt, + haptic->max_volt); + if (error) { + dev_err(haptic->dev, "cannot set regulator voltage to %d: %d\n", + intensity + haptic->min_volt, error); + return error; + } + + return 0; +} + +static void regulator_haptic_work(struct work_struct *work) +{ + struct regulator_haptic *haptic = container_of(work, + struct regulator_haptic, work); + unsigned int magnitude; + int error; + + mutex_lock(&haptic->mutex); + + if (haptic->suspended) + goto out; + + magnitude = ACCESS_ONCE(haptic->magnitude); + + error = regulator_haptic_set_voltage(haptic, magnitude); + if (error) + goto out; + + regulator_haptic_toggle(haptic, magnitude != 0); + +out: + mutex_unlock(&haptic->mutex); +} + +static int regulator_haptic_play_effect(struct input_dev *input, void *data, + struct ff_effect *effect) +{ + struct regulator_haptic *haptic = input_get_drvdata(input); + + haptic->magnitude = effect->u.rumble.strong_magnitude; + if (!haptic->magnitude) + haptic->magnitude = effect->u.rumble.weak_magnitude; + + schedule_work(&haptic->work); + + return 0; +} + +static void regulator_haptic_close(struct input_dev *input) +{ + struct regulator_haptic *haptic = input_get_drvdata(input); + + cancel_work_sync(&haptic->work); + regulator_haptic_set_voltage(haptic, 0); + regulator_haptic_toggle(haptic, false); +} + +static int __maybe_unused +regulator_haptic_parse_dt(struct device *dev, struct regulator_haptic *haptic) +{ + struct device_node *node; + int error; + + node = dev->of_node; + if (!node) { + dev_err(dev, "Missing device tree data\n"); + return -EINVAL; + } + + error = of_property_read_u32(node, "max-microvolt", &haptic->max_volt); + if (error) { + dev_err(dev, "cannot parse max-microvolt\n"); + return error; + } + + error = of_property_read_u32(node, "min-microvolt", &haptic->min_volt); + if (error) { + dev_err(dev, "cannot parse min-microvolt\n"); + return error; + } + + return 0; +} + +static int regulator_haptic_probe(struct platform_device *pdev) +{ + const struct regulator_haptic_data *pdata = dev_get_platdata(&pdev->dev); + struct regulator_haptic *haptic; + struct input_dev *input_dev; + int error; + + haptic =
devm_kzalloc(&pdev->dev, sizeof(*haptic), GFP_KERNEL); + if (!haptic) + return -ENOMEM; + + platform_set_drvdata(pdev, haptic); + haptic->dev = &pdev->dev; + mutex_init(&haptic->mutex); + INIT_WORK(&haptic->work, regulator_haptic_work); + + if (pdata) { + haptic->max_volt = pdata->max_volt; + haptic->min_volt = pdata->min_volt; + } else if (IS_ENABLED(CONFIG_OF)) { + error = regulator_haptic_parse_dt(&pdev->dev, haptic); + if (error) + return error; + } else { + dev_err(&pdev->dev, "Missing platform data\n"); + return -EINVAL; + } + + haptic->regulator = devm_regulator_get_exclusive(&pdev->dev, "haptic"); + if (IS_ERR(haptic->regulator)) { + dev_err(&pdev->dev, "failed to get regulator\n"); + return PTR_ERR(haptic->regulator); + } + + input_dev = devm_input_allocate_device(&pdev->dev); + if (!input_dev) + return -ENOMEM; + + haptic->input_dev = input_dev; + haptic->input_dev->name = "regulator-haptic"; + haptic->input_dev->dev.parent = &pdev->dev; + haptic->input_dev->close = regulator_haptic_close; + input_set_drvdata(haptic->input_dev, haptic); + input_set_capability(haptic->input_dev, EV_FF, FF_RUMBLE); + + error = input_ff_create_memless(input_dev, NULL, + regulator_haptic_play_effect); + if (error) { + dev_err(&pdev->dev, "failed to create force-feedback\n"); + return error; + } + + error = input_register_device(haptic->input_dev); + if (error) { + dev_err(&pdev->dev, "failed to register input device\n"); + return error; + } + + return 0; +} + +static int __maybe_unused regulator_haptic_suspend(struct device *dev) +{ + struct platform_device *pdev = to_platform_device(dev); + struct regulator_haptic *haptic = platform_get_drvdata(pdev); + int error; + + error = mutex_lock_interruptible(&haptic->mutex); + if (error) + return error; + + regulator_haptic_set_voltage(haptic, 0); + regulator_haptic_toggle(haptic, false); + + haptic->suspended = true; + + mutex_unlock(&haptic->mutex); + + return 0; +} + +static int __maybe_unused regulator_haptic_resume(struct device *dev) +{ + struct platform_device *pdev = to_platform_device(dev); + struct regulator_haptic *haptic = platform_get_drvdata(pdev); + unsigned int magnitude; + + mutex_lock(&haptic->mutex); + + haptic->suspended = false; + + magnitude = ACCESS_ONCE(haptic->magnitude); + if (magnitude) { + regulator_haptic_set_voltage(haptic, magnitude); + regulator_haptic_toggle(haptic, true); + } + + mutex_unlock(&haptic->mutex); + + return 0; +} + +static SIMPLE_DEV_PM_OPS(regulator_haptic_pm_ops, + regulator_haptic_suspend, regulator_haptic_resume); + +static struct of_device_id regulator_haptic_dt_match[] = { + { .compatible = "regulator-haptic" }, + { /* sentinel */ }, +}; + +static struct platform_driver regulator_haptic_driver = { + .probe = regulator_haptic_probe, + .driver = { + .name = "regulator-haptic", + .of_match_table = regulator_haptic_dt_match, + .pm = &regulator_haptic_pm_ops, + }, +}; +module_platform_driver(regulator_haptic_driver); + +MODULE_AUTHOR("Jaewon Kim "); +MODULE_AUTHOR("Hyunhee Kim "); +MODULE_DESCRIPTION("Regulator haptic driver"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/platform_data/regulator-haptic.h b/include/linux/platform_data/regulator-haptic.h new file mode 100644 index 000000000000..5658e58e0738 --- /dev/null +++ b/include/linux/platform_data/regulator-haptic.h @@ -0,0 +1,29 @@ +/* + * Regulator Haptic Platform Data + * + * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Author: Jaewon Kim + * Author: Hyunhee Kim + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _REGULATOR_HAPTIC_H +#define _REGULATOR_HAPTIC_H + +/* + * struct regulator_haptic_data - Platform device data + * + * @max_volt: maximum voltage value supplied to the haptic motor. + * + * @min_volt: minimum voltage value supplied to the haptic motor. + * + */ +struct regulator_haptic_data { + unsigned int max_volt; + unsigned int min_volt; +}; + +#endif /* _REGULATOR_HAPTIC_H */ -- cgit v1.2.3 From ceacbdbf65c4cf48a130db6152c6e03432c85ed1 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Mon, 17 Nov 2014 14:41:57 +0100 Subject: dmaengine: Make the destination abbreviation coherent The dmaengine header abbreviates destination as at least two different strings. Make a coherent use of a single one. Signed-off-by: Maxime Ripard Acked-by: Mark Brown Acked-by: Laurent Pinchart Acked-by: Stephen Warren Signed-off-by: Vinod Koul --- drivers/dma/at_xdmac.c | 2 +- drivers/dma/bcm2835-dma.c | 2 +- drivers/dma/edma.c | 2 +- drivers/dma/fsl-edma.c | 2 +- drivers/dma/nbpfaxi.c | 2 +- drivers/dma/omap-dma.c | 2 +- drivers/dma/pl330.c | 2 +- drivers/dma/sirf-dma.c | 2 +- include/linux/dmaengine.h | 8 ++++---- sound/soc/soc-generic-dmaengine-pcm.c | 2 +- 10 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c index b60d77a22df6..ff67466c779c 100644 --- a/drivers/dma/at_xdmac.c +++ b/drivers/dma/at_xdmac.c @@ -1229,7 +1229,7 @@ static int at_xdmac_device_slave_caps(struct dma_chan *dchan, { caps->src_addr_widths = AT_XDMAC_DMA_BUSWIDTHS; - caps->dstn_addr_widths = AT_XDMAC_DMA_BUSWIDTHS; + caps->dst_addr_widths = AT_XDMAC_DMA_BUSWIDTHS; caps->directions = BIT(DMA_DEV_TO_MEM) | BIT(DMA_MEM_TO_DEV); caps->cmd_pause = true; caps->cmd_terminate = true; diff --git a/drivers/dma/bcm2835-dma.c b/drivers/dma/bcm2835-dma.c index 918b7b3f766f..3feba6c25836 100644 --- a/drivers/dma/bcm2835-dma.c +++ b/drivers/dma/bcm2835-dma.c @@ -569,7 +569,7 @@ static int bcm2835_dma_device_slave_caps(struct dma_chan *dchan, struct dma_slave_caps *caps) { caps->src_addr_widths = BIT(DMA_SLAVE_BUSWIDTH_4_BYTES); - caps->dstn_addr_widths = BIT(DMA_SLAVE_BUSWIDTH_4_BYTES); + caps->dst_addr_widths = BIT(DMA_SLAVE_BUSWIDTH_4_BYTES); caps->directions = BIT(DMA_DEV_TO_MEM) | BIT(DMA_MEM_TO_DEV); caps->cmd_pause = false; caps->cmd_terminate = true; diff --git a/drivers/dma/edma.c b/drivers/dma/edma.c index b969206439b7..2b49fe6f0465 100644 --- a/drivers/dma/edma.c +++ b/drivers/dma/edma.c @@ -998,7 +998,7 @@ static int edma_dma_device_slave_caps(struct dma_chan *dchan, struct dma_slave_caps *caps) { caps->src_addr_widths = EDMA_DMA_BUSWIDTHS; - caps->dstn_addr_widths = EDMA_DMA_BUSWIDTHS; + caps->dst_addr_widths = EDMA_DMA_BUSWIDTHS; caps->directions = BIT(DMA_DEV_TO_MEM) | BIT(DMA_MEM_TO_DEV); caps->cmd_pause = true; caps->cmd_terminate = true; diff --git a/drivers/dma/fsl-edma.c b/drivers/dma/fsl-edma.c index e9ebb89e1711..ce6e960b78a7 100644 --- a/drivers/dma/fsl-edma.c +++ b/drivers/dma/fsl-edma.c @@ -784,7 +784,7 @@ static int fsl_dma_device_slave_caps(struct dma_chan *dchan, struct dma_slave_caps *caps) { caps->src_addr_widths = FSL_EDMA_BUSWIDTHS; - caps->dstn_addr_widths = FSL_EDMA_BUSWIDTHS; + caps->dst_addr_widths = FSL_EDMA_BUSWIDTHS; caps->directions = BIT(DMA_DEV_TO_MEM) | 
BIT(DMA_MEM_TO_DEV); caps->cmd_pause = true; caps->cmd_terminate = true; diff --git a/drivers/dma/nbpfaxi.c b/drivers/dma/nbpfaxi.c index d7d61e1a01c3..3d993e785180 100644 --- a/drivers/dma/nbpfaxi.c +++ b/drivers/dma/nbpfaxi.c @@ -1076,7 +1076,7 @@ static int nbpf_slave_caps(struct dma_chan *dchan, struct dma_slave_caps *caps) { caps->src_addr_widths = NBPF_DMA_BUSWIDTHS; - caps->dstn_addr_widths = NBPF_DMA_BUSWIDTHS; + caps->dst_addr_widths = NBPF_DMA_BUSWIDTHS; caps->directions = BIT(DMA_DEV_TO_MEM) | BIT(DMA_MEM_TO_DEV); caps->cmd_pause = false; caps->cmd_terminate = true; diff --git a/drivers/dma/omap-dma.c b/drivers/dma/omap-dma.c index c0016a68b446..ca4645c27634 100644 --- a/drivers/dma/omap-dma.c +++ b/drivers/dma/omap-dma.c @@ -1098,7 +1098,7 @@ static int omap_dma_device_slave_caps(struct dma_chan *dchan, struct dma_slave_caps *caps) { caps->src_addr_widths = OMAP_DMA_BUSWIDTHS; - caps->dstn_addr_widths = OMAP_DMA_BUSWIDTHS; + caps->dst_addr_widths = OMAP_DMA_BUSWIDTHS; caps->directions = BIT(DMA_DEV_TO_MEM) | BIT(DMA_MEM_TO_DEV); caps->cmd_pause = true; caps->cmd_terminate = true; diff --git a/drivers/dma/pl330.c b/drivers/dma/pl330.c index bdf40b530032..4a759c8718a8 100644 --- a/drivers/dma/pl330.c +++ b/drivers/dma/pl330.c @@ -2627,7 +2627,7 @@ static int pl330_dma_device_slave_caps(struct dma_chan *dchan, struct dma_slave_caps *caps) { caps->src_addr_widths = PL330_DMA_BUSWIDTHS; - caps->dstn_addr_widths = PL330_DMA_BUSWIDTHS; + caps->dst_addr_widths = PL330_DMA_BUSWIDTHS; caps->directions = BIT(DMA_DEV_TO_MEM) | BIT(DMA_MEM_TO_DEV); caps->cmd_pause = false; caps->cmd_terminate = true; diff --git a/drivers/dma/sirf-dma.c b/drivers/dma/sirf-dma.c index 3492a5f91d31..11c85fc450a1 100644 --- a/drivers/dma/sirf-dma.c +++ b/drivers/dma/sirf-dma.c @@ -652,7 +652,7 @@ static int sirfsoc_dma_device_slave_caps(struct dma_chan *dchan, struct dma_slave_caps *caps) { caps->src_addr_widths = SIRFSOC_DMA_BUSWIDTHS; - caps->dstn_addr_widths = SIRFSOC_DMA_BUSWIDTHS; + caps->dst_addr_widths = SIRFSOC_DMA_BUSWIDTHS; caps->directions = BIT(DMA_DEV_TO_MEM) | BIT(DMA_MEM_TO_DEV); caps->cmd_pause = true; caps->cmd_terminate = true; diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 40cd75e21ea2..03a1febe8740 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -387,7 +387,7 @@ enum dma_residue_granularity { /* struct dma_slave_caps - expose capabilities of a slave channel only * * @src_addr_widths: bit mask of src addr widths the channel supports - * @dstn_addr_widths: bit mask of dstn addr widths the channel supports + * @dst_addr_widths: bit mask of dstn addr widths the channel supports * @directions: bit mask of slave direction the channel supported * since the enum dma_transfer_direction is not defined as bits for each * type of direction, the dma controller should fill (1 << ) and same @@ -398,7 +398,7 @@ enum dma_residue_granularity { */ struct dma_slave_caps { u32 src_addr_widths; - u32 dstn_addr_widths; + u32 dst_addr_widths; u32 directions; bool cmd_pause; bool cmd_terminate; @@ -639,10 +639,10 @@ struct dma_device { void (*device_free_chan_resources)(struct dma_chan *chan); struct dma_async_tx_descriptor *(*device_prep_dma_memcpy)( - struct dma_chan *chan, dma_addr_t dest, dma_addr_t src, + struct dma_chan *chan, dma_addr_t dst, dma_addr_t src, size_t len, unsigned long flags); struct dma_async_tx_descriptor *(*device_prep_dma_xor)( - struct dma_chan *chan, dma_addr_t dest, dma_addr_t *src, + struct dma_chan *chan, dma_addr_t dst, dma_addr_t 
*src, unsigned int src_cnt, size_t len, unsigned long flags); struct dma_async_tx_descriptor *(*device_prep_dma_xor_val)( struct dma_chan *chan, dma_addr_t *src, unsigned int src_cnt, diff --git a/sound/soc/soc-generic-dmaengine-pcm.c b/sound/soc/soc-generic-dmaengine-pcm.c index b329b84bc5af..851f7afcd5dc 100644 --- a/sound/soc/soc-generic-dmaengine-pcm.c +++ b/sound/soc/soc-generic-dmaengine-pcm.c @@ -151,7 +151,7 @@ static int dmaengine_pcm_set_runtime_hwparams(struct snd_pcm_substream *substrea hw.info |= SNDRV_PCM_INFO_BATCH; if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) - addr_widths = dma_caps.dstn_addr_widths; + addr_widths = dma_caps.dst_addr_widths; else addr_widths = dma_caps.src_addr_widths; } -- cgit v1.2.3 From 94a73e30dfe6722e9f4ef19f7892901d7d00eab1 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Mon, 17 Nov 2014 14:42:00 +0100 Subject: dmaengine: Introduce a device_config callback The fact that the channel configuration is done in device_control is rather misleading, since it's not really advertised as such. On top of that, the framework exposing a function of its own makes it not very intuitive, and we lose type checking whenever we pass that unsigned long argument. Add a device_config callback to dma_device, with a fallback on the old behaviour for now, so that existing drivers can opt in over time. Signed-off-by: Maxime Ripard Acked-by: Laurent Pinchart Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 03a1febe8740..cf7c85d408ad 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -608,6 +608,8 @@ struct dma_tx_state { * The function takes a buffer of size buf_len. The callback function will * be called after period_len bytes have been transferred. * @device_prep_interleaved_dma: Transfer expression in a generic way. + * @device_config: Pushes a new configuration to a channel, return 0 or an error + * code * @device_control: manipulate all pending operations on a channel, returns * zero or error code * @device_tx_status: poll for transaction completion, the optional @@ -674,6 +676,9 @@ struct dma_device { struct dma_async_tx_descriptor *(*device_prep_interleaved_dma)( struct dma_chan *chan, struct dma_interleaved_template *xt, unsigned long flags); + + int (*device_config)(struct dma_chan *chan, + struct dma_slave_config *config); int (*device_control)(struct dma_chan *chan, enum dma_ctrl_cmd cmd, unsigned long arg); @@ -697,6 +702,9 @@ static inline int dmaengine_device_control(struct dma_chan *chan, static inline int dmaengine_slave_config(struct dma_chan *chan, struct dma_slave_config *config) { + if (chan->device->device_config) + return chan->device->device_config(chan, config); + return dmaengine_device_control(chan, DMA_SLAVE_CONFIG, (unsigned long)config); } -- cgit v1.2.3 From 23a3ea2f5bead4d3b16e119e9127a66234f41d53 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Mon, 17 Nov 2014 14:42:01 +0100 Subject: dmaengine: split out pause/resume operations from device_control Split out the pause and resume operations to callbacks of their own. In order to preserve some backward compatibility, dmaengine_pause/dmaengine_resume still fall back on dmaengine_device_control. Eventually, that will allow getting the device capabilities in a generic way, removing the need to implement device_slave_caps.
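As a hedged sketch of how a driver opts in once these callbacks exist (every foo_* name, register, and bit below is hypothetical, not taken from this series), the old device_control switch decomposes into small, typed callbacks:

    #include <linux/bitops.h>
    #include <linux/dmaengine.h>
    #include <linux/io.h>
    #include <linux/kernel.h>

    struct foo_chan {                      /* hypothetical driver channel state */
            struct dma_chan chan;
            struct dma_slave_config cfg;
            void __iomem *base;
    };

    #define FOO_CTRL        0x0            /* hypothetical control register */
    #define FOO_CTRL_PAUSE  BIT(0)
    #define FOO_CTRL_RUN    BIT(1)

    static inline struct foo_chan *to_foo_chan(struct dma_chan *c)
    {
            return container_of(c, struct foo_chan, chan);
    }

    static int foo_device_config(struct dma_chan *chan,
                                 struct dma_slave_config *config)
    {
            /* a type-checked struct replaces the old unsigned long argument */
            to_foo_chan(chan)->cfg = *config;
            return 0;
    }

    static int foo_device_pause(struct dma_chan *chan)
    {
            writel(FOO_CTRL_PAUSE, to_foo_chan(chan)->base + FOO_CTRL);
            return 0;
    }

    static int foo_device_resume(struct dma_chan *chan)
    {
            writel(FOO_CTRL_RUN, to_foo_chan(chan)->base + FOO_CTRL);
            return 0;
    }

    static void foo_fill_callbacks(struct dma_device *dma_dev)
    {
            /* wired up at probe time instead of a single .device_control */
            dma_dev->device_config = foo_device_config;
            dma_dev->device_pause = foo_device_pause;
            dma_dev->device_resume = foo_device_resume;
    }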
Signed-off-by: Maxime Ripard Acked-by: Laurent Pinchart Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index cf7c85d408ad..01f27e8a69b7 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -612,6 +612,10 @@ struct dma_tx_state { * code * @device_control: manipulate all pending operations on a channel, returns * zero or error code + * @device_pause: Pauses any transfer happening on a channel. Returns + * 0 or an error code + * @device_resume: Resumes any transfer on a channel previously + * paused. Returns 0 or an error code * @device_tx_status: poll for transaction completion, the optional * txstate parameter can be supplied with a pointer to get a * struct with auxiliary transfer status information, otherwise the call @@ -681,6 +685,8 @@ struct dma_device { struct dma_slave_config *config); int (*device_control)(struct dma_chan *chan, enum dma_ctrl_cmd cmd, unsigned long arg); + int (*device_pause)(struct dma_chan *chan); + int (*device_resume)(struct dma_chan *chan); enum dma_status (*device_tx_status)(struct dma_chan *chan, dma_cookie_t cookie, @@ -795,11 +801,17 @@ static inline int dmaengine_terminate_all(struct dma_chan *chan) static inline int dmaengine_pause(struct dma_chan *chan) { + if (chan->device->device_pause) + return chan->device->device_pause(chan); + return dmaengine_device_control(chan, DMA_PAUSE, 0); } static inline int dmaengine_resume(struct dma_chan *chan) { + if (chan->device->device_resume) + return chan->device->device_resume(chan); + return dmaengine_device_control(chan, DMA_RESUME, 0); } -- cgit v1.2.3 From 7fa0cf462daa6f6121b332b87833d7f5bdb515c0 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Mon, 17 Nov 2014 14:42:02 +0100 Subject: dmaengine: Add device_terminate_all callback Split out the terminate_all command from device_control to a dma_device callback. In order to preserve backward compatibility, still rely on device_control if no such callback has been implemented. Eventually, this will allow creating a generic dma_slave_caps callback. Signed-off-by: Maxime Ripard Acked-by: Laurent Pinchart Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 01f27e8a69b7..ded5161653aa 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -616,6 +616,8 @@ struct dma_tx_state { * 0 or an error code * @device_resume: Resumes any transfer on a channel previously * paused. Returns 0 or an error code + * @device_terminate_all: Aborts all transfers on a channel.
Returns 0 + * or an error code * @device_tx_status: poll for transaction completion, the optional * txstate parameter can be supplied with a pointer to get a * struct with auxiliary transfer status information, otherwise the call @@ -687,6 +689,7 @@ struct dma_device { unsigned long arg); int (*device_pause)(struct dma_chan *chan); int (*device_resume)(struct dma_chan *chan); + int (*device_terminate_all)(struct dma_chan *chan); enum dma_status (*device_tx_status)(struct dma_chan *chan, dma_cookie_t cookie, @@ -796,6 +799,9 @@ static inline int dma_get_slave_caps(struct dma_chan *chan, struct dma_slave_cap static inline int dmaengine_terminate_all(struct dma_chan *chan) { + if (chan->device->device_terminate_all) + return chan->device->device_terminate_all(chan); + return dmaengine_device_control(chan, DMA_TERMINATE_ALL, 0); } -- cgit v1.2.3 From cb8cea513c80db1dfe2dce468d2d0772005bb9a1 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Mon, 17 Nov 2014 14:42:04 +0100 Subject: dmaengine: Create a generic dma_slave_caps callback dma_slave_caps is very important to the generic layers that might interact with dmaengine, such as ASoC. Unfortunately, it has been added as yet another dma_device callback, and most of the existing drivers haven't implemented it, reducing its reliability. Introduce a generic behaviour to implement this, that rely on both the split of device_control to derive which functions are supported and on new variables to be set in the dma_device structure. These variables holds what used to be the capabilities, that were set per-channel. However, this proved to be a bit overkill, since every driver filling these so far were hardcoding it, disregarding which channel was actually given. Signed-off-by: Maxime Ripard Acked-by: Laurent Pinchart Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 41 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index ded5161653aa..adf22089cc93 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -594,6 +594,14 @@ struct dma_tx_state { * @fill_align: alignment shift for memset operations * @dev_id: unique device ID * @dev: struct device reference for dma mapping api + * @src_addr_widths: bit mask of src addr widths the device supports + * @dst_addr_widths: bit mask of dst addr widths the device supports + * @directions: bit mask of slave direction the device supports since + * the enum dma_transfer_direction is not defined as bits for + * each type of direction, the dma controller should fill (1 << + * ) and same should be checked by controller as well + * @residue_granularity: granularity of the transfer residue reported + * by tx_status * @device_alloc_chan_resources: allocate resources and return the * number of allocated descriptors * @device_free_chan_resources: release DMA channel's resources @@ -643,6 +651,11 @@ struct dma_device { int dev_id; struct device *dev; + u32 src_addr_widths; + u32 dst_addr_widths; + u32 directions; + enum dma_residue_granularity residue_granularity; + int (*device_alloc_chan_resources)(struct dma_chan *chan); void (*device_free_chan_resources)(struct dma_chan *chan); @@ -784,17 +797,37 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_sg( static inline int dma_get_slave_caps(struct dma_chan *chan, struct dma_slave_caps *caps) { + struct dma_device *device; + if (!chan || !caps) return -EINVAL; + device = chan->device; + /* check if 
the channel supports slave transactions */ - if (!test_bit(DMA_SLAVE, chan->device->cap_mask.bits)) + if (!test_bit(DMA_SLAVE, device->cap_mask.bits)) + return -ENXIO; + + if (device->device_slave_caps) + return device->device_slave_caps(chan, caps); + + /* + * Check whether it reports it uses the generic slave + * capabilities, if not, that means it doesn't support any + * kind of slave capabilities reporting. + */ + if (!device->directions) return -ENXIO; - if (chan->device->device_slave_caps) - return chan->device->device_slave_caps(chan, caps); + caps->src_addr_widths = device->src_addr_widths; + caps->dst_addr_widths = device->dst_addr_widths; + caps->directions = device->directions; + caps->residue_granularity = device->residue_granularity; + + caps->cmd_pause = !!device->device_pause; + caps->cmd_terminate = !!device->device_terminate_all; - return -ENXIO; + return 0; } static inline int dmaengine_terminate_all(struct dma_chan *chan) -- cgit v1.2.3 From 2c44ad914c56f4e53ef43285b5e4fe3459109769 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Mon, 17 Nov 2014 14:42:54 +0100 Subject: dmaengine: Remove device_control and device_slave_caps Now that device_control has been split into several functions, and device_slave_caps rendered useless, we can safely remove them. Signed-off-by: Maxime Ripard Acked-by: Laurent Pinchart Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 52 ++++++----------------------------------------- 1 file changed, 6 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index adf22089cc93..6d34ce91036c 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -188,25 +188,6 @@ enum dma_ctrl_flags { DMA_PREP_FENCE = (1 << 5), }; -/** - * enum dma_ctrl_cmd - DMA operations that can optionally be exercised - * on a running channel. - * @DMA_TERMINATE_ALL: terminate all ongoing transfers - * @DMA_PAUSE: pause ongoing transfers - * @DMA_RESUME: resume paused transfer - * @DMA_SLAVE_CONFIG: this command is only implemented by DMA controllers - * that need to runtime reconfigure the slave channels (as opposed to passing - * configuration data in statically from the platform). An additional - * argument of struct dma_slave_config must be passed in with this - * command. - */ -enum dma_ctrl_cmd { - DMA_TERMINATE_ALL, - DMA_PAUSE, - DMA_RESUME, - DMA_SLAVE_CONFIG, -}; - /** * enum sum_check_bits - bit position of pq_check_flags */ @@ -336,9 +317,8 @@ enum dma_slave_buswidth { * This struct is passed in as configuration data to a DMA engine * in order to set up a certain channel for DMA transport at runtime. * The DMA device/engine has to provide support for an additional - * command in the channel config interface, DMA_SLAVE_CONFIG - * and this struct will then be passed in as an argument to the - * DMA engine device_control() function. + * callback in the dma_device structure, device_config and this struct + * will then be passed in as an argument to the function. * * The rationale for adding configuration information to this struct is as * follows: if it is likely that more than one DMA slave controllers in @@ -618,8 +598,6 @@ struct dma_tx_state { * @device_prep_interleaved_dma: Transfer expression in a generic way. * @device_config: Pushes a new configuration to a channel, return 0 or an error * code - * @device_control: manipulate all pending operations on a channel, returns - * zero or error code * @device_pause: Pauses any transfer happening on a channel. 
Returns * 0 or an error code * @device_resume: Resumes any transfer on a channel previously @@ -631,7 +609,6 @@ struct dma_tx_state { * struct with auxiliary transfer status information, otherwise the call * will just return a simple status code * @device_issue_pending: push pending transactions to hardware - * @device_slave_caps: return the slave channel capabilities */ struct dma_device { @@ -698,8 +675,6 @@ struct dma_device { int (*device_config)(struct dma_chan *chan, struct dma_slave_config *config); - int (*device_control)(struct dma_chan *chan, enum dma_ctrl_cmd cmd, - unsigned long arg); int (*device_pause)(struct dma_chan *chan); int (*device_resume)(struct dma_chan *chan); int (*device_terminate_all)(struct dma_chan *chan); @@ -708,27 +683,15 @@ struct dma_device { dma_cookie_t cookie, struct dma_tx_state *txstate); void (*device_issue_pending)(struct dma_chan *chan); - int (*device_slave_caps)(struct dma_chan *chan, struct dma_slave_caps *caps); }; -static inline int dmaengine_device_control(struct dma_chan *chan, - enum dma_ctrl_cmd cmd, - unsigned long arg) -{ - if (chan->device->device_control) - return chan->device->device_control(chan, cmd, arg); - - return -ENOSYS; -} - static inline int dmaengine_slave_config(struct dma_chan *chan, struct dma_slave_config *config) { if (chan->device->device_config) return chan->device->device_config(chan, config); - return dmaengine_device_control(chan, DMA_SLAVE_CONFIG, - (unsigned long)config); + return -ENOSYS; } static inline bool is_slave_direction(enum dma_transfer_direction direction) @@ -808,9 +771,6 @@ static inline int dma_get_slave_caps(struct dma_chan *chan, struct dma_slave_cap if (!test_bit(DMA_SLAVE, device->cap_mask.bits)) return -ENXIO; - if (device->device_slave_caps) - return device->device_slave_caps(chan, caps); - /* * Check whether it reports it uses the generic slave * capabilities, if not, that means it doesn't support any @@ -835,7 +795,7 @@ static inline int dmaengine_terminate_all(struct dma_chan *chan) if (chan->device->device_terminate_all) return chan->device->device_terminate_all(chan); - return dmaengine_device_control(chan, DMA_TERMINATE_ALL, 0); + return -ENOSYS; } static inline int dmaengine_pause(struct dma_chan *chan) @@ -843,7 +803,7 @@ static inline int dmaengine_pause(struct dma_chan *chan) if (chan->device->device_pause) return chan->device->device_pause(chan); - return dmaengine_device_control(chan, DMA_PAUSE, 0); + return -ENOSYS; } static inline int dmaengine_resume(struct dma_chan *chan) @@ -851,7 +811,7 @@ static inline int dmaengine_resume(struct dma_chan *chan) if (chan->device->device_resume) return chan->device->device_resume(chan); - return dmaengine_device_control(chan, DMA_RESUME, 0); + return -ENOSYS; } static inline enum dma_status dmaengine_tx_status(struct dma_chan *chan, -- cgit v1.2.3 From 3a2c0ba5ad00c018c0bef39a2224aca950aa33f2 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 8 Dec 2014 16:50:37 +0800 Subject: hwrng: use reference counts on each struct hwrng. current_rng holds one reference, and we bump it every time we want to do a read from it. This means we only hold the rng_mutex to grab or drop a reference, so accessing /sys/devices/virtual/misc/hw_random/rng_current doesn't block on read of /dev/hwrng. Using a kref is overkill (we're always under the rng_mutex), but a standard pattern. This also solves the problem that the hwrng_fillfn thread was accessing current_rng without a lock, which could change (eg. to NULL) underneath it. 
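Condensed from the diff that follows, the read side becomes the pattern below (the wrapper function is illustrative only; the helpers are the ones this patch adds to core.c):

    static ssize_t read_current_rng(u8 *buf, size_t size, int wait)
    {
            struct hwrng *rng;
            ssize_t rc;

            rng = get_current_rng();        /* kref_get() under rng_mutex */
            if (IS_ERR(rng))
                    return PTR_ERR(rng);
            if (!rng)
                    return -ENODEV;

            /* rng stays valid here even if it is unregistered concurrently */
            rc = rng_get_data(rng, buf, size, wait);

            put_rng(rng);   /* dropping the last reference runs cleanup_rng() */
            return rc;
    }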
Signed-off-by: Rusty Russell Signed-off-by: Amos Kong Signed-off-by: Herbert Xu --- drivers/char/hw_random/core.c | 135 ++++++++++++++++++++++++++++-------------- include/linux/hw_random.h | 2 + 2 files changed, 94 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index b4c0e873d362..089c18dc579e 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -42,6 +42,7 @@ #include #include #include +#include #include @@ -91,6 +92,60 @@ static void add_early_randomness(struct hwrng *rng) add_device_randomness(bytes, bytes_read); } +static inline void cleanup_rng(struct kref *kref) +{ + struct hwrng *rng = container_of(kref, struct hwrng, ref); + + if (rng->cleanup) + rng->cleanup(rng); +} + +static void set_current_rng(struct hwrng *rng) +{ + BUG_ON(!mutex_is_locked(&rng_mutex)); + kref_get(&rng->ref); + current_rng = rng; +} + +static void drop_current_rng(void) +{ + BUG_ON(!mutex_is_locked(&rng_mutex)); + if (!current_rng) + return; + + /* decrease last reference for triggering the cleanup */ + kref_put(¤t_rng->ref, cleanup_rng); + current_rng = NULL; +} + +/* Returns ERR_PTR(), NULL or refcounted hwrng */ +static struct hwrng *get_current_rng(void) +{ + struct hwrng *rng; + + if (mutex_lock_interruptible(&rng_mutex)) + return ERR_PTR(-ERESTARTSYS); + + rng = current_rng; + if (rng) + kref_get(&rng->ref); + + mutex_unlock(&rng_mutex); + return rng; +} + +static void put_rng(struct hwrng *rng) +{ + /* + * Hold rng_mutex here so we serialize in case they set_current_rng + * on rng again immediately. + */ + mutex_lock(&rng_mutex); + if (rng) + kref_put(&rng->ref, cleanup_rng); + mutex_unlock(&rng_mutex); +} + static inline int hwrng_init(struct hwrng *rng) { if (rng->init) { @@ -113,12 +168,6 @@ static inline int hwrng_init(struct hwrng *rng) return 0; } -static inline void hwrng_cleanup(struct hwrng *rng) -{ - if (rng && rng->cleanup) - rng->cleanup(rng); -} - static int rng_dev_open(struct inode *inode, struct file *filp) { /* enforce read-only access to this chrdev */ @@ -154,21 +203,22 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, ssize_t ret = 0; int err = 0; int bytes_read, len; + struct hwrng *rng; while (size) { - if (mutex_lock_interruptible(&rng_mutex)) { - err = -ERESTARTSYS; + rng = get_current_rng(); + if (IS_ERR(rng)) { + err = PTR_ERR(rng); goto out; } - - if (!current_rng) { + if (!rng) { err = -ENODEV; - goto out_unlock; + goto out; } mutex_lock(&reading_mutex); if (!data_avail) { - bytes_read = rng_get_data(current_rng, rng_buffer, + bytes_read = rng_get_data(rng, rng_buffer, rng_buffer_size(), !(filp->f_flags & O_NONBLOCK)); if (bytes_read < 0) { @@ -200,8 +250,8 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, ret += len; } - mutex_unlock(&rng_mutex); mutex_unlock(&reading_mutex); + put_rng(rng); if (need_resched()) schedule_timeout_interruptible(1); @@ -213,12 +263,11 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, } out: return ret ? 
: err; -out_unlock: - mutex_unlock(&rng_mutex); - goto out; + out_unlock_reading: mutex_unlock(&reading_mutex); - goto out_unlock; + put_rng(rng); + goto out; } @@ -257,8 +306,8 @@ static ssize_t hwrng_attr_current_store(struct device *dev, err = hwrng_init(rng); if (err) break; - hwrng_cleanup(current_rng); - current_rng = rng; + drop_current_rng(); + set_current_rng(rng); err = 0; break; } @@ -272,17 +321,15 @@ static ssize_t hwrng_attr_current_show(struct device *dev, struct device_attribute *attr, char *buf) { - int err; ssize_t ret; - const char *name = "none"; + struct hwrng *rng; - err = mutex_lock_interruptible(&rng_mutex); - if (err) - return -ERESTARTSYS; - if (current_rng) - name = current_rng->name; - ret = snprintf(buf, PAGE_SIZE, "%s\n", name); - mutex_unlock(&rng_mutex); + rng = get_current_rng(); + if (IS_ERR(rng)) + return PTR_ERR(rng); + + ret = snprintf(buf, PAGE_SIZE, "%s\n", rng ? rng->name : "none"); + put_rng(rng); return ret; } @@ -353,12 +400,16 @@ static int hwrng_fillfn(void *unused) long rc; while (!kthread_should_stop()) { - if (!current_rng) + struct hwrng *rng; + + rng = get_current_rng(); + if (IS_ERR(rng) || !rng) break; mutex_lock(&reading_mutex); - rc = rng_get_data(current_rng, rng_fillbuf, + rc = rng_get_data(rng, rng_fillbuf, rng_buffer_size(), 1); mutex_unlock(&reading_mutex); + put_rng(rng); if (rc <= 0) { pr_warn("hwrng: no data available\n"); msleep_interruptible(10000); @@ -419,14 +470,13 @@ int hwrng_register(struct hwrng *rng) err = hwrng_init(rng); if (err) goto out_unlock; - current_rng = rng; + set_current_rng(rng); } err = 0; if (!old_rng) { err = register_miscdev(); if (err) { - hwrng_cleanup(rng); - current_rng = NULL; + drop_current_rng(); goto out_unlock; } } @@ -453,22 +503,21 @@ EXPORT_SYMBOL_GPL(hwrng_register); void hwrng_unregister(struct hwrng *rng) { - int err; - mutex_lock(&rng_mutex); list_del(&rng->list); if (current_rng == rng) { - hwrng_cleanup(rng); - if (list_empty(&rng_list)) { - current_rng = NULL; - } else { - current_rng = list_entry(rng_list.prev, struct hwrng, list); - err = hwrng_init(current_rng); - if (err) - current_rng = NULL; + drop_current_rng(); + if (!list_empty(&rng_list)) { + struct hwrng *tail; + + tail = list_entry(rng_list.prev, struct hwrng, list); + + if (hwrng_init(tail) == 0) + set_current_rng(tail); } } + if (list_empty(&rng_list)) { mutex_unlock(&rng_mutex); unregister_miscdev(); diff --git a/include/linux/hw_random.h b/include/linux/hw_random.h index 914bb08cd738..c212e71ea886 100644 --- a/include/linux/hw_random.h +++ b/include/linux/hw_random.h @@ -14,6 +14,7 @@ #include #include +#include /** * struct hwrng - Hardware Random Number Generator driver @@ -44,6 +45,7 @@ struct hwrng { /* internal. */ struct list_head list; + struct kref ref; }; /** Register a new Hardware Random Number Generator driver. */ -- cgit v1.2.3 From a027f30d72f2c4d27d6dd9bd053205d3102de7d1 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 8 Dec 2014 16:50:38 +0800 Subject: hwrng: fix unregister race. The previous patch added one potential problem: we can still be reading from a hwrng when it's unregistered. Add a wait for zero in the hwrng_unregister path. 
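For a driver this means hwrng_unregister() now blocks until readers have dropped their references and cleanup has run, so teardown is safe immediately afterwards. A sketch of a remove path relying on that guarantee (the foo_* driver state is hypothetical):

    struct foo_rng {                /* hypothetical driver private data */
            struct hwrng rng;
            struct clk *clk;
    };

    static int foo_rng_remove(struct platform_device *pdev)
    {
            struct foo_rng *priv = platform_get_drvdata(pdev);

            /* waits for cleanup_done and for the refcount to reach zero */
            hwrng_unregister(&priv->rng);

            /* no rng_get_data() can still be running on priv past this point */
            clk_disable_unprepare(priv->clk);
            return 0;
    }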
Signed-off-by: Rusty Russell Signed-off-by: Amos Kong Signed-off-by: Herbert Xu --- drivers/char/hw_random/core.c | 12 ++++++++++++ include/linux/hw_random.h | 1 + 2 files changed, 13 insertions(+) (limited to 'include/linux') diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index 089c18dc579e..8d609a026465 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -60,6 +60,7 @@ static DEFINE_MUTEX(rng_mutex); static DEFINE_MUTEX(reading_mutex); static int data_avail; static u8 *rng_buffer, *rng_fillbuf; +static DECLARE_WAIT_QUEUE_HEAD(rng_done); static unsigned short current_quality; static unsigned short default_quality; /* = 0; default to "off" */ @@ -98,6 +99,11 @@ static inline void cleanup_rng(struct kref *kref) if (rng->cleanup) rng->cleanup(rng); + + /* cleanup_done should be updated after cleanup finishes */ + smp_wmb(); + rng->cleanup_done = true; + wake_up_all(&rng_done); } static void set_current_rng(struct hwrng *rng) @@ -494,6 +500,8 @@ int hwrng_register(struct hwrng *rng) add_early_randomness(rng); } + rng->cleanup_done = false; + out_unlock: mutex_unlock(&rng_mutex); out: @@ -525,6 +533,10 @@ void hwrng_unregister(struct hwrng *rng) kthread_stop(hwrng_fill); } else mutex_unlock(&rng_mutex); + + /* Just in case rng is reading right now, wait. */ + wait_event(rng_done, rng->cleanup_done && + atomic_read(&rng->ref.refcount) == 0); } EXPORT_SYMBOL_GPL(hwrng_unregister); diff --git a/include/linux/hw_random.h b/include/linux/hw_random.h index c212e71ea886..7832e5008959 100644 --- a/include/linux/hw_random.h +++ b/include/linux/hw_random.h @@ -46,6 +46,7 @@ struct hwrng { /* internal. */ struct list_head list; struct kref ref; + bool cleanup_done; }; /** Register a new Hardware Random Number Generator driver. */ -- cgit v1.2.3 From c5f4546593e9911800f0926c1090959b58bc5c93 Mon Sep 17 00:00:00 2001 From: Seth Jennings Date: Tue, 16 Dec 2014 11:58:18 -0600 Subject: livepatch: kernel: add TAINT_LIVEPATCH This adds a new taint flag to indicate when the kernel or a kernel module has been live patched. This will provide a clean indication in bug reports that live patching was used. Additionally, if the crash occurs in a live patched function, the live patch module will appear beside the patched function in the backtrace. Signed-off-by: Seth Jennings Acked-by: Josh Poimboeuf Reviewed-by: Miroslav Benes Reviewed-by: Petr Mladek Reviewed-by: Masami Hiramatsu Signed-off-by: Jiri Kosina --- Documentation/oops-tracing.txt | 2 ++ Documentation/sysctl/kernel.txt | 1 + include/linux/kernel.h | 1 + kernel/panic.c | 2 ++ 4 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/Documentation/oops-tracing.txt b/Documentation/oops-tracing.txt index beefb9f82902..f3ac05cc23e4 100644 --- a/Documentation/oops-tracing.txt +++ b/Documentation/oops-tracing.txt @@ -270,6 +270,8 @@ characters, each representing a particular tainted value. 15: 'L' if a soft lockup has previously occurred on the system. + 16: 'K' if the kernel has been live patched. + The primary reason for the 'Tainted: ' string is to tell kernel debuggers if this is a clean kernel or if anything unusual has occurred. 
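The flag is set like any other taint flag; assuming the usual add_taint() interface, the patching core introduced later in this series would mark the kernel roughly like this (a sketch, not a line from this patch):

    /* taints with 'K' (bit 15, sysctl value 32768); lockdep state stays valid */
    add_taint(TAINT_LIVEPATCH, LOCKDEP_STILL_OK);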
Tainting is permanent: even if an offending module is diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 75511efefc64..83ab25660fc9 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -843,6 +843,7 @@ can be ORed together: 8192 - An unsigned module has been loaded in a kernel supporting module signature. 16384 - A soft lockup has previously occurred on the system. +32768 - The kernel has been live patched. ============================================================== diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 5449d2f4a1ef..d03e3deee091 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -471,6 +471,7 @@ extern enum system_states { #define TAINT_OOT_MODULE 12 #define TAINT_UNSIGNED_MODULE 13 #define TAINT_SOFTLOCKUP 14 +#define TAINT_LIVEPATCH 15 extern const char hex_asc[]; #define hex_asc_lo(x) hex_asc[((x) & 0x0f)] diff --git a/kernel/panic.c b/kernel/panic.c index 4d8d6f906dec..8136ad76e5fd 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -226,6 +226,7 @@ static const struct tnt tnts[] = { { TAINT_OOT_MODULE, 'O', ' ' }, { TAINT_UNSIGNED_MODULE, 'E', ' ' }, { TAINT_SOFTLOCKUP, 'L', ' ' }, + { TAINT_LIVEPATCH, 'K', ' ' }, }; /** @@ -246,6 +247,7 @@ static const struct tnt tnts[] = { * 'O' - Out-of-tree module has been loaded. * 'E' - Unsigned module has been loaded. * 'L' - A soft lockup has previously occurred. + * 'K' - Kernel has been live patched. * * The string is overwritten by the next call to print_tainted(). */ -- cgit v1.2.3 From b700e7f03df5d92f85fa5247fe1f557528d3363d Mon Sep 17 00:00:00 2001 From: Seth Jennings Date: Tue, 16 Dec 2014 11:58:19 -0600 Subject: livepatch: kernel: add support for live patching This commit introduces code for the live patching core. It implements an ftrace-based mechanism and kernel interface for doing live patching of kernel and kernel module functions. It represents the greatest common functionality set between kpatch and kgraft and can accept patches built using either method. This first version does not implement any consistency mechanism that ensures that old and new code do not run together. In practice, ~90% of CVEs are safe to apply in this way, since they simply add a conditional check. However, any function change that can not execute safely with the old version of the function can _not_ be safely applied in this version. 
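To make the interface concrete, a minimal patch module might look as follows. This is a sketch that assumes the klp_func/klp_object/klp_patch structures and the klp_register_patch()/klp_enable_patch() entry points match the include/linux/livepatch.h added here (its hunk is not shown in full below), and it patches cmdline_proc_show purely for illustration:

    #include <linux/module.h>
    #include <linux/kernel.h>
    #include <linux/livepatch.h>
    #include <linux/seq_file.h>

    /* replacement function: same signature as the function being patched */
    static int livepatch_cmdline_proc_show(struct seq_file *m, void *v)
    {
            seq_printf(m, "%s\n", "this has been live patched");
            return 0;
    }

    static struct klp_func funcs[] = {
            {
                    .old_name = "cmdline_proc_show",
                    .new_func = livepatch_cmdline_proc_show,
            }, { }
    };

    static struct klp_object objs[] = {
            {
                    /* a NULL name means the object is vmlinux itself */
                    .funcs = funcs,
            }, { }
    };

    static struct klp_patch patch = {
            .mod = THIS_MODULE,
            .objs = objs,
    };

    static int livepatch_init(void)
    {
            int ret;

            ret = klp_register_patch(&patch);
            if (ret)
                    return ret;
            ret = klp_enable_patch(&patch);
            if (ret) {
                    WARN_ON(klp_unregister_patch(&patch));
                    return ret;
            }
            return 0;
    }

    static void livepatch_exit(void)
    {
            WARN_ON(klp_disable_patch(&patch));
            WARN_ON(klp_unregister_patch(&patch));
    }

    module_init(livepatch_init);
    module_exit(livepatch_exit);
    MODULE_LICENSE("GPL");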
[ jkosina@suse.cz: due to the number of contributions that got folded into this original patch from Seth Jennings, add SUSE's copyright as well, as discussed via e-mail ] Signed-off-by: Seth Jennings Signed-off-by: Josh Poimboeuf Reviewed-by: Miroslav Benes Reviewed-by: Petr Mladek Reviewed-by: Masami Hiramatsu Signed-off-by: Miroslav Benes Signed-off-by: Petr Mladek Signed-off-by: Jiri Kosina --- Documentation/ABI/testing/sysfs-kernel-livepatch | 44 ++ MAINTAINERS | 13 + arch/x86/Kconfig | 3 + arch/x86/include/asm/livepatch.h | 37 + arch/x86/kernel/Makefile | 1 + arch/x86/kernel/livepatch.c | 90 +++ include/linux/livepatch.h | 133 ++++ kernel/Makefile | 1 + kernel/livepatch/Kconfig | 18 + kernel/livepatch/Makefile | 3 + kernel/livepatch/core.c | 930 +++++++++++++++++++++++ 11 files changed, 1273 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-kernel-livepatch create mode 100644 arch/x86/include/asm/livepatch.h create mode 100644 arch/x86/kernel/livepatch.c create mode 100644 include/linux/livepatch.h create mode 100644 kernel/livepatch/Kconfig create mode 100644 kernel/livepatch/Makefile create mode 100644 kernel/livepatch/core.c (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-kernel-livepatch b/Documentation/ABI/testing/sysfs-kernel-livepatch new file mode 100644 index 000000000000..5bf42a840b22 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-livepatch @@ -0,0 +1,44 @@ +What: /sys/kernel/livepatch +Date: Nov 2014 +KernelVersion: 3.19.0 +Contact: live-patching@vger.kernel.org +Description: + Interface for kernel live patching + + The /sys/kernel/livepatch directory contains subdirectories for + each loaded live patch module. + +What: /sys/kernel/livepatch/ +Date: Nov 2014 +KernelVersion: 3.19.0 +Contact: live-patching@vger.kernel.org +Description: + The patch directory contains subdirectories for each kernel + object (vmlinux or a module) in which it patched functions. + +What: /sys/kernel/livepatch//enabled +Date: Nov 2014 +KernelVersion: 3.19.0 +Contact: live-patching@vger.kernel.org +Description: + A writable attribute that indicates whether the patched + code is currently applied. Writing 0 will disable the patch + while writing 1 will re-enable the patch. + +What: /sys/kernel/livepatch// +Date: Nov 2014 +KernelVersion: 3.19.0 +Contact: live-patching@vger.kernel.org +Description: + The object directory contains subdirectories for each function + that is patched within the object. + +What: /sys/kernel/livepatch/// +Date: Nov 2014 +KernelVersion: 3.19.0 +Contact: live-patching@vger.kernel.org +Description: + The function directory contains attributes regarding the + properties and state of the patched function. + + There are currently no such attributes. 
diff --git a/MAINTAINERS b/MAINTAINERS index ddb9ac8d32b3..df6a0784b466 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5784,6 +5784,19 @@ F: Documentation/misc-devices/lis3lv02d F: drivers/misc/lis3lv02d/ F: drivers/platform/x86/hp_accel.c +LIVE PATCHING +M: Josh Poimboeuf +M: Seth Jennings +M: Jiri Kosina +M: Vojtech Pavlik +S: Maintained +F: kernel/livepatch/ +F: include/linux/livepatch.h +F: arch/x86/include/asm/livepatch.h +F: arch/x86/kernel/livepatch.c +F: Documentation/ABI/testing/sysfs-kernel-livepatch +L: live-patching@vger.kernel.org + LLC (802.2) M: Arnaldo Carvalho de Melo S: Maintained diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ba397bde7948..460b31b79938 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -17,6 +17,7 @@ config X86_64 depends on 64BIT select X86_DEV_DMA_OPS select ARCH_USE_CMPXCHG_LOCKREF + select ARCH_HAVE_LIVE_PATCHING ### Arch settings config X86 @@ -2008,6 +2009,8 @@ config CMDLINE_OVERRIDE This is used to work around broken boot loaders. This should be set to 'N' under normal conditions. +source "kernel/livepatch/Kconfig" + endmenu config ARCH_ENABLE_MEMORY_HOTPLUG diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h new file mode 100644 index 000000000000..d529db1b1edf --- /dev/null +++ b/arch/x86/include/asm/livepatch.h @@ -0,0 +1,37 @@ +/* + * livepatch.h - x86-specific Kernel Live Patching Core + * + * Copyright (C) 2014 Seth Jennings + * Copyright (C) 2014 SUSE + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ + +#ifndef _ASM_X86_LIVEPATCH_H +#define _ASM_X86_LIVEPATCH_H + +#include + +#ifdef CONFIG_LIVE_PATCHING +#ifndef CC_USING_FENTRY +#error Your compiler must support -mfentry for live patching to work +#endif +extern int klp_write_module_reloc(struct module *mod, unsigned long type, + unsigned long loc, unsigned long value); + +#else +#error Live patching support is disabled; check CONFIG_LIVE_PATCHING +#endif + +#endif /* _ASM_X86_LIVEPATCH_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5d4502c8b983..316b34e74c15 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -63,6 +63,7 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-y += apic/ obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o +obj-$(CONFIG_LIVE_PATCHING) += livepatch.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_X86_TSC) += trace_clock.o diff --git a/arch/x86/kernel/livepatch.c b/arch/x86/kernel/livepatch.c new file mode 100644 index 000000000000..ff3c3101d003 --- /dev/null +++ b/arch/x86/kernel/livepatch.c @@ -0,0 +1,90 @@ +/* + * livepatch.c - x86-specific Kernel Live Patching Core + * + * Copyright (C) 2014 Seth Jennings + * Copyright (C) 2014 SUSE + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include +#include +#include +#include +#include +#include + +/** + * klp_write_module_reloc() - write a relocation in a module + * @mod: module in which the section to be modified is found + * @type: ELF relocation type (see asm/elf.h) + * @loc: address that the relocation should be written to + * @value: relocation value (sym address + addend) + * + * This function writes a relocation to the specified location for + * a particular module. + */ +int klp_write_module_reloc(struct module *mod, unsigned long type, + unsigned long loc, unsigned long value) +{ + int ret, numpages, size = 4; + bool readonly; + unsigned long val; + unsigned long core = (unsigned long)mod->module_core; + unsigned long core_ro_size = mod->core_ro_size; + unsigned long core_size = mod->core_size; + + switch (type) { + case R_X86_64_NONE: + return 0; + case R_X86_64_64: + val = value; + size = 8; + break; + case R_X86_64_32: + val = (u32)value; + break; + case R_X86_64_32S: + val = (s32)value; + break; + case R_X86_64_PC32: + val = (u32)(value - loc); + break; + default: + /* unsupported relocation type */ + return -EINVAL; + } + + if (loc < core || loc >= core + core_size) + /* loc does not point to any symbol inside the module */ + return -EINVAL; + + if (loc < core + core_ro_size) + readonly = true; + else + readonly = false; + + /* determine if the relocation spans a page boundary */ + numpages = ((loc & PAGE_MASK) == ((loc + size) & PAGE_MASK)) ? 
1 : 2; + + if (readonly) + set_memory_rw(loc & PAGE_MASK, numpages); + + ret = probe_kernel_write((void *)loc, &val, size); + + if (readonly) + set_memory_ro(loc & PAGE_MASK, numpages); + + return ret; +} diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h new file mode 100644 index 000000000000..950bc615842f --- /dev/null +++ b/include/linux/livepatch.h @@ -0,0 +1,133 @@ +/* + * livepatch.h - Kernel Live Patching Core + * + * Copyright (C) 2014 Seth Jennings + * Copyright (C) 2014 SUSE + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#ifndef _LINUX_LIVEPATCH_H_ +#define _LINUX_LIVEPATCH_H_ + +#include +#include + +#if IS_ENABLED(CONFIG_LIVE_PATCHING) + +#include + +enum klp_state { + KLP_DISABLED, + KLP_ENABLED +}; + +/** + * struct klp_func - function structure for live patching + * @old_name: name of the function to be patched + * @new_func: pointer to the patched function code + * @old_addr: a hint conveying at what address the old function + * can be found (optional, vmlinux patches only) + * @kobj: kobject for sysfs resources + * @fops: ftrace operations structure + * @state: tracks function-level patch application state + */ +struct klp_func { + /* external */ + const char *old_name; + void *new_func; + /* + * The old_addr field is optional and can be used to resolve + * duplicate symbol names in the vmlinux object. If this + * information is not present, the symbol is located by name + * with kallsyms. If the name is not unique and old_addr is + * not provided, the patch application fails as there is no + * way to resolve the ambiguity. 
+ */ + unsigned long old_addr; + + /* internal */ + struct kobject kobj; + struct ftrace_ops *fops; + enum klp_state state; +}; + +/** + * struct klp_reloc - relocation structure for live patching + * @loc: address where the relocation will be written + * @val: address of the referenced symbol (optional, + * vmlinux patches only) + * @type: ELF relocation type + * @name: name of the referenced symbol (for lookup/verification) + * @addend: offset from the referenced symbol + * @external: symbol is either exported or within the live patch module itself + */ +struct klp_reloc { + unsigned long loc; + unsigned long val; + unsigned long type; + const char *name; + int addend; + int external; +}; + +/** + * struct klp_object - kernel object structure for live patching + * @name: module name (or NULL for vmlinux) + * @relocs: relocation entries to be applied at load time + * @funcs: function entries for functions to be patched in the object + * @kobj: kobject for sysfs resources + * @mod: kernel module associated with the patched object + * (NULL for vmlinux) + * @state: tracks object-level patch application state + */ +struct klp_object { + /* external */ + const char *name; + struct klp_reloc *relocs; + struct klp_func *funcs; + + /* internal */ + struct kobject *kobj; + struct module *mod; + enum klp_state state; +}; + +/** + * struct klp_patch - patch structure for live patching + * @mod: reference to the live patch module + * @objs: object entries for kernel objects to be patched + * @list: list node for global list of registered patches + * @kobj: kobject for sysfs resources + * @state: tracks patch-level application state + */ +struct klp_patch { + /* external */ + struct module *mod; + struct klp_object *objs; + + /* internal */ + struct list_head list; + struct kobject kobj; + enum klp_state state; +}; + +extern int klp_register_patch(struct klp_patch *); +extern int klp_unregister_patch(struct klp_patch *); +extern int klp_enable_patch(struct klp_patch *); +extern int klp_disable_patch(struct klp_patch *); + +#endif /* CONFIG_LIVE_PATCHING */ + +#endif /* _LINUX_LIVEPATCH_H_ */ diff --git a/kernel/Makefile b/kernel/Makefile index a59481a3fa6c..616994f0a76f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -26,6 +26,7 @@ obj-y += power/ obj-y += printk/ obj-y += irq/ obj-y += rcu/ +obj-y += livepatch/ obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o obj-$(CONFIG_FREEZER) += freezer.o diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig new file mode 100644 index 000000000000..96da00fbc120 --- /dev/null +++ b/kernel/livepatch/Kconfig @@ -0,0 +1,18 @@ +config ARCH_HAVE_LIVE_PATCHING + boolean + help + Arch supports kernel live patching + +config LIVE_PATCHING + boolean "Kernel Live Patching" + depends on DYNAMIC_FTRACE_WITH_REGS + depends on MODULES + depends on SYSFS + depends on KALLSYMS_ALL + depends on ARCH_HAVE_LIVE_PATCHING + help + Say Y here if you want to support kernel live patching. + This option has no runtime impact until a kernel "patch" + module uses the interface provided by this option to register + a patch, causing calls to patched functions to be redirected + to new function code contained in the patch module. 
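Taken together, klp_func, klp_object and klp_patch are meant to be laid out by the patch module as NULL-terminated arrays; the core code below walks them with for (func = obj->funcs; func->old_name; func++) and for (obj = patch->objs; obj->funcs; obj++). A minimal sketch of a patch module built against this interface follows; the patched symbol "example_fn" and the replacement body are illustrative only:

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/livepatch.h>

/* Replacement body for the function being patched. */
static int livepatch_example_fn(void)
{
	pr_info("patched function called\n");
	return 0;
}

static struct klp_func funcs[] = {
	{
		.old_name = "example_fn",
		.new_func = livepatch_example_fn,
	}, { }		/* entry with NULL old_name terminates the array */
};

static struct klp_object objs[] = {
	{
		.name = NULL,	/* NULL name means the vmlinux object */
		.funcs = funcs,
	}, { }		/* entry with NULL funcs terminates the array */
};

static struct klp_patch patch = {
	.mod = THIS_MODULE,
	.objs = objs,
};

static int livepatch_example_init(void)
{
	int ret;

	ret = klp_register_patch(&patch);
	if (ret)
		return ret;
	ret = klp_enable_patch(&patch);
	if (ret) {
		WARN_ON(klp_unregister_patch(&patch));
		return ret;
	}
	return 0;
}

static void livepatch_example_exit(void)
{
	/* a patch must be disabled before it can be unregistered */
	WARN_ON(klp_disable_patch(&patch));
	WARN_ON(klp_unregister_patch(&patch));
}

module_init(livepatch_example_init);
module_exit(livepatch_example_exit);
MODULE_LICENSE("GPL");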
diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile new file mode 100644 index 000000000000..7c1f00861428 --- /dev/null +++ b/kernel/livepatch/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_LIVE_PATCHING) += livepatch.o + +livepatch-objs := core.o diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c new file mode 100644 index 000000000000..f99fe189d596 --- /dev/null +++ b/kernel/livepatch/core.c @@ -0,0 +1,930 @@ +/* + * core.c - Kernel Live Patching Core + * + * Copyright (C) 2014 Seth Jennings + * Copyright (C) 2014 SUSE + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * The klp_mutex protects the klp_patches list and state transitions of any + * structure reachable from the patches list. References to any structure must + * be obtained under mutex protection. + */ + +static DEFINE_MUTEX(klp_mutex); +static LIST_HEAD(klp_patches); + +static struct kobject *klp_root_kobj; + +static bool klp_is_module(struct klp_object *obj) +{ + return obj->name; +} + +static bool klp_is_object_loaded(struct klp_object *obj) +{ + return !obj->name || obj->mod; +} + +/* sets obj->mod if object is not vmlinux and module is found */ +static void klp_find_object_module(struct klp_object *obj) +{ + if (!klp_is_module(obj)) + return; + + mutex_lock(&module_mutex); + /* + * We don't need to take a reference on the module here because we have + * the klp_mutex, which is also taken by the module notifier. This + * prevents any module from unloading until we release the klp_mutex. + */ + obj->mod = find_module(obj->name); + mutex_unlock(&module_mutex); +} + +/* klp_mutex must be held by caller */ +static bool klp_is_patch_registered(struct klp_patch *patch) +{ + struct klp_patch *mypatch; + + list_for_each_entry(mypatch, &klp_patches, list) + if (mypatch == patch) + return true; + + return false; +} + +static bool klp_initialized(void) +{ + return klp_root_kobj; +} + +struct klp_find_arg { + const char *objname; + const char *name; + unsigned long addr; + /* + * If count == 0, the symbol was not found. If count == 1, a unique + * match was found and addr is set. If count > 1, there is + * unresolvable ambiguity among "count" number of symbols with the same + * name in the same object. + */ + unsigned long count; +}; + +static int klp_find_callback(void *data, const char *name, + struct module *mod, unsigned long addr) +{ + struct klp_find_arg *args = data; + + if ((mod && !args->objname) || (!mod && args->objname)) + return 0; + + if (strcmp(args->name, name)) + return 0; + + if (args->objname && strcmp(args->objname, mod->name)) + return 0; + + /* + * args->addr might be overwritten if another match is found + * but klp_find_object_symbol() handles this and only returns the + * addr if count == 1. 
+ */ + args->addr = addr; + args->count++; + + return 0; +} + +static int klp_find_object_symbol(const char *objname, const char *name, + unsigned long *addr) +{ + struct klp_find_arg args = { + .objname = objname, + .name = name, + .addr = 0, + .count = 0 + }; + + kallsyms_on_each_symbol(klp_find_callback, &args); + + if (args.count == 0) + pr_err("symbol '%s' not found in symbol table\n", name); + else if (args.count > 1) + pr_err("unresolvable ambiguity (%lu matches) on symbol '%s' in object '%s'\n", + args.count, name, objname); + else { + *addr = args.addr; + return 0; + } + + *addr = 0; + return -EINVAL; +} + +struct klp_verify_args { + const char *name; + const unsigned long addr; +}; + +static int klp_verify_callback(void *data, const char *name, + struct module *mod, unsigned long addr) +{ + struct klp_verify_args *args = data; + + if (!mod && + !strcmp(args->name, name) && + args->addr == addr) + return 1; + + return 0; +} + +static int klp_verify_vmlinux_symbol(const char *name, unsigned long addr) +{ + struct klp_verify_args args = { + .name = name, + .addr = addr, + }; + + if (kallsyms_on_each_symbol(klp_verify_callback, &args)) + return 0; + + pr_err("symbol '%s' not found at specified address 0x%016lx, kernel mismatch?", + name, addr); + return -EINVAL; +} + +static int klp_find_verify_func_addr(struct klp_object *obj, + struct klp_func *func) +{ + int ret; + +#if defined(CONFIG_RANDOMIZE_BASE) + /* KASLR is enabled, disregard old_addr from user */ + func->old_addr = 0; +#endif + + if (!func->old_addr || klp_is_module(obj)) + ret = klp_find_object_symbol(obj->name, func->old_name, + &func->old_addr); + else + ret = klp_verify_vmlinux_symbol(func->old_name, + func->old_addr); + + return ret; +} + +/* + * external symbols are located outside the parent object (where the parent + * object is either vmlinux or the kmod being patched). 
+ */ +static int klp_find_external_symbol(struct module *pmod, const char *name, + unsigned long *addr) +{ + const struct kernel_symbol *sym; + + /* first, check if it's an exported symbol */ + preempt_disable(); + sym = find_symbol(name, NULL, NULL, true, true); + preempt_enable(); + if (sym) { + *addr = sym->value; + return 0; + } + + /* otherwise check if it's in another .o within the patch module */ + return klp_find_object_symbol(pmod->name, name, addr); +} + +static int klp_write_object_relocations(struct module *pmod, + struct klp_object *obj) +{ + int ret; + struct klp_reloc *reloc; + + if (WARN_ON(!klp_is_object_loaded(obj))) + return -EINVAL; + + if (WARN_ON(!obj->relocs)) + return -EINVAL; + + for (reloc = obj->relocs; reloc->name; reloc++) { + if (!klp_is_module(obj)) { + ret = klp_verify_vmlinux_symbol(reloc->name, + reloc->val); + if (ret) + return ret; + } else { + /* module, reloc->val needs to be discovered */ + if (reloc->external) + ret = klp_find_external_symbol(pmod, + reloc->name, + &reloc->val); + else + ret = klp_find_object_symbol(obj->mod->name, + reloc->name, + &reloc->val); + if (ret) + return ret; + } + ret = klp_write_module_reloc(pmod, reloc->type, reloc->loc, + reloc->val + reloc->addend); + if (ret) { + pr_err("relocation failed for symbol '%s' at 0x%016lx (%d)\n", + reloc->name, reloc->val, ret); + return ret; + } + } + + return 0; +} + +static void notrace klp_ftrace_handler(unsigned long ip, + unsigned long parent_ip, + struct ftrace_ops *ops, + struct pt_regs *regs) +{ + struct klp_func *func = ops->private; + + regs->ip = (unsigned long)func->new_func; +} + +static int klp_disable_func(struct klp_func *func) +{ + int ret; + + if (WARN_ON(func->state != KLP_ENABLED)) + return -EINVAL; + + if (WARN_ON(!func->old_addr)) + return -EINVAL; + + ret = unregister_ftrace_function(func->fops); + if (ret) { + pr_err("failed to unregister ftrace handler for function '%s' (%d)\n", + func->old_name, ret); + return ret; + } + + ret = ftrace_set_filter_ip(func->fops, func->old_addr, 1, 0); + if (ret) + pr_warn("function unregister succeeded but failed to clear the filter\n"); + + func->state = KLP_DISABLED; + + return 0; +} + +static int klp_enable_func(struct klp_func *func) +{ + int ret; + + if (WARN_ON(!func->old_addr)) + return -EINVAL; + + if (WARN_ON(func->state != KLP_DISABLED)) + return -EINVAL; + + ret = ftrace_set_filter_ip(func->fops, func->old_addr, 0, 0); + if (ret) { + pr_err("failed to set ftrace filter for function '%s' (%d)\n", + func->old_name, ret); + return ret; + } + + ret = register_ftrace_function(func->fops); + if (ret) { + pr_err("failed to register ftrace handler for function '%s' (%d)\n", + func->old_name, ret); + ftrace_set_filter_ip(func->fops, func->old_addr, 1, 0); + } else { + func->state = KLP_ENABLED; + } + + return ret; +} + +static int klp_disable_object(struct klp_object *obj) +{ + struct klp_func *func; + int ret; + + for (func = obj->funcs; func->old_name; func++) { + if (func->state != KLP_ENABLED) + continue; + + ret = klp_disable_func(func); + if (ret) + return ret; + } + + obj->state = KLP_DISABLED; + + return 0; +} + +static int klp_enable_object(struct klp_object *obj) +{ + struct klp_func *func; + int ret; + + if (WARN_ON(obj->state != KLP_DISABLED)) + return -EINVAL; + + if (WARN_ON(!klp_is_object_loaded(obj))) + return -EINVAL; + + for (func = obj->funcs; func->old_name; func++) { + ret = klp_enable_func(func); + if (ret) + goto unregister; + } + obj->state = KLP_ENABLED; + + return 0; + +unregister: + 
WARN_ON(klp_disable_object(obj)); + return ret; +} + +static int __klp_disable_patch(struct klp_patch *patch) +{ + struct klp_object *obj; + int ret; + + pr_notice("disabling patch '%s'\n", patch->mod->name); + + for (obj = patch->objs; obj->funcs; obj++) { + if (obj->state != KLP_ENABLED) + continue; + + ret = klp_disable_object(obj); + if (ret) + return ret; + } + + patch->state = KLP_DISABLED; + + return 0; +} + +/** + * klp_disable_patch() - disables a registered patch + * @patch: The registered, enabled patch to be disabled + * + * Unregisters the patched functions from ftrace. + * + * Return: 0 on success, otherwise error + */ +int klp_disable_patch(struct klp_patch *patch) +{ + int ret; + + mutex_lock(&klp_mutex); + + if (!klp_is_patch_registered(patch)) { + ret = -EINVAL; + goto err; + } + + if (patch->state == KLP_DISABLED) { + ret = -EINVAL; + goto err; + } + + ret = __klp_disable_patch(patch); + +err: + mutex_unlock(&klp_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(klp_disable_patch); + +static int __klp_enable_patch(struct klp_patch *patch) +{ + struct klp_object *obj; + int ret; + + if (WARN_ON(patch->state != KLP_DISABLED)) + return -EINVAL; + + pr_notice_once("tainting kernel with TAINT_LIVEPATCH\n"); + add_taint(TAINT_LIVEPATCH, LOCKDEP_STILL_OK); + + pr_notice("enabling patch '%s'\n", patch->mod->name); + + for (obj = patch->objs; obj->funcs; obj++) { + klp_find_object_module(obj); + + if (!klp_is_object_loaded(obj)) + continue; + + ret = klp_enable_object(obj); + if (ret) + goto unregister; + } + + patch->state = KLP_ENABLED; + + return 0; + +unregister: + WARN_ON(__klp_disable_patch(patch)); + return ret; +} + +/** + * klp_enable_patch() - enables a registered patch + * @patch: The registered, disabled patch to be enabled + * + * Performs the needed symbol lookups and code relocations, + * then registers the patched functions with ftrace. 
+ * + * Return: 0 on success, otherwise error + */ +int klp_enable_patch(struct klp_patch *patch) +{ + int ret; + + mutex_lock(&klp_mutex); + + if (!klp_is_patch_registered(patch)) { + ret = -EINVAL; + goto err; + } + + ret = __klp_enable_patch(patch); + +err: + mutex_unlock(&klp_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(klp_enable_patch); + +/* + * Sysfs Interface + * + * /sys/kernel/livepatch + * /sys/kernel/livepatch/ + * /sys/kernel/livepatch//enabled + * /sys/kernel/livepatch// + * /sys/kernel/livepatch/// + */ + +static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct klp_patch *patch; + int ret; + unsigned long val; + + ret = kstrtoul(buf, 10, &val); + if (ret) + return -EINVAL; + + if (val != KLP_DISABLED && val != KLP_ENABLED) + return -EINVAL; + + patch = container_of(kobj, struct klp_patch, kobj); + + mutex_lock(&klp_mutex); + + if (val == patch->state) { + /* already in requested state */ + ret = -EINVAL; + goto err; + } + + if (val == KLP_ENABLED) { + ret = __klp_enable_patch(patch); + if (ret) + goto err; + } else { + ret = __klp_disable_patch(patch); + if (ret) + goto err; + } + + mutex_unlock(&klp_mutex); + + return count; + +err: + mutex_unlock(&klp_mutex); + return ret; +} + +static ssize_t enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct klp_patch *patch; + + patch = container_of(kobj, struct klp_patch, kobj); + return snprintf(buf, PAGE_SIZE-1, "%d\n", patch->state); +} + +static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled); +static struct attribute *klp_patch_attrs[] = { + &enabled_kobj_attr.attr, + NULL +}; + +static void klp_kobj_release_patch(struct kobject *kobj) +{ + /* + * Once we have a consistency model we'll need to module_put() the + * patch module here. See klp_register_patch() for more details. + */ +} + +static struct kobj_type klp_ktype_patch = { + .release = klp_kobj_release_patch, + .sysfs_ops = &kobj_sysfs_ops, + .default_attrs = klp_patch_attrs, +}; + +static void klp_kobj_release_func(struct kobject *kobj) +{ + struct klp_func *func; + + func = container_of(kobj, struct klp_func, kobj); + kfree(func->fops); +} + +static struct kobj_type klp_ktype_func = { + .release = klp_kobj_release_func, + .sysfs_ops = &kobj_sysfs_ops, +}; + +/* + * Free all functions' kobjects in the array up to some limit. When limit is + * NULL, all kobjects are freed. + */ +static void klp_free_funcs_limited(struct klp_object *obj, + struct klp_func *limit) +{ + struct klp_func *func; + + for (func = obj->funcs; func->old_name && func != limit; func++) + kobject_put(&func->kobj); +} + +/* Clean up when a patched object is unloaded */ +static void klp_free_object_loaded(struct klp_object *obj) +{ + struct klp_func *func; + + obj->mod = NULL; + + for (func = obj->funcs; func->old_name; func++) + func->old_addr = 0; +} + +/* + * Free all objects' kobjects in the array up to some limit. When limit is + * NULL, all kobjects are freed. 
+ */ +static void klp_free_objects_limited(struct klp_patch *patch, + struct klp_object *limit) +{ + struct klp_object *obj; + + for (obj = patch->objs; obj->funcs && obj != limit; obj++) { + klp_free_funcs_limited(obj, NULL); + kobject_put(obj->kobj); + } +} + +static void klp_free_patch(struct klp_patch *patch) +{ + klp_free_objects_limited(patch, NULL); + if (!list_empty(&patch->list)) + list_del(&patch->list); + kobject_put(&patch->kobj); +} + +static int klp_init_func(struct klp_object *obj, struct klp_func *func) +{ + struct ftrace_ops *ops; + int ret; + + ops = kzalloc(sizeof(*ops), GFP_KERNEL); + if (!ops) + return -ENOMEM; + + ops->private = func; + ops->func = klp_ftrace_handler; + ops->flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_DYNAMIC; + func->fops = ops; + func->state = KLP_DISABLED; + + ret = kobject_init_and_add(&func->kobj, &klp_ktype_func, + obj->kobj, func->old_name); + if (ret) { + kfree(func->fops); + return ret; + } + + return 0; +} + +/* parts of the initialization that is done only when the object is loaded */ +static int klp_init_object_loaded(struct klp_patch *patch, + struct klp_object *obj) +{ + struct klp_func *func; + int ret; + + if (obj->relocs) { + ret = klp_write_object_relocations(patch->mod, obj); + if (ret) + return ret; + } + + for (func = obj->funcs; func->old_name; func++) { + ret = klp_find_verify_func_addr(obj, func); + if (ret) + return ret; + } + + return 0; +} + +static int klp_init_object(struct klp_patch *patch, struct klp_object *obj) +{ + struct klp_func *func; + int ret; + const char *name; + + if (!obj->funcs) + return -EINVAL; + + obj->state = KLP_DISABLED; + + klp_find_object_module(obj); + + name = klp_is_module(obj) ? obj->name : "vmlinux"; + obj->kobj = kobject_create_and_add(name, &patch->kobj); + if (!obj->kobj) + return -ENOMEM; + + for (func = obj->funcs; func->old_name; func++) { + ret = klp_init_func(obj, func); + if (ret) + goto free; + } + + if (klp_is_object_loaded(obj)) { + ret = klp_init_object_loaded(patch, obj); + if (ret) + goto free; + } + + return 0; + +free: + klp_free_funcs_limited(obj, func); + kobject_put(obj->kobj); + return ret; +} + +static int klp_init_patch(struct klp_patch *patch) +{ + struct klp_object *obj; + int ret; + + if (!patch->objs) + return -EINVAL; + + mutex_lock(&klp_mutex); + + patch->state = KLP_DISABLED; + + ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch, + klp_root_kobj, patch->mod->name); + if (ret) + goto unlock; + + for (obj = patch->objs; obj->funcs; obj++) { + ret = klp_init_object(patch, obj); + if (ret) + goto free; + } + + list_add(&patch->list, &klp_patches); + + mutex_unlock(&klp_mutex); + + return 0; + +free: + klp_free_objects_limited(patch, obj); + kobject_put(&patch->kobj); +unlock: + mutex_unlock(&klp_mutex); + return ret; +} + +/** + * klp_unregister_patch() - unregisters a patch + * @patch: Disabled patch to be unregistered + * + * Frees the data structures and removes the sysfs interface. 
+ * + * Return: 0 on success, otherwise error + */ +int klp_unregister_patch(struct klp_patch *patch) +{ + int ret = 0; + + mutex_lock(&klp_mutex); + + if (!klp_is_patch_registered(patch)) { + ret = -EINVAL; + goto out; + } + + if (patch->state == KLP_ENABLED) { + ret = -EBUSY; + goto out; + } + + klp_free_patch(patch); + +out: + mutex_unlock(&klp_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(klp_unregister_patch); + +/** + * klp_register_patch() - registers a patch + * @patch: Patch to be registered + * + * Initializes the data structure associated with the patch and + * creates the sysfs interface. + * + * Return: 0 on success, otherwise error + */ +int klp_register_patch(struct klp_patch *patch) +{ + int ret; + + if (!klp_initialized()) + return -ENODEV; + + if (!patch || !patch->mod) + return -EINVAL; + + /* + * A reference is taken on the patch module to prevent it from being + * unloaded. Right now, we don't allow patch modules to unload since + * there is currently no method to determine if a thread is still + * running in the patched code contained in the patch module once + * the ftrace registration is successful. + */ + if (!try_module_get(patch->mod)) + return -ENODEV; + + ret = klp_init_patch(patch); + if (ret) + module_put(patch->mod); + + return ret; +} +EXPORT_SYMBOL_GPL(klp_register_patch); + +static void klp_module_notify_coming(struct klp_patch *patch, + struct klp_object *obj) +{ + struct module *pmod = patch->mod; + struct module *mod = obj->mod; + int ret; + + ret = klp_init_object_loaded(patch, obj); + if (ret) + goto err; + + if (patch->state == KLP_DISABLED) + return; + + pr_notice("applying patch '%s' to loading module '%s'\n", + pmod->name, mod->name); + + ret = klp_enable_object(obj); + if (!ret) + return; + +err: + pr_warn("failed to apply patch '%s' to module '%s' (%d)\n", + pmod->name, mod->name, ret); +} + +static void klp_module_notify_going(struct klp_patch *patch, + struct klp_object *obj) +{ + struct module *pmod = patch->mod; + struct module *mod = obj->mod; + int ret; + + if (patch->state == KLP_DISABLED) + goto disabled; + + pr_notice("reverting patch '%s' on unloading module '%s'\n", + pmod->name, mod->name); + + ret = klp_disable_object(obj); + if (ret) + pr_warn("failed to revert patch '%s' on module '%s' (%d)\n", + pmod->name, mod->name, ret); + +disabled: + klp_free_object_loaded(obj); +} + +static int klp_module_notify(struct notifier_block *nb, unsigned long action, + void *data) +{ + struct module *mod = data; + struct klp_patch *patch; + struct klp_object *obj; + + if (action != MODULE_STATE_COMING && action != MODULE_STATE_GOING) + return 0; + + mutex_lock(&klp_mutex); + + list_for_each_entry(patch, &klp_patches, list) { + for (obj = patch->objs; obj->funcs; obj++) { + if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) + continue; + + if (action == MODULE_STATE_COMING) { + obj->mod = mod; + klp_module_notify_coming(patch, obj); + } else /* MODULE_STATE_GOING */ + klp_module_notify_going(patch, obj); + + break; + } + } + + mutex_unlock(&klp_mutex); + + return 0; +} + +static struct notifier_block klp_module_nb = { + .notifier_call = klp_module_notify, + .priority = INT_MIN+1, /* called late but before ftrace notifier */ +}; + +static int klp_init(void) +{ + int ret; + + ret = register_module_notifier(&klp_module_nb); + if (ret) + return ret; + + klp_root_kobj = kobject_create_and_add("livepatch", kernel_kobj); + if (!klp_root_kobj) { + ret = -ENOMEM; + goto unregister; + } + + return 0; + +unregister: + 
unregister_module_notifier(&klp_module_nb); + return ret; +} + +module_init(klp_init); -- cgit v1.2.3 From 2c658e212ce7e40ace56d9441c8c5634d4d420e3 Mon Sep 17 00:00:00 2001 From: Jarkko Nikula Date: Thu, 18 Dec 2014 16:12:08 +0200 Subject: spi: Remove FSF mailing addresses Signed-off-by: Jarkko Nikula Signed-off-by: Mark Brown --- drivers/spi/spi-au1550.c | 4 ---- drivers/spi/spi-bcm2835.c | 4 ---- drivers/spi/spi-bcm63xx.c | 4 ---- drivers/spi/spi-bitbang.c | 4 ---- drivers/spi/spi-butterfly.c | 4 ---- drivers/spi/spi-coldfire-qspi.c | 5 ----- drivers/spi/spi-davinci.c | 4 ---- drivers/spi/spi-gpio.c | 4 ---- drivers/spi/spi-lm70llp.c | 4 ---- drivers/spi/spi-omap-100k.c | 5 ----- drivers/spi/spi-omap-uwire.c | 4 ---- drivers/spi/spi-omap2-mcspi.c | 5 ----- drivers/spi/spi-pxa2xx-pxadma.c | 4 ---- drivers/spi/spi-pxa2xx.c | 4 ---- drivers/spi/spi-rspi.c | 5 ----- drivers/spi/spi-s3c64xx.c | 4 ---- drivers/spi/spi-sc18is602.c | 4 ---- drivers/spi/spi-sh-hspi.c | 5 ----- drivers/spi/spi-sh.c | 5 ----- drivers/spi/spi-topcliff-pch.c | 4 ---- drivers/spi/spi.c | 4 ---- drivers/spi/spidev.c | 4 ---- include/linux/spi/at86rf230.h | 4 ---- include/linux/spi/l4f00242t03.h | 4 ---- include/linux/spi/lms283gf05.h | 4 ---- include/linux/spi/mxs-spi.h | 4 ---- include/linux/spi/pxa2xx_spi.h | 4 ---- include/linux/spi/rspi.h | 5 ----- include/linux/spi/sh_hspi.h | 4 ---- include/linux/spi/spi.h | 4 ---- include/linux/spi/tle62x0.h | 4 ---- include/linux/spi/tsc2005.h | 5 ----- 32 files changed, 136 deletions(-) (limited to 'include/linux') diff --git a/drivers/spi/spi-au1550.c b/drivers/spi/spi-au1550.c index 326f47973684..f45e085c01a6 100644 --- a/drivers/spi/spi-au1550.c +++ b/drivers/spi/spi-au1550.c @@ -15,10 +15,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include diff --git a/drivers/spi/spi-bcm2835.c b/drivers/spi/spi-bcm2835.c index 98aab457b24d..419a782ab6d5 100644 --- a/drivers/spi/spi-bcm2835.c +++ b/drivers/spi/spi-bcm2835.c @@ -17,10 +17,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include diff --git a/drivers/spi/spi-bcm63xx.c b/drivers/spi/spi-bcm63xx.c index c20530982e26..e73e2b052c9c 100644 --- a/drivers/spi/spi-bcm63xx.c +++ b/drivers/spi/spi-bcm63xx.c @@ -13,10 +13,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, */ #include diff --git a/drivers/spi/spi-bitbang.c b/drivers/spi/spi-bitbang.c index dc7d2c2d643e..5ef6638d5e8a 100644 --- a/drivers/spi/spi-bitbang.c +++ b/drivers/spi/spi-bitbang.c @@ -10,10 +10,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include diff --git a/drivers/spi/spi-butterfly.c b/drivers/spi/spi-butterfly.c index ee4f91ccd8fd..9a95862986c8 100644 --- a/drivers/spi/spi-butterfly.c +++ b/drivers/spi/spi-butterfly.c @@ -12,10 +12,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include #include diff --git a/drivers/spi/spi-coldfire-qspi.c b/drivers/spi/spi-coldfire-qspi.c index 41b5dc4445f6..688956ff5095 100644 --- a/drivers/spi/spi-coldfire-qspi.c +++ b/drivers/spi/spi-coldfire-qspi.c @@ -12,11 +12,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA - * */ #include diff --git a/drivers/spi/spi-davinci.c b/drivers/spi/spi-davinci.c index b3707badb1e5..5e991065f5b0 100644 --- a/drivers/spi/spi-davinci.c +++ b/drivers/spi/spi-davinci.c @@ -11,10 +11,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include diff --git a/drivers/spi/spi-gpio.c b/drivers/spi/spi-gpio.c index aee4e7589568..2b76492fe5c1 100644 --- a/drivers/spi/spi-gpio.c +++ b/drivers/spi/spi-gpio.c @@ -12,10 +12,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include #include diff --git a/drivers/spi/spi-lm70llp.c b/drivers/spi/spi-lm70llp.c index 41c5765be746..ba72347cb99d 100644 --- a/drivers/spi/spi-lm70llp.c +++ b/drivers/spi/spi-lm70llp.c @@ -12,10 +12,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include diff --git a/drivers/spi/spi-omap-100k.c b/drivers/spi/spi-omap-100k.c index 79399ae9c84c..d890d309dff9 100644 --- a/drivers/spi/spi-omap-100k.c +++ b/drivers/spi/spi-omap-100k.c @@ -16,11 +16,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * */ #include #include diff --git a/drivers/spi/spi-omap-uwire.c b/drivers/spi/spi-omap-uwire.c index daf1ada5cd11..3c0844457c07 100644 --- a/drivers/spi/spi-omap-uwire.c +++ b/drivers/spi/spi-omap-uwire.c @@ -28,10 +28,6 @@ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 675 Mass Ave, Cambridge, MA 02139, USA. */ #include #include diff --git a/drivers/spi/spi-omap2-mcspi.c b/drivers/spi/spi-omap2-mcspi.c index 3bc3cbabbbc0..4df8942058de 100644 --- a/drivers/spi/spi-omap2-mcspi.c +++ b/drivers/spi/spi-omap2-mcspi.c @@ -14,11 +14,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * */ #include diff --git a/drivers/spi/spi-pxa2xx-pxadma.c b/drivers/spi/spi-pxa2xx-pxadma.c index e8a26f25d5c0..e51fcf9fd39b 100644 --- a/drivers/spi/spi-pxa2xx-pxadma.c +++ b/drivers/spi/spi-pxa2xx-pxadma.c @@ -12,10 +12,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include diff --git a/drivers/spi/spi-pxa2xx.c b/drivers/spi/spi-pxa2xx.c index 05c623cfb078..7a9a605d9bd2 100644 --- a/drivers/spi/spi-pxa2xx.c +++ b/drivers/spi/spi-pxa2xx.c @@ -11,10 +11,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include diff --git a/drivers/spi/spi-rspi.c b/drivers/spi/spi-rspi.c index 2071f788c6fb..46ce47076e63 100644 --- a/drivers/spi/spi-rspi.c +++ b/drivers/spi/spi-rspi.c @@ -15,11 +15,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * */ #include diff --git a/drivers/spi/spi-s3c64xx.c b/drivers/spi/spi-s3c64xx.c index 37b19836f5cb..9231c34b5a5c 100644 --- a/drivers/spi/spi-s3c64xx.c +++ b/drivers/spi/spi-s3c64xx.c @@ -11,10 +11,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include diff --git a/drivers/spi/spi-sc18is602.c b/drivers/spi/spi-sc18is602.c index 237f2e7a7179..5a56acf8a43e 100644 --- a/drivers/spi/spi-sc18is602.c +++ b/drivers/spi/spi-sc18is602.c @@ -12,10 +12,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include diff --git a/drivers/spi/spi-sh-hspi.c b/drivers/spi/spi-sh-hspi.c index fc29233d0650..20e800e70442 100644 --- a/drivers/spi/spi-sh-hspi.c +++ b/drivers/spi/spi-sh-hspi.c @@ -16,11 +16,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * */ #include diff --git a/drivers/spi/spi-sh.c b/drivers/spi/spi-sh.c index 1cfc906dd174..502501187c9e 100644 --- a/drivers/spi/spi-sh.c +++ b/drivers/spi/spi-sh.c @@ -14,11 +14,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * */ #include diff --git a/drivers/spi/spi-topcliff-pch.c b/drivers/spi/spi-topcliff-pch.c index be692ad50442..93dfcee0f987 100644 --- a/drivers/spi/spi-topcliff-pch.c +++ b/drivers/spi/spi-topcliff-pch.c @@ -11,10 +11,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. */ #include diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 66a70e9bc743..c76cc7568639 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -13,10 +13,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include diff --git a/drivers/spi/spidev.c b/drivers/spi/spidev.c index 6941e04afb8c..6f97e5af282b 100644 --- a/drivers/spi/spidev.c +++ b/drivers/spi/spidev.c @@ -14,10 +14,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include diff --git a/include/linux/spi/at86rf230.h b/include/linux/spi/at86rf230.h index b2b1afbb3202..cd519a11c2c6 100644 --- a/include/linux/spi/at86rf230.h +++ b/include/linux/spi/at86rf230.h @@ -12,10 +12,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - * * Written by: * Dmitry Eremin-Solenikov */ diff --git a/include/linux/spi/l4f00242t03.h b/include/linux/spi/l4f00242t03.h index bc8677c8eba9..e69e9b51b21a 100644 --- a/include/linux/spi/l4f00242t03.h +++ b/include/linux/spi/l4f00242t03.h @@ -12,10 +12,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _INCLUDE_LINUX_SPI_L4F00242T03_H_ diff --git a/include/linux/spi/lms283gf05.h b/include/linux/spi/lms283gf05.h index 555d254e6606..fdd1d1d51da5 100644 --- a/include/linux/spi/lms283gf05.h +++ b/include/linux/spi/lms283gf05.h @@ -11,10 +11,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _INCLUDE_LINUX_SPI_LMS283GF05_H_ diff --git a/include/linux/spi/mxs-spi.h b/include/linux/spi/mxs-spi.h index 4835486f58e5..381d368b91b4 100644 --- a/include/linux/spi/mxs-spi.h +++ b/include/linux/spi/mxs-spi.h @@ -15,10 +15,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
*/ #ifndef __LINUX_SPI_MXS_SPI_H__ diff --git a/include/linux/spi/pxa2xx_spi.h b/include/linux/spi/pxa2xx_spi.h index d5a316550177..5eb56e35cf0d 100644 --- a/include/linux/spi/pxa2xx_spi.h +++ b/include/linux/spi/pxa2xx_spi.h @@ -10,10 +10,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifndef __linux_pxa2xx_spi_h #define __linux_pxa2xx_spi_h diff --git a/include/linux/spi/rspi.h b/include/linux/spi/rspi.h index e546b2ceb623..a693188cc08b 100644 --- a/include/linux/spi/rspi.h +++ b/include/linux/spi/rspi.h @@ -11,11 +11,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * */ #ifndef __LINUX_SPI_RENESAS_SPI_H__ diff --git a/include/linux/spi/sh_hspi.h b/include/linux/spi/sh_hspi.h index a1121f872ac1..aa0d440ab4f0 100644 --- a/include/linux/spi/sh_hspi.h +++ b/include/linux/spi/sh_hspi.h @@ -9,10 +9,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef SH_HSPI_H #define SH_HSPI_H diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index a6ef2a8e6de4..6e2664a17114 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -10,10 +10,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifndef __LINUX_SPI_H diff --git a/include/linux/spi/tle62x0.h b/include/linux/spi/tle62x0.h index 60b59187e590..414c6fddfcf0 100644 --- a/include/linux/spi/tle62x0.h +++ b/include/linux/spi/tle62x0.h @@ -12,10 +12,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ struct tle62x0_pdata { diff --git a/include/linux/spi/tsc2005.h b/include/linux/spi/tsc2005.h index 8f721e465e05..563b3b1799a8 100644 --- a/include/linux/spi/tsc2005.h +++ b/include/linux/spi/tsc2005.h @@ -12,11 +12,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * */ #ifndef _LINUX_SPI_TSC2005_H -- cgit v1.2.3 From 556d2f055bf6d79ce81587dfe774d4dd10da473f Mon Sep 17 00:00:00 2001 From: Yalin Wang Date: Mon, 3 Nov 2014 03:01:03 +0100 Subject: ARM: 8187/1: add CONFIG_HAVE_ARCH_BITREVERSE to support rbit instruction

This change adds the CONFIG_HAVE_ARCH_BITREVERSE config option, so that an architecture's hardware bit-reverse instruction can be used for bitrev operations. Introduce __constant_bitrev* macros for constant bitrev operations. Change __bitrev16() and __bitrev32() into inline functions; these tiny functions do not need exported symbols.

Signed-off-by: Yalin Wang Acked-by: Will Deacon Signed-off-by: Russell King --- include/linux/bitrev.h | 77 +++++++++++++++++++++++++++++++++++++++++++++++--- lib/Kconfig | 9 ++++++ lib/bitrev.c | 17 ++--------- 3 files changed, 84 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitrev.h b/include/linux/bitrev.h index 7ffe03f4693d..fb790b8449c1 100644 --- a/include/linux/bitrev.h +++ b/include/linux/bitrev.h @@ -3,14 +3,83 @@ #include <linux/types.h> -extern u8 const byte_rev_table[256]; +#ifdef CONFIG_HAVE_ARCH_BITREVERSE +#include <asm/bitrev.h> + +#define __bitrev32 __arch_bitrev32 +#define __bitrev16 __arch_bitrev16 +#define __bitrev8 __arch_bitrev8 -static inline u8 bitrev8(u8 byte) +#else +extern u8 const byte_rev_table[256]; +static inline u8 __bitrev8(u8 byte) { return byte_rev_table[byte]; } -extern u16 bitrev16(u16 in); -extern u32 bitrev32(u32 in); +static inline u16 __bitrev16(u16 x) +{ + return (__bitrev8(x & 0xff) << 8) | __bitrev8(x >> 8); +} + +static inline u32 __bitrev32(u32 x) +{ + return (__bitrev16(x & 0xffff) << 16) | __bitrev16(x >> 16); +} + +#endif /* CONFIG_HAVE_ARCH_BITREVERSE */ + +#define __constant_bitrev32(x) \ +({ \ + u32 __x = x; \ + __x = (__x >> 16) | (__x << 16); \ + __x = ((__x & (u32)0xFF00FF00UL) >> 8) | ((__x & (u32)0x00FF00FFUL) << 8); \ + __x = ((__x & (u32)0xF0F0F0F0UL) >> 4) | ((__x & (u32)0x0F0F0F0FUL) << 4); \ + __x = ((__x & (u32)0xCCCCCCCCUL) >> 2) | ((__x & (u32)0x33333333UL) << 2); \ + __x = ((__x & (u32)0xAAAAAAAAUL) >> 1) | ((__x & (u32)0x55555555UL) << 1); \ + __x; \ +}) + +#define __constant_bitrev16(x) \ +({ \ + u16 __x = x; \ + __x = (__x >> 8) | (__x << 8); \ + __x = ((__x & (u16)0xF0F0U) >> 4) | ((__x & (u16)0x0F0FU) << 4); \ + __x = ((__x & (u16)0xCCCCU) >> 2) | ((__x & (u16)0x3333U) << 2); \ + __x = ((__x & (u16)0xAAAAU) >> 1) | ((__x & (u16)0x5555U) << 1); \ + __x; \ +}) + +#define __constant_bitrev8(x) \ +({ \ + u8 __x = x; \ + __x = (__x >> 4) | (__x << 4); \ + __x = ((__x & (u8)0xCCU) >> 2) | ((__x & (u8)0x33U) << 2); \ + __x = ((__x & (u8)0xAAU) >> 1) | ((__x & (u8)0x55U) << 1); \ + __x; \ +}) + +#define bitrev32(x) \ +({ \ + u32 __x = x; \ + __builtin_constant_p(__x) ? \ + __constant_bitrev32(__x) : \ + __bitrev32(__x); \ +}) + +#define bitrev16(x) \ +({ \ + u16 __x = x; \ + __builtin_constant_p(__x) ? \ + __constant_bitrev16(__x) : \ + __bitrev16(__x); \ + }) +#define bitrev8(x) \ +({ \ + u8 __x = x; \ + __builtin_constant_p(__x) ?
\ + __constant_bitrev8(__x) : \ + __bitrev8(__x) ; \ + }) #endif /* _LINUX_BITREV_H */ diff --git a/lib/Kconfig b/lib/Kconfig index 54cf309a92a5..cd177caf3876 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -13,6 +13,15 @@ config RAID6_PQ config BITREVERSE tristate +config HAVE_ARCH_BITREVERSE + boolean + default n + depends on BITREVERSE + help + This option indicates that the architecture provides a hardware + bit-reverse instruction; when it is set, the hardware instruction + is used instead of the byte-table lookup implementation. + config RATIONAL boolean diff --git a/lib/bitrev.c b/lib/bitrev.c index 3956203456d4..40ffda94cc5d 100644 --- a/lib/bitrev.c +++ b/lib/bitrev.c @@ -1,3 +1,4 @@ +#ifndef CONFIG_HAVE_ARCH_BITREVERSE #include <linux/types.h> #include <linux/module.h> #include <linux/bitrev.h> @@ -42,18 +43,4 @@ const u8 byte_rev_table[256] = { }; EXPORT_SYMBOL_GPL(byte_rev_table); -u16 bitrev16(u16 x) -{ - return (bitrev8(x & 0xff) << 8) | bitrev8(x >> 8); -} -EXPORT_SYMBOL(bitrev16); - -/** - * bitrev32 - reverse the order of bits in a u32 value - * @x: value to be bit-reversed - */ -u32 bitrev32(u32 x) -{ - return (bitrev16(x & 0xffff) << 16) | bitrev16(x >> 16); -} -EXPORT_SYMBOL(bitrev32); +#endif /* CONFIG_HAVE_ARCH_BITREVERSE */ -- cgit v1.2.3 From 523c5b89640e147abc7c70466f55327df8559d0a Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Sun, 30 Nov 2014 17:52:32 +0100 Subject: i2c: Remove support for legacy PM

There haven't been any I2C drivers that use the legacy suspend/resume callbacks for a while now, and new drivers are supposed to use PM ops. So remove support for legacy suspend/resume from I2C drivers. Since there aren't any special bus-specific things to do during suspend/resume, and since the PM core will automatically fall back directly to using the device's PM ops if no bus PM ops are specified, there is no need to have any I2C bus PM ops.

Signed-off-by: Lars-Peter Clausen Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-core.c | 118 ------------------------------------------------- include/linux/i2c.h | 4 -- 2 files changed, 122 deletions(-) (limited to 'include/linux') diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c index 39d25a8cb1ad..393bb0e8c8f2 100644 --- a/drivers/i2c/i2c-core.c +++ b/drivers/i2c/i2c-core.c @@ -698,101 +698,6 @@ static void i2c_device_shutdown(struct device *dev) driver->shutdown(client); } -#ifdef CONFIG_PM_SLEEP -static int i2c_legacy_suspend(struct device *dev, pm_message_t mesg) -{ - struct i2c_client *client = i2c_verify_client(dev); - struct i2c_driver *driver; - - if (!client || !dev->driver) - return 0; - driver = to_i2c_driver(dev->driver); - if (!driver->suspend) - return 0; - return driver->suspend(client, mesg); -} - -static int i2c_legacy_resume(struct device *dev) -{ - struct i2c_client *client = i2c_verify_client(dev); - struct i2c_driver *driver; - - if (!client || !dev->driver) - return 0; - driver = to_i2c_driver(dev->driver); - if (!driver->resume) - return 0; - return driver->resume(client); -} - -static int i2c_device_pm_suspend(struct device *dev) -{ - const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - - if (pm) - return pm_generic_suspend(dev); - else - return i2c_legacy_suspend(dev, PMSG_SUSPEND); -} - -static int i2c_device_pm_resume(struct device *dev) -{ - const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - - if (pm) - return pm_generic_resume(dev); - else - return i2c_legacy_resume(dev); -} - -static int i2c_device_pm_freeze(struct device *dev) -{ - const struct dev_pm_ops *pm = dev->driver ?
dev->driver->pm : NULL; - - if (pm) - return pm_generic_freeze(dev); - else - return i2c_legacy_suspend(dev, PMSG_FREEZE); -} - -static int i2c_device_pm_thaw(struct device *dev) -{ - const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - - if (pm) - return pm_generic_thaw(dev); - else - return i2c_legacy_resume(dev); -} - -static int i2c_device_pm_poweroff(struct device *dev) -{ - const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - - if (pm) - return pm_generic_poweroff(dev); - else - return i2c_legacy_suspend(dev, PMSG_HIBERNATE); -} - -static int i2c_device_pm_restore(struct device *dev) -{ - const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - - if (pm) - return pm_generic_restore(dev); - else - return i2c_legacy_resume(dev); -} -#else /* !CONFIG_PM_SLEEP */ -#define i2c_device_pm_suspend NULL -#define i2c_device_pm_resume NULL -#define i2c_device_pm_freeze NULL -#define i2c_device_pm_thaw NULL -#define i2c_device_pm_poweroff NULL -#define i2c_device_pm_restore NULL -#endif /* !CONFIG_PM_SLEEP */ - static void i2c_client_dev_release(struct device *dev) { kfree(to_i2c_client(dev)); @@ -837,27 +742,12 @@ static const struct attribute_group *i2c_dev_attr_groups[] = { NULL }; -static const struct dev_pm_ops i2c_device_pm_ops = { - .suspend = i2c_device_pm_suspend, - .resume = i2c_device_pm_resume, - .freeze = i2c_device_pm_freeze, - .thaw = i2c_device_pm_thaw, - .poweroff = i2c_device_pm_poweroff, - .restore = i2c_device_pm_restore, - SET_RUNTIME_PM_OPS( - pm_generic_runtime_suspend, - pm_generic_runtime_resume, - NULL - ) -}; - struct bus_type i2c_bus_type = { .name = "i2c", .match = i2c_device_match, .probe = i2c_device_probe, .remove = i2c_device_remove, .shutdown = i2c_device_shutdown, - .pm = &i2c_device_pm_ops, }; EXPORT_SYMBOL_GPL(i2c_bus_type); @@ -1859,14 +1749,6 @@ int i2c_register_driver(struct module *owner, struct i2c_driver *driver) if (res) return res; - /* Drivers should switch to dev_pm_ops instead. */ - if (driver->suspend) - pr_warn("i2c-core: driver [%s] using legacy suspend method\n", - driver->driver.name); - if (driver->resume) - pr_warn("i2c-core: driver [%s] using legacy resume method\n", - driver->driver.name); - pr_debug("i2c-core: driver [%s] registered\n", driver->driver.name); INIT_LIST_HEAD(&driver->clients); diff --git a/include/linux/i2c.h b/include/linux/i2c.h index e3a1721c8354..3090ee6e171d 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -130,8 +130,6 @@ extern s32 i2c_smbus_write_i2c_block_data(const struct i2c_client *client, * @probe: Callback for device binding * @remove: Callback for device unbinding * @shutdown: Callback for device shutdown - * @suspend: Callback for device suspend - * @resume: Callback for device resume * @alert: Alert callback, for example for the SMBus alert protocol * @command: Callback for bus-wide signaling (optional) * @driver: Device driver model driver @@ -174,8 +172,6 @@ struct i2c_driver { /* driver model interfaces that don't relate to enumeration */ void (*shutdown)(struct i2c_client *); - int (*suspend)(struct i2c_client *, pm_message_t mesg); - int (*resume)(struct i2c_client *); /* Alert callback, for example for the SMBus alert protocol. * The format and meaning of the data value depends on the protocol. 
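For readers converting a driver, here is a minimal sketch of the dev_pm_ops style that replaces the removed legacy callbacks. The foo_* names are hypothetical; only the SIMPLE_DEV_PM_OPS() and .driver.pm wiring reflect the real API:

#include <linux/i2c.h>
#include <linux/pm.h>

/* Hypothetical I2C client driver, sketching the PM ops conversion. */
static int foo_suspend(struct device *dev)
{
	/* quiesce the device here, e.g. enter its power-down mode */
	return 0;
}

static int foo_resume(struct device *dev)
{
	/* restore the device state here */
	return 0;
}

/* Defines a struct dev_pm_ops whose system sleep callbacks are only
 * populated when CONFIG_PM_SLEEP is enabled. */
static SIMPLE_DEV_PM_OPS(foo_pm_ops, foo_suspend, foo_resume);

static struct i2c_driver foo_driver = {
	.driver = {
		.name = "foo",
		.pm = &foo_pm_ops,
	},
	/* .probe, .remove and .id_table as usual */
};

With no bus PM ops registered, the PM core invokes foo_pm_ops directly, which is exactly the fallback behaviour the commit message relies on.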
-- cgit v1.2.3 From c4827bb859cbe8afad9287c9dd4e7162119d3d59 Mon Sep 17 00:00:00 2001 From: Jarkko Nikula Date: Thu, 18 Dec 2014 15:04:21 +0200 Subject: spi: pxa2xx: Add definition for Intel Quark DDS_RATE register The Intel Quark DDS_RATE register is defined only in a register access macro. Add a definition for it to the common SSP register definitions in preparation for cleaning up those macros. Signed-off-by: Jarkko Nikula Signed-off-by: Mark Brown --- include/linux/pxa2xx_ssp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pxa2xx_ssp.h b/include/linux/pxa2xx_ssp.h index 77aed9ea1d26..dab545bb66b3 100644 --- a/include/linux/pxa2xx_ssp.h +++ b/include/linux/pxa2xx_ssp.h @@ -37,6 +37,7 @@ #define SSDR (0x10) /* SSP Data Write/Data Read Register */ #define SSTO (0x28) /* SSP Time Out Register */ +#define DDS_RATE (0x28) /* SSP DDS Clock Rate Register (Intel Quark) */ #define SSPSP (0x2C) /* SSP Programmable Serial Protocol */ #define SSTSA (0x30) /* SSP Tx Timeslot Active */ #define SSRSA (0x34) /* SSP Rx Timeslot Active */ -- cgit v1.2.3 From 534a729866f9edc9264340c5b96cb94878ffda00 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Wed, 6 Aug 2014 10:52:41 +0200 Subject: dmaengine: Add 16 bytes, 32 bytes and 64 bytes bus widths The widths are missing, add them. Signed-off-by: Laurent Pinchart Tested-by: Kuninori Morimoto Tested-by: Wolfram Sang --- include/linux/dmaengine.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 6d34ce91036c..b7724a5d4661 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -279,6 +279,9 @@ enum dma_slave_buswidth { DMA_SLAVE_BUSWIDTH_3_BYTES = 3, DMA_SLAVE_BUSWIDTH_4_BYTES = 4, DMA_SLAVE_BUSWIDTH_8_BYTES = 8, + DMA_SLAVE_BUSWIDTH_16_BYTES = 16, + DMA_SLAVE_BUSWIDTH_32_BYTES = 32, + DMA_SLAVE_BUSWIDTH_64_BYTES = 64, }; /** -- cgit v1.2.3 From 32d17597d3e299ffe8b07e3afc12f8074e7ae483 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Wed, 9 Apr 2014 08:13:18 -0300 Subject: [media] v4l: vsp1: Remove support for platform data Now that all platforms instantiate the VSP1 through DT, platform data support isn't needed anymore. Signed-off-by: Laurent Pinchart Acked-by: Simon Horman Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/Kconfig | 2 +- drivers/media/platform/vsp1/vsp1.h | 14 +++++- drivers/media/platform/vsp1/vsp1_drv.c | 81 ++++++++++++---------------------- drivers/media/platform/vsp1/vsp1_wpf.c | 2 +- include/linux/platform_data/vsp1.h | 27 ------------ 5 files changed, 43 insertions(+), 83 deletions(-) delete mode 100644 include/linux/platform_data/vsp1.h (limited to 'include/linux') diff --git a/drivers/media/platform/Kconfig b/drivers/media/platform/Kconfig index 765bffb49a72..480a174832a6 100644 --- a/drivers/media/platform/Kconfig +++ b/drivers/media/platform/Kconfig @@ -223,7 +223,7 @@ config VIDEO_SH_VEU config VIDEO_RENESAS_VSP1 tristate "Renesas VSP1 Video Processing Engine" depends on VIDEO_V4L2 && VIDEO_V4L2_SUBDEV_API && HAS_DMA - depends on ARCH_SHMOBILE || COMPILE_TEST + depends on (ARCH_SHMOBILE && OF) || COMPILE_TEST select VIDEOBUF2_DMA_CONTIG ---help--- This is a V4L2 driver for the Renesas VSP1 video processing engine.
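Stepping back to the dmaengine bus-width addition a few hunks above: a slave driver selects one of the new widths through struct dma_slave_config. A hedged sketch follows; the device, channel and FIFO address are made up, and only the enum values and dmaengine_slave_config() are from the real API:

#include <linux/dmaengine.h>

/* Hypothetical peripheral with a 16-byte-wide FIFO port. */
static int foo_dma_setup(struct dma_chan *chan, dma_addr_t fifo_addr)
{
	struct dma_slave_config cfg = {
		.direction = DMA_MEM_TO_DEV,
		.dst_addr = fifo_addr,
		/* previously not expressible; added by this patch */
		.dst_addr_width = DMA_SLAVE_BUSWIDTH_16_BYTES,
		.dst_maxburst = 4,
	};

	return dmaengine_slave_config(chan, &cfg);
}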
diff --git a/drivers/media/platform/vsp1/vsp1.h b/drivers/media/platform/vsp1/vsp1.h index 12467191dff4..989e96f7e360 100644 --- a/drivers/media/platform/vsp1/vsp1.h +++ b/drivers/media/platform/vsp1/vsp1.h @@ -16,7 +16,6 @@ #include #include #include -#include #include #include @@ -40,9 +39,20 @@ struct vsp1_uds; #define VSP1_MAX_UDS 3 #define VSP1_MAX_WPF 4 +#define VSP1_HAS_LIF (1 << 0) +#define VSP1_HAS_LUT (1 << 1) +#define VSP1_HAS_SRU (1 << 2) + +struct vsp1_platform_data { + unsigned int features; + unsigned int rpf_count; + unsigned int uds_count; + unsigned int wpf_count; +}; + struct vsp1_device { struct device *dev; - struct vsp1_platform_data *pdata; + struct vsp1_platform_data pdata; void __iomem *mmio; struct clk *clock; diff --git a/drivers/media/platform/vsp1/vsp1_drv.c b/drivers/media/platform/vsp1/vsp1_drv.c index 5eb16e87d53f..913485a90e97 100644 --- a/drivers/media/platform/vsp1/vsp1_drv.c +++ b/drivers/media/platform/vsp1/vsp1_drv.c @@ -40,7 +40,7 @@ static irqreturn_t vsp1_irq_handler(int irq, void *data) irqreturn_t ret = IRQ_NONE; unsigned int i; - for (i = 0; i < vsp1->pdata->wpf_count; ++i) { + for (i = 0; i < vsp1->pdata.wpf_count; ++i) { struct vsp1_rwpf *wpf = vsp1->wpf[i]; struct vsp1_pipeline *pipe; u32 status; @@ -181,7 +181,7 @@ static int vsp1_create_entities(struct vsp1_device *vsp1) list_add_tail(&vsp1->hst->entity.list_dev, &vsp1->entities); - if (vsp1->pdata->features & VSP1_HAS_LIF) { + if (vsp1->pdata.features & VSP1_HAS_LIF) { vsp1->lif = vsp1_lif_create(vsp1); if (IS_ERR(vsp1->lif)) { ret = PTR_ERR(vsp1->lif); @@ -191,7 +191,7 @@ static int vsp1_create_entities(struct vsp1_device *vsp1) list_add_tail(&vsp1->lif->entity.list_dev, &vsp1->entities); } - if (vsp1->pdata->features & VSP1_HAS_LUT) { + if (vsp1->pdata.features & VSP1_HAS_LUT) { vsp1->lut = vsp1_lut_create(vsp1); if (IS_ERR(vsp1->lut)) { ret = PTR_ERR(vsp1->lut); @@ -201,7 +201,7 @@ static int vsp1_create_entities(struct vsp1_device *vsp1) list_add_tail(&vsp1->lut->entity.list_dev, &vsp1->entities); } - for (i = 0; i < vsp1->pdata->rpf_count; ++i) { + for (i = 0; i < vsp1->pdata.rpf_count; ++i) { struct vsp1_rwpf *rpf; rpf = vsp1_rpf_create(vsp1, i); @@ -214,7 +214,7 @@ static int vsp1_create_entities(struct vsp1_device *vsp1) list_add_tail(&rpf->entity.list_dev, &vsp1->entities); } - if (vsp1->pdata->features & VSP1_HAS_SRU) { + if (vsp1->pdata.features & VSP1_HAS_SRU) { vsp1->sru = vsp1_sru_create(vsp1); if (IS_ERR(vsp1->sru)) { ret = PTR_ERR(vsp1->sru); @@ -224,7 +224,7 @@ static int vsp1_create_entities(struct vsp1_device *vsp1) list_add_tail(&vsp1->sru->entity.list_dev, &vsp1->entities); } - for (i = 0; i < vsp1->pdata->uds_count; ++i) { + for (i = 0; i < vsp1->pdata.uds_count; ++i) { struct vsp1_uds *uds; uds = vsp1_uds_create(vsp1, i); @@ -237,7 +237,7 @@ static int vsp1_create_entities(struct vsp1_device *vsp1) list_add_tail(&uds->entity.list_dev, &vsp1->entities); } - for (i = 0; i < vsp1->pdata->wpf_count; ++i) { + for (i = 0; i < vsp1->pdata.wpf_count; ++i) { struct vsp1_rwpf *wpf; wpf = vsp1_wpf_create(vsp1, i); @@ -261,7 +261,7 @@ static int vsp1_create_entities(struct vsp1_device *vsp1) goto done; } - if (vsp1->pdata->features & VSP1_HAS_LIF) { + if (vsp1->pdata.features & VSP1_HAS_LIF) { ret = media_entity_create_link( &vsp1->wpf[0]->entity.subdev.entity, RWPF_PAD_SOURCE, &vsp1->lif->entity.subdev.entity, LIF_PAD_SINK, 0); @@ -294,7 +294,7 @@ static int vsp1_device_init(struct vsp1_device *vsp1) /* Reset any channel that might be running. 
*/ status = vsp1_read(vsp1, VI6_STATUS); - for (i = 0; i < vsp1->pdata->wpf_count; ++i) { + for (i = 0; i < vsp1->pdata.wpf_count; ++i) { unsigned int timeout; if (!(status & VI6_STATUS_SYS_ACT(i))) @@ -318,10 +318,10 @@ static int vsp1_device_init(struct vsp1_device *vsp1) vsp1_write(vsp1, VI6_CLK_DCSWT, (8 << VI6_CLK_DCSWT_CSTPW_SHIFT) | (8 << VI6_CLK_DCSWT_CSTRW_SHIFT)); - for (i = 0; i < vsp1->pdata->rpf_count; ++i) + for (i = 0; i < vsp1->pdata.rpf_count; ++i) vsp1_write(vsp1, VI6_DPR_RPF_ROUTE(i), VI6_DPR_NODE_UNUSED); - for (i = 0; i < vsp1->pdata->uds_count; ++i) + for (i = 0; i < vsp1->pdata.uds_count; ++i) vsp1_write(vsp1, VI6_DPR_UDS_ROUTE(i), VI6_DPR_NODE_UNUSED); vsp1_write(vsp1, VI6_DPR_SRU_ROUTE, VI6_DPR_NODE_UNUSED); @@ -428,28 +428,36 @@ static const struct dev_pm_ops vsp1_pm_ops = { * Platform Driver */ -static int vsp1_validate_platform_data(struct platform_device *pdev, - struct vsp1_platform_data *pdata) +static int vsp1_parse_dt(struct vsp1_device *vsp1) { - if (pdata == NULL) { - dev_err(&pdev->dev, "missing platform data\n"); - return -EINVAL; - } + struct device_node *np = vsp1->dev->of_node; + struct vsp1_platform_data *pdata = &vsp1->pdata; + + if (of_property_read_bool(np, "renesas,has-lif")) + pdata->features |= VSP1_HAS_LIF; + if (of_property_read_bool(np, "renesas,has-lut")) + pdata->features |= VSP1_HAS_LUT; + if (of_property_read_bool(np, "renesas,has-sru")) + pdata->features |= VSP1_HAS_SRU; + + of_property_read_u32(np, "renesas,#rpf", &pdata->rpf_count); + of_property_read_u32(np, "renesas,#uds", &pdata->uds_count); + of_property_read_u32(np, "renesas,#wpf", &pdata->wpf_count); if (pdata->rpf_count <= 0 || pdata->rpf_count > VSP1_MAX_RPF) { - dev_err(&pdev->dev, "invalid number of RPF (%u)\n", + dev_err(vsp1->dev, "invalid number of RPF (%u)\n", pdata->rpf_count); return -EINVAL; } if (pdata->uds_count <= 0 || pdata->uds_count > VSP1_MAX_UDS) { - dev_err(&pdev->dev, "invalid number of UDS (%u)\n", + dev_err(vsp1->dev, "invalid number of UDS (%u)\n", pdata->uds_count); return -EINVAL; } if (pdata->wpf_count <= 0 || pdata->wpf_count > VSP1_MAX_WPF) { - dev_err(&pdev->dev, "invalid number of WPF (%u)\n", + dev_err(vsp1->dev, "invalid number of WPF (%u)\n", pdata->wpf_count); return -EINVAL; } @@ -457,33 +465,6 @@ static int vsp1_validate_platform_data(struct platform_device *pdev, return 0; } -static struct vsp1_platform_data * -vsp1_get_platform_data(struct platform_device *pdev) -{ - struct device_node *np = pdev->dev.of_node; - struct vsp1_platform_data *pdata; - - if (!IS_ENABLED(CONFIG_OF) || np == NULL) - return pdev->dev.platform_data; - - pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); - if (pdata == NULL) - return NULL; - - if (of_property_read_bool(np, "renesas,has-lif")) - pdata->features |= VSP1_HAS_LIF; - if (of_property_read_bool(np, "renesas,has-lut")) - pdata->features |= VSP1_HAS_LUT; - if (of_property_read_bool(np, "renesas,has-sru")) - pdata->features |= VSP1_HAS_SRU; - - of_property_read_u32(np, "renesas,#rpf", &pdata->rpf_count); - of_property_read_u32(np, "renesas,#uds", &pdata->uds_count); - of_property_read_u32(np, "renesas,#wpf", &pdata->wpf_count); - - return pdata; -} - static int vsp1_probe(struct platform_device *pdev) { struct vsp1_device *vsp1; @@ -499,11 +480,7 @@ static int vsp1_probe(struct platform_device *pdev) mutex_init(&vsp1->lock); INIT_LIST_HEAD(&vsp1->entities); - vsp1->pdata = vsp1_get_platform_data(pdev); - if (vsp1->pdata == NULL) - return -ENODEV; - - ret = vsp1_validate_platform_data(pdev, 
vsp1->pdata); + ret = vsp1_parse_dt(vsp1); if (ret < 0) return ret; diff --git a/drivers/media/platform/vsp1/vsp1_wpf.c b/drivers/media/platform/vsp1/vsp1_wpf.c index 6e057762c933..b1089d05583a 100644 --- a/drivers/media/platform/vsp1/vsp1_wpf.c +++ b/drivers/media/platform/vsp1/vsp1_wpf.c @@ -280,7 +280,7 @@ struct vsp1_rwpf *vsp1_wpf_create(struct vsp1_device *vsp1, unsigned int index) * except for the WPF0 source link if a LIF is present. */ flags = MEDIA_LNK_FL_ENABLED; - if (!(vsp1->pdata->features & VSP1_HAS_LIF) || index != 0) + if (!(vsp1->pdata.features & VSP1_HAS_LIF) || index != 0) flags |= MEDIA_LNK_FL_IMMUTABLE; ret = media_entity_create_link(&wpf->entity.subdev.entity, diff --git a/include/linux/platform_data/vsp1.h b/include/linux/platform_data/vsp1.h deleted file mode 100644 index 63170e2614b3..000000000000 --- a/include/linux/platform_data/vsp1.h +++ /dev/null @@ -1,27 +0,0 @@ -/* - * vsp1.h -- R-Car VSP1 Platform Data - * - * Copyright (C) 2013 Renesas Corporation - * - * Contact: Laurent Pinchart (laurent.pinchart@ideasonboard.com) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ -#ifndef __PLATFORM_VSP1_H__ -#define __PLATFORM_VSP1_H__ - -#define VSP1_HAS_LIF (1 << 0) -#define VSP1_HAS_LUT (1 << 1) -#define VSP1_HAS_SRU (1 << 2) - -struct vsp1_platform_data { - unsigned int features; - unsigned int rpf_count; - unsigned int uds_count; - unsigned int wpf_count; -}; - -#endif /* __PLATFORM_VSP1_H__ */ -- cgit v1.2.3 From 3110628d89f80fbafa085fd62e75afcb39fb764c Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Fri, 19 Dec 2014 17:15:53 +0900 Subject: spi: sh-msiof: Configure MSIOF sync signal timing in device tree The MSIOF controller has DTDL and SYNCDL in SITMDR1 register. So, this patch adds new properties like the following commit: d0fb47a5237d8b9576113568bacfd27892308b62 (spi: fsl-espi: Configure FSL eSPI CSBEF and CSAFT) Signed-off-by: Yoshihiro Shimoda Signed-off-by: Mark Brown --- Documentation/devicetree/bindings/spi/sh-msiof.txt | 16 ++++++++ drivers/spi/spi-sh-msiof.c | 47 ++++++++++++++++++++++ include/linux/spi/sh_msiof.h | 2 + 3 files changed, 65 insertions(+) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/spi/sh-msiof.txt b/Documentation/devicetree/bindings/spi/sh-msiof.txt index d11c3721e7cd..4c388bb2f0a2 100644 --- a/Documentation/devicetree/bindings/spi/sh-msiof.txt +++ b/Documentation/devicetree/bindings/spi/sh-msiof.txt @@ -30,6 +30,22 @@ Optional properties: specifiers, one for transmission, and one for reception. - dma-names : Must contain a list of two DMA names, "tx" and "rx". +- renesas,dtdl : delay sync signal (setup) in transmit mode. + Must contain one of the following values: + 0 (no bit delay) + 50 (0.5-clock-cycle delay) + 100 (1-clock-cycle delay) + 150 (1.5-clock-cycle delay) + 200 (2-clock-cycle delay) + +- renesas,syncdl : delay sync signal (hold) in transmit mode. 
+ Must contain one of the following values: + 0 (no bit delay) + 50 (0.5-clock-cycle delay) + 100 (1-clock-cycle delay) + 150 (1.5-clock-cycle delay) + 200 (2-clock-cycle delay) + 300 (3-clock-cycle delay) Optional properties, deprecated for soctype-specific bindings: - renesas,tx-fifo-size : Overrides the default tx fifo size given in words diff --git a/drivers/spi/spi-sh-msiof.c b/drivers/spi/spi-sh-msiof.c index 239be7cbe5a8..2a87cb939f56 100644 --- a/drivers/spi/spi-sh-msiof.c +++ b/drivers/spi/spi-sh-msiof.c @@ -82,6 +82,8 @@ struct sh_msiof_spi_priv { #define MDR1_SYNCMD_LR 0x30000000 /* L/R mode */ #define MDR1_SYNCAC_SHIFT 25 /* Sync Polarity (1 = Active-low) */ #define MDR1_BITLSB_SHIFT 24 /* MSB/LSB First (1 = LSB first) */ +#define MDR1_DTDL_SHIFT 20 /* Data Pin Bit Delay for MSIOF_SYNC */ +#define MDR1_SYNCDL_SHIFT 16 /* Frame Sync Signal Timing Delay */ #define MDR1_FLD_MASK 0x000000c0 /* Frame Sync Signal Interval (0-3) */ #define MDR1_FLD_SHIFT 2 #define MDR1_XXSTP 0x00000001 /* Transmission/Reception Stop on FIFO */ @@ -279,6 +281,48 @@ static void sh_msiof_spi_set_clk_regs(struct sh_msiof_spi_priv *p, sh_msiof_write(p, RSCR, sh_msiof_spi_clk_table[k].scr); } +static u32 sh_msiof_get_delay_bit(u32 dtdl_or_syncdl) +{ + /* + * DTDL/SYNCDL bit : p->info->dtdl or p->info->syncdl + * b'000 : 0 + * b'001 : 100 + * b'010 : 200 + * b'011 (SYNCDL only) : 300 + * b'101 : 50 + * b'110 : 150 + */ + if (dtdl_or_syncdl % 100) + return dtdl_or_syncdl / 100 + 5; + else + return dtdl_or_syncdl / 100; +} + +static u32 sh_msiof_spi_get_dtdl_and_syncdl(struct sh_msiof_spi_priv *p) +{ + u32 val; + + if (!p->info) + return 0; + + /* check if DTDL and SYNCDL is allowed value */ + if (p->info->dtdl > 200 || p->info->syncdl > 300) { + dev_warn(&p->pdev->dev, "DTDL or SYNCDL is too large\n"); + return 0; + } + + /* check if the sum of DTDL and SYNCDL becomes an integer value */ + if ((p->info->dtdl + p->info->syncdl) % 100) { + dev_warn(&p->pdev->dev, "the sum of DTDL/SYNCDL is not good\n"); + return 0; + } + + val = sh_msiof_get_delay_bit(p->info->dtdl) << MDR1_DTDL_SHIFT; + val |= sh_msiof_get_delay_bit(p->info->syncdl) << MDR1_SYNCDL_SHIFT; + + return val; +} + static void sh_msiof_spi_set_pin_regs(struct sh_msiof_spi_priv *p, u32 cpol, u32 cpha, u32 tx_hi_z, u32 lsb_first, u32 cs_high) @@ -296,6 +340,7 @@ static void sh_msiof_spi_set_pin_regs(struct sh_msiof_spi_priv *p, tmp = MDR1_SYNCMD_SPI | 1 << MDR1_FLD_SHIFT | MDR1_XXSTP; tmp |= !cs_high << MDR1_SYNCAC_SHIFT; tmp |= lsb_first << MDR1_BITLSB_SHIFT; + tmp |= sh_msiof_spi_get_dtdl_and_syncdl(p); sh_msiof_write(p, TMDR1, tmp | MDR1_TRMD | TMDR1_PCON); if (p->chipdata->master_flags & SPI_MASTER_MUST_TX) { /* These bits are reserved if RX needs TX */ @@ -952,6 +997,8 @@ static struct sh_msiof_spi_info *sh_msiof_spi_parse_dt(struct device *dev) &info->tx_fifo_override); of_property_read_u32(np, "renesas,rx-fifo-size", &info->rx_fifo_override); + of_property_read_u32(np, "renesas,dtdl", &info->dtdl); + of_property_read_u32(np, "renesas,syncdl", &info->syncdl); info->num_chipselect = num_cs; diff --git a/include/linux/spi/sh_msiof.h b/include/linux/spi/sh_msiof.h index 88a14d81c49e..b087a85f5f72 100644 --- a/include/linux/spi/sh_msiof.h +++ b/include/linux/spi/sh_msiof.h @@ -7,6 +7,8 @@ struct sh_msiof_spi_info { u16 num_chipselect; unsigned int dma_tx_id; unsigned int dma_rx_id; + u32 dtdl; + u32 syncdl; }; #endif /* __SPI_SH_MSIOF_H__ */ -- cgit v1.2.3 From 0425e2420c0ab1b5da24f6d9fce39241ad85fc46 Mon Sep 17 00:00:00 2001 From: Flora Fu Date: 
Fri, 5 Dec 2014 12:07:54 +0800 Subject: regulator: mt6397: Add support for MT6397 regulator Add MT6397 regulator driver. Signed-off-by: Flora Fu Signed-off-by: Mark Brown --- drivers/regulator/Kconfig | 9 + drivers/regulator/Makefile | 1 + drivers/regulator/mt6397-regulator.c | 332 +++++++++++++++++++++++++++++ include/linux/regulator/mt6397-regulator.h | 49 +++++ 4 files changed, 391 insertions(+) create mode 100644 drivers/regulator/mt6397-regulator.c create mode 100644 include/linux/regulator/mt6397-regulator.h (limited to 'include/linux') diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig index c3a60b57a865..f622d0613d27 100644 --- a/drivers/regulator/Kconfig +++ b/drivers/regulator/Kconfig @@ -433,6 +433,15 @@ config REGULATOR_MC13892 Say y here to support the regulators found on the Freescale MC13892 PMIC. +config REGULATOR_MT6397 + tristate "MediaTek MT6397 PMIC" + depends on MFD_MT6397 + help + Say y here to select this option to enable the power regulator of + MediaTek MT6397 PMIC. + This driver supports the control of different power rails of device + through regulator interface. + config REGULATOR_PALMAS tristate "TI Palmas PMIC Regulators" depends on MFD_PALMAS diff --git a/drivers/regulator/Makefile b/drivers/regulator/Makefile index 1f28ebfc6f3a..1a0fbc397de1 100644 --- a/drivers/regulator/Makefile +++ b/drivers/regulator/Makefile @@ -58,6 +58,7 @@ obj-$(CONFIG_REGULATOR_MAX77802) += max77802.o obj-$(CONFIG_REGULATOR_MC13783) += mc13783-regulator.o obj-$(CONFIG_REGULATOR_MC13892) += mc13892-regulator.o obj-$(CONFIG_REGULATOR_MC13XXX_CORE) += mc13xxx-regulator-core.o +obj-$(CONFIG_REGULATOR_MT6397) += mt6397-regulator.o obj-$(CONFIG_REGULATOR_QCOM_RPM) += qcom_rpm-regulator.o obj-$(CONFIG_REGULATOR_PALMAS) += palmas-regulator.o obj-$(CONFIG_REGULATOR_PFUZE100) += pfuze100-regulator.o diff --git a/drivers/regulator/mt6397-regulator.c b/drivers/regulator/mt6397-regulator.c new file mode 100644 index 000000000000..a5b2f4762677 --- /dev/null +++ b/drivers/regulator/mt6397-regulator.c @@ -0,0 +1,332 @@ +/* + * Copyright (c) 2014 MediaTek Inc. + * Author: Flora Fu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * MT6397 regulators' information + * + * @desc: standard fields of regulator description. + * @qi: Mask for query enable signal status of regulators + * @vselon_reg: Register sections for hardware control mode of bucks + * @vselctrl_reg: Register for controlling the buck control mode. + * @vselctrl_mask: Mask for query buck's voltage control mode. 
+ */ +struct mt6397_regulator_info { + struct regulator_desc desc; + u32 qi; + u32 vselon_reg; + u32 vselctrl_reg; + u32 vselctrl_mask; +}; + +#define MT6397_BUCK(match, vreg, min, max, step, volt_ranges, enreg, \ + vosel, vosel_mask, voselon, vosel_ctrl) \ +[MT6397_ID_##vreg] = { \ + .desc = { \ + .name = #vreg, \ + .of_match = of_match_ptr(match), \ + .ops = &mt6397_volt_range_ops, \ + .type = REGULATOR_VOLTAGE, \ + .id = MT6397_ID_##vreg, \ + .owner = THIS_MODULE, \ + .n_voltages = (max - min)/step + 1, \ + .linear_ranges = volt_ranges, \ + .n_linear_ranges = ARRAY_SIZE(volt_ranges), \ + .vsel_reg = vosel, \ + .vsel_mask = vosel_mask, \ + .enable_reg = enreg, \ + .enable_mask = BIT(0), \ + }, \ + .qi = BIT(13), \ + .vselon_reg = voselon, \ + .vselctrl_reg = vosel_ctrl, \ + .vselctrl_mask = BIT(1), \ +} + +#define MT6397_LDO(match, vreg, ldo_volt_table, enreg, enbit, vosel, \ + vosel_mask) \ +[MT6397_ID_##vreg] = { \ + .desc = { \ + .name = #vreg, \ + .of_match = of_match_ptr(match), \ + .ops = &mt6397_volt_table_ops, \ + .type = REGULATOR_VOLTAGE, \ + .id = MT6397_ID_##vreg, \ + .owner = THIS_MODULE, \ + .n_voltages = ARRAY_SIZE(ldo_volt_table), \ + .volt_table = ldo_volt_table, \ + .vsel_reg = vosel, \ + .vsel_mask = vosel_mask, \ + .enable_reg = enreg, \ + .enable_mask = BIT(enbit), \ + }, \ + .qi = BIT(15), \ +} + +#define MT6397_REG_FIXED(match, vreg, enreg, enbit, volt) \ +[MT6397_ID_##vreg] = { \ + .desc = { \ + .name = #vreg, \ + .of_match = of_match_ptr(match), \ + .ops = &mt6397_volt_fixed_ops, \ + .type = REGULATOR_VOLTAGE, \ + .id = MT6397_ID_##vreg, \ + .owner = THIS_MODULE, \ + .n_voltages = 1, \ + .enable_reg = enreg, \ + .enable_mask = BIT(enbit), \ + .min_uV = volt, \ + }, \ + .qi = BIT(15), \ +} + +static const struct regulator_linear_range buck_volt_range1[] = { + REGULATOR_LINEAR_RANGE(700000, 0, 0x7f, 6250), +}; + +static const struct regulator_linear_range buck_volt_range2[] = { + REGULATOR_LINEAR_RANGE(800000, 0, 0x7f, 6250), +}; + +static const struct regulator_linear_range buck_volt_range3[] = { + REGULATOR_LINEAR_RANGE(1500000, 0, 0x1f, 20000), +}; + +static const u32 ldo_volt_table1[] = { + 1500000, 1800000, 2500000, 2800000, +}; + +static const u32 ldo_volt_table2[] = { + 1800000, 3300000, +}; + +static const u32 ldo_volt_table3[] = { + 3000000, 3300000, +}; + +static const u32 ldo_volt_table4[] = { + 1220000, 1300000, 1500000, 1800000, 2500000, 2800000, 3000000, 3300000, +}; + +static const u32 ldo_volt_table5[] = { + 1200000, 1300000, 1500000, 1800000, 2500000, 2800000, 3000000, 3300000, +}; + +static const u32 ldo_volt_table5_v2[] = { + 1200000, 1000000, 1500000, 1800000, 2500000, 2800000, 3000000, 3300000, +}; + +static const u32 ldo_volt_table6[] = { + 1200000, 1300000, 1500000, 1800000, 2500000, 2800000, 3000000, 2000000, +}; + +static const u32 ldo_volt_table7[] = { + 1300000, 1500000, 1800000, 2000000, 2500000, 2800000, 3000000, 3300000, +}; + +static int mt6397_get_status(struct regulator_dev *rdev) +{ + int ret; + u32 regval; + struct mt6397_regulator_info *info = rdev_get_drvdata(rdev); + + ret = regmap_read(rdev->regmap, info->desc.enable_reg, ®val); + if (ret != 0) { + dev_err(&rdev->dev, "Failed to get enable reg: %d\n", ret); + return ret; + } + + return (regval & info->qi) ? 
REGULATOR_STATUS_ON : REGULATOR_STATUS_OFF; +} + +static struct regulator_ops mt6397_volt_range_ops = { + .list_voltage = regulator_list_voltage_linear_range, + .map_voltage = regulator_map_voltage_linear_range, + .set_voltage_sel = regulator_set_voltage_sel_regmap, + .get_voltage_sel = regulator_get_voltage_sel_regmap, + .set_voltage_time_sel = regulator_set_voltage_time_sel, + .enable = regulator_enable_regmap, + .disable = regulator_disable_regmap, + .is_enabled = regulator_is_enabled_regmap, + .get_status = mt6397_get_status, +}; + +static struct regulator_ops mt6397_volt_table_ops = { + .list_voltage = regulator_list_voltage_table, + .map_voltage = regulator_map_voltage_iterate, + .set_voltage_sel = regulator_set_voltage_sel_regmap, + .get_voltage_sel = regulator_get_voltage_sel_regmap, + .set_voltage_time_sel = regulator_set_voltage_time_sel, + .enable = regulator_enable_regmap, + .disable = regulator_disable_regmap, + .is_enabled = regulator_is_enabled_regmap, + .get_status = mt6397_get_status, +}; + +static struct regulator_ops mt6397_volt_fixed_ops = { + .list_voltage = regulator_list_voltage_linear, + .enable = regulator_enable_regmap, + .disable = regulator_disable_regmap, + .is_enabled = regulator_is_enabled_regmap, + .get_status = mt6397_get_status, +}; + +/* The array is indexed by id(MT6397_ID_XXX) */ +static struct mt6397_regulator_info mt6397_regulators[] = { + MT6397_BUCK("buck_vpca15", VPCA15, 700000, 1493750, 6250, + buck_volt_range1, MT6397_VCA15_CON7, MT6397_VCA15_CON9, 0x7f, + MT6397_VCA15_CON10, MT6397_VCA15_CON5), + MT6397_BUCK("buck_vpca7", VPCA7, 700000, 1493750, 6250, + buck_volt_range1, MT6397_VPCA7_CON7, MT6397_VPCA7_CON9, 0x7f, + MT6397_VPCA7_CON10, MT6397_VPCA7_CON5), + MT6397_BUCK("buck_vsramca15", VSRAMCA15, 700000, 1493750, 6250, + buck_volt_range1, MT6397_VSRMCA15_CON7, MT6397_VSRMCA15_CON9, + 0x7f, MT6397_VSRMCA15_CON10, MT6397_VSRMCA15_CON5), + MT6397_BUCK("buck_vsramca7", VSRAMCA7, 700000, 1493750, 6250, + buck_volt_range1, MT6397_VSRMCA7_CON7, MT6397_VSRMCA7_CON9, + 0x7f, MT6397_VSRMCA7_CON10, MT6397_VSRMCA7_CON5), + MT6397_BUCK("buck_vcore", VCORE, 700000, 1493750, 6250, + buck_volt_range1, MT6397_VCORE_CON7, MT6397_VCORE_CON9, 0x7f, + MT6397_VCORE_CON10, MT6397_VCORE_CON5), + MT6397_BUCK("buck_vgpu", VGPU, 700000, 1493750, 6250, buck_volt_range1, + MT6397_VGPU_CON7, MT6397_VGPU_CON9, 0x7f, + MT6397_VGPU_CON10, MT6397_VGPU_CON5), + MT6397_BUCK("buck_vdrm", VDRM, 800000, 1593750, 6250, buck_volt_range2, + MT6397_VDRM_CON7, MT6397_VDRM_CON9, 0x7f, + MT6397_VDRM_CON10, MT6397_VDRM_CON5), + MT6397_BUCK("buck_vio18", VIO18, 1500000, 2120000, 20000, + buck_volt_range3, MT6397_VIO18_CON7, MT6397_VIO18_CON9, 0x1f, + MT6397_VIO18_CON10, MT6397_VIO18_CON5), + MT6397_REG_FIXED("ldo_vtcxo", VTCXO, MT6397_ANALDO_CON0, 10, 2800000), + MT6397_REG_FIXED("ldo_va28", VA28, MT6397_ANALDO_CON1, 14, 2800000), + MT6397_LDO("ldo_vcama", VCAMA, ldo_volt_table1, + MT6397_ANALDO_CON2, 15, MT6397_ANALDO_CON6, 0xC0), + MT6397_REG_FIXED("ldo_vio28", VIO28, MT6397_DIGLDO_CON0, 14, 2800000), + MT6397_REG_FIXED("ldo_vusb", VUSB, MT6397_DIGLDO_CON1, 14, 3300000), + MT6397_LDO("ldo_vmc", VMC, ldo_volt_table2, + MT6397_DIGLDO_CON2, 12, MT6397_DIGLDO_CON29, 0x10), + MT6397_LDO("ldo_vmch", VMCH, ldo_volt_table3, + MT6397_DIGLDO_CON3, 14, MT6397_DIGLDO_CON17, 0x80), + MT6397_LDO("ldo_vemc3v3", VEMC3V3, ldo_volt_table3, + MT6397_DIGLDO_CON4, 14, MT6397_DIGLDO_CON18, 0x10), + MT6397_LDO("ldo_vgp1", VGP1, ldo_volt_table4, + MT6397_DIGLDO_CON5, 15, MT6397_DIGLDO_CON19, 0xE0), + 
MT6397_LDO("ldo_vgp2", VGP2, ldo_volt_table5, + MT6397_DIGLDO_CON6, 15, MT6397_DIGLDO_CON20, 0xE0), + MT6397_LDO("ldo_vgp3", VGP3, ldo_volt_table5, + MT6397_DIGLDO_CON7, 15, MT6397_DIGLDO_CON21, 0xE0), + MT6397_LDO("ldo_vgp4", VGP4, ldo_volt_table5, + MT6397_DIGLDO_CON8, 15, MT6397_DIGLDO_CON22, 0xE0), + MT6397_LDO("ldo_vgp5", VGP5, ldo_volt_table6, + MT6397_DIGLDO_CON9, 15, MT6397_DIGLDO_CON23, 0xE0), + MT6397_LDO("ldo_vgp6", VGP6, ldo_volt_table5, + MT6397_DIGLDO_CON10, 15, MT6397_DIGLDO_CON33, 0xE0), + MT6397_LDO("ldo_vibr", VIBR, ldo_volt_table7, + MT6397_DIGLDO_CON24, 15, MT6397_DIGLDO_CON25, 0xE00), +}; + +static int mt6397_set_buck_vosel_reg(struct platform_device *pdev) +{ + struct mt6397_chip *mt6397 = dev_get_drvdata(pdev->dev.parent); + int i; + u32 regval; + + for (i = 0; i < MT6397_MAX_REGULATOR; i++) { + if (mt6397_regulators[i].vselctrl_reg) { + if (regmap_read(mt6397->regmap, + mt6397_regulators[i].vselctrl_reg, + ®val) < 0) { + dev_err(&pdev->dev, + "Failed to read buck ctrl\n"); + return -EIO; + } + + if (regval & mt6397_regulators[i].vselctrl_mask) { + mt6397_regulators[i].desc.vsel_reg = + mt6397_regulators[i].vselon_reg; + } + } + } + + return 0; +} + +static int mt6397_regulator_probe(struct platform_device *pdev) +{ + struct mt6397_chip *mt6397 = dev_get_drvdata(pdev->dev.parent); + struct regulator_config config = {}; + struct regulator_dev *rdev; + int i; + u32 reg_value, version; + + /* Query buck controller to select activated voltage register part */ + if (mt6397_set_buck_vosel_reg(pdev)) + return -EIO; + + /* Read PMIC chip revision to update constraints and voltage table */ + if (regmap_read(mt6397->regmap, MT6397_CID, ®_value) < 0) { + dev_err(&pdev->dev, "Failed to read Chip ID\n"); + return -EIO; + } + dev_info(&pdev->dev, "Chip ID = 0x%x\n", reg_value); + + version = (reg_value & 0xFF); + switch (version) { + case MT6397_REGULATOR_ID91: + mt6397_regulators[MT6397_ID_VGP2].desc.volt_table = + ldo_volt_table5_v2; + break; + default: + break; + } + + for (i = 0; i < MT6397_MAX_REGULATOR; i++) { + config.dev = &pdev->dev; + config.driver_data = &mt6397_regulators[i]; + config.regmap = mt6397->regmap; + rdev = devm_regulator_register(&pdev->dev, + &mt6397_regulators[i].desc, &config); + if (IS_ERR(rdev)) { + dev_err(&pdev->dev, "failed to register %s\n", + mt6397_regulators[i].desc.name); + return PTR_ERR(rdev); + } + } + + return 0; +} + +static struct platform_driver mt6397_regulator_driver = { + .driver = { + .name = "mt6397-regulator", + }, + .probe = mt6397_regulator_probe, +}; + +module_platform_driver(mt6397_regulator_driver); + +MODULE_AUTHOR("Flora Fu "); +MODULE_DESCRIPTION("Regulator Driver for MediaTek MT6397 PMIC"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:mt6397-regulator"); diff --git a/include/linux/regulator/mt6397-regulator.h b/include/linux/regulator/mt6397-regulator.h new file mode 100644 index 000000000000..30cc5963e265 --- /dev/null +++ b/include/linux/regulator/mt6397-regulator.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2014 MediaTek Inc. + * Author: Flora Fu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#ifndef __LINUX_REGULATOR_MT6397_H +#define __LINUX_REGULATOR_MT6397_H + +enum { + MT6397_ID_VPCA15 = 0, + MT6397_ID_VPCA7, + MT6397_ID_VSRAMCA15, + MT6397_ID_VSRAMCA7, + MT6397_ID_VCORE, + MT6397_ID_VGPU, + MT6397_ID_VDRM, + MT6397_ID_VIO18 = 7, + MT6397_ID_VTCXO, + MT6397_ID_VA28, + MT6397_ID_VCAMA, + MT6397_ID_VIO28, + MT6397_ID_VUSB, + MT6397_ID_VMC, + MT6397_ID_VMCH, + MT6397_ID_VEMC3V3, + MT6397_ID_VGP1, + MT6397_ID_VGP2, + MT6397_ID_VGP3, + MT6397_ID_VGP4, + MT6397_ID_VGP5, + MT6397_ID_VGP6, + MT6397_ID_VIBR, + MT6397_ID_RG_MAX, +}; + +#define MT6397_MAX_REGULATOR MT6397_ID_RG_MAX +#define MT6397_REGULATOR_ID97 0x97 +#define MT6397_REGULATOR_ID91 0x91 + +#endif /* __LINUX_REGULATOR_MT6397_H */ -- cgit v1.2.3 From 0628ee7c81a8ced9b10f9ee300707f7f79fdecf1 Mon Sep 17 00:00:00 2001 From: Nicholas Krause Date: Sun, 21 Dec 2014 22:36:37 -0500 Subject: libata: s/ata_id_removeable()/ata_id_removable()/ Changes the spelling typos of removeable to removable where ata_id_removeable is defined in ata.h and called in libata-scsi.c respectively. Signed-off-by: Nicholas Krause Signed-off-by: Tejun Heo --- drivers/ata/libata-scsi.c | 4 ++-- include/linux/ata.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index e364e86e84d7..7659d6468303 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -1995,8 +1995,8 @@ static unsigned int ata_scsiop_inq_std(struct ata_scsi_args *args, u8 *rbuf) VPRINTK("ENTER\n"); - /* set scsi removeable (RMB) bit per ata bit */ - if (ata_id_removeable(args->id)) + /* set scsi removable (RMB) bit per ata bit */ + if (ata_id_removable(args->id)) hdr[1] |= (1 << 7); if (args->dev->class == ATA_DEV_ZAC) { diff --git a/include/linux/ata.h b/include/linux/ata.h index f2f4d8da97c0..1648026e06b4 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -503,7 +503,7 @@ struct ata_bmdma_prd { #define ata_id_has_dma(id) ((id)[ATA_ID_CAPABILITY] & (1 << 8)) #define ata_id_has_ncq(id) ((id)[ATA_ID_SATA_CAPABILITY] & (1 << 8)) #define ata_id_queue_depth(id) (((id)[ATA_ID_QUEUE_DEPTH] & 0x1f) + 1) -#define ata_id_removeable(id) ((id)[ATA_ID_CONFIG] & (1 << 7)) +#define ata_id_removable(id) ((id)[ATA_ID_CONFIG] & (1 << 7)) #define ata_id_has_atapi_AN(id) \ ((((id)[ATA_ID_SATA_CAPABILITY] != 0x0000) && \ ((id)[ATA_ID_SATA_CAPABILITY] != 0xffff)) && \ -- cgit v1.2.3 From 77584ee57434813b50fc85cde995a6271a5081b7 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 23 Dec 2014 16:40:17 +1100 Subject: hwrng: core - Use struct completion for cleanup_done There is no point in doing a manual completion for cleanup_done when struct completion fits in perfectly. 
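The replacement relies on the standard completion primitive. A generic sketch of the pattern being adopted (not the hwrng code itself; the foo names are illustrative):

#include <linux/completion.h>

struct foo {
	struct completion cleanup_done;
};

/* setup side */
static void foo_init(struct foo *f)
{
	init_completion(&f->cleanup_done);
}

/* teardown side, e.g. called from a kref release function */
static void foo_cleanup(struct foo *f)
{
	/* ... release resources ... */
	complete(&f->cleanup_done);	/* wakes up every waiter */
}

/* unregister path blocks until the cleanup has actually run */
static void foo_unregister(struct foo *f)
{
	wait_for_completion(&f->cleanup_done);
}

complete()/wait_for_completion() already provide the wake-up and memory ordering that the removed smp_wmb()/wait_event() pair implemented by hand.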
Signed-off-by: Herbert Xu --- drivers/char/hw_random/core.c | 12 +++--------- include/linux/hw_random.h | 3 ++- 2 files changed, 5 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index 6ec42252e46e..3dba2cf50241 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -60,7 +60,6 @@ static DEFINE_MUTEX(rng_mutex); static DEFINE_MUTEX(reading_mutex); static int data_avail; static u8 *rng_buffer, *rng_fillbuf; -static DECLARE_WAIT_QUEUE_HEAD(rng_done); static unsigned short current_quality; static unsigned short default_quality; /* = 0; default to "off" */ @@ -100,10 +99,7 @@ static inline void cleanup_rng(struct kref *kref) if (rng->cleanup) rng->cleanup(rng); - /* cleanup_done should be updated after cleanup finishes */ - smp_wmb(); - rng->cleanup_done = true; - wake_up_all(&rng_done); + complete(&rng->cleanup_done); } static void set_current_rng(struct hwrng *rng) @@ -498,7 +494,7 @@ int hwrng_register(struct hwrng *rng) add_early_randomness(rng); } - rng->cleanup_done = false; + init_completion(&rng->cleanup_done); out_unlock: mutex_unlock(&rng_mutex); @@ -532,9 +528,7 @@ void hwrng_unregister(struct hwrng *rng) } else mutex_unlock(&rng_mutex); - /* Just in case rng is reading right now, wait. */ - wait_event(rng_done, rng->cleanup_done && - atomic_read(&rng->ref.refcount) == 0); + wait_for_completion(&rng->cleanup_done); } EXPORT_SYMBOL_GPL(hwrng_unregister); diff --git a/include/linux/hw_random.h b/include/linux/hw_random.h index 7832e5008959..eb7b414d232b 100644 --- a/include/linux/hw_random.h +++ b/include/linux/hw_random.h @@ -12,6 +12,7 @@ #ifndef LINUX_HWRANDOM_H_ #define LINUX_HWRANDOM_H_ +#include #include #include #include @@ -46,7 +47,7 @@ struct hwrng { /* internal. */ struct list_head list; struct kref ref; - bool cleanup_done; + struct completion cleanup_done; }; /** Register a new Hardware Random Number Generator driver. */ -- cgit v1.2.3 From 7ab374a053a43050117eb452306b6cd9dcb58cfd Mon Sep 17 00:00:00 2001 From: Karol Wrona Date: Fri, 19 Dec 2014 18:39:24 +0100 Subject: iio: kfifo: Remove unused argument in iio_kfifo_allocate indio_dev was unused in the function body. Also apply some small style fixes: add blank lines after "if (sth) return sth" and before the last return statement. The argument was also removed from the function's callers.
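A minimal sketch of a caller on the new signature (the foo_ driver is hypothetical; iio_kfifo_allocate() and iio_device_attach_buffer() are the real API of this kernel era):

#include <linux/iio/iio.h>
#include <linux/iio/buffer.h>
#include <linux/iio/kfifo_buf.h>

static int foo_configure_buffer(struct iio_dev *indio_dev)
{
	struct iio_buffer *buffer;

	/* no more unused indio_dev argument */
	buffer = iio_kfifo_allocate();
	if (!buffer)
		return -ENOMEM;

	iio_device_attach_buffer(indio_dev, buffer);

	return 0;
}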
Signed-off-by: Karol Wrona Signed-off-by: Jonathan Cameron --- drivers/iio/adc/ti_am335x_adc.c | 2 +- drivers/iio/industrialio-triggered-buffer.c | 2 +- drivers/iio/kfifo_buf.c | 6 ++++-- drivers/staging/iio/accel/lis3l02dq_ring.c | 2 +- drivers/staging/iio/iio_simple_dummy_buffer.c | 2 +- drivers/staging/iio/impedance-analyzer/ad5933.c | 2 +- drivers/staging/iio/meter/ade7758_ring.c | 2 +- include/linux/iio/kfifo_buf.h | 2 +- 8 files changed, 11 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/iio/adc/ti_am335x_adc.c b/drivers/iio/adc/ti_am335x_adc.c index d550ac7d2365..5eea299518a3 100644 --- a/drivers/iio/adc/ti_am335x_adc.c +++ b/drivers/iio/adc/ti_am335x_adc.c @@ -250,7 +250,7 @@ static int tiadc_iio_buffered_hardware_setup(struct iio_dev *indio_dev, struct iio_buffer *buffer; int ret; - buffer = iio_kfifo_allocate(indio_dev); + buffer = iio_kfifo_allocate(); if (!buffer) return -ENOMEM; diff --git a/drivers/iio/industrialio-triggered-buffer.c b/drivers/iio/industrialio-triggered-buffer.c index 61a5d0404edf..15a5341b5e7b 100644 --- a/drivers/iio/industrialio-triggered-buffer.c +++ b/drivers/iio/industrialio-triggered-buffer.c @@ -49,7 +49,7 @@ int iio_triggered_buffer_setup(struct iio_dev *indio_dev, struct iio_buffer *buffer; int ret; - buffer = iio_kfifo_allocate(indio_dev); + buffer = iio_kfifo_allocate(); if (!buffer) { ret = -ENOMEM; goto error_ret; diff --git a/drivers/iio/kfifo_buf.c b/drivers/iio/kfifo_buf.c index b20a9cfbc8ed..7f6fad658e83 100644 --- a/drivers/iio/kfifo_buf.c +++ b/drivers/iio/kfifo_buf.c @@ -140,18 +140,20 @@ static const struct iio_buffer_access_funcs kfifo_access_funcs = { .release = &iio_kfifo_buffer_release, }; -struct iio_buffer *iio_kfifo_allocate(struct iio_dev *indio_dev) +struct iio_buffer *iio_kfifo_allocate(void) { struct iio_kfifo *kf; - kf = kzalloc(sizeof *kf, GFP_KERNEL); + kf = kzalloc(sizeof(*kf), GFP_KERNEL); if (!kf) return NULL; + kf->update_needed = true; iio_buffer_init(&kf->buffer); kf->buffer.access = &kfifo_access_funcs; kf->buffer.length = 2; mutex_init(&kf->user_lock); + return &kf->buffer; } EXPORT_SYMBOL(iio_kfifo_allocate); diff --git a/drivers/staging/iio/accel/lis3l02dq_ring.c b/drivers/staging/iio/accel/lis3l02dq_ring.c index 9efc77b0ebdd..1fd90090a633 100644 --- a/drivers/staging/iio/accel/lis3l02dq_ring.c +++ b/drivers/staging/iio/accel/lis3l02dq_ring.c @@ -393,7 +393,7 @@ int lis3l02dq_configure_buffer(struct iio_dev *indio_dev) int ret; struct iio_buffer *buffer; - buffer = iio_kfifo_allocate(indio_dev); + buffer = iio_kfifo_allocate(); if (!buffer) return -ENOMEM; diff --git a/drivers/staging/iio/iio_simple_dummy_buffer.c b/drivers/staging/iio/iio_simple_dummy_buffer.c index a2d72c102119..360a4c980722 100644 --- a/drivers/staging/iio/iio_simple_dummy_buffer.c +++ b/drivers/staging/iio/iio_simple_dummy_buffer.c @@ -121,7 +121,7 @@ int iio_simple_dummy_configure_buffer(struct iio_dev *indio_dev) struct iio_buffer *buffer; /* Allocate a buffer to use - here a kfifo */ - buffer = iio_kfifo_allocate(indio_dev); + buffer = iio_kfifo_allocate(); if (buffer == NULL) { ret = -ENOMEM; goto error_ret; diff --git a/drivers/staging/iio/impedance-analyzer/ad5933.c b/drivers/staging/iio/impedance-analyzer/ad5933.c index c50b1380b9aa..39f60aca0838 100644 --- a/drivers/staging/iio/impedance-analyzer/ad5933.c +++ b/drivers/staging/iio/impedance-analyzer/ad5933.c @@ -626,7 +626,7 @@ static int ad5933_register_ring_funcs_and_init(struct iio_dev *indio_dev) { struct iio_buffer *buffer; - buffer = 
iio_kfifo_allocate(indio_dev); + buffer = iio_kfifo_allocate(); if (!buffer) return -ENOMEM; diff --git a/drivers/staging/iio/meter/ade7758_ring.c b/drivers/staging/iio/meter/ade7758_ring.c index 27c3ed6ca468..782fd9a3bc0d 100644 --- a/drivers/staging/iio/meter/ade7758_ring.c +++ b/drivers/staging/iio/meter/ade7758_ring.c @@ -119,7 +119,7 @@ int ade7758_configure_ring(struct iio_dev *indio_dev) struct iio_buffer *buffer; int ret = 0; - buffer = iio_kfifo_allocate(indio_dev); + buffer = iio_kfifo_allocate(); if (!buffer) { ret = -ENOMEM; return ret; diff --git a/include/linux/iio/kfifo_buf.h b/include/linux/iio/kfifo_buf.h index 25eeac762e84..1a8d57a41738 100644 --- a/include/linux/iio/kfifo_buf.h +++ b/include/linux/iio/kfifo_buf.h @@ -5,7 +5,7 @@ #include #include -struct iio_buffer *iio_kfifo_allocate(struct iio_dev *indio_dev); +struct iio_buffer *iio_kfifo_allocate(void); void iio_kfifo_free(struct iio_buffer *r); #endif -- cgit v1.2.3 From 780103fef5c88a97fb9c8d0079bf326ed6147f1f Mon Sep 17 00:00:00 2001 From: Karol Wrona Date: Fri, 19 Dec 2014 18:39:25 +0100 Subject: iio: kfifo: Add resource management devm_iio_kfifo_allocate/free iio kfifo allocate/free gained their devm_ wrappers. Signed-off-by: Karol Wrona Suggested-by: Jonathan Cameron Signed-off-by: Jonathan Cameron --- Documentation/driver-model/devres.txt | 2 ++ drivers/iio/kfifo_buf.c | 54 +++++++++++++++++++++++++++++++++++ include/linux/iio/kfifo_buf.h | 3 ++ 3 files changed, 59 insertions(+) (limited to 'include/linux') diff --git a/Documentation/driver-model/devres.txt b/Documentation/driver-model/devres.txt index b5ab416cd53a..6d1e8eeb5990 100644 --- a/Documentation/driver-model/devres.txt +++ b/Documentation/driver-model/devres.txt @@ -258,6 +258,8 @@ IIO devm_iio_device_free() devm_iio_device_register() devm_iio_device_unregister() + devm_iio_kfifo_allocate() + devm_iio_kfifo_free() devm_iio_trigger_alloc() devm_iio_trigger_free() diff --git a/drivers/iio/kfifo_buf.c b/drivers/iio/kfifo_buf.c index 7f6fad658e83..b2beea01c49b 100644 --- a/drivers/iio/kfifo_buf.c +++ b/drivers/iio/kfifo_buf.c @@ -164,4 +164,58 @@ void iio_kfifo_free(struct iio_buffer *r) } EXPORT_SYMBOL(iio_kfifo_free); +static void devm_iio_kfifo_release(struct device *dev, void *res) +{ + iio_kfifo_free(*(struct iio_buffer **)res); +} + +static int devm_iio_kfifo_match(struct device *dev, void *res, void *data) +{ + struct iio_buffer **r = res; + + if (WARN_ON(!r || !*r)) + return 0; + + return *r == data; +} + +/** + * devm_iio_kfifo_allocate - Resource-managed iio_kfifo_allocate() + * @dev: Device to allocate kfifo buffer for + * + * RETURNS: + * Pointer to allocated iio_buffer on success, NULL on failure.
+ */ +struct iio_buffer *devm_iio_kfifo_allocate(struct device *dev) +{ + struct iio_buffer **ptr, *r; + + ptr = devres_alloc(devm_iio_kfifo_release, sizeof(*ptr), GFP_KERNEL); + if (!ptr) + return NULL; + + r = iio_kfifo_allocate(); + if (r) { + *ptr = r; + devres_add(dev, ptr); + } else { + devres_free(ptr); + } + + return r; +} +EXPORT_SYMBOL(devm_iio_kfifo_allocate); + +/** + * devm_iio_kfifo_free - Resource-managed iio_kfifo_free() + * @dev: Device the buffer belongs to + * @r: The buffer associated with the device + */ +void devm_iio_kfifo_free(struct device *dev, struct iio_buffer *r) +{ + WARN_ON(devres_release(dev, devm_iio_kfifo_release, + devm_iio_kfifo_match, r)); +} +EXPORT_SYMBOL(devm_iio_kfifo_free); + MODULE_LICENSE("GPL"); diff --git a/include/linux/iio/kfifo_buf.h b/include/linux/iio/kfifo_buf.h index 1a8d57a41738..1683bc710d14 100644 --- a/include/linux/iio/kfifo_buf.h +++ b/include/linux/iio/kfifo_buf.h @@ -8,4 +8,7 @@ struct iio_buffer *iio_kfifo_allocate(void); void iio_kfifo_free(struct iio_buffer *r); +struct iio_buffer *devm_iio_kfifo_allocate(struct device *dev); +void devm_iio_kfifo_free(struct device *dev, struct iio_buffer *r); + #endif -- cgit v1.2.3 From 74d23cc704d19732e70ef1579a669f7d5f09dd9a Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Sun, 21 Dec 2014 19:46:56 +0100 Subject: time: move the timecounter/cyclecounter code into its own file. The timecounter code has almost nothing to do with the clocksource code. Let it live in its own file. This will help isolate the timecounter users from the clocksource users in the source tree. Signed-off-by: Richard Cochran Acked-by: Jeff Kirsher Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/xgbe/xgbe.h | 2 +- drivers/net/ethernet/broadcom/bnx2x/bnx2x.h | 2 +- drivers/net/ethernet/freescale/fec.h | 1 + drivers/net/ethernet/intel/e1000e/e1000.h | 2 +- drivers/net/ethernet/intel/igb/igb.h | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe.h | 2 +- drivers/net/ethernet/ti/cpts.h | 1 + include/clocksource/arm_arch_timer.h | 2 +- include/linux/clocksource.h | 102 ----------------------- include/linux/mlx4/device.h | 2 +- include/linux/timecounter.h | 122 ++++++++++++++++++++++++++++ include/linux/types.h | 3 + kernel/time/Makefile | 2 +- kernel/time/clocksource.c | 76 ----------------- kernel/time/timecounter.c | 95 ++++++++++++++++++++++ sound/pci/hda/hda_priv.h | 2 +- 16 files changed, 231 insertions(+), 187 deletions(-) create mode 100644 include/linux/timecounter.h create mode 100644 kernel/time/timecounter.c (limited to 'include/linux') diff --git a/drivers/net/ethernet/amd/xgbe/xgbe.h b/drivers/net/ethernet/amd/xgbe/xgbe.h index f9ec762ac3f0..2af6affc35a7 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe.h +++ b/drivers/net/ethernet/amd/xgbe/xgbe.h @@ -124,7 +124,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h index c3a6072134f5..792ba72fb5c8 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h @@ -22,7 +22,7 @@ #include #include -#include +#include /* compilation time flags */ diff --git a/drivers/net/ethernet/freescale/fec.h b/drivers/net/ethernet/freescale/fec.h index 469691ad4a1e..df8bbddaeb37 100644 --- a/drivers/net/ethernet/freescale/fec.h +++ b/drivers/net/ethernet/freescale/fec.h @@ -16,6 +16,7 @@ #include #include #include +#include #if defined(CONFIG_M523x) || defined(CONFIG_M527x) || defined(CONFIG_M528x) || \
defined(CONFIG_M520x) || defined(CONFIG_M532x) || \ diff --git a/drivers/net/ethernet/intel/e1000e/e1000.h b/drivers/net/ethernet/intel/e1000e/e1000.h index 7785240a0da1..9416e5a7e0c8 100644 --- a/drivers/net/ethernet/intel/e1000e/e1000.h +++ b/drivers/net/ethernet/intel/e1000e/e1000.h @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h index 82d891e183b1..ee22da391474 100644 --- a/drivers/net/ethernet/intel/igb/igb.h +++ b/drivers/net/ethernet/intel/igb/igb.h @@ -29,7 +29,7 @@ #include "e1000_mac.h" #include "e1000_82575.h" -#include +#include #include #include #include diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h index b6137be43920..38fc64cf5dca 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h @@ -38,7 +38,7 @@ #include #include -#include +#include #include #include diff --git a/drivers/net/ethernet/ti/cpts.h b/drivers/net/ethernet/ti/cpts.h index 1a581ef7eee8..69a46b92c7d6 100644 --- a/drivers/net/ethernet/ti/cpts.h +++ b/drivers/net/ethernet/ti/cpts.h @@ -27,6 +27,7 @@ #include #include #include +#include struct cpsw_cpts { u32 idver; /* Identification and version */ diff --git a/include/clocksource/arm_arch_timer.h b/include/clocksource/arm_arch_timer.h index 6d26b40cbf5d..9916d0e4eff5 100644 --- a/include/clocksource/arm_arch_timer.h +++ b/include/clocksource/arm_arch_timer.h @@ -16,7 +16,7 @@ #ifndef __CLKSOURCE_ARM_ARCH_TIMER_H #define __CLKSOURCE_ARM_ARCH_TIMER_H -#include +#include #include #define ARCH_TIMER_CTRL_ENABLE (1 << 0) diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index abcafaa20b86..9c78d15d33e4 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -18,8 +18,6 @@ #include #include -/* clocksource cycle base type */ -typedef u64 cycle_t; struct clocksource; struct module; @@ -27,106 +25,6 @@ struct module; #include #endif -/** - * struct cyclecounter - hardware abstraction for a free running counter - * Provides completely state-free accessors to the underlying hardware. - * Depending on which hardware it reads, the cycle counter may wrap - * around quickly. Locking rules (if necessary) have to be defined - * by the implementor and user of specific instances of this API. - * - * @read: returns the current cycle value - * @mask: bitmask for two's complement - * subtraction of non 64 bit counters, - * see CLOCKSOURCE_MASK() helper macro - * @mult: cycle to nanosecond multiplier - * @shift: cycle to nanosecond divisor (power of two) - */ -struct cyclecounter { - cycle_t (*read)(const struct cyclecounter *cc); - cycle_t mask; - u32 mult; - u32 shift; -}; - -/** - * struct timecounter - layer above a %struct cyclecounter which counts nanoseconds - * Contains the state needed by timecounter_read() to detect - * cycle counter wrap around. Initialize with - * timecounter_init(). Also used to convert cycle counts into the - * corresponding nanosecond counts with timecounter_cyc2time(). Users - * of this code are responsible for initializing the underlying - * cycle counter hardware, locking issues and reading the time - * more often than the cycle counter wraps around. The nanosecond - * counter will only wrap around after ~585 years. 
- * - * @cc: the cycle counter used by this instance - * @cycle_last: most recent cycle counter value seen by - * timecounter_read() - * @nsec: continuously increasing count - */ -struct timecounter { - const struct cyclecounter *cc; - cycle_t cycle_last; - u64 nsec; -}; - -/** - * cyclecounter_cyc2ns - converts cycle counter cycles to nanoseconds - * @cc: Pointer to cycle counter. - * @cycles: Cycles - * - * XXX - This could use some mult_lxl_ll() asm optimization. Same code - * as in cyc2ns, but with unsigned result. - */ -static inline u64 cyclecounter_cyc2ns(const struct cyclecounter *cc, - cycle_t cycles) -{ - u64 ret = (u64)cycles; - ret = (ret * cc->mult) >> cc->shift; - return ret; -} - -/** - * timecounter_init - initialize a time counter - * @tc: Pointer to time counter which is to be initialized/reset - * @cc: A cycle counter, ready to be used. - * @start_tstamp: Arbitrary initial time stamp. - * - * After this call the current cycle register (roughly) corresponds to - * the initial time stamp. Every call to timecounter_read() increments - * the time stamp counter by the number of elapsed nanoseconds. - */ -extern void timecounter_init(struct timecounter *tc, - const struct cyclecounter *cc, - u64 start_tstamp); - -/** - * timecounter_read - return nanoseconds elapsed since timecounter_init() - * plus the initial time stamp - * @tc: Pointer to time counter. - * - * In other words, keeps track of time since the same epoch as - * the function which generated the initial time stamp. - */ -extern u64 timecounter_read(struct timecounter *tc); - -/** - * timecounter_cyc2time - convert a cycle counter to same - * time base as values returned by - * timecounter_read() - * @tc: Pointer to time counter. - * @cycle_tstamp: a value returned by tc->cc->read() - * - * Cycle counts that are converted correctly as long as they - * fall into the interval [-1/2 max cycle count, +1/2 max cycle count], - * with "max cycle count" == cs->mask+1. - * - * This allows conversion of cycle counter values which were generated - * in the past. - */ -extern u64 timecounter_cyc2time(struct timecounter *tc, - cycle_t cycle_tstamp); - /** * struct clocksource - hardware abstraction for a free running counter * Provides mostly state-free accessors to the underlying hardware. diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 25c791e295fd..f1e41b33462f 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -42,7 +42,7 @@ #include -#include +#include #define MAX_MSIX_P_PORT 17 #define MAX_MSIX 64 diff --git a/include/linux/timecounter.h b/include/linux/timecounter.h new file mode 100644 index 000000000000..146f07a6651b --- /dev/null +++ b/include/linux/timecounter.h @@ -0,0 +1,122 @@ +/* + * linux/include/linux/timecounter.h + * + * based on code that migrated away from + * linux/include/linux/clocksource.h + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ +#ifndef _LINUX_TIMECOUNTER_H +#define _LINUX_TIMECOUNTER_H + +#include + +/** + * struct cyclecounter - hardware abstraction for a free running counter + * Provides completely state-free accessors to the underlying hardware. + * Depending on which hardware it reads, the cycle counter may wrap + * around quickly. Locking rules (if necessary) have to be defined + * by the implementor and user of specific instances of this API. + * + * @read: returns the current cycle value + * @mask: bitmask for two's complement + * subtraction of non 64 bit counters, + * see CLOCKSOURCE_MASK() helper macro + * @mult: cycle to nanosecond multiplier + * @shift: cycle to nanosecond divisor (power of two) + */ +struct cyclecounter { + cycle_t (*read)(const struct cyclecounter *cc); + cycle_t mask; + u32 mult; + u32 shift; +}; + +/** + * struct timecounter - layer above a %struct cyclecounter which counts nanoseconds + * Contains the state needed by timecounter_read() to detect + * cycle counter wrap around. Initialize with + * timecounter_init(). Also used to convert cycle counts into the + * corresponding nanosecond counts with timecounter_cyc2time(). Users + * of this code are responsible for initializing the underlying + * cycle counter hardware, locking issues and reading the time + * more often than the cycle counter wraps around. The nanosecond + * counter will only wrap around after ~585 years. + * + * @cc: the cycle counter used by this instance + * @cycle_last: most recent cycle counter value seen by + * timecounter_read() + * @nsec: continuously increasing count + */ +struct timecounter { + const struct cyclecounter *cc; + cycle_t cycle_last; + u64 nsec; +}; + +/** + * cyclecounter_cyc2ns - converts cycle counter cycles to nanoseconds + * @cc: Pointer to cycle counter. + * @cycles: Cycles + * + * XXX - This could use some mult_lxl_ll() asm optimization. Same code + * as in cyc2ns, but with unsigned result. + */ +static inline u64 cyclecounter_cyc2ns(const struct cyclecounter *cc, + cycle_t cycles) +{ + u64 ret = (u64)cycles; + ret = (ret * cc->mult) >> cc->shift; + return ret; +} + +/** + * timecounter_init - initialize a time counter + * @tc: Pointer to time counter which is to be initialized/reset + * @cc: A cycle counter, ready to be used. + * @start_tstamp: Arbitrary initial time stamp. + * + * After this call the current cycle register (roughly) corresponds to + * the initial time stamp. Every call to timecounter_read() increments + * the time stamp counter by the number of elapsed nanoseconds. + */ +extern void timecounter_init(struct timecounter *tc, + const struct cyclecounter *cc, + u64 start_tstamp); + +/** + * timecounter_read - return nanoseconds elapsed since timecounter_init() + * plus the initial time stamp + * @tc: Pointer to time counter. + * + * In other words, keeps track of time since the same epoch as + * the function which generated the initial time stamp. + */ +extern u64 timecounter_read(struct timecounter *tc); + +/** + * timecounter_cyc2time - convert a cycle counter to same + * time base as values returned by + * timecounter_read() + * @tc: Pointer to time counter. + * @cycle_tstamp: a value returned by tc->cc->read() + * + * Cycle counts that are converted correctly as long as they + * fall into the interval [-1/2 max cycle count, +1/2 max cycle count], + * with "max cycle count" == cs->mask+1. + * + * This allows conversion of cycle counter values which were generated + * in the past. 
+ */ +extern u64 timecounter_cyc2time(struct timecounter *tc, + cycle_t cycle_tstamp); + +#endif diff --git a/include/linux/types.h b/include/linux/types.h index a0bb7048687f..62323825cff9 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -213,5 +213,8 @@ struct callback_head { }; #define rcu_head callback_head +/* clocksource cycle base type */ +typedef u64 cycle_t; + #endif /* __ASSEMBLY__ */ #endif /* _LINUX_TYPES_H */ diff --git a/kernel/time/Makefile b/kernel/time/Makefile index f622cf28628a..c09c07817d7a 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,6 +1,6 @@ obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o -obj-y += timeconv.o posix-clock.o alarmtimer.o +obj-y += timeconv.o timecounter.o posix-clock.o alarmtimer.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index b79f39bda7e1..4892352f0e49 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -34,82 +34,6 @@ #include "tick-internal.h" #include "timekeeping_internal.h" -void timecounter_init(struct timecounter *tc, - const struct cyclecounter *cc, - u64 start_tstamp) -{ - tc->cc = cc; - tc->cycle_last = cc->read(cc); - tc->nsec = start_tstamp; -} -EXPORT_SYMBOL_GPL(timecounter_init); - -/** - * timecounter_read_delta - get nanoseconds since last call of this function - * @tc: Pointer to time counter - * - * When the underlying cycle counter runs over, this will be handled - * correctly as long as it does not run over more than once between - * calls. - * - * The first call to this function for a new time counter initializes - * the time tracking and returns an undefined result. - */ -static u64 timecounter_read_delta(struct timecounter *tc) -{ - cycle_t cycle_now, cycle_delta; - u64 ns_offset; - - /* read cycle counter: */ - cycle_now = tc->cc->read(tc->cc); - - /* calculate the delta since the last timecounter_read_delta(): */ - cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask; - - /* convert to nanoseconds: */ - ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta); - - /* update time stamp of timecounter_read_delta() call: */ - tc->cycle_last = cycle_now; - - return ns_offset; -} - -u64 timecounter_read(struct timecounter *tc) -{ - u64 nsec; - - /* increment time by nanoseconds since last call */ - nsec = timecounter_read_delta(tc); - nsec += tc->nsec; - tc->nsec = nsec; - - return nsec; -} -EXPORT_SYMBOL_GPL(timecounter_read); - -u64 timecounter_cyc2time(struct timecounter *tc, - cycle_t cycle_tstamp) -{ - u64 cycle_delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask; - u64 nsec; - - /* - * Instead of always treating cycle_tstamp as more recent - * than tc->cycle_last, detect when it is too far in the - * future and treat it as old time stamp instead. 
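
In operation the pattern mirrors what the mlx4 timestamping code does: some periodic context calls timecounter_read() at least twice per counter wrap period so the wrap detection keeps working, and raw hardware stamps are converted with timecounter_cyc2time(). A sketch, reusing the hypothetical foo_tc from above and assuming the driver serializes access with its own lock:

	static DEFINE_SPINLOCK(foo_clock_lock);

	/* called from a watchdog, at least twice per counter wrap period */
	static void foo_overflow_check(void)
	{
		unsigned long flags;

		spin_lock_irqsave(&foo_clock_lock, flags);
		timecounter_read(&foo_tc);
		spin_unlock_irqrestore(&foo_clock_lock, flags);
	}

	/* convert a raw hardware timestamp (e.g. from an RX descriptor) */
	static u64 foo_hwtstamp_to_ns(cycle_t raw)
	{
		unsigned long flags;
		u64 ns;

		spin_lock_irqsave(&foo_clock_lock, flags);
		ns = timecounter_cyc2time(&foo_tc, raw);
		spin_unlock_irqrestore(&foo_clock_lock, flags);

		return ns;
	}
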
- */ - if (cycle_delta > tc->cc->mask / 2) { - cycle_delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask; - nsec = tc->nsec - cyclecounter_cyc2ns(tc->cc, cycle_delta); - } else { - nsec = cyclecounter_cyc2ns(tc->cc, cycle_delta) + tc->nsec; - } - - return nsec; -} -EXPORT_SYMBOL_GPL(timecounter_cyc2time); - /** * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks * @mult: pointer to mult variable diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c new file mode 100644 index 000000000000..59a1ec3a57cb --- /dev/null +++ b/kernel/time/timecounter.c @@ -0,0 +1,95 @@ +/* + * linux/kernel/time/timecounter.c + * + * based on code that migrated away from + * linux/kernel/time/clocksource.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include + +void timecounter_init(struct timecounter *tc, + const struct cyclecounter *cc, + u64 start_tstamp) +{ + tc->cc = cc; + tc->cycle_last = cc->read(cc); + tc->nsec = start_tstamp; +} +EXPORT_SYMBOL_GPL(timecounter_init); + +/** + * timecounter_read_delta - get nanoseconds since last call of this function + * @tc: Pointer to time counter + * + * When the underlying cycle counter runs over, this will be handled + * correctly as long as it does not run over more than once between + * calls. + * + * The first call to this function for a new time counter initializes + * the time tracking and returns an undefined result. + */ +static u64 timecounter_read_delta(struct timecounter *tc) +{ + cycle_t cycle_now, cycle_delta; + u64 ns_offset; + + /* read cycle counter: */ + cycle_now = tc->cc->read(tc->cc); + + /* calculate the delta since the last timecounter_read_delta(): */ + cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask; + + /* convert to nanoseconds: */ + ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta); + + /* update time stamp of timecounter_read_delta() call: */ + tc->cycle_last = cycle_now; + + return ns_offset; +} + +u64 timecounter_read(struct timecounter *tc) +{ + u64 nsec; + + /* increment time by nanoseconds since last call */ + nsec = timecounter_read_delta(tc); + nsec += tc->nsec; + tc->nsec = nsec; + + return nsec; +} +EXPORT_SYMBOL_GPL(timecounter_read); + +u64 timecounter_cyc2time(struct timecounter *tc, + cycle_t cycle_tstamp) +{ + u64 cycle_delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask; + u64 nsec; + + /* + * Instead of always treating cycle_tstamp as more recent + * than tc->cycle_last, detect when it is too far in the + * future and treat it as old time stamp instead. 
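
The half-range test in the hunk below is easiest to see with small numbers; a worked example with a 16-bit counter (mask = 0xffff, so mask/2 = 0x7fff) and cycle_last = 0x0100, values chosen purely for illustration:

	/*
	 * cycle_tstamp = 0x0200: delta = (0x0200 - 0x0100) & 0xffff = 0x0100,
	 *     which is <= 0x7fff, so the stamp counts as 0x0100 cycles in
	 *     the future and cyc2ns(0x0100) is added to tc->nsec.
	 *
	 * cycle_tstamp = 0x0090: delta = (0x0090 - 0x0100) & 0xffff = 0xff90,
	 *     which is > 0x7fff, so the stamp is reinterpreted as
	 *     (0x0100 - 0x0090) & 0xffff = 0x0070 cycles in the PAST and
	 *     cyc2ns(0x0070) is subtracted from tc->nsec.
	 */
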
+ */ + if (cycle_delta > tc->cc->mask / 2) { + cycle_delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask; + nsec = tc->nsec - cyclecounter_cyc2ns(tc->cc, cycle_delta); + } else { + nsec = cyclecounter_cyc2ns(tc->cc, cycle_delta) + tc->nsec; + } + + return nsec; +} +EXPORT_SYMBOL_GPL(timecounter_cyc2time); diff --git a/sound/pci/hda/hda_priv.h b/sound/pci/hda/hda_priv.h index 166e3e84b963..daf458299753 100644 --- a/sound/pci/hda/hda_priv.h +++ b/sound/pci/hda/hda_priv.h @@ -15,7 +15,7 @@ #ifndef __SOUND_HDA_PRIV_H #define __SOUND_HDA_PRIV_H -#include +#include #include #include -- cgit v1.2.3 From 796c1efd6fa0ed696d550b68f4410ab1a1749d01 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Sun, 21 Dec 2014 19:46:57 +0100 Subject: timecounter: provide a helper function to shift the time. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some PTP Hardware Clock drivers use a struct timecounter to represent their clock. To adjust the time by a given offset, these drivers all perform a two step read/write of their timecounter. However, it is better and simpler just to adjust the offset in one step. This patch introduces a little routine to help drivers implement the adjtime method. Suggested-by: Janusz Użycki Signed-off-by: Richard Cochran Signed-off-by: David S. Miller --- include/linux/timecounter.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/timecounter.h b/include/linux/timecounter.h index 146f07a6651b..af3dfa4e90f0 100644 --- a/include/linux/timecounter.h +++ b/include/linux/timecounter.h @@ -78,6 +78,15 @@ static inline u64 cyclecounter_cyc2ns(const struct cyclecounter *cc, return ret; } +/** + * timecounter_adjtime - Shifts the time of the clock. + * @delta: Desired change in nanoseconds. + */ +static inline void timecounter_adjtime(struct timecounter *tc, s64 delta) +{ + tc->nsec += delta; +} + /** * timecounter_init - initialize a time counter * @tc: Pointer to time counter which is to be initialized/reset -- cgit v1.2.3 From 2eebdde6528a722fbf8e2cffcf7aa52cbb4c2de0 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Sun, 21 Dec 2014 19:47:06 +0100 Subject: timecounter: keep track of accumulated fractional nanoseconds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current timecounter implementation will drop a variable amount of resolution, depending on the magnitude of the time delta. In other words, reading the clock too often or too close to a time stamp conversion will introduce errors into the time values. This patch fixes the issue by introducing a fractional nanosecond field that accumulates the low order bits. Reported-by: Janusz Użycki Signed-off-by: Richard Cochran Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx4/en_clock.c | 4 ++-- include/linux/timecounter.h | 19 ++++++++++------ kernel/time/timecounter.c | 31 +++++++++++++++++++++------ virt/kvm/arm/arch_timer.c | 3 ++- 4 files changed, 40 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/en_clock.c b/drivers/net/ethernet/mellanox/mlx4/en_clock.c index df35d0e1b899..e9cce4f72b24 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_clock.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_clock.c @@ -240,7 +240,7 @@ void mlx4_en_init_timestamp(struct mlx4_en_dev *mdev) { struct mlx4_dev *dev = mdev->dev; unsigned long flags; - u64 ns; + u64 ns, zero = 0; rwlock_init(&mdev->clock_lock); @@ -265,7 +265,7 @@ void mlx4_en_init_timestamp(struct mlx4_en_dev *mdev) /* Calculate period in seconds to call the overflow watchdog - to make * sure counter is checked at least once every wrap around. */ - ns = cyclecounter_cyc2ns(&mdev->cycles, mdev->cycles.mask); + ns = cyclecounter_cyc2ns(&mdev->cycles, mdev->cycles.mask, zero, &zero); do_div(ns, NSEC_PER_SEC / 2 / HZ); mdev->overflow_period = ns; diff --git a/include/linux/timecounter.h b/include/linux/timecounter.h index af3dfa4e90f0..74f45496e6d1 100644 --- a/include/linux/timecounter.h +++ b/include/linux/timecounter.h @@ -55,27 +55,32 @@ struct cyclecounter { * @cycle_last: most recent cycle counter value seen by * timecounter_read() * @nsec: continuously increasing count + * @mask: bit mask for maintaining the 'frac' field + * @frac: accumulated fractional nanoseconds */ struct timecounter { const struct cyclecounter *cc; cycle_t cycle_last; u64 nsec; + u64 mask; + u64 frac; }; /** * cyclecounter_cyc2ns - converts cycle counter cycles to nanoseconds * @cc: Pointer to cycle counter. * @cycles: Cycles - * - * XXX - This could use some mult_lxl_ll() asm optimization. Same code - * as in cyc2ns, but with unsigned result. + * @mask: bit mask for maintaining the 'frac' field + * @frac: pointer to storage for the fractional nanoseconds. */ static inline u64 cyclecounter_cyc2ns(const struct cyclecounter *cc, - cycle_t cycles) + cycle_t cycles, u64 mask, u64 *frac) { - u64 ret = (u64)cycles; - ret = (ret * cc->mult) >> cc->shift; - return ret; + u64 ns = (u64) cycles; + + ns = (ns * cc->mult) + *frac; + *frac = ns & mask; + return ns >> cc->shift; } /** diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c index 59a1ec3a57cb..4687b3104bae 100644 --- a/kernel/time/timecounter.c +++ b/kernel/time/timecounter.c @@ -25,6 +25,8 @@ void timecounter_init(struct timecounter *tc, tc->cc = cc; tc->cycle_last = cc->read(cc); tc->nsec = start_tstamp; + tc->mask = (1ULL << cc->shift) - 1; + tc->frac = 0; } EXPORT_SYMBOL_GPL(timecounter_init); @@ -51,7 +53,8 @@ static u64 timecounter_read_delta(struct timecounter *tc) cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask; /* convert to nanoseconds: */ - ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta); + ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta, + tc->mask, &tc->frac); /* update time stamp of timecounter_read_delta() call: */ tc->cycle_last = cycle_now; @@ -72,22 +75,36 @@ u64 timecounter_read(struct timecounter *tc) } EXPORT_SYMBOL_GPL(timecounter_read); +/* + * This is like cyclecounter_cyc2ns(), but it is used for computing a + * time previous to the time stored in the cycle counter. 
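
The gain from the carried remainder is easiest to see numerically. Assume illustrative scaling factors mult = 1000 and shift = 10, i.e. one cycle is 1000/1024 ≈ 0.977 ns, and convert 5 cycles per call:

	/*
	 * 1st call: ns = 5 * 1000 + 0   = 5000 -> returns 5000 >> 10 = 4 ns,
	 *           frac = 5000 & 1023  = 904
	 * 2nd call: ns = 5 * 1000 + 904 = 5904 -> returns 5904 >> 10 = 5 ns,
	 *           frac = 5904 & 1023  = 784
	 *
	 * The long-run average converges on the exact 4.883 ns per call.
	 * Without the carried fraction every call would truncate to 4 ns,
	 * silently losing roughly 0.9 ns per conversion.
	 */
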
+ */ +static u64 cc_cyc2ns_backwards(const struct cyclecounter *cc, + cycle_t cycles, u64 mask, u64 frac) +{ + u64 ns = (u64) cycles; + + ns = ((ns * cc->mult) - frac) >> cc->shift; + + return ns; +} + u64 timecounter_cyc2time(struct timecounter *tc, cycle_t cycle_tstamp) { - u64 cycle_delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask; - u64 nsec; + u64 delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask; + u64 nsec = tc->nsec, frac = tc->frac; /* * Instead of always treating cycle_tstamp as more recent * than tc->cycle_last, detect when it is too far in the * future and treat it as old time stamp instead. */ - if (cycle_delta > tc->cc->mask / 2) { - cycle_delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask; - nsec = tc->nsec - cyclecounter_cyc2ns(tc->cc, cycle_delta); + if (delta > tc->cc->mask / 2) { + delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask; + nsec -= cc_cyc2ns_backwards(tc->cc, delta, tc->mask, frac); } else { - nsec = cyclecounter_cyc2ns(tc->cc, cycle_delta) + tc->nsec; + nsec += cyclecounter_cyc2ns(tc->cc, delta, tc->mask, &frac); } return nsec; diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 1c0772b340d8..6e54f3542126 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -152,7 +152,8 @@ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) return; } - ns = cyclecounter_cyc2ns(timecounter->cc, cval - now); + ns = cyclecounter_cyc2ns(timecounter->cc, cval - now, timecounter->mask, + &timecounter->frac); timer_arm(timer, ns); } -- cgit v1.2.3 From dd450777990baae668c1143064f2f234dbab1b9b Mon Sep 17 00:00:00 2001 From: Dmitry Eremin-Solenikov Date: Wed, 24 Dec 2014 01:14:14 +0300 Subject: arm: sa1100: move irda header to linux/platform_data In the end asm/mach/irda.h header is not used by anybody except sa1100. Move the header to the platform data includes dir and rename it to irda-sa11x0.h. Signed-off-by: Dmitry Eremin-Solenikov Signed-off-by: David S. Miller --- arch/arm/include/asm/mach/irda.h | 20 -------------------- arch/arm/mach-sa1100/assabet.c | 2 +- arch/arm/mach-sa1100/collie.c | 2 +- arch/arm/mach-sa1100/h3100.c | 2 +- arch/arm/mach-sa1100/h3600.c | 2 +- drivers/net/irda/sa1100_ir.c | 2 +- include/linux/platform_data/irda-sa11x0.h | 20 ++++++++++++++++++++ 7 files changed, 25 insertions(+), 25 deletions(-) delete mode 100644 arch/arm/include/asm/mach/irda.h create mode 100644 include/linux/platform_data/irda-sa11x0.h (limited to 'include/linux') diff --git a/arch/arm/include/asm/mach/irda.h b/arch/arm/include/asm/mach/irda.h deleted file mode 100644 index 38f77b5e56cf..000000000000 --- a/arch/arm/include/asm/mach/irda.h +++ /dev/null @@ -1,20 +0,0 @@ -/* - * arch/arm/include/asm/mach/irda.h - * - * Copyright (C) 2004 Russell King. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ -#ifndef __ASM_ARM_MACH_IRDA_H -#define __ASM_ARM_MACH_IRDA_H - -struct irda_platform_data { - int (*startup)(struct device *); - void (*shutdown)(struct device *); - int (*set_power)(struct device *, unsigned int state); - void (*set_speed)(struct device *, unsigned int speed); -}; - -#endif diff --git a/arch/arm/mach-sa1100/assabet.c b/arch/arm/mach-sa1100/assabet.c index 7dd894ece9ae..d28ecb9ef172 100644 --- a/arch/arm/mach-sa1100/assabet.c +++ b/arch/arm/mach-sa1100/assabet.c @@ -37,7 +37,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/arm/mach-sa1100/collie.c b/arch/arm/mach-sa1100/collie.c index b90c7d828391..7fcbe3d119c7 100644 --- a/arch/arm/mach-sa1100/collie.c +++ b/arch/arm/mach-sa1100/collie.c @@ -43,7 +43,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/arm/mach-sa1100/h3100.c b/arch/arm/mach-sa1100/h3100.c index 3c43219bc881..c6b412054a3c 100644 --- a/arch/arm/mach-sa1100/h3100.c +++ b/arch/arm/mach-sa1100/h3100.c @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include diff --git a/arch/arm/mach-sa1100/h3600.c b/arch/arm/mach-sa1100/h3600.c index 5be54c214c7c..118338efd790 100644 --- a/arch/arm/mach-sa1100/h3600.c +++ b/arch/arm/mach-sa1100/h3600.c @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include diff --git a/drivers/net/irda/sa1100_ir.c b/drivers/net/irda/sa1100_ir.c index 7b17fa2114e1..b6e44ff4e373 100644 --- a/drivers/net/irda/sa1100_ir.c +++ b/drivers/net/irda/sa1100_ir.c @@ -38,7 +38,7 @@ #include #include -#include +#include static int power_level = 3; static int tx_lpm; diff --git a/include/linux/platform_data/irda-sa11x0.h b/include/linux/platform_data/irda-sa11x0.h new file mode 100644 index 000000000000..38f77b5e56cf --- /dev/null +++ b/include/linux/platform_data/irda-sa11x0.h @@ -0,0 +1,20 @@ +/* + * arch/arm/include/asm/mach/irda.h + * + * Copyright (C) 2004 Russell King. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef __ASM_ARM_MACH_IRDA_H +#define __ASM_ARM_MACH_IRDA_H + +struct irda_platform_data { + int (*startup)(struct device *); + void (*shutdown)(struct device *); + int (*set_power)(struct device *, unsigned int state); + void (*set_speed)(struct device *, unsigned int speed); +}; + +#endif -- cgit v1.2.3 From de40ed31b3c577cefd7b54972365a272ecbe9dd6 Mon Sep 17 00:00:00 2001 From: Nimrod Andy Date: Wed, 24 Dec 2014 17:30:39 +0800 Subject: net: fec: add Wake-on-LAN support Support for Wake-on-LAN using Magic Packet. ENET IP supports sleep mode in low power status, when system enter suspend status, Magic packet can wake up system even if all SOC clocks are gate. The patch doing below things: - flagging the device as a wakeup source for the system, as well as its Wake-on-LAN interrupt - prepare the hardware for entering WoL mode - add standard ethtool WOL interface - enable the ENET interrupt to wake us Tested on i.MX6q/dl sabresd, sabreauto boards, i.MX6SX arm2 boards. Signed-off-by: Fugang Duan Signed-off-by: David S. 
Miller --- Documentation/devicetree/bindings/net/fsl-fec.txt | 2 + drivers/net/ethernet/freescale/fec.h | 2 + drivers/net/ethernet/freescale/fec_main.c | 104 +++++++++++++++++++--- include/linux/fec.h | 1 + 4 files changed, 99 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/net/fsl-fec.txt b/Documentation/devicetree/bindings/net/fsl-fec.txt index 0c8775c45798..a9eb611bee68 100644 --- a/Documentation/devicetree/bindings/net/fsl-fec.txt +++ b/Documentation/devicetree/bindings/net/fsl-fec.txt @@ -22,6 +22,8 @@ Optional properties: - fsl,num-rx-queues : The property is valid for enet-avb IP, which supports hw multi queues. Should specify the rx queue number, otherwise set rx queue number to 1. +- fsl,magic-packet : If present, indicates that the hardware supports waking + up via magic packet. Optional subnodes: - mdio : specifies the mdio bus in the FEC, used as a container for phy nodes diff --git a/drivers/net/ethernet/freescale/fec.h b/drivers/net/ethernet/freescale/fec.h index df8bbddaeb37..d77a96fdf1dd 100644 --- a/drivers/net/ethernet/freescale/fec.h +++ b/drivers/net/ethernet/freescale/fec.h @@ -357,6 +357,7 @@ struct bufdesc_ex { #define FEC_ENET_RXB ((uint)0x01000000) /* A buffer was received */ #define FEC_ENET_MII ((uint)0x00800000) /* MII interrupt */ #define FEC_ENET_EBERR ((uint)0x00400000) /* SDMA bus error */ +#define FEC_ENET_WAKEUP ((uint)0x00020000) /* Wakeup request */ #define FEC_ENET_TXF (FEC_ENET_TXF_0 | FEC_ENET_TXF_1 | FEC_ENET_TXF_2) #define FEC_ENET_RXF (FEC_ENET_RXF_0 | FEC_ENET_RXF_1 | FEC_ENET_RXF_2) #define FEC_ENET_TS_AVAIL ((uint)0x00010000) @@ -512,6 +513,7 @@ struct fec_enet_private { int irq[FEC_IRQ_NUM]; bool bufdesc_ex; int pause_flag; + int wol_flag; u32 quirks; struct napi_struct napi; diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 5ebdf8dc8a31..49cd358c30fa 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -187,6 +187,9 @@ MODULE_PARM_DESC(macaddr, "FEC Ethernet MAC address"); #define FEC_MMFR_RA(v) ((v & 0x1f) << 18) #define FEC_MMFR_TA (2 << 16) #define FEC_MMFR_DATA(v) (v & 0xffff) +/* FEC ECR bits definition */ +#define FEC_ECR_MAGICEN (1 << 2) +#define FEC_ECR_SLEEP (1 << 3) #define FEC_MII_TIMEOUT 30000 /* us */ @@ -195,6 +198,9 @@ MODULE_PARM_DESC(macaddr, "FEC Ethernet MAC address"); #define FEC_PAUSE_FLAG_AUTONEG 0x1 #define FEC_PAUSE_FLAG_ENABLE 0x2 +#define FEC_WOL_HAS_MAGIC_PACKET (0x1 << 0) +#define FEC_WOL_FLAG_ENABLE (0x1 << 1) +#define FEC_WOL_FLAG_SLEEP_ON (0x1 << 2) #define COPYBREAK_DEFAULT 256 @@ -1089,7 +1095,9 @@ static void fec_stop(struct net_device *ndev) { struct fec_enet_private *fep = netdev_priv(ndev); + struct fec_platform_data *pdata = fep->pdev->dev.platform_data; u32 rmii_mode = readl(fep->hwp + FEC_R_CNTRL) & (1 << 8); + u32 val; /* We cannot expect a graceful transmit stop without link !!! */ if (fep->link) { @@ -1103,17 +1111,28 @@ fec_stop(struct net_device *ndev) * For i.MX6SX SOC, enet use AXI bus, we use disable MAC * instead of reset MAC itself. 
*/ - if (fep->quirks & FEC_QUIRK_HAS_AVB) { - writel(0, fep->hwp + FEC_ECNTRL); + if (!(fep->wol_flag & FEC_WOL_FLAG_SLEEP_ON)) { + if (fep->quirks & FEC_QUIRK_HAS_AVB) { + writel(0, fep->hwp + FEC_ECNTRL); + } else { + writel(1, fep->hwp + FEC_ECNTRL); + udelay(10); + } + writel(FEC_DEFAULT_IMASK, fep->hwp + FEC_IMASK); } else { - writel(1, fep->hwp + FEC_ECNTRL); - udelay(10); + writel(FEC_DEFAULT_IMASK | FEC_ENET_WAKEUP, fep->hwp + FEC_IMASK); + val = readl(fep->hwp + FEC_ECNTRL); + val |= (FEC_ECR_MAGICEN | FEC_ECR_SLEEP); + writel(val, fep->hwp + FEC_ECNTRL); + + if (pdata && pdata->sleep_mode_enable) + pdata->sleep_mode_enable(true); } writel(fep->phy_speed, fep->hwp + FEC_MII_SPEED); - writel(FEC_DEFAULT_IMASK, fep->hwp + FEC_IMASK); /* We have to keep ENET enabled to have MII interrupt stay working */ - if (fep->quirks & FEC_QUIRK_ENET_MAC) { + if (fep->quirks & FEC_QUIRK_ENET_MAC && + !(fep->wol_flag & FEC_WOL_FLAG_SLEEP_ON)) { writel(2, fep->hwp + FEC_ECNTRL); writel(rmii_mode, fep->hwp + FEC_R_CNTRL); } @@ -2427,6 +2446,44 @@ static int fec_enet_set_tunable(struct net_device *netdev, return ret; } +static void +fec_enet_get_wol(struct net_device *ndev, struct ethtool_wolinfo *wol) +{ + struct fec_enet_private *fep = netdev_priv(ndev); + + if (fep->wol_flag & FEC_WOL_HAS_MAGIC_PACKET) { + wol->supported = WAKE_MAGIC; + wol->wolopts = fep->wol_flag & FEC_WOL_FLAG_ENABLE ? WAKE_MAGIC : 0; + } else { + wol->supported = wol->wolopts = 0; + } +} + +static int +fec_enet_set_wol(struct net_device *ndev, struct ethtool_wolinfo *wol) +{ + struct fec_enet_private *fep = netdev_priv(ndev); + + if (!(fep->wol_flag & FEC_WOL_HAS_MAGIC_PACKET)) + return -EINVAL; + + if (wol->wolopts & ~WAKE_MAGIC) + return -EINVAL; + + device_set_wakeup_enable(&ndev->dev, wol->wolopts & WAKE_MAGIC); + if (device_may_wakeup(&ndev->dev)) { + fep->wol_flag |= FEC_WOL_FLAG_ENABLE; + if (fep->irq[0] > 0) + enable_irq_wake(fep->irq[0]); + } else { + fep->wol_flag &= (~FEC_WOL_FLAG_ENABLE); + if (fep->irq[0] > 0) + disable_irq_wake(fep->irq[0]); + } + + return 0; +} + static const struct ethtool_ops fec_enet_ethtool_ops = { .get_settings = fec_enet_get_settings, .set_settings = fec_enet_set_settings, @@ -2445,6 +2502,8 @@ static const struct ethtool_ops fec_enet_ethtool_ops = { .get_ts_info = fec_enet_get_ts_info, .get_tunable = fec_enet_get_tunable, .set_tunable = fec_enet_set_tunable, + .get_wol = fec_enet_get_wol, + .set_wol = fec_enet_set_wol, }; static int fec_enet_ioctl(struct net_device *ndev, struct ifreq *rq, int cmd) @@ -2705,6 +2764,9 @@ fec_enet_open(struct net_device *ndev) phy_start(fep->phy_dev); netif_tx_start_all_queues(ndev); + device_set_wakeup_enable(&ndev->dev, fep->wol_flag & + FEC_WOL_FLAG_ENABLE); + return 0; err_enet_mii_probe: @@ -3153,6 +3215,9 @@ fec_probe(struct platform_device *pdev) platform_set_drvdata(pdev, ndev); + if (of_get_property(np, "fsl,magic-packet", NULL)) + fep->wol_flag |= FEC_WOL_HAS_MAGIC_PACKET; + phy_node = of_parse_phandle(np, "phy-handle", 0); if (!phy_node && of_phy_is_fixed_link(np)) { ret = of_phy_register_fixed_link(np); @@ -3247,6 +3312,8 @@ fec_probe(struct platform_device *pdev) 0, pdev->name, ndev); if (ret) goto failed_irq; + + fep->irq[i] = irq; } init_completion(&fep->mdio_done); @@ -3263,6 +3330,9 @@ fec_probe(struct platform_device *pdev) if (ret) goto failed_register; + device_init_wakeup(&ndev->dev, fep->wol_flag & + FEC_WOL_HAS_MAGIC_PACKET); + if (fep->bufdesc_ex && fep->ptp_clock) netdev_info(ndev, "registered PHC device %d\n", fep->dev_id); @@ 
-3316,6 +3386,8 @@ static int __maybe_unused fec_suspend(struct device *dev) rtnl_lock(); if (netif_running(ndev)) { + if (fep->wol_flag & FEC_WOL_FLAG_ENABLE) + fep->wol_flag |= FEC_WOL_FLAG_SLEEP_ON; phy_stop(fep->phy_dev); napi_disable(&fep->napi); netif_tx_lock_bh(ndev); @@ -3323,11 +3395,12 @@ static int __maybe_unused fec_suspend(struct device *dev) netif_tx_unlock_bh(ndev); fec_stop(ndev); fec_enet_clk_enable(ndev, false); - pinctrl_pm_select_sleep_state(&fep->pdev->dev); + if (!(fep->wol_flag & FEC_WOL_FLAG_ENABLE)) + pinctrl_pm_select_sleep_state(&fep->pdev->dev); } rtnl_unlock(); - if (fep->reg_phy) + if (fep->reg_phy && !(fep->wol_flag & FEC_WOL_FLAG_ENABLE)) regulator_disable(fep->reg_phy); /* SOC supply clock to phy, when clock is disabled, phy link down @@ -3343,9 +3416,11 @@ static int __maybe_unused fec_resume(struct device *dev) { struct net_device *ndev = dev_get_drvdata(dev); struct fec_enet_private *fep = netdev_priv(ndev); + struct fec_platform_data *pdata = fep->pdev->dev.platform_data; int ret; + int val; - if (fep->reg_phy) { + if (fep->reg_phy && !(fep->wol_flag & FEC_WOL_FLAG_ENABLE)) { ret = regulator_enable(fep->reg_phy); if (ret) return ret; @@ -3353,12 +3428,21 @@ static int __maybe_unused fec_resume(struct device *dev) rtnl_lock(); if (netif_running(ndev)) { - pinctrl_pm_select_default_state(&fep->pdev->dev); ret = fec_enet_clk_enable(ndev, true); if (ret) { rtnl_unlock(); goto failed_clk; } + if (fep->wol_flag & FEC_WOL_FLAG_ENABLE) { + if (pdata && pdata->sleep_mode_enable) + pdata->sleep_mode_enable(false); + val = readl(fep->hwp + FEC_ECNTRL); + val &= ~(FEC_ECR_MAGICEN | FEC_ECR_SLEEP); + writel(val, fep->hwp + FEC_ECNTRL); + fep->wol_flag &= ~FEC_WOL_FLAG_SLEEP_ON; + } else { + pinctrl_pm_select_default_state(&fep->pdev->dev); + } fec_restart(ndev); netif_tx_lock_bh(ndev); netif_device_attach(ndev); diff --git a/include/linux/fec.h b/include/linux/fec.h index bcff455d1d53..1454a503622d 100644 --- a/include/linux/fec.h +++ b/include/linux/fec.h @@ -19,6 +19,7 @@ struct fec_platform_data { phy_interface_t phy; unsigned char mac[ETH_ALEN]; + void (*sleep_mode_enable)(int enabled); }; #endif -- cgit v1.2.3 From 9b174d88c257150562b0101fcc6cb6c3cb74275c Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Tue, 30 Dec 2014 19:10:15 -0800 Subject: net: Add Transparent Ethernet Bridging GRO support. Currently the only tunnel protocol that supports GRO with encapsulated Ethernet is VXLAN. This pulls out the Ethernet code into a proper layer so that it can be used by other tunnel protocols such as GRE and Geneve. Signed-off-by: Jesse Gross Signed-off-by: David S. 
Miller --- drivers/net/vxlan.c | 53 +++----------------------- include/linux/etherdevice.h | 4 ++ net/ethernet/eth.c | 92 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 102 insertions(+), 47 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 7fbd89fbe107..2ab0922af0b4 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -549,10 +549,7 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, struct sk_buff { struct sk_buff *p, **pp = NULL; struct vxlanhdr *vh, *vh2; - struct ethhdr *eh, *eh2; - unsigned int hlen, off_vx, off_eth; - const struct packet_offload *ptype; - __be16 type; + unsigned int hlen, off_vx; int flush = 1; off_vx = skb_gro_offset(skb); @@ -563,17 +560,6 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, struct sk_buff if (unlikely(!vh)) goto out; } - skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */ - skb_gro_postpull_rcsum(skb, vh, sizeof(struct vxlanhdr)); - - off_eth = skb_gro_offset(skb); - hlen = off_eth + sizeof(*eh); - eh = skb_gro_header_fast(skb, off_eth); - if (skb_gro_header_hard(skb, hlen)) { - eh = skb_gro_header_slow(skb, hlen, off_eth); - if (unlikely(!eh)) - goto out; - } flush = 0; @@ -582,28 +568,16 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, struct sk_buff continue; vh2 = (struct vxlanhdr *)(p->data + off_vx); - eh2 = (struct ethhdr *)(p->data + off_eth); - if (vh->vx_vni != vh2->vx_vni || compare_ether_header(eh, eh2)) { + if (vh->vx_vni != vh2->vx_vni) { NAPI_GRO_CB(p)->same_flow = 0; continue; } } - type = eh->h_proto; - - rcu_read_lock(); - ptype = gro_find_receive_by_type(type); - if (ptype == NULL) { - flush = 1; - goto out_unlock; - } - - skb_gro_pull(skb, sizeof(*eh)); /* pull inner eth header */ - skb_gro_postpull_rcsum(skb, eh, sizeof(*eh)); - pp = ptype->callbacks.gro_receive(head, skb); + skb_gro_pull(skb, sizeof(struct vxlanhdr)); + skb_gro_postpull_rcsum(skb, vh, sizeof(struct vxlanhdr)); + pp = eth_gro_receive(head, skb); -out_unlock: - rcu_read_unlock(); out: NAPI_GRO_CB(skb)->flush |= flush; @@ -612,24 +586,9 @@ out: static int vxlan_gro_complete(struct sk_buff *skb, int nhoff) { - struct ethhdr *eh; - struct packet_offload *ptype; - __be16 type; - int vxlan_len = sizeof(struct vxlanhdr) + sizeof(struct ethhdr); - int err = -ENOSYS; - udp_tunnel_gro_complete(skb, nhoff); - eh = (struct ethhdr *)(skb->data + nhoff + sizeof(struct vxlanhdr)); - type = eh->h_proto; - - rcu_read_lock(); - ptype = gro_find_complete_by_type(type); - if (ptype != NULL) - err = ptype->callbacks.gro_complete(skb, nhoff + vxlan_len); - - rcu_read_unlock(); - return err; + return eth_gro_complete(skb, nhoff + sizeof(struct vxlanhdr)); } /* Notify netdevs that UDP port started listening */ diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 41c891d05f04..1d869d185a0d 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -52,6 +52,10 @@ struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs, #define alloc_etherdev(sizeof_priv) alloc_etherdev_mq(sizeof_priv, 1) #define alloc_etherdev_mq(sizeof_priv, count) alloc_etherdev_mqs(sizeof_priv, count, count) +struct sk_buff **eth_gro_receive(struct sk_buff **head, + struct sk_buff *skb); +int eth_gro_complete(struct sk_buff *skb, int nhoff); + /* Reserved Ethernet Addresses per IEEE 802.1Q */ static const u8 eth_reserved_addr_base[ETH_ALEN] __aligned(2) = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 }; diff --git 
a/net/ethernet/eth.c b/net/ethernet/eth.c index 33a140e15834..238f38d21641 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -424,3 +424,95 @@ ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len) return scnprintf(buf, PAGE_SIZE, "%*phC\n", len, addr); } EXPORT_SYMBOL(sysfs_format_mac); + +struct sk_buff **eth_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + struct sk_buff *p, **pp = NULL; + struct ethhdr *eh, *eh2; + unsigned int hlen, off_eth; + const struct packet_offload *ptype; + __be16 type; + int flush = 1; + + off_eth = skb_gro_offset(skb); + hlen = off_eth + sizeof(*eh); + eh = skb_gro_header_fast(skb, off_eth); + if (skb_gro_header_hard(skb, hlen)) { + eh = skb_gro_header_slow(skb, hlen, off_eth); + if (unlikely(!eh)) + goto out; + } + + flush = 0; + + for (p = *head; p; p = p->next) { + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + eh2 = (struct ethhdr *)(p->data + off_eth); + if (compare_ether_header(eh, eh2)) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + } + + type = eh->h_proto; + + rcu_read_lock(); + ptype = gro_find_receive_by_type(type); + if (ptype == NULL) { + flush = 1; + goto out_unlock; + } + + skb_gro_pull(skb, sizeof(*eh)); + skb_gro_postpull_rcsum(skb, eh, sizeof(*eh)); + pp = ptype->callbacks.gro_receive(head, skb); + +out_unlock: + rcu_read_unlock(); +out: + NAPI_GRO_CB(skb)->flush |= flush; + + return pp; +} +EXPORT_SYMBOL(eth_gro_receive); + +int eth_gro_complete(struct sk_buff *skb, int nhoff) +{ + struct ethhdr *eh = (struct ethhdr *)(skb->data + nhoff); + __be16 type = eh->h_proto; + struct packet_offload *ptype; + int err = -ENOSYS; + + if (skb->encapsulation) + skb_set_inner_mac_header(skb, nhoff); + + rcu_read_lock(); + ptype = gro_find_complete_by_type(type); + if (ptype != NULL) + err = ptype->callbacks.gro_complete(skb, nhoff + + sizeof(struct ethhdr)); + + rcu_read_unlock(); + return err; +} +EXPORT_SYMBOL(eth_gro_complete); + +static struct packet_offload eth_packet_offload __read_mostly = { + .type = cpu_to_be16(ETH_P_TEB), + .callbacks = { + .gro_receive = eth_gro_receive, + .gro_complete = eth_gro_complete, + }, +}; + +static int __init eth_offload_init(void) +{ + dev_add_offload(ð_packet_offload); + + return 0; +} + +fs_initcall(eth_offload_init); -- cgit v1.2.3 From 1891172aa5c32f08ad9931b794edd71e91a4a527 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Fri, 2 Jan 2015 20:22:03 +0100 Subject: timecounter: provide a macro to initialize the cyclecounter mask field. There is no need for users of the timecounter/cyclecounter code to include clocksource.h just for a single macro. Signed-off-by: Richard Cochran Signed-off-by: David S. Miller --- include/linux/timecounter.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/timecounter.h b/include/linux/timecounter.h index 74f45496e6d1..4382035a75bb 100644 --- a/include/linux/timecounter.h +++ b/include/linux/timecounter.h @@ -19,6 +19,9 @@ #include +/* simplify initialization of mask field */ +#define CYCLECOUNTER_MASK(bits) (cycle_t)((bits) < 64 ? ((1ULL<<(bits))-1) : -1) + /** * struct cyclecounter - hardware abstraction for a free running counter * Provides completely state-free accessors to the underlying hardware. 
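
With the new macro, the hypothetical cyclecounter from the earlier sketch can drop its last dependency on clocksource.h; only the mask initializer changes:

	static struct cyclecounter foo_cc = {
		.read  = foo_cc_read,			/* hypothetical, as before */
		.mask  = CYCLECOUNTER_MASK(48),		/* previously CLOCKSOURCE_MASK(48) */
		.shift = 21,				/* .mult still set at probe time */
	};
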
@@ -29,7 +32,7 @@ * @read: returns the current cycle value * @mask: bitmask for two's complement * subtraction of non 64 bit counters, - * see CLOCKSOURCE_MASK() helper macro + * see CYCLECOUNTER_MASK() helper macro * @mult: cycle to nanosecond multiplier * @shift: cycle to nanosecond divisor (power of two) */ -- cgit v1.2.3 From c761d96b079e99d106fa4064e730ef7d0f312f9d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 2 Jan 2015 15:05:12 -0700 Subject: blk-mq: export blk_mq_freeze_queue() Commit b4c6a028774b exported the start and unfreeze, but we need the regular blk_mq_freeze_queue() for the loop conversion. Signed-off-by: Jens Axboe --- block/blk-mq.c | 1 + include/linux/blk-mq.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/block/blk-mq.c b/block/blk-mq.c index 1a41d7aefbd5..a7d4a988516f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -136,6 +136,7 @@ void blk_mq_freeze_queue(struct request_queue *q) blk_mq_freeze_queue_start(q); blk_mq_freeze_queue_wait(q); } +EXPORT_SYMBOL_GPL(blk_mq_freeze_queue); void blk_mq_unfreeze_queue(struct request_queue *q) { diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 3b43f509432c..5b6500c77ed2 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -212,6 +212,7 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn, void *priv); +void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_unfreeze_queue(struct request_queue *q); void blk_mq_freeze_queue_start(struct request_queue *q); -- cgit v1.2.3 From 8d24c0b43125ec26cc80e04588477a9a2afc025c Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Fri, 2 Jan 2015 23:00:14 +0100 Subject: rhashtable: Do hashing inside of rhashtable_lookup_compare() Hash the key inside of rhashtable_lookup_compare() like rhashtable_lookup() does. This allows to simplify the hashing functions and keep them private. Signed-off-by: Thomas Graf Cc: netfilter-devel@vger.kernel.org Signed-off-by: David S. 
Miller --- include/linux/rhashtable.h | 5 +-- lib/rhashtable.c | 91 +++++++++++++++------------------------------- net/netfilter/nft_hash.c | 46 ++++++++++++++--------- net/netlink/af_netlink.c | 5 +-- 4 files changed, 61 insertions(+), 86 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index b93fd89b2e5e..1b51221c6bbd 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -96,9 +96,6 @@ static inline int lockdep_rht_mutex_is_held(const struct rhashtable *ht) int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params); -u32 rhashtable_hashfn(const struct rhashtable *ht, const void *key, u32 len); -u32 rhashtable_obj_hashfn(const struct rhashtable *ht, void *ptr); - void rhashtable_insert(struct rhashtable *ht, struct rhash_head *node); bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *node); void rhashtable_remove_pprev(struct rhashtable *ht, struct rhash_head *obj, @@ -111,7 +108,7 @@ int rhashtable_expand(struct rhashtable *ht); int rhashtable_shrink(struct rhashtable *ht); void *rhashtable_lookup(const struct rhashtable *ht, const void *key); -void *rhashtable_lookup_compare(const struct rhashtable *ht, u32 hash, +void *rhashtable_lookup_compare(const struct rhashtable *ht, const void *key, bool (*compare)(void *, void *), void *arg); void rhashtable_destroy(const struct rhashtable *ht); diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 6c3c723e902b..1ee0eb636ca3 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -42,69 +42,39 @@ static void *rht_obj(const struct rhashtable *ht, const struct rhash_head *he) return (void *) he - ht->p.head_offset; } -static u32 __hashfn(const struct rhashtable *ht, const void *key, - u32 len, u32 hsize) +static u32 rht_bucket_index(const struct bucket_table *tbl, u32 hash) { - u32 h; - - h = ht->p.hashfn(key, len, ht->p.hash_rnd); - - return h & (hsize - 1); -} - -/** - * rhashtable_hashfn - compute hash for key of given length - * @ht: hash table to compute for - * @key: pointer to key - * @len: length of key - * - * Computes the hash value using the hash function provided in the 'hashfn' - * of struct rhashtable_params. The returned value is guaranteed to be - * smaller than the number of buckets in the hash table. - */ -u32 rhashtable_hashfn(const struct rhashtable *ht, const void *key, u32 len) -{ - struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); - - return __hashfn(ht, key, len, tbl->size); + return hash & (tbl->size - 1); } -EXPORT_SYMBOL_GPL(rhashtable_hashfn); -static u32 obj_hashfn(const struct rhashtable *ht, const void *ptr, u32 hsize) +static u32 obj_raw_hashfn(const struct rhashtable *ht, const void *ptr) { - if (unlikely(!ht->p.key_len)) { - u32 h; - - h = ht->p.obj_hashfn(ptr, ht->p.hash_rnd); + u32 hash; - return h & (hsize - 1); - } + if (unlikely(!ht->p.key_len)) + hash = ht->p.obj_hashfn(ptr, ht->p.hash_rnd); + else + hash = ht->p.hashfn(ptr + ht->p.key_offset, ht->p.key_len, + ht->p.hash_rnd); - return __hashfn(ht, ptr + ht->p.key_offset, ht->p.key_len, hsize); + return hash; } -/** - * rhashtable_obj_hashfn - compute hash for hashed object - * @ht: hash table to compute for - * @ptr: pointer to hashed object - * - * Computes the hash value using the hash function `hashfn` respectively - * 'obj_hashfn' depending on whether the hash table is set up to work with - * a fixed length key. The returned value is guaranteed to be smaller than - * the number of buckets in the hash table. 
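
Since the table now hashes keys internally, users describe their layout once at init time and never touch bucket indices again. A rough sketch against this era of the API (struct foo_obj and its fields are hypothetical; the decision callbacks are the exported helpers declared in rhashtable.h):

	#include <linux/jhash.h>
	#include <linux/rhashtable.h>

	struct foo_obj {
		u32			key;
		struct net		*net;	/* secondary match, used further below */
		struct rhash_head	node;
	};

	static struct rhashtable foo_ht;

	static struct rhashtable_params foo_ht_params = {
		.head_offset	 = offsetof(struct foo_obj, node),
		.key_offset	 = offsetof(struct foo_obj, key),
		.key_len	 = sizeof(u32),
		.hashfn		 = jhash,
		.grow_decision	 = rht_grow_above_75,
		.shrink_decision = rht_shrink_below_30,
		/* plus a .mutex_is_held hook under CONFIG_PROVE_LOCKING */
	};

	static int foo_ht_init(void)
	{
		return rhashtable_init(&foo_ht, &foo_ht_params);
	}
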
- */ -u32 rhashtable_obj_hashfn(const struct rhashtable *ht, void *ptr) +static u32 key_hashfn(const struct rhashtable *ht, const void *key, u32 len) { struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); + u32 hash; + + hash = ht->p.hashfn(key, len, ht->p.hash_rnd); - return obj_hashfn(ht, ptr, tbl->size); + return rht_bucket_index(tbl, hash); } -EXPORT_SYMBOL_GPL(rhashtable_obj_hashfn); static u32 head_hashfn(const struct rhashtable *ht, - const struct rhash_head *he, u32 hsize) + const struct bucket_table *tbl, + const struct rhash_head *he) { - return obj_hashfn(ht, rht_obj(ht, he), hsize); + return rht_bucket_index(tbl, obj_raw_hashfn(ht, rht_obj(ht, he))); } static struct bucket_table *bucket_table_alloc(size_t nbuckets) @@ -170,9 +140,9 @@ static void hashtable_chain_unzip(const struct rhashtable *ht, * reaches a node that doesn't hash to the same bucket as the * previous node p. Call the previous node p; */ - h = head_hashfn(ht, p, new_tbl->size); + h = head_hashfn(ht, new_tbl, p); rht_for_each(he, p->next, ht) { - if (head_hashfn(ht, he, new_tbl->size) != h) + if (head_hashfn(ht, new_tbl, he) != h) break; p = he; } @@ -184,7 +154,7 @@ static void hashtable_chain_unzip(const struct rhashtable *ht, next = NULL; if (he) { rht_for_each(he, he->next, ht) { - if (head_hashfn(ht, he, new_tbl->size) == h) { + if (head_hashfn(ht, new_tbl, he) == h) { next = he; break; } @@ -237,9 +207,9 @@ int rhashtable_expand(struct rhashtable *ht) * single imprecise chain. */ for (i = 0; i < new_tbl->size; i++) { - h = i & (old_tbl->size - 1); + h = rht_bucket_index(old_tbl, i); rht_for_each(he, old_tbl->buckets[h], ht) { - if (head_hashfn(ht, he, new_tbl->size) == i) { + if (head_hashfn(ht, new_tbl, he) == i) { RCU_INIT_POINTER(new_tbl->buckets[i], he); break; } @@ -353,7 +323,7 @@ void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj) ASSERT_RHT_MUTEX(ht); - hash = head_hashfn(ht, obj, tbl->size); + hash = head_hashfn(ht, tbl, obj); RCU_INIT_POINTER(obj->next, tbl->buckets[hash]); rcu_assign_pointer(tbl->buckets[hash], obj); ht->nelems++; @@ -413,7 +383,7 @@ bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj) ASSERT_RHT_MUTEX(ht); - h = head_hashfn(ht, obj, tbl->size); + h = head_hashfn(ht, tbl, obj); pprev = &tbl->buckets[h]; rht_for_each(he, tbl->buckets[h], ht) { @@ -452,7 +422,7 @@ void *rhashtable_lookup(const struct rhashtable *ht, const void *key) BUG_ON(!ht->p.key_len); - h = __hashfn(ht, key, ht->p.key_len, tbl->size); + h = key_hashfn(ht, key, ht->p.key_len); rht_for_each_rcu(he, tbl->buckets[h], ht) { if (memcmp(rht_obj(ht, he) + ht->p.key_offset, key, ht->p.key_len)) @@ -467,7 +437,7 @@ EXPORT_SYMBOL_GPL(rhashtable_lookup); /** * rhashtable_lookup_compare - search hash table with compare function * @ht: hash table - * @hash: hash value of desired entry + * @key: the pointer to the key * @compare: compare function, must return true on match * @arg: argument passed on to compare function * @@ -479,15 +449,14 @@ EXPORT_SYMBOL_GPL(rhashtable_lookup); * * Returns the first entry on which the compare function returned true. 
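
Continuing the foo_obj sketch, the new key-based signature makes a compare-function lookup read like the netlink and nft_hash conversions in this patch: the caller hands over the key for hashing plus an argument block for any extra matching criteria (all names hypothetical):

	struct foo_lookup_arg {
		struct net *net;
		u32 key;
	};

	static bool foo_compare(void *ptr, void *arg)
	{
		struct foo_obj *obj = ptr;
		struct foo_lookup_arg *x = arg;

		return obj->key == x->key && net_eq(obj->net, x->net);
	}

	/* caller holds rcu_read_lock() */
	static struct foo_obj *foo_lookup(struct rhashtable *ht,
					  struct net *net, u32 key)
	{
		struct foo_lookup_arg arg = { .net = net, .key = key };

		return rhashtable_lookup_compare(ht, &key, &foo_compare, &arg);
	}
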
*/ -void *rhashtable_lookup_compare(const struct rhashtable *ht, u32 hash, +void *rhashtable_lookup_compare(const struct rhashtable *ht, const void *key, bool (*compare)(void *, void *), void *arg) { const struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); struct rhash_head *he; + u32 hash; - if (unlikely(hash >= tbl->size)) - return NULL; - + hash = key_hashfn(ht, key, ht->p.key_len); rht_for_each_rcu(he, tbl->buckets[hash], ht) { if (!compare(rht_obj(ht, he), arg)) continue; diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index 1e316ce4cb5d..614ee099ba36 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -94,28 +94,40 @@ static void nft_hash_remove(const struct nft_set *set, kfree(he); } +struct nft_compare_arg { + const struct nft_set *set; + struct nft_set_elem *elem; +}; + +static bool nft_hash_compare(void *ptr, void *arg) +{ + struct nft_hash_elem *he = ptr; + struct nft_compare_arg *x = arg; + + if (!nft_data_cmp(&he->key, &x->elem->key, x->set->klen)) { + x->elem->cookie = &he->node; + x->elem->flags = 0; + if (x->set->flags & NFT_SET_MAP) + nft_data_copy(&x->elem->data, he->data); + + return true; + } + + return false; +} + static int nft_hash_get(const struct nft_set *set, struct nft_set_elem *elem) { const struct rhashtable *priv = nft_set_priv(set); - const struct bucket_table *tbl = rht_dereference_rcu(priv->tbl, priv); - struct rhash_head __rcu * const *pprev; - struct nft_hash_elem *he; - u32 h; - - h = rhashtable_hashfn(priv, &elem->key, set->klen); - pprev = &tbl->buckets[h]; - rht_for_each_entry_rcu(he, tbl->buckets[h], node) { - if (nft_data_cmp(&he->key, &elem->key, set->klen)) { - pprev = &he->node.next; - continue; - } + struct nft_compare_arg arg = { + .set = set, + .elem = elem, + }; - elem->cookie = (void *)pprev; - elem->flags = 0; - if (set->flags & NFT_SET_MAP) - nft_data_copy(&elem->data, he->data); + if (rhashtable_lookup_compare(priv, &elem->key, + &nft_hash_compare, &arg)) return 0; - } + return -ENOENT; } diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 84ea76ca3f1f..a5d7ed627563 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1002,11 +1002,8 @@ static struct sock *__netlink_lookup(struct netlink_table *table, u32 portid, .net = net, .portid = portid, }; - u32 hash; - hash = rhashtable_hashfn(&table->hash, &portid, sizeof(portid)); - - return rhashtable_lookup_compare(&table->hash, hash, + return rhashtable_lookup_compare(&table->hash, &portid, &netlink_compare, &arg); } -- cgit v1.2.3 From 88d6ed15acff1cb44b1d1f3c0a393b7f7744957a Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Fri, 2 Jan 2015 23:00:16 +0100 Subject: rhashtable: Convert bucket iterators to take table and index This patch is in preparation to introduce per bucket spinlocks. It extends all iterator macros to take the bucket table and bucket index. It also introduces a new rht_dereference_bucket() to handle protected accesses to buckets. It introduces a barrier() to the RCU iterators to the prevent the compiler from caching the first element. The lockdep verifier is introduced as stub which always succeeds and properly implement in the next patch when the locks are introduced. Signed-off-by: Thomas Graf Signed-off-by: David S. 
Miller --- include/linux/rhashtable.h | 173 ++++++++++++++++++++++++++++++--------------- lib/rhashtable.c | 30 +++++--- net/netfilter/nft_hash.c | 12 ++-- net/netlink/af_netlink.c | 12 ++-- net/netlink/diag.c | 4 +- 5 files changed, 152 insertions(+), 79 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 1b51221c6bbd..b54e24a08806 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -87,11 +87,18 @@ struct rhashtable { #ifdef CONFIG_PROVE_LOCKING int lockdep_rht_mutex_is_held(const struct rhashtable *ht); +int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash); #else static inline int lockdep_rht_mutex_is_held(const struct rhashtable *ht) { return 1; } + +static inline int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, + u32 hash) +{ + return 1; +} #endif /* CONFIG_PROVE_LOCKING */ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params); @@ -119,92 +126,144 @@ void rhashtable_destroy(const struct rhashtable *ht); #define rht_dereference_rcu(p, ht) \ rcu_dereference_check(p, lockdep_rht_mutex_is_held(ht)) -#define rht_entry(ptr, type, member) container_of(ptr, type, member) -#define rht_entry_safe(ptr, type, member) \ -({ \ - typeof(ptr) __ptr = (ptr); \ - __ptr ? rht_entry(__ptr, type, member) : NULL; \ -}) +#define rht_dereference_bucket(p, tbl, hash) \ + rcu_dereference_protected(p, lockdep_rht_bucket_is_held(tbl, hash)) -#define rht_next_entry_safe(pos, ht, member) \ -({ \ - pos ? rht_entry_safe(rht_dereference((pos)->member.next, ht), \ - typeof(*(pos)), member) : NULL; \ -}) +#define rht_dereference_bucket_rcu(p, tbl, hash) \ + rcu_dereference_check(p, lockdep_rht_bucket_is_held(tbl, hash)) + +#define rht_entry(tpos, pos, member) \ + ({ tpos = container_of(pos, typeof(*tpos), member); 1; }) /** - * rht_for_each - iterate over hash chain - * @pos: &struct rhash_head to use as a loop cursor. - * @head: head of the hash chain (struct rhash_head *) - * @ht: pointer to your struct rhashtable + * rht_for_each_continue - continue iterating over hash chain + * @pos: the &struct rhash_head to use as a loop cursor. + * @head: the previous &struct rhash_head to continue from + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index */ -#define rht_for_each(pos, head, ht) \ - for (pos = rht_dereference(head, ht); \ +#define rht_for_each_continue(pos, head, tbl, hash) \ + for (pos = rht_dereference_bucket(head, tbl, hash); \ pos; \ - pos = rht_dereference((pos)->next, ht)) + pos = rht_dereference_bucket((pos)->next, tbl, hash)) + +/** + * rht_for_each - iterate over hash chain + * @pos: the &struct rhash_head to use as a loop cursor. + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + */ +#define rht_for_each(pos, tbl, hash) \ + rht_for_each_continue(pos, (tbl)->buckets[hash], tbl, hash) + +/** + * rht_for_each_entry_continue - continue iterating over hash chain + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct rhash_head to use as a loop cursor. + * @head: the previous &struct rhash_head to continue from + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + * @member: name of the &struct rhash_head within the hashable struct. 
+ */ +#define rht_for_each_entry_continue(tpos, pos, head, tbl, hash, member) \ + for (pos = rht_dereference_bucket(head, tbl, hash); \ + pos && rht_entry(tpos, pos, member); \ + pos = rht_dereference_bucket((pos)->next, tbl, hash)) /** * rht_for_each_entry - iterate over hash chain of given type - * @pos: type * to use as a loop cursor. - * @head: head of the hash chain (struct rhash_head *) - * @ht: pointer to your struct rhashtable - * @member: name of the rhash_head within the hashable struct. + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct rhash_head to use as a loop cursor. + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + * @member: name of the &struct rhash_head within the hashable struct. */ -#define rht_for_each_entry(pos, head, ht, member) \ - for (pos = rht_entry_safe(rht_dereference(head, ht), \ - typeof(*(pos)), member); \ - pos; \ - pos = rht_next_entry_safe(pos, ht, member)) +#define rht_for_each_entry(tpos, pos, tbl, hash, member) \ + rht_for_each_entry_continue(tpos, pos, (tbl)->buckets[hash], \ + tbl, hash, member) /** * rht_for_each_entry_safe - safely iterate over hash chain of given type - * @pos: type * to use as a loop cursor. - * @n: type * to use for temporary next object storage - * @head: head of the hash chain (struct rhash_head *) - * @ht: pointer to your struct rhashtable - * @member: name of the rhash_head within the hashable struct. + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct rhash_head to use as a loop cursor. + * @next: the &struct rhash_head to use as next in loop cursor. + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + * @member: name of the &struct rhash_head within the hashable struct. * * This hash chain list-traversal primitive allows for the looped code to * remove the loop cursor from the list. */ -#define rht_for_each_entry_safe(pos, n, head, ht, member) \ - for (pos = rht_entry_safe(rht_dereference(head, ht), \ - typeof(*(pos)), member), \ - n = rht_next_entry_safe(pos, ht, member); \ - pos; \ - pos = n, \ - n = rht_next_entry_safe(pos, ht, member)) +#define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member) \ + for (pos = rht_dereference_bucket((tbl)->buckets[hash], tbl, hash), \ + next = pos ? rht_dereference_bucket(pos->next, tbl, hash) \ + : NULL; \ + pos && rht_entry(tpos, pos, member); \ + pos = next) + +/** + * rht_for_each_rcu_continue - continue iterating over rcu hash chain + * @pos: the &struct rhash_head to use as a loop cursor. + * @head: the previous &struct rhash_head to continue from + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + * + * This hash chain list-traversal primitive may safely run concurrently with + * the _rcu mutation primitives such as rhashtable_insert() as long as the + * traversal is guarded by rcu_read_lock(). + */ +#define rht_for_each_rcu_continue(pos, head, tbl, hash) \ + for (({barrier(); }), \ + pos = rht_dereference_bucket_rcu(head, tbl, hash); \ + pos; \ + pos = rcu_dereference_raw(pos->next)) /** * rht_for_each_rcu - iterate over rcu hash chain - * @pos: &struct rhash_head to use as a loop cursor. - * @head: head of the hash chain (struct rhash_head *) - * @ht: pointer to your struct rhashtable + * @pos: the &struct rhash_head to use as a loop cursor. 
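
Under the new convention a full RCU walk of the foo_obj table from the earlier sketches reads as follows; note the extra struct rhash_head cursor and the explicit table/bucket pair that every macro now takes:

	static void foo_dump(struct rhashtable *ht)
	{
		const struct bucket_table *tbl;
		struct rhash_head *pos;
		struct foo_obj *obj;
		unsigned int i;

		rcu_read_lock();
		tbl = rht_dereference_rcu(ht->tbl, ht);
		for (i = 0; i < tbl->size; i++)
			rht_for_each_entry_rcu(obj, pos, tbl, i, node)
				pr_info("foo key %u\n", obj->key);
		rcu_read_unlock();
	}
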
+ * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index * * This hash chain list-traversal primitive may safely run concurrently with - * the _rcu fkht mutation primitives such as rht_insert() as long as the + * the _rcu mutation primitives such as rhashtable_insert() as long as the * traversal is guarded by rcu_read_lock(). */ -#define rht_for_each_rcu(pos, head, ht) \ - for (pos = rht_dereference_rcu(head, ht); \ - pos; \ - pos = rht_dereference_rcu((pos)->next, ht)) +#define rht_for_each_rcu(pos, tbl, hash) \ + rht_for_each_rcu_continue(pos, (tbl)->buckets[hash], tbl, hash) + +/** + * rht_for_each_entry_rcu_continue - continue iterating over rcu hash chain + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct rhash_head to use as a loop cursor. + * @head: the previous &struct rhash_head to continue from + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + * @member: name of the &struct rhash_head within the hashable struct. + * + * This hash chain list-traversal primitive may safely run concurrently with + * the _rcu mutation primitives such as rhashtable_insert() as long as the + * traversal is guarded by rcu_read_lock(). + */ +#define rht_for_each_entry_rcu_continue(tpos, pos, head, tbl, hash, member) \ + for (({barrier(); }), \ + pos = rht_dereference_bucket_rcu(head, tbl, hash); \ + pos && rht_entry(tpos, pos, member); \ + pos = rht_dereference_bucket_rcu(pos->next, tbl, hash)) /** * rht_for_each_entry_rcu - iterate over rcu hash chain of given type - * @pos: type * to use as a loop cursor. - * @head: head of the hash chain (struct rhash_head *) - * @member: name of the rhash_head within the hashable struct. + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct rhash_head to use as a loop cursor. + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + * @member: name of the &struct rhash_head within the hashable struct. * * This hash chain list-traversal primitive may safely run concurrently with - * the _rcu fkht mutation primitives such as rht_insert() as long as the + * the _rcu mutation primitives such as rhashtable_insert() as long as the * traversal is guarded by rcu_read_lock(). */ -#define rht_for_each_entry_rcu(pos, head, member) \ - for (pos = rht_entry_safe(rcu_dereference_raw(head), \ - typeof(*(pos)), member); \ - pos; \ - pos = rht_entry_safe(rcu_dereference_raw((pos)->member.next), \ - typeof(*(pos)), member)) +#define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member) \ + rht_for_each_entry_rcu_continue(tpos, pos, (tbl)->buckets[hash],\ + tbl, hash, member) #endif /* _LINUX_RHASHTABLE_H */ diff --git a/lib/rhashtable.c b/lib/rhashtable.c index b658245826a1..ce450d095fdf 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -35,6 +35,12 @@ int lockdep_rht_mutex_is_held(const struct rhashtable *ht) return ht->p.mutex_is_held(ht->p.parent); } EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held); + +int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash) +{ + return 1; +} +EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held); #endif static void *rht_obj(const struct rhashtable *ht, const struct rhash_head *he) @@ -141,7 +147,7 @@ static void hashtable_chain_unzip(const struct rhashtable *ht, * previous node p. 
Call the previous node p; */ h = head_hashfn(ht, new_tbl, p); - rht_for_each(he, p->next, ht) { + rht_for_each_continue(he, p->next, old_tbl, n) { if (head_hashfn(ht, new_tbl, he) != h) break; p = he; @@ -153,7 +159,7 @@ static void hashtable_chain_unzip(const struct rhashtable *ht, */ next = NULL; if (he) { - rht_for_each(he, he->next, ht) { + rht_for_each_continue(he, he->next, old_tbl, n) { if (head_hashfn(ht, new_tbl, he) == h) { next = he; break; @@ -208,7 +214,7 @@ int rhashtable_expand(struct rhashtable *ht) */ for (i = 0; i < new_tbl->size; i++) { h = rht_bucket_index(old_tbl, i); - rht_for_each(he, old_tbl->buckets[h], ht) { + rht_for_each(he, old_tbl, h) { if (head_hashfn(ht, new_tbl, he) == i) { RCU_INIT_POINTER(new_tbl->buckets[i], he); break; @@ -286,7 +292,7 @@ int rhashtable_shrink(struct rhashtable *ht) * to the new bucket. */ for (pprev = &ntbl->buckets[i]; *pprev != NULL; - pprev = &rht_dereference(*pprev, ht)->next) + pprev = &rht_dereference_bucket(*pprev, ntbl, i)->next) ; RCU_INIT_POINTER(*pprev, tbl->buckets[i + ntbl->size]); } @@ -386,7 +392,7 @@ bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj) h = head_hashfn(ht, tbl, obj); pprev = &tbl->buckets[h]; - rht_for_each(he, tbl->buckets[h], ht) { + rht_for_each(he, tbl, h) { if (he != obj) { pprev = &he->next; continue; @@ -423,7 +429,7 @@ void *rhashtable_lookup(const struct rhashtable *ht, const void *key) BUG_ON(!ht->p.key_len); h = key_hashfn(ht, key, ht->p.key_len); - rht_for_each_rcu(he, tbl->buckets[h], ht) { + rht_for_each_rcu(he, tbl, h) { if (memcmp(rht_obj(ht, he) + ht->p.key_offset, key, ht->p.key_len)) continue; @@ -457,7 +463,7 @@ void *rhashtable_lookup_compare(const struct rhashtable *ht, const void *key, u32 hash; hash = key_hashfn(ht, key, ht->p.key_len); - rht_for_each_rcu(he, tbl->buckets[hash], ht) { + rht_for_each_rcu(he, tbl, hash) { if (!compare(rht_obj(ht, he), arg)) continue; return rht_obj(ht, he); @@ -625,6 +631,7 @@ static int __init test_rht_lookup(struct rhashtable *ht) static void test_bucket_stats(struct rhashtable *ht, bool quiet) { unsigned int cnt, rcu_cnt, i, total = 0; + struct rhash_head *pos; struct test_obj *obj; struct bucket_table *tbl; @@ -635,14 +642,14 @@ static void test_bucket_stats(struct rhashtable *ht, bool quiet) if (!quiet) pr_info(" [%#4x/%zu]", i, tbl->size); - rht_for_each_entry_rcu(obj, tbl->buckets[i], node) { + rht_for_each_entry_rcu(obj, pos, tbl, i, node) { cnt++; total++; if (!quiet) pr_cont(" [%p],", obj); } - rht_for_each_entry_rcu(obj, tbl->buckets[i], node) + rht_for_each_entry_rcu(obj, pos, tbl, i, node) rcu_cnt++; if (rcu_cnt != cnt) @@ -664,7 +671,8 @@ static void test_bucket_stats(struct rhashtable *ht, bool quiet) static int __init test_rhashtable(struct rhashtable *ht) { struct bucket_table *tbl; - struct test_obj *obj, *next; + struct test_obj *obj; + struct rhash_head *pos, *next; int err; unsigned int i; @@ -733,7 +741,7 @@ static int __init test_rhashtable(struct rhashtable *ht) error: tbl = rht_dereference_rcu(ht->tbl, ht); for (i = 0; i < tbl->size; i++) - rht_for_each_entry_safe(obj, next, tbl->buckets[i], ht, node) + rht_for_each_entry_safe(obj, pos, next, tbl, i, node) kfree(obj); return err; diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index 614ee099ba36..d93f1f4c22a9 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -142,7 +142,9 @@ static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set, tbl = rht_dereference_rcu(priv->tbl, priv); for (i = 0; i < 
tbl->size; i++) { - rht_for_each_entry_rcu(he, tbl->buckets[i], node) { + struct rhash_head *pos; + + rht_for_each_entry_rcu(he, pos, tbl, i, node) { if (iter->count < iter->skip) goto cont; @@ -197,15 +199,13 @@ static void nft_hash_destroy(const struct nft_set *set) { const struct rhashtable *priv = nft_set_priv(set); const struct bucket_table *tbl = priv->tbl; - struct nft_hash_elem *he, *next; + struct nft_hash_elem *he; + struct rhash_head *pos, *next; unsigned int i; for (i = 0; i < tbl->size; i++) { - for (he = rht_entry(tbl->buckets[i], struct nft_hash_elem, node); - he != NULL; he = next) { - next = rht_entry(he->node.next, struct nft_hash_elem, node); + rht_for_each_entry_safe(he, pos, next, tbl, i, node) nft_hash_elem_destroy(set, he); - } } rhashtable_destroy(priv); } diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index a5d7ed627563..57449b6089c2 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2898,7 +2898,9 @@ static struct sock *netlink_seq_socket_idx(struct seq_file *seq, loff_t pos) const struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); for (j = 0; j < tbl->size; j++) { - rht_for_each_entry_rcu(nlk, tbl->buckets[j], node) { + struct rhash_head *node; + + rht_for_each_entry_rcu(nlk, node, tbl, j, node) { s = (struct sock *)nlk; if (sock_net(s) != seq_file_net(seq)) @@ -2926,6 +2928,8 @@ static void *netlink_seq_start(struct seq_file *seq, loff_t *pos) static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct rhashtable *ht; + const struct bucket_table *tbl; + struct rhash_head *node; struct netlink_sock *nlk; struct nl_seq_iter *iter; struct net *net; @@ -2942,17 +2946,17 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) i = iter->link; ht = &nl_table[i].hash; - rht_for_each_entry(nlk, nlk->node.next, ht, node) + tbl = rht_dereference_rcu(ht->tbl, ht); + rht_for_each_entry_rcu_continue(nlk, node, nlk->node.next, tbl, iter->hash_idx, node) if (net_eq(sock_net((struct sock *)nlk), net)) return nlk; j = iter->hash_idx + 1; do { - const struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); for (; j < tbl->size; j++) { - rht_for_each_entry(nlk, tbl->buckets[j], ht, node) { + rht_for_each_entry_rcu(nlk, node, tbl, j, node) { if (net_eq(sock_net((struct sock *)nlk), net)) { iter->link = i; iter->hash_idx = j; diff --git a/net/netlink/diag.c b/net/netlink/diag.c index de8c74a3c061..fcca36d81a62 100644 --- a/net/netlink/diag.c +++ b/net/netlink/diag.c @@ -113,7 +113,9 @@ static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, req = nlmsg_data(cb->nlh); for (i = 0; i < htbl->size; i++) { - rht_for_each_entry(nlsk, htbl->buckets[i], ht, node) { + struct rhash_head *pos; + + rht_for_each_entry(nlsk, pos, htbl, i, node) { sk = (struct sock *)nlsk; if (!net_eq(sock_net(sk), net)) -- cgit v1.2.3 From 897362e446436d245972e72c6bc5b33bd7a5c659 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Fri, 2 Jan 2015 23:00:18 +0100 Subject: nft_hash: Remove rhashtable_remove_pprev() The removal function of nft_hash currently stores a reference to the previous element during lookup which is used to optimize removal later on. This was possible because a lock is held throughout calling rhashtable_lookup() and rhashtable_remove(). With the introduction of deferred table resizing in parallel to lookups and insertions, the nftables lock will no longer synchronize all table mutations and the stored pprev may become invalid.
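For illustration only, the pattern that becomes unsafe is roughly the following sketch (in nft_hash the remembered pointer is stashed in elem->cookie rather than a local; target, tbl, hash and ht stand in for the caller's state):

	struct rhash_head __rcu **pprev;
	struct rhash_head *he;

	/* lookup phase: remember the link that points at the element */
	pprev = &tbl->buckets[hash];
	rht_for_each(he, tbl, hash) {
		if (he == target)
			break;
		pprev = &he->next;
	}

	/* ... a deferred resize may relink this chain at any point ... */

	rhashtable_remove_pprev(ht, he, pprev);	/* pprev may now be stale */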
Removing this optimization makes removal slightly more expensive on average but allows taking the resize cost out of the insert and remove path. Signed-off-by: Thomas Graf Cc: netfilter-devel@vger.kernel.org Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 2 -- lib/rhashtable.c | 34 +++++++--------------------------- net/netfilter/nft_hash.c | 11 +++-------- 3 files changed, 10 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index b54e24a08806..f624d4b5045f 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -105,8 +105,6 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params); void rhashtable_insert(struct rhashtable *ht, struct rhash_head *node); bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *node); -void rhashtable_remove_pprev(struct rhashtable *ht, struct rhash_head *obj, - struct rhash_head __rcu **pprev); bool rht_grow_above_75(const struct rhashtable *ht, size_t new_size); bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size); diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 0bd29c178910..e6b85c4a5828 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -344,32 +344,6 @@ void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj) } EXPORT_SYMBOL_GPL(rhashtable_insert); -/** - * rhashtable_remove_pprev - remove object from hash table given previous element - * @ht: hash table - * @obj: pointer to hash head inside object - * @pprev: pointer to previous element - * - * Identical to rhashtable_remove() but caller is alreayd aware of the element - * in front of the element to be deleted. This is in particular useful for - * deletion when combined with walking or lookup. 
- */ -void rhashtable_remove_pprev(struct rhashtable *ht, struct rhash_head *obj, - struct rhash_head __rcu **pprev) -{ - struct bucket_table *tbl = rht_dereference(ht->tbl, ht); - - ASSERT_RHT_MUTEX(ht); - - RCU_INIT_POINTER(*pprev, obj->next); - ht->nelems--; - - if (ht->p.shrink_decision && - ht->p.shrink_decision(ht, tbl->size)) - rhashtable_shrink(ht); -} -EXPORT_SYMBOL_GPL(rhashtable_remove_pprev); - /** * rhashtable_remove - remove object from hash table * @ht: hash table @@ -403,7 +377,13 @@ bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj) continue; } - rhashtable_remove_pprev(ht, he, pprev); + RCU_INIT_POINTER(*pprev, he->next); + ht->nelems--; + + if (ht->p.shrink_decision && + ht->p.shrink_decision(ht, tbl->size)) + rhashtable_shrink(ht); + return true; } diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index d93f1f4c22a9..7f903cf9a1b9 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -83,15 +83,10 @@ static void nft_hash_remove(const struct nft_set *set, const struct nft_set_elem *elem) { struct rhashtable *priv = nft_set_priv(set); - struct rhash_head *he, __rcu **pprev; - - pprev = elem->cookie; - he = rht_dereference((*pprev), priv); - - rhashtable_remove_pprev(priv, he, pprev); + rhashtable_remove(priv, elem->cookie); synchronize_rcu(); - kfree(he); + kfree(elem->cookie); } struct nft_compare_arg { @@ -105,7 +100,7 @@ static bool nft_hash_compare(void *ptr, void *arg) struct nft_compare_arg *x = arg; if (!nft_data_cmp(&he->key, &x->elem->key, x->set->klen)) { - x->elem->cookie = &he->node; + x->elem->cookie = he; x->elem->flags = 0; if (x->set->flags & NFT_SET_MAP) nft_data_copy(&x->elem->data, he->data); -- cgit v1.2.3 From 113948d841e8d78039e5dbbb5248f5b73e99eafa Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Fri, 2 Jan 2015 23:00:19 +0100 Subject: spinlock: Add spin_lock_bh_nested() Signed-off-by: Thomas Graf Signed-off-by: David S. 
Miller --- include/linux/spinlock.h | 8 ++++++++ include/linux/spinlock_api_smp.h | 2 ++ include/linux/spinlock_api_up.h | 1 + kernel/locking/spinlock.c | 8 ++++++++ 4 files changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 262ba4ef9a8e..3e18379dfa6f 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -190,6 +190,8 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) #ifdef CONFIG_DEBUG_LOCK_ALLOC # define raw_spin_lock_nested(lock, subclass) \ _raw_spin_lock_nested(lock, subclass) +# define raw_spin_lock_bh_nested(lock, subclass) \ + _raw_spin_lock_bh_nested(lock, subclass) # define raw_spin_lock_nest_lock(lock, nest_lock) \ do { \ @@ -205,6 +207,7 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) # define raw_spin_lock_nested(lock, subclass) \ _raw_spin_lock(((void)(subclass), (lock))) # define raw_spin_lock_nest_lock(lock, nest_lock) _raw_spin_lock(lock) +# define raw_spin_lock_bh_nested(lock, subclass) _raw_spin_lock_bh(lock) #endif #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) @@ -324,6 +327,11 @@ do { \ raw_spin_lock_nested(spinlock_check(lock), subclass); \ } while (0) +#define spin_lock_bh_nested(lock, subclass) \ +do { \ + raw_spin_lock_bh_nested(spinlock_check(lock), subclass);\ +} while (0) + #define spin_lock_nest_lock(lock, nest_lock) \ do { \ raw_spin_lock_nest_lock(spinlock_check(lock), nest_lock); \ diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h index 42dfab89e740..5344268e6e62 100644 --- a/include/linux/spinlock_api_smp.h +++ b/include/linux/spinlock_api_smp.h @@ -22,6 +22,8 @@ int in_lock_functions(unsigned long addr); void __lockfunc _raw_spin_lock(raw_spinlock_t *lock) __acquires(lock); void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass) __acquires(lock); +void __lockfunc _raw_spin_lock_bh_nested(raw_spinlock_t *lock, int subclass) + __acquires(lock); void __lockfunc _raw_spin_lock_nest_lock(raw_spinlock_t *lock, struct lockdep_map *map) __acquires(lock); diff --git a/include/linux/spinlock_api_up.h b/include/linux/spinlock_api_up.h index d0d188861ad6..d3afef9d8dbe 100644 --- a/include/linux/spinlock_api_up.h +++ b/include/linux/spinlock_api_up.h @@ -57,6 +57,7 @@ #define _raw_spin_lock(lock) __LOCK(lock) #define _raw_spin_lock_nested(lock, subclass) __LOCK(lock) +#define _raw_spin_lock_bh_nested(lock, subclass) __LOCK(lock) #define _raw_read_lock(lock) __LOCK(lock) #define _raw_write_lock(lock) __LOCK(lock) #define _raw_spin_lock_bh(lock) __LOCK_BH(lock) diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 4b082b5cac9e..db3ccb1dd614 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -363,6 +363,14 @@ void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass) } EXPORT_SYMBOL(_raw_spin_lock_nested); +void __lockfunc _raw_spin_lock_bh_nested(raw_spinlock_t *lock, int subclass) +{ + __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); +} +EXPORT_SYMBOL(_raw_spin_lock_bh_nested); + unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock, int subclass) { -- cgit v1.2.3 From 97defe1ecf868b8127f8e62395499d6a06e4c4b1 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Fri, 2 Jan 2015 23:00:20 +0100 Subject: rhashtable: Per bucket locks & deferred expansion/shrinking 
Introduces an array of spinlocks to protect bucket mutations. The number of spinlocks per CPU is configurable and selected based on the hash of the bucket. This allows for parallel insertions and removals of entries which do not share a lock. The patch also defers expansion and shrinking to a worker queue which allows insertion and removal from atomic context. Insertions and deletions may occur in parallel with it and are only held up briefly while the particular bucket is linked or unzipped. Mutations of the bucket table pointer are protected by a new mutex; read access is RCU protected. In the event of an expansion or shrinking, the new bucket table allocated is exposed as a so-called future table as soon as the resize process starts. Lookups, deletions, and insertions will briefly use both tables. The future table becomes the main table after an RCU grace period and the initial linking of the old to the new table has been performed. Optimization of the chains to make use of the new number of buckets follows only once the new table is in use. The side effect of this is that during that RCU grace period, a bucket traversal using any rht_for_each() variant on the main table will not see any insertions performed during the RCU grace period which would at that point land in the future table. The lookup will see them as it searches both tables if needed. Having multiple insertions and removals occur in parallel requires nelems to become an atomic counter. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 37 ++-- lib/rhashtable.c | 458 ++++++++++++++++++++++++++++++++++----------- net/netfilter/nft_hash.c | 27 ++- net/netlink/af_netlink.c | 15 +- 4 files changed, 384 insertions(+), 153 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index f624d4b5045f..a1688f0a6193 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -19,6 +19,7 @@ #define _LINUX_RHASHTABLE_H #include +#include struct rhash_head { struct rhash_head __rcu *next; @@ -26,8 +27,17 @@ struct rhash_head { #define INIT_HASH_HEAD(ptr) ((ptr)->next = NULL) +/** + * struct bucket_table - Table of hash buckets + * @size: Number of hash buckets + * @locks_mask: Mask to apply before accessing locks[] + * @locks: Array of spinlocks protecting individual buckets + * @buckets: size * hash buckets + */ struct bucket_table { size_t size; + unsigned int locks_mask; + spinlock_t *locks; struct rhash_head __rcu *buckets[]; }; @@ -45,11 +55,11 @@ struct rhashtable; * @hash_rnd: Seed to use while hashing * @max_shift: Maximum number of shifts while expanding * @min_shift: Minimum number of shifts while shrinking + * @locks_mul: Number of bucket locks to allocate per cpu (default: 128) * @hashfn: Function to hash key * @obj_hashfn: Function to hash object * @grow_decision: If defined, may return true if table should expand * @shrink_decision: If defined, may return true if table should shrink - * @mutex_is_held: Must return true if protecting mutex is held */ struct rhashtable_params { size_t nelem_hint; @@ -59,37 +69,42 @@ struct rhashtable_params { u32 hash_rnd; size_t max_shift; size_t min_shift; + size_t locks_mul; rht_hashfn_t hashfn; rht_obj_hashfn_t obj_hashfn; bool (*grow_decision)(const struct rhashtable *ht, size_t new_size); bool (*shrink_decision)(const struct rhashtable *ht, size_t new_size); -#ifdef CONFIG_PROVE_LOCKING - int (*mutex_is_held)(void *parent); - void *parent; -#endif }; /** * struct rhashtable - Hash table handle *
@tbl: Bucket table + * @future_tbl: Table under construction during expansion/shrinking * @nelems: Number of elements in table * @shift: Current size (1 << shift) * @p: Configuration parameters + * @run_work: Deferred worker to expand/shrink asynchronously + * @mutex: Mutex to protect current/future table swapping + * @being_destroyed: True if table is set up for destruction */ struct rhashtable { struct bucket_table __rcu *tbl; - size_t nelems; + struct bucket_table __rcu *future_tbl; + atomic_t nelems; size_t shift; struct rhashtable_params p; + struct delayed_work run_work; + struct mutex mutex; + bool being_destroyed; }; #ifdef CONFIG_PROVE_LOCKING -int lockdep_rht_mutex_is_held(const struct rhashtable *ht); +int lockdep_rht_mutex_is_held(struct rhashtable *ht); int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash); #else -static inline int lockdep_rht_mutex_is_held(const struct rhashtable *ht) +static inline int lockdep_rht_mutex_is_held(struct rhashtable *ht) { return 1; } @@ -112,11 +127,11 @@ bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size); int rhashtable_expand(struct rhashtable *ht); int rhashtable_shrink(struct rhashtable *ht); -void *rhashtable_lookup(const struct rhashtable *ht, const void *key); -void *rhashtable_lookup_compare(const struct rhashtable *ht, const void *key, +void *rhashtable_lookup(struct rhashtable *ht, const void *key); +void *rhashtable_lookup_compare(struct rhashtable *ht, const void *key, bool (*compare)(void *, void *), void *arg); -void rhashtable_destroy(const struct rhashtable *ht); +void rhashtable_destroy(struct rhashtable *ht); #define rht_dereference(p, ht) \ rcu_dereference_protected(p, lockdep_rht_mutex_is_held(ht)) diff --git a/lib/rhashtable.c b/lib/rhashtable.c index e6b85c4a5828..312e3437c7bc 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -26,19 +26,42 @@ #define HASH_DEFAULT_SIZE 64UL #define HASH_MIN_SIZE 4UL +#define BUCKET_LOCKS_PER_CPU 128UL + +enum { + RHT_LOCK_NORMAL, + RHT_LOCK_NESTED, + RHT_LOCK_NESTED2, +}; + +/* The bucket lock is selected based on the hash and protects mutations + * on a group of hash buckets. + * + * IMPORTANT: When holding the bucket lock of both the old and new table + * during expansions and shrinking, the old bucket lock must always be + * acquired first. + */ +static spinlock_t *bucket_lock(const struct bucket_table *tbl, u32 hash) +{ + return &tbl->locks[hash & tbl->locks_mask]; +} #define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT)) +#define ASSERT_BUCKET_LOCK(TBL, HASH) \ + BUG_ON(!lockdep_rht_bucket_is_held(TBL, HASH)) #ifdef CONFIG_PROVE_LOCKING -int lockdep_rht_mutex_is_held(const struct rhashtable *ht) +int lockdep_rht_mutex_is_held(struct rhashtable *ht) { - return ht->p.mutex_is_held(ht->p.parent); + return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1; } EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held); int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash) { - return 1; + spinlock_t *lock = bucket_lock(tbl, hash); + + return (debug_locks) ? 
lockdep_is_held(lock) : 1; } EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held); #endif @@ -66,7 +89,7 @@ static u32 obj_raw_hashfn(const struct rhashtable *ht, const void *ptr) return hash; } -static u32 key_hashfn(const struct rhashtable *ht, const void *key, u32 len) +static u32 key_hashfn(struct rhashtable *ht, const void *key, u32 len) { struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); u32 hash; @@ -95,7 +118,49 @@ static struct rhash_head __rcu **bucket_tail(struct bucket_table *tbl, u32 n) return pprev; } -static struct bucket_table *bucket_table_alloc(size_t nbuckets) +static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl) +{ + unsigned int i, size; +#if defined(CONFIG_PROVE_LOCKING) + unsigned int nr_pcpus = 2; +#else + unsigned int nr_pcpus = num_possible_cpus(); +#endif + + nr_pcpus = min_t(unsigned int, nr_pcpus, 32UL); + size = roundup_pow_of_two(nr_pcpus * ht->p.locks_mul); + + /* Never allocate more than one lock per bucket */ + size = min_t(unsigned int, size, tbl->size); + + if (sizeof(spinlock_t) != 0) { +#ifdef CONFIG_NUMA + if (size * sizeof(spinlock_t) > PAGE_SIZE) + tbl->locks = vmalloc(size * sizeof(spinlock_t)); + else +#endif + tbl->locks = kmalloc_array(size, sizeof(spinlock_t), + GFP_KERNEL); + if (!tbl->locks) + return -ENOMEM; + for (i = 0; i < size; i++) + spin_lock_init(&tbl->locks[i]); + } + tbl->locks_mask = size - 1; + + return 0; +} + +static void bucket_table_free(const struct bucket_table *tbl) +{ + if (tbl) + kvfree(tbl->locks); + + kvfree(tbl); +} + +static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, + size_t nbuckets) { struct bucket_table *tbl; size_t size; @@ -110,12 +175,12 @@ static struct bucket_table *bucket_table_alloc(size_t nbuckets) tbl->size = nbuckets; - return tbl; -} + if (alloc_bucket_locks(ht, tbl) < 0) { + bucket_table_free(tbl); + return NULL; + } -static void bucket_table_free(const struct bucket_table *tbl) -{ - kvfree(tbl); + return tbl; } /** @@ -126,7 +191,7 @@ static void bucket_table_free(const struct bucket_table *tbl) bool rht_grow_above_75(const struct rhashtable *ht, size_t new_size) { /* Expand table when exceeding 75% load */ - return ht->nelems > (new_size / 4 * 3); + return atomic_read(&ht->nelems) > (new_size / 4 * 3); } EXPORT_SYMBOL_GPL(rht_grow_above_75); @@ -138,41 +203,59 @@ EXPORT_SYMBOL_GPL(rht_grow_above_75); bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size) { /* Shrink table beneath 30% load */ - return ht->nelems < (new_size * 3 / 10); + return atomic_read(&ht->nelems) < (new_size * 3 / 10); } EXPORT_SYMBOL_GPL(rht_shrink_below_30); static void hashtable_chain_unzip(const struct rhashtable *ht, const struct bucket_table *new_tbl, - struct bucket_table *old_tbl, size_t n) + struct bucket_table *old_tbl, + size_t old_hash) { struct rhash_head *he, *p, *next; - unsigned int h; + spinlock_t *new_bucket_lock, *new_bucket_lock2 = NULL; + unsigned int new_hash, new_hash2; + + ASSERT_BUCKET_LOCK(old_tbl, old_hash); /* Old bucket empty, no work needed. */ - p = rht_dereference(old_tbl->buckets[n], ht); + p = rht_dereference_bucket(old_tbl->buckets[old_hash], old_tbl, + old_hash); if (!p) return; + new_hash = new_hash2 = head_hashfn(ht, new_tbl, p); + new_bucket_lock = bucket_lock(new_tbl, new_hash); + /* Advance the old bucket pointer one or more times until it * reaches a node that doesn't hash to the same bucket as the * previous node p. 
Call the previous node p; */ - h = head_hashfn(ht, new_tbl, p); - rht_for_each_continue(he, p->next, old_tbl, n) { - if (head_hashfn(ht, new_tbl, he) != h) + rht_for_each_continue(he, p->next, old_tbl, old_hash) { + new_hash2 = head_hashfn(ht, new_tbl, he); + if (new_hash != new_hash2) break; p = he; } - RCU_INIT_POINTER(old_tbl->buckets[n], p->next); + rcu_assign_pointer(old_tbl->buckets[old_hash], p->next); + + spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED); + + /* If we have encountered an entry that maps to a different bucket in + * the new table, lock down that bucket as well as we might cut off + * the end of the chain. + */ + new_bucket_lock2 = bucket_lock(new_tbl, new_hash); + if (new_bucket_lock != new_bucket_lock2) + spin_lock_bh_nested(new_bucket_lock2, RHT_LOCK_NESTED2); /* Find the subsequent node which does hash to the same * bucket as node P, or NULL if no such node exists. */ next = NULL; if (he) { - rht_for_each_continue(he, he->next, old_tbl, n) { - if (head_hashfn(ht, new_tbl, he) == h) { + rht_for_each_continue(he, he->next, old_tbl, old_hash) { + if (head_hashfn(ht, new_tbl, he) == new_hash) { next = he; break; } @@ -182,7 +265,23 @@ static void hashtable_chain_unzip(const struct rhashtable *ht, /* Set p's next pointer to that subsequent node pointer, * bypassing the nodes which do not hash to p's bucket */ - RCU_INIT_POINTER(p->next, next); + rcu_assign_pointer(p->next, next); + + if (new_bucket_lock != new_bucket_lock2) + spin_unlock_bh(new_bucket_lock2); + spin_unlock_bh(new_bucket_lock); +} + +static void link_old_to_new(struct bucket_table *new_tbl, + unsigned int new_hash, struct rhash_head *entry) +{ + spinlock_t *new_bucket_lock; + + new_bucket_lock = bucket_lock(new_tbl, new_hash); + + spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED); + rcu_assign_pointer(*bucket_tail(new_tbl, new_hash), entry); + spin_unlock_bh(new_bucket_lock); } /** @@ -195,43 +294,59 @@ static void hashtable_chain_unzip(const struct rhashtable *ht, * This function may only be called in a context where it is safe to call * synchronize_rcu(), e.g. not within a rcu_read_lock() section. * - * The caller must ensure that no concurrent table mutations take place. - * It is however valid to have concurrent lookups if they are RCU protected. + * The caller must ensure that no concurrent resizing occurs by holding + * ht->mutex. + * + * It is valid to have concurrent insertions and deletions protected by per + * bucket locks or concurrent RCU protected lookups and traversals. */ int rhashtable_expand(struct rhashtable *ht) { struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht); struct rhash_head *he; - unsigned int i, h; - bool complete; + spinlock_t *old_bucket_lock; + unsigned int new_hash, old_hash; + bool complete = false; ASSERT_RHT_MUTEX(ht); if (ht->p.max_shift && ht->shift >= ht->p.max_shift) return 0; - new_tbl = bucket_table_alloc(old_tbl->size * 2); + new_tbl = bucket_table_alloc(ht, old_tbl->size * 2); if (new_tbl == NULL) return -ENOMEM; ht->shift++; - /* For each new bucket, search the corresponding old bucket - * for the first entry that hashes to the new bucket, and - * link the new bucket to that entry. Since all the entries - * which will end up in the new bucket appear in the same - * old bucket, this constructs an entirely valid new hash - * table, but with multiple buckets "zipped" together into a - * single imprecise chain. + /* Make insertions go into the new, empty table right away. 
Deletions + * and lookups will be attempted in both tables until we synchronize. + * The synchronize_rcu() guarantees for the new table to be picked up + * so no new additions go into the old table while we relink. + */ + rcu_assign_pointer(ht->future_tbl, new_tbl); + synchronize_rcu(); + + /* For each new bucket, search the corresponding old bucket for the + * first entry that hashes to the new bucket, and link the end of + * newly formed bucket chain (containing entries added to future + * table) to that entry. Since all the entries which will end up in + * the new bucket appear in the same old bucket, this constructs an + * entirely valid new hash table, but with multiple buckets + * "zipped" together into a single imprecise chain. */ - for (i = 0; i < new_tbl->size; i++) { - h = rht_bucket_index(old_tbl, i); - rht_for_each(he, old_tbl, h) { - if (head_hashfn(ht, new_tbl, he) == i) { - RCU_INIT_POINTER(new_tbl->buckets[i], he); + for (new_hash = 0; new_hash < new_tbl->size; new_hash++) { + old_hash = rht_bucket_index(old_tbl, new_hash); + old_bucket_lock = bucket_lock(old_tbl, old_hash); + + spin_lock_bh(old_bucket_lock); + rht_for_each(he, old_tbl, old_hash) { + if (head_hashfn(ht, new_tbl, he) == new_hash) { + link_old_to_new(new_tbl, new_hash, he); break; } } + spin_unlock_bh(old_bucket_lock); } /* Publish the new table pointer. Lookups may now traverse @@ -241,7 +356,7 @@ int rhashtable_expand(struct rhashtable *ht) rcu_assign_pointer(ht->tbl, new_tbl); /* Unzip interleaved hash chains */ - do { + while (!complete && !ht->being_destroyed) { /* Wait for readers. All new readers will see the new * table, and thus no references to the old table will * remain. @@ -253,12 +368,17 @@ int rhashtable_expand(struct rhashtable *ht) * table): ... */ complete = true; - for (i = 0; i < old_tbl->size; i++) { - hashtable_chain_unzip(ht, new_tbl, old_tbl, i); - if (old_tbl->buckets[i] != NULL) + for (old_hash = 0; old_hash < old_tbl->size; old_hash++) { + old_bucket_lock = bucket_lock(old_tbl, old_hash); + spin_lock_bh(old_bucket_lock); + + hashtable_chain_unzip(ht, new_tbl, old_tbl, old_hash); + if (old_tbl->buckets[old_hash] != NULL) complete = false; + + spin_unlock_bh(old_bucket_lock); } - } while (!complete); + } bucket_table_free(old_tbl); return 0; @@ -272,38 +392,65 @@ EXPORT_SYMBOL_GPL(rhashtable_expand); * This function may only be called in a context where it is safe to call * synchronize_rcu(), e.g. not within a rcu_read_lock() section. * + * The caller must ensure that no concurrent resizing occurs by holding + * ht->mutex. + * * The caller must ensure that no concurrent table mutations take place. * It is however valid to have concurrent lookups if they are RCU protected. + * + * It is valid to have concurrent insertions and deletions protected by per + * bucket locks or concurrent RCU protected lookups and traversals. 
*/ int rhashtable_shrink(struct rhashtable *ht) { - struct bucket_table *ntbl, *tbl = rht_dereference(ht->tbl, ht); - unsigned int i; + struct bucket_table *new_tbl, *tbl = rht_dereference(ht->tbl, ht); + spinlock_t *new_bucket_lock, *old_bucket_lock1, *old_bucket_lock2; + unsigned int new_hash; ASSERT_RHT_MUTEX(ht); if (ht->shift <= ht->p.min_shift) return 0; - ntbl = bucket_table_alloc(tbl->size / 2); - if (ntbl == NULL) + new_tbl = bucket_table_alloc(ht, tbl->size / 2); + if (new_tbl == NULL) return -ENOMEM; - ht->shift--; + rcu_assign_pointer(ht->future_tbl, new_tbl); + synchronize_rcu(); - /* Link each bucket in the new table to the first bucket - * in the old table that contains entries which will hash - * to the new bucket. + /* Link the first entry in the old bucket to the end of the + * bucket in the new table. As entries are concurrently being + * added to the new table, lock down the new bucket. As we + * always divide the size in half when shrinking, each bucket + * in the new table maps to exactly two buckets in the old + * table. + * + * As removals can occur concurrently on the old table, we need + * to lock down both matching buckets in the old table. */ - for (i = 0; i < ntbl->size; i++) { - ntbl->buckets[i] = tbl->buckets[i]; - RCU_INIT_POINTER(*bucket_tail(ntbl, i), - tbl->buckets[i + ntbl->size]); - + for (new_hash = 0; new_hash < new_tbl->size; new_hash++) { + old_bucket_lock1 = bucket_lock(tbl, new_hash); + old_bucket_lock2 = bucket_lock(tbl, new_hash + new_tbl->size); + new_bucket_lock = bucket_lock(new_tbl, new_hash); + + spin_lock_bh(old_bucket_lock1); + spin_lock_bh_nested(old_bucket_lock2, RHT_LOCK_NESTED); + spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED2); + + rcu_assign_pointer(*bucket_tail(new_tbl, new_hash), + tbl->buckets[new_hash]); + rcu_assign_pointer(*bucket_tail(new_tbl, new_hash), + tbl->buckets[new_hash + new_tbl->size]); + + spin_unlock_bh(new_bucket_lock); + spin_unlock_bh(old_bucket_lock2); + spin_unlock_bh(old_bucket_lock1); } /* Publish the new, valid hash table */ - rcu_assign_pointer(ht->tbl, ntbl); + rcu_assign_pointer(ht->tbl, new_tbl); + ht->shift--; /* Wait for readers. No new readers will have references to the * old hash table. @@ -316,31 +463,63 @@ int rhashtable_shrink(struct rhashtable *ht) } EXPORT_SYMBOL_GPL(rhashtable_shrink); +static void rht_deferred_worker(struct work_struct *work) +{ + struct rhashtable *ht; + struct bucket_table *tbl; + + ht = container_of(work, struct rhashtable, run_work.work); + mutex_lock(&ht->mutex); + tbl = rht_dereference(ht->tbl, ht); + + if (ht->p.grow_decision && ht->p.grow_decision(ht, tbl->size)) + rhashtable_expand(ht); + else if (ht->p.shrink_decision && ht->p.shrink_decision(ht, tbl->size)) + rhashtable_shrink(ht); + + mutex_unlock(&ht->mutex); +} + /** * rhashtable_insert - insert object into hash hash table * @ht: hash table * @obj: pointer to hash head inside object * - * Will automatically grow the table via rhashtable_expand() if the the - * grow_decision function specified at rhashtable_init() returns true. + * Will take a per bucket spinlock to protect against mutual mutations + * on the same bucket. Multiple insertions may occur in parallel unless + * they map to the same bucket lock. * - * The caller must ensure that no concurrent table mutations occur. It is - * however valid to have concurrent lookups if they are RCU protected. + * It is safe to call this function from atomic context. 
+ * + * Will trigger an automatic deferred table resizing if the size grows + * beyond the watermark indicated by grow_decision() which can be passed + * to rhashtable_init(). */ void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj) { - struct bucket_table *tbl = rht_dereference(ht->tbl, ht); - u32 hash; + struct bucket_table *tbl; + spinlock_t *lock; + unsigned hash; - ASSERT_RHT_MUTEX(ht); + rcu_read_lock(); + tbl = rht_dereference_rcu(ht->future_tbl, ht); hash = head_hashfn(ht, tbl, obj); + lock = bucket_lock(tbl, hash); + + spin_lock_bh(lock); RCU_INIT_POINTER(obj->next, tbl->buckets[hash]); rcu_assign_pointer(tbl->buckets[hash], obj); - ht->nelems++; + spin_unlock_bh(lock); - if (ht->p.grow_decision && ht->p.grow_decision(ht, tbl->size)) - rhashtable_expand(ht); + atomic_inc(&ht->nelems); + + /* Only grow the table if no resizing is currently in progress. */ + if (ht->tbl != ht->future_tbl && + ht->p.grow_decision && ht->p.grow_decision(ht, tbl->size)) + schedule_delayed_work(&ht->run_work, 0); + + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rhashtable_insert); @@ -361,32 +540,56 @@ EXPORT_SYMBOL_GPL(rhashtable_insert); */ bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj) { - struct bucket_table *tbl = rht_dereference(ht->tbl, ht); + struct bucket_table *tbl; struct rhash_head __rcu **pprev; struct rhash_head *he; - u32 h; + spinlock_t *lock; + unsigned int hash; - ASSERT_RHT_MUTEX(ht); + rcu_read_lock(); + tbl = rht_dereference_rcu(ht->tbl, ht); + hash = head_hashfn(ht, tbl, obj); - h = head_hashfn(ht, tbl, obj); + lock = bucket_lock(tbl, hash); + spin_lock_bh(lock); - pprev = &tbl->buckets[h]; - rht_for_each(he, tbl, h) { +restart: + pprev = &tbl->buckets[hash]; + rht_for_each(he, tbl, hash) { if (he != obj) { pprev = &he->next; continue; } - RCU_INIT_POINTER(*pprev, he->next); - ht->nelems--; + rcu_assign_pointer(*pprev, obj->next); + atomic_dec(&ht->nelems); - if (ht->p.shrink_decision && + spin_unlock_bh(lock); + + if (ht->tbl != ht->future_tbl && + ht->p.shrink_decision && ht->p.shrink_decision(ht, tbl->size)) - rhashtable_shrink(ht); + schedule_delayed_work(&ht->run_work, 0); + + rcu_read_unlock(); return true; } + if (tbl != rht_dereference_rcu(ht->tbl, ht)) { + spin_unlock_bh(lock); + + tbl = rht_dereference_rcu(ht->tbl, ht); + hash = head_hashfn(ht, tbl, obj); + + lock = bucket_lock(tbl, hash); + spin_lock_bh(lock); + goto restart; + } + + spin_unlock_bh(lock); + rcu_read_unlock(); + return false; } EXPORT_SYMBOL_GPL(rhashtable_remove); @@ -402,25 +605,35 @@ EXPORT_SYMBOL_GPL(rhashtable_remove); * This lookup function may only be used for fixed key hash table (key_len * paramter set). It will BUG() if used inappropriately. * - * Lookups may occur in parallel with hash mutations as long as the lookup is - * guarded by rcu_read_lock(). The caller must take care of this. + * Lookups may occur in parallel with hashtable mutations and resizing. 
*/ -void *rhashtable_lookup(const struct rhashtable *ht, const void *key) +void *rhashtable_lookup(struct rhashtable *ht, const void *key) { - const struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); + const struct bucket_table *tbl, *old_tbl; struct rhash_head *he; - u32 h; + u32 hash; BUG_ON(!ht->p.key_len); - h = key_hashfn(ht, key, ht->p.key_len); - rht_for_each_rcu(he, tbl, h) { + rcu_read_lock(); + old_tbl = rht_dereference_rcu(ht->tbl, ht); + tbl = rht_dereference_rcu(ht->future_tbl, ht); + hash = key_hashfn(ht, key, ht->p.key_len); +restart: + rht_for_each_rcu(he, tbl, rht_bucket_index(tbl, hash)) { if (memcmp(rht_obj(ht, he) + ht->p.key_offset, key, ht->p.key_len)) continue; + rcu_read_unlock(); return rht_obj(ht, he); } + if (unlikely(tbl != old_tbl)) { + tbl = old_tbl; + goto restart; + } + + rcu_read_unlock(); return NULL; } EXPORT_SYMBOL_GPL(rhashtable_lookup); @@ -435,25 +648,36 @@ EXPORT_SYMBOL_GPL(rhashtable_lookup); * Traverses the bucket chain behind the provided hash value and calls the * specified compare function for each entry. * - * Lookups may occur in parallel with hash mutations as long as the lookup is - * guarded by rcu_read_lock(). The caller must take care of this. + * Lookups may occur in parallel with hashtable mutations and resizing. * * Returns the first entry on which the compare function returned true. */ -void *rhashtable_lookup_compare(const struct rhashtable *ht, const void *key, +void *rhashtable_lookup_compare(struct rhashtable *ht, const void *key, bool (*compare)(void *, void *), void *arg) { - const struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); + const struct bucket_table *tbl, *old_tbl; struct rhash_head *he; u32 hash; + rcu_read_lock(); + + old_tbl = rht_dereference_rcu(ht->tbl, ht); + tbl = rht_dereference_rcu(ht->future_tbl, ht); hash = key_hashfn(ht, key, ht->p.key_len); - rht_for_each_rcu(he, tbl, hash) { +restart: + rht_for_each_rcu(he, tbl, rht_bucket_index(tbl, hash)) { if (!compare(rht_obj(ht, he), arg)) continue; + rcu_read_unlock(); return rht_obj(ht, he); } + if (unlikely(tbl != old_tbl)) { + tbl = old_tbl; + goto restart; + } + rcu_read_unlock(); + return NULL; } EXPORT_SYMBOL_GPL(rhashtable_lookup_compare); @@ -485,9 +709,6 @@ static size_t rounded_hashtable_size(struct rhashtable_params *params) * .key_offset = offsetof(struct test_obj, key), * .key_len = sizeof(int), * .hashfn = jhash, - * #ifdef CONFIG_PROVE_LOCKING - * .mutex_is_held = &my_mutex_is_held, - * #endif * }; * * Configuration Example 2: Variable length keys @@ -507,9 +728,6 @@ static size_t rounded_hashtable_size(struct rhashtable_params *params) * .head_offset = offsetof(struct test_obj, node), * .hashfn = jhash, * .obj_hashfn = my_hash_fn, - * #ifdef CONFIG_PROVE_LOCKING - * .mutex_is_held = &my_mutex_is_held, - * #endif * }; */ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params) @@ -529,18 +747,29 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params) if (params->nelem_hint) size = rounded_hashtable_size(params); - tbl = bucket_table_alloc(size); + memset(ht, 0, sizeof(*ht)); + mutex_init(&ht->mutex); + memcpy(&ht->p, params, sizeof(*params)); + + if (params->locks_mul) + ht->p.locks_mul = roundup_pow_of_two(params->locks_mul); + else + ht->p.locks_mul = BUCKET_LOCKS_PER_CPU; + + tbl = bucket_table_alloc(ht, size); if (tbl == NULL) return -ENOMEM; - memset(ht, 0, sizeof(*ht)); ht->shift = ilog2(tbl->size); - memcpy(&ht->p, params, sizeof(*params)); RCU_INIT_POINTER(ht->tbl, tbl); + 
RCU_INIT_POINTER(ht->future_tbl, tbl); if (!ht->p.hash_rnd) get_random_bytes(&ht->p.hash_rnd, sizeof(ht->p.hash_rnd)); + if (ht->p.grow_decision || ht->p.shrink_decision) + INIT_DEFERRABLE_WORK(&ht->run_work, rht_deferred_worker); + return 0; } EXPORT_SYMBOL_GPL(rhashtable_init); @@ -553,9 +782,16 @@ EXPORT_SYMBOL_GPL(rhashtable_init); * has to make sure that no resizing may happen by unpublishing the hashtable * and waiting for the quiescent cycle before releasing the bucket array. */ -void rhashtable_destroy(const struct rhashtable *ht) +void rhashtable_destroy(struct rhashtable *ht) { - bucket_table_free(ht->tbl); + ht->being_destroyed = true; + + mutex_lock(&ht->mutex); + + cancel_delayed_work(&ht->run_work); + bucket_table_free(rht_dereference(ht->tbl, ht)); + + mutex_unlock(&ht->mutex); } EXPORT_SYMBOL_GPL(rhashtable_destroy); @@ -570,13 +806,6 @@ EXPORT_SYMBOL_GPL(rhashtable_destroy); #define TEST_PTR ((void *) 0xdeadbeef) #define TEST_NEXPANDS 4 -#ifdef CONFIG_PROVE_LOCKING -static int test_mutex_is_held(void *parent) -{ - return 1; -} -#endif - struct test_obj { void *ptr; int value; @@ -646,10 +875,10 @@ static void test_bucket_stats(struct rhashtable *ht, bool quiet) i, tbl->buckets[i], cnt); } - pr_info(" Traversal complete: counted=%u, nelems=%zu, entries=%d\n", - total, ht->nelems, TEST_ENTRIES); + pr_info(" Traversal complete: counted=%u, nelems=%u, entries=%d\n", + total, atomic_read(&ht->nelems), TEST_ENTRIES); - if (total != ht->nelems || total != TEST_ENTRIES) + if (total != atomic_read(&ht->nelems) || total != TEST_ENTRIES) pr_warn("Test failed: Total count mismatch ^^^"); } @@ -688,7 +917,9 @@ static int __init test_rhashtable(struct rhashtable *ht) for (i = 0; i < TEST_NEXPANDS; i++) { pr_info(" Table expansion iteration %u...\n", i); + mutex_lock(&ht->mutex); rhashtable_expand(ht); + mutex_unlock(&ht->mutex); rcu_read_lock(); pr_info(" Verifying lookups...\n"); @@ -698,7 +929,9 @@ static int __init test_rhashtable(struct rhashtable *ht) for (i = 0; i < TEST_NEXPANDS; i++) { pr_info(" Table shrinkage iteration %u...\n", i); + mutex_lock(&ht->mutex); rhashtable_shrink(ht); + mutex_unlock(&ht->mutex); rcu_read_lock(); pr_info(" Verifying lookups...\n"); @@ -741,9 +974,6 @@ static int __init test_rht_init(void) .key_offset = offsetof(struct test_obj, value), .key_len = sizeof(int), .hashfn = jhash, -#ifdef CONFIG_PROVE_LOCKING - .mutex_is_held = &test_mutex_is_held, -#endif .grow_decision = rht_grow_above_75, .shrink_decision = rht_shrink_below_30, }; diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index 7f903cf9a1b9..75887d7d2c6a 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -33,7 +33,7 @@ static bool nft_hash_lookup(const struct nft_set *set, const struct nft_data *key, struct nft_data *data) { - const struct rhashtable *priv = nft_set_priv(set); + struct rhashtable *priv = nft_set_priv(set); const struct nft_hash_elem *he; he = rhashtable_lookup(priv, key); @@ -113,7 +113,7 @@ static bool nft_hash_compare(void *ptr, void *arg) static int nft_hash_get(const struct nft_set *set, struct nft_set_elem *elem) { - const struct rhashtable *priv = nft_set_priv(set); + struct rhashtable *priv = nft_set_priv(set); struct nft_compare_arg arg = { .set = set, .elem = elem, @@ -129,7 +129,7 @@ static int nft_hash_get(const struct nft_set *set, struct nft_set_elem *elem) static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set, struct nft_set_iter *iter) { - const struct rhashtable *priv = nft_set_priv(set); + struct 
rhashtable *priv = nft_set_priv(set); const struct bucket_table *tbl; const struct nft_hash_elem *he; struct nft_set_elem elem; @@ -162,13 +162,6 @@ static unsigned int nft_hash_privsize(const struct nlattr * const nla[]) return sizeof(struct rhashtable); } -#ifdef CONFIG_PROVE_LOCKING -static int lockdep_nfnl_lock_is_held(void *parent) -{ - return lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES); -} -#endif - static int nft_hash_init(const struct nft_set *set, const struct nft_set_desc *desc, const struct nlattr * const tb[]) @@ -182,9 +175,6 @@ static int nft_hash_init(const struct nft_set *set, .hashfn = jhash, .grow_decision = rht_grow_above_75, .shrink_decision = rht_shrink_below_30, -#ifdef CONFIG_PROVE_LOCKING - .mutex_is_held = lockdep_nfnl_lock_is_held, -#endif }; return rhashtable_init(priv, ¶ms); @@ -192,16 +182,23 @@ static int nft_hash_init(const struct nft_set *set, static void nft_hash_destroy(const struct nft_set *set) { - const struct rhashtable *priv = nft_set_priv(set); - const struct bucket_table *tbl = priv->tbl; + struct rhashtable *priv = nft_set_priv(set); + const struct bucket_table *tbl; struct nft_hash_elem *he; struct rhash_head *pos, *next; unsigned int i; + /* Stop an eventual async resizing */ + priv->being_destroyed = true; + mutex_lock(&priv->mutex); + + tbl = rht_dereference(priv->tbl, priv); for (i = 0; i < tbl->size; i++) { rht_for_each_entry_safe(he, pos, next, tbl, i, node) nft_hash_elem_destroy(set, he); } + mutex_unlock(&priv->mutex); + rhashtable_destroy(priv); } diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 57449b6089c2..738c3bfaa564 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -114,15 +114,6 @@ static atomic_t nl_table_users = ATOMIC_INIT(0); DEFINE_MUTEX(nl_sk_hash_lock); EXPORT_SYMBOL_GPL(nl_sk_hash_lock); -#ifdef CONFIG_PROVE_LOCKING -static int lockdep_nl_sk_hash_is_held(void *parent) -{ - if (debug_locks) - return lockdep_is_held(&nl_sk_hash_lock) || lockdep_is_held(&nl_table_lock); - return 1; -} -#endif - static ATOMIC_NOTIFIER_HEAD(netlink_chain); static DEFINE_SPINLOCK(netlink_tap_lock); @@ -1063,7 +1054,8 @@ static int netlink_insert(struct sock *sk, struct net *net, u32 portid) goto err; err = -ENOMEM; - if (BITS_PER_LONG > 32 && unlikely(table->hash.nelems >= UINT_MAX)) + if (BITS_PER_LONG > 32 && + unlikely(atomic_read(&table->hash.nelems) >= UINT_MAX)) goto err; nlk_sk(sk)->portid = portid; @@ -3122,9 +3114,6 @@ static int __init netlink_proto_init(void) .max_shift = 16, /* 64K */ .grow_decision = rht_grow_above_75, .shrink_decision = rht_shrink_below_30, -#ifdef CONFIG_PROVE_LOCKING - .mutex_is_held = lockdep_nl_sk_hash_is_held, -#endif }; if (err != 0) -- cgit v1.2.3 From f89bd6f87a53ce5a7d60662429591ebac2745c10 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Fri, 2 Jan 2015 23:00:21 +0100 Subject: rhashtable: Supports for nulls marker In order to allow for wider usage of rhashtable, use a special nulls marker to terminate each chain. The reason for not using the existing nulls_list is that the prev pointer usage would not be valid as entries can be linked in two different buckets at the same time. The 4 nulls base bits can be set through the rhashtable_params structure like this: struct rhashtable_params params = { [...] .nulls_base = (1U << RHT_BASE_SHIFT), }; This reduces the hash length from 32 bits to 27 bits. Signed-off-by: Thomas Graf Signed-off-by: David S. 
Miller --- include/linux/list_nulls.h | 3 ++- include/linux/rhashtable.h | 57 ++++++++++++++++++++++++++++++++++++++-------- lib/rhashtable.c | 37 ++++++++++++++++++++++++------ 3 files changed, 79 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h index 5d10ae364b5e..e8c300e06438 100644 --- a/include/linux/list_nulls.h +++ b/include/linux/list_nulls.h @@ -21,8 +21,9 @@ struct hlist_nulls_head { struct hlist_nulls_node { struct hlist_nulls_node *next, **pprev; }; +#define NULLS_MARKER(value) (1UL | (((long)value) << 1)) #define INIT_HLIST_NULLS_HEAD(ptr, nulls) \ - ((ptr)->first = (struct hlist_nulls_node *) (1UL | (((long)nulls) << 1))) + ((ptr)->first = (struct hlist_nulls_node *) NULLS_MARKER(nulls)) #define hlist_nulls_entry(ptr, type, member) container_of(ptr,type,member) /** diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index a1688f0a6193..de7cac753b09 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -18,15 +18,32 @@ #ifndef _LINUX_RHASHTABLE_H #define _LINUX_RHASHTABLE_H -#include +#include #include +/* + * The end of the chain is marked with a special nulls marks which has + * the following format: + * + * +-------+-----------------------------------------------------+-+ + * | Base | Hash |1| + * +-------+-----------------------------------------------------+-+ + * + * Base (4 bits) : Reserved to distinguish between multiple tables. + * Specified via &struct rhashtable_params.nulls_base. + * Hash (27 bits): Full hash (unmasked) of first element added to bucket + * 1 (1 bit) : Nulls marker (always set) + * + * The remaining bits of the next pointer remain unused for now. + */ +#define RHT_BASE_BITS 4 +#define RHT_HASH_BITS 27 +#define RHT_BASE_SHIFT RHT_HASH_BITS + struct rhash_head { struct rhash_head __rcu *next; }; -#define INIT_HASH_HEAD(ptr) ((ptr)->next = NULL) - /** * struct bucket_table - Table of hash buckets * @size: Number of hash buckets @@ -55,6 +72,7 @@ struct rhashtable; * @hash_rnd: Seed to use while hashing * @max_shift: Maximum number of shifts while expanding * @min_shift: Minimum number of shifts while shrinking + * @nulls_base: Base value to generate nulls marker * @locks_mul: Number of bucket locks to allocate per cpu (default: 128) * @hashfn: Function to hash key * @obj_hashfn: Function to hash object @@ -69,6 +87,7 @@ struct rhashtable_params { u32 hash_rnd; size_t max_shift; size_t min_shift; + u32 nulls_base; size_t locks_mul; rht_hashfn_t hashfn; rht_obj_hashfn_t obj_hashfn; @@ -100,6 +119,24 @@ struct rhashtable { bool being_destroyed; }; +static inline unsigned long rht_marker(const struct rhashtable *ht, u32 hash) +{ + return NULLS_MARKER(ht->p.nulls_base + hash); +} + +#define INIT_RHT_NULLS_HEAD(ptr, ht, hash) \ + ((ptr) = (typeof(ptr)) rht_marker(ht, hash)) + +static inline bool rht_is_a_nulls(const struct rhash_head *ptr) +{ + return ((unsigned long) ptr & 1); +} + +static inline unsigned long rht_get_nulls_value(const struct rhash_head *ptr) +{ + return ((unsigned long) ptr) >> 1; +} + #ifdef CONFIG_PROVE_LOCKING int lockdep_rht_mutex_is_held(struct rhashtable *ht); int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash); @@ -157,7 +194,7 @@ void rhashtable_destroy(struct rhashtable *ht); */ #define rht_for_each_continue(pos, head, tbl, hash) \ for (pos = rht_dereference_bucket(head, tbl, hash); \ - pos; \ + !rht_is_a_nulls(pos); \ pos = rht_dereference_bucket((pos)->next, tbl, hash)) /** @@ -180,7 +217,7 
@@ void rhashtable_destroy(struct rhashtable *ht); */ #define rht_for_each_entry_continue(tpos, pos, head, tbl, hash, member) \ for (pos = rht_dereference_bucket(head, tbl, hash); \ - pos && rht_entry(tpos, pos, member); \ + (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ pos = rht_dereference_bucket((pos)->next, tbl, hash)) /** @@ -209,9 +246,9 @@ void rhashtable_destroy(struct rhashtable *ht); */ #define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member) \ for (pos = rht_dereference_bucket((tbl)->buckets[hash], tbl, hash), \ - next = pos ? rht_dereference_bucket(pos->next, tbl, hash) \ - : NULL; \ - pos && rht_entry(tpos, pos, member); \ + next = !rht_is_a_nulls(pos) ? \ + rht_dereference_bucket(pos->next, tbl, hash) : NULL; \ + (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ pos = next) /** @@ -228,7 +265,7 @@ void rhashtable_destroy(struct rhashtable *ht); #define rht_for_each_rcu_continue(pos, head, tbl, hash) \ for (({barrier(); }), \ pos = rht_dereference_bucket_rcu(head, tbl, hash); \ - pos; \ + !rht_is_a_nulls(pos); \ pos = rcu_dereference_raw(pos->next)) /** @@ -260,7 +297,7 @@ void rhashtable_destroy(struct rhashtable *ht); #define rht_for_each_entry_rcu_continue(tpos, pos, head, tbl, hash, member) \ for (({barrier(); }), \ pos = rht_dereference_bucket_rcu(head, tbl, hash); \ - pos && rht_entry(tpos, pos, member); \ + (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ pos = rht_dereference_bucket_rcu(pos->next, tbl, hash)) /** diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 312e3437c7bc..cbad192d3b3d 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -28,6 +28,9 @@ #define HASH_MIN_SIZE 4UL #define BUCKET_LOCKS_PER_CPU 128UL +/* Base bits plus 1 bit for nulls marker */ +#define HASH_RESERVED_SPACE (RHT_BASE_BITS + 1) + enum { RHT_LOCK_NORMAL, RHT_LOCK_NESTED, @@ -86,7 +89,7 @@ static u32 obj_raw_hashfn(const struct rhashtable *ht, const void *ptr) hash = ht->p.hashfn(ptr + ht->p.key_offset, ht->p.key_len, ht->p.hash_rnd); - return hash; + return hash >> HASH_RESERVED_SPACE; } static u32 key_hashfn(struct rhashtable *ht, const void *key, u32 len) @@ -95,6 +98,7 @@ static u32 key_hashfn(struct rhashtable *ht, const void *key, u32 len) u32 hash; hash = ht->p.hashfn(key, len, ht->p.hash_rnd); + hash >>= HASH_RESERVED_SPACE; return rht_bucket_index(tbl, hash); } @@ -111,7 +115,7 @@ static struct rhash_head __rcu **bucket_tail(struct bucket_table *tbl, u32 n) struct rhash_head __rcu **pprev; for (pprev = &tbl->buckets[n]; - rht_dereference_bucket(*pprev, tbl, n); + !rht_is_a_nulls(rht_dereference_bucket(*pprev, tbl, n)); pprev = &rht_dereference_bucket(*pprev, tbl, n)->next) ; @@ -164,6 +168,7 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, { struct bucket_table *tbl; size_t size; + int i; size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]); tbl = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); @@ -180,6 +185,9 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, return NULL; } + for (i = 0; i < nbuckets; i++) + INIT_RHT_NULLS_HEAD(tbl->buckets[i], ht, i); + return tbl; } @@ -221,7 +229,7 @@ static void hashtable_chain_unzip(const struct rhashtable *ht, /* Old bucket empty, no work needed. 
*/ p = rht_dereference_bucket(old_tbl->buckets[old_hash], old_tbl, old_hash); - if (!p) + if (rht_is_a_nulls(p)) return; new_hash = new_hash2 = head_hashfn(ht, new_tbl, p); @@ -252,8 +260,8 @@ static void hashtable_chain_unzip(const struct rhashtable *ht, /* Find the subsequent node which does hash to the same * bucket as node P, or NULL if no such node exists. */ - next = NULL; - if (he) { + INIT_RHT_NULLS_HEAD(next, ht, old_hash); + if (!rht_is_a_nulls(he)) { rht_for_each_continue(he, he->next, old_tbl, old_hash) { if (head_hashfn(ht, new_tbl, he) == new_hash) { next = he; @@ -369,11 +377,15 @@ int rhashtable_expand(struct rhashtable *ht) */ complete = true; for (old_hash = 0; old_hash < old_tbl->size; old_hash++) { + struct rhash_head *head; + old_bucket_lock = bucket_lock(old_tbl, old_hash); spin_lock_bh(old_bucket_lock); hashtable_chain_unzip(ht, new_tbl, old_tbl, old_hash); - if (old_tbl->buckets[old_hash] != NULL) + head = rht_dereference_bucket(old_tbl->buckets[old_hash], + old_tbl, old_hash); + if (!rht_is_a_nulls(head)) complete = false; spin_unlock_bh(old_bucket_lock); @@ -498,6 +510,7 @@ static void rht_deferred_worker(struct work_struct *work) void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj) { struct bucket_table *tbl; + struct rhash_head *head; spinlock_t *lock; unsigned hash; @@ -508,7 +521,12 @@ void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj) lock = bucket_lock(tbl, hash); spin_lock_bh(lock); - RCU_INIT_POINTER(obj->next, tbl->buckets[hash]); + head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash); + if (rht_is_a_nulls(head)) + INIT_RHT_NULLS_HEAD(obj->next, ht, hash); + else + RCU_INIT_POINTER(obj->next, head); + rcu_assign_pointer(tbl->buckets[hash], obj); spin_unlock_bh(lock); @@ -709,6 +727,7 @@ static size_t rounded_hashtable_size(struct rhashtable_params *params) * .key_offset = offsetof(struct test_obj, key), * .key_len = sizeof(int), * .hashfn = jhash, + * .nulls_base = (1U << RHT_BASE_SHIFT), * }; * * Configuration Example 2: Variable length keys @@ -741,6 +760,9 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params) (!params->key_len && !params->obj_hashfn)) return -EINVAL; + if (params->nulls_base && params->nulls_base < (1U << RHT_BASE_SHIFT)) + return -EINVAL; + params->min_shift = max_t(size_t, params->min_shift, ilog2(HASH_MIN_SIZE)); @@ -974,6 +996,7 @@ static int __init test_rht_init(void) .key_offset = offsetof(struct test_obj, value), .key_len = sizeof(int), .hashfn = jhash, + .nulls_base = (3U << RHT_BASE_SHIFT), .grow_decision = rht_grow_above_75, .shrink_decision = rht_shrink_below_30, }; -- cgit v1.2.3 From 86b35b64ed7b6b38305dee67a0f2ddff2ca5455d Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Sun, 4 Jan 2015 15:25:09 +0800 Subject: rhashtable: fix missing header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixup below build error: include/linux/rhashtable.h: At top level: include/linux/rhashtable.h:118:34: error: field ‘mutex’ has incomplete type Signed-off-by: Ying Xue Acked-by: Thomas Graf Signed-off-by: David S. 
Miller --- include/linux/rhashtable.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index de7cac753b09..de1459c74c4d 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -20,6 +20,7 @@ #include #include +#include /* * The end of the chain is marked with a special nulls marks which has -- cgit v1.2.3 From a3449ded128d037e6b2bd7a59b0741de56506066 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Sun, 4 Jan 2015 15:24:35 +0800 Subject: list_nulls: fix missing header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixup below build error: include/linux/list_nulls.h: In function ‘hlist_nulls_del’: include/linux/list_nulls.h:84:13: error: ‘LIST_POISON2’ undeclared (first use in this function) Signed-off-by: Ying Xue Signed-off-by: David S. Miller --- include/linux/list_nulls.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h index e8c300e06438..f266661d2666 100644 --- a/include/linux/list_nulls.h +++ b/include/linux/list_nulls.h @@ -1,6 +1,9 @@ #ifndef _LINUX_LIST_NULLS_H #define _LINUX_LIST_NULLS_H +#include +#include + /* * Special version of lists, where end of list is not a NULL pointer, * but a 'nulls' marker, which can have many different values. -- cgit v1.2.3 From 224d019c4fbba242041e9b25a926ba873b7da1e2 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 5 Jan 2015 13:56:14 -0800 Subject: ip: Move checksum convert defines to inet Move convert_csum from udp_sock to inet_sock. This allows the possibility that we can use convert checksum for different types of sockets and also allows convert checksum to be enabled from inet layer (what we'll want to do when enabling IP_CHECKSUM cmsg). Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/udp.h | 16 +--------------- include/net/inet_sock.h | 17 +++++++++++++++++ net/ipv4/fou.c | 2 +- net/ipv4/udp.c | 2 +- net/ipv4/udp_tunnel.c | 2 +- net/ipv6/udp.c | 2 +- 6 files changed, 22 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/udp.h b/include/linux/udp.h index ee3277593222..247cfdcc4b08 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -49,11 +49,7 @@ struct udp_sock { unsigned int corkflag; /* Cork is required */ __u8 encap_type; /* Is this an Encapsulation socket? */ unsigned char no_check6_tx:1,/* Send zero UDP6 checksums on TX? */ - no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */ - convert_csum:1;/* On receive, convert checksum - * unnecessary to checksum complete - * if possible. - */ + no_check6_rx:1;/* Allow zero UDP6 checksums on RX? */ /* * Following member retains the information to create a UDP header * when the socket is uncorked. 
@@ -102,16 +98,6 @@ static inline bool udp_get_no_check6_rx(struct sock *sk) return udp_sk(sk)->no_check6_rx; } -static inline void udp_set_convert_csum(struct sock *sk, bool val) -{ - udp_sk(sk)->convert_csum = val; -} - -static inline bool udp_get_convert_csum(struct sock *sk) -{ - return udp_sk(sk)->convert_csum; -} - #define udp_portaddr_for_each_entry(__sk, node, list) \ hlist_nulls_for_each_entry(__sk, node, list, __sk_common.skc_portaddr_node) diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index a829b77523cf..360b110b3e36 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -184,6 +184,7 @@ struct inet_sock { mc_all:1, nodefrag:1; __u8 rcv_tos; + __u8 convert_csum; int uc_index; int mc_index; __be32 mc_addr; @@ -250,4 +251,20 @@ static inline __u8 inet_sk_flowi_flags(const struct sock *sk) return flags; } +static inline void inet_inc_convert_csum(struct sock *sk) +{ + inet_sk(sk)->convert_csum++; +} + +static inline void inet_dec_convert_csum(struct sock *sk) +{ + if (inet_sk(sk)->convert_csum > 0) + inet_sk(sk)->convert_csum--; +} + +static inline bool inet_get_convert_csum(struct sock *sk) +{ + return !!inet_sk(sk)->convert_csum; +} + #endif /* _INET_SOCK_H */ diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index b986298a7ba3..2197c36f722f 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -490,7 +490,7 @@ static int fou_create(struct net *net, struct fou_cfg *cfg, sk->sk_user_data = fou; fou->sock = sock; - udp_set_convert_csum(sk, true); + inet_inc_convert_csum(sk); sk->sk_allocation = GFP_ATOMIC; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 13b4dcf86ef6..53358d88f110 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1806,7 +1806,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (sk != NULL) { int ret; - if (udp_sk(sk)->convert_csum && uh->check && !IS_UDPLITE(sk)) + if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk)) skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check, inet_compute_pseudo); diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 1671263e5fa0..9996e63ed304 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -63,7 +63,7 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, inet_sk(sk)->mc_loop = 0; /* Enable CHECKSUM_UNNECESSARY to CHECKSUM_COMPLETE conversion */ - udp_set_convert_csum(sk, true); + inet_inc_convert_csum(sk); rcu_assign_sk_user_data(sk, cfg->sk_user_data); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 189dc4ae3eca..e41f017cd479 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -909,7 +909,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, goto csum_error; } - if (udp_sk(sk)->convert_csum && uh->check && !IS_UDPLITE(sk)) + if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk)) skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check, ip6_compute_pseudo); -- cgit v1.2.3 From 6810e4a394f9d781050107529b8d1465c00b7b13 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 6 Jan 2015 10:26:10 -0500 Subject: percpu_ref: remove unnecessary ACCESS_ONCE() in percpu_ref_tryget_live() __ref_is_percpu() needs the implied ACCESS_ONCE() in lockless_dereference() on @ref->percpu_count_ptr because the value is tested for !__PERCPU_REF_ATOMIC, which may be set asynchronously, and then used as a pointer. If the compiler generates a separate fetch when using it as a pointer, __PERCPU_REF_ATOMIC may be set in between contaminating the pointer value. 
percpu_ref_tryget_live() also uses ACCESS_ONCE() to test __PERCPU_REF_DEAD; however, there's no reason for this. I just copied ACCESS_ONCE() usage blindly from __ref_is_percpu(). All it does is confuse people trying to understand what's going on. This patch removes the unnecessary ACCESS_ONCE() usage from percpu_ref_tryget_live() and adds a comment explaining why __ref_is_percpu() needs it. Signed-off-by: Tejun Heo Cc: Kent Overstreet --- include/linux/percpu-refcount.h | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index b4337646388b..6a7a670366ab 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -128,8 +128,22 @@ static inline void percpu_ref_kill(struct percpu_ref *ref) static inline bool __ref_is_percpu(struct percpu_ref *ref, unsigned long __percpu **percpu_countp) { - /* paired with smp_store_release() in percpu_ref_reinit() */ - unsigned long percpu_ptr = lockless_dereference(ref->percpu_count_ptr); + unsigned long percpu_ptr; + + /* + * The value of @ref->percpu_count_ptr is tested for + * !__PERCPU_REF_ATOMIC, which may be set asynchronously, and then + * used as a pointer. If the compiler generates a separate fetch + * when using it as a pointer, __PERCPU_REF_ATOMIC may be set in + * between contaminating the pointer value, meaning that + * ACCESS_ONCE() is required when fetching it. + * + * Also, we need a data dependency barrier to be paired with + * smp_store_release() in __percpu_ref_switch_to_percpu(). + * + * Use lockless deref which contains both. + */ + percpu_ptr = lockless_dereference(ref->percpu_count_ptr); /* * Theoretically, the following could test just ATOMIC; however, @@ -233,7 +247,7 @@ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) if (__ref_is_percpu(ref, &percpu_count)) { this_cpu_inc(*percpu_count); ret = true; - } else if (!(ACCESS_ONCE(ref->percpu_count_ptr) & __PERCPU_REF_DEAD)) { + } else if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) { ret = atomic_long_inc_not_zero(&ref->count); } -- cgit v1.2.3 From 4c907baf36d8339f393bb576d0bab29194d0e6ad Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 6 Jan 2015 10:26:10 -0500 Subject: percpu_ref: implement percpu_ref_is_dying() Implement percpu_ref_is_dying() which tests whether the ref is dying or dead. This is useful to determine the current state when a percpu_ref is used as a cyclic on/off switch via kill and reinit. Signed-off-by: Tejun Heo Cc: Kent Overstreet --- include/linux/percpu-refcount.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index 6a7a670366ab..12c9b485beb7 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -294,6 +294,20 @@ static inline void percpu_ref_put(struct percpu_ref *ref) percpu_ref_put_many(ref, 1); } +/** + * percpu_ref_is_dying - test whether a percpu refcount is dying or dead + * @ref: percpu_ref to test + * + * Returns %true if @ref is dying or dead. + * + * This function is safe to call as long as @ref is between init and exit + * and the caller is responsible for synchronizing against state changes.
+ */ +static inline bool percpu_ref_is_dying(struct percpu_ref *ref) +{ + return ref->percpu_count_ptr & __PERCPU_REF_DEAD; +} + /** * percpu_ref_is_zero - test whether a percpu refcount reached zero * @ref: percpu_ref to test -- cgit v1.2.3 From 24dab7a7b3534ef40ecec20cfd7fb3ad99d9ff33 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 6 Jan 2015 12:02:46 -0500 Subject: cgroup: reorder SUBSYS(blkio) in cgroup_subsys.h The scheduled cgroup writeback support requires blkio to be initialized before memcg as memcg needs to provide certain blkcg related functionalities. Relocate blkio so that it's right above memory. Signed-off-by: Tejun Heo --- include/linux/cgroup_subsys.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 98c4f9b12b03..e4a96fb14403 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -15,6 +15,10 @@ SUBSYS(cpu) SUBSYS(cpuacct) #endif +#if IS_ENABLED(CONFIG_BLK_CGROUP) +SUBSYS(blkio) +#endif + #if IS_ENABLED(CONFIG_MEMCG) SUBSYS(memory) #endif @@ -31,10 +35,6 @@ SUBSYS(freezer) SUBSYS(net_cls) #endif -#if IS_ENABLED(CONFIG_BLK_CGROUP) -SUBSYS(blkio) -#endif - #if IS_ENABLED(CONFIG_CGROUP_PERF) SUBSYS(perf_event) #endif -- cgit v1.2.3 From f3ba53802eff25e3eedb60d7afe5262710e20bd5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 6 Jan 2015 12:02:46 -0500 Subject: cgroup: add dummy css_put() for !CONFIG_CGROUPS This will later be depended upon by the scheduled cgroup writeback support. Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index da0dae0600e6..b9cb94c3102a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -943,6 +943,8 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, #else /* !CONFIG_CGROUPS */ +struct cgroup_subsys_state; + static inline int cgroup_init_early(void) { return 0; } static inline int cgroup_init(void) { return 0; } static inline void cgroup_fork(struct task_struct *p) {} @@ -955,6 +957,8 @@ static inline int cgroupstats_build(struct cgroupstats *stats, return -EINVAL; } +static inline void css_put(struct cgroup_subsys_state *css) {} + /* No cgroups - nothing to do */ static inline int cgroup_attach_task_all(struct task_struct *from, struct task_struct *t) -- cgit v1.2.3 From 536fa402221f09633e7c5801b327055ab716a363 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 5 Sep 2014 11:14:48 -0700 Subject: compiler: Allow 1- and 2-byte smp_load_acquire() and smp_store_release() CPUs without single-byte and double-byte loads and stores place some "interesting" requirements on concurrent code. For example (adapted from Peter Hurley's test code), suppose we have the following structure: struct foo { spinlock_t lock1; spinlock_t lock2; char a; /* Protected by lock1. */ char b; /* Protected by lock2. */ }; struct foo *foop; Of course, it is common (and good) practice to place data protected by different locks in separate cache lines. However, if the locks are rarely acquired (for example, only in rare error cases), and there are a great many instances of the data structure, then memory footprint can trump false-sharing concerns, so that it can be better to place them in the same cache line as above.
But if the CPU does not support single-byte loads and stores, a store to foop->a will do a non-atomic read-modify-write operation on foop->b, which will come as a nasty surprise to someone holding foop->lock2. So we now require CPUs to support single-byte and double-byte loads and stores. Therefore, this commit adjusts the definition of __native_word() to allow these sizes to be used by smp_load_acquire() and smp_store_release(). Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra --- include/linux/compiler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index a1c81f80978e..49811cdddaa5 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -385,7 +385,7 @@ static __always_inline void __assign_once_size(volatile void *p, void *res, int /* Is this type a native word size -- useful for atomic operations */ #ifndef __native_word -# define __native_word(t) (sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long)) +# define __native_word(t) (sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long)) #endif /* Compile time object size, -1 for unknown */ -- cgit v1.2.3 From ac59853c06993a442e8060bc19040b2ca3025aec Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Thu, 13 Nov 2014 14:24:14 -0500 Subject: rcupdate: Replace smp_read_barrier_depends() with lockless_dereference() Recently lockless_dereference() was added which can be used in place of hard-coding smp_read_barrier_depends(). The following PATCH makes the change. Signed-off-by: Pranith Kumar Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index ed4f5939a452..386ba288084a 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -582,11 +582,11 @@ static inline void rcu_preempt_sleep_check(void) }) #define __rcu_dereference_check(p, c, space) \ ({ \ - typeof(*p) *_________p1 = (typeof(*p) *__force)ACCESS_ONCE(p); \ + /* Dependency order vs. p above. */ \ + typeof(*p) *________p1 = (typeof(*p) *__force)lockless_dereference(p); \ rcu_lockdep_assert(c, "suspicious rcu_dereference_check() usage"); \ rcu_dereference_sparse(p, space); \ - smp_read_barrier_depends(); /* Dependency order vs. p above. */ \ - ((typeof(*p) __force __kernel *)(_________p1)); \ + ((typeof(*p) __force __kernel *)(________p1)); \ }) #define __rcu_dereference_protected(p, c, space) \ ({ \ @@ -603,10 +603,10 @@ static inline void rcu_preempt_sleep_check(void) }) #define __rcu_dereference_index_check(p, c) \ ({ \ - typeof(p) _________p1 = ACCESS_ONCE(p); \ + /* Dependency order vs. p above. */ \ + typeof(p) _________p1 = lockless_dereference(p); \ rcu_lockdep_assert(c, \ "suspicious rcu_dereference_index_check() usage"); \ - smp_read_barrier_depends(); /* Dependency order vs. p above. 
*/ \ (_________p1); \ }) -- cgit v1.2.3 From f520c98e3e5212d8c282a86d9b7697dd70326192 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Fri, 12 Dec 2014 09:36:14 +0800 Subject: rculist: Fix sparse warning This fixes the following sparse warnings: make C=1 CF=-D__CHECK_ENDIAN__ net/ipv6/addrconf.o net/ipv6/addrconf.c:3495:9: error: incompatible types in comparison expression (different address spaces) net/ipv6/addrconf.c:3495:9: error: incompatible types in comparison expression (different address spaces) net/ipv6/addrconf.c:3495:9: error: incompatible types in comparison expression (different address spaces) net/ipv6/addrconf.c:3495:9: error: incompatible types in comparison expression (different address spaces) To silence these sparse complaints, an RCU annotation should be added to the "next" pointer of the hlist_node structure through the hlist_next_rcu() macro when iterating over a hlist with hlist_for_each_entry_continue_rcu_bh(). By the way, this commit also resolves the same error appearing in hlist_for_each_entry_continue_rcu(). Signed-off-by: Ying Xue Signed-off-by: Paul E. McKenney --- include/linux/rculist.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 529bc946f450..a18b16f1dc0e 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -524,11 +524,11 @@ static inline void hlist_add_behind_rcu(struct hlist_node *n, * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_continue_rcu(pos, member) \ - for (pos = hlist_entry_safe(rcu_dereference((pos)->member.next),\ - typeof(*(pos)), member); \ + for (pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \ + &(pos)->member)), typeof(*(pos)), member); \ pos; \ - pos = hlist_entry_safe(rcu_dereference((pos)->member.next),\ - typeof(*(pos)), member)) + pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \ + &(pos)->member)), typeof(*(pos)), member)) /** * hlist_for_each_entry_continue_rcu_bh - iterate over a hlist continuing after current point @@ -536,11 +536,11 @@ static inline void hlist_add_behind_rcu(struct hlist_node *n, * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_continue_rcu_bh(pos, member) \ - for (pos = hlist_entry_safe(rcu_dereference_bh((pos)->member.next),\ - typeof(*(pos)), member); \ + for (pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu( \ + &(pos)->member)), typeof(*(pos)), member); \ pos; \ - pos = hlist_entry_safe(rcu_dereference_bh((pos)->member.next),\ - typeof(*(pos)), member)) + pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu( \ + &(pos)->member)), typeof(*(pos)), member)) /** * hlist_for_each_entry_from_rcu - iterate over a hlist continuing from current point -- cgit v1.2.3 From a5c198f4f7da6cc48116ca239c59c9f44b753364 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 23 Nov 2014 20:30:06 -0800 Subject: rcu: Expand SRCU ->completed to 64 bits When rcutorture used only the low-order 32 bits of the grace-period number, it was not a problem for SRCU to use a 32-bit completed field. However, rcutorture now uses the full 64 bits on 64-bit systems, so this commit converts SRCU's ->completed field to unsigned long so as to provide 64 bits on 64-bit systems. Signed-off-by: Paul E.
McKenney --- include/linux/srcu.h | 4 ++-- kernel/rcu/srcu.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index a2783cb5d275..ef923dd96249 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -45,7 +45,7 @@ struct rcu_batch { #define RCU_BATCH_INIT(name) { NULL, &(name.head) } struct srcu_struct { - unsigned completed; + unsigned long completed; struct srcu_struct_array __percpu *per_cpu_ref; spinlock_t queue_lock; /* protect ->batch_queue, ->running */ bool running; @@ -135,7 +135,7 @@ int __srcu_read_lock(struct srcu_struct *sp) __acquires(sp); void __srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp); void synchronize_srcu(struct srcu_struct *sp); void synchronize_srcu_expedited(struct srcu_struct *sp); -long srcu_batches_completed(struct srcu_struct *sp); +unsigned long srcu_batches_completed(struct srcu_struct *sp); void srcu_barrier(struct srcu_struct *sp); #ifdef CONFIG_DEBUG_LOCK_ALLOC diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index e037f3eb2f7b..445bf8ffe3fb 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -546,7 +546,7 @@ EXPORT_SYMBOL_GPL(srcu_barrier); * Report the number of batches, correlated with, but not necessarily * precisely the same as, the number of grace periods that have elapsed. */ -long srcu_batches_completed(struct srcu_struct *sp) +unsigned long srcu_batches_completed(struct srcu_struct *sp) { return sp->completed; } -- cgit v1.2.3 From 9735af5c78599703be633c057af3faee26482028 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 26 Nov 2014 10:42:50 -0800 Subject: rcu: Combine DEFINE_SRCU() and DEFINE_STATIC_SRCU() The DEFINE_SRCU() and DEFINE_STATIC_SRCU() definitions are quite similar, so this commit combines them, saving a bit of code and removing redundancy. Signed-off-by: Paul E. McKenney --- include/linux/srcu.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index ef923dd96249..9cfd9623fb03 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -102,13 +102,11 @@ void process_srcu(struct work_struct *work); * define and init a srcu struct at build time. * dont't call init_srcu_struct() nor cleanup_srcu_struct() on it. */ -#define DEFINE_SRCU(name) \ +#define __DEFINE_SRCU(name, is_static) \ static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\ - struct srcu_struct name = __SRCU_STRUCT_INIT(name); - -#define DEFINE_STATIC_SRCU(name) \ - static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\ - static struct srcu_struct name = __SRCU_STRUCT_INIT(name); + is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name) +#define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) +#define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) /** * call_srcu() - Queue a callback for invocation after an SRCU grace period -- cgit v1.2.3 From 8b3a38daff6f50027039d6979b9eb026907508eb Mon Sep 17 00:00:00 2001 From: Arend van Spriel Date: Tue, 23 Dec 2014 19:04:23 +0100 Subject: brcmfmac: Add support for bcm43340/1 wireless chipsets This patch adds support for the bcm43340 and bcm43341 wireless chipsets. These two chipsets are identical from wireless parts perspective. As such they use the same firmware image. 
Cc: Samuel Ortiz Cc: Rob Herring Signed-off-by: John Stultz [arend@broadcom.com: squash to single commit, remove 43341 chipid] Reviewed-by: Pieter-Paul Giesberts Reviewed-by: Hante Meuleman Signed-off-by: Arend van Spriel Signed-off-by: Kalle Valo --- drivers/net/wireless/brcm80211/brcmfmac/bcmsdh.c | 2 ++ drivers/net/wireless/brcm80211/brcmfmac/chip.c | 1 + drivers/net/wireless/brcm80211/brcmfmac/sdio.c | 5 +++++ drivers/net/wireless/brcm80211/include/brcm_hw_ids.h | 3 +++ include/linux/mmc/sdio_ids.h | 6 ++++-- 5 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wireless/brcm80211/brcmfmac/bcmsdh.c b/drivers/net/wireless/brcm80211/brcmfmac/bcmsdh.c index 9880dae2a569..dffd9e44f5b6 100644 --- a/drivers/net/wireless/brcm80211/brcmfmac/bcmsdh.c +++ b/drivers/net/wireless/brcm80211/brcmfmac/bcmsdh.c @@ -1005,6 +1005,8 @@ static const struct sdio_device_id brcmf_sdmmc_ids[] = { BRCMF_SDIO_DEVICE(BRCM_SDIO_4329_DEVICE_ID), BRCMF_SDIO_DEVICE(BRCM_SDIO_4330_DEVICE_ID), BRCMF_SDIO_DEVICE(BRCM_SDIO_4334_DEVICE_ID), + BRCMF_SDIO_DEVICE(BRCM_SDIO_43340_DEVICE_ID), + BRCMF_SDIO_DEVICE(BRCM_SDIO_43341_DEVICE_ID), BRCMF_SDIO_DEVICE(BRCM_SDIO_43362_DEVICE_ID), BRCMF_SDIO_DEVICE(BRCM_SDIO_4335_4339_DEVICE_ID), BRCMF_SDIO_DEVICE(BRCM_SDIO_4354_DEVICE_ID), diff --git a/drivers/net/wireless/brcm80211/brcmfmac/chip.c b/drivers/net/wireless/brcm80211/brcmfmac/chip.c index 519b79ebaabd..04d2ca0d87d6 100644 --- a/drivers/net/wireless/brcm80211/brcmfmac/chip.c +++ b/drivers/net/wireless/brcm80211/brcmfmac/chip.c @@ -481,6 +481,7 @@ static void brcmf_chip_get_raminfo(struct brcmf_chip_priv *ci) ci->pub.ramsize = 0x48000; break; case BRCM_CC_4334_CHIP_ID: + case BRCM_CC_43340_CHIP_ID: ci->pub.ramsize = 0x80000; break; case BRCM_CC_4335_CHIP_ID: diff --git a/drivers/net/wireless/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/brcm80211/brcmfmac/sdio.c index 0b0d51a61060..551da356a5bd 100644 --- a/drivers/net/wireless/brcm80211/brcmfmac/sdio.c +++ b/drivers/net/wireless/brcm80211/brcmfmac/sdio.c @@ -608,6 +608,8 @@ static const struct sdiod_drive_str sdiod_drvstr_tab2_3v3[] = { #define BCM4330_NVRAM_NAME "brcm/brcmfmac4330-sdio.txt" #define BCM4334_FIRMWARE_NAME "brcm/brcmfmac4334-sdio.bin" #define BCM4334_NVRAM_NAME "brcm/brcmfmac4334-sdio.txt" +#define BCM43340_FIRMWARE_NAME "brcm/brcmfmac43340-sdio.bin" +#define BCM43340_NVRAM_NAME "brcm/brcmfmac43340-sdio.txt" #define BCM4335_FIRMWARE_NAME "brcm/brcmfmac4335-sdio.bin" #define BCM4335_NVRAM_NAME "brcm/brcmfmac4335-sdio.txt" #define BCM43362_FIRMWARE_NAME "brcm/brcmfmac43362-sdio.bin" @@ -629,6 +631,8 @@ MODULE_FIRMWARE(BCM4330_FIRMWARE_NAME); MODULE_FIRMWARE(BCM4330_NVRAM_NAME); MODULE_FIRMWARE(BCM4334_FIRMWARE_NAME); MODULE_FIRMWARE(BCM4334_NVRAM_NAME); +MODULE_FIRMWARE(BCM43340_FIRMWARE_NAME); +MODULE_FIRMWARE(BCM43340_NVRAM_NAME); MODULE_FIRMWARE(BCM4335_FIRMWARE_NAME); MODULE_FIRMWARE(BCM4335_NVRAM_NAME); MODULE_FIRMWARE(BCM43362_FIRMWARE_NAME); @@ -660,6 +664,7 @@ static const struct brcmf_firmware_names brcmf_fwname_data[] = { { BRCM_CC_4329_CHIP_ID, 0xFFFFFFFF, BRCMF_FIRMWARE_NVRAM(BCM4329) }, { BRCM_CC_4330_CHIP_ID, 0xFFFFFFFF, BRCMF_FIRMWARE_NVRAM(BCM4330) }, { BRCM_CC_4334_CHIP_ID, 0xFFFFFFFF, BRCMF_FIRMWARE_NVRAM(BCM4334) }, + { BRCM_CC_43340_CHIP_ID, 0xFFFFFFFF, BRCMF_FIRMWARE_NVRAM(BCM43340) }, { BRCM_CC_4335_CHIP_ID, 0xFFFFFFFF, BRCMF_FIRMWARE_NVRAM(BCM4335) }, { BRCM_CC_43362_CHIP_ID, 0xFFFFFFFE, BRCMF_FIRMWARE_NVRAM(BCM43362) }, { BRCM_CC_4339_CHIP_ID, 0xFFFFFFFF, BRCMF_FIRMWARE_NVRAM(BCM4339) 
}, diff --git a/drivers/net/wireless/brcm80211/include/brcm_hw_ids.h b/drivers/net/wireless/brcm80211/include/brcm_hw_ids.h index 6996fcc144cf..00215efbc13b 100644 --- a/drivers/net/wireless/brcm80211/include/brcm_hw_ids.h +++ b/drivers/net/wireless/brcm80211/include/brcm_hw_ids.h @@ -34,6 +34,7 @@ #define BRCM_CC_4329_CHIP_ID 0x4329 #define BRCM_CC_4330_CHIP_ID 0x4330 #define BRCM_CC_4334_CHIP_ID 0x4334 +#define BRCM_CC_43340_CHIP_ID 43340 #define BRCM_CC_43362_CHIP_ID 43362 #define BRCM_CC_4335_CHIP_ID 0x4335 #define BRCM_CC_4339_CHIP_ID 0x4339 @@ -51,6 +52,8 @@ #define BRCM_SDIO_4329_DEVICE_ID BRCM_CC_4329_CHIP_ID #define BRCM_SDIO_4330_DEVICE_ID BRCM_CC_4330_CHIP_ID #define BRCM_SDIO_4334_DEVICE_ID BRCM_CC_4334_CHIP_ID +#define BRCM_SDIO_43340_DEVICE_ID BRCM_CC_43340_CHIP_ID +#define BRCM_SDIO_43341_DEVICE_ID 43341 #define BRCM_SDIO_43362_DEVICE_ID BRCM_CC_43362_CHIP_ID #define BRCM_SDIO_4335_4339_DEVICE_ID BRCM_CC_4335_CHIP_ID #define BRCM_SDIO_4354_DEVICE_ID BRCM_CC_4354_CHIP_ID diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h index 0f01fe065424..996807963716 100644 --- a/include/linux/mmc/sdio_ids.h +++ b/include/linux/mmc/sdio_ids.h @@ -24,13 +24,15 @@ * Vendors and devices. Sort key: vendor first, device next. */ #define SDIO_VENDOR_ID_BROADCOM 0x02d0 -#define SDIO_DEVICE_ID_BROADCOM_43143 43143 +#define SDIO_DEVICE_ID_BROADCOM_43143 0xa887 #define SDIO_DEVICE_ID_BROADCOM_43241 0x4324 #define SDIO_DEVICE_ID_BROADCOM_4329 0x4329 #define SDIO_DEVICE_ID_BROADCOM_4330 0x4330 #define SDIO_DEVICE_ID_BROADCOM_4334 0x4334 +#define SDIO_DEVICE_ID_BROADCOM_43340 0xa94c +#define SDIO_DEVICE_ID_BROADCOM_43341 0xa94d #define SDIO_DEVICE_ID_BROADCOM_4335_4339 0x4335 -#define SDIO_DEVICE_ID_BROADCOM_43362 43362 +#define SDIO_DEVICE_ID_BROADCOM_43362 0xa962 #define SDIO_DEVICE_ID_BROADCOM_4354 0x4354 #define SDIO_VENDOR_ID_INTEL 0x0089 -- cgit v1.2.3 From 2f4383667d57d1c719070db86b14277277752841 Mon Sep 17 00:00:00 2001 From: Ed Swierk Date: Fri, 2 Jan 2015 17:27:56 -0800 Subject: ethtool: Extend ethtool plugin module eeprom API to phylib This patch extends the ethtool plugin module eeprom API to support cards whose phy support is delegated to a separate driver. The handlers for ETHTOOL_GMODULEINFO and ETHTOOL_GMODULEEEPROM call the module_info and module_eeprom functions if the phy driver provides them; otherwise the handlers call the equivalent ethtool_ops functions provided by network drivers with built-in phy support. Signed-off-by: Ed Swierk Signed-off-by: David S. 
Miller --- include/linux/phy.h | 9 +++++++++ net/core/ethtool.c | 45 ++++++++++++++++++++++++++++++++++----------- 2 files changed, 43 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 22af8f8f5802..9c189a1fa3a2 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -565,6 +565,15 @@ struct phy_driver { void (*write_mmd_indirect)(struct phy_device *dev, int ptrad, int devnum, int regnum, u32 val); + /* Get the size and type of the eeprom contained within a plug-in + * module */ + int (*module_info)(struct phy_device *dev, + struct ethtool_modinfo *modinfo); + + /* Get the eeprom information from the plug-in module */ + int (*module_eeprom)(struct phy_device *dev, + struct ethtool_eeprom *ee, u8 *data); + struct device_driver driver; }; #define to_phy_driver(d) container_of(d, struct phy_driver, driver) diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 550892cd6b3f..91f74f3eb204 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1597,20 +1597,31 @@ static int ethtool_get_ts_info(struct net_device *dev, void __user *useraddr) return err; } +static int __ethtool_get_module_info(struct net_device *dev, + struct ethtool_modinfo *modinfo) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + struct phy_device *phydev = dev->phydev; + + if (phydev && phydev->drv && phydev->drv->module_info) + return phydev->drv->module_info(phydev, modinfo); + + if (ops->get_module_info) + return ops->get_module_info(dev, modinfo); + + return -EOPNOTSUPP; +} + static int ethtool_get_module_info(struct net_device *dev, void __user *useraddr) { int ret; struct ethtool_modinfo modinfo; - const struct ethtool_ops *ops = dev->ethtool_ops; - - if (!ops->get_module_info) - return -EOPNOTSUPP; if (copy_from_user(&modinfo, useraddr, sizeof(modinfo))) return -EFAULT; - ret = ops->get_module_info(dev, &modinfo); + ret = __ethtool_get_module_info(dev, &modinfo); if (ret) return ret; @@ -1620,21 +1631,33 @@ static int ethtool_get_module_info(struct net_device *dev, return 0; } +static int __ethtool_get_module_eeprom(struct net_device *dev, + struct ethtool_eeprom *ee, u8 *data) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + struct phy_device *phydev = dev->phydev; + + if (phydev && phydev->drv && phydev->drv->module_eeprom) + return phydev->drv->module_eeprom(phydev, ee, data); + + if (ops->get_module_eeprom) + return ops->get_module_eeprom(dev, ee, data); + + return -EOPNOTSUPP; +} + static int ethtool_get_module_eeprom(struct net_device *dev, void __user *useraddr) { int ret; struct ethtool_modinfo modinfo; - const struct ethtool_ops *ops = dev->ethtool_ops; - - if (!ops->get_module_info || !ops->get_module_eeprom) - return -EOPNOTSUPP; - ret = ops->get_module_info(dev, &modinfo); + ret = __ethtool_get_module_info(dev, &modinfo); if (ret) return ret; - return ethtool_get_any_eeprom(dev, useraddr, ops->get_module_eeprom, + return ethtool_get_any_eeprom(dev, useraddr, + __ethtool_get_module_eeprom, modinfo.eeprom_len); } -- cgit v1.2.3 From 9da7dae94fb8adab5cc5f395640e30736a66e910 Mon Sep 17 00:00:00 2001 From: Valentin Rothberg Date: Tue, 6 Jan 2015 17:29:29 +0100 Subject: workqueue.h: remove loops of single statement macros checkpatch.pl complained about two single statement macros in do while (0) loops. The loops and the trailing semicolons are now removed, which makes checkpatch happy and the two macros consistent with the rest of the file. 
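Nothing changes for callers of these macros; a minimal usage sketch (hypothetical demo_* names, not part of this patch), assuming only the standard workqueue API:

#include <linux/kernel.h>
#include <linux/printk.h>
#include <linux/workqueue.h>

struct demo_ctx {
        struct work_struct work;
        int payload;
};

static void demo_work_fn(struct work_struct *work)
{
        struct demo_ctx *ctx = container_of(work, struct demo_ctx, work);

        pr_info("demo work ran, payload=%d\n", ctx->payload);
}

static void demo_kick(struct demo_ctx *ctx)
{
        /* Used exactly as before this patch; only the macro body changed. */
        INIT_WORK(&ctx->work, demo_work_fn);
        schedule_work(&ctx->work);
}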
Signed-off-by: Valentin Rothberg Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index b996e6cde6bb..74db135f9957 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -220,14 +220,10 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; } #endif #define INIT_WORK(_work, _func) \ - do { \ - __INIT_WORK((_work), (_func), 0); \ - } while (0) + __INIT_WORK((_work), (_func), 0) #define INIT_WORK_ONSTACK(_work, _func) \ - do { \ - __INIT_WORK((_work), (_func), 1); \ - } while (0) + __INIT_WORK((_work), (_func), 1) #define __INIT_DELAYED_WORK(_work, _func, _tflags) \ do { \ -- cgit v1.2.3 From 548cd3ab54da10f896daa7ca422236847a915734 Mon Sep 17 00:00:00 2001 From: "Bean Huo 霍斌斌 (beanhuo)" Date: Wed, 17 Dec 2014 07:35:45 +0000 Subject: mtd: spi-nor: Add quad I/O support for Micron SPI NOR This patch adds code which enables Quad I/O mode on Micron SPI NOR flashes. For Micron SPI NOR flash, quad I/O protocol can be enabled or disabled by two methods: the EVCR (Enhanced Volatile Configuration Register) and the ENTER QUAD I/O MODE command. There is no difference between these two methods. Unfortunately, some Micron SPI NOR flashes, such as the n25q064, have no ENTER QUAD I/O MODE command (35h). But all current Micron SPI NOR flashes that support quad I/O mode support the EVCR method, so enabling Quad I/O mode through EVCR quad I/O protocol bit 7 is the recommended method: when EVCR bit 7 is reset to 0, the SPI NOR flash will operate in quad I/O mode. This patch has been tested on N25Q512A and MT25TL256BAA1ESF. All Micron SPI NOR entries in the spi_nor_ids[] table support this method.
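Since the EVCR quad-enable bit is active-low, the polarity is easy to invert by accident; a minimal sketch of the bit manipulation described above (an illustration only, restating the EVCR_QUAD_EN_MICRON value from the patch; the demo_* name is hypothetical):

#include <linux/types.h>

#define EVCR_QUAD_EN_MICRON     0x80    /* restated from the patch below */

/* Returns the EVCR value to write back so the flash enters quad I/O mode:
 * clearing bit 7 enables quad I/O, leaving it set keeps the default mode. */
static inline u8 demo_evcr_enable_quad(u8 evcr)
{
        return evcr & ~EVCR_QUAD_EN_MICRON;
}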
Signed-off-by: Bean Huo Acked-by: Marek Vasut Signed-off-by: Brian Norris --- drivers/mtd/spi-nor/spi-nor.c | 62 +++++++++++++++++++++++++++++++++++++------ include/linux/mtd/spi-nor.h | 7 +++++ 2 files changed, 61 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/spi-nor/spi-nor.c b/drivers/mtd/spi-nor/spi-nor.c index 0f8ec3c2d015..ea196c18dec9 100644 --- a/drivers/mtd/spi-nor/spi-nor.c +++ b/drivers/mtd/spi-nor/spi-nor.c @@ -560,14 +560,14 @@ static const struct spi_device_id spi_nor_ids[] = { { "mx66l1g55g", INFO(0xc2261b, 0, 64 * 1024, 2048, SPI_NOR_QUAD_READ) }, /* Micron */ - { "n25q032", INFO(0x20ba16, 0, 64 * 1024, 64, 0) }, - { "n25q064", INFO(0x20ba17, 0, 64 * 1024, 128, 0) }, - { "n25q128a11", INFO(0x20bb18, 0, 64 * 1024, 256, 0) }, - { "n25q128a13", INFO(0x20ba18, 0, 64 * 1024, 256, 0) }, - { "n25q256a", INFO(0x20ba19, 0, 64 * 1024, 512, SECT_4K) }, - { "n25q512a", INFO(0x20bb20, 0, 64 * 1024, 1024, SECT_4K) }, - { "n25q512ax3", INFO(0x20ba20, 0, 64 * 1024, 1024, USE_FSR) }, - { "n25q00", INFO(0x20ba21, 0, 64 * 1024, 2048, USE_FSR) }, + { "n25q032", INFO(0x20ba16, 0, 64 * 1024, 64, SPI_NOR_QUAD_READ) }, + { "n25q064", INFO(0x20ba17, 0, 64 * 1024, 128, SPI_NOR_QUAD_READ) }, + { "n25q128a11", INFO(0x20bb18, 0, 64 * 1024, 256, SPI_NOR_QUAD_READ) }, + { "n25q128a13", INFO(0x20ba18, 0, 64 * 1024, 256, SPI_NOR_QUAD_READ) }, + { "n25q256a", INFO(0x20ba19, 0, 64 * 1024, 512, SECT_4K | SPI_NOR_QUAD_READ) }, + { "n25q512a", INFO(0x20bb20, 0, 64 * 1024, 1024, SECT_4K | USE_FSR | SPI_NOR_QUAD_READ) }, + { "n25q512ax3", INFO(0x20ba20, 0, 64 * 1024, 1024, SECT_4K | USE_FSR | SPI_NOR_QUAD_READ) }, + { "n25q00", INFO(0x20ba21, 0, 64 * 1024, 2048, SECT_4K | USE_FSR | SPI_NOR_QUAD_READ) }, /* PMC */ { "pm25lv512", INFO(0, 0, 32 * 1024, 2, SECT_4K_PMC) }, @@ -891,6 +891,45 @@ static int spansion_quad_enable(struct spi_nor *nor) return 0; } +static int micron_quad_enable(struct spi_nor *nor) +{ + int ret; + u8 val; + + ret = nor->read_reg(nor, SPINOR_OP_RD_EVCR, &val, 1); + if (ret < 0) { + dev_err(nor->dev, "error %d reading EVCR\n", ret); + return ret; + } + + write_enable(nor); + + /* set EVCR, enable quad I/O */ + nor->cmd_buf[0] = val & ~EVCR_QUAD_EN_MICRON; + ret = nor->write_reg(nor, SPINOR_OP_WD_EVCR, nor->cmd_buf, 1, 0); + if (ret < 0) { + dev_err(nor->dev, "error while writing EVCR register\n"); + return ret; + } + + ret = spi_nor_wait_till_ready(nor); + if (ret) + return ret; + + /* read EVCR and check it */ + ret = nor->read_reg(nor, SPINOR_OP_RD_EVCR, &val, 1); + if (ret < 0) { + dev_err(nor->dev, "error %d reading EVCR\n", ret); + return ret; + } + if (val & EVCR_QUAD_EN_MICRON) { + dev_err(nor->dev, "Micron EVCR Quad bit not clear\n"); + return -EINVAL; + } + + return 0; +} + static int set_quad_mode(struct spi_nor *nor, struct flash_info *info) { int status; @@ -903,6 +942,13 @@ static int set_quad_mode(struct spi_nor *nor, struct flash_info *info) return -EINVAL; } return status; + case CFI_MFR_ST: + status = micron_quad_enable(nor); + if (status) { + dev_err(nor->dev, "Micron quad-read not enabled\n"); + return -EINVAL; + } + return status; default: status = spansion_quad_enable(nor); if (status) { diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index 63aeccf9ddc8..4720b86ee73d 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -56,6 +56,10 @@ /* Used for Spansion flashes only. */ #define SPINOR_OP_BRWR 0x17 /* Bank register write */ +/* Used for Micron flashes only. 
*/ +#define SPINOR_OP_RD_EVCR 0x65 /* Read EVCR register */ +#define SPINOR_OP_WD_EVCR 0x61 /* Write EVCR register */ + /* Status Register bits. */ #define SR_WIP 1 /* Write in progress */ #define SR_WEL 2 /* Write enable latch */ @@ -67,6 +71,9 @@ #define SR_QUAD_EN_MX 0x40 /* Macronix Quad I/O */ +/* Enhanced Volatile Configuration Register bits */ +#define EVCR_QUAD_EN_MICRON 0x80 /* Micron Quad I/O */ + /* Flag Status Register bits */ #define FSR_READY 0x80 -- cgit v1.2.3 From 3efe41be224c4441f2a872a25471a14d85ceb7c6 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Wed, 26 Nov 2014 01:01:08 -0800 Subject: mtd: implement common reboot notifier boilerplate cfi_cmdset_000{1,2}.c already implement their own reboot notifiers, and we're going to add one for NAND. Let's put the boilerplate in one place. Signed-off-by: Brian Norris Tested-by: Scott Branden --- drivers/mtd/mtdcore.c | 20 ++++++++++++++++++++ include/linux/mtd/mtd.h | 1 + 2 files changed, 21 insertions(+) (limited to 'include/linux') diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index 4c611871d7e6..cbc0fc46d2d6 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -37,6 +37,7 @@ #include #include #include +#include <linux/reboot.h> #include #include @@ -365,6 +366,17 @@ static struct device_type mtd_devtype = { .release = mtd_release, }; +static int mtd_reboot_notifier(struct notifier_block *n, unsigned long state, + void *cmd) +{ + struct mtd_info *mtd; + + mtd = container_of(n, struct mtd_info, reboot_notifier); + mtd->_reboot(mtd); + + return NOTIFY_DONE; +} + /** * add_mtd_device - register an MTD device * @mtd: pointer to new MTD device info structure @@ -565,6 +577,11 @@ int mtd_device_parse_register(struct mtd_info *mtd, const char * const *types, err = -ENODEV; } + if (mtd->_reboot) { + mtd->reboot_notifier.notifier_call = mtd_reboot_notifier; + register_reboot_notifier(&mtd->reboot_notifier); + } + return err; } EXPORT_SYMBOL_GPL(mtd_device_parse_register); @@ -579,6 +596,9 @@ int mtd_device_unregister(struct mtd_info *master) { int err; + if (master->_reboot) + unregister_reboot_notifier(&master->reboot_notifier); + err = del_mtd_partitions(master); if (err) return err; diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index 031ff3a9a0bd..c06f5373d870 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -227,6 +227,7 @@ struct mtd_info { int (*_block_markbad) (struct mtd_info *mtd, loff_t ofs); int (*_suspend) (struct mtd_info *mtd); void (*_resume) (struct mtd_info *mtd); + void (*_reboot) (struct mtd_info *mtd); /* * If the driver is something smart, like UBI, it may need to maintain * its own reference counting. The below functions are only for driver. -- cgit v1.2.3 From 26e022727f5e88c6e5054e14d954425deacbe56a Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 18 Dec 2014 16:02:17 +0100 Subject: efi: Rename efi_guid_unparse to efi_guid_to_str Call it what it does - "unparse" is plain-misleading.
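For reference, a brief usage sketch of the renamed helper (the demo_* caller is hypothetical; it assumes EFI_VARIABLE_GUID_LEN, the 36-character GUID string length already used by efivarfs, is available from linux/efi.h):

#include <linux/efi.h>
#include <linux/printk.h>

static void demo_print_guid(efi_guid_t *guid)
{
        char buf[EFI_VARIABLE_GUID_LEN + 1];    /* GUID text plus '\0' */

        efi_guid_to_str(guid, buf);
        pr_info("guid: %s\n", buf);
}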
Signed-off-by: Borislav Petkov Signed-off-by: Ricardo Neri --- block/partitions/efi.c | 2 +- drivers/firmware/efi/efi.c | 4 ++-- drivers/firmware/efi/efivars.c | 6 +++--- fs/efivarfs/super.c | 2 +- include/linux/efi.h | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/block/partitions/efi.c b/block/partitions/efi.c index 56d08fd75b1a..26cb624ace05 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -715,7 +715,7 @@ int efi_partition(struct parsed_partitions *state) state->parts[i + 1].flags = ADDPART_FLAG_RAID; info = &state->parts[i + 1].info; - efi_guid_unparse(&ptes[i].unique_partition_guid, info->uuid); + efi_guid_to_str(&ptes[i].unique_partition_guid, info->uuid); /* Naively convert UTF16-LE to 7 bits. */ label_max = min(ARRAY_SIZE(info->volname) - 1, diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 8590099ac148..ff0bbe383b31 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -272,10 +272,10 @@ static __init int match_config_table(efi_guid_t *guid, int i; if (table_types) { - efi_guid_unparse(guid, str); + efi_guid_to_str(guid, str); for (i = 0; efi_guidcmp(table_types[i].guid, NULL_GUID); i++) { - efi_guid_unparse(&table_types[i].guid, str); + efi_guid_to_str(&table_types[i].guid, str); if (!efi_guidcmp(*guid, table_types[i].guid)) { *(table_types[i].ptr) = table; diff --git a/drivers/firmware/efi/efivars.c b/drivers/firmware/efi/efivars.c index f256ecd8a176..7b2e0496e0c0 100644 --- a/drivers/firmware/efi/efivars.c +++ b/drivers/firmware/efi/efivars.c @@ -39,7 +39,7 @@ * fix locking per Peter Chubb's findings * * 25 Mar 2002 - Matt Domsch - * move uuid_unparse() to include/asm-ia64/efi.h:efi_guid_unparse() + * move uuid_unparse() to include/asm-ia64/efi.h:efi_guid_to_str() * * 12 Feb 2002 - Matt Domsch * use list_for_each_safe when deleting vars. @@ -128,7 +128,7 @@ efivar_guid_read(struct efivar_entry *entry, char *buf) if (!entry || !buf) return 0; - efi_guid_unparse(&var->VendorGuid, str); + efi_guid_to_str(&var->VendorGuid, str); str += strlen(str); str += sprintf(str, "\n"); @@ -569,7 +569,7 @@ efivar_create_sysfs_entry(struct efivar_entry *new_var) private variables from another's. 
*/ *(short_name + strlen(short_name)) = '-'; - efi_guid_unparse(&new_var->var.VendorGuid, + efi_guid_to_str(&new_var->var.VendorGuid, short_name + strlen(short_name)); new_var->kobj.kset = efivars_kset; diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 6dad1176ec52..ddbce42548c9 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -140,7 +140,7 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, name[len] = '-'; - efi_guid_unparse(&entry->var.VendorGuid, name + len + 1); + efi_guid_to_str(&entry->var.VendorGuid, name + len + 1); name[len + EFI_VARIABLE_GUID_LEN+1] = '\0'; diff --git a/include/linux/efi.h b/include/linux/efi.h index 0949f9c7e872..d762c81e62a8 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -844,7 +844,7 @@ efi_guidcmp (efi_guid_t left, efi_guid_t right) } static inline char * -efi_guid_unparse(efi_guid_t *guid, char *out) +efi_guid_to_str(efi_guid_t *guid, char *out) { sprintf(out, "%pUl", guid->b); return out; -- cgit v1.2.3 From 15acabfd02e35e270360fbe0def898e48754b3d6 Mon Sep 17 00:00:00 2001 From: Stephan Mueller Date: Mon, 5 Jan 2015 12:21:45 +0100 Subject: crypto: aead - add check for presence of auth tag The AEAD decryption operation requires the authentication tag to be present as part of the cipher text buffer. The added check verifies that the caller provides a cipher text with at least the authentication tag. Signed-off-by: Stephan Mueller Signed-off-by: Herbert Xu --- include/linux/crypto.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 9c8776d0ada8..90998348e564 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -1412,6 +1412,9 @@ static inline int crypto_aead_encrypt(struct aead_request *req) */ static inline int crypto_aead_decrypt(struct aead_request *req) { + if (req->cryptlen < crypto_aead_authsize(crypto_aead_reqtfm(req))) + return -EINVAL; + return crypto_aead_crt(crypto_aead_reqtfm(req))->decrypt(req); } -- cgit v1.2.3 From ad26aa6c60974acf3228ed0ade97ba5793093dbe Mon Sep 17 00:00:00 2001 From: Jonghwa Lee Date: Thu, 8 Jan 2015 11:04:07 +0900 Subject: regulator: s2mps11: Fix wrong calculation of register offset This patch adds the missing registers ('BUCK7_SW' and 'LDO29_CTRL'). Since BUCK7 has one more register (BUCK7_SW) than the other bucks, the register offset must be increased by one for every register whose address is higher than the BUCK7 registers.
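To make the off-by-one concrete, a worked sketch of the macro arithmetic (derived from the enum layout in this patch, writing B for S2MPS13_REG_B1CTRL, so S2MPS13_REG_B1OUT is B + 1 and B7SW sits between B7CTRL and B7OUT):

/* BUCK7:  enable = B1CTRL + (7 - 1) * 2 = B + 12 -> B7CTRL (unchanged)
 *         vsel   = B1OUT  + 7 * 2 - 1   = B + 14 -> B7OUT  (skips B7SW)
 * BUCK8:  enable = B1CTRL + 8 * 2 - 1   = B + 15 -> B8CTRL (one past B7OUT)
 *         vsel   = B1OUT  + 8 * 2 - 1   = B + 16 -> B8OUT
 */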
Fixes: 76b9840b24ae04(regulator: s2mps11: Add support S2MPS13 regulator device) Signed-off-by: Jonghwa Lee Signed-off-by: Chanwoo Choi Signed-off-by: Mark Brown Cc: --- drivers/regulator/s2mps11.c | 42 +++++++++++++++++++++++++++++++++---- include/linux/mfd/samsung/s2mps13.h | 2 ++ 2 files changed, 40 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/regulator/s2mps11.c b/drivers/regulator/s2mps11.c index c1444c3d84c2..13ca20ed33a6 100644 --- a/drivers/regulator/s2mps11.c +++ b/drivers/regulator/s2mps11.c @@ -405,6 +405,40 @@ static struct regulator_ops s2mps14_reg_ops; .enable_mask = S2MPS14_ENABLE_MASK \ } +#define regulator_desc_s2mps13_buck7(num, min, step, min_sel) { \ + .name = "BUCK"#num, \ + .id = S2MPS13_BUCK##num, \ + .ops = &s2mps14_reg_ops, \ + .type = REGULATOR_VOLTAGE, \ + .owner = THIS_MODULE, \ + .min_uV = min, \ + .uV_step = step, \ + .linear_min_sel = min_sel, \ + .n_voltages = S2MPS14_BUCK_N_VOLTAGES, \ + .ramp_delay = S2MPS13_BUCK_RAMP_DELAY, \ + .vsel_reg = S2MPS13_REG_B1OUT + (num) * 2 - 1, \ + .vsel_mask = S2MPS14_BUCK_VSEL_MASK, \ + .enable_reg = S2MPS13_REG_B1CTRL + (num - 1) * 2, \ + .enable_mask = S2MPS14_ENABLE_MASK \ +} + +#define regulator_desc_s2mps13_buck8_10(num, min, step, min_sel) { \ + .name = "BUCK"#num, \ + .id = S2MPS13_BUCK##num, \ + .ops = &s2mps14_reg_ops, \ + .type = REGULATOR_VOLTAGE, \ + .owner = THIS_MODULE, \ + .min_uV = min, \ + .uV_step = step, \ + .linear_min_sel = min_sel, \ + .n_voltages = S2MPS14_BUCK_N_VOLTAGES, \ + .ramp_delay = S2MPS13_BUCK_RAMP_DELAY, \ + .vsel_reg = S2MPS13_REG_B1OUT + (num) * 2 - 1, \ + .vsel_mask = S2MPS14_BUCK_VSEL_MASK, \ + .enable_reg = S2MPS13_REG_B1CTRL + (num) * 2 - 1, \ + .enable_mask = S2MPS14_ENABLE_MASK \ +} + static const struct regulator_desc s2mps13_regulators[] = { regulator_desc_s2mps13_ldo(1, MIN_800_MV, STEP_12_5_MV, 0x00), regulator_desc_s2mps13_ldo(2, MIN_1400_MV, STEP_50_MV, 0x0C), @@ -452,10 +486,10 @@ static const struct regulator_desc s2mps13_regulators[] = { regulator_desc_s2mps13_buck(4, MIN_500_MV, STEP_6_25_MV, 0x10), regulator_desc_s2mps13_buck(5, MIN_500_MV, STEP_6_25_MV, 0x10), regulator_desc_s2mps13_buck(6, MIN_500_MV, STEP_6_25_MV, 0x10), - regulator_desc_s2mps13_buck(7, MIN_500_MV, STEP_6_25_MV, 0x10), - regulator_desc_s2mps13_buck(8, MIN_1000_MV, STEP_12_5_MV, 0x20), - regulator_desc_s2mps13_buck(9, MIN_1000_MV, STEP_12_5_MV, 0x20), - regulator_desc_s2mps13_buck(10, MIN_500_MV, STEP_6_25_MV, 0x10), + regulator_desc_s2mps13_buck7(7, MIN_500_MV, STEP_6_25_MV, 0x10), + regulator_desc_s2mps13_buck8_10(8, MIN_1000_MV, STEP_12_5_MV, 0x20), + regulator_desc_s2mps13_buck8_10(9, MIN_1000_MV, STEP_12_5_MV, 0x20), + regulator_desc_s2mps13_buck8_10(10, MIN_500_MV, STEP_6_25_MV, 0x10), }; static int s2mps14_regulator_enable(struct regulator_dev *rdev) diff --git a/include/linux/mfd/samsung/s2mps13.h b/include/linux/mfd/samsung/s2mps13.h index ce5dda8958fe..b1fd675fa36f 100644 --- a/include/linux/mfd/samsung/s2mps13.h +++ b/include/linux/mfd/samsung/s2mps13.h @@ -59,6 +59,7 @@ enum s2mps13_reg { S2MPS13_REG_B6CTRL, S2MPS13_REG_B6OUT, S2MPS13_REG_B7CTRL, + S2MPS13_REG_B7SW, S2MPS13_REG_B7OUT, S2MPS13_REG_B8CTRL, S2MPS13_REG_B8OUT, @@ -102,6 +103,7 @@ enum s2mps13_reg { S2MPS13_REG_L26CTRL, S2MPS13_REG_L27CTRL, S2MPS13_REG_L28CTRL, + S2MPS13_REG_L29CTRL, S2MPS13_REG_L30CTRL, S2MPS13_REG_L31CTRL, S2MPS13_REG_L32CTRL, -- cgit v1.2.3 From bfa21a0dfe6915dc85953b5d40ea9dae5fdf205f Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 5 Jan 2015 12:48:42 +0100 
Subject: regulator: Allow parsing custom properties when using simplified DT parsing When drivers use the simplified DT parsing method (they provide 'regulator_desc.of_match'), they may still want to parse custom properties for some of the regulators. For example, some of the regulators support GPIO enable control. Add a driver-supplied callback for such cases. This way the regulator core parses the common bindings, offloading a lot of code from drivers, while custom properties can still be used. The callback, called for each parsed regulator, may modify the 'regulator_config' initially passed to regulator_register(). Signed-off-by: Krzysztof Kozlowski Signed-off-by: Mark Brown --- drivers/regulator/core.c | 2 +- drivers/regulator/internal.h | 2 ++ drivers/regulator/of_regulator.c | 11 +++++++++++ include/linux/regulator/driver.h | 13 +++++++++++++ 4 files changed, 27 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index c13b557a560e..5fae8cabd254 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -3635,7 +3635,7 @@ regulator_register(const struct regulator_desc *regulator_desc, return ERR_PTR(-ENOMEM); } - init_data = regulator_of_get_init_data(dev, regulator_desc, + init_data = regulator_of_get_init_data(dev, regulator_desc, config, &rdev->dev.of_node); if (!init_data) { init_data = config->init_data; diff --git a/drivers/regulator/internal.h b/drivers/regulator/internal.h index 80ba2a35a04b..c74ac8734023 100644 --- a/drivers/regulator/internal.h +++ b/drivers/regulator/internal.h @@ -38,11 +38,13 @@ struct regulator { #ifdef CONFIG_OF struct regulator_init_data *regulator_of_get_init_data(struct device *dev, const struct regulator_desc *desc, + struct regulator_config *config, struct device_node **node); #else static inline struct regulator_init_data * regulator_of_get_init_data(struct device *dev, const struct regulator_desc *desc, + struct regulator_config *config, struct device_node **node) { return NULL; diff --git a/drivers/regulator/of_regulator.c b/drivers/regulator/of_regulator.c index 91eaaf010524..24e812c48d93 100644 --- a/drivers/regulator/of_regulator.c +++ b/drivers/regulator/of_regulator.c @@ -270,6 +270,7 @@ EXPORT_SYMBOL_GPL(of_regulator_match); struct regulator_init_data *regulator_of_get_init_data(struct device *dev, const struct regulator_desc *desc, + struct regulator_config *config, struct device_node **node) { struct device_node *search, *child; @@ -307,6 +308,16 @@ struct regulator_init_data *regulator_of_get_init_data(struct device *dev, break; } + if (desc->of_parse_cb) { + if (desc->of_parse_cb(child, desc, config)) { + dev_err(dev, + "driver callback failed to parse DT for regulator %s\n", + child->name); + init_data = NULL; + break; + } + } + of_node_get(child); *node = child; break; diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index 5f1e9ca47417..d4ad5b5a02bb 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -21,6 +21,7 @@ struct regmap; struct regulator_dev; +struct regulator_config; struct regulator_init_data; struct regulator_enable_gpio; @@ -205,6 +206,15 @@ enum regulator_type { * @supply_name: Identifying the regulator supply * @of_match: Name used to identify regulator in DT. * @regulators_node: Name of node containing regulator definitions in DT. + * @of_parse_cb: Optional callback called only if of_match is present.
+ * Will be called for each regulator parsed from DT, during + * init_data parsing. + * The regulator_config passed as argument to the callback will + * be a copy of config passed to regulator_register, valid only + * for this particular call. Callback may freely change the + * config but it cannot store it for later usage. + * Callback should return 0 on success or negative ERRNO + * indicating failure. * @id: Numerical identifier for the regulator. * @ops: Regulator operations table. * @irq: Interrupt number for the regulator. @@ -251,6 +261,9 @@ struct regulator_desc { const char *supply_name; const char *of_match; const char *regulators_node; + int (*of_parse_cb)(struct device_node *, + const struct regulator_desc *, + struct regulator_config *); int id; bool continuous_voltage_range; unsigned n_voltages; -- cgit v1.2.3 From db30485408326a6f466a843b291b23535f63eda0 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Wed, 7 Jan 2015 13:41:54 +0800 Subject: rhashtable: involve rhashtable_lookup_insert routine Introduce a new function called rhashtable_lookup_insert() which makes lookup and insertion atomic under bucket lock protection, helping us avoid introducing an extra lock when we search for and insert an object into the hash table. Signed-off-by: Ying Xue Signed-off-by: Thomas Graf Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 1 + lib/rhashtable.c | 97 +++++++++++++++++++++++++++++++++++++++------- 2 files changed, 83 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index de1459c74c4d..73c913f31574 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -168,6 +168,7 @@ int rhashtable_shrink(struct rhashtable *ht); void *rhashtable_lookup(struct rhashtable *ht, const void *key); void *rhashtable_lookup_compare(struct rhashtable *ht, const void *key, bool (*compare)(void *, void *), void *arg); +bool rhashtable_lookup_insert(struct rhashtable *ht, struct rhash_head *obj); void rhashtable_destroy(struct rhashtable *ht); diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 20006854fce0..4430233c4e11 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -505,8 +505,26 @@ static void rhashtable_wakeup_worker(struct rhashtable *ht) schedule_delayed_work(&ht->run_work, 0); } +static void __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj, + struct bucket_table *tbl, u32 hash) +{ + struct rhash_head *head = rht_dereference_bucket(tbl->buckets[hash], + tbl, hash); + + if (rht_is_a_nulls(head)) + INIT_RHT_NULLS_HEAD(obj->next, ht, hash); + else + RCU_INIT_POINTER(obj->next, head); + + rcu_assign_pointer(tbl->buckets[hash], obj); + + atomic_inc(&ht->nelems); + + rhashtable_wakeup_worker(ht); +} + /** - * rhashtable_insert - insert object into hash hash table + * rhashtable_insert - insert object into hash table * @ht: hash table * @obj: pointer to hash head inside object * @@ -523,7 +541,6 @@ static void rhashtable_wakeup_worker(struct rhashtable *ht) void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj) { struct bucket_table *tbl; - struct rhash_head *head; spinlock_t *lock; unsigned hash; @@ -534,19 +551,9 @@ void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj) lock = bucket_lock(tbl, hash); spin_lock_bh(lock); - head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash); - if (rht_is_a_nulls(head)) - INIT_RHT_NULLS_HEAD(obj->next, ht, hash); - else - RCU_INIT_POINTER(obj->next, head); -
rcu_assign_pointer(tbl->buckets[hash], obj); + __rhashtable_insert(ht, obj, tbl, hash); spin_unlock_bh(lock); - atomic_inc(&ht->nelems); - - rhashtable_wakeup_worker(ht); - rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rhashtable_insert); @@ -560,7 +567,7 @@ EXPORT_SYMBOL_GPL(rhashtable_insert); * walk the bucket chain upon removal. The removal operation is thus * considerable slow if the hash table is not correctly sized. * - * Will automatically shrink the table via rhashtable_expand() if the the + * Will automatically shrink the table via rhashtable_expand() if the * shrink_decision function specified at rhashtable_init() returns true. * * The caller must ensure that no concurrent table mutations occur. It is @@ -641,7 +648,7 @@ static bool rhashtable_compare(void *ptr, void *arg) * for a entry with an identical key. The first matching entry is returned. * * This lookup function may only be used for fixed key hash table (key_len - * paramter set). It will BUG() if used inappropriately. + * parameter set). It will BUG() if used inappropriately. * * Lookups may occur in parallel with hashtable mutations and resizing. */ @@ -702,6 +709,66 @@ restart: } EXPORT_SYMBOL_GPL(rhashtable_lookup_compare); +/** + * rhashtable_lookup_insert - lookup and insert object into hash table + * @ht: hash table + * @obj: pointer to hash head inside object + * + * Locks down the bucket chain in both the old and new table if a resize + * is in progress to ensure that writers can't remove from the old table + * and can't insert to the new table during the atomic operation of search + * and insertion. Searches for duplicates in both the old and new table if + * a resize is in progress. + * + * This lookup function may only be used for fixed key hash table (key_len + * parameter set). It will BUG() if used inappropriately. + * + * It is safe to call this function from atomic context. + * + * Will trigger an automatic deferred table resizing if the size grows + * beyond the watermark indicated by grow_decision() which can be passed + * to rhashtable_init(). 
+ */ +bool rhashtable_lookup_insert(struct rhashtable *ht, struct rhash_head *obj) +{ + struct bucket_table *new_tbl, *old_tbl; + spinlock_t *new_bucket_lock, *old_bucket_lock; + u32 new_hash, old_hash; + bool success = true; + + BUG_ON(!ht->p.key_len); + + rcu_read_lock(); + + old_tbl = rht_dereference_rcu(ht->tbl, ht); + old_hash = head_hashfn(ht, old_tbl, obj); + old_bucket_lock = bucket_lock(old_tbl, old_hash); + spin_lock_bh(old_bucket_lock); + + new_tbl = rht_dereference_rcu(ht->future_tbl, ht); + new_hash = head_hashfn(ht, new_tbl, obj); + new_bucket_lock = bucket_lock(new_tbl, new_hash); + if (unlikely(old_tbl != new_tbl)) + spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED); + + if (rhashtable_lookup(ht, rht_obj(ht, obj) + ht->p.key_offset)) { + success = false; + goto exit; + } + + __rhashtable_insert(ht, obj, new_tbl, new_hash); + +exit: + if (unlikely(old_tbl != new_tbl)) + spin_unlock_bh(new_bucket_lock); + spin_unlock_bh(old_bucket_lock); + + rcu_read_unlock(); + + return success; +} +EXPORT_SYMBOL_GPL(rhashtable_lookup_insert); + static size_t rounded_hashtable_size(struct rhashtable_params *params) { return max(roundup_pow_of_two(params->nelem_hint * 4 / 3), -- cgit v1.2.3 From c0c09bfdc4150b3918526660768585cd477adf35 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Wed, 7 Jan 2015 13:41:56 +0800 Subject: rhashtable: avoid unnecessary wakeup for worker queue Move the checks that verify whether the hash table size has exceeded its maximum threshold or reached its minimum threshold out of the resizing functions and into the resizing decision functions, avoiding unnecessary wakeups of the worker queue thread. Signed-off-by: Ying Xue Cc: Thomas Graf Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 2 +- lib/rhashtable.c | 18 +++++++----------- 2 files changed, 8 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 73c913f31574..326acd8c2e9f 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -113,7 +113,7 @@ struct rhashtable { struct bucket_table __rcu *tbl; struct bucket_table __rcu *future_tbl; atomic_t nelems; - size_t shift; + atomic_t shift; struct rhashtable_params p; struct delayed_work run_work; struct mutex mutex; diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 1aef942976fe..7fb474b18f1b 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -199,7 +199,8 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, bool rht_grow_above_75(const struct rhashtable *ht, size_t new_size) { /* Expand table when exceeding 75% load */ - return atomic_read(&ht->nelems) > (new_size / 4 * 3); + return atomic_read(&ht->nelems) > (new_size / 4 * 3) && + (ht->p.max_shift && atomic_read(&ht->shift) < ht->p.max_shift); } EXPORT_SYMBOL_GPL(rht_grow_above_75); @@ -211,7 +212,8 @@ EXPORT_SYMBOL_GPL(rht_grow_above_75); bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size) { /* Shrink table beneath 30% load */ - return atomic_read(&ht->nelems) < (new_size * 3 / 10); + return atomic_read(&ht->nelems) < (new_size * 3 / 10) && + (atomic_read(&ht->shift) > ht->p.min_shift); } EXPORT_SYMBOL_GPL(rht_shrink_below_30); @@ -318,14 +320,11 @@ int rhashtable_expand(struct rhashtable *ht) ASSERT_RHT_MUTEX(ht); - if (ht->p.max_shift && ht->shift >= ht->p.max_shift) - return 0; - new_tbl = bucket_table_alloc(ht, old_tbl->size * 2); if (new_tbl == NULL) return -ENOMEM; - ht->shift++; + atomic_inc(&ht->shift); /* Make insertions go into the new,
empty table right away. Deletions * and lookups will be attempted in both tables until we synchronize. @@ -421,9 +420,6 @@ int rhashtable_shrink(struct rhashtable *ht) ASSERT_RHT_MUTEX(ht); - if (ht->shift <= ht->p.min_shift) - return 0; - new_tbl = bucket_table_alloc(ht, tbl->size / 2); if (new_tbl == NULL) return -ENOMEM; @@ -462,7 +458,7 @@ int rhashtable_shrink(struct rhashtable *ht) /* Publish the new, valid hash table */ rcu_assign_pointer(ht->tbl, new_tbl); - ht->shift--; + atomic_dec(&ht->shift); /* Wait for readers. No new readers will have references to the * old hash table. @@ -851,7 +847,7 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params) if (tbl == NULL) return -ENOMEM; - ht->shift = ilog2(tbl->size); + atomic_set(&ht->shift, ilog2(tbl->size)); RCU_INIT_POINTER(ht->tbl, tbl); RCU_INIT_POINTER(ht->future_tbl, tbl); -- cgit v1.2.3 From e5a7a72cd51a585b8f1a1e299bf88fff44b94440 Mon Sep 17 00:00:00 2001 From: Robin Gong Date: Fri, 9 Jan 2015 09:57:33 +0800 Subject: regulator: pfuze100-regulator: add pfuze3000 support Add pfuze3000 chip support. Signed-off-by: Robin Gong Signed-off-by: Mark Brown --- .../devicetree/bindings/regulator/pfuze100.txt | 94 ++++++++++++++- drivers/regulator/pfuze100-regulator.c | 134 +++++++++++++++++++-- include/linux/regulator/pfuze100.h | 14 +++ 3 files changed, 232 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/regulator/pfuze100.txt b/Documentation/devicetree/bindings/regulator/pfuze100.txt index 34ef5d16d0f1..9b40db88f637 100644 --- a/Documentation/devicetree/bindings/regulator/pfuze100.txt +++ b/Documentation/devicetree/bindings/regulator/pfuze100.txt @@ -1,7 +1,7 @@ PFUZE100 family of regulators Required properties: -- compatible: "fsl,pfuze100" or "fsl,pfuze200" +- compatible: "fsl,pfuze100", "fsl,pfuze200", "fsl,pfuze3000" - reg: I2C slave address Required child node: @@ -14,6 +14,8 @@ Required child node: sw1ab,sw1c,sw2,sw3a,sw3b,sw4,swbst,vsnvs,vrefddr,vgen1~vgen6 --PFUZE200 sw1ab,sw2,sw3a,sw3b,swbst,vsnvs,vrefddr,vgen1~vgen6 + --PFUZE3000 + sw1a,sw1b,sw2,sw3,swbst,vsnvs,vrefddr,vldo1,vldo2,vccsd,v33,vldo3,vldo4 Each regulator is defined using the standard binding for regulators. 
@@ -205,3 +207,93 @@ Example 2: PFUZE200 }; }; }; + +Example 3: PFUZE3000 + + pmic: pfuze3000@08 { + compatible = "fsl,pfuze3000"; + reg = <0x08>; + + regulators { + sw1a_reg: sw1a { + regulator-min-microvolt = <700000>; + regulator-max-microvolt = <1475000>; + regulator-boot-on; + regulator-always-on; + regulator-ramp-delay = <6250>; + }; + /* use sw1c_reg to align with pfuze100/pfuze200 */ + sw1c_reg: sw1b { + regulator-min-microvolt = <700000>; + regulator-max-microvolt = <1475000>; + regulator-boot-on; + regulator-always-on; + regulator-ramp-delay = <6250>; + }; + + sw2_reg: sw2 { + regulator-min-microvolt = <2500000>; + regulator-max-microvolt = <3300000>; + regulator-boot-on; + regulator-always-on; + }; + + sw3a_reg: sw3 { + regulator-min-microvolt = <900000>; + regulator-max-microvolt = <1650000>; + regulator-boot-on; + regulator-always-on; + }; + + swbst_reg: swbst { + regulator-min-microvolt = <5000000>; + regulator-max-microvolt = <5150000>; + }; + + snvs_reg: vsnvs { + regulator-min-microvolt = <1000000>; + regulator-max-microvolt = <3000000>; + regulator-boot-on; + regulator-always-on; + }; + + vref_reg: vrefddr { + regulator-boot-on; + regulator-always-on; + }; + + vgen1_reg: vldo1 { + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <3300000>; + regulator-always-on; + }; + + vgen2_reg: vldo2 { + regulator-min-microvolt = <800000>; + regulator-max-microvolt = <1550000>; + }; + + vgen3_reg: vccsd { + regulator-min-microvolt = <2850000>; + regulator-max-microvolt = <3300000>; + regulator-always-on; + }; + + vgen4_reg: v33 { + regulator-min-microvolt = <2850000>; + regulator-max-microvolt = <3300000>; + }; + + vgen5_reg: vldo3 { + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <3300000>; + regulator-always-on; + }; + + vgen6_reg: vldo4 { + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <3300000>; + regulator-always-on; + }; + }; + }; diff --git a/drivers/regulator/pfuze100-regulator.c b/drivers/regulator/pfuze100-regulator.c index c879dff597ee..8cc8d1877c44 100644 --- a/drivers/regulator/pfuze100-regulator.c +++ b/drivers/regulator/pfuze100-regulator.c @@ -56,7 +56,7 @@ #define PFUZE100_VGEN5VOL 0x70 #define PFUZE100_VGEN6VOL 0x71 -enum chips { PFUZE100, PFUZE200 }; +enum chips { PFUZE100, PFUZE200, PFUZE3000 = 3 }; struct pfuze_regulator { struct regulator_desc desc; @@ -80,9 +80,18 @@ static const int pfuze100_vsnvs[] = { 1000000, 1100000, 1200000, 1300000, 1500000, 1800000, 3000000, }; +static const int pfuze3000_sw2lo[] = { + 1500000, 1550000, 1600000, 1650000, 1700000, 1750000, 1800000, 1850000, +}; + +static const int pfuze3000_sw2hi[] = { + 2500000, 2800000, 2850000, 3000000, 3100000, 3150000, 3200000, 3300000, +}; + static const struct i2c_device_id pfuze_device_id[] = { {.name = "pfuze100", .driver_data = PFUZE100}, {.name = "pfuze200", .driver_data = PFUZE200}, + {.name = "pfuze3000", .driver_data = PFUZE3000}, { } }; MODULE_DEVICE_TABLE(i2c, pfuze_device_id); @@ -90,6 +99,7 @@ MODULE_DEVICE_TABLE(i2c, pfuze_device_id); static const struct of_device_id pfuze_dt_ids[] = { { .compatible = "fsl,pfuze100", .data = (void *)PFUZE100}, { .compatible = "fsl,pfuze200", .data = (void *)PFUZE200}, + { .compatible = "fsl,pfuze3000", .data = (void *)PFUZE3000}, { } }; MODULE_DEVICE_TABLE(of, pfuze_dt_ids); @@ -219,6 +229,60 @@ static struct regulator_ops pfuze100_swb_regulator_ops = { .stby_mask = 0x20, \ } +#define PFUZE3000_VCC_REG(_chip, _name, base, min, max, step) { \ + .desc = { \ + .name = #_name, \ + .n_voltages = ((max) - 
(min)) / (step) + 1, \ + .ops = &pfuze100_ldo_regulator_ops, \ + .type = REGULATOR_VOLTAGE, \ + .id = _chip ## _ ## _name, \ + .owner = THIS_MODULE, \ + .min_uV = (min), \ + .uV_step = (step), \ + .vsel_reg = (base), \ + .vsel_mask = 0x3, \ + .enable_reg = (base), \ + .enable_mask = 0x10, \ + }, \ + .stby_reg = (base), \ + .stby_mask = 0x20, \ +} + + +#define PFUZE3000_SW2_REG(_chip, _name, base, min, max, step) { \ + .desc = { \ + .name = #_name,\ + .n_voltages = ((max) - (min)) / (step) + 1, \ + .ops = &pfuze100_sw_regulator_ops, \ + .type = REGULATOR_VOLTAGE, \ + .id = _chip ## _ ## _name, \ + .owner = THIS_MODULE, \ + .min_uV = (min), \ + .uV_step = (step), \ + .vsel_reg = (base) + PFUZE100_VOL_OFFSET, \ + .vsel_mask = 0x7, \ + }, \ + .stby_reg = (base) + PFUZE100_STANDBY_OFFSET, \ + .stby_mask = 0x7, \ +} + +#define PFUZE3000_SW3_REG(_chip, _name, base, min, max, step) { \ + .desc = { \ + .name = #_name,\ + .n_voltages = ((max) - (min)) / (step) + 1, \ + .ops = &pfuze100_sw_regulator_ops, \ + .type = REGULATOR_VOLTAGE, \ + .id = _chip ## _ ## _name, \ + .owner = THIS_MODULE, \ + .min_uV = (min), \ + .uV_step = (step), \ + .vsel_reg = (base) + PFUZE100_VOL_OFFSET, \ + .vsel_mask = 0xf, \ + }, \ + .stby_reg = (base) + PFUZE100_STANDBY_OFFSET, \ + .stby_mask = 0xf, \ +} + /* PFUZE100 */ static struct pfuze_regulator pfuze100_regulators[] = { PFUZE100_SW_REG(PFUZE100, SW1AB, PFUZE100_SW1ABVOL, 300000, 1875000, 25000), @@ -254,6 +318,22 @@ static struct pfuze_regulator pfuze200_regulators[] = { PFUZE100_VGEN_REG(PFUZE200, VGEN6, PFUZE100_VGEN6VOL, 1800000, 3300000, 100000), }; +static struct pfuze_regulator pfuze3000_regulators[] = { + PFUZE100_SW_REG(PFUZE3000, SW1A, PFUZE100_SW1ABVOL, 700000, 1475000, 25000), + PFUZE100_SW_REG(PFUZE3000, SW1B, PFUZE100_SW1CVOL, 700000, 1475000, 25000), + PFUZE100_SWB_REG(PFUZE3000, SW2, PFUZE100_SW2VOL, 0x7, pfuze3000_sw2lo), + PFUZE3000_SW3_REG(PFUZE3000, SW3, PFUZE100_SW3AVOL, 900000, 1650000, 50000), + PFUZE100_SWB_REG(PFUZE3000, SWBST, PFUZE100_SWBSTCON1, 0x3, pfuze100_swbst), + PFUZE100_SWB_REG(PFUZE3000, VSNVS, PFUZE100_VSNVSVOL, 0x7, pfuze100_vsnvs), + PFUZE100_FIXED_REG(PFUZE3000, VREFDDR, PFUZE100_VREFDDRCON, 750000), + PFUZE100_VGEN_REG(PFUZE3000, VLDO1, PFUZE100_VGEN1VOL, 1800000, 3300000, 100000), + PFUZE100_VGEN_REG(PFUZE3000, VLDO2, PFUZE100_VGEN2VOL, 800000, 1550000, 50000), + PFUZE3000_VCC_REG(PFUZE3000, VCCSD, PFUZE100_VGEN3VOL, 2850000, 3300000, 150000), + PFUZE3000_VCC_REG(PFUZE3000, V33, PFUZE100_VGEN4VOL, 2850000, 3300000, 150000), + PFUZE100_VGEN_REG(PFUZE3000, VLDO3, PFUZE100_VGEN5VOL, 1800000, 3300000, 100000), + PFUZE100_VGEN_REG(PFUZE3000, VLDO4, PFUZE100_VGEN6VOL, 1800000, 3300000, 100000), +}; + static struct pfuze_regulator *pfuze_regulators; #ifdef CONFIG_OF @@ -294,6 +374,24 @@ static struct of_regulator_match pfuze200_matches[] = { { .name = "vgen6", }, }; +/* PFUZE3000 */ +static struct of_regulator_match pfuze3000_matches[] = { + + { .name = "sw1a", }, + { .name = "sw1b", }, + { .name = "sw2", }, + { .name = "sw3", }, + { .name = "swbst", }, + { .name = "vsnvs", }, + { .name = "vrefddr", }, + { .name = "vldo1", }, + { .name = "vldo2", }, + { .name = "vccsd", }, + { .name = "v33", }, + { .name = "vldo3", }, + { .name = "vldo4", }, +}; + static struct of_regulator_match *pfuze_matches; static int pfuze_parse_regulators_dt(struct pfuze_chip *chip) @@ -313,6 +411,11 @@ static int pfuze_parse_regulators_dt(struct pfuze_chip *chip) } switch (chip->chip_id) { + case PFUZE3000: + pfuze_matches = pfuze3000_matches; + ret = 
of_regulator_match(dev, parent, pfuze3000_matches, + ARRAY_SIZE(pfuze3000_matches)); + break; case PFUZE200: pfuze_matches = pfuze200_matches; ret = of_regulator_match(dev, parent, pfuze200_matches, @@ -378,7 +481,8 @@ static int pfuze_identify(struct pfuze_chip *pfuze_chip) * as ID=8 in PFUZE100 */ dev_info(pfuze_chip->dev, "Assuming misprogrammed ID=0x8"); - } else if ((value & 0x0f) != pfuze_chip->chip_id) { + } else if ((value & 0x0f) != pfuze_chip->chip_id && + (value & 0xf0) >> 4 != pfuze_chip->chip_id) { /* device id NOT match with your setting */ dev_warn(pfuze_chip->dev, "Illegal ID: %x\n", value); return -ENODEV; @@ -417,7 +521,7 @@ static int pfuze100_regulator_probe(struct i2c_client *client, int i, ret; const struct of_device_id *match; u32 regulator_num; - u32 sw_check_start, sw_check_end; + u32 sw_check_start, sw_check_end, sw_hi = 0x40; pfuze_chip = devm_kzalloc(&client->dev, sizeof(*pfuze_chip), GFP_KERNEL); @@ -458,13 +562,19 @@ static int pfuze100_regulator_probe(struct i2c_client *client, /* use the right regulators after identify the right device */ switch (pfuze_chip->chip_id) { + case PFUZE3000: + pfuze_regulators = pfuze3000_regulators; + regulator_num = ARRAY_SIZE(pfuze3000_regulators); + sw_check_start = PFUZE3000_SW2; + sw_check_end = PFUZE3000_SW2; + sw_hi = 1 << 3; + break; case PFUZE200: pfuze_regulators = pfuze200_regulators; regulator_num = ARRAY_SIZE(pfuze200_regulators); sw_check_start = PFUZE200_SW2; sw_check_end = PFUZE200_SW3B; break; - case PFUZE100: default: pfuze_regulators = pfuze100_regulators; @@ -474,7 +584,8 @@ static int pfuze100_regulator_probe(struct i2c_client *client, break; } dev_info(&client->dev, "pfuze%s found.\n", - (pfuze_chip->chip_id == PFUZE100) ? "100" : "200"); + (pfuze_chip->chip_id == PFUZE100) ? "100" : + ((pfuze_chip->chip_id == PFUZE200) ? "200" : "3000")); memcpy(pfuze_chip->regulator_descs, pfuze_regulators, sizeof(pfuze_chip->regulator_descs)); @@ -498,10 +609,15 @@ static int pfuze100_regulator_probe(struct i2c_client *client, /* SW2~SW4 high bit check and modify the voltage value table */ if (i >= sw_check_start && i <= sw_check_end) { regmap_read(pfuze_chip->regmap, desc->vsel_reg, &val); - if (val & 0x40) { - desc->min_uV = 800000; - desc->uV_step = 50000; - desc->n_voltages = 51; + if (val & sw_hi) { + if (pfuze_chip->chip_id == PFUZE3000) { + desc->volt_table = pfuze3000_sw2hi; + desc->n_voltages = ARRAY_SIZE(pfuze3000_sw2hi); + } else { + desc->min_uV = 800000; + desc->uV_step = 50000; + desc->n_voltages = 51; + } } } diff --git a/include/linux/regulator/pfuze100.h b/include/linux/regulator/pfuze100.h index 364f7a7c43db..70c6c66c5bcf 100644 --- a/include/linux/regulator/pfuze100.h +++ b/include/linux/regulator/pfuze100.h @@ -49,6 +49,20 @@ #define PFUZE200_VGEN5 11 #define PFUZE200_VGEN6 12 +#define PFUZE3000_SW1A 0 +#define PFUZE3000_SW1B 1 +#define PFUZE3000_SW2 2 +#define PFUZE3000_SW3 3 +#define PFUZE3000_SWBST 4 +#define PFUZE3000_VSNVS 5 +#define PFUZE3000_VREFDDR 6 +#define PFUZE3000_VLDO1 7 +#define PFUZE3000_VLDO2 8 +#define PFUZE3000_VCCSD 9 +#define PFUZE3000_V33 10 +#define PFUZE3000_VLDO3 11 +#define PFUZE3000_VLDO4 12 + struct regulator_init_data; struct pfuze_regulator_platform_data { -- cgit v1.2.3 From ed09dcc8bd7fe0991af7737e675996cbd022f38f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 30 Dec 2014 14:46:14 -0800 Subject: ses: close potential registration race The slot and address fields have a small window of instability when userspace can read them before initialization. 
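To make the ordering concrete, here is a hedged sketch of the two-phase pattern this change introduces. enclosure_component_alloc() and enclosure_component_register() are the names added by the patch below; the surrounding variables (edev, number, type, name, addl_desc_ptr, err) are illustrative only:

	/*
	 * Illustrative sketch, not the verbatim ses code: allocate the
	 * component, fully initialize its fields, and only then register
	 * it, so sysfs never exposes uninitialized slot/address data.
	 */
	ecomp = enclosure_component_alloc(edev, number, type, name);
	if (IS_ERR(ecomp))
		return PTR_ERR(ecomp);
	ses_process_descriptor(ecomp, addl_desc_ptr);	/* fill slot/addr */
	err = enclosure_component_register(ecomp);	/* now publish it */
	if (err)
		return err;

Registration, and with it device_register() and sysfs visibility, happens only after every field is filled in.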
Separate enclosure_component allocation from registration. Signed-off-by: Dan Williams Signed-off-by: Song Liu Reviewed-by: Jens Axboe Cc: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/misc/enclosure.c | 37 ++++++++++++++++++++++++++----------- drivers/scsi/ses.c | 21 ++++++++++++++------- include/linux/enclosure.h | 5 +++-- 3 files changed, 43 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/misc/enclosure.c b/drivers/misc/enclosure.c index 180a5442fd4b..e18278aa6bbb 100644 --- a/drivers/misc/enclosure.c +++ b/drivers/misc/enclosure.c @@ -273,27 +273,26 @@ enclosure_component_find_by_name(struct enclosure_device *edev, static const struct attribute_group *enclosure_component_groups[]; /** - * enclosure_component_register - add a particular component to an enclosure + * enclosure_component_alloc - prepare a new enclosure component * @edev: the enclosure to add the component * @num: the device number * @type: the type of component being added * @name: an optional name to appear in sysfs (leave NULL if none) * - * Registers the component. The name is optional for enclosures that - * give their components a unique name. If not, leave the field NULL - * and a name will be assigned. + * The name is optional for enclosures that give their components a unique + * name. If not, leave the field NULL and a name will be assigned. * * Returns a pointer to the enclosure component or an error. */ struct enclosure_component * -enclosure_component_register(struct enclosure_device *edev, - unsigned int number, - enum enclosure_component_type type, - const char *name) +enclosure_component_alloc(struct enclosure_device *edev, + unsigned int number, + enum enclosure_component_type type, + const char *name) { struct enclosure_component *ecomp; struct device *cdev; - int err, i; + int i; char newname[COMPONENT_NAME_SIZE]; if (number >= edev->components) @@ -327,14 +326,30 @@ enclosure_component_register(struct enclosure_device *edev, cdev->release = enclosure_component_release; cdev->groups = enclosure_component_groups; + return ecomp; +} +EXPORT_SYMBOL_GPL(enclosure_component_alloc); + +/** + * enclosure_component_register - publishes an initialized enclosure component + * @ecomp: component to add + * + * Returns 0 on successful registration, releases the component otherwise + */ +int enclosure_component_register(struct enclosure_component *ecomp) +{ + struct device *cdev; + int err; + + cdev = &ecomp->cdev; err = device_register(cdev); if (err) { ecomp->number = -1; put_device(cdev); - return ERR_PTR(err); + return err; } - return ecomp; + return 0; } EXPORT_SYMBOL_GPL(enclosure_component_register); diff --git a/drivers/scsi/ses.c b/drivers/scsi/ses.c index b7e79e7646ad..7dd9cf558380 100644 --- a/drivers/scsi/ses.c +++ b/drivers/scsi/ses.c @@ -423,16 +423,23 @@ static void ses_enclosure_data_process(struct enclosure_device *edev, type_ptr[0] == ENCLOSURE_COMPONENT_ARRAY_DEVICE) { if (create) - ecomp = enclosure_component_register(edev, - components++, - type_ptr[0], - name); + ecomp = enclosure_component_alloc( + edev, + components++, + type_ptr[0], + name); else ecomp = &edev->component[components++]; - if (!IS_ERR(ecomp) && addl_desc_ptr) - ses_process_descriptor(ecomp, - addl_desc_ptr); + if (!IS_ERR(ecomp)) { + if (addl_desc_ptr) + ses_process_descriptor( + ecomp, + addl_desc_ptr); + if (create) + enclosure_component_register( + ecomp); + } } if (desc_ptr) desc_ptr += len; diff --git a/include/linux/enclosure.h b/include/linux/enclosure.h index 
9a33c5f7e126..a835d335b924 100644 --- a/include/linux/enclosure.h +++ b/include/linux/enclosure.h @@ -120,8 +120,9 @@ enclosure_register(struct device *, const char *, int, struct enclosure_component_callbacks *); void enclosure_unregister(struct enclosure_device *); struct enclosure_component * -enclosure_component_register(struct enclosure_device *, unsigned int, - enum enclosure_component_type, const char *); +enclosure_component_alloc(struct enclosure_device *, unsigned int, + enum enclosure_component_type, const char *); +int enclosure_component_register(struct enclosure_component *); int enclosure_add_device(struct enclosure_device *enclosure, int component, struct device *dev); int enclosure_remove_device(struct enclosure_device *, struct device *); -- cgit v1.2.3 From 967f7bab0eaaa74d7d01a56d45aa309f78fb87dd Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 30 Dec 2014 14:46:16 -0800 Subject: ses: add enclosure logical id Export the NAA logical id for the enclosure. This is optionally available from the sas_transport_class, but it is really a property of the enclosure. Signed-off-by: Dan Williams Signed-off-by: Song Liu Reviewed-by: Jens Axboe Cc: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/misc/enclosure.c | 13 +++++++++++++ drivers/scsi/ses.c | 9 +++++++++ include/linux/enclosure.h | 1 + 3 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/drivers/misc/enclosure.c b/drivers/misc/enclosure.c index e18278aa6bbb..958ee988a1e1 100644 --- a/drivers/misc/enclosure.c +++ b/drivers/misc/enclosure.c @@ -432,8 +432,21 @@ static ssize_t components_show(struct device *cdev, } static DEVICE_ATTR_RO(components); +static ssize_t id_show(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + struct enclosure_device *edev = to_enclosure_device(cdev); + + if (edev->cb->show_id) + return edev->cb->show_id(edev, buf); + return -EINVAL; +} +static DEVICE_ATTR_RO(id); + static struct attribute *enclosure_class_attrs[] = { &dev_attr_components.attr, + &dev_attr_id.attr, NULL, }; ATTRIBUTE_GROUPS(enclosure_class); diff --git a/drivers/scsi/ses.c b/drivers/scsi/ses.c index 6662b0ca6cfd..1041556cdbf3 100644 --- a/drivers/scsi/ses.c +++ b/drivers/scsi/ses.c @@ -258,6 +258,14 @@ static int ses_set_active(struct enclosure_device *edev, return ses_set_page2_descriptor(edev, ecomp, desc); } +static int ses_show_id(struct enclosure_device *edev, char *buf) +{ + struct ses_device *ses_dev = edev->scratch; + unsigned long long id = get_unaligned_be64(ses_dev->page1+8+4); + + return sprintf(buf, "%#llx\n", id); +} + static struct enclosure_component_callbacks ses_enclosure_callbacks = { .get_fault = ses_get_fault, .set_fault = ses_set_fault, @@ -265,6 +273,7 @@ static struct enclosure_component_callbacks ses_enclosure_callbacks = { .get_locate = ses_get_locate, .set_locate = ses_set_locate, .set_active = ses_set_active, + .show_id = ses_show_id, }; struct ses_host_edev { diff --git a/include/linux/enclosure.h b/include/linux/enclosure.h index a835d335b924..807622b252a4 100644 --- a/include/linux/enclosure.h +++ b/include/linux/enclosure.h @@ -79,6 +79,7 @@ struct enclosure_component_callbacks { int (*set_locate)(struct enclosure_device *, struct enclosure_component *, enum enclosure_component_setting); + int (*show_id)(struct enclosure_device *, char *buf); }; -- cgit v1.2.3 From 921ce7f5786052749a22a75780f5ce1a456bcdc6 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 30 Dec 2014 14:46:17 -0800 Subject: ses: add reliable slot attribute The name 
provided by firmware is in a vendor-specific format; publish the slot number to have a reliable mechanism for identifying slots across firmware implementations. If the enclosure does not provide a slot number, fall back to the component number, which is guaranteed unique and usually mirrors the slot number. Cleaned up the unused ses_component.desc in the process. Signed-off-by: Dan Williams Signed-off-by: Song Liu Reviewed-by: Jens Axboe Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/misc/enclosure.c | 20 +++++++++++++++++++- drivers/scsi/ses.c | 17 ++++++++++++----- include/linux/enclosure.h | 1 + 3 files changed, 32 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/misc/enclosure.c b/drivers/misc/enclosure.c index 958ee988a1e1..b62314d627ae 100644 --- a/drivers/misc/enclosure.c +++ b/drivers/misc/enclosure.c @@ -145,8 +145,10 @@ enclosure_register(struct device *dev, const char *name, int components, if (err) goto err; - for (i = 0; i < components; i++) + for (i = 0; i < components; i++) { edev->component[i].number = -1; + edev->component[i].slot = -1; + } mutex_lock(&container_list_lock); list_add_tail(&edev->node, &container_list); @@ -589,6 +591,20 @@ static ssize_t get_component_type(struct device *cdev, return snprintf(buf, 40, "%s\n", enclosure_type[ecomp->type]); } +static ssize_t get_component_slot(struct device *cdev, + struct device_attribute *attr, char *buf) +{ + struct enclosure_component *ecomp = to_enclosure_component(cdev); + int slot; + + /* if the enclosure does not override then use 'number' as a stand-in */ + if (ecomp->slot >= 0) + slot = ecomp->slot; + else + slot = ecomp->number; + + return snprintf(buf, 40, "%d\n", slot); +} static DEVICE_ATTR(fault, S_IRUGO | S_IWUSR, get_component_fault, set_component_fault); @@ -599,6 +615,7 @@ static DEVICE_ATTR(active, S_IRUGO | S_IWUSR, get_component_active, set_component_active); static DEVICE_ATTR(locate, S_IRUGO | S_IWUSR, get_component_locate, set_component_locate); static DEVICE_ATTR(type, S_IRUGO, get_component_type, NULL); +static DEVICE_ATTR(slot, S_IRUGO, get_component_slot, NULL); static struct attribute *enclosure_component_attrs[] = { &dev_attr_fault.attr, @@ -606,6 +623,7 @@ static struct attribute *enclosure_component_attrs[] = { &dev_attr_status.attr, &dev_attr_active.attr, &dev_attr_locate.attr, &dev_attr_type.attr, + &dev_attr_slot.attr, NULL }; ATTRIBUTE_GROUPS(enclosure_component); diff --git a/drivers/scsi/ses.c b/drivers/scsi/ses.c index 1041556cdbf3..433de8e6f538 100644 --- a/drivers/scsi/ses.c +++ b/drivers/scsi/ses.c @@ -47,7 +47,6 @@ struct ses_device { struct ses_component { u64 addr; - unsigned char *desc; }; static int ses_probe(struct device *dev) @@ -307,19 +306,26 @@ static void ses_process_descriptor(struct enclosure_component *ecomp, int invalid = desc[0] & 0x80; enum scsi_protocol proto = desc[0] & 0x0f; u64 addr = 0; + int slot = -1; struct ses_component *scomp = ecomp->scratch; unsigned char *d; - scomp->desc = desc; - if (invalid) return; switch (proto) { + case SCSI_PROTOCOL_FCP: + if (eip) { + d = desc + 4; + slot = d[3]; + } + break; case SCSI_PROTOCOL_SAS: - if (eip) + if (eip) { + d = desc + 4; + slot = d[3]; d = desc + 8; - } else + } else d = desc + 4; /* only take the phy0 addr */ addr = (u64)d[12] << 56 | @@ -335,6 +341,7 @@ static void ses_process_descriptor(struct enclosure_component *ecomp, /* FIXME: Need to add more protocols than just SAS */ break; } + ecomp->slot = slot; scomp->addr = addr; } diff --git a/include/linux/enclosure.h b/include/linux/enclosure.h index
807622b252a4..0f826c14a337 100644 --- a/include/linux/enclosure.h +++ b/include/linux/enclosure.h @@ -92,6 +92,7 @@ struct enclosure_component { int fault; int active; int locate; + int slot; enum enclosure_status status; }; -- cgit v1.2.3 From 08024885a2a3ed432716e9d50046a620a5b2df05 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 30 Dec 2014 14:46:18 -0800 Subject: ses: Add power_status to SES device slot Add power_status to SES device slot, so we can power on/off the HDDs behind the enclosure. Check firmware status in ses_set_* before sending control pages to firmware. Signed-off-by: Song Liu Acked-by: Dan Williams Reviewed-by: Jens Axboe Cc: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/misc/enclosure.c | 38 ++++++++++++++++++ drivers/scsi/ses.c | 98 ++++++++++++++++++++++++++++++++++++++++++----- include/linux/enclosure.h | 6 +++ 3 files changed, 133 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/misc/enclosure.c b/drivers/misc/enclosure.c index b62314d627ae..38552a31304a 100644 --- a/drivers/misc/enclosure.c +++ b/drivers/misc/enclosure.c @@ -148,6 +148,7 @@ enclosure_register(struct device *dev, const char *name, int components, for (i = 0; i < components; i++) { edev->component[i].number = -1; edev->component[i].slot = -1; + edev->component[i].power_status = 1; } mutex_lock(&container_list_lock); @@ -583,6 +584,40 @@ static ssize_t set_component_locate(struct device *cdev, return count; } +static ssize_t get_component_power_status(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + struct enclosure_device *edev = to_enclosure_device(cdev->parent); + struct enclosure_component *ecomp = to_enclosure_component(cdev); + + if (edev->cb->get_power_status) + edev->cb->get_power_status(edev, ecomp); + return snprintf(buf, 40, "%s\n", ecomp->power_status ? 
"on" : "off"); +} + +static ssize_t set_component_power_status(struct device *cdev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct enclosure_device *edev = to_enclosure_device(cdev->parent); + struct enclosure_component *ecomp = to_enclosure_component(cdev); + int val; + + if (strncmp(buf, "on", 2) == 0 && + (buf[2] == '\n' || buf[2] == '\0')) + val = 1; + else if (strncmp(buf, "off", 3) == 0 && + (buf[3] == '\n' || buf[3] == '\0')) + val = 0; + else + return -EINVAL; + + if (edev->cb->set_power_status) + edev->cb->set_power_status(edev, ecomp, val); + return count; +} + static ssize_t get_component_type(struct device *cdev, struct device_attribute *attr, char *buf) { @@ -614,6 +649,8 @@ static DEVICE_ATTR(active, S_IRUGO | S_IWUSR, get_component_active, set_component_active); static DEVICE_ATTR(locate, S_IRUGO | S_IWUSR, get_component_locate, set_component_locate); +static DEVICE_ATTR(power_status, S_IRUGO | S_IWUSR, get_component_power_status, + set_component_power_status); static DEVICE_ATTR(type, S_IRUGO, get_component_type, NULL); static DEVICE_ATTR(slot, S_IRUGO, get_component_slot, NULL); @@ -622,6 +659,7 @@ static struct attribute *enclosure_component_attrs[] = { &dev_attr_status.attr, &dev_attr_active.attr, &dev_attr_locate.attr, + &dev_attr_power_status.attr, &dev_attr_type.attr, &dev_attr_slot.attr, NULL diff --git a/drivers/scsi/ses.c b/drivers/scsi/ses.c index 433de8e6f538..dcb0d76d7312 100644 --- a/drivers/scsi/ses.c +++ b/drivers/scsi/ses.c @@ -67,6 +67,20 @@ static int ses_probe(struct device *dev) #define SES_TIMEOUT (30 * HZ) #define SES_RETRIES 3 +static void init_device_slot_control(unsigned char *dest_desc, + struct enclosure_component *ecomp, + unsigned char *status) +{ + memcpy(dest_desc, status, 4); + dest_desc[0] = 0; + /* only clear byte 1 for ENCLOSURE_COMPONENT_DEVICE */ + if (ecomp->type == ENCLOSURE_COMPONENT_DEVICE) + dest_desc[1] = 0; + dest_desc[2] &= 0xde; + dest_desc[3] &= 0x3c; +} + + static int ses_recv_diag(struct scsi_device *sdev, int page_code, void *buf, int bufflen) { @@ -178,14 +192,22 @@ static int ses_set_fault(struct enclosure_device *edev, struct enclosure_component *ecomp, enum enclosure_component_setting val) { - unsigned char desc[4] = {0 }; + unsigned char desc[4]; + unsigned char *desc_ptr; + + desc_ptr = ses_get_page2_descriptor(edev, ecomp); + + if (!desc_ptr) + return -EIO; + + init_device_slot_control(desc, ecomp, desc_ptr); switch (val) { case ENCLOSURE_SETTING_DISABLED: - /* zero is disabled */ + desc[3] &= 0xdf; break; case ENCLOSURE_SETTING_ENABLED: - desc[3] = 0x20; + desc[3] |= 0x20; break; default: /* SES doesn't do the SGPIO blink settings */ @@ -219,14 +241,22 @@ static int ses_set_locate(struct enclosure_device *edev, struct enclosure_component *ecomp, enum enclosure_component_setting val) { - unsigned char desc[4] = {0 }; + unsigned char desc[4]; + unsigned char *desc_ptr; + + desc_ptr = ses_get_page2_descriptor(edev, ecomp); + + if (!desc_ptr) + return -EIO; + + init_device_slot_control(desc, ecomp, desc_ptr); switch (val) { case ENCLOSURE_SETTING_DISABLED: - /* zero is disabled */ + desc[2] &= 0xfd; break; case ENCLOSURE_SETTING_ENABLED: - desc[2] = 0x02; + desc[2] |= 0x02; break; default: /* SES doesn't do the SGPIO blink settings */ @@ -239,15 +269,23 @@ static int ses_set_active(struct enclosure_device *edev, struct enclosure_component *ecomp, enum enclosure_component_setting val) { - unsigned char desc[4] = {0 }; + unsigned char desc[4]; + unsigned char *desc_ptr; + + desc_ptr = 
ses_get_page2_descriptor(edev, ecomp); + + if (!desc_ptr) + return -EIO; + + init_device_slot_control(desc, ecomp, desc_ptr); switch (val) { case ENCLOSURE_SETTING_DISABLED: - /* zero is disabled */ + desc[2] &= 0x7f; ecomp->active = 0; break; case ENCLOSURE_SETTING_ENABLED: - desc[2] = 0x80; + desc[2] |= 0x80; ecomp->active = 1; break; default: @@ -265,12 +303,53 @@ static int ses_show_id(struct enclosure_device *edev, char *buf) return sprintf(buf, "%#llx\n", id); } +static void ses_get_power_status(struct enclosure_device *edev, + struct enclosure_component *ecomp) +{ + unsigned char *desc; + + desc = ses_get_page2_descriptor(edev, ecomp); + if (desc) + ecomp->power_status = (desc[3] & 0x10) ? 0 : 1; +} + +static int ses_set_power_status(struct enclosure_device *edev, + struct enclosure_component *ecomp, + int val) +{ + unsigned char desc[4]; + unsigned char *desc_ptr; + + desc_ptr = ses_get_page2_descriptor(edev, ecomp); + + if (!desc_ptr) + return -EIO; + + init_device_slot_control(desc, ecomp, desc_ptr); + + switch (val) { + /* power = 1 is device_off = 0 and vice versa */ + case 0: + desc[3] |= 0x10; + break; + case 1: + desc[3] &= 0xef; + break; + default: + return -EINVAL; + } + ecomp->power_status = val; + return ses_set_page2_descriptor(edev, ecomp, desc); +} + static struct enclosure_component_callbacks ses_enclosure_callbacks = { .get_fault = ses_get_fault, .set_fault = ses_set_fault, .get_status = ses_get_status, .get_locate = ses_get_locate, .set_locate = ses_set_locate, + .get_power_status = ses_get_power_status, + .set_power_status = ses_set_power_status, .set_active = ses_set_active, .show_id = ses_show_id, }; @@ -449,6 +528,7 @@ static void ses_enclosure_data_process(struct enclosure_device *edev, ecomp = &edev->component[components++]; if (!IS_ERR(ecomp)) { + ses_get_power_status(edev, ecomp); if (addl_desc_ptr) ses_process_descriptor( ecomp, diff --git a/include/linux/enclosure.h b/include/linux/enclosure.h index 0f826c14a337..7be22da321f3 100644 --- a/include/linux/enclosure.h +++ b/include/linux/enclosure.h @@ -79,6 +79,11 @@ struct enclosure_component_callbacks { int (*set_locate)(struct enclosure_device *, struct enclosure_component *, enum enclosure_component_setting); + void (*get_power_status)(struct enclosure_device *, + struct enclosure_component *); + int (*set_power_status)(struct enclosure_device *, + struct enclosure_component *, + int); int (*show_id)(struct enclosure_device *, char *buf); }; @@ -94,6 +99,7 @@ struct enclosure_component { int locate; int slot; enum enclosure_status status; + int power_status; }; struct enclosure_device { -- cgit v1.2.3 From 5efd2ea8c9f4f12916ffc8ba636792ce052f6911 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 5 Dec 2014 15:13:54 +0100 Subject: usb: core: buffer: smallest buffer should start at ARCH_DMA_MINALIGN The following error pops up during "testusb -a -t 10" | musb-hdrc musb-hdrc.1.auto: dma_pool_free buffer-128, f134e000/be842000 (bad dma) hcd_buffer_create() creates a few buffers, the smallest has 32 bytes of size. ARCH_KMALLOC_MINALIGN is set to 64 bytes. This combo results in hcd_buffer_alloc() returning memory which is 32 bytes aligned and it might be identified by buffer_offset() as another buffer. This means the buffer which is on a 32 byte boundary will not get freed, instead it tries to free another buffer with the error message. This patch fixes the issue by creating the smallest DMA buffer with the size of ARCH_KMALLOC_MINALIGN (or 32 in case ARCH_KMALLOC_MINALIGN is smaller).
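For context, here is a hedged sketch of the allocation-side pool pick this interacts with (first-fit by size; the loop shape is an assumption, not the verbatim hcd_buffer_alloc()):

	/*
	 * Assumed shape of the allocator side: pools are tried smallest
	 * first, so an entry of 0 can never satisfy "size <= pool_max[i]"
	 * and a disabled pool is skipped transparently.
	 */
	for (i = 0; i < HCD_BUFFER_POOLS; i++) {
		if (size <= pool_max[i])
			return dma_pool_alloc(hcd->pool[i], mem_flags, dma);
	}
	/* bigger than the largest pool --> allocate whole pages */
	return dma_alloc_coherent(hcd->self.controller, size, dma, mem_flags);
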
The resulting smallest pool size might be 32, 64 or even 128 bytes. The next three pools will have the sizes 128, 512 and 2048. If the smallest pool is 128 bytes, then we have only three pools instead of four (and zero the first entry in the array). The last pool size is always 2048 bytes, which is the assumed PAGE_SIZE / 2 of 4096. I doubt it makes sense to continue using PAGE_SIZE / 2, where we would end up with an 8KiB buffer in case we have 16KiB pages. Instead I think it makes sense to have common sizes and extend them if there is a need to. There is a BUILD_BUG() now in case someone has a minalign of more than 128 bytes. Cc: stable@vger.kernel.org Signed-off-by: Sebastian Andrzej Siewior Acked-by: Alan Stern Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/buffer.c | 26 +++++++++++++++++--------- drivers/usb/core/usb.c | 1 + include/linux/usb/hcd.h | 1 + 3 files changed, 19 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/core/buffer.c b/drivers/usb/core/buffer.c index 684ef70dc09d..506b969ea7fd 100644 --- a/drivers/usb/core/buffer.c +++ b/drivers/usb/core/buffer.c @@ -22,17 +22,25 @@ */ /* FIXME tune these based on pool statistics ... */ -static const size_t pool_max[HCD_BUFFER_POOLS] = { - /* platforms without dma-friendly caches might need to - * prevent cacheline sharing... - */ - 32, - 128, - 512, - PAGE_SIZE / 2 - /* bigger --> allocate pages */ +static size_t pool_max[HCD_BUFFER_POOLS] = { + 32, 128, 512, 2048, }; +void __init usb_init_pool_max(void) +{ + /* + * The pool_max values must never be smaller than + * ARCH_KMALLOC_MINALIGN. + */ + if (ARCH_KMALLOC_MINALIGN <= 32) + ; /* Original value is okay */ + else if (ARCH_KMALLOC_MINALIGN <= 64) + pool_max[0] = 64; + else if (ARCH_KMALLOC_MINALIGN <= 128) + pool_max[0] = 0; /* Don't use this pool */ + else + BUILD_BUG(); /* We don't allow this */ +} /* SETUP primitives */ diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c index 2a92b97f0144..b1fb9aef0f5b 100644 --- a/drivers/usb/core/usb.c +++ b/drivers/usb/core/usb.c @@ -1049,6 +1049,7 @@ static int __init usb_init(void) pr_info("%s: USB support disabled\n", usbcore_name); return 0; } + usb_init_pool_max(); retval = usb_debugfs_init(); if (retval) diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h index 086bf13307e6..8968f616e414 100644 --- a/include/linux/usb/hcd.h +++ b/include/linux/usb/hcd.h @@ -453,6 +453,7 @@ extern const struct dev_pm_ops usb_hcd_pci_pm_ops; #endif /* CONFIG_PCI */ /* pci-ish (pdev null is ok) buffer alloc/mapping support */ +void usb_init_pool_max(void); int hcd_buffer_create(struct usb_hcd *hcd); void hcd_buffer_destroy(struct usb_hcd *hcd); -- cgit v1.2.3 From 314b41b16a71ee824f55e2791fcb92997672da37 Mon Sep 17 00:00:00 2001 From: Wu Liang feng Date: Wed, 24 Dec 2014 18:22:19 +0800 Subject: USB: ehci-platform: Support ehci reset after resume quirk The Rockchip rk3288 EHCI controller doesn't properly detect the case when a device is removed during suspend. Specifically, when USB resumes from suspend, the EHCI controller maintains the USB state (FLAG_CF is 1, Current Connect Status is 1), but a USB device (like a USB camera on rk3288) may actually have been disconnected. Let's add a quirk to force EHCI to go into the usb_root_hub_lost_power() path and reset after resume. This should generally reset the whole controller and all ports and initialize everything cleanly again, and bring the devices back up.
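For context, a condensed and partly assumed view of how the quirk is consumed on resume (based on the ehci-platform.c hunk below; clock, PHY and error handling elided):

	static int ehci_platform_resume(struct device *dev)
	{
		struct usb_hcd *hcd = dev_get_drvdata(dev);
		struct usb_ehci_pdata *pdata = dev_get_platdata(dev);

		/*
		 * With reset_on_resume set (e.g. via the new
		 * "needs-reset-on-resume" DT property), ehci_resume()
		 * behaves as if power was lost and resets the controller
		 * and all ports instead of trusting FLAG_CF.
		 */
		ehci_resume(hcd, pdata->reset_on_resume);
		return 0;
	}
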
As part of this, rename the "hibernation" parameter of ehci_resume() to force_reset since hibernation is simply another case where we can't trust the autodetected status and need to force a reset of devices. Signed-off-by: Wu Liang feng Reviewed-by: Julius Werner Reviewed-by: Doug Anderson Reviewed-by: Tomasz Figa Reviewed-by: Pawel Osciak Reviewed-by: Sonny Rao Acked-by: Alan Stern Tested-by: Doug Anderson Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/ehci-hcd.c | 6 +++--- drivers/usb/host/ehci-platform.c | 6 +++++- drivers/usb/host/ehci.h | 2 +- include/linux/usb/ehci_pdriver.h | 3 +++ 4 files changed, 12 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/host/ehci-hcd.c b/drivers/usb/host/ehci-hcd.c index 38bfeedae1d0..85e56d1abd23 100644 --- a/drivers/usb/host/ehci-hcd.c +++ b/drivers/usb/host/ehci-hcd.c @@ -1110,7 +1110,7 @@ int ehci_suspend(struct usb_hcd *hcd, bool do_wakeup) EXPORT_SYMBOL_GPL(ehci_suspend); /* Returns 0 if power was preserved, 1 if power was lost */ -int ehci_resume(struct usb_hcd *hcd, bool hibernated) +int ehci_resume(struct usb_hcd *hcd, bool force_reset) { struct ehci_hcd *ehci = hcd_to_ehci(hcd); @@ -1124,12 +1124,12 @@ int ehci_resume(struct usb_hcd *hcd, bool hibernated) return 0; /* Controller is dead */ /* - * If CF is still set and we aren't resuming from hibernation + * If CF is still set and reset isn't forced * then we maintained suspend power. * Just undo the effect of ehci_suspend(). */ if (ehci_readl(ehci, &ehci->regs->configured_flag) == FLAG_CF && - !hibernated) { + !force_reset) { int mask = INTR_MASK; ehci_prepare_ports_for_controller_resume(ehci); diff --git a/drivers/usb/host/ehci-platform.c b/drivers/usb/host/ehci-platform.c index 8557803e6154..db5c29edf6db 100644 --- a/drivers/usb/host/ehci-platform.c +++ b/drivers/usb/host/ehci-platform.c @@ -185,6 +185,10 @@ static int ehci_platform_probe(struct platform_device *dev) if (of_property_read_bool(dev->dev.of_node, "big-endian")) ehci->big_endian_mmio = ehci->big_endian_desc = 1; + if (of_property_read_bool(dev->dev.of_node, + "needs-reset-on-resume")) + pdata->reset_on_resume = 1; + priv->phy = devm_phy_get(&dev->dev, "usb"); if (IS_ERR(priv->phy)) { err = PTR_ERR(priv->phy); @@ -340,7 +344,7 @@ static int ehci_platform_resume(struct device *dev) return err; } - ehci_resume(hcd, false); + ehci_resume(hcd, pdata->reset_on_resume); return 0; } #endif /* CONFIG_PM_SLEEP */ diff --git a/drivers/usb/host/ehci.h b/drivers/usb/host/ehci.h index 6f0577b0a5ae..52ef0844a180 100644 --- a/drivers/usb/host/ehci.h +++ b/drivers/usb/host/ehci.h @@ -871,7 +871,7 @@ extern int ehci_handshake(struct ehci_hcd *ehci, void __iomem *ptr, #ifdef CONFIG_PM extern int ehci_suspend(struct usb_hcd *hcd, bool do_wakeup); -extern int ehci_resume(struct usb_hcd *hcd, bool hibernated); +extern int ehci_resume(struct usb_hcd *hcd, bool force_reset); #endif /* CONFIG_PM */ extern int ehci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, diff --git a/include/linux/usb/ehci_pdriver.h b/include/linux/usb/ehci_pdriver.h index 7eb4dcd0d386..6287b398abd9 100644 --- a/include/linux/usb/ehci_pdriver.h +++ b/include/linux/usb/ehci_pdriver.h @@ -34,6 +34,8 @@ struct usb_hcd; * after initialization. * @no_io_watchdog: set to 1 if the controller does not need the I/O * watchdog to run. + * @reset_on_resume: set to 1 if the controller needs to be reset after + * a suspend / resume cycle (but can't detect that itself). * * These are general configuration options for the EHCI controller. All of
All of * these options are activating more or less workarounds for some hardware. @@ -45,6 +47,7 @@ struct usb_ehci_pdata { unsigned big_endian_desc:1; unsigned big_endian_mmio:1; unsigned no_io_watchdog:1; + unsigned reset_on_resume:1; /* Turn on all power and clocks */ int (*power_on)(struct platform_device *pdev); -- cgit v1.2.3 From 4bf4ea9dca4ba1984abeb592f429265b9bacac42 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Tue, 30 Dec 2014 20:28:15 -0500 Subject: serial: omap_8250: Fix RTS handling The OMAP3 UART ignores MCR[1] (ie., UART_MCR_RTS) when in autoRTS mode (UPF_HARD_FLOW + CRTSCTS). This makes it impossible for either the serial core or userspace to manually flow control the sender. Disable autoRTS mode when RTS is lowered and restore the previous mode when RTS is raised. Note that the OMAP3 UART provides no mechanism for switching from autoRTS mode without corrupting incoming data; to access the necessary register, the line control settings must be set to 8-e-2 and thus any data received during that time will be interpreted with those settings. This corruption has been observed in practice. Cc: Sebastian Andrzej Siewior Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_core.c | 12 +++++++++++- drivers/tty/serial/8250/8250_omap.c | 25 ++++++++++++++++++++++--- include/linux/serial_8250.h | 1 + include/linux/serial_core.h | 1 + 4 files changed, 35 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c index 11c66856ba2f..3bfcfdb57d05 100644 --- a/drivers/tty/serial/8250/8250_core.c +++ b/drivers/tty/serial/8250/8250_core.c @@ -1924,7 +1924,7 @@ static unsigned int serial8250_get_mctrl(struct uart_port *port) return ret; } -static void serial8250_set_mctrl(struct uart_port *port, unsigned int mctrl) +void serial8250_do_set_mctrl(struct uart_port *port, unsigned int mctrl) { struct uart_8250_port *up = up_to_u8250p(port); unsigned char mcr = 0; @@ -1944,6 +1944,14 @@ static void serial8250_set_mctrl(struct uart_port *port, unsigned int mctrl) serial_port_out(port, UART_MCR, mcr); } +EXPORT_SYMBOL_GPL(serial8250_do_set_mctrl); + +static void serial8250_set_mctrl(struct uart_port *port, unsigned int mctrl) +{ + if (port->set_mctrl) + return port->set_mctrl(port, mctrl); + return serial8250_do_set_mctrl(port, mctrl); +} static void serial8250_break_ctl(struct uart_port *port, int break_state) { @@ -3605,6 +3613,8 @@ int serial8250_register_8250_port(struct uart_8250_port *up) /* Possibly override set_termios call */ if (up->port.set_termios) uart->port.set_termios = up->port.set_termios; + if (up->port.set_mctrl) + uart->port.set_mctrl = up->port.set_mctrl; if (up->port.startup) uart->port.startup = up->port.startup; if (up->port.shutdown) diff --git a/drivers/tty/serial/8250/8250_omap.c b/drivers/tty/serial/8250/8250_omap.c index 273f37c6b493..227033db214b 100644 --- a/drivers/tty/serial/8250/8250_omap.c +++ b/drivers/tty/serial/8250/8250_omap.c @@ -106,6 +106,27 @@ static u32 uart_read(struct uart_8250_port *up, u32 reg) return readl(up->port.membase + (reg << up->port.regshift)); } +static void omap8250_set_mctrl(struct uart_port *port, unsigned int mctrl) +{ + struct uart_8250_port *up = up_to_u8250p(port); + struct omap8250_priv *priv = up->port.private_data; + u8 lcr; + + serial8250_do_set_mctrl(port, mctrl); + + /* + * Turn off autoRTS if RTS is lowered and restore autoRTS setting + * if RTS is raised + */ + lcr = serial_in(up, UART_LCR); + 
serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); + if (mctrl & TIOCM_RTS) + serial_out(up, UART_EFR, priv->efr); + else + serial_out(up, UART_EFR, priv->efr & ~UART_EFR_RTS); + serial_out(up, UART_LCR, lcr); +} + /* * Work Around for Errata i202 (2430, 3430, 3630, 4430 and 4460) * The access to uart register after MDR1 Access @@ -400,9 +421,6 @@ static void omap_8250_set_termios(struct uart_port *port, if (termios->c_cflag & CRTSCTS && up->port.flags & UPF_HARD_FLOW) { /* Enable AUTORTS and AUTOCTS */ priv->efr |= UART_EFR_CTS | UART_EFR_RTS; - - /* Ensure MCR RTS is asserted */ - up->mcr |= UART_MCR_RTS; } else if (up->port.flags & UPF_SOFT_FLOW) { /* * IXON Flag: @@ -1007,6 +1025,7 @@ static int omap8250_probe(struct platform_device *pdev) up.capabilities |= UART_CAP_RPM; #endif up.port.set_termios = omap_8250_set_termios; + up.port.set_mctrl = omap8250_set_mctrl; up.port.pm = omap_8250_pm; up.port.startup = omap_8250_startup; up.port.shutdown = omap_8250_shutdown; diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h index e02acf0a0ec9..245b959f1ff6 100644 --- a/include/linux/serial_8250.h +++ b/include/linux/serial_8250.h @@ -126,6 +126,7 @@ extern int serial8250_do_startup(struct uart_port *port); extern void serial8250_do_shutdown(struct uart_port *port); extern void serial8250_do_pm(struct uart_port *port, unsigned int state, unsigned int oldstate); +extern void serial8250_do_set_mctrl(struct uart_port *port, unsigned int mctrl); extern int fsl8250_handle_irq(struct uart_port *port); int serial8250_handle_irq(struct uart_port *port, unsigned int iir); unsigned char serial8250_rx_chars(struct uart_8250_port *up, unsigned char lsr); diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 057038cf2788..a0c7033d5f91 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -123,6 +123,7 @@ struct uart_port { void (*set_termios)(struct uart_port *, struct ktermios *new, struct ktermios *old); + void (*set_mctrl)(struct uart_port *, unsigned int); int (*startup)(struct uart_port *port); void (*shutdown)(struct uart_port *port); void (*throttle)(struct uart_port *port); -- cgit v1.2.3 From a291b7d5f600a068af1e2186ce590a9a39093ddb Mon Sep 17 00:00:00 2001 From: Robert Baldyga Date: Wed, 10 Dec 2014 12:49:24 +0100 Subject: serial: s3c: add missing register definitions These macro definitions are necessary to implement DMA transfers in the Samsung serial driver. Based on previous work of Sylwester Nawrocki and Lukasz Czerwinski.
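As a usage illustration, a hypothetical helper (not part of the patch) could program both directions for DMA with the new bits; S3C2410_UCON and every S3C64XX_UCON_* constant come from the serial_s3c.h additions in the diff below:

	/* Hypothetical helper; bit layout per the definitions added below */
	static void s3c64xx_uart_enable_dma(void __iomem *regs)
	{
		u32 ucon = readl(regs + S3C2410_UCON);

		ucon &= ~(S3C64XX_UCON_TXMODE_MASK | S3C64XX_UCON_RXMODE_MASK |
			  S3C64XX_UCON_TXBURST_MASK | S3C64XX_UCON_RXBURST_MASK);
		ucon |= S3C64XX_UCON_TXMODE_DMA | S3C64XX_UCON_RXMODE_DMA |
			S3C64XX_UCON_TXBURST_16 | S3C64XX_UCON_RXBURST_16 |
			S3C64XX_UCON_TIMEOUT_EN | S3C64XX_UCON_EMPTYINT_EN;
		writel(ucon, regs + S3C2410_UCON);
	}
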
Signed-off-by: Robert Baldyga Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_s3c.h | 28 ++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/serial_s3c.h b/include/linux/serial_s3c.h index e6fc9567690b..a7f004a3c177 100644 --- a/include/linux/serial_s3c.h +++ b/include/linux/serial_s3c.h @@ -104,6 +104,31 @@ S3C2410_UCON_RXIRQMODE | \ S3C2410_UCON_RXFIFO_TOI) +#define S3C64XX_UCON_TXBURST_1 (0<<20) +#define S3C64XX_UCON_TXBURST_4 (1<<20) +#define S3C64XX_UCON_TXBURST_8 (2<<20) +#define S3C64XX_UCON_TXBURST_16 (3<<20) +#define S3C64XX_UCON_TXBURST_MASK (0xf<<20) +#define S3C64XX_UCON_RXBURST_1 (0<<16) +#define S3C64XX_UCON_RXBURST_4 (1<<16) +#define S3C64XX_UCON_RXBURST_8 (2<<16) +#define S3C64XX_UCON_RXBURST_16 (3<<16) +#define S3C64XX_UCON_RXBURST_MASK (0xf<<16) +#define S3C64XX_UCON_TIMEOUT_SHIFT (12) +#define S3C64XX_UCON_TIMEOUT_MASK (0xf<<12) +#define S3C64XX_UCON_EMPTYINT_EN (1<<11) +#define S3C64XX_UCON_DMASUS_EN (1<<10) +#define S3C64XX_UCON_TXINT_LEVEL (1<<9) +#define S3C64XX_UCON_RXINT_LEVEL (1<<8) +#define S3C64XX_UCON_TIMEOUT_EN (1<<7) +#define S3C64XX_UCON_ERRINT_EN (1<<6) +#define S3C64XX_UCON_TXMODE_DMA (2<<2) +#define S3C64XX_UCON_TXMODE_CPU (1<<2) +#define S3C64XX_UCON_TXMODE_MASK (3<<2) +#define S3C64XX_UCON_RXMODE_DMA (2<<0) +#define S3C64XX_UCON_RXMODE_CPU (1<<0) +#define S3C64XX_UCON_RXMODE_MASK (3<<0) + #define S3C2410_UFCON_FIFOMODE (1<<0) #define S3C2410_UFCON_TXTRIG0 (0<<6) #define S3C2410_UFCON_RXTRIG8 (1<<4) @@ -155,6 +180,7 @@ #define S3C2440_UFSTAT_TXMASK (63<<8) #define S3C2440_UFSTAT_RXMASK (63) +#define S3C2410_UTRSTAT_TIMEOUT (1<<3) #define S3C2410_UTRSTAT_TXE (1<<2) #define S3C2410_UTRSTAT_TXFE (1<<1) #define S3C2410_UTRSTAT_RXDR (1<<0) @@ -179,8 +205,10 @@ #define S3C64XX_UINTM 0x38 #define S3C64XX_UINTM_RXD (0) +#define S3C64XX_UINTM_ERROR (1) #define S3C64XX_UINTM_TXD (2) #define S3C64XX_UINTM_RXD_MSK (1 << S3C64XX_UINTM_RXD) +#define S3C64XX_UINTM_ERR_MSK (1 << S3C64XX_UINTM_ERROR) #define S3C64XX_UINTM_TXD_MSK (1 << S3C64XX_UINTM_TXD) /* Following are specific to S5PV210 */ -- cgit v1.2.3 From 09eb483e895f36fd002e88c878e9578c359aa468 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 23 Dec 2014 16:26:31 -0800 Subject: f2fs: fix missing cold bit during recovery In do_recover_data, we find and update previous node pages after updating their new block addresses. We then call fill_node_footer without the reset flag, which erases the cold bit, so this new cold node block is written to the wrong log area. This patch fixes fill_node_footer so the old flag bits are not lost.
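To see why the mask-and-merge preserves the cold bit, a short worked example (values illustrative; in f2fs the cold/fsync/dentry bits sit below OFFSET_BIT_SHIFT):

	/*
	 * With OFFSET_BIT_MASK == 0x07, the flag bits below
	 * OFFSET_BIT_SHIFT survive the footer update:
	 *
	 *   old_flag = 0x01                    (cold bit set)
	 *   ofs      = 2
	 *   new flag = (2 << OFFSET_BIT_SHIFT) | (0x01 & OFFSET_BIT_MASK)
	 *            = 0x10 | 0x01 = 0x11
	 *
	 * The old code computed plain (ofs << OFFSET_BIT_SHIFT) = 0x10,
	 * silently dropping the cold bit.
	 */
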
Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.h | 10 +++++++++- include/linux/f2fs_fs.h | 2 ++ 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index fa6f95968b42..cac8a3d9acbd 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -212,11 +212,19 @@ static inline void fill_node_footer(struct page *page, nid_t nid, nid_t ino, unsigned int ofs, bool reset) { struct f2fs_node *rn = F2FS_NODE(page); + unsigned int old_flag = 0; + if (reset) memset(rn, 0, sizeof(*rn)); + else + old_flag = le32_to_cpu(rn->footer.flag); + rn->footer.nid = cpu_to_le32(nid); rn->footer.ino = cpu_to_le32(ino); - rn->footer.flag = cpu_to_le32(ofs << OFFSET_BIT_SHIFT); + + /* should remain old flag bits such as COLD_BIT_SHIFT */ + rn->footer.flag = cpu_to_le32((ofs << OFFSET_BIT_SHIFT) | + (old_flag & OFFSET_BIT_MASK)); } static inline void copy_node_footer(struct page *dst, struct page *src) diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 87f14e90e984..e993b0bc9abf 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -224,6 +224,8 @@ enum { OFFSET_BIT_SHIFT }; +#define OFFSET_BIT_MASK (0x07) /* (0x01 << OFFSET_BIT_SHIFT) - 1 */ + struct node_footer { __le32 nid; /* node id */ __le32 ino; /* inode nunmber */ -- cgit v1.2.3 From 9733e4f0a973a354034f5dd603b4142a3095c85f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Nov 2014 12:49:13 -0800 Subject: rcu: Make _batches_completed() functions return unsigned long Long ago, the various ->completed fields were of type long, but now are unsigned long due to signed-integer-overflow concerns. However, the various _batches_completed() functions remained of type long, even though their only purpose in life is to return the corresponding ->completed field. This patch cleans this up by changing these functions' return types to unsigned long. Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 4 ++-- include/linux/rcutree.h | 6 +++--- kernel/rcu/tree.c | 4 ++-- kernel/rcu/tree.h | 2 +- kernel/rcu/tree_plugin.h | 6 +++--- 5 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 0e5366200154..91f7e4c37800 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -94,7 +94,7 @@ static inline void rcu_virt_note_context_switch(int cpu) /* * Return the number of grace periods. */ -static inline long rcu_batches_completed(void) +static inline unsigned long rcu_batches_completed(void) { return 0; } @@ -102,7 +102,7 @@ static inline long rcu_batches_completed(void) /* * Return the number of bottom-half grace periods. 
*/ -static inline long rcu_batches_completed_bh(void) +static inline unsigned long rcu_batches_completed_bh(void) { return 0; } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 52953790dcca..9885bfb6b123 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -81,9 +81,9 @@ void cond_synchronize_rcu(unsigned long oldstate); extern unsigned long rcutorture_testseq; extern unsigned long rcutorture_vernum; -long rcu_batches_completed(void); -long rcu_batches_completed_bh(void); -long rcu_batches_completed_sched(void); +unsigned long rcu_batches_completed(void); +unsigned long rcu_batches_completed_bh(void); +unsigned long rcu_batches_completed_sched(void); void show_rcu_gp_kthreads(void); void rcu_force_quiescent_state(void); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4c106fcc0d54..e26d78712e16 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -317,7 +317,7 @@ static int rcu_pending(void); /* * Return the number of RCU-sched batches processed thus far for debug & stats. */ -long rcu_batches_completed_sched(void) +unsigned long rcu_batches_completed_sched(void) { return rcu_sched_state.completed; } @@ -326,7 +326,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_sched); /* * Return the number of RCU BH batches processed thus far for debug & stats. */ -long rcu_batches_completed_bh(void) +unsigned long rcu_batches_completed_bh(void) { return rcu_bh_state.completed; } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 8e7b1843896e..1a07d7379ac6 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -546,7 +546,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); /* Forward declarations for rcutree_plugin.h */ static void rcu_bootup_announce(void); -long rcu_batches_completed(void); +unsigned long rcu_batches_completed(void); static void rcu_preempt_note_context_switch(void); static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3ec85cb5d544..f69300d4a51f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -117,7 +117,7 @@ static void __init rcu_bootup_announce(void) * Return the number of RCU-preempt batches processed thus far * for debug and statistics. */ -static long rcu_batches_completed_preempt(void) +static unsigned long rcu_batches_completed_preempt(void) { return rcu_preempt_state.completed; } @@ -126,7 +126,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt); /* * Return the number of RCU batches processed thus far for debug & stats. */ -long rcu_batches_completed(void) +unsigned long rcu_batches_completed(void) { return rcu_batches_completed_preempt(); } @@ -935,7 +935,7 @@ static void __init rcu_bootup_announce(void) /* * Return the number of RCU batches processed thus far for debug & stats. */ -long rcu_batches_completed(void) +unsigned long rcu_batches_completed(void) { return rcu_batches_completed_sched(); } -- cgit v1.2.3 From c1fe9cde4ae904fffb5b4d975d0a37e99136ff50 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Nov 2014 15:45:27 -0800 Subject: rcu: Provide rcu_batches_completed_sched() for TINY_RCU A bug in rcutorture has caused it to ignore completed batches. In preparation for fixing that bug, this commit provides TINY_RCU with the required rcu_batches_completed_sched(). Signed-off-by: Paul E. 
McKenney --- include/linux/rcutiny.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 91f7e4c37800..1ce2d6b8f0c3 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -107,6 +107,14 @@ static inline unsigned long rcu_batches_completed_bh(void) return 0; } +/* + * Return the number of sched grace periods. + */ +static inline unsigned long rcu_batches_completed_sched(void) +{ + return 0; +} + static inline void rcu_force_quiescent_state(void) { } -- cgit v1.2.3 From 917963d0b30f9c4153c372c165178501d97b6b55 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Nov 2014 17:10:16 -0800 Subject: rcutorture: Check from beginning to end of grace period Currently, rcutorture's Reader Batch checks measure from the end of the previous grace period to the end of the current one. This commit tightens up these checks by measuring from the start and end of the same grace period. This involves adding rcu_batches_started() and friends corresponding to the existing rcu_batches_completed() and friends. We leave SRCU alone for the moment, as it does not yet have a way of tracking both ends of its grace periods. Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 30 +++++++++++++++++++++++++++--- include/linux/rcutree.h | 3 +++ kernel/rcu/rcutorture.c | 37 +++++++++++++++++++++++++++---------- kernel/rcu/tree.c | 40 ++++++++++++++++++++++++++++++++++++++-- kernel/rcu/tree_plugin.h | 28 ---------------------------- 5 files changed, 95 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 1ce2d6b8f0c3..984192160e9b 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -92,7 +92,31 @@ static inline void rcu_virt_note_context_switch(int cpu) } /* - * Return the number of grace periods. + * Return the number of grace periods started. + */ +static inline unsigned long rcu_batches_started(void) +{ + return 0; +} + +/* + * Return the number of bottom-half grace periods started. + */ +static inline unsigned long rcu_batches_started_bh(void) +{ + return 0; +} + +/* + * Return the number of sched grace periods started. + */ +static inline unsigned long rcu_batches_started_sched(void) +{ + return 0; +} + +/* + * Return the number of grace periods completed. */ static inline unsigned long rcu_batches_completed(void) { @@ -100,7 +124,7 @@ static inline unsigned long rcu_batches_completed(void) } /* - * Return the number of bottom-half grace periods. + * Return the number of bottom-half grace periods completed. */ static inline unsigned long rcu_batches_completed_bh(void) { @@ -108,7 +132,7 @@ static inline unsigned long rcu_batches_completed_bh(void) } /* - * Return the number of sched grace periods. + * Return the number of sched grace periods completed. 
*/ static inline unsigned long rcu_batches_completed_sched(void) { diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 9885bfb6b123..c0dd124e69ec 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -81,6 +81,9 @@ void cond_synchronize_rcu(unsigned long oldstate); extern unsigned long rcutorture_testseq; extern unsigned long rcutorture_vernum; +unsigned long rcu_batches_started(void); +unsigned long rcu_batches_started_bh(void); +unsigned long rcu_batches_started_sched(void); unsigned long rcu_batches_completed(void); unsigned long rcu_batches_completed_bh(void); unsigned long rcu_batches_completed_sched(void); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index aadbc072ccf4..24142c200901 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -244,6 +244,7 @@ struct rcu_torture_ops { int (*readlock)(void); void (*read_delay)(struct torture_random_state *rrsp); void (*readunlock)(int idx); + unsigned long (*started)(void); unsigned long (*completed)(void); void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); @@ -372,6 +373,7 @@ static struct rcu_torture_ops rcu_ops = { .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, .readunlock = rcu_torture_read_unlock, + .started = rcu_batches_started, .completed = rcu_batches_completed, .deferred_free = rcu_torture_deferred_free, .sync = synchronize_rcu, @@ -413,6 +415,7 @@ static struct rcu_torture_ops rcu_bh_ops = { .readlock = rcu_bh_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_bh_torture_read_unlock, + .started = rcu_batches_started_bh, .completed = rcu_batches_completed_bh, .deferred_free = rcu_bh_torture_deferred_free, .sync = synchronize_rcu_bh, @@ -456,6 +459,7 @@ static struct rcu_torture_ops rcu_busted_ops = { .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_torture_read_unlock, + .started = rcu_no_completed, .completed = rcu_no_completed, .deferred_free = rcu_busted_torture_deferred_free, .sync = synchronize_rcu_busted, @@ -554,6 +558,7 @@ static struct rcu_torture_ops srcu_ops = { .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, + .started = NULL, .completed = srcu_torture_completed, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, @@ -590,6 +595,7 @@ static struct rcu_torture_ops sched_ops = { .readlock = sched_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = sched_torture_read_unlock, + .started = rcu_batches_started_sched, .completed = rcu_batches_completed_sched, .deferred_free = rcu_sched_torture_deferred_free, .sync = synchronize_sched, @@ -628,6 +634,7 @@ static struct rcu_torture_ops tasks_ops = { .readlock = tasks_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. 
*/ .readunlock = tasks_torture_read_unlock, + .started = rcu_no_completed, .completed = rcu_no_completed, .deferred_free = rcu_tasks_torture_deferred_free, .sync = synchronize_rcu_tasks, @@ -1005,8 +1012,8 @@ static void rcutorture_trace_dump(void) static void rcu_torture_timer(unsigned long unused) { int idx; + unsigned long started; unsigned long completed; - unsigned long completed_end; static DEFINE_TORTURE_RANDOM(rand); static DEFINE_SPINLOCK(rand_lock); struct rcu_torture *p; @@ -1014,7 +1021,10 @@ static void rcu_torture_timer(unsigned long unused) unsigned long long ts; idx = cur_ops->readlock(); - completed = cur_ops->completed(); + if (cur_ops->started) + started = cur_ops->started(); + else + started = cur_ops->completed(); ts = rcu_trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || @@ -1037,14 +1047,16 @@ static void rcu_torture_timer(unsigned long unused) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - completed_end = cur_ops->completed(); + completed = cur_ops->completed(); if (pipe_count > 1) { do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, - completed, completed_end); + started, completed); rcutorture_trace_dump(); } __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = completed_end - completed; + completed = completed - started; + if (cur_ops->started) + completed++; if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; @@ -1063,8 +1075,8 @@ static void rcu_torture_timer(unsigned long unused) static int rcu_torture_reader(void *arg) { + unsigned long started; unsigned long completed; - unsigned long completed_end; int idx; DEFINE_TORTURE_RANDOM(rand); struct rcu_torture *p; @@ -1083,7 +1095,10 @@ rcu_torture_reader(void *arg) mod_timer(&t, jiffies + 1); } idx = cur_ops->readlock(); - completed = cur_ops->completed(); + if (cur_ops->started) + started = cur_ops->started(); + else + started = cur_ops->completed(); ts = rcu_trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || @@ -1104,14 +1119,16 @@ rcu_torture_reader(void *arg) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - completed_end = cur_ops->completed(); + completed = cur_ops->completed(); if (pipe_count > 1) { do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, - ts, completed, completed_end); + ts, started, completed); rcutorture_trace_dump(); } __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = completed_end - completed; + completed = completed - started; + if (cur_ops->started) + completed++; if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e26d78712e16..c0faad51ae87 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -315,7 +315,43 @@ static void force_quiescent_state(struct rcu_state *rsp); static int rcu_pending(void); /* - * Return the number of RCU-sched batches processed thus far for debug & stats. + * Return the number of RCU batches started thus far for debug & stats. + */ +unsigned long rcu_batches_started(void) +{ + return rcu_state_p->gpnum; +} +EXPORT_SYMBOL_GPL(rcu_batches_started); + +/* + * Return the number of RCU-sched batches started thus far for debug & stats. 
+ */ +unsigned long rcu_batches_started_sched(void) +{ + return rcu_sched_state.gpnum; +} +EXPORT_SYMBOL_GPL(rcu_batches_started_sched); + +/* + * Return the number of RCU BH batches started thus far for debug & stats. + */ +unsigned long rcu_batches_started_bh(void) +{ + return rcu_bh_state.gpnum; +} +EXPORT_SYMBOL_GPL(rcu_batches_started_bh); + +/* + * Return the number of RCU batches completed thus far for debug & stats. + */ +unsigned long rcu_batches_completed(void) +{ + return rcu_state_p->completed; +} +EXPORT_SYMBOL_GPL(rcu_batches_completed); + +/* + * Return the number of RCU-sched batches completed thus far for debug & stats. */ unsigned long rcu_batches_completed_sched(void) { @@ -324,7 +360,7 @@ unsigned long rcu_batches_completed_sched(void) EXPORT_SYMBOL_GPL(rcu_batches_completed_sched); /* - * Return the number of RCU BH batches processed thus far for debug & stats. + * Return the number of RCU BH batches completed thus far for debug & stats. */ unsigned long rcu_batches_completed_bh(void) { diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index f69300d4a51f..07e61a04de1d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -113,25 +113,6 @@ static void __init rcu_bootup_announce(void) rcu_bootup_announce_oddness(); } -/* - * Return the number of RCU-preempt batches processed thus far - * for debug and statistics. - */ -static unsigned long rcu_batches_completed_preempt(void) -{ - return rcu_preempt_state.completed; -} -EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt); - -/* - * Return the number of RCU batches processed thus far for debug & stats. - */ -unsigned long rcu_batches_completed(void) -{ - return rcu_batches_completed_preempt(); -} -EXPORT_SYMBOL_GPL(rcu_batches_completed); - /* * Record a preemptible-RCU quiescent state for the specified CPU. Note * that this just means that the task currently running on the CPU is @@ -932,15 +913,6 @@ static void __init rcu_bootup_announce(void) rcu_bootup_announce_oddness(); } -/* - * Return the number of RCU batches processed thus far for debug & stats. - */ -unsigned long rcu_batches_completed(void) -{ - return rcu_batches_completed_sched(); -} -EXPORT_SYMBOL_GPL(rcu_batches_completed); - /* * Because preemptible RCU does not exist, we never have to check for * CPUs being in quiescent states. -- cgit v1.2.3 From 31c89c959667194350f496947b576c149503ce98 Mon Sep 17 00:00:00 2001 From: Soren Brinkmann Date: Fri, 9 Jan 2015 07:43:45 -0800 Subject: pinctrl: pinconf-generic: Infer map type from DT property With the new 'groups' property, the DT parser can infer the map type from whether 'pins' or 'groups' is used to specify the pin group to work on. To maintain backwards compatibility with current usage of the DT binding, this is only done when PIN_MAP_TYPE_INVALID is passed to the parsing function as type. Also, a new helper 'pinconf_generic_dt_node_to_map_all()' is introduced, which can be used by drivers as a generic callback for dt_node_to_map() to leverage the new feature.
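To illustrate the intended use, a minimal sketch follows (a hypothetical "foo" driver, not part of this patch): with the inferring helper, a driver no longer has to choose between the _group and _pin flavours of the dt_node_to_map callback.

#include <linux/pinctrl/pinctrl.h>
#include <linux/pinctrl/pinconf-generic.h>
#include "pinctrl-utils.h"	/* driver-internal helper header */

/* Assumed driver-specific callbacks, declared elsewhere. */
int foo_get_groups_count(struct pinctrl_dev *pctldev);
const char *foo_get_group_name(struct pinctrl_dev *pctldev, unsigned selector);
int foo_get_group_pins(struct pinctrl_dev *pctldev, unsigned selector,
		       const unsigned **pins, unsigned *num_pins);

static const struct pinctrl_ops foo_pctrl_ops = {
	.get_groups_count = foo_get_groups_count,
	.get_group_name	= foo_get_group_name,
	.get_group_pins	= foo_get_group_pins,
	/* map type is inferred from "pins" vs "groups" in each subnode */
	.dt_node_to_map	= pinconf_generic_dt_node_to_map_all,
	.dt_free_map	= pinctrl_utils_dt_free_map,
};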
Changes since v2: - rename dt_pin_specifier to subnode_target_type - add additional comment in header file explaining passing an invalid map type - mention map_all() helper in commit message Changes since RFC v2: - none Signed-off-by: Soren Brinkmann Tested-by: Andreas Färber Signed-off-by: Linus Walleij --- drivers/pinctrl/pinconf-generic.c | 17 ++++++++++++++--- include/linux/pinctrl/pinconf-generic.h | 11 +++++++++++ 2 files changed, 25 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/pinctrl/pinconf-generic.c b/drivers/pinctrl/pinconf-generic.c index f78b416d7984..21b3d90ebb2d 100644 --- a/drivers/pinctrl/pinconf-generic.c +++ b/drivers/pinctrl/pinconf-generic.c @@ -264,6 +264,7 @@ int pinconf_generic_dt_subnode_to_map(struct pinctrl_dev *pctldev, unsigned reserve; struct property *prop; const char *group; + const char *subnode_target_type = "pins"; ret = of_property_read_string(np, "function", &function); if (ret < 0) { @@ -284,10 +285,20 @@ int pinconf_generic_dt_subnode_to_map(struct pinctrl_dev *pctldev, reserve++; if (num_configs) reserve++; + ret = of_property_count_strings(np, "pins"); if (ret < 0) { - dev_err(dev, "could not parse property pins\n"); - goto exit; + ret = of_property_count_strings(np, "groups"); + if (ret < 0) { + dev_err(dev, "could not parse property pins/groups\n"); + goto exit; + } + if (type == PIN_MAP_TYPE_INVALID) + type = PIN_MAP_TYPE_CONFIGS_GROUP; + subnode_target_type = "groups"; + } else { + if (type == PIN_MAP_TYPE_INVALID) + type = PIN_MAP_TYPE_CONFIGS_PIN; } reserve *= ret; @@ -296,7 +307,7 @@ int pinconf_generic_dt_subnode_to_map(struct pinctrl_dev *pctldev, if (ret < 0) goto exit; - of_property_for_each_string(np, "pins", prop, group) { if (function) { ret = pinctrl_utils_add_map_mux(pctldev, map, reserved_maps, num_maps, group, diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h index d578a60eff23..83c89f5ab705 100644 --- a/include/linux/pinctrl/pinconf-generic.h +++ b/include/linux/pinctrl/pinconf-generic.h @@ -174,6 +174,17 @@ static inline int pinconf_generic_dt_node_to_map_pin( PIN_MAP_TYPE_CONFIGS_PIN); } +static inline int pinconf_generic_dt_node_to_map_all( + struct pinctrl_dev *pctldev, struct device_node *np_config, + struct pinctrl_map **map, unsigned *num_maps) +{ + /* + * passing the type as PIN_MAP_TYPE_INVALID causes the underlying parser + * to infer the map type from the DT properties used. + */ + return pinconf_generic_dt_node_to_map(pctldev, np_config, map, num_maps, + PIN_MAP_TYPE_INVALID); +} #endif #endif /* CONFIG_GENERIC_PINCONF */ -- cgit v1.2.3 From dd4d01f7bad886c22687224bc7070b87de8deb51 Mon Sep 17 00:00:00 2001 From: Soren Brinkmann Date: Fri, 9 Jan 2015 07:43:46 -0800 Subject: pinctrl: pinconf-generic: Allow driver to specify DT params In addition to the generic DT parameters, allow drivers to provide driver-specific DT parameters to be used with the generic parser infrastructure. To achieve this, 'struct pinctrl_desc' is extended to pass custom pinconf options to the core. In order to pass this kind of information, the related data structures - 'struct pinconf_generic_dt_params', 'pin_config_item' - are moved from pinconf internals to the pinconf-generic header.
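A hedged sketch of how a driver could use these hooks (the "foo" names and the "foo,drive-mode" property are invented for illustration; the custom parameter value is simply placed beyond PIN_CONFIG_END so it cannot collide with the generic ones):

#include <linux/kernel.h>
#include <linux/pinctrl/pinctrl.h>
#include <linux/pinctrl/pinconf-generic.h>

/* Invented vendor-specific parameter, outside the generic enum range. */
#define FOO_PIN_CONFIG_DRIVE_MODE ((enum pin_config_param)(PIN_CONFIG_END + 1))

static const struct pinconf_generic_dt_params foo_dt_params[] = {
	{ "foo,drive-mode", FOO_PIN_CONFIG_DRIVE_MODE, 0 },
};

#ifdef CONFIG_DEBUG_FS
static const struct pin_config_item foo_conf_items[] = {
	PCONFDUMP(FOO_PIN_CONFIG_DRIVE_MODE, "drive mode", "mode", true),
};
#endif

static struct pinctrl_desc foo_desc = {
	.name = "foo-pinctrl",
	/* pins, npins, pctlops, pmxops, confops elided */
#if defined(CONFIG_GENERIC_PINCONF) && defined(CONFIG_OF)
	.num_dt_params = ARRAY_SIZE(foo_dt_params),
	.params = foo_dt_params,
#ifdef CONFIG_DEBUG_FS
	.conf_items = foo_conf_items,
#endif
#endif
};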
Additionally, pinconf-generic is refactored to iterate not only over the generic pinconf parameters but also over those provided through the driver's 'struct pinctrl_desc'. In particular, the 'pinconf_generic_parse_dt_config()' and 'pinconf_generic_dump' helpers are each split into two parts, so that a more generic helper can process the generic parameters as well as the driver-specific ones. v2: - fix typo - add missing documentation for @conf_items member in struct - rebase to pinctrl/devel: conflict in abx500 - rename _pinconf_generic_dump() to pinconf_generic_dump_one() - removed '_' from _parse_dt_cfg() - removed BUG_ONs, error condition is handled in if statements - removed pinconf_generic_dump_group() & pinconf_generic_dump_pin helpers - fixed up corresponding call sites - renamed pinconf_generic_dump() to pinconf_generic_dump_pins() - added kernel-doc to pinconf_generic_dump_pins() - add kernel-doc - more verbose commit message Signed-off-by: Soren Brinkmann Tested-by: Andreas Färber Signed-off-by: Linus Walleij --- drivers/pinctrl/nomadik/pinctrl-abx500.c | 2 +- drivers/pinctrl/pinconf-generic.c | 182 +++++++++++++++++-------------- drivers/pinctrl/pinconf.c | 4 +- drivers/pinctrl/pinconf.h | 22 ++-- drivers/pinctrl/pinctrl-rockchip.c | 2 +- drivers/pinctrl/pinctrl-tz1090-pdc.c | 2 +- drivers/pinctrl/pinctrl-tz1090.c | 2 +- drivers/pinctrl/sh-pfc/pinctrl.c | 2 +- include/linux/pinctrl/pinconf-generic.h | 18 +++ include/linux/pinctrl/pinctrl.h | 9 ++ 10 files changed, 141 insertions(+), 104 deletions(-) (limited to 'include/linux') diff --git a/drivers/pinctrl/nomadik/pinctrl-abx500.c b/drivers/pinctrl/nomadik/pinctrl-abx500.c index 3d6d97228523..1806b24faa14 100644 --- a/drivers/pinctrl/nomadik/pinctrl-abx500.c +++ b/drivers/pinctrl/nomadik/pinctrl-abx500.c @@ -914,7 +914,7 @@ static int abx500_dt_subnode_to_map(struct pinctrl_dev *pctldev, } } - ret = pinconf_generic_parse_dt_config(np, &configs, &nconfigs); + ret = pinconf_generic_parse_dt_config(np, pctldev, &configs, &nconfigs); if (nconfigs) { const char *gpio_name; const char *pin; diff --git a/drivers/pinctrl/pinconf-generic.c b/drivers/pinctrl/pinconf-generic.c index 21b3d90ebb2d..e0886665b70a 100644 --- a/drivers/pinctrl/pinconf-generic.c +++ b/drivers/pinctrl/pinconf-generic.c @@ -27,17 +27,6 @@ #include "pinctrl-utils.h" #ifdef CONFIG_DEBUG_FS - -struct pin_config_item { - const enum pin_config_param param; - const char * const display; - const char * const format; - bool has_arg; -}; - -#define PCONFDUMP(a, b, c, d) { .param = a, .display = b, .format = c, \ - .has_arg = d } - static const struct pin_config_item conf_items[] = { PCONFDUMP(PIN_CONFIG_BIAS_DISABLE, "input bias disabled", NULL, false), PCONFDUMP(PIN_CONFIG_BIAS_HIGH_IMPEDANCE, "input bias high impedance", NULL, false), @@ -60,22 +49,25 @@ static const struct pin_config_item conf_items[] = { PCONFDUMP(PIN_CONFIG_OUTPUT, "pin output", "level", true), }; -void pinconf_generic_dump_pin(struct pinctrl_dev *pctldev, - struct seq_file *s, unsigned pin) +static void pinconf_generic_dump_one(struct pinctrl_dev *pctldev, + struct seq_file *s, const char *gname, + unsigned pin, + const struct pin_config_item *items, + int nitems) { - const struct pinconf_ops *ops = pctldev->desc->confops; int i; - if (!ops->is_generic) - return; - - for (i = 0; i < ARRAY_SIZE(conf_items); i++) { + for (i = 0; i < nitems; i++) { unsigned long config; int ret; /* We want to check out this parameter */ - config =
pinconf_to_config_packed(conf_items[i].param, 0); - ret = pin_config_get_for_pin(pctldev, pin, &config); + config = pinconf_to_config_packed(items[i].param, 0); + if (gname) + ret = pin_config_group_get(dev_name(pctldev->dev), + gname, &config); + else + ret = pin_config_get_for_pin(pctldev, pin, &config); /* These are legal errors */ if (ret == -EINVAL || ret == -ENOTSUPP) continue; @@ -85,56 +77,46 @@ void pinconf_generic_dump_pin(struct pinctrl_dev *pctldev, } /* Space between multiple configs */ seq_puts(s, " "); - seq_puts(s, conf_items[i].display); + seq_puts(s, items[i].display); /* Print unit if available */ - if (conf_items[i].has_arg) { + if (items[i].has_arg) { seq_printf(s, " (%u", pinconf_to_config_argument(config)); - if (conf_items[i].format) - seq_printf(s, " %s)", conf_items[i].format); + if (items[i].format) + seq_printf(s, " %s)", items[i].format); else seq_puts(s, ")"); } } } -void pinconf_generic_dump_group(struct pinctrl_dev *pctldev, - struct seq_file *s, const char *gname) +/** + * pinconf_generic_dump_pins - Print information about pin or group of pins + * @pctldev: Pincontrol device + * @s: File to print to + * @gname: Group name specifying pins + * @pin: Pin number specyfying pin + * + * Print the pinconf configuration for the requested pin(s) to @s. Pins can be + * specified either by pin using @pin or by group using @gname. Only one needs + * to be specified the other can be NULL/0. + */ +void pinconf_generic_dump_pins(struct pinctrl_dev *pctldev, struct seq_file *s, + const char *gname, unsigned pin) { const struct pinconf_ops *ops = pctldev->desc->confops; - int i; if (!ops->is_generic) return; - for (i = 0; i < ARRAY_SIZE(conf_items); i++) { - unsigned long config; - int ret; - - /* We want to check out this parameter */ - config = pinconf_to_config_packed(conf_items[i].param, 0); - ret = pin_config_group_get(dev_name(pctldev->dev), gname, - &config); - /* These are legal errors */ - if (ret == -EINVAL || ret == -ENOTSUPP) - continue; - if (ret) { - seq_printf(s, "ERROR READING CONFIG SETTING %d ", i); - continue; - } - /* Space between multiple configs */ - seq_puts(s, " "); - seq_puts(s, conf_items[i].display); - /* Print unit if available */ - if (conf_items[i].has_arg) { - seq_printf(s, " (%u", - pinconf_to_config_argument(config)); - if (conf_items[i].format) - seq_printf(s, " %s)", conf_items[i].format); - else - seq_puts(s, ")"); - } - } + /* generic parameters */ + pinconf_generic_dump_one(pctldev, s, gname, pin, conf_items, + ARRAY_SIZE(conf_items)); + /* driver-specific parameters */ + if (pctldev->desc->num_dt_params && pctldev->desc->conf_items) + pinconf_generic_dump_one(pctldev, s, gname, pin, + pctldev->desc->conf_items, + pctldev->desc->num_dt_params); } void pinconf_generic_dump_config(struct pinctrl_dev *pctldev, @@ -148,17 +130,21 @@ void pinconf_generic_dump_config(struct pinctrl_dev *pctldev, seq_printf(s, "%s: 0x%x", conf_items[i].display, pinconf_to_config_argument(config)); } + + if (!pctldev->desc->num_dt_params || !pctldev->desc->conf_items) + return; + + for (i = 0; i < pctldev->desc->num_dt_params; i++) { + if (pinconf_to_config_param(config) != pctldev->desc->conf_items[i].param) + continue; + seq_printf(s, "%s: 0x%x", pctldev->desc->conf_items[i].display, + pinconf_to_config_argument(config)); + } } EXPORT_SYMBOL_GPL(pinconf_generic_dump_config); #endif #ifdef CONFIG_OF -struct pinconf_generic_dt_params { - const char * const property; - enum pin_config_param param; - u32 default_value; -}; - static const struct 
pinconf_generic_dt_params dt_params[] = { { "bias-disable", PIN_CONFIG_BIAS_DISABLE, 0 }, { "bias-high-impedance", PIN_CONFIG_BIAS_HIGH_IMPEDANCE, 0 }, @@ -183,6 +169,47 @@ static const struct pinconf_generic_dt_params dt_params[] = { { "slew-rate", PIN_CONFIG_SLEW_RATE, 0}, }; +/** + * parse_dt_cfg - Parse DT pinconf parameters + * @np: DT node + * @params: Array of describing DT parameters + * @count: Number of entries in @params + * @cfg: Array of parsed config options + * @ncfg: Number of entries in @cfg + * + * Parse the config options described in @params from @np and puts the result + * in @cfg. @cfg does not need to be empty, entries are added beggining at + * @ncfg. @ncfg is updated to reflect the number of entries after parsing. @cfg + * needs to have enough memory allocated to hold all possible entries. + */ +static void parse_dt_cfg(struct device_node *np, + const struct pinconf_generic_dt_params *params, + unsigned int count, unsigned long *cfg, + unsigned int *ncfg) +{ + int i; + + for (i = 0; i < count; i++) { + u32 val; + int ret; + const struct pinconf_generic_dt_params *par = &params[i]; + + ret = of_property_read_u32(np, par->property, &val); + + /* property not found */ + if (ret == -EINVAL) + continue; + + /* use default value, when no value is specified */ + if (ret) + val = par->default_value; + + pr_debug("found %s with value %u\n", par->property, val); + cfg[*ncfg] = pinconf_to_config_packed(par->param, val); + (*ncfg)++; + } +} + /** * pinconf_generic_parse_dt_config() * parse the config properties into generic pinconfig values. @@ -191,39 +218,29 @@ static const struct pinconf_generic_dt_params dt_params[] = { * @nconfigs: umber of configurations */ int pinconf_generic_parse_dt_config(struct device_node *np, + struct pinctrl_dev *pctldev, unsigned long **configs, unsigned int *nconfigs) { unsigned long *cfg; - unsigned int ncfg = 0; + unsigned int max_cfg, ncfg = 0; int ret; - int i; - u32 val; if (!np) return -EINVAL; /* allocate a temporary array big enough to hold one of each option */ - cfg = kzalloc(sizeof(*cfg) * ARRAY_SIZE(dt_params), GFP_KERNEL); + max_cfg = ARRAY_SIZE(dt_params); + if (pctldev) + max_cfg += pctldev->desc->num_dt_params; + cfg = kcalloc(max_cfg, sizeof(*cfg), GFP_KERNEL); if (!cfg) return -ENOMEM; - for (i = 0; i < ARRAY_SIZE(dt_params); i++) { - const struct pinconf_generic_dt_params *par = &dt_params[i]; - ret = of_property_read_u32(np, par->property, &val); - - /* property not found */ - if (ret == -EINVAL) - continue; - - /* use default value, when no value is specified */ - if (ret) - val = par->default_value; - - pr_debug("found %s with value %u\n", par->property, val); - cfg[ncfg] = pinconf_to_config_packed(par->param, val); - ncfg++; - } + parse_dt_cfg(np, dt_params, ARRAY_SIZE(dt_params), cfg, &ncfg); + if (pctldev && pctldev->desc->num_dt_params && pctldev->desc->params) + parse_dt_cfg(np, pctldev->desc->params, + pctldev->desc->num_dt_params, cfg, &ncfg); ret = 0; @@ -274,7 +291,8 @@ int pinconf_generic_dt_subnode_to_map(struct pinctrl_dev *pctldev, function = NULL; } - ret = pinconf_generic_parse_dt_config(np, &configs, &num_configs); + ret = pinconf_generic_parse_dt_config(np, pctldev, &configs, + &num_configs); if (ret < 0) { dev_err(dev, "could not parse node property\n"); return ret; diff --git a/drivers/pinctrl/pinconf.c b/drivers/pinctrl/pinconf.c index 8bfa0643e5dc..1fc09dc20199 100644 --- a/drivers/pinctrl/pinconf.c +++ b/drivers/pinctrl/pinconf.c @@ -288,7 +288,7 @@ static void pinconf_dump_pin(struct pinctrl_dev
*pctldev, const struct pinconf_ops *ops = pctldev->desc->confops; /* no-op when not using generic pin config */ - pinconf_generic_dump_pin(pctldev, s, pin); + pinconf_generic_dump_pins(pctldev, s, NULL, pin); if (ops && ops->pin_config_dbg_show) ops->pin_config_dbg_show(pctldev, s, pin); } @@ -333,7 +333,7 @@ static void pinconf_dump_group(struct pinctrl_dev *pctldev, const struct pinconf_ops *ops = pctldev->desc->confops; /* no-op when not using generic pin config */ - pinconf_generic_dump_group(pctldev, s, gname); + pinconf_generic_dump_pins(pctldev, s, gname, 0); if (ops && ops->pin_config_group_dbg_show) ops->pin_config_group_dbg_show(pctldev, s, selector); } diff --git a/drivers/pinctrl/pinconf.h b/drivers/pinctrl/pinconf.h index a4a5417e1413..55c75780b3b2 100644 --- a/drivers/pinctrl/pinconf.h +++ b/drivers/pinctrl/pinconf.h @@ -92,26 +92,17 @@ static inline void pinconf_init_device_debugfs(struct dentry *devroot, #if defined(CONFIG_GENERIC_PINCONF) && defined(CONFIG_DEBUG_FS) -void pinconf_generic_dump_pin(struct pinctrl_dev *pctldev, - struct seq_file *s, unsigned pin); - -void pinconf_generic_dump_group(struct pinctrl_dev *pctldev, - struct seq_file *s, const char *gname); +void pinconf_generic_dump_pins(struct pinctrl_dev *pctldev, + struct seq_file *s, const char *gname, + unsigned pin); void pinconf_generic_dump_config(struct pinctrl_dev *pctldev, struct seq_file *s, unsigned long config); #else -static inline void pinconf_generic_dump_pin(struct pinctrl_dev *pctldev, - struct seq_file *s, - unsigned pin) -{ - return; -} - -static inline void pinconf_generic_dump_group(struct pinctrl_dev *pctldev, - struct seq_file *s, - const char *gname) +static inline void pinconf_generic_dump_pins(struct pinctrl_dev *pctldev, + struct seq_file *s, + const char *gname, unsigned pin) { return; } @@ -126,6 +117,7 @@ static inline void pinconf_generic_dump_config(struct pinctrl_dev *pctldev, #if defined(CONFIG_GENERIC_PINCONF) && defined(CONFIG_OF) int pinconf_generic_parse_dt_config(struct device_node *np, + struct pinctrl_dev *pctldev, unsigned long **configs, unsigned int *nconfigs); #endif diff --git a/drivers/pinctrl/pinctrl-rockchip.c b/drivers/pinctrl/pinctrl-rockchip.c index ba74f0aa60c7..7625f333ab07 100644 --- a/drivers/pinctrl/pinctrl-rockchip.c +++ b/drivers/pinctrl/pinctrl-rockchip.c @@ -1140,7 +1140,7 @@ static int rockchip_pinctrl_parse_groups(struct device_node *np, return -EINVAL; np_config = of_find_node_by_phandle(be32_to_cpup(phandle)); - ret = pinconf_generic_parse_dt_config(np_config, + ret = pinconf_generic_parse_dt_config(np_config, NULL, &grp->data[j].configs, &grp->data[j].nconfigs); if (ret) return ret; diff --git a/drivers/pinctrl/pinctrl-tz1090-pdc.c b/drivers/pinctrl/pinctrl-tz1090-pdc.c index 146e48a9b839..fab6aafa6a9f 100644 --- a/drivers/pinctrl/pinctrl-tz1090-pdc.c +++ b/drivers/pinctrl/pinctrl-tz1090-pdc.c @@ -415,7 +415,7 @@ static int tz1090_pdc_pinctrl_dt_subnode_to_map(struct device *dev, function = NULL; } - ret = pinconf_generic_parse_dt_config(np, &configs, &num_configs); + ret = pinconf_generic_parse_dt_config(np, NULL, &configs, &num_configs); if (ret) return ret; diff --git a/drivers/pinctrl/pinctrl-tz1090.c b/drivers/pinctrl/pinctrl-tz1090.c index df8cb1e5b7b4..8bd73075f9dd 100644 --- a/drivers/pinctrl/pinctrl-tz1090.c +++ b/drivers/pinctrl/pinctrl-tz1090.c @@ -1131,7 +1131,7 @@ static int tz1090_pinctrl_dt_subnode_to_map(struct device *dev, function = NULL; } - ret = pinconf_generic_parse_dt_config(np, &configs, &num_configs); + ret = 
pinconf_generic_parse_dt_config(np, NULL, &configs, &num_configs); if (ret) return ret; diff --git a/drivers/pinctrl/sh-pfc/pinctrl.c b/drivers/pinctrl/sh-pfc/pinctrl.c index 910deaefa0ac..072e7c62cab7 100644 --- a/drivers/pinctrl/sh-pfc/pinctrl.c +++ b/drivers/pinctrl/sh-pfc/pinctrl.c @@ -122,7 +122,7 @@ static int sh_pfc_dt_subnode_to_map(struct device *dev, struct device_node *np, return ret; } - ret = pinconf_generic_parse_dt_config(np, &configs, &num_configs); + ret = pinconf_generic_parse_dt_config(np, NULL, &configs, &num_configs); if (ret < 0) return ret; diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h index 83c89f5ab705..342409f7f3ec 100644 --- a/include/linux/pinctrl/pinconf-generic.h +++ b/include/linux/pinctrl/pinconf-generic.h @@ -115,6 +115,18 @@ enum pin_config_param { PIN_CONFIG_END = 0x7FFF, }; +#ifdef CONFIG_DEBUG_FS +#define PCONFDUMP(a, b, c, d) { .param = a, .display = b, .format = c, \ + .has_arg = d } + +struct pin_config_item { + const enum pin_config_param param; + const char * const display; + const char * const format; + bool has_arg; +}; +#endif /* CONFIG_DEBUG_FS */ + /* * Helpful configuration macro to be used in tables etc. */ @@ -150,6 +162,12 @@ static inline unsigned long pinconf_to_config_packed(enum pin_config_param param struct pinctrl_dev; struct pinctrl_map; +struct pinconf_generic_dt_params { + const char * const property; + enum pin_config_param param; + u32 default_value; +}; + int pinconf_generic_dt_subnode_to_map(struct pinctrl_dev *pctldev, struct device_node *np, struct pinctrl_map **map, unsigned *reserved_maps, unsigned *num_maps, diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h index cc8e1aff0e28..c58b3e11ba8e 100644 --- a/include/linux/pinctrl/pinctrl.h +++ b/include/linux/pinctrl/pinctrl.h @@ -24,6 +24,7 @@ struct pinctrl_dev; struct pinctrl_map; struct pinmux_ops; struct pinconf_ops; +struct pin_config_item; struct gpio_chip; struct device_node; @@ -117,6 +118,9 @@ struct pinctrl_ops { * @confops: pin config operations vtable, if you support pin configuration in * your driver * @owner: module providing the pin controller, used for refcounting + * @num_dt_params: Number of driver-specific DT parameters + * @params: List of DT parameters + * @conf_items: Information how to print @params in debugfs */ struct pinctrl_desc { const char *name; @@ -126,6 +130,11 @@ struct pinctrl_desc { const struct pinmux_ops *pmxops; const struct pinconf_ops *confops; struct module *owner; +#if defined(CONFIG_GENERIC_PINCONF) && defined(CONFIG_OF) + unsigned int num_dt_params; + const struct pinconf_generic_dt_params *params; + const struct pin_config_item *conf_items; +#endif }; /* External interface to pin controller */ -- cgit v1.2.3 From 7bb68410ef22067b08fd52887875b8f337f89dcc Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 18 Oct 2014 15:04:15 +0200 Subject: efi: split off remapping code from efi_config_init() Split off the remapping code from efi_config_init() so that the caller can perform its own remapping. This is necessary to correctly handle virtually remapped UEFI memory regions under kexec, as efi.systab will have been updated to a virtual address.
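Roughly, the resulting calling pattern is as follows — a fragment mirroring what efi_config_init() itself now does internally (the enclosing int-returning function and arch_tables are assumed; a kexec-time caller would substitute its own mapping of the already-virtual table array for the early_memremap()/early_memunmap() pair):

/* Fragment: callers now own the mapping around the parse step. */
void *tbls;
int sz, ret;

sz = efi_enabled(EFI_64BIT) ? sizeof(efi_config_table_64_t)
			    : sizeof(efi_config_table_32_t);
tbls = early_memremap(efi.systab->tables, efi.systab->nr_tables * sz);
if (!tbls)
	return -ENOMEM;
ret = efi_config_parse_tables(tbls, efi.systab->nr_tables, sz, arch_tables);
early_memunmap(tbls, efi.systab->nr_tables * sz);
return ret;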
Acked-by: Matt Fleming Tested-by: Leif Lindholm Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/efi.c | 56 ++++++++++++++++++++++++++-------------------- include/linux/efi.h | 2 ++ 2 files changed, 34 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 9035c1b74d58..b7ba9d8ec4b0 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -293,29 +293,15 @@ static __init int match_config_table(efi_guid_t *guid, return 0; } -int __init efi_config_init(efi_config_table_type_t *arch_tables) +int __init efi_config_parse_tables(void *config_tables, int count, int sz, + efi_config_table_type_t *arch_tables) { - void *config_tables, *tablep; - int i, sz; - - if (efi_enabled(EFI_64BIT)) - sz = sizeof(efi_config_table_64_t); - else - sz = sizeof(efi_config_table_32_t); - - /* - * Let's see what config tables the firmware passed to us. - */ - config_tables = early_memremap(efi.systab->tables, - efi.systab->nr_tables * sz); - if (config_tables == NULL) { - pr_err("Could not map Configuration table!\n"); - return -ENOMEM; - } + void *tablep; + int i; tablep = config_tables; pr_info(""); - for (i = 0; i < efi.systab->nr_tables; i++) { + for (i = 0; i < count; i++) { efi_guid_t guid; unsigned long table; @@ -328,8 +314,6 @@ int __init efi_config_init(efi_config_table_type_t *arch_tables) if (table64 >> 32) { pr_cont("\n"); pr_err("Table located above 4GB, disabling EFI.\n"); - early_memunmap(config_tables, - efi.systab->nr_tables * sz); return -EINVAL; } #endif @@ -344,13 +328,37 @@ int __init efi_config_init(efi_config_table_type_t *arch_tables) tablep += sz; } pr_cont("\n"); - early_memunmap(config_tables, efi.systab->nr_tables * sz); - set_bit(EFI_CONFIG_TABLES, &efi.flags); - return 0; } +int __init efi_config_init(efi_config_table_type_t *arch_tables) +{ + void *config_tables; + int sz, ret; + + if (efi_enabled(EFI_64BIT)) + sz = sizeof(efi_config_table_64_t); + else + sz = sizeof(efi_config_table_32_t); + + /* + * Let's see what config tables the firmware passed to us. + */ + config_tables = early_memremap(efi.systab->tables, + efi.systab->nr_tables * sz); + if (config_tables == NULL) { + pr_err("Could not map Configuration table!\n"); + return -ENOMEM; + } + + ret = efi_config_parse_tables(config_tables, efi.systab->nr_tables, sz, + arch_tables); + + early_memunmap(config_tables, efi.systab->nr_tables * sz); + return ret; +} + #ifdef CONFIG_EFI_VARS_MODULE static int __init efi_load_efivars(void) { diff --git a/include/linux/efi.h b/include/linux/efi.h index 0238d612750e..5ffe5115951f 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -875,6 +875,8 @@ static inline efi_status_t efi_query_variable_store(u32 attributes, unsigned lon #endif extern void __iomem *efi_lookup_mapped_addr(u64 phys_addr); extern int efi_config_init(efi_config_table_type_t *arch_tables); +extern int efi_config_parse_tables(void *config_tables, int count, int sz, + efi_config_table_type_t *arch_tables); extern u64 efi_get_iobase (void); extern u32 efi_mem_type (unsigned long phys_addr); extern u64 efi_mem_attributes (unsigned long phys_addr); -- cgit v1.2.3 From 39db74ce1aa83626a0a70ed2abf29a17598fff49 Mon Sep 17 00:00:00 2001 From: Tomas Winkler Date: Thu, 27 Nov 2014 14:07:28 +0200 Subject: mei: bus: use ssize_t as the return type for send and receive The MEI bus receive and send functions may return either the number of transmitted bytes or an errno.
It is better to use the ssize_t type for that purpose than to mix size_t with int. Signed-off-by: Tomas Winkler Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mei/bus.c | 30 ++++++++++++++---------------- drivers/misc/mei/mei_dev.h | 6 +++--- include/linux/mei_cl_bus.h | 4 ++-- 3 files changed, 19 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/misc/mei/bus.c b/drivers/misc/mei/bus.c index b3a72bca5242..31164dd14bd0 100644 --- a/drivers/misc/mei/bus.c +++ b/drivers/misc/mei/bus.c @@ -224,13 +224,13 @@ void mei_cl_driver_unregister(struct mei_cl_driver *driver) } EXPORT_SYMBOL_GPL(mei_cl_driver_unregister); -static int ___mei_cl_send(struct mei_cl *cl, u8 *buf, size_t length, +static ssize_t ___mei_cl_send(struct mei_cl *cl, u8 *buf, size_t length, bool blocking) { struct mei_device *dev; struct mei_me_client *me_cl; struct mei_cl_cb *cb; - int rets; + ssize_t rets; if (WARN_ON(!cl || !cl->dev)) return -ENODEV; @@ -271,12 +271,12 @@ static int ___mei_cl_send(struct mei_cl *cl, u8 *buf, size_t length, return rets; } -int __mei_cl_recv(struct mei_cl *cl, u8 *buf, size_t length) +ssize_t __mei_cl_recv(struct mei_cl *cl, u8 *buf, size_t length) { struct mei_device *dev; struct mei_cl_cb *cb; size_t r_length; - int err; + ssize_t rets; if (WARN_ON(!cl || !cl->dev)) return -ENODEV; @@ -286,11 +286,9 @@ int __mei_cl_recv(struct mei_cl *cl, u8 *buf, size_t length) mutex_lock(&dev->device_lock); if (!cl->read_cb) { - err = mei_cl_read_start(cl, length); - if (err < 0) { - mutex_unlock(&dev->device_lock); - return err; - } + rets = mei_cl_read_start(cl, length); + if (rets < 0) + goto out; } if (cl->reading_state != MEI_READ_COMPLETE && @@ -313,13 +311,13 @@ int __mei_cl_recv(struct mei_cl *cl, u8 *buf, size_t length) cb = cl->read_cb; if (cl->reading_state != MEI_READ_COMPLETE) { - r_length = 0; + rets = 0; goto out; } r_length = min_t(size_t, length, cb->buf_idx); - memcpy(buf, cb->response_buffer.data, r_length); + rets = r_length; mei_io_cb_free(cb); cl->reading_state = MEI_IDLE; @@ -328,20 +326,20 @@ int __mei_cl_recv(struct mei_cl *cl, u8 *buf, size_t length) out: mutex_unlock(&dev->device_lock); - return r_length; + return rets; } -inline int __mei_cl_async_send(struct mei_cl *cl, u8 *buf, size_t length) +inline ssize_t __mei_cl_async_send(struct mei_cl *cl, u8 *buf, size_t length) { return ___mei_cl_send(cl, buf, length, 0); } -inline int __mei_cl_send(struct mei_cl *cl, u8 *buf, size_t length) +inline ssize_t __mei_cl_send(struct mei_cl *cl, u8 *buf, size_t length) { return ___mei_cl_send(cl, buf, length, 1); } -int mei_cl_send(struct mei_cl_device *device, u8 *buf, size_t length) +ssize_t mei_cl_send(struct mei_cl_device *device, u8 *buf, size_t length) { struct mei_cl *cl = device->cl; @@ -355,7 +353,7 @@ int mei_cl_send(struct mei_cl_device *device, u8 *buf, size_t length) } EXPORT_SYMBOL_GPL(mei_cl_send); -int mei_cl_recv(struct mei_cl_device *device, u8 *buf, size_t length) +ssize_t mei_cl_recv(struct mei_cl_device *device, u8 *buf, size_t length) { struct mei_cl *cl = device->cl; diff --git a/drivers/misc/mei/mei_dev.h b/drivers/misc/mei/mei_dev.h index 3dad74a8d496..7f350af2ee10 100644 --- a/drivers/misc/mei/mei_dev.h +++ b/drivers/misc/mei/mei_dev.h @@ -345,9 +345,9 @@ struct mei_cl_device *mei_cl_add_device(struct mei_device *dev, struct mei_cl_ops *ops); void mei_cl_remove_device(struct mei_cl_device *device); -int __mei_cl_async_send(struct mei_cl *cl, u8 *buf, size_t length); -int __mei_cl_send(struct mei_cl *cl, u8 *buf, size_t length); -int
__mei_cl_recv(struct mei_cl *cl, u8 *buf, size_t length); +ssize_t __mei_cl_async_send(struct mei_cl *cl, u8 *buf, size_t length); +ssize_t __mei_cl_send(struct mei_cl *cl, u8 *buf, size_t length); +ssize_t __mei_cl_recv(struct mei_cl *cl, u8 *buf, size_t length); void mei_cl_bus_rx_event(struct mei_cl *cl); void mei_cl_bus_remove_devices(struct mei_device *dev); int mei_cl_bus_init(void); diff --git a/include/linux/mei_cl_bus.h b/include/linux/mei_cl_bus.h index 164aad1f9f12..0819d36a3a74 100644 --- a/include/linux/mei_cl_bus.h +++ b/include/linux/mei_cl_bus.h @@ -25,8 +25,8 @@ int __mei_cl_driver_register(struct mei_cl_driver *driver, void mei_cl_driver_unregister(struct mei_cl_driver *driver); -int mei_cl_send(struct mei_cl_device *device, u8 *buf, size_t length); -int mei_cl_recv(struct mei_cl_device *device, u8 *buf, size_t length); +ssize_t mei_cl_send(struct mei_cl_device *device, u8 *buf, size_t length); +ssize_t mei_cl_recv(struct mei_cl_device *device, u8 *buf, size_t length); typedef void (*mei_cl_event_cb_t)(struct mei_cl_device *device, u32 events, void *context); -- cgit v1.2.3 From 46d0d33350e9b32642d745a8b46a954910196b4d Mon Sep 17 00:00:00 2001 From: Gigi Joseph Date: Fri, 9 Jan 2015 03:45:02 +0000 Subject: ti-st: add device tree support When using device tree, driver configuration data needs to be read from the device node. Add support for getting the platform data from the device tree information stored in the .dtb file, in case it exists. Signed-off-by: Eyal Reizer Signed-off-by: bvijay Signed-off-by: Gigi Joseph Signed-off-by: Greg Kroah-Hartman --- drivers/misc/ti-st/st_kim.c | 97 ++++++++++++++++++++++++++++++++++++++++---- drivers/misc/ti-st/st_ll.c | 17 +++++++- include/linux/ti_wilink_st.h | 1 + 3 files changed, 105 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/misc/ti-st/st_kim.c b/drivers/misc/ti-st/st_kim.c index e4b7ee4f57b8..68a0b582d81a 100644 --- a/drivers/misc/ti-st/st_kim.c +++ b/drivers/misc/ti-st/st_kim.c @@ -36,7 +36,8 @@ #include #include #include - +#include +#include #define MAX_ST_DEVICES 3 /* Imagine 1 on each UART for now */ static struct platform_device *st_kim_devices[MAX_ST_DEVICES]; @@ -44,6 +45,9 @@ static struct platform_device *st_kim_devices[MAX_ST_DEVICES]; /**********************************************************************/ /* internal functions */ +struct ti_st_plat_data *dt_pdata; +static struct ti_st_plat_data *get_platform_data(struct device *dev); + /** * st_get_plat_device - * function which returns the reference to the platform device @@ -462,7 +466,12 @@ long st_kim_start(void *kim_data) struct kim_data_s *kim_gdata = (struct kim_data_s *)kim_data; pr_info(" %s", __func__); - pdata = kim_gdata->kim_pdev->dev.platform_data; + if (kim_gdata->kim_pdev->dev.of_node) { + pr_debug("use device tree data"); + pdata = dt_pdata; + } else { + pdata = kim_gdata->kim_pdev->dev.platform_data; + } do { /* platform specific enabling code here */ @@ -522,12 +531,18 @@ long st_kim_stop(void *kim_data) { long err = 0; struct kim_data_s *kim_gdata = (struct kim_data_s *)kim_data; - struct ti_st_plat_data *pdata = - kim_gdata->kim_pdev->dev.platform_data; struct tty_struct *tty = kim_gdata->core_data->tty; reinit_completion(&kim_gdata->ldisc_installed); + if (kim_gdata->kim_pdev->dev.of_node) { + pr_debug("use device tree data"); + pdata = dt_pdata; + } else + pdata = kim_gdata->kim_pdev->dev.platform_data; + + if (tty) { /* can be
called before ldisc is installed */ /* Flush any pending characters in the driver and discipline. */ tty_ldisc_flush(tty); @@ -715,13 +730,53 @@ static const struct file_operations list_debugfs_fops = { * board-*.c file */ +static const struct of_device_id kim_of_match[] = { +{ + .compatible = "kim", + }, + {} +}; +MODULE_DEVICE_TABLE(of, kim_of_match); + +static struct ti_st_plat_data *get_platform_data(struct device *dev) +{ + struct device_node *np = dev->of_node; + const u32 *dt_property; + int len; + + dt_pdata = kzalloc(sizeof(*dt_pdata), GFP_KERNEL); + + if (!dt_pdata) + pr_err("Can't allocate device_tree platform data\n"); + + dt_property = of_get_property(np, "dev_name", &len); + if (dt_property) + memcpy(&dt_pdata->dev_name, dt_property, len); + of_property_read_u32(np, "nshutdown_gpio", + (u32 *)&dt_pdata->nshutdown_gpio); + of_property_read_u32(np, "flow_cntrl", (u32 *)&dt_pdata->flow_cntrl); + of_property_read_u32(np, "baud_rate", (u32 *)&dt_pdata->baud_rate); + + return dt_pdata; +} + static struct dentry *kim_debugfs_dir; static int kim_probe(struct platform_device *pdev) { struct kim_data_s *kim_gdata; - struct ti_st_plat_data *pdata = pdev->dev.platform_data; + struct ti_st_plat_data *pdata; int err; + if (pdev->dev.of_node) + pdata = get_platform_data(&pdev->dev); + else + pdata = pdev->dev.platform_data; + + if (pdata == NULL) { + dev_err(&pdev->dev, "Platform Data is missing\n"); + return -ENXIO; + } + if ((pdev->id != -1) && (pdev->id < MAX_ST_DEVICES)) { /* multiple devices could exist */ st_kim_devices[pdev->id] = pdev; @@ -806,9 +861,16 @@ err_core_init: static int kim_remove(struct platform_device *pdev) { /* free the GPIOs requested */ - struct ti_st_plat_data *pdata = pdev->dev.platform_data; + struct ti_st_plat_data *pdata; struct kim_data_s *kim_gdata; + if (pdev->dev.of_node) { + pr_debug("use device tree data"); + pdata = dt_pdata; + } else { + pdata = pdev->dev.platform_data; + } + kim_gdata = platform_get_drvdata(pdev); /* Free the Bluetooth/FM/GPIO @@ -826,12 +888,22 @@ static int kim_remove(struct platform_device *pdev) kfree(kim_gdata); kim_gdata = NULL; + kfree(dt_pdata); + dt_pdata = NULL; + return 0; } static int kim_suspend(struct platform_device *pdev, pm_message_t state) { - struct ti_st_plat_data *pdata = pdev->dev.platform_data; + struct ti_st_plat_data *pdata; + + if (pdev->dev.of_node) { + pr_debug("use device tree data"); + pdata = dt_pdata; + } else { + pdata = pdev->dev.platform_data; + } if (pdata->suspend) return pdata->suspend(pdev, state); @@ -841,7 +913,14 @@ static int kim_suspend(struct platform_device *pdev, pm_message_t state) static int kim_resume(struct platform_device *pdev) { - struct ti_st_plat_data *pdata = pdev->dev.platform_data; + struct ti_st_plat_data *pdata; + + if (pdev->dev.of_node) { + pr_debug("use device tree data"); + pdata = dt_pdata; + } else { + pdata = pdev->dev.platform_data; + } if (pdata->resume) return pdata->resume(pdev); @@ -858,6 +937,8 @@ static struct platform_driver kim_platform_driver = { .resume = kim_resume, .driver = { .name = "kim", + .owner = THIS_MODULE, + .of_match_table = of_match_ptr(kim_of_match), }, }; diff --git a/drivers/misc/ti-st/st_ll.c b/drivers/misc/ti-st/st_ll.c index 93b4d67cc4a3..518e1b7f2f95 100644 --- a/drivers/misc/ti-st/st_ll.c +++ b/drivers/misc/ti-st/st_ll.c @@ -26,6 +26,7 @@ #include /**********************************************************************/ + /* internal functions */ static void send_ll_cmd(struct st_data_s *st_data, unsigned char cmd) @@ -53,7 +54,13 @@ 
static void ll_device_want_to_sleep(struct st_data_s *st_data) /* communicate to platform about chip asleep */ kim_data = st_data->kim_data; - pdata = kim_data->kim_pdev->dev.platform_data; + if (kim_data->kim_pdev->dev.of_node) { + pr_debug("use device tree data"); + pdata = dt_pdata; + } else { + pdata = kim_data->kim_pdev->dev.platform_data; + } + if (pdata->chip_asleep) pdata->chip_asleep(NULL); } @@ -86,7 +93,13 @@ static void ll_device_want_to_wakeup(struct st_data_s *st_data) /* communicate to platform about chip wakeup */ kim_data = st_data->kim_data; - pdata = kim_data->kim_pdev->dev.platform_data; + if (kim_data->kim_pdev->dev.of_node) { + pr_debug("use device tree data"); + pdata = dt_pdata; + } else { + pdata = kim_data->kim_pdev->dev.platform_data; + } + if (pdata->chip_awake) pdata->chip_awake(NULL); } diff --git a/include/linux/ti_wilink_st.h b/include/linux/ti_wilink_st.h index 884d6263e962..9072d9f95cff 100644 --- a/include/linux/ti_wilink_st.h +++ b/include/linux/ti_wilink_st.h @@ -86,6 +86,7 @@ struct st_proto_s { extern long st_register(struct st_proto_s *); extern long st_unregister(struct st_proto_s *); +extern struct ti_st_plat_data *dt_pdata; /* * header information used by st_core.c -- cgit v1.2.3 From f379984f849d729bd2eb076b633200b1c040611e Mon Sep 17 00:00:00 2001 From: Xia Kaixu Date: Fri, 9 Jan 2015 16:57:18 -0700 Subject: coresight: remove the unused macro CORESIGHT_DEBUGFS_ENTRY Debugfs isn't used for coresight configuration, so the macro CORESIGHT_DEBUGFS_ENTRY is unnecessary; just remove it. Signed-off-by: Xia Kaixu Signed-off-by: Mathieu Poirier Signed-off-by: Greg Kroah-Hartman --- include/linux/coresight.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 5d3c54311f7a..7cbfecbfa643 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -179,15 +179,6 @@ struct coresight_device { #define sink_ops(csdev) csdev->ops->sink_ops #define link_ops(csdev) csdev->ops->link_ops -#define CORESIGHT_DEBUGFS_ENTRY(__name, __entry_name, \ - __mode, __get, __set, __fmt) \ -DEFINE_SIMPLE_ATTRIBUTE(__name ## _ops, __get, __set, __fmt); \ -static const struct coresight_ops_entry __name ## _entry = { \ - .name = __entry_name, \ - .mode = __mode, \ - .ops = &__name ## _ops \ -} - /** * struct coresight_ops_sink - basic operations for a sink * Operations available for sinks -- cgit v1.2.3 From c61c4b5dd2c6b5dbf0f7e299db1e8411ef590f5c Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Fri, 9 Jan 2015 16:57:20 -0700 Subject: coresight: Fixing wrong #ifdef/#endif placement Fixing the problem reported by: https://lkml.org/lkml/2015/1/6/86 The #ifdef/#endif is wrong and prevents the stub of the function of_get_coresight_platform_data() from being visible when CONFIG_OF is not defined. Moving the CONFIG_OF condition out of CONFIG_CORESIGHT, making them both independent.
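The underlying pitfall generalizes, so a sketch may help (invented CONFIG_FOO symbols, not the coresight header itself; the two shapes are shown side by side for comparison, not meant to coexist in one header): a stub nested under an unrelated #ifdef vanishes exactly when it is needed.

struct device;
struct foo_platform_data;

/* Broken shape: the OF prototype only exists when CONFIG_FOO=y, and no
 * stub for the CONFIG_OF=n case exists at all. */
#ifdef CONFIG_FOO
int foo_enable(void);
#ifdef CONFIG_OF
struct foo_platform_data *of_get_foo_platform_data(struct device *dev);
#endif
#else
static inline int foo_enable(void) { return 0; }
#endif

/* Fixed shape: independent conditionals, each pairing a prototype with a
 * stub, so every combination of CONFIG_FOO and CONFIG_OF compiles. */
#ifdef CONFIG_FOO
int foo_enable(void);
#else
static inline int foo_enable(void) { return 0; }
#endif

#ifdef CONFIG_OF
struct foo_platform_data *of_get_foo_platform_data(struct device *dev);
#else
static inline struct foo_platform_data *
of_get_foo_platform_data(struct device *dev) { return NULL; }
#endif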
Signed-off-by: Mathieu Poirier Signed-off-by: Greg Kroah-Hartman --- include/linux/coresight.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 7cbfecbfa643..cd6d4c384ca3 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -230,10 +230,6 @@ extern void coresight_disable(struct coresight_device *csdev); extern int coresight_is_bit_set(u32 val, int position, int value); extern int coresight_timeout(void __iomem *addr, u32 offset, int position, int value); -#ifdef CONFIG_OF -extern struct coresight_platform_data *of_get_coresight_platform_data( - struct device *dev, struct device_node *node); -#endif #else static inline struct coresight_device * coresight_register(struct coresight_desc *desc) { return NULL; } @@ -245,10 +241,14 @@ static inline int coresight_is_bit_set(u32 val, int position, int value) { return 0; } static inline int coresight_timeout(void __iomem *addr, u32 offset, int position, int value) { return 1; } +#endif + #ifdef CONFIG_OF +extern struct coresight_platform_data *of_get_coresight_platform_data( + struct device *dev, struct device_node *node); +#else static inline struct coresight_platform_data *of_get_coresight_platform_data( struct device *dev, struct device_node *node) { return NULL; } #endif -#endif #endif -- cgit v1.2.3 From 07673204777abe893092cc6e42ea04e5c0ac8cb7 Mon Sep 17 00:00:00 2001 From: Peter Chen Date: Tue, 30 Dec 2014 14:02:28 +0800 Subject: usb: phy: change some comments - Delete the OTG stuff - .set_suspend is for the controller, not for the A-device or B-device. Signed-off-by: Peter Chen Signed-off-by: Felipe Balbi --- include/linux/usb/phy.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/phy.h b/include/linux/usb/phy.h index f499c23e6342..bc91b5d380fd 100644 --- a/include/linux/usb/phy.h +++ b/include/linux/usb/phy.h @@ -1,5 +1,5 @@ -/* USB OTG (On The Go) defines */ /* + * USB PHY defines * * These APIs may be used between USB controllers. USB device drivers * (for either host or peripheral roles) don't use these calls; they @@ -106,7 +106,7 @@ struct usb_phy { int (*set_power)(struct usb_phy *x, unsigned mA); - /* for non-OTG B devices: set transceiver into suspend mode */ + /* Set transceiver into suspend mode */ int (*set_suspend)(struct usb_phy *x, int suspend); -- cgit v1.2.3 From 7acc9973e3c42de9926b28eec8ae3434dfdde3be Mon Sep 17 00:00:00 2001 From: Robert Jarzmik Date: Sat, 6 Dec 2014 22:05:15 +0100 Subject: usb: phy: generic: add vbus support Add support for vbus detection and power supply. This code is more or less stolen from phy-gpio-vbus-usb.c, and aims at providing a detection mechanism for VBus (i.e. USB cable plug) based on a GPIO line, and a power supply activation which draws current from the VBus.
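For context, a legacy (non-DT) board might hand the driver its VBUS-sense line roughly like this — hypothetical board code, with the GPIO number and names invented:

#include <linux/err.h>
#include <linux/gpio.h>
#include <linux/init.h>
#include <linux/platform_device.h>
#include <linux/usb/phy.h>
#include <linux/usb/usb_phy_generic.h>

#define FOO_GPIO_VBUS_DET 42	/* invented board-specific GPIO */

static struct usb_phy_generic_platform_data foo_phy_pdata = {
	.type = USB_PHY_TYPE_USB2,
};

static int __init foo_board_usb_init(void)
{
	struct platform_device *pdev;

	/* The driver's threaded IRQ handler watches this descriptor and
	 * reports connect/disconnect to the gadget and notifier chain. */
	foo_phy_pdata.gpiod_vbus = gpio_to_desc(FOO_GPIO_VBUS_DET);

	pdev = platform_device_register_data(NULL, "usb_phy_generic", -1,
					     &foo_phy_pdata,
					     sizeof(foo_phy_pdata));
	return PTR_ERR_OR_ZERO(pdev);
}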
[ balbi@ti.com : fix build break ] Signed-off-by: Robert Jarzmik Signed-off-by: Felipe Balbi --- drivers/usb/phy/phy-generic.c | 91 ++++++++++++++++++++++++++++++++++++- drivers/usb/phy/phy-generic.h | 6 +++ include/linux/usb/usb_phy_generic.h | 2 + 3 files changed, 98 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/usb/phy/phy-generic.c b/drivers/usb/phy/phy-generic.c index d53928f3a6ea..dd05254241fb 100644 --- a/drivers/usb/phy/phy-generic.c +++ b/drivers/usb/phy/phy-generic.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -39,6 +40,9 @@ #include "phy-generic.h" +#define VBUS_IRQ_FLAGS \ + (IRQF_SHARED | IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING) + struct platform_device *usb_phy_generic_register(void) { return platform_device_register_simple("usb_phy_generic", @@ -66,6 +70,73 @@ static void nop_reset_set(struct usb_phy_generic *nop, int asserted) usleep_range(10000, 20000); } +/* interface to regulator framework */ +static void nop_set_vbus_draw(struct usb_phy_generic *nop, unsigned mA) +{ + struct regulator *vbus_draw = nop->vbus_draw; + int enabled; + int ret; + + if (!vbus_draw) + return; + + enabled = nop->vbus_draw_enabled; + if (mA) { + regulator_set_current_limit(vbus_draw, 0, 1000 * mA); + if (!enabled) { + ret = regulator_enable(vbus_draw); + if (ret < 0) + return; + nop->vbus_draw_enabled = 1; + } + } else { + if (enabled) { + ret = regulator_disable(vbus_draw); + if (ret < 0) + return; + nop->vbus_draw_enabled = 0; + } + } + nop->mA = mA; +} + + +static irqreturn_t nop_gpio_vbus_thread(int irq, void *data) +{ + struct usb_phy_generic *nop = data; + struct usb_otg *otg = nop->phy.otg; + int vbus, status; + + vbus = gpiod_get_value(nop->gpiod_vbus); + if ((vbus ^ nop->vbus) == 0) + return IRQ_HANDLED; + nop->vbus = vbus; + + if (vbus) { + status = USB_EVENT_VBUS; + otg->state = OTG_STATE_B_PERIPHERAL; + nop->phy.last_event = status; + usb_gadget_vbus_connect(otg->gadget); + + /* drawing a "unit load" is *always* OK, except for OTG */ + nop_set_vbus_draw(nop, 100); + + atomic_notifier_call_chain(&nop->phy.notifier, status, + otg->gadget); + } else { + nop_set_vbus_draw(nop, 0); + + usb_gadget_vbus_disconnect(otg->gadget); + status = USB_EVENT_NONE; + otg->state = OTG_STATE_B_IDLE; + nop->phy.last_event = status; + + atomic_notifier_call_chain(&nop->phy.notifier, status, + otg->gadget); + } + return IRQ_HANDLED; +} + int usb_gen_phy_init(struct usb_phy *phy) { struct usb_phy_generic *nop = dev_get_drvdata(phy->dev); @@ -149,17 +220,23 @@ int usb_phy_gen_create_phy(struct device *dev, struct usb_phy_generic *nop, needs_vcc = of_property_read_bool(node, "vcc-supply"); nop->gpiod_reset = devm_gpiod_get(dev, "reset-gpios"); err = PTR_ERR(nop->gpiod_reset); + if (!err) { + nop->gpiod_vbus = devm_gpiod_get(dev, + "vbus-detect-gpio"); + err = PTR_ERR(nop->gpiod_vbus); + } } else if (pdata) { type = pdata->type; clk_rate = pdata->clk_rate; needs_vcc = pdata->needs_vcc; - if (gpio_is_valid(gpio->gpio_reset)) { + if (gpio_is_valid(pdata->gpio_reset)) { err = devm_gpio_request_one(dev, pdata->gpio_reset, 0, dev_name(dev)); if (!err) nop->gpiod_reset = gpio_to_desc(pdata->gpio_reset); } + nop->gpiod_vbus = pdata->gpiod_vbus; } if (err == -EPROBE_DEFER) @@ -224,6 +301,18 @@ static int usb_phy_generic_probe(struct platform_device *pdev) err = usb_phy_gen_create_phy(dev, nop, dev_get_platdata(&pdev->dev)); if (err) return err; + if (nop->gpiod_vbus) { + err = devm_request_threaded_irq(&pdev->dev, + 
gpiod_to_irq(nop->gpiod_vbus), + NULL, nop_gpio_vbus_thread, + VBUS_IRQ_FLAGS, "vbus_detect", + nop); + if (err) { + dev_err(&pdev->dev, "can't request irq %i, err: %d\n", + gpiod_to_irq(nop->gpiod_vbus), err); + return err; + } + } nop->phy.init = usb_gen_phy_init; nop->phy.shutdown = usb_gen_phy_shutdown; diff --git a/drivers/usb/phy/phy-generic.h b/drivers/usb/phy/phy-generic.h index 09924fdaaabe..0d0eadd54ed9 100644 --- a/drivers/usb/phy/phy-generic.h +++ b/drivers/usb/phy/phy-generic.h @@ -3,6 +3,7 @@ #include #include +#include struct usb_phy_generic { struct usb_phy phy; @@ -10,6 +11,11 @@ struct usb_phy_generic { struct clk *clk; struct regulator *vcc; struct gpio_desc *gpiod_reset; + struct gpio_desc *gpiod_vbus; + struct regulator *vbus_draw; + bool vbus_draw_enabled; + unsigned long mA; + unsigned int vbus; }; int usb_gen_phy_init(struct usb_phy *phy); diff --git a/include/linux/usb/usb_phy_generic.h b/include/linux/usb/usb_phy_generic.h index 68adae83affc..c13632d5292e 100644 --- a/include/linux/usb/usb_phy_generic.h +++ b/include/linux/usb/usb_phy_generic.h @@ -2,6 +2,7 @@ #define __LINUX_USB_NOP_XCEIV_H #include +#include struct usb_phy_generic_platform_data { enum usb_phy_type type; @@ -11,6 +12,7 @@ struct usb_phy_generic_platform_data { unsigned int needs_vcc:1; unsigned int needs_reset:1; /* deprecated */ int gpio_reset; + struct gpio_desc *gpiod_vbus; }; #if IS_ENABLED(CONFIG_NOP_USB_XCEIV) -- cgit v1.2.3 From cbf6ab52add20b845f903decc973afbd5463c527 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 5 Jan 2015 19:29:32 +0800 Subject: kprobes: Pass the original kprobe for preparing optimized kprobe Pass the original kprobe to the arch-dependent part that prepares an optimized kprobe, since some architectures (e.g. ARM32) require the information in the original kprobe. Signed-off-by: Masami Hiramatsu Signed-off-by: Wang Nan Signed-off-by: Jon Medhurst --- arch/x86/kernel/kprobes/opt.c | 3 ++- include/linux/kprobes.h | 3 ++- kernel/kprobes.c | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 7c523bbf3dc8..0dd8d089c315 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -322,7 +322,8 @@ void arch_remove_optimized_kprobe(struct optimized_kprobe *op) * Target instructions MUST be relocatable (checked inside) * This is called when new aggr(opt)probe is allocated or reused.
*/ -int arch_prepare_optimized_kprobe(struct optimized_kprobe *op) +int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, + struct kprobe *__unused) { u8 *buf; int ret; diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 5297f9fa0ef2..1ab54754a86d 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -308,7 +308,8 @@ struct optimized_kprobe { /* Architecture dependent functions for direct jump optimization */ extern int arch_prepared_optinsn(struct arch_optimized_insn *optinsn); extern int arch_check_optimized_kprobe(struct optimized_kprobe *op); -extern int arch_prepare_optimized_kprobe(struct optimized_kprobe *op); +extern int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, + struct kprobe *orig); extern void arch_remove_optimized_kprobe(struct optimized_kprobe *op); extern void arch_optimize_kprobes(struct list_head *oplist); extern void arch_unoptimize_kprobes(struct list_head *oplist, diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 06f58309fed2..bad4e959f2f7 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -717,7 +717,7 @@ static void prepare_optimized_kprobe(struct kprobe *p) struct optimized_kprobe *op; op = container_of(p, struct optimized_kprobe, kp); - arch_prepare_optimized_kprobe(op); + arch_prepare_optimized_kprobe(op, p); } /* Allocate new optimized_kprobe and try to prepare optimized instructions */ @@ -731,7 +731,7 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p) INIT_LIST_HEAD(&op->list); op->kp.addr = p->addr; - arch_prepare_optimized_kprobe(op); + arch_prepare_optimized_kprobe(op, p); return &op->kp; } -- cgit v1.2.3 From 7a868d1e9ab3c534c5ad44e3e5dc46753a1e5636 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Mon, 12 Jan 2015 14:52:22 +0800 Subject: rhashtable: involve rhashtable_lookup_compare_insert routine Introduce a new function called rhashtable_lookup_compare_insert() which is very similar to rhashtable_lookup_insert(). But the former makes use of a user-supplied compare function to look for an object, and then inserts the new object into the hash table only if no match is found. As the entire process of search and insertion is protected by the per-bucket lock, this saves users from having to take an extra lock. Signed-off-by: Ying Xue Cc: Thomas Graf Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 5 +++++ lib/rhashtable.c | 42 ++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 45 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 326acd8c2e9f..7b9bd77ed684 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -168,7 +168,12 @@ int rhashtable_shrink(struct rhashtable *ht); void *rhashtable_lookup(struct rhashtable *ht, const void *key); void *rhashtable_lookup_compare(struct rhashtable *ht, const void *key, bool (*compare)(void *, void *), void *arg); + bool rhashtable_lookup_insert(struct rhashtable *ht, struct rhash_head *obj); +bool rhashtable_lookup_compare_insert(struct rhashtable *ht, + struct rhash_head *obj, + bool (*compare)(void *, void *), + void *arg); void rhashtable_destroy(struct rhashtable *ht); diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 8023b554905c..ed6ae1ad304c 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -726,6 +726,43 @@ EXPORT_SYMBOL_GPL(rhashtable_lookup_compare); * to rhashtable_init().
*/ bool rhashtable_lookup_insert(struct rhashtable *ht, struct rhash_head *obj) +{ + struct rhashtable_compare_arg arg = { + .ht = ht, + .key = rht_obj(ht, obj) + ht->p.key_offset, + }; + + BUG_ON(!ht->p.key_len); + + return rhashtable_lookup_compare_insert(ht, obj, &rhashtable_compare, + &arg); +} +EXPORT_SYMBOL_GPL(rhashtable_lookup_insert); + +/** + * rhashtable_lookup_compare_insert - search and insert object to hash table + * with compare function + * @ht: hash table + * @obj: pointer to hash head inside object + * @compare: compare function, must return true on match + * @arg: argument passed on to compare function + * + * Locks down the bucket chain in both the old and new table if a resize + * is in progress to ensure that writers can't remove from the old table + * and can't insert to the new table during the atomic operation of search + * and insertion. Searches for duplicates in both the old and new table if + * a resize is in progress. + * + * Lookups may occur in parallel with hashtable mutations and resizing. + * + * Will trigger an automatic deferred table resizing if the size grows + * beyond the watermark indicated by grow_decision() which can be passed + * to rhashtable_init(). + */ +bool rhashtable_lookup_compare_insert(struct rhashtable *ht, + struct rhash_head *obj, + bool (*compare)(void *, void *), + void *arg) { struct bucket_table *new_tbl, *old_tbl; spinlock_t *new_bucket_lock, *old_bucket_lock; @@ -747,7 +784,8 @@ bool rhashtable_lookup_insert(struct rhashtable *ht, struct rhash_head *obj) if (unlikely(old_tbl != new_tbl)) spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED); - if (rhashtable_lookup(ht, rht_obj(ht, obj) + ht->p.key_offset)) { + if (rhashtable_lookup_compare(ht, rht_obj(ht, obj) + ht->p.key_offset, + compare, arg)) { success = false; goto exit; } @@ -763,7 +801,7 @@ exit: return success; } -EXPORT_SYMBOL_GPL(rhashtable_lookup_insert); +EXPORT_SYMBOL_GPL(rhashtable_lookup_compare_insert); static size_t rounded_hashtable_size(struct rhashtable_params *params) { -- cgit v1.2.3 From 6f73d3b13dc5e16ae06025cd1b12a36b2857caa2 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Mon, 12 Jan 2015 14:52:24 +0800 Subject: rhashtable: add a note for grow and shrink decision functions Commit c0c09bfdc415 ("rhashtable: avoid unnecessary wakeup for worker queue") moved the checks on whether the hash table size exceeds its maximum or has reached its minimum threshold from the resizing functions into the resizing decision functions. Add a note to rhashtable.h making clear that an implementation of the grow and shrink decision functions must enforce min/max shift itself; otherwise the watermarks set through min/max shift take no effect. (A sketch of such a decision function appears at the end of this log.) Signed-off-by: Ying Xue Cc: Thomas Graf Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 7b9bd77ed684..9570832ab07c 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -79,6 +79,10 @@ struct rhashtable; * @obj_hashfn: Function to hash object * @grow_decision: If defined, may return true if table should expand * @shrink_decision: If defined, may return true if table should shrink + * + * Note: when implementing the grow and shrink decision function, min/max + * shift must be enforced, otherwise, resizing watermarks they set may be + * useless. 
*/ struct rhashtable_params { size_t nelem_hint; -- cgit v1.2.3 From df8a39defad46b83694ea6dd868d332976d62cc0 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 13 Jan 2015 17:13:44 +0100 Subject: net: rename vlan_tx_* helpers since "tx" is misleading there The same macros are used for rx as well. So rename them. (A usage sketch of the renamed helpers appears at the end of this log.) Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- Documentation/networking/filter.txt | 4 ++-- drivers/infiniband/hw/nes/nes_nic.c | 13 +++++++------ drivers/net/ethernet/3com/typhoon.c | 4 ++-- drivers/net/ethernet/alteon/acenic.c | 8 ++++---- drivers/net/ethernet/amd/amd8111e.c | 4 ++-- drivers/net/ethernet/amd/xgbe/xgbe-drv.c | 8 ++++---- drivers/net/ethernet/atheros/atl1c/atl1c_main.c | 4 ++-- drivers/net/ethernet/atheros/atl1e/atl1e_main.c | 4 ++-- drivers/net/ethernet/atheros/atlx/atl1.c | 4 ++-- drivers/net/ethernet/atheros/atlx/atl2.c | 4 ++-- drivers/net/ethernet/broadcom/bnx2.c | 4 ++-- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 4 ++-- drivers/net/ethernet/broadcom/tg3.c | 4 ++-- drivers/net/ethernet/brocade/bna/bnad.c | 4 ++-- drivers/net/ethernet/chelsio/cxgb/sge.c | 4 ++-- drivers/net/ethernet/chelsio/cxgb3/sge.c | 6 +++--- drivers/net/ethernet/chelsio/cxgb4/sge.c | 4 ++-- drivers/net/ethernet/chelsio/cxgb4vf/sge.c | 4 ++-- drivers/net/ethernet/cisco/enic/enic_main.c | 4 ++-- drivers/net/ethernet/emulex/benet/be_main.c | 12 ++++++------ drivers/net/ethernet/freescale/gianfar.c | 4 ++-- drivers/net/ethernet/ibm/ehea/ehea_main.c | 4 ++-- drivers/net/ethernet/intel/e1000/e1000_main.c | 5 +++-- drivers/net/ethernet/intel/e1000e/netdev.c | 9 +++++---- drivers/net/ethernet/intel/fm10k/fm10k_main.c | 4 ++-- drivers/net/ethernet/intel/fm10k/fm10k_netdev.c | 2 +- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 4 ++-- drivers/net/ethernet/intel/i40evf/i40e_txrx.c | 4 ++-- drivers/net/ethernet/intel/igb/igb_main.c | 4 ++-- drivers/net/ethernet/intel/igbvf/netdev.c | 5 +++-- drivers/net/ethernet/intel/ixgb/ixgb_main.c | 4 ++-- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 4 ++-- drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 4 ++-- drivers/net/ethernet/jme.c | 4 ++-- drivers/net/ethernet/marvell/sky2.c | 6 +++--- drivers/net/ethernet/mellanox/mlx4/en_tx.c | 12 ++++++------ drivers/net/ethernet/natsemi/ns83820.c | 4 ++-- drivers/net/ethernet/neterion/s2io.c | 4 ++-- drivers/net/ethernet/neterion/vxge/vxge-main.c | 4 ++-- drivers/net/ethernet/nvidia/forcedeth.c | 4 ++-- drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c | 4 ++-- drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c | 8 ++++---- drivers/net/ethernet/qlogic/qlge/qlge_main.c | 6 +++--- drivers/net/ethernet/realtek/8139cp.c | 4 ++-- drivers/net/ethernet/realtek/r8169.c | 4 ++-- drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c | 2 +- drivers/net/ethernet/tehuti/tehuti.c | 4 ++-- drivers/net/ethernet/via/via-rhine.c | 6 +++--- drivers/net/ethernet/via/via-velocity.c | 4 ++-- drivers/net/macvtap.c | 6 +++--- drivers/net/tun.c | 4 ++-- drivers/net/usb/r8152.c | 4 ++-- drivers/net/vmxnet3/vmxnet3_drv.c | 4 ++-- drivers/net/vxlan.c | 4 ++-- drivers/s390/net/qeth_l3_main.c | 8 ++++---- drivers/vhost/net.c | 2 +- include/linux/if_vlan.h | 16 ++++++++-------- include/net/pkt_sched.h | 2 +- include/trace/events/net.h | 8 ++++---- net/8021q/vlan_core.c | 2 +- net/bridge/br_netfilter.c | 12 ++++++------ net/bridge/br_private.h | 4 ++-- net/bridge/br_vlan.c | 4 ++-- net/bridge/netfilter/ebt_vlan.c | 4 ++-- net/bridge/netfilter/ebtables.c | 2 +- net/core/dev.c | 10 +++++----- net/core/netpoll.c | 2 +- 
net/core/skbuff.c | 8 ++++---- net/ipv4/geneve.c | 2 +- net/openvswitch/actions.c | 4 ++-- net/openvswitch/datapath.c | 2 +- net/openvswitch/flow.c | 4 ++-- net/openvswitch/vport-gre.c | 2 +- net/openvswitch/vport.c | 3 ++- net/packet/af_packet.c | 12 ++++++------ net/sched/em_meta.c | 2 +- net/wireless/util.c | 4 ++-- 77 files changed, 195 insertions(+), 190 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt index 58d08f8d8d80..9930ecfbb465 100644 --- a/Documentation/networking/filter.txt +++ b/Documentation/networking/filter.txt @@ -279,8 +279,8 @@ Possible BPF extensions are shown in the following table: hatype skb->dev->type rxhash skb->hash cpu raw_smp_processor_id() - vlan_tci vlan_tx_tag_get(skb) - vlan_pr vlan_tx_tag_present(skb) + vlan_tci skb_vlan_tag_get(skb) + vlan_pr skb_vlan_tag_present(skb) rand prandom_u32() These extensions can also be prefixed with '#'. diff --git a/drivers/infiniband/hw/nes/nes_nic.c b/drivers/infiniband/hw/nes/nes_nic.c index 49eb5111d2cd..70acda91eb2a 100644 --- a/drivers/infiniband/hw/nes/nes_nic.c +++ b/drivers/infiniband/hw/nes/nes_nic.c @@ -373,11 +373,11 @@ static int nes_nic_send(struct sk_buff *skb, struct net_device *netdev) wqe_fragment_length = (__le16 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX]; /* setup the VLAN tag if present */ - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { nes_debug(NES_DBG_NIC_TX, "%s: VLAN packet to send... VLAN = %08X\n", - netdev->name, vlan_tx_tag_get(skb)); + netdev->name, skb_vlan_tag_get(skb)); wqe_misc = NES_NIC_SQ_WQE_TAGVALUE_ENABLE; - wqe_fragment_length[0] = (__force __le16) vlan_tx_tag_get(skb); + wqe_fragment_length[0] = (__force __le16) skb_vlan_tag_get(skb); } else wqe_misc = 0; @@ -576,11 +576,12 @@ tso_sq_no_longer_full: wqe_fragment_length = (__le16 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX]; /* setup the VLAN tag if present */ - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { nes_debug(NES_DBG_NIC_TX, "%s: VLAN packet to send... 
VLAN = %08X\n", - netdev->name, vlan_tx_tag_get(skb) ); + netdev->name, + skb_vlan_tag_get(skb)); wqe_misc = NES_NIC_SQ_WQE_TAGVALUE_ENABLE; - wqe_fragment_length[0] = (__force __le16) vlan_tx_tag_get(skb); + wqe_fragment_length[0] = (__force __le16) skb_vlan_tag_get(skb); } else wqe_misc = 0; diff --git a/drivers/net/ethernet/3com/typhoon.c b/drivers/net/ethernet/3com/typhoon.c index dede43f4ce09..8f8418d2ac4a 100644 --- a/drivers/net/ethernet/3com/typhoon.c +++ b/drivers/net/ethernet/3com/typhoon.c @@ -769,11 +769,11 @@ typhoon_start_tx(struct sk_buff *skb, struct net_device *dev) first_txd->processFlags |= TYPHOON_TX_PF_IP_CHKSUM; } - if(vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { first_txd->processFlags |= TYPHOON_TX_PF_INSERT_VLAN | TYPHOON_TX_PF_VLAN_PRIORITY; first_txd->processFlags |= - cpu_to_le32(htons(vlan_tx_tag_get(skb)) << + cpu_to_le32(htons(skb_vlan_tag_get(skb)) << TYPHOON_TX_PF_VLAN_TAG_SHIFT); } diff --git a/drivers/net/ethernet/alteon/acenic.c b/drivers/net/ethernet/alteon/acenic.c index b68074803de3..b90a26b13fdf 100644 --- a/drivers/net/ethernet/alteon/acenic.c +++ b/drivers/net/ethernet/alteon/acenic.c @@ -2429,9 +2429,9 @@ restart: flagsize = (skb->len << 16) | (BD_FLG_END); if (skb->ip_summed == CHECKSUM_PARTIAL) flagsize |= BD_FLG_TCP_UDP_SUM; - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { flagsize |= BD_FLG_VLAN_TAG; - vlan_tag = vlan_tx_tag_get(skb); + vlan_tag = skb_vlan_tag_get(skb); } desc = ap->tx_ring + idx; idx = (idx + 1) % ACE_TX_RING_ENTRIES(ap); @@ -2450,9 +2450,9 @@ restart: flagsize = (skb_headlen(skb) << 16); if (skb->ip_summed == CHECKSUM_PARTIAL) flagsize |= BD_FLG_TCP_UDP_SUM; - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { flagsize |= BD_FLG_VLAN_TAG; - vlan_tag = vlan_tx_tag_get(skb); + vlan_tag = skb_vlan_tag_get(skb); } ace_load_tx_bd(ap, ap->tx_ring + idx, mapping, flagsize, vlan_tag); diff --git a/drivers/net/ethernet/amd/amd8111e.c b/drivers/net/ethernet/amd/amd8111e.c index 841e6558db68..4c2ae2221780 100644 --- a/drivers/net/ethernet/amd/amd8111e.c +++ b/drivers/net/ethernet/amd/amd8111e.c @@ -1299,11 +1299,11 @@ static netdev_tx_t amd8111e_start_xmit(struct sk_buff *skb, lp->tx_ring[tx_index].tx_flags = 0; #if AMD8111E_VLAN_TAG_USED - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { lp->tx_ring[tx_index].tag_ctrl_cmd |= cpu_to_le16(TCC_VLAN_INSERT); lp->tx_ring[tx_index].tag_ctrl_info = - cpu_to_le16(vlan_tx_tag_get(skb)); + cpu_to_le16(skb_vlan_tag_get(skb)); } #endif diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c index 7bb5f07dbeef..2ba1dd22ad64 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c @@ -1165,8 +1165,8 @@ static void xgbe_prep_tx_tstamp(struct xgbe_prv_data *pdata, static void xgbe_prep_vlan(struct sk_buff *skb, struct xgbe_packet_data *packet) { - if (vlan_tx_tag_present(skb)) - packet->vlan_ctag = vlan_tx_tag_get(skb); + if (skb_vlan_tag_present(skb)) + packet->vlan_ctag = skb_vlan_tag_get(skb); } static int xgbe_prep_tso(struct sk_buff *skb, struct xgbe_packet_data *packet) @@ -1247,9 +1247,9 @@ static void xgbe_packet_info(struct xgbe_prv_data *pdata, XGMAC_SET_BITS(packet->attributes, TX_PACKET_ATTRIBUTES, CSUM_ENABLE, 1); - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { /* VLAN requires an extra descriptor if tag is different */ - if (vlan_tx_tag_get(skb) != ring->tx.cur_vlan_ctag) + if (skb_vlan_tag_get(skb) != 
ring->tx.cur_vlan_ctag) /* We can share with the TSO context descriptor */ if (!context_desc) { context_desc = 1; diff --git a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c index c9946c6c119e..587f63e87588 100644 --- a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c +++ b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c @@ -2235,8 +2235,8 @@ static netdev_tx_t atl1c_xmit_frame(struct sk_buff *skb, return NETDEV_TX_OK; } - if (unlikely(vlan_tx_tag_present(skb))) { - u16 vlan = vlan_tx_tag_get(skb); + if (unlikely(skb_vlan_tag_present(skb))) { + u16 vlan = skb_vlan_tag_get(skb); __le16 tag; vlan = cpu_to_le16(vlan); diff --git a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c index c88abf5b6415..59a03a193e83 100644 --- a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c +++ b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c @@ -1892,8 +1892,8 @@ static netdev_tx_t atl1e_xmit_frame(struct sk_buff *skb, tpd = atl1e_get_tpd(adapter); - if (vlan_tx_tag_present(skb)) { - u16 vlan_tag = vlan_tx_tag_get(skb); + if (skb_vlan_tag_present(skb)) { + u16 vlan_tag = skb_vlan_tag_get(skb); u16 atl1e_vlan_tag; tpd->word3 |= 1 << TPD_INS_VL_TAG_SHIFT; diff --git a/drivers/net/ethernet/atheros/atlx/atl1.c b/drivers/net/ethernet/atheros/atlx/atl1.c index 2c8f398aeda9..eca1d113fee1 100644 --- a/drivers/net/ethernet/atheros/atlx/atl1.c +++ b/drivers/net/ethernet/atheros/atlx/atl1.c @@ -2415,8 +2415,8 @@ static netdev_tx_t atl1_xmit_frame(struct sk_buff *skb, (u16) atomic_read(&tpd_ring->next_to_use)); memset(ptpd, 0, sizeof(struct tx_packet_desc)); - if (vlan_tx_tag_present(skb)) { - vlan_tag = vlan_tx_tag_get(skb); + if (skb_vlan_tag_present(skb)) { + vlan_tag = skb_vlan_tag_get(skb); vlan_tag = (vlan_tag << 4) | (vlan_tag >> 13) | ((vlan_tag >> 9) & 0x8); ptpd->word3 |= 1 << TPD_INS_VL_TAG_SHIFT; diff --git a/drivers/net/ethernet/atheros/atlx/atl2.c b/drivers/net/ethernet/atheros/atlx/atl2.c index 482a7cabb0a1..46a535318c7a 100644 --- a/drivers/net/ethernet/atheros/atlx/atl2.c +++ b/drivers/net/ethernet/atheros/atlx/atl2.c @@ -887,8 +887,8 @@ static netdev_tx_t atl2_xmit_frame(struct sk_buff *skb, offset = ((u32)(skb->len-copy_len + 3) & ~3); } #ifdef NETIF_F_HW_VLAN_CTAG_TX - if (vlan_tx_tag_present(skb)) { - u16 vlan_tag = vlan_tx_tag_get(skb); + if (skb_vlan_tag_present(skb)) { + u16 vlan_tag = skb_vlan_tag_get(skb); vlan_tag = (vlan_tag << 4) | (vlan_tag >> 13) | ((vlan_tag >> 9) & 0x8); diff --git a/drivers/net/ethernet/broadcom/bnx2.c b/drivers/net/ethernet/broadcom/bnx2.c index 823d01c5684c..02bf0b86995b 100644 --- a/drivers/net/ethernet/broadcom/bnx2.c +++ b/drivers/net/ethernet/broadcom/bnx2.c @@ -6597,9 +6597,9 @@ bnx2_start_xmit(struct sk_buff *skb, struct net_device *dev) vlan_tag_flags |= TX_BD_FLAGS_TCP_UDP_CKSUM; } - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { vlan_tag_flags |= - (TX_BD_FLAGS_VLAN_TAG | (vlan_tx_tag_get(skb) << 16)); + (TX_BD_FLAGS_VLAN_TAG | (skb_vlan_tag_get(skb) << 16)); } if ((mss = skb_shinfo(skb)->gso_size)) { diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c index 1d1147c93d59..b51a18a09d4d 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c @@ -3865,9 +3865,9 @@ netdev_tx_t bnx2x_start_xmit(struct sk_buff *skb, struct net_device *dev) "sending pkt %u @%p next_idx %u bd %u @%p\n", pkt_prod, tx_buf, txdata->tx_pkt_prod, bd_prod, 
tx_start_bd); - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { tx_start_bd->vlan_or_ethertype = - cpu_to_le16(vlan_tx_tag_get(skb)); + cpu_to_le16(skb_vlan_tag_get(skb)); tx_start_bd->bd_flags.as_bitfield |= (X_ETH_OUTBAND_VLAN << ETH_TX_BD_FLAGS_VLAN_MODE_SHIFT); } else { diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index 356bd5b022a5..4cf43bfbc955 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -8002,9 +8002,9 @@ static netdev_tx_t tg3_start_xmit(struct sk_buff *skb, struct net_device *dev) !mss && skb->len > VLAN_ETH_FRAME_LEN) base_flags |= TXD_FLAG_JMB_PKT; - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { base_flags |= TXD_FLAG_VLAN; - vlan = vlan_tx_tag_get(skb); + vlan = skb_vlan_tag_get(skb); } if ((unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) && diff --git a/drivers/net/ethernet/brocade/bna/bnad.c b/drivers/net/ethernet/brocade/bna/bnad.c index 323721838cf9..7714d7790089 100644 --- a/drivers/net/ethernet/brocade/bna/bnad.c +++ b/drivers/net/ethernet/brocade/bna/bnad.c @@ -2824,8 +2824,8 @@ bnad_txq_wi_prepare(struct bnad *bnad, struct bna_tcb *tcb, u32 gso_size; u16 vlan_tag = 0; - if (vlan_tx_tag_present(skb)) { - vlan_tag = (u16)vlan_tx_tag_get(skb); + if (skb_vlan_tag_present(skb)) { + vlan_tag = (u16)skb_vlan_tag_get(skb); flags |= (BNA_TXQ_WI_CF_INS_PRIO | BNA_TXQ_WI_CF_INS_VLAN); } if (test_bit(BNAD_RF_CEE_RUNNING, &bnad->run_flags)) { diff --git a/drivers/net/ethernet/chelsio/cxgb/sge.c b/drivers/net/ethernet/chelsio/cxgb/sge.c index babe2a915b00..526ea74e82d9 100644 --- a/drivers/net/ethernet/chelsio/cxgb/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb/sge.c @@ -1860,9 +1860,9 @@ netdev_tx_t t1_start_xmit(struct sk_buff *skb, struct net_device *dev) } cpl->iff = dev->if_port; - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { cpl->vlan_valid = 1; - cpl->vlan = htons(vlan_tx_tag_get(skb)); + cpl->vlan = htons(skb_vlan_tag_get(skb)); st->vlan_insert++; } else cpl->vlan_valid = 0; diff --git a/drivers/net/ethernet/chelsio/cxgb3/sge.c b/drivers/net/ethernet/chelsio/cxgb3/sge.c index 3dfcf600fcc6..d6aa602f168d 100644 --- a/drivers/net/ethernet/chelsio/cxgb3/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb3/sge.c @@ -1148,8 +1148,8 @@ static void write_tx_pkt_wr(struct adapter *adap, struct sk_buff *skb, cpl->len = htonl(skb->len); cntrl = V_TXPKT_INTF(pi->port_id); - if (vlan_tx_tag_present(skb)) - cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(vlan_tx_tag_get(skb)); + if (skb_vlan_tag_present(skb)) + cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(skb_vlan_tag_get(skb)); tso_info = V_LSO_MSS(skb_shinfo(skb)->gso_size); if (tso_info) { @@ -1282,7 +1282,7 @@ netdev_tx_t t3_eth_xmit(struct sk_buff *skb, struct net_device *dev) qs->port_stats[SGE_PSTAT_TX_CSUM]++; if (skb_shinfo(skb)->gso_size) qs->port_stats[SGE_PSTAT_TSO]++; - if (vlan_tx_tag_present(skb)) + if (skb_vlan_tag_present(skb)) qs->port_stats[SGE_PSTAT_VLANINS]++; /* diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c index ca42e2e9dec9..619156112b21 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c @@ -1154,9 +1154,9 @@ out_free: dev_kfree_skb_any(skb); cntrl = TXPKT_L4CSUM_DIS | TXPKT_IPCSUM_DIS; } - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { q->vlan_ins++; - cntrl |= TXPKT_VLAN_VLD | TXPKT_VLAN(vlan_tx_tag_get(skb)); + cntrl |= TXPKT_VLAN_VLD | 
TXPKT_VLAN(skb_vlan_tag_get(skb)); } cpl->ctrl0 = htonl(TXPKT_OPCODE(CPL_TX_PKT_XT) | diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/sge.c b/drivers/net/ethernet/chelsio/cxgb4vf/sge.c index 4424277a7e4d..0545f0de1c52 100644 --- a/drivers/net/ethernet/chelsio/cxgb4vf/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb4vf/sge.c @@ -1326,9 +1326,9 @@ int t4vf_eth_xmit(struct sk_buff *skb, struct net_device *dev) * If there's a VLAN tag present, add that to the list of things to * do in this Work Request. */ - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { txq->vlan_ins++; - cntrl |= TXPKT_VLAN_VLD | TXPKT_VLAN(vlan_tx_tag_get(skb)); + cntrl |= TXPKT_VLAN_VLD | TXPKT_VLAN(skb_vlan_tag_get(skb)); } /* diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c index 9a952df6606e..0535f6fbdc71 100644 --- a/drivers/net/ethernet/cisco/enic/enic_main.c +++ b/drivers/net/ethernet/cisco/enic/enic_main.c @@ -520,10 +520,10 @@ static inline void enic_queue_wq_skb(struct enic *enic, int loopback = 0; int err; - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { /* VLAN tag from trunking driver */ vlan_tag_insert = 1; - vlan_tag = vlan_tx_tag_get(skb); + vlan_tag = skb_vlan_tag_get(skb); } else if (enic->loop_enable) { vlan_tag = enic->loop_tag; loopback = 1; diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 37a26b0b7e33..ed46610e5453 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -694,7 +694,7 @@ static inline u16 be_get_tx_vlan_tag(struct be_adapter *adapter, u8 vlan_prio; u16 vlan_tag; - vlan_tag = vlan_tx_tag_get(skb); + vlan_tag = skb_vlan_tag_get(skb); vlan_prio = (vlan_tag & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; /* If vlan priority provided by OS is NOT in available bmap */ if (!(adapter->vlan_prio_bmap & (1 << vlan_prio))) @@ -745,7 +745,7 @@ static void wrb_fill_hdr(struct be_adapter *adapter, struct be_eth_hdr_wrb *hdr, SET_TX_WRB_HDR_BITS(udpcs, hdr, 1); } - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { SET_TX_WRB_HDR_BITS(vlan, hdr, 1); vlan_tag = be_get_tx_vlan_tag(adapter, skb); SET_TX_WRB_HDR_BITS(vlan_tag, hdr, vlan_tag); @@ -864,7 +864,7 @@ static struct sk_buff *be_insert_vlan_in_pkt(struct be_adapter *adapter, if (unlikely(!skb)) return skb; - if (vlan_tx_tag_present(skb)) + if (skb_vlan_tag_present(skb)) vlan_tag = be_get_tx_vlan_tag(adapter, skb); if (qnq_async_evt_rcvd(adapter) && adapter->pvid) { @@ -923,7 +923,7 @@ static bool be_ipv6_exthdr_check(struct sk_buff *skb) static int be_vlan_tag_tx_chk(struct be_adapter *adapter, struct sk_buff *skb) { - return vlan_tx_tag_present(skb) || adapter->pvid || adapter->qnq_vid; + return skb_vlan_tag_present(skb) || adapter->pvid || adapter->qnq_vid; } static int be_ipv6_tx_stall_chk(struct be_adapter *adapter, struct sk_buff *skb) @@ -946,7 +946,7 @@ static struct sk_buff *be_lancer_xmit_workarounds(struct be_adapter *adapter, eth_hdr_len = ntohs(skb->protocol) == ETH_P_8021Q ? VLAN_ETH_HLEN : ETH_HLEN; if (skb->len <= 60 && - (lancer_chip(adapter) || vlan_tx_tag_present(skb)) && + (lancer_chip(adapter) || skb_vlan_tag_present(skb)) && is_ipv4_pkt(skb)) { ip = (struct iphdr *)ip_hdr(skb); pskb_trim(skb, eth_hdr_len + ntohs(ip->tot_len)); @@ -964,7 +964,7 @@ static struct sk_buff *be_lancer_xmit_workarounds(struct be_adapter *adapter, * Manually insert VLAN in pkt. 
*/ if (skb->ip_summed != CHECKSUM_PARTIAL && - vlan_tx_tag_present(skb)) { + skb_vlan_tag_present(skb)) { skb = be_insert_vlan_in_pkt(adapter, skb, skip_hw_vlan); if (unlikely(!skb)) goto err; diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c index e54b1e39f9b4..93ff846e96f1 100644 --- a/drivers/net/ethernet/freescale/gianfar.c +++ b/drivers/net/ethernet/freescale/gianfar.c @@ -2170,7 +2170,7 @@ static inline void gfar_tx_checksum(struct sk_buff *skb, struct txfcb *fcb, void inline gfar_tx_vlan(struct sk_buff *skb, struct txfcb *fcb) { fcb->flags |= TXFCB_VLN; - fcb->vlctl = vlan_tx_tag_get(skb); + fcb->vlctl = skb_vlan_tag_get(skb); } static inline struct txbd8 *skip_txbd(struct txbd8 *bdp, int stride, @@ -2230,7 +2230,7 @@ static int gfar_start_xmit(struct sk_buff *skb, struct net_device *dev) regs = tx_queue->grp->regs; do_csum = (CHECKSUM_PARTIAL == skb->ip_summed); - do_vlan = vlan_tx_tag_present(skb); + do_vlan = skb_vlan_tag_present(skb); do_tstamp = (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) && priv->hwts_tx_en; diff --git a/drivers/net/ethernet/ibm/ehea/ehea_main.c b/drivers/net/ethernet/ibm/ehea/ehea_main.c index 566b17db135a..e8a1adb7a962 100644 --- a/drivers/net/ethernet/ibm/ehea/ehea_main.c +++ b/drivers/net/ethernet/ibm/ehea/ehea_main.c @@ -2064,9 +2064,9 @@ static int ehea_start_xmit(struct sk_buff *skb, struct net_device *dev) memset(swqe, 0, SWQE_HEADER_SIZE); atomic_dec(&pr->swqe_avail); - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { swqe->tx_control |= EHEA_SWQE_VLAN_INSERT; - swqe->vlan_tag = vlan_tx_tag_get(skb); + swqe->vlan_tag = skb_vlan_tag_get(skb); } pr->tx_packets++; diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c index 83140cbb5f01..9242982db3e0 100644 --- a/drivers/net/ethernet/intel/e1000/e1000_main.c +++ b/drivers/net/ethernet/intel/e1000/e1000_main.c @@ -3226,9 +3226,10 @@ static netdev_tx_t e1000_xmit_frame(struct sk_buff *skb, return NETDEV_TX_BUSY; } - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { tx_flags |= E1000_TX_FLAGS_VLAN; - tx_flags |= (vlan_tx_tag_get(skb) << E1000_TX_FLAGS_VLAN_SHIFT); + tx_flags |= (skb_vlan_tag_get(skb) << + E1000_TX_FLAGS_VLAN_SHIFT); } first = tx_ring->next_to_use; diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index 332a298e95b5..38cb586b1bf4 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -5463,8 +5463,8 @@ static int e1000_transfer_dhcp_info(struct e1000_adapter *adapter, struct e1000_hw *hw = &adapter->hw; u16 length, offset; - if (vlan_tx_tag_present(skb) && - !((vlan_tx_tag_get(skb) == adapter->hw.mng_cookie.vlan_id) && + if (skb_vlan_tag_present(skb) && + !((skb_vlan_tag_get(skb) == adapter->hw.mng_cookie.vlan_id) && (adapter->hw.mng_cookie.status & E1000_MNG_DHCP_COOKIE_STATUS_VLAN))) return 0; @@ -5603,9 +5603,10 @@ static netdev_tx_t e1000_xmit_frame(struct sk_buff *skb, if (e1000_maybe_stop_tx(tx_ring, count + 2)) return NETDEV_TX_BUSY; - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { tx_flags |= E1000_TX_FLAGS_VLAN; - tx_flags |= (vlan_tx_tag_get(skb) << E1000_TX_FLAGS_VLAN_SHIFT); + tx_flags |= (skb_vlan_tag_get(skb) << + E1000_TX_FLAGS_VLAN_SHIFT); } first = tx_ring->next_to_use; diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_main.c b/drivers/net/ethernet/intel/fm10k/fm10k_main.c index eb088b129bc7..caa43f7c2931 100644 --- 
a/drivers/net/ethernet/intel/fm10k/fm10k_main.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_main.c @@ -965,8 +965,8 @@ static void fm10k_tx_map(struct fm10k_ring *tx_ring, tx_desc = FM10K_TX_DESC(tx_ring, i); /* add HW VLAN tag */ - if (vlan_tx_tag_present(skb)) - tx_desc->vlan = cpu_to_le16(vlan_tx_tag_get(skb)); + if (skb_vlan_tag_present(skb)) + tx_desc->vlan = cpu_to_le16(skb_vlan_tag_get(skb)); else tx_desc->vlan = 0; diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c index 8811364b91cb..945b35d31c71 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c @@ -609,7 +609,7 @@ static netdev_tx_t fm10k_xmit_frame(struct sk_buff *skb, struct net_device *dev) int err; if ((skb->protocol == htons(ETH_P_8021Q)) && - !vlan_tx_tag_present(skb)) { + !skb_vlan_tag_present(skb)) { /* FM10K only supports hardware tagging, any tags in frame * are considered 2nd level or "outer" tags */ diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 04b441460bbd..9f536dd8e1ec 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -1772,8 +1772,8 @@ static int i40e_tx_prepare_vlan_flags(struct sk_buff *skb, u32 tx_flags = 0; /* if we have a HW VLAN tag being added, default to the HW one */ - if (vlan_tx_tag_present(skb)) { - tx_flags |= vlan_tx_tag_get(skb) << I40E_TX_FLAGS_VLAN_SHIFT; + if (skb_vlan_tag_present(skb)) { + tx_flags |= skb_vlan_tag_get(skb) << I40E_TX_FLAGS_VLAN_SHIFT; tx_flags |= I40E_TX_FLAGS_HW_VLAN; /* else if it is a SW VLAN, check the next protocol and store the tag */ } else if (protocol == htons(ETH_P_8021Q)) { diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c index 04c7c1557a0c..82c3798fdd36 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c @@ -1122,8 +1122,8 @@ static int i40e_tx_prepare_vlan_flags(struct sk_buff *skb, u32 tx_flags = 0; /* if we have a HW VLAN tag being added, default to the HW one */ - if (vlan_tx_tag_present(skb)) { - tx_flags |= vlan_tx_tag_get(skb) << I40E_TX_FLAGS_VLAN_SHIFT; + if (skb_vlan_tag_present(skb)) { + tx_flags |= skb_vlan_tag_get(skb) << I40E_TX_FLAGS_VLAN_SHIFT; tx_flags |= I40E_TX_FLAGS_HW_VLAN; /* else if it is a SW VLAN, check the next protocol and store the tag */ } else if (protocol == htons(ETH_P_8021Q)) { diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index ff59897a9463..6c25ec314183 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -5035,9 +5035,9 @@ netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb, skb_tx_timestamp(skb); - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { tx_flags |= IGB_TX_FLAGS_VLAN; - tx_flags |= (vlan_tx_tag_get(skb) << IGB_TX_FLAGS_VLAN_SHIFT); + tx_flags |= (skb_vlan_tag_get(skb) << IGB_TX_FLAGS_VLAN_SHIFT); } /* record initial flags and protocol */ diff --git a/drivers/net/ethernet/intel/igbvf/netdev.c b/drivers/net/ethernet/intel/igbvf/netdev.c index 63c807c9b21c..ad2b4897b392 100644 --- a/drivers/net/ethernet/intel/igbvf/netdev.c +++ b/drivers/net/ethernet/intel/igbvf/netdev.c @@ -2234,9 +2234,10 @@ static netdev_tx_t igbvf_xmit_frame_ring_adv(struct sk_buff *skb, return NETDEV_TX_BUSY; } - if (vlan_tx_tag_present(skb)) { + if 
(skb_vlan_tag_present(skb)) { tx_flags |= IGBVF_TX_FLAGS_VLAN; - tx_flags |= (vlan_tx_tag_get(skb) << IGBVF_TX_FLAGS_VLAN_SHIFT); + tx_flags |= (skb_vlan_tag_get(skb) << + IGBVF_TX_FLAGS_VLAN_SHIFT); } if (skb->protocol == htons(ETH_P_IP)) diff --git a/drivers/net/ethernet/intel/ixgb/ixgb_main.c b/drivers/net/ethernet/intel/ixgb/ixgb_main.c index aa87605b144a..11a1bdbe3fd9 100644 --- a/drivers/net/ethernet/intel/ixgb/ixgb_main.c +++ b/drivers/net/ethernet/intel/ixgb/ixgb_main.c @@ -1532,9 +1532,9 @@ ixgb_xmit_frame(struct sk_buff *skb, struct net_device *netdev) DESC_NEEDED))) return NETDEV_TX_BUSY; - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { tx_flags |= IXGB_TX_FLAGS_VLAN; - vlan_id = vlan_tx_tag_get(skb); + vlan_id = skb_vlan_tag_get(skb); } first = adapter->tx_ring.next_to_use; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 2ed2c7de2304..7bb421bfd84e 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -7217,8 +7217,8 @@ netdev_tx_t ixgbe_xmit_frame_ring(struct sk_buff *skb, first->gso_segs = 1; /* if we have a HW VLAN tag being added default to the HW one */ - if (vlan_tx_tag_present(skb)) { - tx_flags |= vlan_tx_tag_get(skb) << IXGBE_TX_FLAGS_VLAN_SHIFT; + if (skb_vlan_tag_present(skb)) { + tx_flags |= skb_vlan_tag_get(skb) << IXGBE_TX_FLAGS_VLAN_SHIFT; tx_flags |= IXGBE_TX_FLAGS_HW_VLAN; /* else if it is a SW VLAN check the next protocol and store the tag */ } else if (protocol == htons(ETH_P_8021Q)) { diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 62a0d8e0f17d..c9b49bfb51bb 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -3452,8 +3452,8 @@ static int ixgbevf_xmit_frame(struct sk_buff *skb, struct net_device *netdev) first->bytecount = skb->len; first->gso_segs = 1; - if (vlan_tx_tag_present(skb)) { - tx_flags |= vlan_tx_tag_get(skb); + if (skb_vlan_tag_present(skb)) { + tx_flags |= skb_vlan_tag_get(skb); tx_flags <<= IXGBE_TX_FLAGS_VLAN_SHIFT; tx_flags |= IXGBE_TX_FLAGS_VLAN; } diff --git a/drivers/net/ethernet/jme.c b/drivers/net/ethernet/jme.c index 44ce7d88f554..6e9a792097d3 100644 --- a/drivers/net/ethernet/jme.c +++ b/drivers/net/ethernet/jme.c @@ -2154,9 +2154,9 @@ jme_tx_csum(struct jme_adapter *jme, struct sk_buff *skb, u8 *flags) static inline void jme_tx_vlan(struct sk_buff *skb, __le16 *vlan, u8 *flags) { - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { *flags |= TXFLAG_TAGON; - *vlan = cpu_to_le16(vlan_tx_tag_get(skb)); + *vlan = cpu_to_le16(skb_vlan_tag_get(skb)); } } diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c index 867a6a3ef81f..d9f4498832a1 100644 --- a/drivers/net/ethernet/marvell/sky2.c +++ b/drivers/net/ethernet/marvell/sky2.c @@ -1895,14 +1895,14 @@ static netdev_tx_t sky2_xmit_frame(struct sk_buff *skb, ctrl = 0; /* Add VLAN tag, can piggyback on LRGLEN or ADDR64 */ - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { if (!le) { le = get_tx_le(sky2, &slot); le->addr = 0; le->opcode = OP_VLAN|HW_OWNER; } else le->opcode |= OP_VLAN; - le->length = cpu_to_be16(vlan_tx_tag_get(skb)); + le->length = cpu_to_be16(skb_vlan_tag_get(skb)); ctrl |= INS_VLAN; } @@ -2594,7 +2594,7 @@ static struct sk_buff *sky2_receive(struct net_device *dev, sky2->rx_next = (sky2->rx_next + 1) % sky2->rx_pending; 
prefetch(sky2->rx_ring + sky2->rx_next); - if (vlan_tx_tag_present(re->skb)) + if (skb_vlan_tag_present(re->skb)) count -= VLAN_HLEN; /* Account for vlan tag */ /* This chip has hardware problems that generates bogus status. diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index e3357bf523df..359bb1286eb5 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -682,8 +682,8 @@ u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb, if (dev->num_tc) return skb_tx_hash(dev, skb); - if (vlan_tx_tag_present(skb)) - up = vlan_tx_tag_get(skb) >> VLAN_PRIO_SHIFT; + if (skb_vlan_tag_present(skb)) + up = skb_vlan_tag_get(skb) >> VLAN_PRIO_SHIFT; return fallback(dev, skb) % rings_p_up + up * rings_p_up; } @@ -742,8 +742,8 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) goto tx_drop; } - if (vlan_tx_tag_present(skb)) - vlan_tag = vlan_tx_tag_get(skb); + if (skb_vlan_tag_present(skb)) + vlan_tag = skb_vlan_tag_get(skb); netdev_txq_bql_enqueue_prefetchw(ring->tx_queue); @@ -930,7 +930,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) real_size = (real_size / 16) & 0x3f; if (ring->bf_enabled && desc_size <= MAX_BF && !bounce && - !vlan_tx_tag_present(skb) && send_doorbell) { + !skb_vlan_tag_present(skb) && send_doorbell) { tx_desc->ctrl.bf_qpn = ring->doorbell_qpn | cpu_to_be32(real_size); @@ -952,7 +952,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) } else { tx_desc->ctrl.vlan_tag = cpu_to_be16(vlan_tag); tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN * - !!vlan_tx_tag_present(skb); + !!skb_vlan_tag_present(skb); tx_desc->ctrl.fence_size = real_size; /* Ensure new descriptor hits memory diff --git a/drivers/net/ethernet/natsemi/ns83820.c b/drivers/net/ethernet/natsemi/ns83820.c index 2552e550a78c..eb807b0dc72a 100644 --- a/drivers/net/ethernet/natsemi/ns83820.c +++ b/drivers/net/ethernet/natsemi/ns83820.c @@ -1122,12 +1122,12 @@ again: } #ifdef NS83820_VLAN_ACCEL_SUPPORT - if(vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { /* fetch the vlan tag info out of the * ancillary data if the vlan code * is using hw vlan acceleration */ - short tag = vlan_tx_tag_get(skb); + short tag = skb_vlan_tag_get(skb); extsts |= (EXTSTS_VPKT | htons(tag)); } #endif diff --git a/drivers/net/ethernet/neterion/s2io.c b/drivers/net/ethernet/neterion/s2io.c index f5e4b820128b..0529cad75b10 100644 --- a/drivers/net/ethernet/neterion/s2io.c +++ b/drivers/net/ethernet/neterion/s2io.c @@ -4045,8 +4045,8 @@ static netdev_tx_t s2io_xmit(struct sk_buff *skb, struct net_device *dev) } queue = 0; - if (vlan_tx_tag_present(skb)) - vlan_tag = vlan_tx_tag_get(skb); + if (skb_vlan_tag_present(skb)) + vlan_tag = skb_vlan_tag_get(skb); if (sp->config.tx_steering_type == TX_DEFAULT_STEERING) { if (skb->protocol == htons(ETH_P_IP)) { struct iphdr *ip; diff --git a/drivers/net/ethernet/neterion/vxge/vxge-main.c b/drivers/net/ethernet/neterion/vxge/vxge-main.c index cc0485e3c621..50d5604833ed 100644 --- a/drivers/net/ethernet/neterion/vxge/vxge-main.c +++ b/drivers/net/ethernet/neterion/vxge/vxge-main.c @@ -890,8 +890,8 @@ vxge_xmit(struct sk_buff *skb, struct net_device *dev) dev->name, __func__, __LINE__, fifo_hw, dtr, dtr_priv); - if (vlan_tx_tag_present(skb)) { - u16 vlan_tag = vlan_tx_tag_get(skb); + if (skb_vlan_tag_present(skb)) { + u16 vlan_tag = skb_vlan_tag_get(skb); vxge_hw_fifo_txdl_vlan_set(dtr, vlan_tag); } diff --git 
a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c index f39cae620f61..a41bb5e6b954 100644 --- a/drivers/net/ethernet/nvidia/forcedeth.c +++ b/drivers/net/ethernet/nvidia/forcedeth.c @@ -2462,9 +2462,9 @@ static netdev_tx_t nv_start_xmit_optimized(struct sk_buff *skb, NV_TX2_CHECKSUM_L3 | NV_TX2_CHECKSUM_L4 : 0; /* vlan tag */ - if (vlan_tx_tag_present(skb)) + if (skb_vlan_tag_present(skb)) start_tx->txvlan = cpu_to_le32(NV_TX3_VLAN_TAG_PRESENT | - vlan_tx_tag_get(skb)); + skb_vlan_tag_get(skb)); else start_tx->txvlan = 0; diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c index 613037584d08..a47fe67fdf58 100644 --- a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c +++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c @@ -1893,9 +1893,9 @@ netxen_tso_check(struct net_device *netdev, protocol = vh->h_vlan_encapsulated_proto; flags = FLAGS_VLAN_TAGGED; - } else if (vlan_tx_tag_present(skb)) { + } else if (skb_vlan_tag_present(skb)) { flags = FLAGS_VLAN_OOB; - vid = vlan_tx_tag_get(skb); + vid = skb_vlan_tag_get(skb); netxen_set_tx_vlan_tci(first_desc, vid); vlan_oob = 1; } diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c index d166e534925d..4d2496f28b85 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c @@ -321,8 +321,8 @@ static void qlcnic_send_filter(struct qlcnic_adapter *adapter, if (protocol == ETH_P_8021Q) { vh = (struct vlan_ethhdr *)skb->data; vlan_id = ntohs(vh->h_vlan_TCI); - } else if (vlan_tx_tag_present(skb)) { - vlan_id = vlan_tx_tag_get(skb); + } else if (skb_vlan_tag_present(skb)) { + vlan_id = skb_vlan_tag_get(skb); } } @@ -473,9 +473,9 @@ static int qlcnic_tx_pkt(struct qlcnic_adapter *adapter, flags = QLCNIC_FLAGS_VLAN_TAGGED; vlan_tci = ntohs(vh->h_vlan_TCI); protocol = ntohs(vh->h_vlan_encapsulated_proto); - } else if (vlan_tx_tag_present(skb)) { + } else if (skb_vlan_tag_present(skb)) { flags = QLCNIC_FLAGS_VLAN_OOB; - vlan_tci = vlan_tx_tag_get(skb); + vlan_tci = skb_vlan_tag_get(skb); } if (unlikely(adapter->tx_pvid)) { if (vlan_tci && !(adapter->flags & QLCNIC_TAGGING_ENABLED)) diff --git a/drivers/net/ethernet/qlogic/qlge/qlge_main.c b/drivers/net/ethernet/qlogic/qlge/qlge_main.c index 6c904a6cad2a..dc0058f90370 100644 --- a/drivers/net/ethernet/qlogic/qlge/qlge_main.c +++ b/drivers/net/ethernet/qlogic/qlge/qlge_main.c @@ -2660,11 +2660,11 @@ static netdev_tx_t qlge_send(struct sk_buff *skb, struct net_device *ndev) mac_iocb_ptr->frame_len = cpu_to_le16((u16) skb->len); - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { netif_printk(qdev, tx_queued, KERN_DEBUG, qdev->ndev, - "Adding a vlan tag %d.\n", vlan_tx_tag_get(skb)); + "Adding a vlan tag %d.\n", skb_vlan_tag_get(skb)); mac_iocb_ptr->flags3 |= OB_MAC_IOCB_V; - mac_iocb_ptr->vlan_tci = cpu_to_le16(vlan_tx_tag_get(skb)); + mac_iocb_ptr->vlan_tci = cpu_to_le16(skb_vlan_tag_get(skb)); } tso = ql_tso(skb, (struct ob_mac_tso_iocb_req *)mac_iocb_ptr); if (tso < 0) { diff --git a/drivers/net/ethernet/realtek/8139cp.c b/drivers/net/ethernet/realtek/8139cp.c index 9c31e46d1eee..d79e33b3c191 100644 --- a/drivers/net/ethernet/realtek/8139cp.c +++ b/drivers/net/ethernet/realtek/8139cp.c @@ -708,8 +708,8 @@ static void cp_tx (struct cp_private *cp) static inline u32 cp_tx_vlan_tag(struct sk_buff *skb) { - return vlan_tx_tag_present(skb) ? 
- TxVlanTag | swab16(vlan_tx_tag_get(skb)) : 0x00; + return skb_vlan_tag_present(skb) ? + TxVlanTag | swab16(skb_vlan_tag_get(skb)) : 0x00; } static void unwind_tx_frag_mapping(struct cp_private *cp, struct sk_buff *skb, diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index 3a280598a15a..cd286b0356ab 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -2073,8 +2073,8 @@ static int rtl8169_set_features(struct net_device *dev, static inline u32 rtl8169_tx_vlan_tag(struct sk_buff *skb) { - return (vlan_tx_tag_present(skb)) ? - TxVlanTag | swab16(vlan_tx_tag_get(skb)) : 0x00; + return (skb_vlan_tag_present(skb)) ? + TxVlanTag | swab16(skb_vlan_tag_get(skb)) : 0x00; } static void rtl8169_rx_vlan_tag(struct RxDesc *desc, struct sk_buff *skb) diff --git a/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c b/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c index b6612d6090ac..23545e1e605a 100644 --- a/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c +++ b/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c @@ -1272,7 +1272,7 @@ static netdev_tx_t sxgbe_xmit(struct sk_buff *skb, struct net_device *dev) if (unlikely(skb_is_gso(skb) && tqueue->prev_mss != cur_mss)) ctxt_desc_req = 1; - if (unlikely(vlan_tx_tag_present(skb) || + if (unlikely(skb_vlan_tag_present(skb) || ((skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) && tqueue->hwts_tx_en))) ctxt_desc_req = 1; diff --git a/drivers/net/ethernet/tehuti/tehuti.c b/drivers/net/ethernet/tehuti/tehuti.c index 6ab36d9ff2ab..a9cac8413e49 100644 --- a/drivers/net/ethernet/tehuti/tehuti.c +++ b/drivers/net/ethernet/tehuti/tehuti.c @@ -1650,9 +1650,9 @@ static netdev_tx_t bdx_tx_transmit(struct sk_buff *skb, txd_mss); } - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { /*Cut VLAN ID to 12 bits */ - txd_vlan_id = vlan_tx_tag_get(skb) & BITS_MASK(12); + txd_vlan_id = skb_vlan_tag_get(skb) & BITS_MASK(12); txd_vtag = 1; } diff --git a/drivers/net/ethernet/via/via-rhine.c b/drivers/net/ethernet/via/via-rhine.c index a191afc23b56..0ac76102b33d 100644 --- a/drivers/net/ethernet/via/via-rhine.c +++ b/drivers/net/ethernet/via/via-rhine.c @@ -1781,8 +1781,8 @@ static netdev_tx_t rhine_start_tx(struct sk_buff *skb, rp->tx_ring[entry].desc_length = cpu_to_le32(TXDESC | (skb->len >= ETH_ZLEN ? skb->len : ETH_ZLEN)); - if (unlikely(vlan_tx_tag_present(skb))) { - u16 vid_pcp = vlan_tx_tag_get(skb); + if (unlikely(skb_vlan_tag_present(skb))) { + u16 vid_pcp = skb_vlan_tag_get(skb); /* drop CFI/DEI bit, register needs VID and PCP */ vid_pcp = (vid_pcp & VLAN_VID_MASK) | @@ -1803,7 +1803,7 @@ static netdev_tx_t rhine_start_tx(struct sk_buff *skb, /* Non-x86 Todo: explicitly flush cache lines here. 
*/ - if (vlan_tx_tag_present(skb)) + if (skb_vlan_tag_present(skb)) /* Tx queues are bits 7-0 (first Tx queue: bit 7) */ BYTE_REG_BITS_ON(1 << 7, ioaddr + TQWake); diff --git a/drivers/net/ethernet/via/via-velocity.c b/drivers/net/ethernet/via/via-velocity.c index 282f83a63b67..c20206f83cc1 100644 --- a/drivers/net/ethernet/via/via-velocity.c +++ b/drivers/net/ethernet/via/via-velocity.c @@ -2611,8 +2611,8 @@ static netdev_tx_t velocity_xmit(struct sk_buff *skb, td_ptr->tdesc1.cmd = TCPLS_NORMAL + (tdinfo->nskb_dma + 1) * 16; - if (vlan_tx_tag_present(skb)) { - td_ptr->tdesc1.vlan = cpu_to_le16(vlan_tx_tag_get(skb)); + if (skb_vlan_tag_present(skb)) { + td_ptr->tdesc1.vlan = cpu_to_le16(skb_vlan_tag_get(skb)); td_ptr->tdesc1.TCR |= TCR0_VETAG; } diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 7df221788cd4..d0ed5694dd7d 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -645,7 +645,7 @@ static void macvtap_skb_to_vnet_hdr(struct macvtap_queue *q, if (skb->ip_summed == CHECKSUM_PARTIAL) { vnet_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; - if (vlan_tx_tag_present(skb)) + if (skb_vlan_tag_present(skb)) vnet_hdr->csum_start = cpu_to_macvtap16(q, skb_checksum_start_offset(skb) + VLAN_HLEN); else @@ -821,13 +821,13 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q, total = vnet_hdr_len; total += skb->len; - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { struct { __be16 h_vlan_proto; __be16 h_vlan_TCI; } veth; veth.h_vlan_proto = skb->vlan_proto; - veth.h_vlan_TCI = htons(vlan_tx_tag_get(skb)); + veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb)); vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto); total += VLAN_HLEN; diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 74fdf1158448..be196e89ab6c 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1260,7 +1260,7 @@ static ssize_t tun_put_user(struct tun_struct *tun, int vlan_hlen = 0; int vnet_hdr_sz = 0; - if (vlan_tx_tag_present(skb)) + if (skb_vlan_tag_present(skb)) vlan_hlen = VLAN_HLEN; if (tun->flags & IFF_VNET_HDR) @@ -1337,7 +1337,7 @@ static ssize_t tun_put_user(struct tun_struct *tun, } veth; veth.h_vlan_proto = skb->vlan_proto; - veth.h_vlan_TCI = htons(vlan_tx_tag_get(skb)); + veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb)); vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto); diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index b23426e4952c..e519e6a269b9 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -1421,10 +1421,10 @@ static int msdn_giant_send_check(struct sk_buff *skb) static inline void rtl_tx_vlan_tag(struct tx_desc *desc, struct sk_buff *skb) { - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { u32 opts2; - opts2 = TX_VLAN_TAG | swab16(vlan_tx_tag_get(skb)); + opts2 = TX_VLAN_TAG | swab16(skb_vlan_tag_get(skb)); desc->opts2 |= cpu_to_le32(opts2); } } diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c index 31439818c27e..294214c15292 100644 --- a/drivers/net/vmxnet3/vmxnet3_drv.c +++ b/drivers/net/vmxnet3/vmxnet3_drv.c @@ -1038,9 +1038,9 @@ vmxnet3_tq_xmit(struct sk_buff *skb, struct vmxnet3_tx_queue *tq, le32_add_cpu(&tq->shared->txNumDeferred, 1); } - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { gdesc->txd.ti = 1; - gdesc->txd.tci = vlan_tx_tag_get(skb); + gdesc->txd.tci = skb_vlan_tag_get(skb); } /* finally flips the GEN bit of the SOP desc. 
*/ diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 3a18d8ed89ca..985359dd6033 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1561,7 +1561,7 @@ static int vxlan6_xmit_skb(struct vxlan_sock *vs, min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len + VXLAN_HLEN + sizeof(struct ipv6hdr) - + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0); + + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); /* Need space for new headers (invalidates iph ptr) */ err = skb_cow_head(skb, min_headroom); @@ -1607,7 +1607,7 @@ int vxlan_xmit_skb(struct vxlan_sock *vs, min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len + VXLAN_HLEN + sizeof(struct iphdr) - + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0); + + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); /* Need space for new headers (invalidates iph ptr) */ err = skb_cow_head(skb, min_headroom); diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index 625227ad16ee..dd4ab8d73d34 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ -2800,12 +2800,12 @@ static void qeth_l3_fill_header(struct qeth_card *card, struct qeth_hdr *hdr, * before we're going to overwrite this location with next hop ip. * v6 uses passthrough, v4 sets the tag in the QDIO header. */ - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { if ((ipv == 4) || (card->info.type == QETH_CARD_TYPE_IQD)) hdr->hdr.l3.ext_flags = QETH_HDR_EXT_VLAN_FRAME; else hdr->hdr.l3.ext_flags = QETH_HDR_EXT_INCLUDE_VLAN_TAG; - hdr->hdr.l3.vlan_id = vlan_tx_tag_get(skb); + hdr->hdr.l3.vlan_id = skb_vlan_tag_get(skb); } hdr->hdr.l3.length = skb->len - sizeof(struct qeth_hdr); @@ -2986,7 +2986,7 @@ static int qeth_l3_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) skb_pull(new_skb, ETH_HLEN); } - if (ipv != 4 && vlan_tx_tag_present(new_skb)) { + if (ipv != 4 && skb_vlan_tag_present(new_skb)) { skb_push(new_skb, VLAN_HLEN); skb_copy_to_linear_data(new_skb, new_skb->data + 4, 4); skb_copy_to_linear_data_offset(new_skb, 4, @@ -2995,7 +2995,7 @@ static int qeth_l3_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) new_skb->data + 12, 4); tag = (u16 *)(new_skb->data + 12); *tag = __constant_htons(ETH_P_8021Q); - *(tag + 1) = htons(vlan_tx_tag_get(new_skb)); + *(tag + 1) = htons(skb_vlan_tag_get(new_skb)); } } diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 14419a8ccbb6..bcaf4cabb858 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -469,7 +469,7 @@ static int peek_head_len(struct sock *sk) head = skb_peek(&sk->sk_receive_queue); if (likely(head)) { len = head->len; - if (vlan_tx_tag_present(head)) + if (skb_vlan_tag_present(head)) len += VLAN_HLEN; } diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 515a35e2a48a..bea465f24ebb 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -78,9 +78,9 @@ static inline bool is_vlan_dev(struct net_device *dev) return dev->priv_flags & IFF_802_1Q_VLAN; } -#define vlan_tx_tag_present(__skb) ((__skb)->vlan_tci & VLAN_TAG_PRESENT) -#define vlan_tx_tag_get(__skb) ((__skb)->vlan_tci & ~VLAN_TAG_PRESENT) -#define vlan_tx_tag_get_id(__skb) ((__skb)->vlan_tci & VLAN_VID_MASK) +#define skb_vlan_tag_present(__skb) ((__skb)->vlan_tci & VLAN_TAG_PRESENT) +#define skb_vlan_tag_get(__skb) ((__skb)->vlan_tci & ~VLAN_TAG_PRESENT) +#define skb_vlan_tag_get_id(__skb) ((__skb)->vlan_tci & VLAN_VID_MASK) /** * struct vlan_pcpu_stats - VLAN percpu rx/tx stats @@ -376,7 +376,7 @@ static inline struct sk_buff 
*vlan_insert_tag_set_proto(struct sk_buff *skb, static inline struct sk_buff *__vlan_hwaccel_push_inside(struct sk_buff *skb) { skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, - vlan_tx_tag_get(skb)); + skb_vlan_tag_get(skb)); if (likely(skb)) skb->vlan_tci = 0; return skb; @@ -393,7 +393,7 @@ static inline struct sk_buff *__vlan_hwaccel_push_inside(struct sk_buff *skb) */ static inline struct sk_buff *vlan_hwaccel_push_inside(struct sk_buff *skb) { - if (vlan_tx_tag_present(skb)) + if (skb_vlan_tag_present(skb)) skb = __vlan_hwaccel_push_inside(skb); return skb; } @@ -442,8 +442,8 @@ static inline int __vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) static inline int __vlan_hwaccel_get_tag(const struct sk_buff *skb, u16 *vlan_tci) { - if (vlan_tx_tag_present(skb)) { - *vlan_tci = vlan_tx_tag_get(skb); + if (skb_vlan_tag_present(skb)) { + *vlan_tci = skb_vlan_tag_get(skb); return 0; } else { *vlan_tci = 0; @@ -480,7 +480,7 @@ static inline __be16 vlan_get_protocol(const struct sk_buff *skb) { __be16 protocol = 0; - if (vlan_tx_tag_present(skb) || + if (skb_vlan_tag_present(skb) || skb->protocol != cpu_to_be16(ETH_P_8021Q)) protocol = skb->protocol; else { diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index fe6e7aac3c56..2342bf12cb78 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -121,7 +121,7 @@ static inline __be16 tc_skb_protocol(const struct sk_buff *skb) * vlan accelerated path. In that case, use skb->vlan_proto * as the original vlan header was already stripped. */ - if (vlan_tx_tag_present(skb)) + if (skb_vlan_tag_present(skb)) return skb->vlan_proto; return skb->protocol; } diff --git a/include/trace/events/net.h b/include/trace/events/net.h index 1de256b35807..49cc7c3de252 100644 --- a/include/trace/events/net.h +++ b/include/trace/events/net.h @@ -40,9 +40,9 @@ TRACE_EVENT(net_dev_start_xmit, __assign_str(name, dev->name); __entry->queue_mapping = skb->queue_mapping; __entry->skbaddr = skb; - __entry->vlan_tagged = vlan_tx_tag_present(skb); + __entry->vlan_tagged = skb_vlan_tag_present(skb); __entry->vlan_proto = ntohs(skb->vlan_proto); - __entry->vlan_tci = vlan_tx_tag_get(skb); + __entry->vlan_tci = skb_vlan_tag_get(skb); __entry->protocol = ntohs(skb->protocol); __entry->ip_summed = skb->ip_summed; __entry->len = skb->len; @@ -174,9 +174,9 @@ DECLARE_EVENT_CLASS(net_dev_rx_verbose_template, #endif __entry->queue_mapping = skb->queue_mapping; __entry->skbaddr = skb; - __entry->vlan_tagged = vlan_tx_tag_present(skb); + __entry->vlan_tagged = skb_vlan_tag_present(skb); __entry->vlan_proto = ntohs(skb->vlan_proto); - __entry->vlan_tci = vlan_tx_tag_get(skb); + __entry->vlan_tci = skb_vlan_tag_get(skb); __entry->protocol = ntohs(skb->protocol); __entry->ip_summed = skb->ip_summed; __entry->hash = skb->hash; diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index 90cc2bdd4064..61bf2a06e85d 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -9,7 +9,7 @@ bool vlan_do_receive(struct sk_buff **skbp) { struct sk_buff *skb = *skbp; __be16 vlan_proto = skb->vlan_proto; - u16 vlan_id = vlan_tx_tag_get_id(skb); + u16 vlan_id = skb_vlan_tag_get_id(skb); struct net_device *vlan_dev; struct vlan_pcpu_stats *rx_stats; diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index c190d22b6b3d..65728e0dc4ff 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -66,17 +66,17 @@ static int brnf_pass_vlan_indev __read_mostly = 0; #endif #define IS_IP(skb) \ - (!vlan_tx_tag_present(skb) && 
skb->protocol == htons(ETH_P_IP)) + (!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IP)) #define IS_IPV6(skb) \ - (!vlan_tx_tag_present(skb) && skb->protocol == htons(ETH_P_IPV6)) + (!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IPV6)) #define IS_ARP(skb) \ - (!vlan_tx_tag_present(skb) && skb->protocol == htons(ETH_P_ARP)) + (!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_ARP)) static inline __be16 vlan_proto(const struct sk_buff *skb) { - if (vlan_tx_tag_present(skb)) + if (skb_vlan_tag_present(skb)) return skb->protocol; else if (skb->protocol == htons(ETH_P_8021Q)) return vlan_eth_hdr(skb)->h_vlan_encapsulated_proto; @@ -436,11 +436,11 @@ static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct struct net_device *vlan, *br; br = bridge_parent(dev); - if (brnf_pass_vlan_indev == 0 || !vlan_tx_tag_present(skb)) + if (brnf_pass_vlan_indev == 0 || !skb_vlan_tag_present(skb)) return br; vlan = __vlan_find_dev_deep_rcu(br, skb->vlan_proto, - vlan_tx_tag_get(skb) & VLAN_VID_MASK); + skb_vlan_tag_get(skb) & VLAN_VID_MASK); return vlan ? vlan : br; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index aea3d1339b3f..d808d766334d 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -628,8 +628,8 @@ static inline int br_vlan_get_tag(const struct sk_buff *skb, u16 *vid) { int err = 0; - if (vlan_tx_tag_present(skb)) - *vid = vlan_tx_tag_get(skb) & VLAN_VID_MASK; + if (skb_vlan_tag_present(skb)) + *vid = skb_vlan_tag_get(skb) & VLAN_VID_MASK; else { *vid = 0; err = -EINVAL; diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 97b8ddf57363..13013fe8db24 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -187,7 +187,7 @@ bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v, * sent from vlan device on the bridge device, it does not have * HW accelerated vlan tag. 
*/ - if (unlikely(!vlan_tx_tag_present(skb) && + if (unlikely(!skb_vlan_tag_present(skb) && skb->protocol == proto)) { skb = skb_vlan_untag(skb); if (unlikely(!skb)) @@ -200,7 +200,7 @@ bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v, /* Protocol-mismatch, empty out vlan_tci for new tag */ skb_push(skb, ETH_HLEN); skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, - vlan_tx_tag_get(skb)); + skb_vlan_tag_get(skb)); if (unlikely(!skb)) return false; diff --git a/net/bridge/netfilter/ebt_vlan.c b/net/bridge/netfilter/ebt_vlan.c index 8d3f8c7651f0..618568888128 100644 --- a/net/bridge/netfilter/ebt_vlan.c +++ b/net/bridge/netfilter/ebt_vlan.c @@ -45,8 +45,8 @@ ebt_vlan_mt(const struct sk_buff *skb, struct xt_action_param *par) /* VLAN encapsulated Type/Length field, given from orig frame */ __be16 encap; - if (vlan_tx_tag_present(skb)) { - TCI = vlan_tx_tag_get(skb); + if (skb_vlan_tag_present(skb)) { + TCI = skb_vlan_tag_get(skb); encap = skb->protocol; } else { const struct vlan_hdr *fp; diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index d9a8c05d995d..91180a7fc943 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -133,7 +133,7 @@ ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb, __be16 ethproto; int verdict, i; - if (vlan_tx_tag_present(skb)) + if (skb_vlan_tag_present(skb)) ethproto = htons(ETH_P_8021Q); else ethproto = h->h_proto; diff --git a/net/core/dev.c b/net/core/dev.c index 805456147c30..1e325adc4367 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2578,7 +2578,7 @@ netdev_features_t netif_skb_features(struct sk_buff *skb) if (skb->encapsulation) features &= dev->hw_enc_features; - if (!vlan_tx_tag_present(skb)) { + if (!skb_vlan_tag_present(skb)) { if (unlikely(protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))) { struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; @@ -2659,7 +2659,7 @@ out: static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, netdev_features_t features) { - if (vlan_tx_tag_present(skb) && + if (skb_vlan_tag_present(skb) && !vlan_hw_offload_capable(features, skb->vlan_proto)) skb = __vlan_hwaccel_push_inside(skb); return skb; @@ -3676,7 +3676,7 @@ ncls: if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) goto drop; - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { if (pt_prev) { ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; @@ -3708,8 +3708,8 @@ ncls: } } - if (unlikely(vlan_tx_tag_present(skb))) { - if (vlan_tx_tag_get_id(skb)) + if (unlikely(skb_vlan_tag_present(skb))) { + if (skb_vlan_tag_get_id(skb)) skb->pkt_type = PACKET_OTHERHOST; /* Note: we might in the future use prio bits * and set skb->priority like in vlan_do_receive() diff --git a/net/core/netpoll.c b/net/core/netpoll.c index e0ad5d16c9c5..c126a878c47c 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -77,7 +77,7 @@ static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, features = netif_skb_features(skb); - if (vlan_tx_tag_present(skb) && + if (skb_vlan_tag_present(skb) && !vlan_hw_offload_capable(features, skb->vlan_proto)) { skb = __vlan_hwaccel_push_inside(skb); if (unlikely(!skb)) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 5a2a2e887a12..56db472e9b86 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4197,7 +4197,7 @@ struct sk_buff *skb_vlan_untag(struct sk_buff *skb) struct vlan_hdr *vhdr; u16 vlan_tci; - if (unlikely(vlan_tx_tag_present(skb))) { + if 
(unlikely(skb_vlan_tag_present(skb))) { /* vlan_tci is already set-up so leave this for another time */ return skb; } @@ -4283,7 +4283,7 @@ int skb_vlan_pop(struct sk_buff *skb) __be16 vlan_proto; int err; - if (likely(vlan_tx_tag_present(skb))) { + if (likely(skb_vlan_tag_present(skb))) { skb->vlan_tci = 0; } else { if (unlikely((skb->protocol != htons(ETH_P_8021Q) && @@ -4313,7 +4313,7 @@ EXPORT_SYMBOL(skb_vlan_pop); int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) { - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { unsigned int offset = skb->data - skb_mac_header(skb); int err; @@ -4323,7 +4323,7 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) */ __skb_push(skb, offset); err = __vlan_insert_tag(skb, skb->vlan_proto, - vlan_tx_tag_get(skb)); + skb_vlan_tag_get(skb)); if (err) return err; skb->protocol = skb->vlan_proto; diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c index 5b52046ec7a2..23744c7a9718 100644 --- a/net/ipv4/geneve.c +++ b/net/ipv4/geneve.c @@ -119,7 +119,7 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len + GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr) - + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0); + + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); err = skb_cow_head(skb, min_headroom); if (unlikely(err)) { diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 770064c83711..b4cffe686126 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -212,7 +212,7 @@ static int pop_vlan(struct sk_buff *skb, struct sw_flow_key *key) int err; err = skb_vlan_pop(skb); - if (vlan_tx_tag_present(skb)) + if (skb_vlan_tag_present(skb)) invalidate_flow_key(key); else key->eth.tci = 0; @@ -222,7 +222,7 @@ static int pop_vlan(struct sk_buff *skb, struct sw_flow_key *key) static int push_vlan(struct sk_buff *skb, struct sw_flow_key *key, const struct ovs_action_push_vlan *vlan) { - if (vlan_tx_tag_present(skb)) + if (skb_vlan_tag_present(skb)) invalidate_flow_key(key); else key->eth.tci = vlan->vlan_tci; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 4e9a5f035cbc..54854e3ecd83 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -419,7 +419,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, if (!dp_ifindex) return -ENODEV; - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) return -ENOMEM; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index da2fae0873a5..df334fe43d7f 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -70,7 +70,7 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags, { struct flow_stats *stats; int node = numa_node_id(); - int len = skb->len + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0); + int len = skb->len + (skb_vlan_tag_present(skb) ? 
VLAN_HLEN : 0); stats = rcu_dereference(flow->stats[node]); @@ -472,7 +472,7 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) */ key->eth.tci = 0; - if (vlan_tx_tag_present(skb)) + if (skb_vlan_tag_present(skb)) key->eth.tci = htons(skb->vlan_tci); else if (eth->h_proto == htons(ETH_P_8021Q)) if (unlikely(parse_vlan(skb, key))) diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index d4168c442db5..e9aedb7c7106 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -166,7 +166,7 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len + tunnel_hlen + sizeof(struct iphdr) - + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0); + + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) { int head_delta = SKB_DATA_ALIGN(min_headroom - skb_headroom(skb) + diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 2034c6d9cb5a..464739aac0f3 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -480,7 +480,8 @@ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, stats = this_cpu_ptr(vport->percpu_stats); u64_stats_update_begin(&stats->syncp); stats->rx_packets++; - stats->rx_bytes += skb->len + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0); + stats->rx_bytes += skb->len + + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); u64_stats_update_end(&stats->syncp); OVS_CB(skb)->input_vport = vport; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 0f02668dc219..d37075b0d6d5 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -986,8 +986,8 @@ static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc, static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc, struct tpacket3_hdr *ppd) { - if (vlan_tx_tag_present(pkc->skb)) { - ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb); + if (skb_vlan_tag_present(pkc->skb)) { + ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb); ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto); ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else { @@ -2000,8 +2000,8 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, h.h2->tp_net = netoff; h.h2->tp_sec = ts.tv_sec; h.h2->tp_nsec = ts.tv_nsec; - if (vlan_tx_tag_present(skb)) { - h.h2->tp_vlan_tci = vlan_tx_tag_get(skb); + if (skb_vlan_tag_present(skb)) { + h.h2->tp_vlan_tci = skb_vlan_tag_get(skb); h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto); status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else { @@ -3010,8 +3010,8 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, aux.tp_snaplen = skb->len; aux.tp_mac = 0; aux.tp_net = skb_network_offset(skb); - if (vlan_tx_tag_present(skb)) { - aux.tp_vlan_tci = vlan_tx_tag_get(skb); + if (skb_vlan_tag_present(skb)) { + aux.tp_vlan_tci = skb_vlan_tag_get(skb); aux.tp_vlan_tpid = ntohs(skb->vlan_proto); aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else { diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 2159981b604e..b5294ce20cd4 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -176,7 +176,7 @@ META_COLLECTOR(int_vlan_tag) { unsigned short tag; - tag = vlan_tx_tag_get(skb); + tag = skb_vlan_tag_get(skb); if (!tag && __vlan_get_tag(skb, &tag)) *err = -1; else diff --git a/net/wireless/util.c b/net/wireless/util.c index d0ac795445b7..1d2fcfad06cc 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -708,8 +708,8 @@ 
unsigned int cfg80211_classify8021d(struct sk_buff *skb, if (skb->priority >= 256 && skb->priority <= 263) return skb->priority - 256; - if (vlan_tx_tag_present(skb)) { - vlan_priority = (vlan_tx_tag_get(skb) & VLAN_PRIO_MASK) + if (skb_vlan_tag_present(skb)) { + vlan_priority = (skb_vlan_tag_get(skb) & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; if (vlan_priority > 0) return vlan_priority; -- cgit v1.2.3 From dd22f551ac0ad366f92f601835f6623b83adc331 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 7 Jan 2015 18:05:34 +0200 Subject: block: Change direct_access calling convention In order to support accesses to larger chunks of memory, pass in a 'size' parameter (counted in bytes), and return the amount available at that address. Add a new helper function, bdev_direct_access(), to handle common functionality including partition handling, checking the length requested is positive, checking for the sector being page-aligned, and checking the length of the request does not pass the end of the partition. Signed-off-by: Matthew Wilcox Reviewed-by: Jan Kara Reviewed-by: Boaz Harrosh Signed-off-by: Jens Axboe --- Documentation/filesystems/xip.txt | 15 +++++++++------ arch/powerpc/sysdev/axonram.c | 17 ++++------------- drivers/block/brd.c | 14 +++++++------- drivers/s390/block/dcssblk.c | 21 +++++++++----------- fs/block_dev.c | 40 +++++++++++++++++++++++++++++++++++++++ fs/ext2/xip.c | 31 +++++++++++++----------------- include/linux/blkdev.h | 6 ++++-- 7 files changed, 86 insertions(+), 58 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/xip.txt b/Documentation/filesystems/xip.txt index 0466ee569278..b77472949ede 100644 --- a/Documentation/filesystems/xip.txt +++ b/Documentation/filesystems/xip.txt @@ -28,12 +28,15 @@ Implementation Execute-in-place is implemented in three steps: block device operation, address space operation, and file operations. -A block device operation named direct_access is used to retrieve a -reference (pointer) to a block on-disk. The reference is supposed to be -cpu-addressable, physical address and remain valid until the release operation -is performed. A struct block_device reference is used to address the device, -and a sector_t argument is used to identify the individual block. As an -alternative, memory technology devices can be used for this. +A block device operation named direct_access is used to translate the +block device sector number to a page frame number (pfn) that identifies +the physical page for the memory. It also returns a kernel virtual +address that can be used to access the memory. + +The direct_access method takes a 'size' parameter that indicates the +number of bytes being requested. The function should return the number +of bytes that can be contiguously accessed at that offset. It may also +return a negative errno if an error occurs. 
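To make the new calling convention concrete, here is a minimal sketch of a direct_access implementation for a hypothetical device backed by a single physically contiguous, permanently mapped region (struct sketch_dev and its fields are assumptions made for illustration, not part of this patch):

/* Sketch only: dev->virt_base, dev->phys_base and dev->size are
 * assumed fields describing one contiguous, always-mapped region. */
static long sketch_direct_access(struct block_device *bdev, sector_t sector,
				 void **kaddr, unsigned long *pfn, long size)
{
	struct sketch_dev *dev = bdev->bd_disk->private_data;
	loff_t offset = (loff_t)sector << 9;	/* 512-byte sectors */

	*kaddr = dev->virt_base + offset;
	*pfn = (dev->phys_base + offset) >> PAGE_SHIFT;

	/* The whole remainder of the device is contiguous; the
	 * bdev_direct_access() helper below clamps this to the
	 * 'size' that was actually requested. */
	return dev->size - offset;
}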
The block device operation is optional, these block devices support it as of today: diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c index f532c92bf99d..20f8afe855d1 100644 --- a/arch/powerpc/sysdev/axonram.c +++ b/arch/powerpc/sysdev/axonram.c @@ -139,26 +139,17 @@ axon_ram_make_request(struct request_queue *queue, struct bio *bio) * axon_ram_direct_access - direct_access() method for block device * @device, @sector, @data: see block_device_operations method */ -static int +static long axon_ram_direct_access(struct block_device *device, sector_t sector, - void **kaddr, unsigned long *pfn) + void **kaddr, unsigned long *pfn, long size) { struct axon_ram_bank *bank = device->bd_disk->private_data; - loff_t offset; - - offset = sector; - if (device->bd_part != NULL) - offset += device->bd_part->start_sect; - offset <<= AXON_RAM_SECTOR_SHIFT; - if (offset >= bank->size) { - dev_err(&bank->device->dev, "Access outside of address space\n"); - return -ERANGE; - } + loff_t offset = (loff_t)sector << AXON_RAM_SECTOR_SHIFT; *kaddr = (void *)(bank->ph_addr + offset); *pfn = virt_to_phys(kaddr) >> PAGE_SHIFT; - return 0; + return bank->size - offset; } static const struct block_device_operations axon_ram_devops = { diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 3598110d2cef..89e90ec52f28 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -370,25 +370,25 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector, } #ifdef CONFIG_BLK_DEV_XIP -static int brd_direct_access(struct block_device *bdev, sector_t sector, - void **kaddr, unsigned long *pfn) +static long brd_direct_access(struct block_device *bdev, sector_t sector, + void **kaddr, unsigned long *pfn, long size) { struct brd_device *brd = bdev->bd_disk->private_data; struct page *page; if (!brd) return -ENODEV; - if (sector & (PAGE_SECTORS-1)) - return -EINVAL; - if (sector + PAGE_SECTORS > get_capacity(bdev->bd_disk)) - return -ERANGE; page = brd_insert_page(brd, sector); if (!page) return -ENOSPC; *kaddr = page_address(page); *pfn = page_to_pfn(page); - return 0; + /* + * TODO: If size > PAGE_SIZE, we could look to see if the next page in + * the file happens to be mapped to the next page of physical RAM. 
+ */ + return PAGE_SIZE; } #endif diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index b550c8c8d010..31d6884f3351 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -28,8 +28,8 @@ static int dcssblk_open(struct block_device *bdev, fmode_t mode); static void dcssblk_release(struct gendisk *disk, fmode_t mode); static void dcssblk_make_request(struct request_queue *q, struct bio *bio); -static int dcssblk_direct_access(struct block_device *bdev, sector_t secnum, - void **kaddr, unsigned long *pfn); +static long dcssblk_direct_access(struct block_device *bdev, sector_t secnum, + void **kaddr, unsigned long *pfn, long size); static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0"; @@ -866,25 +866,22 @@ fail: bio_io_error(bio); } -static int +static long dcssblk_direct_access (struct block_device *bdev, sector_t secnum, - void **kaddr, unsigned long *pfn) + void **kaddr, unsigned long *pfn, long size) { struct dcssblk_dev_info *dev_info; - unsigned long pgoff; + unsigned long offset, dev_sz; dev_info = bdev->bd_disk->private_data; if (!dev_info) return -ENODEV; - if (secnum % (PAGE_SIZE/512)) - return -EINVAL; - pgoff = secnum / (PAGE_SIZE / 512); - if ((pgoff+1)*PAGE_SIZE-1 > dev_info->end - dev_info->start) - return -ERANGE; - *kaddr = (void *) (dev_info->start+pgoff*PAGE_SIZE); + dev_sz = dev_info->end - dev_info->start; + offset = secnum * 512; + *kaddr = (void *) (dev_info->start + offset); *pfn = virt_to_phys(*kaddr) >> PAGE_SHIFT; - return 0; + return dev_sz - offset; } static void diff --git a/fs/block_dev.c b/fs/block_dev.c index b48c41bf0f86..f314c2c0567d 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -429,6 +429,46 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, } EXPORT_SYMBOL_GPL(bdev_write_page); +/** + * bdev_direct_access() - Get the address for directly-accessibly memory + * @bdev: The device containing the memory + * @sector: The offset within the device + * @addr: Where to put the address of the memory + * @pfn: The Page Frame Number for the memory + * @size: The number of bytes requested + * + * If a block device is made up of directly addressable memory, this function + * will tell the caller the PFN and the address of the memory. The address + * may be directly dereferenced within the kernel without the need to call + * ioremap(), kmap() or similar. The PFN is suitable for inserting into + * page tables. + * + * Return: negative errno if an error occurs, otherwise the number of bytes + * accessible at this address. 
+ */ +long bdev_direct_access(struct block_device *bdev, sector_t sector, + void **addr, unsigned long *pfn, long size) +{ + long avail; + const struct block_device_operations *ops = bdev->bd_disk->fops; + + if (size < 0) + return size; + if (!ops->direct_access) + return -EOPNOTSUPP; + if ((sector + DIV_ROUND_UP(size, 512)) > + part_nr_sects_read(bdev->bd_part)) + return -ERANGE; + sector += get_start_sect(bdev); + if (sector % (PAGE_SIZE / 512)) + return -EINVAL; + avail = ops->direct_access(bdev, sector, addr, pfn, size); + if (!avail) + return -ERANGE; + return min(avail, size); +} +EXPORT_SYMBOL_GPL(bdev_direct_access); + /* * pseudo-fs */ diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c index e98171a11cfe..bbc5fec6ff7f 100644 --- a/fs/ext2/xip.c +++ b/fs/ext2/xip.c @@ -13,18 +13,12 @@ #include "ext2.h" #include "xip.h" -static inline int -__inode_direct_access(struct inode *inode, sector_t block, - void **kaddr, unsigned long *pfn) +static inline long __inode_direct_access(struct inode *inode, sector_t block, + void **kaddr, unsigned long *pfn, long size) { struct block_device *bdev = inode->i_sb->s_bdev; - const struct block_device_operations *ops = bdev->bd_disk->fops; - sector_t sector; - - sector = block * (PAGE_SIZE / 512); /* ext2 block to bdev sector */ - - BUG_ON(!ops->direct_access); - return ops->direct_access(bdev, sector, kaddr, pfn); + sector_t sector = block * (PAGE_SIZE / 512); + return bdev_direct_access(bdev, sector, kaddr, pfn, size); } static inline int @@ -53,12 +47,13 @@ ext2_clear_xip_target(struct inode *inode, sector_t block) { void *kaddr; unsigned long pfn; - int rc; + long size; - rc = __inode_direct_access(inode, block, &kaddr, &pfn); - if (!rc) - clear_page(kaddr); - return rc; + size = __inode_direct_access(inode, block, &kaddr, &pfn, PAGE_SIZE); + if (size < 0) + return size; + clear_page(kaddr); + return 0; } void ext2_xip_verify_sb(struct super_block *sb) @@ -77,7 +72,7 @@ void ext2_xip_verify_sb(struct super_block *sb) int ext2_get_xip_mem(struct address_space *mapping, pgoff_t pgoff, int create, void **kmem, unsigned long *pfn) { - int rc; + long rc; sector_t block; /* first, retrieve the sector number */ @@ -86,6 +81,6 @@ int ext2_get_xip_mem(struct address_space *mapping, pgoff_t pgoff, int create, return rc; /* retrieve address of the target data */ - rc = __inode_direct_access(mapping->host, block, kmem, pfn); - return rc; + rc = __inode_direct_access(mapping->host, block, kmem, pfn, PAGE_SIZE); + return (rc < 0) ? 
rc : 0; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 92f4b4b288dd..e9086be6d9a0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1601,8 +1601,8 @@ struct block_device_operations { int (*rw_page)(struct block_device *, sector_t, struct page *, int rw); int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); - int (*direct_access) (struct block_device *, sector_t, - void **, unsigned long *); + long (*direct_access)(struct block_device *, sector_t, + void **, unsigned long *pfn, long size); unsigned int (*check_events) (struct gendisk *disk, unsigned int clearing); /* ->media_changed() is DEPRECATED, use ->check_events() instead */ @@ -1620,6 +1620,8 @@ extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int, extern int bdev_read_page(struct block_device *, sector_t, struct page *); extern int bdev_write_page(struct block_device *, sector_t, struct page *, struct writeback_control *); +extern long bdev_direct_access(struct block_device *, sector_t, void **addr, + unsigned long *pfn, long size); #else /* CONFIG_BLOCK */ struct block_device; -- cgit v1.2.3 From a7d19057e7160a566bad9b2ba070a391fb78df96 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Sat, 12 Jul 2014 12:10:04 +0200 Subject: clk: sunxi: Remove custom phase function Now that we don't have any user left for our custom phase function, we can safely remove this hack from the code. Signed-off-by: Maxime Ripard Reviewed-by: Chen-Yu Tsai Tested-by: Chen-Yu Tsai --- drivers/clk/sunxi/clk-sunxi.c | 37 ------------------------------------- include/linux/clk/sunxi.h | 22 ---------------------- 2 files changed, 59 deletions(-) delete mode 100644 include/linux/clk/sunxi.h (limited to 'include/linux') diff --git a/drivers/clk/sunxi/clk-sunxi.c b/drivers/clk/sunxi/clk-sunxi.c index 04e0b334c678..d43c7949a07c 100644 --- a/drivers/clk/sunxi/clk-sunxi.c +++ b/drivers/clk/sunxi/clk-sunxi.c @@ -562,43 +562,6 @@ static void sun7i_a20_get_out_factors(u32 *freq, u32 parent_rate, *p = calcp; } -/** - * clk_sunxi_mmc_phase_control() - configures MMC clock phase control - */ - -void clk_sunxi_mmc_phase_control(struct clk *clk, u8 sample, u8 output) -{ - #define to_clk_composite(_hw) container_of(_hw, struct clk_composite, hw) - #define to_clk_factors(_hw) container_of(_hw, struct clk_factors, hw) - - struct clk_hw *hw = __clk_get_hw(clk); - struct clk_composite *composite = to_clk_composite(hw); - struct clk_hw *rate_hw = composite->rate_hw; - struct clk_factors *factors = to_clk_factors(rate_hw); - unsigned long flags = 0; - u32 reg; - - if (factors->lock) - spin_lock_irqsave(factors->lock, flags); - - reg = readl(factors->reg); - - /* set sample clock phase control */ - reg &= ~(0x7 << 20); - reg |= ((sample & 0x7) << 20); - - /* set output clock phase control */ - reg &= ~(0x7 << 8); - reg |= ((output & 0x7) << 8); - - writel(reg, factors->reg); - - if (factors->lock) - spin_unlock_irqrestore(factors->lock, flags); -} -EXPORT_SYMBOL(clk_sunxi_mmc_phase_control); - - /** * sunxi_factors_clk_setup() - Setup function for factor clocks */ diff --git a/include/linux/clk/sunxi.h b/include/linux/clk/sunxi.h deleted file mode 100644 index aed28c4451d9..000000000000 --- a/include/linux/clk/sunxi.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2013 - Hans de Goede - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as 
published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ - -#ifndef __LINUX_CLK_SUNXI_H_ -#define __LINUX_CLK_SUNXI_H_ - -#include - -void clk_sunxi_mmc_phase_control(struct clk *clk, u8 sample, u8 output); - -#endif -- cgit v1.2.3 From f684e4ac9f4bae4e6ecff92eef9645a44764fc04 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Mon, 12 Jan 2015 00:45:55 +0100 Subject: pinctrl: pinconf-generic: loose DT dependence New pin controllers such as ACPI-based may also have custom properties to parse, and should be able to use generic pin config. Let's make the code compile on !OF systems and rename members a bit to underscore it is custom parameters and not necessarily DT parameters. This fixes a build regression for x86_64 on the zeroday kernel builds. Reported-by: kbuild test robot Reviewed-and-tested-by: Soren Brinkmann Signed-off-by: Linus Walleij --- drivers/pinctrl/pinconf-generic.c | 39 ++++++++++++++++++-------------- drivers/pinctrl/pinctrl-zynq.c | 8 +++---- drivers/pinctrl/qcom/pinctrl-spmi-gpio.c | 8 +++---- include/linux/pinctrl/pinconf-generic.h | 2 +- include/linux/pinctrl/pinctrl.h | 17 ++++++++------ 5 files changed, 41 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/drivers/pinctrl/pinconf-generic.c b/drivers/pinctrl/pinconf-generic.c index e0886665b70a..4db92f64b4de 100644 --- a/drivers/pinctrl/pinconf-generic.c +++ b/drivers/pinctrl/pinconf-generic.c @@ -113,10 +113,11 @@ void pinconf_generic_dump_pins(struct pinctrl_dev *pctldev, struct seq_file *s, pinconf_generic_dump_one(pctldev, s, gname, pin, conf_items, ARRAY_SIZE(conf_items)); /* driver-specific parameters */ - if (pctldev->desc->num_dt_params && pctldev->desc->conf_items) + if (pctldev->desc->num_custom_params && + pctldev->desc->custom_conf_items) pinconf_generic_dump_one(pctldev, s, gname, pin, - pctldev->desc->conf_items, - pctldev->desc->num_dt_params); + pctldev->desc->custom_conf_items, + pctldev->desc->num_custom_params); } void pinconf_generic_dump_config(struct pinctrl_dev *pctldev, @@ -131,21 +132,24 @@ void pinconf_generic_dump_config(struct pinctrl_dev *pctldev, pinconf_to_config_argument(config)); } - if (!pctldev->desc->num_dt_params || !pctldev->desc->conf_items) + if (!pctldev->desc->num_custom_params || + !pctldev->desc->custom_conf_items) return; - for (i = 0; i < pctldev->desc->num_dt_params; i++) { - if (pinconf_to_config_param(config) != pctldev->desc->conf_items[i].param) + for (i = 0; i < pctldev->desc->num_custom_params; i++) { + if (pinconf_to_config_param(config) != + pctldev->desc->custom_conf_items[i].param) continue; - seq_printf(s, "%s: 0x%x", pctldev->desc->conf_items[i].display, - pinconf_to_config_argument(config)); + seq_printf(s, "%s: 0x%x", + pctldev->desc->custom_conf_items[i].display, + pinconf_to_config_argument(config)); } } EXPORT_SYMBOL_GPL(pinconf_generic_dump_config); #endif #ifdef CONFIG_OF -static const struct pinconf_generic_dt_params dt_params[] = { +static const struct pinconf_generic_params dt_params[] = { { "bias-disable", PIN_CONFIG_BIAS_DISABLE, 0 }, { "bias-high-impedance", PIN_CONFIG_BIAS_HIGH_IMPEDANCE, 0 }, { "bias-bus-hold", PIN_CONFIG_BIAS_BUS_HOLD, 0 }, @@ -170,9 +174,9 @@ static const struct pinconf_generic_dt_params 
dt_params[] = { }; /** - * parse_dt_cfg - Parse DT pinconf parameters + * parse_dt_cfg() - Parse DT pinconf parameters * @np: DT node - * @params: Array of describing DT parameters + * @params: Array of describing generic parameters * @count: Number of entries in @params * @cfg: Array of parsed config options * @ncfg: Number of entries in @cfg @@ -183,7 +187,7 @@ static const struct pinconf_generic_dt_params dt_params[] = { * needs to have enough memory allocated to hold all possible entries. */ static void parse_dt_cfg(struct device_node *np, - const struct pinconf_generic_dt_params *params, + const struct pinconf_generic_params *params, unsigned int count, unsigned long *cfg, unsigned int *ncfg) { @@ -192,7 +196,7 @@ static void parse_dt_cfg(struct device_node *np, for (i = 0; i < count; i++) { u32 val; int ret; - const struct pinconf_generic_dt_params *par = ¶ms[i]; + const struct pinconf_generic_params *par = ¶ms[i]; ret = of_property_read_u32(np, par->property, &val); @@ -232,15 +236,16 @@ int pinconf_generic_parse_dt_config(struct device_node *np, /* allocate a temporary array big enough to hold one of each option */ max_cfg = ARRAY_SIZE(dt_params); if (pctldev) - max_cfg += pctldev->desc->num_dt_params; + max_cfg += pctldev->desc->num_custom_params; cfg = kcalloc(max_cfg, sizeof(*cfg), GFP_KERNEL); if (!cfg) return -ENOMEM; parse_dt_cfg(np, dt_params, ARRAY_SIZE(dt_params), cfg, &ncfg); - if (pctldev && pctldev->desc->num_dt_params && pctldev->desc->params) - parse_dt_cfg(np, pctldev->desc->params, - pctldev->desc->num_dt_params, cfg, &ncfg); + if (pctldev && pctldev->desc->num_custom_params && + pctldev->desc->custom_params) + parse_dt_cfg(np, pctldev->desc->custom_params, + pctldev->desc->num_custom_params, cfg, &ncfg); ret = 0; diff --git a/drivers/pinctrl/pinctrl-zynq.c b/drivers/pinctrl/pinctrl-zynq.c index 62534234da78..8aa05e2eb705 100644 --- a/drivers/pinctrl/pinctrl-zynq.c +++ b/drivers/pinctrl/pinctrl-zynq.c @@ -920,7 +920,7 @@ enum zynq_pin_config_param { PIN_CONFIG_IOSTANDARD = PIN_CONFIG_END + 1, }; -static const struct pinconf_generic_dt_params zynq_dt_params[] = { +static const struct pinconf_generic_params zynq_dt_params[] = { {"io-standard", PIN_CONFIG_IOSTANDARD, zynq_iostd_lvcmos18}, }; @@ -1099,9 +1099,9 @@ static struct pinctrl_desc zynq_desc = { .pctlops = &zynq_pctrl_ops, .pmxops = &zynq_pinmux_ops, .confops = &zynq_pinconf_ops, - .num_dt_params = ARRAY_SIZE(zynq_dt_params), - .params = zynq_dt_params, - .conf_items = zynq_conf_items, + .num_custom_params = ARRAY_SIZE(zynq_dt_params), + .custom_params = zynq_dt_params, + .custom_conf_items = zynq_conf_items, .owner = THIS_MODULE, }; diff --git a/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c b/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c index 17f811c9c2c0..bbf99a715b63 100644 --- a/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c +++ b/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c @@ -131,7 +131,7 @@ struct pmic_gpio_state { struct gpio_chip chip; }; -static const struct pinconf_generic_dt_params pmic_gpio_bindings[] = { +static const struct pinconf_generic_params pmic_gpio_bindings[] = { {"qcom,pull-up-strength", PMIC_GPIO_CONF_PULL_UP, 0}, {"qcom,drive-strength", PMIC_GPIO_CONF_STRENGTH, 0}, }; @@ -742,9 +742,9 @@ static int pmic_gpio_probe(struct platform_device *pdev) pctrldesc->name = dev_name(dev); pctrldesc->pins = pindesc; pctrldesc->npins = npins; - pctrldesc->num_dt_params = ARRAY_SIZE(pmic_gpio_bindings); - pctrldesc->params = pmic_gpio_bindings; - pctrldesc->conf_items = pmic_conf_items; + pctrldesc->num_custom_params = 
ARRAY_SIZE(pmic_gpio_bindings); + pctrldesc->custom_params = pmic_gpio_bindings; + pctrldesc->custom_conf_items = pmic_conf_items; for (i = 0; i < npins; i++, pindesc++) { pad = &pads[i]; diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h index 342409f7f3ec..fe65962b264f 100644 --- a/include/linux/pinctrl/pinconf-generic.h +++ b/include/linux/pinctrl/pinconf-generic.h @@ -162,7 +162,7 @@ static inline unsigned long pinconf_to_config_packed(enum pin_config_param param struct pinctrl_dev; struct pinctrl_map; -struct pinconf_generic_dt_params { +struct pinconf_generic_params { const char * const property; enum pin_config_param param; u32 default_value; diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h index c58b3e11ba8e..66e4697516de 100644 --- a/include/linux/pinctrl/pinctrl.h +++ b/include/linux/pinctrl/pinctrl.h @@ -118,9 +118,12 @@ struct pinctrl_ops { * @confops: pin config operations vtable, if you support pin configuration in * your driver * @owner: module providing the pin controller, used for refcounting - * @num_dt_params: Number of driver-specific DT parameters - * @params: List of DT parameters - * @conf_items: Information how to print @params in debugfs + * @num_custom_params: Number of driver-specific custom parameters to be parsed + * from the hardware description + * @custom_params: List of driver_specific custom parameters to be parsed from + * the hardware description + * @custom_conf_items: Information how to print @params in debugfs, must be + * the same size as the @custom_params, i.e. @num_custom_params */ struct pinctrl_desc { const char *name; @@ -130,10 +133,10 @@ struct pinctrl_desc { const struct pinmux_ops *pmxops; const struct pinconf_ops *confops; struct module *owner; -#if defined(CONFIG_GENERIC_PINCONF) && defined(CONFIG_OF) - unsigned int num_dt_params; - const struct pinconf_generic_dt_params *params; - const struct pin_config_item *conf_items; +#ifdef CONFIG_GENERIC_PINCONF + unsigned int num_custom_params; + const struct pinconf_generic_params *custom_params; + const struct pin_config_item *custom_conf_items; #endif }; -- cgit v1.2.3 From 5a7d2efdd93f6c4bb6cd3d5df3d2f5611c9b87ac Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 16 Dec 2014 10:59:17 +0100 Subject: pinctrl: consumer: use correct retval for placeholder functions These functions are supposed to return an error pointer, not NULL. 
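The distinction matters because consumers check the returned handle with IS_ERR() rather than comparing it against NULL, so a NULL placeholder sails through the error check and is dereferenced later. A sketch of the usual caller pattern (assumed for illustration, not taken from this patch):

/* With the old NULL placeholders this test never fired on kernels
 * built without pinctrl; with this patch it yields a proper -ENOSYS. */
struct pinctrl *p = devm_pinctrl_get(&pdev->dev);
if (IS_ERR(p))
	return PTR_ERR(p);
/* ... 'p' is treated as valid from here on ... */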
Signed-off-by: Wolfram Sang Signed-off-by: Linus Walleij --- include/linux/pinctrl/consumer.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h index 18eccefea06e..72c0415d6c21 100644 --- a/include/linux/pinctrl/consumer.h +++ b/include/linux/pinctrl/consumer.h @@ -82,7 +82,7 @@ static inline int pinctrl_gpio_direction_output(unsigned gpio) static inline struct pinctrl * __must_check pinctrl_get(struct device *dev) { - return NULL; + return ERR_PTR(-ENOSYS); } static inline void pinctrl_put(struct pinctrl *p) @@ -93,7 +93,7 @@ static inline struct pinctrl_state * __must_check pinctrl_lookup_state( struct pinctrl *p, const char *name) { - return NULL; + return ERR_PTR(-ENOSYS); } static inline int pinctrl_select_state(struct pinctrl *p, @@ -104,7 +104,7 @@ static inline int pinctrl_select_state(struct pinctrl *p, static inline struct pinctrl * __must_check devm_pinctrl_get(struct device *dev) { - return NULL; + return ERR_PTR(-ENOSYS); } static inline void devm_pinctrl_put(struct pinctrl *p) -- cgit v1.2.3 From d84b6728c54dcf73bcef3e3f7cf6767e2d224e39 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 6 Jan 2015 11:45:07 -0800 Subject: locking/mcs: Better differentiate between MCS variants We have two flavors of the MCS spinlock: standard and cancelable (OSQ). While each one is independent of the other, we currently mix and match them. This patch: - Moves the OSQ code out of mcs_spinlock.h (which only deals with the traditional version) into include/linux/osq_lock.h. No unnecessary code is added to the more global header file, any lock that makes use of OSQ must include it anyway. - Renames mcs_spinlock.c to osq_lock.c. This file only contains osq code. - Introduces a CONFIG_LOCK_SPIN_ON_OWNER in order to only build osq_lock if there is support for it. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: Thomas Gleixner Cc: "Paul E. McKenney" Cc: Jason Low Cc: Linus Torvalds Cc: Mikulas Patocka Cc: Waiman Long Link: http://lkml.kernel.org/r/1420573509-24774-5-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- include/linux/osq_lock.h | 12 ++- kernel/Kconfig.locks | 4 + kernel/locking/Makefile | 3 +- kernel/locking/mcs_spinlock.c | 208 ------------------------------------------ kernel/locking/mcs_spinlock.h | 16 ---- kernel/locking/osq_lock.c | 203 +++++++++++++++++++++++++++++++++++++++++ 6 files changed, 219 insertions(+), 227 deletions(-) delete mode 100644 kernel/locking/mcs_spinlock.c create mode 100644 kernel/locking/osq_lock.c (limited to 'include/linux') diff --git a/include/linux/osq_lock.h b/include/linux/osq_lock.h index 90230d5811c5..3a6490e81b28 100644 --- a/include/linux/osq_lock.h +++ b/include/linux/osq_lock.h @@ -5,8 +5,11 @@ * An MCS like lock especially tailored for optimistic spinning for sleeping * lock implementations (mutex, rwsem, etc). */ - -#define OSQ_UNLOCKED_VAL (0) +struct optimistic_spin_node { + struct optimistic_spin_node *next, *prev; + int locked; /* 1 if lock acquired */ + int cpu; /* encoded CPU # + 1 value */ +}; struct optimistic_spin_queue { /* @@ -16,6 +19,8 @@ struct optimistic_spin_queue { atomic_t tail; }; +#define OSQ_UNLOCKED_VAL (0) + /* Init macro and function.
*/ #define OSQ_LOCK_UNLOCKED { ATOMIC_INIT(OSQ_UNLOCKED_VAL) } @@ -24,4 +29,7 @@ static inline void osq_lock_init(struct optimistic_spin_queue *lock) atomic_set(&lock->tail, OSQ_UNLOCKED_VAL); } +extern bool osq_lock(struct optimistic_spin_queue *lock); +extern void osq_unlock(struct optimistic_spin_queue *lock); + #endif diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 76768ee812b2..08561f1acd13 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -231,6 +231,10 @@ config RWSEM_SPIN_ON_OWNER def_bool y depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW +config LOCK_SPIN_ON_OWNER + def_bool y + depends on MUTEX_SPIN_ON_OWNER || RWSEM_SPIN_ON_OWNER + config ARCH_USE_QUEUE_RWLOCK bool diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 8541bfdfd232..4ca8eb151975 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -1,5 +1,5 @@ -obj-y += mutex.o semaphore.o rwsem.o mcs_spinlock.o +obj-y += mutex.o semaphore.o rwsem.o ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = -pg @@ -14,6 +14,7 @@ ifeq ($(CONFIG_PROC_FS),y) obj-$(CONFIG_LOCKDEP) += lockdep_proc.o endif obj-$(CONFIG_SMP) += spinlock.o +obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o obj-$(CONFIG_SMP) += lglock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_RT_MUTEXES) += rtmutex.o diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c deleted file mode 100644 index 9887a905a762..000000000000 --- a/kernel/locking/mcs_spinlock.c +++ /dev/null @@ -1,208 +0,0 @@ -#include -#include -#include "mcs_spinlock.h" - -#ifdef CONFIG_SMP - -/* - * An MCS like lock especially tailored for optimistic spinning for sleeping - * lock implementations (mutex, rwsem, etc). - * - * Using a single mcs node per CPU is safe because sleeping locks should not be - * called from interrupt context and we have preemption disabled while - * spinning. - */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node); - -/* - * We use the value 0 to represent "no CPU", thus the encoded value - * will be the CPU number incremented by 1. - */ -static inline int encode_cpu(int cpu_nr) -{ - return cpu_nr + 1; -} - -static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val) -{ - int cpu_nr = encoded_cpu_val - 1; - - return per_cpu_ptr(&osq_node, cpu_nr); -} - -/* - * Get a stable @node->next pointer, either for unlock() or unqueue() purposes. - * Can return NULL in case we were the last queued and we updated @lock instead. - */ -static inline struct optimistic_spin_node * -osq_wait_next(struct optimistic_spin_queue *lock, - struct optimistic_spin_node *node, - struct optimistic_spin_node *prev) -{ - struct optimistic_spin_node *next = NULL; - int curr = encode_cpu(smp_processor_id()); - int old; - - /* - * If there is a prev node in queue, then the 'old' value will be - * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if - * we're currently last in queue, then the queue will then become empty. - */ - old = prev ? prev->cpu : OSQ_UNLOCKED_VAL; - - for (;;) { - if (atomic_read(&lock->tail) == curr && - atomic_cmpxchg(&lock->tail, curr, old) == curr) { - /* - * We were the last queued, we moved @lock back. @prev - * will now observe @lock and will complete its - * unlock()/unqueue(). - */ - break; - } - - /* - * We must xchg() the @node->next value, because if we were to - * leave it in, a concurrent unlock()/unqueue() from - * @node->next might complete Step-A and think its @prev is - * still valid. 
- * - * If the concurrent unlock()/unqueue() wins the race, we'll - * wait for either @lock to point to us, through its Step-B, or - * wait for a new @node->next from its Step-C. - */ - if (node->next) { - next = xchg(&node->next, NULL); - if (next) - break; - } - - cpu_relax_lowlatency(); - } - - return next; -} - -bool osq_lock(struct optimistic_spin_queue *lock) -{ - struct optimistic_spin_node *node = this_cpu_ptr(&osq_node); - struct optimistic_spin_node *prev, *next; - int curr = encode_cpu(smp_processor_id()); - int old; - - node->locked = 0; - node->next = NULL; - node->cpu = curr; - - old = atomic_xchg(&lock->tail, curr); - if (old == OSQ_UNLOCKED_VAL) - return true; - - prev = decode_cpu(old); - node->prev = prev; - ACCESS_ONCE(prev->next) = node; - - /* - * Normally @prev is untouchable after the above store; because at that - * moment unlock can proceed and wipe the node element from stack. - * - * However, since our nodes are static per-cpu storage, we're - * guaranteed their existence -- this allows us to apply - * cmpxchg in an attempt to undo our queueing. - */ - - while (!smp_load_acquire(&node->locked)) { - /* - * If we need to reschedule bail... so we can block. - */ - if (need_resched()) - goto unqueue; - - cpu_relax_lowlatency(); - } - return true; - -unqueue: - /* - * Step - A -- stabilize @prev - * - * Undo our @prev->next assignment; this will make @prev's - * unlock()/unqueue() wait for a next pointer since @lock points to us - * (or later). - */ - - for (;;) { - if (prev->next == node && - cmpxchg(&prev->next, node, NULL) == node) - break; - - /* - * We can only fail the cmpxchg() racing against an unlock(), - * in which case we should observe @node->locked becomming - * true. - */ - if (smp_load_acquire(&node->locked)) - return true; - - cpu_relax_lowlatency(); - - /* - * Or we race against a concurrent unqueue()'s step-B, in which - * case its step-C will write us a new @node->prev pointer. - */ - prev = ACCESS_ONCE(node->prev); - } - - /* - * Step - B -- stabilize @next - * - * Similar to unlock(), wait for @node->next or move @lock from @node - * back to @prev. - */ - - next = osq_wait_next(lock, node, prev); - if (!next) - return false; - - /* - * Step - C -- unlink - * - * @prev is stable because its still waiting for a new @prev->next - * pointer, @next is stable because our @node->next pointer is NULL and - * it will wait in Step-A. - */ - - ACCESS_ONCE(next->prev) = prev; - ACCESS_ONCE(prev->next) = next; - - return false; -} - -void osq_unlock(struct optimistic_spin_queue *lock) -{ - struct optimistic_spin_node *node, *next; - int curr = encode_cpu(smp_processor_id()); - - /* - * Fast path for the uncontended case. - */ - if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr)) - return; - - /* - * Second most likely case. - */ - node = this_cpu_ptr(&osq_node); - next = xchg(&node->next, NULL); - if (next) { - ACCESS_ONCE(next->locked) = 1; - return; - } - - next = osq_wait_next(lock, node, NULL); - if (next) - ACCESS_ONCE(next->locked) = 1; -} - -#endif - diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index 4d60986fcbee..d1fe2ba5bac9 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -108,20 +108,4 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) arch_mcs_spin_unlock_contended(&next->locked); } -/* - * Cancellable version of the MCS lock above. - * - * Intended for adaptive spinning of sleeping locks: - * mutex_lock()/rwsem_down_{read,write}() etc. 
- */ - -struct optimistic_spin_node { - struct optimistic_spin_node *next, *prev; - int locked; /* 1 if lock acquired */ - int cpu; /* encoded CPU # value */ -}; - -extern bool osq_lock(struct optimistic_spin_queue *lock); -extern void osq_unlock(struct optimistic_spin_queue *lock); - #endif /* __LINUX_MCS_SPINLOCK_H */ diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c new file mode 100644 index 000000000000..ec83d4db8ec6 --- /dev/null +++ b/kernel/locking/osq_lock.c @@ -0,0 +1,203 @@ +#include +#include +#include + +/* + * An MCS like lock especially tailored for optimistic spinning for sleeping + * lock implementations (mutex, rwsem, etc). + * + * Using a single mcs node per CPU is safe because sleeping locks should not be + * called from interrupt context and we have preemption disabled while + * spinning. + */ +static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node); + +/* + * We use the value 0 to represent "no CPU", thus the encoded value + * will be the CPU number incremented by 1. + */ +static inline int encode_cpu(int cpu_nr) +{ + return cpu_nr + 1; +} + +static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val) +{ + int cpu_nr = encoded_cpu_val - 1; + + return per_cpu_ptr(&osq_node, cpu_nr); +} + +/* + * Get a stable @node->next pointer, either for unlock() or unqueue() purposes. + * Can return NULL in case we were the last queued and we updated @lock instead. + */ +static inline struct optimistic_spin_node * +osq_wait_next(struct optimistic_spin_queue *lock, + struct optimistic_spin_node *node, + struct optimistic_spin_node *prev) +{ + struct optimistic_spin_node *next = NULL; + int curr = encode_cpu(smp_processor_id()); + int old; + + /* + * If there is a prev node in queue, then the 'old' value will be + * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if + * we're currently last in queue, then the queue will then become empty. + */ + old = prev ? prev->cpu : OSQ_UNLOCKED_VAL; + + for (;;) { + if (atomic_read(&lock->tail) == curr && + atomic_cmpxchg(&lock->tail, curr, old) == curr) { + /* + * We were the last queued, we moved @lock back. @prev + * will now observe @lock and will complete its + * unlock()/unqueue(). + */ + break; + } + + /* + * We must xchg() the @node->next value, because if we were to + * leave it in, a concurrent unlock()/unqueue() from + * @node->next might complete Step-A and think its @prev is + * still valid. + * + * If the concurrent unlock()/unqueue() wins the race, we'll + * wait for either @lock to point to us, through its Step-B, or + * wait for a new @node->next from its Step-C. + */ + if (node->next) { + next = xchg(&node->next, NULL); + if (next) + break; + } + + cpu_relax_lowlatency(); + } + + return next; +} + +bool osq_lock(struct optimistic_spin_queue *lock) +{ + struct optimistic_spin_node *node = this_cpu_ptr(&osq_node); + struct optimistic_spin_node *prev, *next; + int curr = encode_cpu(smp_processor_id()); + int old; + + node->locked = 0; + node->next = NULL; + node->cpu = curr; + + old = atomic_xchg(&lock->tail, curr); + if (old == OSQ_UNLOCKED_VAL) + return true; + + prev = decode_cpu(old); + node->prev = prev; + ACCESS_ONCE(prev->next) = node; + + /* + * Normally @prev is untouchable after the above store; because at that + * moment unlock can proceed and wipe the node element from stack. + * + * However, since our nodes are static per-cpu storage, we're + * guaranteed their existence -- this allows us to apply + * cmpxchg in an attempt to undo our queueing. 
+ */ + + while (!smp_load_acquire(&node->locked)) { + /* + * If we need to reschedule bail... so we can block. + */ + if (need_resched()) + goto unqueue; + + cpu_relax_lowlatency(); + } + return true; + +unqueue: + /* + * Step - A -- stabilize @prev + * + * Undo our @prev->next assignment; this will make @prev's + * unlock()/unqueue() wait for a next pointer since @lock points to us + * (or later). + */ + + for (;;) { + if (prev->next == node && + cmpxchg(&prev->next, node, NULL) == node) + break; + + /* + * We can only fail the cmpxchg() racing against an unlock(), + * in which case we should observe @node->locked becomming + * true. + */ + if (smp_load_acquire(&node->locked)) + return true; + + cpu_relax_lowlatency(); + + /* + * Or we race against a concurrent unqueue()'s step-B, in which + * case its step-C will write us a new @node->prev pointer. + */ + prev = ACCESS_ONCE(node->prev); + } + + /* + * Step - B -- stabilize @next + * + * Similar to unlock(), wait for @node->next or move @lock from @node + * back to @prev. + */ + + next = osq_wait_next(lock, node, prev); + if (!next) + return false; + + /* + * Step - C -- unlink + * + * @prev is stable because its still waiting for a new @prev->next + * pointer, @next is stable because our @node->next pointer is NULL and + * it will wait in Step-A. + */ + + ACCESS_ONCE(next->prev) = prev; + ACCESS_ONCE(prev->next) = next; + + return false; +} + +void osq_unlock(struct optimistic_spin_queue *lock) +{ + struct optimistic_spin_node *node, *next; + int curr = encode_cpu(smp_processor_id()); + + /* + * Fast path for the uncontended case. + */ + if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr)) + return; + + /* + * Second most likely case. + */ + node = this_cpu_ptr(&osq_node); + next = xchg(&node->next, NULL); + if (next) { + ACCESS_ONCE(next->locked) = 1; + return; + } + + next = osq_wait_next(lock, node, NULL); + if (next) + ACCESS_ONCE(next->locked) = 1; +} -- cgit v1.2.3 From 86038c5ea81b519a8a1fcfcd5e4599aab0cdd119 Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Tue, 16 Dec 2014 12:47:34 +0100 Subject: perf: Avoid horrible stack usage Both Linus (most recent) and Steve (a while ago) reported that perf related callbacks have massive stack bloat. The problem is that software events need a pt_regs in order to properly report the event location and unwind stack. And because we could not assume one was present we allocated one on stack and filled it with minimal bits required for operation. Now, pt_regs is quite large, so this is undesirable. Furthermore it turns out that most sites actually have a pt_regs pointer available, making this even more onerous, as the stack space is pointless waste. This patch addresses the problem by observing that software events have well defined nesting semantics, therefore we can use static per-cpu storage instead of on-stack. Linus made the further observation that all but the scheduler callers of perf_sw_event() have a pt_regs available, so we change the regular perf_sw_event() to require a valid pt_regs (where it used to be optional) and add perf_sw_event_sched() for the scheduler. We have a scheduler specific call instead of a more generic _noregs() like construct because we can assume non-recursion from the scheduler and thereby simplify the code further (_noregs would have to put the recursion context call inline in order to ascertain which __perf_regs element to use).
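The nesting argument can be made concrete with a short sketch (a simplification of the code below; the four slots are assumed to correspond to the task, softirq, hardirq and NMI recursion levels):

/* One static pt_regs per recursion level per CPU replaces the on-stack
 * copy; the recursion context selects the slot, so nested software
 * events never clobber each other's registers. */
DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);

static void sketch_fill_regs(int rctx)	/* rctx: current recursion level */
{
	struct pt_regs *regs = this_cpu_ptr(&__perf_regs[rctx]);

	perf_fetch_caller_regs(regs);	/* minimal state for this event */
}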
One last note on the implementation of perf_trace_buf_prepare(); we allow .regs = NULL for those cases where we already have a pt_regs pointer available and do not need another. Reported-by: Linus Torvalds Reported-by: Steven Rostedt Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Javi Merino Cc: Linus Torvalds Cc: Mathieu Desnoyers Cc: Oleg Nesterov Cc: Paul Mackerras Cc: Petr Mladek Cc: Steven Rostedt Cc: Tom Zanussi Cc: Vaibhav Nagarnaik Link: http://lkml.kernel.org/r/20141216115041.GW3337@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 2 +- include/linux/perf_event.h | 28 +++++++++++++++++++++------- include/trace/ftrace.h | 7 ++++--- kernel/events/core.c | 23 +++++++++++++++++------ kernel/sched/core.c | 2 +- kernel/trace/trace_event_perf.c | 4 +++- kernel/trace/trace_kprobe.c | 4 ++-- kernel/trace/trace_syscalls.c | 4 ++-- kernel/trace/trace_uprobe.c | 2 +- 9 files changed, 52 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 0bebb5c348b8..d36f68b08acc 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -595,7 +595,7 @@ extern int ftrace_profile_set_filter(struct perf_event *event, int event_id, char *filter_str); extern void ftrace_profile_free_filter(struct perf_event *event); extern void *perf_trace_buf_prepare(int size, unsigned short type, - struct pt_regs *regs, int *rctxp); + struct pt_regs **regs, int *rctxp); static inline void perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr, diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 4f7a61ca4b39..3a7bd80b4db8 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -665,6 +665,7 @@ static inline int is_software_event(struct perf_event *event) extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; +extern void ___perf_sw_event(u32, u64, struct pt_regs *, u64); extern void __perf_sw_event(u32, u64, struct pt_regs *, u64); #ifndef perf_arch_fetch_caller_regs @@ -689,14 +690,25 @@ static inline void perf_fetch_caller_regs(struct pt_regs *regs) static __always_inline void perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { - struct pt_regs hot_regs; + if (static_key_false(&perf_swevent_enabled[event_id])) + __perf_sw_event(event_id, nr, regs, addr); +} + +DECLARE_PER_CPU(struct pt_regs, __perf_regs[4]); +/* + * 'Special' version for the scheduler, it hard assumes no recursion, + * which is guaranteed by us not actually scheduling inside other swevents + * because those disable preemption. 
+ */ +static __always_inline void +perf_sw_event_sched(u32 event_id, u64 nr, u64 addr) +{ if (static_key_false(&perf_swevent_enabled[event_id])) { - if (!regs) { - perf_fetch_caller_regs(&hot_regs); - regs = &hot_regs; - } - __perf_sw_event(event_id, nr, regs, addr); + struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]); + + perf_fetch_caller_regs(regs); + ___perf_sw_event(event_id, nr, regs, addr); } } @@ -712,7 +724,7 @@ static inline void perf_event_task_sched_in(struct task_struct *prev, static inline void perf_event_task_sched_out(struct task_struct *prev, struct task_struct *next) { - perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, NULL, 0); + perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0); if (static_key_false(&perf_sched_events.key)) __perf_event_task_sched_out(prev, next); @@ -823,6 +835,8 @@ static inline int perf_event_refresh(struct perf_event *event, int refresh) static inline void perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { } static inline void +perf_sw_event_sched(u32 event_id, u64 nr, u64 addr) { } +static inline void perf_bp_event(struct perf_event *event, void *data) { } static inline int perf_register_guest_info_callbacks diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 139b5067345b..27609dfcce25 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -763,7 +763,7 @@ perf_trace_##call(void *__data, proto) \ struct ftrace_event_call *event_call = __data; \ struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\ struct ftrace_raw_##call *entry; \ - struct pt_regs __regs; \ + struct pt_regs *__regs; \ u64 __addr = 0, __count = 1; \ struct task_struct *__task = NULL; \ struct hlist_head *head; \ @@ -782,18 +782,19 @@ perf_trace_##call(void *__data, proto) \ sizeof(u64)); \ __entry_size -= sizeof(u32); \ \ - perf_fetch_caller_regs(&__regs); \ entry = perf_trace_buf_prepare(__entry_size, \ event_call->event.type, &__regs, &rctx); \ if (!entry) \ return; \ \ + perf_fetch_caller_regs(__regs); \ + \ tstruct \ \ { assign; } \ \ perf_trace_buf_submit(entry, __entry_size, rctx, __addr, \ - __count, &__regs, head, __task); \ + __count, __regs, head, __task); \ } /* diff --git a/kernel/events/core.c b/kernel/events/core.c index 882f835a0d85..c10124b772c4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5889,6 +5889,8 @@ end: rcu_read_unlock(); } +DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]); + int perf_swevent_get_recursion_context(void) { struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); @@ -5904,21 +5906,30 @@ inline void perf_swevent_put_recursion_context(int rctx) put_recursion_context(swhash->recursion, rctx); } -void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) +void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { struct perf_sample_data data; - int rctx; - preempt_disable_notrace(); - rctx = perf_swevent_get_recursion_context(); - if (rctx < 0) + if (WARN_ON_ONCE(!regs)) return; perf_sample_data_init(&data, addr, 0); - do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); +} + +void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) +{ + int rctx; + + preempt_disable_notrace(); + rctx = perf_swevent_get_recursion_context(); + if (unlikely(rctx < 0)) + goto fail; + + ___perf_sw_event(event_id, nr, regs, addr); perf_swevent_put_recursion_context(rctx); +fail: preempt_enable_notrace(); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c0accc00566e..d22fb16a7153 100644 --- 
a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1082,7 +1082,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (p->sched_class->migrate_task_rq) p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; - perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); + perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); } __set_task_cpu(p, new_cpu); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 4b9c114ee9de..6fa484de2ba1 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -261,7 +261,7 @@ void perf_trace_del(struct perf_event *p_event, int flags) } void *perf_trace_buf_prepare(int size, unsigned short type, - struct pt_regs *regs, int *rctxp) + struct pt_regs **regs, int *rctxp) { struct trace_entry *entry; unsigned long flags; @@ -280,6 +280,8 @@ void *perf_trace_buf_prepare(int size, unsigned short type, if (*rctxp < 0) return NULL; + if (regs) + *regs = this_cpu_ptr(&__perf_regs[*rctxp]); raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]); /* zero the dead bytes from align to not leak stack to user */ diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5edb518be345..296079ae6583 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1148,7 +1148,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); + entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx); if (!entry) return; @@ -1179,7 +1179,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); + entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx); if (!entry) return; diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index c6ee36fcbf90..f97f6e3a676c 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -574,7 +574,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) size -= sizeof(u32); rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, - sys_data->enter_event->event.type, regs, &rctx); + sys_data->enter_event->event.type, NULL, &rctx); if (!rec) return; @@ -647,7 +647,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) size -= sizeof(u32); rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, - sys_data->exit_event->event.type, regs, &rctx); + sys_data->exit_event->event.type, NULL, &rctx); if (!rec) return; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 8520acc34b18..b11441321e7a 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1111,7 +1111,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, if (hlist_empty(head)) goto out; - entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); + entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx); if (!entry) goto out; -- cgit v1.2.3 From a2b12f3c7ac1ea43ae646db74faf0b56c2bba563 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 12 Jan 2015 17:00:37 -0800 Subject: udp: pass udp_offload struct to UDP gro callbacks This patch introduces udp_offload_callbacks which has the same GRO functions (but not a GSO function) as offload_callbacks, except there is an argument to a 
udp_offload struct passed to gro_receive and gro_complete functions. This additional argument can be used to retrieve the per port structure of the encapsulation for use in gro processing (mostly by doing container_of on the structure). Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 7 +++++-- include/linux/netdevice.h | 15 +++++++++++++-- net/ipv4/fou.c | 12 ++++++++---- net/ipv4/geneve.c | 6 ++++-- net/ipv4/udp_offload.c | 7 +++++-- 5 files changed, 35 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 985359dd6033..5c56a3ff25aa 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -539,7 +539,9 @@ static int vxlan_fdb_append(struct vxlan_fdb *f, return 1; } -static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, struct sk_buff *skb) +static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, + struct sk_buff *skb, + struct udp_offload *uoff) { struct sk_buff *p, **pp = NULL; struct vxlanhdr *vh, *vh2; @@ -578,7 +580,8 @@ out: return pp; } -static int vxlan_gro_complete(struct sk_buff *skb, int nhoff) +static int vxlan_gro_complete(struct sk_buff *skb, int nhoff, + struct udp_offload *uoff) { udp_tunnel_gro_complete(skb, nhoff); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 679e6e90aa4c..47921c291dd6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1969,7 +1969,7 @@ struct offload_callbacks { struct sk_buff *(*gso_segment)(struct sk_buff *skb, netdev_features_t features); struct sk_buff **(*gro_receive)(struct sk_buff **head, - struct sk_buff *skb); + struct sk_buff *skb); int (*gro_complete)(struct sk_buff *skb, int nhoff); }; @@ -1979,10 +1979,21 @@ struct packet_offload { struct list_head list; }; +struct udp_offload; + +struct udp_offload_callbacks { + struct sk_buff **(*gro_receive)(struct sk_buff **head, + struct sk_buff *skb, + struct udp_offload *uoff); + int (*gro_complete)(struct sk_buff *skb, + int nhoff, + struct udp_offload *uoff); +}; + struct udp_offload { __be16 port; u8 ipproto; - struct offload_callbacks callbacks; + struct udp_offload_callbacks callbacks; }; /* often modified stats are per cpu, other are shared (netdev->stats) */ diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 2197c36f722f..3bc0cf07661c 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -174,7 +174,8 @@ drop: } static struct sk_buff **fou_gro_receive(struct sk_buff **head, - struct sk_buff *skb) + struct sk_buff *skb, + struct udp_offload *uoff) { const struct net_offload *ops; struct sk_buff **pp = NULL; @@ -195,7 +196,8 @@ out_unlock: return pp; } -static int fou_gro_complete(struct sk_buff *skb, int nhoff) +static int fou_gro_complete(struct sk_buff *skb, int nhoff, + struct udp_offload *uoff) { const struct net_offload *ops; u8 proto = NAPI_GRO_CB(skb)->proto; @@ -254,7 +256,8 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off, } static struct sk_buff **gue_gro_receive(struct sk_buff **head, - struct sk_buff *skb) + struct sk_buff *skb, + struct udp_offload *uoff) { const struct net_offload **offloads; const struct net_offload *ops; @@ -360,7 +363,8 @@ out: return pp; } -static int gue_gro_complete(struct sk_buff *skb, int nhoff) +static int gue_gro_complete(struct sk_buff *skb, int nhoff, + struct udp_offload *uoff) { const struct net_offload **offloads; struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff); diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c index 
23744c7a9718..9568594ca2f1 100644 --- a/net/ipv4/geneve.c +++ b/net/ipv4/geneve.c @@ -147,7 +147,8 @@ static int geneve_hlen(struct genevehdr *gh) } static struct sk_buff **geneve_gro_receive(struct sk_buff **head, - struct sk_buff *skb) + struct sk_buff *skb, + struct udp_offload *uoff) { struct sk_buff *p, **pp = NULL; struct genevehdr *gh, *gh2; @@ -211,7 +212,8 @@ out: return pp; } -static int geneve_gro_complete(struct sk_buff *skb, int nhoff) +static int geneve_gro_complete(struct sk_buff *skb, int nhoff, + struct udp_offload *uoff) { struct genevehdr *gh; struct packet_offload *ptype; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index d3e537ef6b7f..d10f6f4ead27 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -339,7 +339,8 @@ unflush: skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */ skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto; - pp = uo_priv->offload->callbacks.gro_receive(head, skb); + pp = uo_priv->offload->callbacks.gro_receive(head, skb, + uo_priv->offload); out_unlock: rcu_read_unlock(); @@ -395,7 +396,9 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff) if (uo_priv != NULL) { NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto; - err = uo_priv->offload->callbacks.gro_complete(skb, nhoff + sizeof(struct udphdr)); + err = uo_priv->offload->callbacks.gro_complete(skb, + nhoff + sizeof(struct udphdr), + uo_priv->offload); } rcu_read_unlock(); -- cgit v1.2.3 From c38fda3fe8163898f538a45d3c1419e6870625ed Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 14 Jan 2015 22:59:13 -0800 Subject: jbd: drop jbd_ENOSYS debug A quick search shows that there are no users, drop the macro for both jbd and jbd2. Signed-off-by: Davidlohr Bueso Cc: Jan Kara Signed-off-by: Jan Kara --- include/linux/jbd.h | 9 --------- include/linux/jbd2.h | 9 --------- 2 files changed, 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jbd.h b/include/linux/jbd.h index 31229e0be90b..d32615280be9 100644 --- a/include/linux/jbd.h +++ b/include/linux/jbd.h @@ -956,15 +956,6 @@ void __log_wait_for_space(journal_t *journal); extern void __journal_drop_transaction(journal_t *, transaction_t *); extern int cleanup_journal_tail(journal_t *); -/* Debugging code only: */ - -#define jbd_ENOSYS() \ -do { \ - printk (KERN_ERR "JBD unimplemented function %s\n", __func__); \ - current->state = TASK_UNINTERRUPTIBLE; \ - schedule(); \ -} while (1) - /* * is_journal_abort * diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 704b9a599b26..20e7f78041c8 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1251,15 +1251,6 @@ void __jbd2_log_wait_for_space(journal_t *journal); extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *); extern int jbd2_cleanup_journal_tail(journal_t *); -/* Debugging code only: */ - -#define jbd_ENOSYS() \ -do { \ - printk (KERN_ERR "JBD unimplemented function %s\n", __func__); \ - current->state = TASK_UNINTERRUPTIBLE; \ - schedule(); \ -} while (1) - /* * is_journal_abort * -- cgit v1.2.3 From 37715f556a0776356300391f8ac41ace91bea447 Mon Sep 17 00:00:00 2001 From: David Ung Date: Tue, 13 Jan 2015 19:04:25 -0800 Subject: video: fbdev: Add additional vesa modes Add high resolution modes to vesa_modes struct. 
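For reference, the pixclock field in each vesa_modes[] entry below is a pixel-clock period in picoseconds, not a frequency; the PICOS2KHZ() macro from linux/fb.h converts it, so the new 1920x1200-60 RB entry (pixclock = 6493) works out to roughly 154000 kHz (~154 MHz). A minimal sketch of how a driver might look up one of the new modes (hypothetical helper, not part of this patch):

#include <linux/fb.h>

/* Illustrative only: scan the vesa_modes[] table (VESA_MODEDB_SIZE is
 * bumped to 43 by this patch) for a given geometry and refresh rate. */
static const struct fb_videomode *find_vesa_mode(u32 xres, u32 yres,
						 u32 refresh)
{
	int i;

	for (i = 0; i < VESA_MODEDB_SIZE; i++) {
		if (vesa_modes[i].xres == xres &&
		    vesa_modes[i].yres == yres &&
		    vesa_modes[i].refresh == refresh)
			return &vesa_modes[i];
	}
	return NULL;
}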
Signed-off-by: David Ung Signed-off-by: Tomi Valkeinen --- drivers/video/fbdev/core/modedb.c | 27 +++++++++++++++++++++++++++ include/linux/fb.h | 2 +- 2 files changed, 28 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/video/fbdev/core/modedb.c b/drivers/video/fbdev/core/modedb.c index 388f7971494b..0b57c1df73e3 100644 --- a/drivers/video/fbdev/core/modedb.c +++ b/drivers/video/fbdev/core/modedb.c @@ -468,6 +468,33 @@ const struct fb_videomode vesa_modes[] = { /* 33 1920x1440-75 VESA */ { NULL, 75, 1920, 1440, 3367, 352, 144, 56, 1, 224, 3, FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, + /* 34 1920x1200-60 RB VESA */ + { NULL, 60, 1920, 1200, 6493, 80, 48, 26, 3, 32, 6, + FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, + /* 35 1920x1200-60 VESA */ + { NULL, 60, 1920, 1200, 5174, 336, 136, 36, 3, 200, 6, + FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, + /* 36 1920x1200-75 VESA */ + { NULL, 75, 1920, 1200, 4077, 344, 136, 46, 3, 208, 6, + FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, + /* 37 1920x1200-85 VESA */ + { NULL, 85, 1920, 1200, 3555, 352, 144, 53, 3, 208, 6, + FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, + /* 38 2560x1600-60 RB VESA */ + { NULL, 60, 2560, 1600, 3724, 80, 48, 37, 3, 32, 6, + FB_SYNC_HOR_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, + /* 39 2560x1600-60 VESA */ + { NULL, 60, 2560, 1600, 2869, 472, 192, 49, 3, 280, 6, + FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, + /* 40 2560x1600-75 VESA */ + { NULL, 75, 2560, 1600, 2256, 488, 208, 63, 3, 280, 6, + FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, + /* 41 2560x1600-85 VESA */ + { NULL, 85, 2560, 1600, 1979, 488, 208, 73, 3, 280, 6, + FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, + /* 42 2560x1600-120 RB VESA */ + { NULL, 120, 2560, 1600, 1809, 80, 48, 85, 3, 32, 6, + FB_SYNC_HOR_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, }; EXPORT_SYMBOL(vesa_modes); #endif /* CONFIG_FB_MODE_HELPERS */ diff --git a/include/linux/fb.h b/include/linux/fb.h index 09bb7a18d287..882dbd1a87b7 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -726,7 +726,7 @@ extern int fb_videomode_from_videomode(const struct videomode *vm, struct fb_videomode *fbmode); /* drivers/video/modedb.c */ -#define VESA_MODEDB_SIZE 34 +#define VESA_MODEDB_SIZE 43 extern void fb_var_to_videomode(struct fb_videomode *mode, const struct fb_var_screeninfo *var); extern void fb_videomode_to_var(struct fb_var_screeninfo *var, -- cgit v1.2.3 From 8f5ee77bb8d162abe28ff8cd56f36e825d143207 Mon Sep 17 00:00:00 2001 From: David Ung Date: Tue, 13 Jan 2015 19:04:26 -0800 Subject: video: fbdev: Check Standard Timing against DMT Add the VESA Display Monitor Timing (DMT) table. During parsing of Standard Timings, it compares the 2-byte STD code against the DMT table to see what the VESA mode should be. If there is no entry in the vesa_modes table or no match is found, it falls back to the GTF timings.
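To make the 2-byte STD code concrete: a standard timing descriptor packs the horizontal resolution into its first byte and the aspect ratio plus refresh rate into its second. For example, the code 0xd100 (block[0] = 0xd1, block[1] = 0x00) decodes to (0xd1 + 31) * 8 = 1920 horizontal pixels, a 16:10 ratio giving 1200 lines, and 60 Hz, matching DMT ID 0x45 in the new table. A stand-alone sketch of that decode (illustrative only; the in-tree get_std_timing() in the hunk below consults dmt_modes[] first and uses this computation as the fallback):

/* Illustrative sketch of EDID 1.3 standard-timing decoding. */
static void decode_std_timing(const unsigned char block[2],
			      unsigned int *xres, unsigned int *yres,
			      unsigned int *refresh)
{
	*xres = (block[0] + 31) * 8;
	*refresh = (block[1] & 0x3f) + 60;

	switch ((block[1] & 0xc0) >> 6) {
	case 0:	/* 16:10 in EDID 1.3+, 1:1 in older EDIDs */
		*yres = (*xres * 10) / 16;
		break;
	case 1:	/* 4:3 */
		*yres = (*xres * 3) / 4;
		break;
	case 2:	/* 5:4 */
		*yres = (*xres * 4) / 5;
		break;
	case 3:	/* 16:9 */
		*yres = (*xres * 9) / 16;
		break;
	}
}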
Signed-off-by: David Ung Signed-off-by: Tomi Valkeinen --- drivers/video/fbdev/core/fbmon.c | 76 +++++++++++++++++++---------------- drivers/video/fbdev/core/modedb.c | 84 +++++++++++++++++++++++++++++++++++++++ include/linux/fb.h | 10 +++++ 3 files changed, 136 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/fbdev/core/fbmon.c b/drivers/video/fbdev/core/fbmon.c index 5b0e313849bd..0f234c15ece3 100644 --- a/drivers/video/fbdev/core/fbmon.c +++ b/drivers/video/fbdev/core/fbmon.c @@ -498,44 +498,52 @@ static int get_est_timing(unsigned char *block, struct fb_videomode *mode) static int get_std_timing(unsigned char *block, struct fb_videomode *mode, int ver, int rev) { - int xres, yres = 0, refresh, ratio, i; - - xres = (block[0] + 31) * 8; - if (xres <= 256) - return 0; + int i; - ratio = (block[1] & 0xc0) >> 6; - switch (ratio) { - case 0: - /* in EDID 1.3 the meaning of 0 changed to 16:10 (prior 1:1) */ - if (ver < 1 || (ver == 1 && rev < 3)) - yres = xres; - else - yres = (xres * 10)/16; - break; - case 1: - yres = (xres * 3)/4; - break; - case 2: - yres = (xres * 4)/5; - break; - case 3: - yres = (xres * 9)/16; - break; + for (i = 0; i < DMT_SIZE; i++) { + u32 std_2byte_code = block[0] << 8 | block[1]; + if (std_2byte_code == dmt_modes[i].std_2byte_code) + break; } - refresh = (block[1] & 0x3f) + 60; - - DPRINTK(" %dx%d@%dHz\n", xres, yres, refresh); - for (i = 0; i < VESA_MODEDB_SIZE; i++) { - if (vesa_modes[i].xres == xres && - vesa_modes[i].yres == yres && - vesa_modes[i].refresh == refresh) { - *mode = vesa_modes[i]; - mode->flag |= FB_MODE_IS_STANDARD; - return 1; + + if (i < DMT_SIZE && dmt_modes[i].mode) { + /* DMT mode found */ + *mode = *dmt_modes[i].mode; + mode->flag |= FB_MODE_IS_STANDARD; + DPRINTK(" DMT id=%d\n", dmt_modes[i].dmt_id); + + } else { + int xres, yres = 0, refresh, ratio; + + xres = (block[0] + 31) * 8; + if (xres <= 256) + return 0; + + ratio = (block[1] & 0xc0) >> 6; + switch (ratio) { + case 0: + /* in EDID 1.3 the meaning of 0 changed to 16:10 (prior 1:1) */ + if (ver < 1 || (ver == 1 && rev < 3)) + yres = xres; + else + yres = (xres * 10)/16; + break; + case 1: + yres = (xres * 3)/4; + break; + case 2: + yres = (xres * 4)/5; + break; + case 3: + yres = (xres * 9)/16; + break; } + refresh = (block[1] & 0x3f) + 60; + DPRINTK(" %dx%d@%dHz\n", xres, yres, refresh); + + calc_mode_timings(xres, yres, refresh, mode); } - calc_mode_timings(xres, yres, refresh, mode); + return 1; } diff --git a/drivers/video/fbdev/core/modedb.c b/drivers/video/fbdev/core/modedb.c index 0b57c1df73e3..858a97e03200 100644 --- a/drivers/video/fbdev/core/modedb.c +++ b/drivers/video/fbdev/core/modedb.c @@ -497,6 +497,90 @@ const struct fb_videomode vesa_modes[] = { FB_SYNC_HOR_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, }; EXPORT_SYMBOL(vesa_modes); + +const struct dmt_videomode dmt_modes[DMT_SIZE] = { + { 0x01, 0x0000, 0x000000, &vesa_modes[0] }, + { 0x02, 0x3119, 0x000000, &vesa_modes[1] }, + { 0x03, 0x0000, 0x000000, &vesa_modes[2] }, + { 0x04, 0x3140, 0x000000, &vesa_modes[3] }, + { 0x05, 0x314c, 0x000000, &vesa_modes[4] }, + { 0x06, 0x314f, 0x000000, &vesa_modes[5] }, + { 0x07, 0x3159, 0x000000, &vesa_modes[6] }, + { 0x08, 0x0000, 0x000000, &vesa_modes[7] }, + { 0x09, 0x4540, 0x000000, &vesa_modes[8] }, + { 0x0a, 0x454c, 0x000000, &vesa_modes[9] }, + { 0x0b, 0x454f, 0x000000, &vesa_modes[10] }, + { 0x0c, 0x4559, 0x000000, &vesa_modes[11] }, + { 0x0d, 0x0000, 0x000000, 0 }, + { 0x0e, 0x0000, 0x000000, 0 }, + { 0x0f, 0x0000, 0x000000, 
&vesa_modes[12] }, + { 0x10, 0x6140, 0x000000, &vesa_modes[13] }, + { 0x11, 0x614a, 0x000000, &vesa_modes[14] }, + { 0x12, 0x614f, 0x000000, &vesa_modes[15] }, + { 0x13, 0x6159, 0x000000, &vesa_modes[16] }, + { 0x14, 0x0000, 0x000000, 0 }, + { 0x15, 0x714f, 0x000000, &vesa_modes[17] }, + { 0x16, 0x0000, 0x7f1c21, 0 }, + { 0x17, 0x0000, 0x7f1c28, 0 }, + { 0x18, 0x0000, 0x7f1c44, 0 }, + { 0x19, 0x0000, 0x7f1c62, 0 }, + { 0x1a, 0x0000, 0x000000, 0 }, + { 0x1b, 0x0000, 0x8f1821, 0 }, + { 0x1c, 0x8100, 0x8f1828, 0 }, + { 0x1d, 0x810f, 0x8f1844, 0 }, + { 0x1e, 0x8119, 0x8f1862, 0 }, + { 0x1f, 0x0000, 0x000000, 0 }, + { 0x20, 0x8140, 0x000000, &vesa_modes[18] }, + { 0x21, 0x8159, 0x000000, &vesa_modes[19] }, + { 0x22, 0x0000, 0x000000, 0 }, + { 0x23, 0x8180, 0x000000, &vesa_modes[20] }, + { 0x24, 0x818f, 0x000000, &vesa_modes[21] }, + { 0x25, 0x8199, 0x000000, &vesa_modes[22] }, + { 0x26, 0x0000, 0x000000, 0 }, + { 0x27, 0x0000, 0x000000, 0 }, + { 0x28, 0x0000, 0x000000, 0 }, + { 0x29, 0x0000, 0x0c2021, 0 }, + { 0x2a, 0x9040, 0x0c2028, 0 }, + { 0x2b, 0x904f, 0x0c2044, 0 }, + { 0x2c, 0x9059, 0x0c2062, 0 }, + { 0x2d, 0x0000, 0x000000, 0 }, + { 0x2e, 0x9500, 0xc11821, 0 }, + { 0x2f, 0x9500, 0xc11828, 0 }, + { 0x30, 0x950f, 0xc11844, 0 }, + { 0x31, 0x9519, 0xc11868, 0 }, + { 0x32, 0x0000, 0x000000, 0 }, + { 0x33, 0xa940, 0x000000, &vesa_modes[23] }, + { 0x34, 0xa945, 0x000000, &vesa_modes[24] }, + { 0x35, 0xa94a, 0x000000, &vesa_modes[25] }, + { 0x36, 0xa94f, 0x000000, &vesa_modes[26] }, + { 0x37, 0xa959, 0x000000, &vesa_modes[27] }, + { 0x38, 0x0000, 0x000000, 0 }, + { 0x39, 0x0000, 0x0c2821, 0 }, + { 0x3a, 0xb300, 0x0c2828, 0 }, + { 0x3b, 0xb30f, 0x0c2844, 0 }, + { 0x3c, 0xb319, 0x0c2868, 0 }, + { 0x3d, 0x0000, 0x000000, 0 }, + { 0x3e, 0xc140, 0x000000, &vesa_modes[28] }, + { 0x3f, 0xc14f, 0x000000, &vesa_modes[29] }, + { 0x40, 0x0000, 0x000000, 0 }, + { 0x41, 0xc940, 0x000000, &vesa_modes[30] }, + { 0x42, 0xc94f, 0x000000, &vesa_modes[31] }, + { 0x43, 0x0000, 0x000000, 0 }, + { 0x44, 0x0000, 0x572821, &vesa_modes[34] }, + { 0x45, 0xd100, 0x572828, &vesa_modes[35] }, + { 0x46, 0xd10f, 0x572844, &vesa_modes[36] }, + { 0x47, 0xd119, 0x572862, &vesa_modes[37] }, + { 0x48, 0x0000, 0x000000, 0 }, + { 0x49, 0xd140, 0x000000, &vesa_modes[32] }, + { 0x4a, 0xd14f, 0x000000, &vesa_modes[33] }, + { 0x4b, 0x0000, 0x000000, 0 }, + { 0x4c, 0x0000, 0x1f3821, &vesa_modes[38] }, + { 0x4d, 0x0000, 0x1f3828, &vesa_modes[39] }, + { 0x4e, 0x0000, 0x1f3844, &vesa_modes[40] }, + { 0x4f, 0x0000, 0x1f3862, &vesa_modes[41] }, + { 0x50, 0x0000, 0x000000, &vesa_modes[42] }, +}; +EXPORT_SYMBOL(dmt_modes); #endif /* CONFIG_FB_MODE_HELPERS */ /** diff --git a/include/linux/fb.h b/include/linux/fb.h index 882dbd1a87b7..043f3283b71c 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -727,6 +727,8 @@ extern int fb_videomode_from_videomode(const struct videomode *vm, /* drivers/video/modedb.c */ #define VESA_MODEDB_SIZE 43 +#define DMT_SIZE 0x50 + extern void fb_var_to_videomode(struct fb_videomode *mode, const struct fb_var_screeninfo *var); extern void fb_videomode_to_var(struct fb_var_screeninfo *var, @@ -777,9 +779,17 @@ struct fb_videomode { u32 flag; }; +struct dmt_videomode { + u32 dmt_id; + u32 std_2byte_code; + u32 cvt_3byte_code; + const struct fb_videomode *mode; +}; + extern const char *fb_mode_option; extern const struct fb_videomode vesa_modes[]; extern const struct fb_videomode cea_modes[64]; +extern const struct dmt_videomode dmt_modes[]; struct fb_modelist { struct list_head list; -- cgit v1.2.3 From 
92a8b064fd4412d4dd32ef598b6f9e55feef9e51 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Wed, 14 Jan 2015 17:21:53 +0100 Subject: mfd: syscon: Add atmel-matrix registers definition AT91 SoCs have a memory range reserved for internal bus configuration. Expose those registers so that drivers can make use of the matrix syscon declared in at91 DTs. Signed-off-by: Boris Brezillon Acked-by: Lee Jones Signed-off-by: Nicolas Ferre --- include/linux/mfd/syscon/atmel-matrix.h | 117 ++++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 include/linux/mfd/syscon/atmel-matrix.h (limited to 'include/linux') diff --git a/include/linux/mfd/syscon/atmel-matrix.h b/include/linux/mfd/syscon/atmel-matrix.h new file mode 100644 index 000000000000..8293c3e2a82a --- /dev/null +++ b/include/linux/mfd/syscon/atmel-matrix.h @@ -0,0 +1,117 @@ +/* + * Copyright (C) 2014 Atmel Corporation. + * + * Memory Controllers (MATRIX, EBI) - System peripherals registers. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#ifndef _LINUX_MFD_SYSCON_ATMEL_MATRIX_H +#define _LINUX_MFD_SYSCON_ATMEL_MATRIX_H + +#define AT91SAM9260_MATRIX_MCFG 0x00 +#define AT91SAM9260_MATRIX_SCFG 0x40 +#define AT91SAM9260_MATRIX_PRS 0x80 +#define AT91SAM9260_MATRIX_MRCR 0x100 +#define AT91SAM9260_MATRIX_EBICSA 0x11c + +#define AT91SAM9261_MATRIX_MRCR 0x0 +#define AT91SAM9261_MATRIX_SCFG 0x4 +#define AT91SAM9261_MATRIX_TCR 0x24 +#define AT91SAM9261_MATRIX_EBICSA 0x30 +#define AT91SAM9261_MATRIX_USBPUCR 0x34 + +#define AT91SAM9263_MATRIX_MCFG 0x00 +#define AT91SAM9263_MATRIX_SCFG 0x40 +#define AT91SAM9263_MATRIX_PRS 0x80 +#define AT91SAM9263_MATRIX_MRCR 0x100 +#define AT91SAM9263_MATRIX_TCR 0x114 +#define AT91SAM9263_MATRIX_EBI0CSA 0x120 +#define AT91SAM9263_MATRIX_EBI1CSA 0x124 + +#define AT91SAM9RL_MATRIX_MCFG 0x00 +#define AT91SAM9RL_MATRIX_SCFG 0x40 +#define AT91SAM9RL_MATRIX_PRS 0x80 +#define AT91SAM9RL_MATRIX_MRCR 0x100 +#define AT91SAM9RL_MATRIX_TCR 0x114 +#define AT91SAM9RL_MATRIX_EBICSA 0x120 + +#define AT91SAM9G45_MATRIX_MCFG 0x00 +#define AT91SAM9G45_MATRIX_SCFG 0x40 +#define AT91SAM9G45_MATRIX_PRS 0x80 +#define AT91SAM9G45_MATRIX_MRCR 0x100 +#define AT91SAM9G45_MATRIX_TCR 0x110 +#define AT91SAM9G45_MATRIX_DDRMPR 0x118 +#define AT91SAM9G45_MATRIX_EBICSA 0x128 + +#define AT91SAM9N12_MATRIX_MCFG 0x00 +#define AT91SAM9N12_MATRIX_SCFG 0x40 +#define AT91SAM9N12_MATRIX_PRS 0x80 +#define AT91SAM9N12_MATRIX_MRCR 0x100 +#define AT91SAM9N12_MATRIX_EBICSA 0x118 + +#define AT91SAM9X5_MATRIX_MCFG 0x00 +#define AT91SAM9X5_MATRIX_SCFG 0x40 +#define AT91SAM9X5_MATRIX_PRS 0x80 +#define AT91SAM9X5_MATRIX_MRCR 0x100 +#define AT91SAM9X5_MATRIX_EBICSA 0x120 + +#define SAMA5D3_MATRIX_MCFG 0x00 +#define SAMA5D3_MATRIX_SCFG 0x40 +#define SAMA5D3_MATRIX_PRS 0x80 +#define SAMA5D3_MATRIX_MRCR 0x100 + +#define AT91_MATRIX_MCFG(o, x) ((o) + ((x) * 0x4)) +#define AT91_MATRIX_ULBT GENMASK(2, 0) +#define AT91_MATRIX_ULBT_INFINITE (0 << 0) +#define AT91_MATRIX_ULBT_SINGLE (1 << 0) +#define AT91_MATRIX_ULBT_FOUR (2 << 0) +#define AT91_MATRIX_ULBT_EIGHT (3 << 0) +#define AT91_MATRIX_ULBT_SIXTEEN (4 << 0) + +#define AT91_MATRIX_SCFG(o, x) ((o) + ((x) * 0x4)) +#define AT91_MATRIX_SLOT_CYCLE GENMASK(7, 0) +#define AT91_MATRIX_DEFMSTR_TYPE GENMASK(17, 16) +#define AT91_MATRIX_DEFMSTR_TYPE_NONE (0 << 16) +#define 
AT91_MATRIX_DEFMSTR_TYPE_LAST (1 << 16) +#define AT91_MATRIX_DEFMSTR_TYPE_FIXED (2 << 16) +#define AT91_MATRIX_FIXED_DEFMSTR GENMASK(20, 18) +#define AT91_MATRIX_ARBT GENMASK(25, 24) +#define AT91_MATRIX_ARBT_ROUND_ROBIN (0 << 24) +#define AT91_MATRIX_ARBT_FIXED_PRIORITY (1 << 24) + +#define AT91_MATRIX_ITCM_SIZE GENMASK(3, 0) +#define AT91_MATRIX_ITCM_0 (0 << 0) +#define AT91_MATRIX_ITCM_16 (5 << 0) +#define AT91_MATRIX_ITCM_32 (6 << 0) +#define AT91_MATRIX_ITCM_64 (7 << 0) +#define AT91_MATRIX_DTCM_SIZE GENMASK(7, 4) +#define AT91_MATRIX_DTCM_0 (0 << 4) +#define AT91_MATRIX_DTCM_16 (5 << 4) +#define AT91_MATRIX_DTCM_32 (6 << 4) +#define AT91_MATRIX_DTCM_64 (7 << 4) + +#define AT91_MATRIX_PRAS(o, x) ((o) + ((x) * 0x8)) +#define AT91_MATRIX_PRBS(o, x) ((o) + ((x) * 0x8) + 0x4) +#define AT91_MATRIX_MPR(x) GENMASK(((x) * 0x4) + 1, ((x) * 0x4)) + +#define AT91_MATRIX_RCB(x) BIT(x) + +#define AT91_MATRIX_CSA(cs, val) (val << (cs)) +#define AT91_MATRIX_DBPUC BIT(8) +#define AT91_MATRIX_DBPDC BIT(9) +#define AT91_MATRIX_VDDIOMSEL BIT(16) +#define AT91_MATRIX_VDDIOMSEL_1_8V (0 << 16) +#define AT91_MATRIX_VDDIOMSEL_3_3V (1 << 16) +#define AT91_MATRIX_EBI_IOSR BIT(17) +#define AT91_MATRIX_DDR_IOSR BIT(18) +#define AT91_MATRIX_NFD0_SELECT BIT(24) +#define AT91_MATRIX_DDR_MP_EN BIT(25) +#define AT91_MATRIX_EBI_NUM_CS 8 + +#define AT91_MATRIX_USBPUCR_PUON BIT(30) + +#endif /* _LINUX_MFD_SYSCON_ATMEL_MATRIX_H */ -- cgit v1.2.3 From 57d8acbacaace9a4d95ebde1c13b94d293ff21f3 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Wed, 14 Jan 2015 17:21:55 +0100 Subject: mfd: syscon: Add atmel-smc registers definition Atmel AT91 SoCs have a memory range reserved for SMC (Static Memory Controller) configuration. Expose those registers so that drivers can make use of the smc syscon declared in at91 DTs. Signed-off-by: Boris Brezillon Acked-by: Lee Jones Signed-off-by: Nicolas Ferre --- include/linux/mfd/syscon/atmel-smc.h | 173 +++++++++++++++++++++++++++++++++++ 1 file changed, 173 insertions(+) create mode 100644 include/linux/mfd/syscon/atmel-smc.h (limited to 'include/linux') diff --git a/include/linux/mfd/syscon/atmel-smc.h b/include/linux/mfd/syscon/atmel-smc.h new file mode 100644 index 000000000000..be6ebe64eebe --- /dev/null +++ b/include/linux/mfd/syscon/atmel-smc.h @@ -0,0 +1,173 @@ +/* + * Atmel SMC (Static Memory Controller) register offsets and bit definitions. + * + * Copyright (C) 2014 Atmel + * Copyright (C) 2014 Free Electrons + * + * Author: Boris Brezillon + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef _LINUX_MFD_SYSCON_ATMEL_SMC_H_ +#define _LINUX_MFD_SYSCON_ATMEL_SMC_H_ + +#include +#include + +#define AT91SAM9_SMC_GENERIC 0x00 +#define AT91SAM9_SMC_GENERIC_BLK_SZ 0x10 + +#define SAMA5_SMC_GENERIC 0x600 +#define SAMA5_SMC_GENERIC_BLK_SZ 0x14 + +#define AT91SAM9_SMC_SETUP(o) ((o) + 0x00) +#define AT91SAM9_SMC_NWESETUP(x) (x) +#define AT91SAM9_SMC_NCS_WRSETUP(x) ((x) << 8) +#define AT91SAM9_SMC_NRDSETUP(x) ((x) << 16) +#define AT91SAM9_SMC_NCS_NRDSETUP(x) ((x) << 24) + +#define AT91SAM9_SMC_PULSE(o) ((o) + 0x04) +#define AT91SAM9_SMC_NWEPULSE(x) (x) +#define AT91SAM9_SMC_NCS_WRPULSE(x) ((x) << 8) +#define AT91SAM9_SMC_NRDPULSE(x) ((x) << 16) +#define AT91SAM9_SMC_NCS_NRDPULSE(x) ((x) << 24) + +#define AT91SAM9_SMC_CYCLE(o) ((o) + 0x08) +#define AT91SAM9_SMC_NWECYCLE(x) (x) +#define AT91SAM9_SMC_NRDCYCLE(x) ((x) << 16) + +#define AT91SAM9_SMC_MODE(o) ((o) + 0x0c) +#define SAMA5_SMC_MODE(o) ((o) + 0x10) +#define AT91_SMC_READMODE BIT(0) +#define AT91_SMC_READMODE_NCS (0 << 0) +#define AT91_SMC_READMODE_NRD (1 << 0) +#define AT91_SMC_WRITEMODE BIT(1) +#define AT91_SMC_WRITEMODE_NCS (0 << 1) +#define AT91_SMC_WRITEMODE_NWE (1 << 1) +#define AT91_SMC_EXNWMODE GENMASK(5, 4) +#define AT91_SMC_EXNWMODE_DISABLE (0 << 4) +#define AT91_SMC_EXNWMODE_FROZEN (2 << 4) +#define AT91_SMC_EXNWMODE_READY (3 << 4) +#define AT91_SMC_BAT BIT(8) +#define AT91_SMC_BAT_SELECT (0 << 8) +#define AT91_SMC_BAT_WRITE (1 << 8) +#define AT91_SMC_DBW GENMASK(13, 12) +#define AT91_SMC_DBW_8 (0 << 12) +#define AT91_SMC_DBW_16 (1 << 12) +#define AT91_SMC_DBW_32 (2 << 12) +#define AT91_SMC_TDF GENMASK(19, 16) +#define AT91_SMC_TDF_(x) ((((x) - 1) << 16) & AT91_SMC_TDF) +#define AT91_SMC_TDF_MAX 16 +#define AT91_SMC_TDFMODE_OPTIMIZED BIT(20) +#define AT91_SMC_PMEN BIT(24) +#define AT91_SMC_PS GENMASK(29, 28) +#define AT91_SMC_PS_4 (0 << 28) +#define AT91_SMC_PS_8 (1 << 28) +#define AT91_SMC_PS_16 (2 << 28) +#define AT91_SMC_PS_32 (3 << 28) + + +/* + * This function converts a setup timing expressed in nanoseconds into an + * encoded value that can be written in the SMC_SETUP register. + * + * The following formula is described in atmel datasheets (section + * "SMC Setup Register"): + * + * setup length = (128* SETUP[5] + SETUP[4:0]) + * + * where setup length is the timing expressed in cycles. + */ +static inline u32 at91sam9_smc_setup_ns_to_cycles(unsigned int clk_rate, + u32 timing_ns) +{ + u32 clk_period = DIV_ROUND_UP(NSEC_PER_SEC, clk_rate); + u32 coded_cycles = 0; + u32 cycles; + + cycles = DIV_ROUND_UP(timing_ns, clk_period); + if (cycles / 32) { + coded_cycles |= 1 << 5; + if (cycles < 128) + cycles = 0; + } + + coded_cycles |= cycles % 32; + + return coded_cycles; +} + +/* + * This function converts a pulse timing expressed in nanoseconds into an + * encoded value that can be written in the SMC_PULSE register. + * + * The following formula is described in atmel datasheets (section + * "SMC Pulse Register"): + * + * pulse length = (256* PULSE[6] + PULSE[5:0]) + * + * where pulse length is the timing expressed in cycles. 
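+ * + * Worked example (illustrative, not part of the original header): with + * clk_rate = 133000000 (133 MHz), clk_period = DIV_ROUND_UP(1000000000, 133000000) = 8 ns, + * so timing_ns = 60 gives DIV_ROUND_UP(60, 8) = 8 cycles; 8 is below 64, + * so the high bit stays clear and the encoded PULSE value is simply 8.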
+ */ +static inline u32 at91sam9_smc_pulse_ns_to_cycles(unsigned int clk_rate, + u32 timing_ns) +{ + u32 clk_period = DIV_ROUND_UP(NSEC_PER_SEC, clk_rate); + u32 coded_cycles = 0; + u32 cycles; + + cycles = DIV_ROUND_UP(timing_ns, clk_period); + if (cycles / 64) { + coded_cycles |= 1 << 6; + if (cycles < 256) + cycles = 0; + } + + coded_cycles |= cycles % 64; + + return coded_cycles; +} + +/* + * This function converts a cycle timing expressed in nanoseconds into an + * encoded value that can be written in the SMC_CYCLE register. + * + * The following formula is described in atmel datasheets (section + * "SMC Cycle Register"): + * + * cycle length = (CYCLE[8:7]*256 + CYCLE[6:0]) + * + * where cycle length is the timing expressed in cycles. + */ +static inline u32 at91sam9_smc_cycle_ns_to_cycles(unsigned int clk_rate, + u32 timing_ns) +{ + u32 clk_period = DIV_ROUND_UP(NSEC_PER_SEC, clk_rate); + u32 coded_cycles = 0; + u32 cycles; + + cycles = DIV_ROUND_UP(timing_ns, clk_period); + if (cycles / 128) { + coded_cycles = cycles / 256; + cycles %= 256; + if (cycles >= 128) { + coded_cycles++; + cycles = 0; + } + + if (coded_cycles > 0x3) { + coded_cycles = 0x3; + cycles = 0x7f; + } + + coded_cycles <<= 7; + } + + coded_cycles |= cycles % 128; + + return coded_cycles; +} + +#endif /* _LINUX_MFD_SYSCON_ATMEL_SMC_H_ */ -- cgit v1.2.3 From d621e8bae5ac9c67de4de90c5cded12adc8ee1e1 Mon Sep 17 00:00:00 2001 From: Ricardo Ribalda Delgado Date: Wed, 17 Dec 2014 16:51:13 +0100 Subject: gpio/gpiolib-of: Create of_mm_gpiochip_remove Create the counterpart of of_mm_gpiochip_add(). This way, modules that can be removed do not have to duplicate the cleanup code. Suggested-by: Alexandre Courbot Signed-off-by: Ricardo Ribalda Delgado Signed-off-by: Linus Walleij --- drivers/gpio/gpiolib-of.c | 17 +++++++++++++++++ include/linux/of_gpio.h | 1 + 2 files changed, 18 insertions(+) (limited to 'include/linux') diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c index 604dbe60bdee..3e2c6afeab11 100644 --- a/drivers/gpio/gpiolib-of.c +++ b/drivers/gpio/gpiolib-of.c @@ -204,6 +204,23 @@ err0: } EXPORT_SYMBOL(of_mm_gpiochip_add); +/** + * of_mm_gpiochip_remove - Remove memory mapped GPIO chip (bank) + * @mm_gc: pointer to the of_mm_gpio_chip allocated structure + */ +void of_mm_gpiochip_remove(struct of_mm_gpio_chip *mm_gc) +{ + struct gpio_chip *gc = &mm_gc->gc; + + if (!mm_gc) + return; + + gpiochip_remove(gc); + iounmap(mm_gc->regs); + kfree(gc->label); +} +EXPORT_SYMBOL(of_mm_gpiochip_remove); + #ifdef CONFIG_PINCTRL static void of_gpiochip_add_pin_range(struct gpio_chip *chip) { diff --git a/include/linux/of_gpio.h b/include/linux/of_gpio.h index 38fc05036015..69dbe312b11b 100644 --- a/include/linux/of_gpio.h +++ b/include/linux/of_gpio.h @@ -52,6 +52,7 @@ extern int of_get_named_gpio_flags(struct device_node *np, extern int of_mm_gpiochip_add(struct device_node *np, struct of_mm_gpio_chip *mm_gc); +extern void of_mm_gpiochip_remove(struct of_mm_gpio_chip *mm_gc); extern void of_gpiochip_add(struct gpio_chip *gc); extern void of_gpiochip_remove(struct gpio_chip *gc); -- cgit v1.2.3 From 53e41f554a0cbad139ee5072bbb49b4951f680c2 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Mon, 15 Dec 2014 10:39:47 +0100 Subject: gpio: tc3589x: get rid of platform data This device is only used from the device tree, and the startup() and remove() callbacks are not used anywhere in the kernel, so retire them and the pdata altogether.
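As a usage note for the of_mm_gpiochip_remove() helper introduced above: a removable memory-mapped GPIO driver can now pair of_mm_gpiochip_add() in its probe path with a single call in its remove path, instead of open-coding gpiochip_remove(), iounmap() and kfree(). A minimal sketch (hypothetical foo driver, illustration only):

#include <linux/of_gpio.h>
#include <linux/platform_device.h>

/* Hypothetical driver: probe() is assumed to have registered the chip
 * with of_mm_gpiochip_add() and stored it as driver data. */
static int foo_gpio_remove(struct platform_device *pdev)
{
	struct of_mm_gpio_chip *mm_gc = platform_get_drvdata(pdev);

	/* Unregisters the gpio_chip, unmaps the regs, frees the label. */
	of_mm_gpiochip_remove(mm_gc);
	return 0;
}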
Cc: Samuel Ortiz Acked-by: Lee Jones Signed-off-by: Linus Walleij --- drivers/gpio/gpio-tc3589x.c | 15 ++------------- include/linux/mfd/tc3589x.h | 12 ------------ 2 files changed, 2 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpio/gpio-tc3589x.c b/drivers/gpio/gpio-tc3589x.c index abdcf58935f5..11aed2671065 100644 --- a/drivers/gpio/gpio-tc3589x.c +++ b/drivers/gpio/gpio-tc3589x.c @@ -232,16 +232,13 @@ static irqreturn_t tc3589x_gpio_irq(int irq, void *dev) static int tc3589x_gpio_probe(struct platform_device *pdev) { struct tc3589x *tc3589x = dev_get_drvdata(pdev->dev.parent); - struct tc3589x_gpio_platform_data *pdata; struct device_node *np = pdev->dev.of_node; struct tc3589x_gpio *tc3589x_gpio; int ret; int irq; - pdata = tc3589x->pdata->gpio; - - if (!(pdata || np)) { - dev_err(&pdev->dev, "No platform data or Device Tree found\n"); + if (!np) { + dev_err(&pdev->dev, "No Device Tree node found\n"); return -EINVAL; } @@ -305,9 +302,6 @@ static int tc3589x_gpio_probe(struct platform_device *pdev) irq, NULL); - if (pdata && pdata->setup) - pdata->setup(tc3589x, tc3589x_gpio->chip.base); - platform_set_drvdata(pdev, tc3589x_gpio); return 0; @@ -316,11 +310,6 @@ static int tc3589x_gpio_remove(struct platform_device *pdev) { struct tc3589x_gpio *tc3589x_gpio = platform_get_drvdata(pdev); - struct tc3589x *tc3589x = tc3589x_gpio->tc3589x; - struct tc3589x_gpio_platform_data *pdata = tc3589x->pdata->gpio; - - if (pdata && pdata->remove) - pdata->remove(tc3589x, tc3589x_gpio->chip.base); gpiochip_remove(&tc3589x_gpio->chip); diff --git a/include/linux/mfd/tc3589x.h b/include/linux/mfd/tc3589x.h index e1c12d84c26a..c203c9c56776 100644 --- a/include/linux/mfd/tc3589x.h +++ b/include/linux/mfd/tc3589x.h @@ -162,25 +162,13 @@ struct tc3589x_keypad_platform_data { bool no_autorepeat; }; -/** - * struct tc3589x_gpio_platform_data - TC3589x GPIO platform data - * @setup: callback for board-specific initialization - * @remove: callback for board-specific teardown - */ -struct tc3589x_gpio_platform_data { - void (*setup)(struct tc3589x *tc3589x, unsigned gpio_base); - void (*remove)(struct tc3589x *tc3589x, unsigned gpio_base); -}; - /** * struct tc3589x_platform_data - TC3589x platform data * @block: bitmask of blocks to enable (use TC3589x_BLOCK_*) - * @gpio: GPIO-specific platform data * @keypad: keypad-specific platform data */ struct tc3589x_platform_data { unsigned int block; - struct tc3589x_gpio_platform_data *gpio; const struct tc3589x_keypad_platform_data *keypad; }; -- cgit v1.2.3 From d34541bc48eb7d7cb2ead5ff0284acf65af96f17 Mon Sep 17 00:00:00 2001 From: Olliver Schinagl Date: Wed, 7 Jan 2015 09:44:57 +0100 Subject: gpio: Make the vararg hacks not pass magic values Right now, in consumer.h, there are some vararg hacks that pass 0 as the flags. What is actually passed, however, is GPIOD_ASIS, which happens to also be 0. Using the define/enum rather than the magic 0 makes the define more readable to a passer-by. Signed-off-by: Olliver Schinagl Signed-off-by: Linus Walleij --- include/linux/gpio/consumer.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index fd85cb120ee0..45afc2dee560 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -340,31 +340,32 @@ static inline int desc_to_gpio(const struct gpio_desc *desc) * etc.
*/ #define __gpiod_get(dev, con_id, flags, ...) __gpiod_get(dev, con_id, flags) -#define gpiod_get(varargs...) __gpiod_get(varargs, 0) +#define gpiod_get(varargs...) __gpiod_get(varargs, GPIOD_ASIS) #define __gpiod_get_index(dev, con_id, index, flags, ...) \ __gpiod_get_index(dev, con_id, index, flags) -#define gpiod_get_index(varargs...) __gpiod_get_index(varargs, 0) +#define gpiod_get_index(varargs...) __gpiod_get_index(varargs, GPIOD_ASIS) #define __gpiod_get_optional(dev, con_id, flags, ...) \ __gpiod_get_optional(dev, con_id, flags) -#define gpiod_get_optional(varargs...) __gpiod_get_optional(varargs, 0) +#define gpiod_get_optional(varargs...) __gpiod_get_optional(varargs, GPIOD_ASIS) #define __gpiod_get_index_optional(dev, con_id, index, flags, ...) \ __gpiod_get_index_optional(dev, con_id, index, flags) #define gpiod_get_index_optional(varargs...) \ - __gpiod_get_index_optional(varargs, 0) + __gpiod_get_index_optional(varargs, GPIOD_ASIS) #define __devm_gpiod_get(dev, con_id, flags, ...) \ __devm_gpiod_get(dev, con_id, flags) -#define devm_gpiod_get(varargs...) __devm_gpiod_get(varargs, 0) +#define devm_gpiod_get(varargs...) __devm_gpiod_get(varargs, GPIOD_ASIS) #define __devm_gpiod_get_index(dev, con_id, index, flags, ...) \ __devm_gpiod_get_index(dev, con_id, index, flags) -#define devm_gpiod_get_index(varargs...) __devm_gpiod_get_index(varargs, 0) +#define devm_gpiod_get_index(varargs...) \ + __devm_gpiod_get_index(varargs, GPIOD_ASIS) #define __devm_gpiod_get_optional(dev, con_id, flags, ...) \ __devm_gpiod_get_optional(dev, con_id, flags) #define devm_gpiod_get_optional(varargs...) \ - __devm_gpiod_get_optional(varargs, 0) + __devm_gpiod_get_optional(varargs, GPIOD_ASIS) #define __devm_gpiod_get_index_optional(dev, con_id, index, flags, ...) \ __devm_gpiod_get_index_optional(dev, con_id, index, flags) #define devm_gpiod_get_index_optional(varargs...) \ - __devm_gpiod_get_index_optional(varargs, 0) + __devm_gpiod_get_index_optional(varargs, GPIOD_ASIS) #if IS_ENABLED(CONFIG_GPIOLIB) && IS_ENABLED(CONFIG_GPIO_SYSFS) -- cgit v1.2.3 From 2397aa8b515f7bd77c8d5698170b6a98fdd6721c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 13 Jan 2015 11:02:54 -0500 Subject: svcrdma: Clean up read chunk counting The byte_count argument is not used, and the function is called only from one place. Signed-off-by: Chuck Lever Reviewed-by: Steve Wise Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 2 -- net/sunrpc/xprtrdma/svc_rdma_marshal.c | 16 ---------------- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 15 ++++++++++++--- 3 files changed, 12 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 975da754c778..2280325e4c88 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -178,8 +178,6 @@ struct svcxprt_rdma { #define RPCRDMA_MAX_REQ_SIZE 4096 /* svc_rdma_marshal.c */ -extern void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *, - int *, int *); extern int svc_rdma_xdr_decode_req(struct rpcrdma_msg **, struct svc_rqst *); extern int svc_rdma_xdr_decode_deferred_req(struct svc_rqst *); extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *, diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c index 65b146297f5a..b681855cf970 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c +++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c @@ -70,22 +70,6 @@ static u32 *decode_read_list(u32 *va, u32 *vaend) return (u32 *)&ch->rc_position; } -/* - * Determine number of chunks and total bytes in chunk list. The chunk - * list has already been verified to fit within the RPCRDMA header. - */ -void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *ch, - int *ch_count, int *byte_count) -{ - /* compute the number of bytes represented by read chunks */ - *byte_count = 0; - *ch_count = 0; - for (; ch->rc_discrim != 0; ch++) { - *byte_count = *byte_count + ntohl(ch->rc_target.rs_length); - *ch_count = *ch_count + 1; - } -} - /* * Decodes a write chunk list. The expected format is as follows: * descrim : xdr_one diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 2c67de032009..b3b7bb85844d 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -365,12 +365,22 @@ static int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt, return ret; } +static unsigned int +rdma_rcl_chunk_count(struct rpcrdma_read_chunk *ch) +{ + unsigned int count; + + for (count = 0; ch->rc_discrim != xdr_zero; ch++) + count++; + return count; +} + static int rdma_read_chunks(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp, struct svc_rdma_op_ctxt *head) { - int page_no, ch_count, ret; + int page_no, ret; struct rpcrdma_read_chunk *ch; u32 page_offset, byte_count; u64 rs_offset; @@ -381,8 +391,7 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt, if (!ch) return 0; - svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count); - if (ch_count > RPCSVC_MAXPAGES) + if (rdma_rcl_chunk_count(ch) > RPCSVC_MAXPAGES) return -EINVAL; /* The request is completed when the RDMA_READs complete. The -- cgit v1.2.3 From e54524111f51eac1900cf91aca3d38a92a6b11c0 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 13 Jan 2015 11:03:20 -0500 Subject: svcrdma: Plant reader function in struct svcxprt_rdma The RDMA reader function doesn't change once an svcxprt_rdma is instantiated. Instead of checking sc_devcap during every incoming RPC, set the reader function once when the connection is accepted. Signed-off-by: Chuck Lever Reviewed-by: Steve Wise Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 10 +++++ net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 71 ++++++++++++-------------------- net/sunrpc/xprtrdma/svc_rdma_transport.c | 2 + 3 files changed, 39 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 2280325e4c88..f161e309f25e 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -150,6 +150,10 @@ struct svcxprt_rdma { struct ib_cq *sc_rq_cq; struct ib_cq *sc_sq_cq; struct ib_mr *sc_phys_mr; /* MR for server memory */ + int (*sc_reader)(struct svcxprt_rdma *, + struct svc_rqst *, + struct svc_rdma_op_ctxt *, + int *, u32 *, u32, u32, u64, bool); u32 sc_dev_caps; /* distilled device caps */ u32 sc_dma_lkey; /* local dma key */ unsigned int sc_frmr_pg_list_len; @@ -195,6 +199,12 @@ extern int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *); /* svc_rdma_recvfrom.c */ extern int svc_rdma_recvfrom(struct svc_rqst *); +extern int rdma_read_chunk_lcl(struct svcxprt_rdma *, struct svc_rqst *, + struct svc_rdma_op_ctxt *, int *, u32 *, + u32, u32, u64, bool); +extern int rdma_read_chunk_frmr(struct svcxprt_rdma *, struct svc_rqst *, + struct svc_rdma_op_ctxt *, int *, u32 *, + u32, u32, u64, bool); /* svc_rdma_sendto.c */ extern int svc_rdma_sendto(struct svc_rqst *); diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 577f8659ca30..c3aebc1bf0a6 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -117,26 +117,16 @@ static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count) return min_t(int, sge_count, xprt->sc_max_sge); } -typedef int (*rdma_reader_fn)(struct svcxprt_rdma *xprt, - struct svc_rqst *rqstp, - struct svc_rdma_op_ctxt *head, - int *page_no, - u32 *page_offset, - u32 rs_handle, - u32 rs_length, - u64 rs_offset, - int last); - /* Issue an RDMA_READ using the local lkey to map the data sink */ -static int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, - struct svc_rqst *rqstp, - struct svc_rdma_op_ctxt *head, - int *page_no, - u32 *page_offset, - u32 rs_handle, - u32 rs_length, - u64 rs_offset, - int last) +int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, + struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *head, + int *page_no, + u32 *page_offset, + u32 rs_handle, + u32 rs_length, + u64 rs_offset, + bool last) { struct ib_send_wr read_wr; int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT; @@ -221,15 +211,15 @@ static int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, } /* Issue an RDMA_READ using an FRMR to map the data sink */ -static int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt, - struct svc_rqst *rqstp, - struct svc_rdma_op_ctxt *head, - int *page_no, - u32 *page_offset, - u32 rs_handle, - u32 rs_length, - u64 rs_offset, - int last) +int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt, + struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *head, + int *page_no, + u32 *page_offset, + u32 rs_handle, + u32 rs_length, + u64 rs_offset, + bool last) { struct ib_send_wr read_wr; struct ib_send_wr inv_wr; @@ -374,9 +364,9 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt, { int page_no, ret; struct rpcrdma_read_chunk *ch; - u32 page_offset, byte_count; + u32 handle, page_offset, byte_count; u64 rs_offset; - rdma_reader_fn reader; + bool last; /* If no read list is present, return 0 */ ch = svc_rdma_get_read_chunk(rmsgp); @@ -399,27 +389,20 @@ static int 
rdma_read_chunks(struct svcxprt_rdma *xprt, head->arg.len = rqstp->rq_arg.len; head->arg.buflen = rqstp->rq_arg.buflen; - /* Use FRMR if supported */ - if (xprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG) - reader = rdma_read_chunk_frmr; - else - reader = rdma_read_chunk_lcl; - page_no = 0; page_offset = 0; for (ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; ch->rc_discrim != 0; ch++) { - + handle = be32_to_cpu(ch->rc_target.rs_handle); + byte_count = be32_to_cpu(ch->rc_target.rs_length); xdr_decode_hyper((__be32 *)&ch->rc_target.rs_offset, &rs_offset); - byte_count = ntohl(ch->rc_target.rs_length); while (byte_count > 0) { - ret = reader(xprt, rqstp, head, - &page_no, &page_offset, - ntohl(ch->rc_target.rs_handle), - byte_count, rs_offset, - ((ch+1)->rc_discrim == 0) /* last */ - ); + last = (ch + 1)->rc_discrim == xdr_zero; + ret = xprt->sc_reader(xprt, rqstp, head, + &page_no, &page_offset, + handle, byte_count, + rs_offset, last); if (ret < 0) goto err; byte_count -= ret; diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index f2e059bbab42..f609c1c2d38d 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -974,10 +974,12 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) * NB: iWARP requires remote write access for the data sink * of an RDMA_READ. IB does not. */ + newxprt->sc_reader = rdma_read_chunk_lcl; if (devattr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { newxprt->sc_frmr_pg_list_len = devattr.max_fast_reg_page_list_len; newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG; + newxprt->sc_reader = rdma_read_chunk_frmr; } /* -- cgit v1.2.3 From 0b056c224bea63060ce8a981e84193c93fac6f5d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 13 Jan 2015 11:03:37 -0500 Subject: svcrdma: Support RDMA_NOMSG requests Currently the Linux server cannot decode RDMA_NOMSG type requests. Operations whose length exceeds the fixed size of RDMA SEND buffers, like large NFSv4 CREATE(NF4LNK) operations, must be conveyed via RDMA_NOMSG. For an RDMA_MSG type request, the client sends the RPC/RDMA header, the RPC header, and some or all of the NFS arguments via RDMA SEND. For an RDMA_NOMSG type request, the client sends just the RPC/RDMA header via RDMA SEND. The request's read list contains elements for the entire RPC message, including the RPC header. NFSD expects the RPC/RDMA header and RPC header to be contiguous in page zero of the XDR buffer. Add logic in the RDMA READ path to make the read list contents land where the server prefers, when the incoming message is an RDMA_NOMSG type message. Signed-off-by: Chuck Lever Reviewed-by: Steve Wise Signed-off-by: J.
Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 1 + net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 39 ++++++++++++++++++++++++++++++--- 2 files changed, 37 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index f161e309f25e..c343a94bc791 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -79,6 +79,7 @@ struct svc_rdma_op_ctxt { enum ib_wr_opcode wr_op; enum ib_wc_status wc_status; u32 byte_len; + u32 position; struct svcxprt_rdma *xprt; unsigned long flags; enum dma_data_direction direction; diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index a67dd1a081dd..36cf51a3eab7 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -60,6 +60,7 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, struct svc_rdma_op_ctxt *ctxt, u32 byte_count) { + struct rpcrdma_msg *rmsgp; struct page *page; u32 bc; int sge_no; @@ -82,7 +83,14 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, /* If data remains, store it in the pagelist */ rqstp->rq_arg.page_len = bc; rqstp->rq_arg.page_base = 0; - rqstp->rq_arg.pages = &rqstp->rq_pages[1]; + + /* RDMA_NOMSG: RDMA READ data should land just after RDMA RECV data */ + rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base; + if (be32_to_cpu(rmsgp->rm_type) == RDMA_NOMSG) + rqstp->rq_arg.pages = &rqstp->rq_pages[0]; + else + rqstp->rq_arg.pages = &rqstp->rq_pages[1]; + sge_no = 1; while (bc && sge_no < ctxt->count) { page = ctxt->pages[sge_no]; @@ -383,7 +391,6 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt, */ head->arg.head[0] = rqstp->rq_arg.head[0]; head->arg.tail[0] = rqstp->rq_arg.tail[0]; - head->arg.pages = &head->pages[head->count]; head->hdr_count = head->count; head->arg.page_base = 0; head->arg.page_len = 0; @@ -393,9 +400,17 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt, ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; position = be32_to_cpu(ch->rc_position); + /* RDMA_NOMSG: RDMA READ data should land just after RDMA RECV data */ + if (position == 0) { + head->arg.pages = &head->pages[0]; + page_offset = head->byte_len; + } else { + head->arg.pages = &head->pages[head->count]; + page_offset = 0; + } + ret = 0; page_no = 0; - page_offset = 0; for (; ch->rc_discrim != xdr_zero; ch++) { if (be32_to_cpu(ch->rc_position) != position) goto err; @@ -418,7 +433,10 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt, head->arg.buflen += ret; } } + ret = 1; + head->position = position; + err: /* Detach arg pages. 
svc_recv will replenish them */ for (page_no = 0; @@ -465,6 +483,21 @@ static int rdma_read_complete(struct svc_rqst *rqstp, put_page(rqstp->rq_pages[page_no]); rqstp->rq_pages[page_no] = head->pages[page_no]; } + + /* Adjustments made for RDMA_NOMSG type requests */ + if (head->position == 0) { + if (head->arg.len <= head->sge[0].length) { + head->arg.head[0].iov_len = head->arg.len - + head->byte_len; + head->arg.page_len = 0; + } else { + head->arg.head[0].iov_len = head->sge[0].length - + head->byte_len; + head->arg.page_len = head->arg.len - + head->sge[0].length; + } + } + /* Point rq_arg.pages past header */ rdma_fix_xdr_pad(&head->arg); rqstp->rq_arg.pages = &rqstp->rq_pages[head->hdr_count]; -- cgit v1.2.3 From 57699a40b4f2694d3ee63fd5e6465ec8f600b620 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Fri, 16 Jan 2015 11:13:09 +0800 Subject: rhashtable: Fix race in rhashtable_destroy() and use regular work_struct When we put our declared work task in the global workqueue with schedule_delayed_work(), its delay parameter is always zero. Therefore, we should define a regular work in the rhashtable structure instead of a delayed work. We also add a check that the resizing functions are non-NULL before cancelling the work, to avoid cancelling an uninitialized work item. Lastly, ht->mutex is taken in rhashtable_destroy() while we wait, via cancel_delayed_work(), for all previously submitted work items to run to completion. cancel_delayed_work() does not return until all work items are finished, and when a scheduled work item runs, its function - rht_deferred_worker() - is called. However, as rht_deferred_worker() also needs to acquire the lock, a deadlock can occur because the lock is already held. Moving the cancel-work call out of the scope covered by the lock avoids the deadlock. Fixes: 97defe1 ("rhashtable: Per bucket locks & deferred expansion/shrinking") Signed-off-by: Ying Xue Cc: Thomas Graf Acked-by: Thomas Graf Signed-off-by: David S.
Miller --- include/linux/rhashtable.h | 2 +- lib/rhashtable.c | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 9570832ab07c..a2562ed53ea3 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -119,7 +119,7 @@ struct rhashtable { atomic_t nelems; atomic_t shift; struct rhashtable_params p; - struct delayed_work run_work; + struct work_struct run_work; struct mutex mutex; bool being_destroyed; }; diff --git a/lib/rhashtable.c b/lib/rhashtable.c index aca699813ba9..84a78e396a56 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -485,7 +485,7 @@ static void rht_deferred_worker(struct work_struct *work) struct rhashtable *ht; struct bucket_table *tbl; - ht = container_of(work, struct rhashtable, run_work.work); + ht = container_of(work, struct rhashtable, run_work); mutex_lock(&ht->mutex); tbl = rht_dereference(ht->tbl, ht); @@ -507,7 +507,7 @@ static void rhashtable_wakeup_worker(struct rhashtable *ht) if (tbl == new_tbl && ((ht->p.grow_decision && ht->p.grow_decision(ht, size)) || (ht->p.shrink_decision && ht->p.shrink_decision(ht, size)))) - schedule_delayed_work(&ht->run_work, 0); + schedule_work(&ht->run_work); } static void __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj, @@ -903,7 +903,7 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params) get_random_bytes(&ht->p.hash_rnd, sizeof(ht->p.hash_rnd)); if (ht->p.grow_decision || ht->p.shrink_decision) - INIT_DEFERRABLE_WORK(&ht->run_work, rht_deferred_worker); + INIT_WORK(&ht->run_work, rht_deferred_worker); return 0; } @@ -921,11 +921,11 @@ void rhashtable_destroy(struct rhashtable *ht) { ht->being_destroyed = true; - mutex_lock(&ht->mutex); + if (ht->p.grow_decision || ht->p.shrink_decision) + cancel_work_sync(&ht->run_work); - cancel_delayed_work(&ht->run_work); + mutex_lock(&ht->mutex); bucket_table_free(rht_dereference(ht->tbl, ht)); - mutex_unlock(&ht->mutex); } EXPORT_SYMBOL_GPL(rhashtable_destroy); -- cgit v1.2.3 From 5cd37193ce8539be1e6ef76be226f4bcc984e0f5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 13 Dec 2014 20:32:04 -0800 Subject: rcu: Make cond_resched_rcu_qs() apply to normal RCU flavors Although cond_resched_rcu_qs() only applies to TASKS_RCU, it is used in places where it would be useful for it to apply to the normal RCU flavors, rcu_preempt, rcu_sched, and rcu_bh. This is especially the case for workloads that aggressively overload the system, particularly those that generate large numbers of RCU updates on systems running NO_HZ_FULL CPUs. This commit therefore communicates quiescent states from cond_resched_rcu_qs() to the normal RCU flavors. Note that it is unfortunately necessary to leave the old ->passed_quiesce mechanism in place to allow quiescent states that apply to only one flavor to be recorded. (Yes, we could decrement ->rcu_qs_ctr_snap in that case, but that is not so good for debugging of RCU internals.) In addition, if one of the RCU flavor's grace period has stalled, this will invoke rcu_momentary_dyntick_idle(), resulting in a heavy-weight quiescent state visible from other CPUs. Reported-by: Sasha Levin Reported-by: Dave Jones Signed-off-by: Paul E. McKenney [ paulmck: Merge commit from Sasha Levin fixing a bug where __this_cpu() was used in preemptible code. 
] --- Documentation/RCU/trace.txt | 32 ++++++++++++++++---------------- include/linux/rcupdate.h | 3 ++- include/linux/rcutiny.h | 5 ++++- include/linux/rcutree.h | 2 ++ kernel/rcu/tree.c | 38 +++++++++++++++++++++++++++++++++----- kernel/rcu/tree.h | 2 ++ kernel/rcu/tree_trace.c | 8 ++++++-- 7 files changed, 65 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index b63b9bb3bc0c..08651da15448 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -56,14 +56,14 @@ rcuboost: The output of "cat rcu/rcu_preempt/rcudata" looks as follows: - 0!c=30455 g=30456 pq=1 qp=1 dt=126535/140000000000000/0 df=2002 of=4 ql=0/0 qs=N... b=10 ci=74572 nci=0 co=1131 ca=716 - 1!c=30719 g=30720 pq=1 qp=0 dt=132007/140000000000000/0 df=1874 of=10 ql=0/0 qs=N... b=10 ci=123209 nci=0 co=685 ca=982 - 2!c=30150 g=30151 pq=1 qp=1 dt=138537/140000000000000/0 df=1707 of=8 ql=0/0 qs=N... b=10 ci=80132 nci=0 co=1328 ca=1458 - 3 c=31249 g=31250 pq=1 qp=0 dt=107255/140000000000000/0 df=1749 of=6 ql=0/450 qs=NRW. b=10 ci=151700 nci=0 co=509 ca=622 - 4!c=29502 g=29503 pq=1 qp=1 dt=83647/140000000000000/0 df=965 of=5 ql=0/0 qs=N... b=10 ci=65643 nci=0 co=1373 ca=1521 - 5 c=31201 g=31202 pq=1 qp=1 dt=70422/0/0 df=535 of=7 ql=0/0 qs=.... b=10 ci=58500 nci=0 co=764 ca=698 - 6!c=30253 g=30254 pq=1 qp=1 dt=95363/140000000000000/0 df=780 of=5 ql=0/0 qs=N... b=10 ci=100607 nci=0 co=1414 ca=1353 - 7 c=31178 g=31178 pq=1 qp=0 dt=91536/0/0 df=547 of=4 ql=0/0 qs=.... b=10 ci=109819 nci=0 co=1115 ca=969 + 0!c=30455 g=30456 pq=1/0 qp=1 dt=126535/140000000000000/0 df=2002 of=4 ql=0/0 qs=N... b=10 ci=74572 nci=0 co=1131 ca=716 + 1!c=30719 g=30720 pq=1/0 qp=0 dt=132007/140000000000000/0 df=1874 of=10 ql=0/0 qs=N... b=10 ci=123209 nci=0 co=685 ca=982 + 2!c=30150 g=30151 pq=1/1 qp=1 dt=138537/140000000000000/0 df=1707 of=8 ql=0/0 qs=N... b=10 ci=80132 nci=0 co=1328 ca=1458 + 3 c=31249 g=31250 pq=1/1 qp=0 dt=107255/140000000000000/0 df=1749 of=6 ql=0/450 qs=NRW. b=10 ci=151700 nci=0 co=509 ca=622 + 4!c=29502 g=29503 pq=1/0 qp=1 dt=83647/140000000000000/0 df=965 of=5 ql=0/0 qs=N... b=10 ci=65643 nci=0 co=1373 ca=1521 + 5 c=31201 g=31202 pq=1/0 qp=1 dt=70422/0/0 df=535 of=7 ql=0/0 qs=.... b=10 ci=58500 nci=0 co=764 ca=698 + 6!c=30253 g=30254 pq=1/0 qp=1 dt=95363/140000000000000/0 df=780 of=5 ql=0/0 qs=N... b=10 ci=100607 nci=0 co=1414 ca=1353 + 7 c=31178 g=31178 pq=1/0 qp=0 dt=91536/0/0 df=547 of=4 ql=0/0 qs=.... b=10 ci=109819 nci=0 co=1115 ca=969 This file has one line per CPU, or eight for this 8-CPU system. The fields are as follows: @@ -188,14 +188,14 @@ o "ca" is the number of RCU callbacks that have been adopted by this Kernels compiled with CONFIG_RCU_BOOST=y display the following from /debug/rcu/rcu_preempt/rcudata: - 0!c=12865 g=12866 pq=1 qp=1 dt=83113/140000000000000/0 df=288 of=11 ql=0/0 qs=N... kt=0/O ktl=944 b=10 ci=60709 nci=0 co=748 ca=871 - 1 c=14407 g=14408 pq=1 qp=0 dt=100679/140000000000000/0 df=378 of=7 ql=0/119 qs=NRW. kt=0/W ktl=9b6 b=10 ci=109740 nci=0 co=589 ca=485 - 2 c=14407 g=14408 pq=1 qp=0 dt=105486/0/0 df=90 of=9 ql=0/89 qs=NRW. kt=0/W ktl=c0c b=10 ci=83113 nci=0 co=533 ca=490 - 3 c=14407 g=14408 pq=1 qp=0 dt=107138/0/0 df=142 of=8 ql=0/188 qs=NRW. kt=0/W ktl=b96 b=10 ci=121114 nci=0 co=426 ca=290 - 4 c=14405 g=14406 pq=1 qp=1 dt=50238/0/0 df=706 of=7 ql=0/0 qs=.... kt=0/W ktl=812 b=10 ci=34929 nci=0 co=643 ca=114 - 5!c=14168 g=14169 pq=1 qp=0 dt=45465/140000000000000/0 df=161 of=11 ql=0/0 qs=N... 
kt=0/O ktl=b4d b=10 ci=47712 nci=0 co=677 ca=722 - 6 c=14404 g=14405 pq=1 qp=0 dt=59454/0/0 df=94 of=6 ql=0/0 qs=.... kt=0/W ktl=e57 b=10 ci=55597 nci=0 co=701 ca=811 - 7 c=14407 g=14408 pq=1 qp=1 dt=68850/0/0 df=31 of=8 ql=0/0 qs=.... kt=0/W ktl=14bd b=10 ci=77475 nci=0 co=508 ca=1042 + 0!c=12865 g=12866 pq=1/0 qp=1 dt=83113/140000000000000/0 df=288 of=11 ql=0/0 qs=N... kt=0/O ktl=944 b=10 ci=60709 nci=0 co=748 ca=871 + 1 c=14407 g=14408 pq=1/0 qp=0 dt=100679/140000000000000/0 df=378 of=7 ql=0/119 qs=NRW. kt=0/W ktl=9b6 b=10 ci=109740 nci=0 co=589 ca=485 + 2 c=14407 g=14408 pq=1/0 qp=0 dt=105486/0/0 df=90 of=9 ql=0/89 qs=NRW. kt=0/W ktl=c0c b=10 ci=83113 nci=0 co=533 ca=490 + 3 c=14407 g=14408 pq=1/0 qp=0 dt=107138/0/0 df=142 of=8 ql=0/188 qs=NRW. kt=0/W ktl=b96 b=10 ci=121114 nci=0 co=426 ca=290 + 4 c=14405 g=14406 pq=1/0 qp=1 dt=50238/0/0 df=706 of=7 ql=0/0 qs=.... kt=0/W ktl=812 b=10 ci=34929 nci=0 co=643 ca=114 + 5!c=14168 g=14169 pq=1/0 qp=0 dt=45465/140000000000000/0 df=161 of=11 ql=0/0 qs=N... kt=0/O ktl=b4d b=10 ci=47712 nci=0 co=677 ca=722 + 6 c=14404 g=14405 pq=1/0 qp=0 dt=59454/0/0 df=94 of=6 ql=0/0 qs=.... kt=0/W ktl=e57 b=10 ci=55597 nci=0 co=701 ca=811 + 7 c=14407 g=14408 pq=1/0 qp=1 dt=68850/0/0 df=31 of=8 ql=0/0 qs=.... kt=0/W ktl=14bd b=10 ci=77475 nci=0 co=508 ca=1042 This is similar to the output discussed above, but contains the following additional fields: diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index ed4f5939a452..468228750299 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -331,12 +331,13 @@ static inline void rcu_init_nohz(void) extern struct srcu_struct tasks_rcu_exit_srcu; #define rcu_note_voluntary_context_switch(t) \ do { \ + rcu_all_qs(); \ if (ACCESS_ONCE((t)->rcu_tasks_holdout)) \ ACCESS_ONCE((t)->rcu_tasks_holdout) = false; \ } while (0) #else /* #ifdef CONFIG_TASKS_RCU */ #define TASKS_RCU(x) do { } while (0) -#define rcu_note_voluntary_context_switch(t) do { } while (0) +#define rcu_note_voluntary_context_switch(t) rcu_all_qs() #endif /* #else #ifdef CONFIG_TASKS_RCU */ /** diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 0e5366200154..fabd3fad8516 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -154,7 +154,10 @@ static inline bool rcu_is_watching(void) return true; } - #endif /* #else defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ +static inline void rcu_all_qs(void) +{ +} + #endif /* __LINUX_RCUTINY_H */ diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 52953790dcca..ddba927f7316 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -97,4 +97,6 @@ extern int rcu_scheduler_active __read_mostly; bool rcu_is_watching(void); +void rcu_all_qs(void); + #endif /* __LINUX_RCUTREE_H */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 75ce12316b4c..cb00e038c2f2 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -219,6 +219,9 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ }; +DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); +EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); + /* * Let the RCU core know that this CPU has gone through the scheduler, * which is a quiescent state. This is called when the need for a @@ -288,6 +291,22 @@ void rcu_note_context_switch(void) } EXPORT_SYMBOL_GPL(rcu_note_context_switch); +/* + * Register a quiescent state for all RCU flavors.
If there is an + * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight + * dyntick-idle quiescent state visible to other CPUs (but only for those + * RCU flavors in desperate need of a quiescent state, which will normally + * be none of them). Either way, do a lightweight quiescent state for + * all RCU flavors. + */ +void rcu_all_qs(void) +{ + if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) + rcu_momentary_dyntick_idle(); + this_cpu_inc(rcu_qs_ctr); +} +EXPORT_SYMBOL_GPL(rcu_all_qs); + static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ static long qhimark = 10000; /* If this many pending, ignore blimit. */ static long qlowmark = 100; /* Once only this many pending, use blimit. */ @@ -1609,6 +1628,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, rdp->gpnum = rnp->gpnum; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); rdp->passed_quiesce = 0; + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); zero_cpu_stall_ticks(rdp); ACCESS_ONCE(rdp->gpwrap) = false; @@ -2075,8 +2095,10 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || - rnp->completed == rnp->gpnum || rdp->gpwrap) { + if ((rdp->passed_quiesce == 0 && + rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) || + rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum || + rdp->gpwrap) { /* * The grace period in which this quiescent state was @@ -2085,6 +2107,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) * within the current grace period. */ rdp->passed_quiesce = 0; /* need qs for new gp. */ + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } @@ -2129,7 +2152,8 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) * Was there a quiescent state since the beginning of the grace * period? If no, then exit and wait for the next call. */ - if (!rdp->passed_quiesce) + if (!rdp->passed_quiesce && + rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) return; /* @@ -3174,9 +3198,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) /* Is the RCU core waiting for a quiescent state from this CPU? */ if (rcu_scheduler_fully_active && - rdp->qs_pending && !rdp->passed_quiesce) { + rdp->qs_pending && !rdp->passed_quiesce && + rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { rdp->n_rp_qs_pending++; - } else if (rdp->qs_pending && rdp->passed_quiesce) { + } else if (rdp->qs_pending && + (rdp->passed_quiesce || + rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) { rdp->n_rp_report_qs++; return 1; } @@ -3510,6 +3537,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->gpnum = rnp->completed; rdp->completed = rnp->completed; rdp->passed_quiesce = 0; + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); rdp->qs_pending = 0; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 7472ff388d55..1e7f8b05714e 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -257,6 +257,8 @@ struct rcu_data { /* in order to detect GP end. */ unsigned long gpnum; /* Highest gp number that this CPU */ /* is aware of having started. */ + unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */ + /* for rcu_all_qs() invocations.
*/ bool passed_quiesce; /* User-mode/idle loop etc. */ bool qs_pending; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 5cdc62e1beeb..fbb6240509ea 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -46,6 +46,8 @@ #define RCU_TREE_NONCORE #include "tree.h" +DECLARE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); + static int r_open(struct inode *inode, struct file *file, const struct seq_operations *op) { @@ -115,11 +117,13 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) if (!rdp->beenonline) return; - seq_printf(m, "%3d%cc=%ld g=%ld pq=%d qp=%d", + seq_printf(m, "%3d%cc=%ld g=%ld pq=%d/%d qp=%d", rdp->cpu, cpu_is_offline(rdp->cpu) ? '!' : ' ', ulong2long(rdp->completed), ulong2long(rdp->gpnum), - rdp->passed_quiesce, rdp->qs_pending); + rdp->passed_quiesce, + rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu), + rdp->qs_pending); seq_printf(m, " dt=%d/%llx/%d df=%lu", atomic_read(&rdp->dynticks->dynticks), rdp->dynticks->dynticks_nesting, -- cgit v1.2.3 From 076c3b8e03e2737659a89660bb8e54e13587d974 Mon Sep 17 00:00:00 2001 From: James Ban Date: Fri, 16 Jan 2015 12:13:27 +0900 Subject: regulator: da9211: fix unmatched of_node Fix an unmatched of_node: point each regulator's config.of_node at its own DT match instead of the shared parent device node. Signed-off-by: James Ban Signed-off-by: Mark Brown --- drivers/regulator/da9211-regulator.c | 4 ++-- include/linux/regulator/da9211.h | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/regulator/da9211-regulator.c b/drivers/regulator/da9211-regulator.c index c78d2106d6cb..8e6957c63a69 100644 --- a/drivers/regulator/da9211-regulator.c +++ b/drivers/regulator/da9211-regulator.c @@ -276,7 +276,7 @@ static struct da9211_pdata *da9211_parse_regulators_dt( continue; pdata->init_data[n] = da9211_matches[i].init_data; - + pdata->reg_node[n] = da9211_matches[i].of_node; n++; } @@ -364,7 +364,7 @@ static int da9211_regulator_init(struct da9211 *chip) config.dev = chip->dev; config.driver_data = chip; config.regmap = chip->regmap; - config.of_node = chip->dev->of_node; + config.of_node = chip->pdata->reg_node[i]; chip->rdev[i] = devm_regulator_register(chip->dev, &da9211_regulators[i], &config); diff --git a/include/linux/regulator/da9211.h b/include/linux/regulator/da9211.h index 5479394fefce..d1d9d3849bdb 100644 --- a/include/linux/regulator/da9211.h +++ b/include/linux/regulator/da9211.h @@ -32,6 +32,7 @@ struct da9211_pdata { * 2 : 2 phase 2 buck */ int num_buck; + struct device_node *reg_node[DA9211_MAX_REGULATORS]; struct regulator_init_data *init_data[DA9211_MAX_REGULATORS]; }; #endif -- cgit v1.2.3 From ba0513b5b8ffbcb0cc89e2f172c0bcb70497ba2e Mon Sep 17 00:00:00 2001 From: Mario Smarduch Date: Thu, 15 Jan 2015 15:58:53 -0800 Subject: KVM: Add generic support for dirty page logging kvm_get_dirty_log() provides generic handling of the dirty bitmap, currently reused by several architectures. Building on that, we introduce kvm_get_dirty_log_protect(), which also write protects the reported pages so that future write access marks them dirty again before the next KVM_GET_DIRTY_LOG ioctl call from user space.
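For orientation, a minimal sketch of the user-space polling side this helper serves is shown below. KVM_GET_DIRTY_LOG and struct kvm_dirty_log are the existing KVM UAPI; the slot id, fd plumbing, and bitmap sizing rule are illustrative assumptions, not part of this patch.

	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* Fetch the dirty bitmap of one memslot. @bitmap must hold one bit
	 * per page of the slot (assumed here to be rounded up to a multiple
	 * of sizeof(long)). */
	static int fetch_dirty_log(int vm_fd, __u32 slot, unsigned long *bitmap)
	{
		struct kvm_dirty_log log;

		memset(&log, 0, sizeof(log));
		log.slot = slot;
		log.dirty_bitmap = bitmap;

		/* With KVM_GENERIC_DIRTYLOG_READ_PROTECT, pages reported
		 * dirty here are write protected again, so the next guest
		 * write faults and re-marks them for the following call. */
		return ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
	}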
Reviewed-by: Christoffer Dall Signed-off-by: Mario Smarduch --- include/linux/kvm_host.h | 9 ++++++ virt/kvm/Kconfig | 6 ++++ virt/kvm/kvm_main.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 95 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 26f106022c88..3b934cc94cc8 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -611,6 +611,15 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext); int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log, int *is_dirty); + +int kvm_get_dirty_log_protect(struct kvm *kvm, + struct kvm_dirty_log *log, bool *is_dirty); + +void kvm_arch_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, + unsigned long mask); + int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log); diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 3796a2132a06..314950c51d9f 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -40,3 +40,9 @@ config KVM_VFIO config HAVE_KVM_ARCH_TLB_FLUSH_ALL bool + +config HAVE_KVM_ARCH_DIRTY_LOG_PROTECT + bool + +config KVM_GENERIC_DIRTYLOG_READ_PROTECT + bool diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index d03bd2255801..246cf291c6fd 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -995,6 +995,86 @@ out: } EXPORT_SYMBOL_GPL(kvm_get_dirty_log); +#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT +/** + * kvm_get_dirty_log_protect - get a snapshot of dirty pages, and if any pages + * are dirty write protect them for next write. + * @kvm: pointer to kvm instance + * @log: slot id and address to which we copy the log + * @is_dirty: flag set if any page is dirty + * + * We need to keep it in mind that VCPU threads can write to the bitmap + * concurrently. So, to avoid losing track of dirty pages we keep the + * following order: + * + * 1. Take a snapshot of the bit and clear it if needed. + * 2. Write protect the corresponding page. + * 3. Copy the snapshot to the userspace. + * 4. Upon return caller flushes TLB's if needed. + * + * Between 2 and 4, the guest may write to the page using the remaining TLB + * entry. This is not a problem because the page is reported dirty using + * the snapshot taken before and step 4 ensures that writes done after + * exiting to userspace will be logged for the next call. 
+ * + */ +int kvm_get_dirty_log_protect(struct kvm *kvm, + struct kvm_dirty_log *log, bool *is_dirty) +{ + struct kvm_memory_slot *memslot; + int r, i; + unsigned long n; + unsigned long *dirty_bitmap; + unsigned long *dirty_bitmap_buffer; + + r = -EINVAL; + if (log->slot >= KVM_USER_MEM_SLOTS) + goto out; + + memslot = id_to_memslot(kvm->memslots, log->slot); + + dirty_bitmap = memslot->dirty_bitmap; + r = -ENOENT; + if (!dirty_bitmap) + goto out; + + n = kvm_dirty_bitmap_bytes(memslot); + + dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long); + memset(dirty_bitmap_buffer, 0, n); + + spin_lock(&kvm->mmu_lock); + *is_dirty = false; + for (i = 0; i < n / sizeof(long); i++) { + unsigned long mask; + gfn_t offset; + + if (!dirty_bitmap[i]) + continue; + + *is_dirty = true; + + mask = xchg(&dirty_bitmap[i], 0); + dirty_bitmap_buffer[i] = mask; + + offset = i * BITS_PER_LONG; + kvm_arch_mmu_write_protect_pt_masked(kvm, memslot, offset, + mask); + } + + spin_unlock(&kvm->mmu_lock); + + r = -EFAULT; + if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) + goto out; + + r = 0; +out: + return r; +} +EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect); +#endif + bool kvm_largepages_enabled(void) { return largepages_enabled; -- cgit v1.2.3 From 67c2b9cb30f561325b010e046b7bbe2a327e69a0 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 18 Nov 2014 12:18:18 +0100 Subject: ARM: 8207/1: amba: Use inlines instead of macros for amba_pclk_enable/disable Replace the amba_pclk_enable and amba_pclk_disable macros with static inline functions and remove checks for IS_ERR. The amba bus clock won't be ERR because probe would fail before the use of these functions. Signed-off-by: Krzysztof Kozlowski Signed-off-by: Russell King --- include/linux/amba/bus.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h index 2afc618b15ce..0ab5f8e0dea2 100644 --- a/include/linux/amba/bus.h +++ b/include/linux/amba/bus.h @@ -92,11 +92,15 @@ struct amba_device *amba_find_device(const char *, struct device *, unsigned int int amba_request_regions(struct amba_device *, const char *); void amba_release_regions(struct amba_device *); -#define amba_pclk_enable(d) \ - (IS_ERR((d)->pclk) ? 0 : clk_enable((d)->pclk)) +static inline int amba_pclk_enable(struct amba_device *dev) +{ + return clk_enable(dev->pclk); +} -#define amba_pclk_disable(d) \ - do { if (!IS_ERR((d)->pclk)) clk_disable((d)->pclk); } while (0) +static inline void amba_pclk_disable(struct amba_device *dev) +{ + clk_disable(dev->pclk); +} static inline int amba_pclk_prepare(struct amba_device *dev) { -- cgit v1.2.3 From 78e1f974dd351ac82d978e3aac2d27422013f914 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Sun, 14 Dec 2014 02:37:09 +0200 Subject: iommu/ipmmu-vmsa: Remove platform data support No board file instantiates the IPMMU using platform data. Now that we have DT support, get rid of platform data. 
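For reference, the kind of board-file wiring this removes looked roughly like the sketch below; the device name and uTLB index are made-up examples, not taken from any real board file.

	#include <linux/platform_data/ipmmu-vmsa.h>	/* header deleted by this patch */

	static const struct ipmmu_vmsa_master board_ipmmu_masters[] = {
		/* map a bus master's dev_name() to its micro-TLB index */
		{ .name = "e6700020.dma-controller", .utlb = 0 },
	};

	static const struct ipmmu_vmsa_platform_data board_ipmmu_pdata = {
		.masters	= board_ipmmu_masters,
		.num_masters	= ARRAY_SIZE(board_ipmmu_masters),
	};

With DT, the same association is expressed through "iommus" phandles on the master nodes, which ipmmu_find_utlbs() parses via of_count_phandle_with_args().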
Signed-off-by: Laurent Pinchart --- drivers/iommu/ipmmu-vmsa.c | 24 ------------------------ include/linux/platform_data/ipmmu-vmsa.h | 24 ------------------------ 2 files changed, 48 deletions(-) delete mode 100644 include/linux/platform_data/ipmmu-vmsa.h (limited to 'include/linux') diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c index 5d080cf11ba5..791c3daec7c0 100644 --- a/drivers/iommu/ipmmu-vmsa.c +++ b/drivers/iommu/ipmmu-vmsa.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include @@ -30,7 +29,6 @@ struct ipmmu_vmsa_device { void __iomem *base; struct list_head list; - const struct ipmmu_vmsa_platform_data *pdata; unsigned int num_utlbs; struct dma_iommu_mapping *mapping; @@ -1015,27 +1013,6 @@ static int ipmmu_find_utlbs(struct ipmmu_vmsa_device *mmu, struct device *dev, unsigned int i; int count; - if (mmu->pdata) { - const struct ipmmu_vmsa_master *master = mmu->pdata->masters; - const char *devname = dev_name(dev); - unsigned int i; - - for (i = 0; i < mmu->pdata->num_masters; ++i, ++master) { - if (strcmp(master->name, devname) == 0) { - utlbs = kmalloc(sizeof(*utlbs), GFP_KERNEL); - if (!utlbs) - return -ENOMEM; - - utlbs[0] = master->utlb; - - *_utlbs = utlbs; - return 1; - } - } - - return -EINVAL; - } - count = of_count_phandle_with_args(dev->of_node, "iommus", "#iommu-cells"); if (count < 0) @@ -1246,7 +1223,6 @@ static int ipmmu_probe(struct platform_device *pdev) } mmu->dev = &pdev->dev; - mmu->pdata = pdev->dev.platform_data; mmu->num_utlbs = 32; /* Map I/O memory and request IRQ. */ diff --git a/include/linux/platform_data/ipmmu-vmsa.h b/include/linux/platform_data/ipmmu-vmsa.h deleted file mode 100644 index 5275b3ac6d37..000000000000 --- a/include/linux/platform_data/ipmmu-vmsa.h +++ /dev/null @@ -1,24 +0,0 @@ -/* - * IPMMU VMSA Platform Data - * - * Copyright (C) 2014 Renesas Electronics Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; version 2 of the License. - */ - -#ifndef __IPMMU_VMSA_H__ -#define __IPMMU_VMSA_H__ - -struct ipmmu_vmsa_master { - const char *name; - unsigned int utlb; -}; - -struct ipmmu_vmsa_platform_data { - const struct ipmmu_vmsa_master *masters; - unsigned int num_masters; -}; - -#endif /* __IPMMU_VMSA_H__ */ -- cgit v1.2.3 From 51e537387990dc1f00752103f314fd135cb94bc6 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 21 Nov 2014 11:24:08 -0700 Subject: PCI: Add flag for devices that don't reset on D3hot->D0 transition Per the PCI Power Management spec r1.2, sec 3.2.4, a device that advertises No_Soft_Reset == 0 in the PMCSR register (reported by lspci as "NoSoftRst-") should perform an internal reset when transitioning from D3hot to D0 via software control. Configuration context is lost and the device requires a full reinitialization sequence. Unfortunately the definition of "internal reset", beyond the application of the configuration context, is largely left to the interpretation of the specific device. Some devices don't seem to perform an "internal reset" even if they report No_Soft_Reset == 0. 
We still need to honor the PCI specification and restore PCI config context in the event that we do a PM reset, so we don't cache and modify the PCI_PM_CTRL_NO_SOFT_RESET bit for the device, but for interfaces where the intention is to reset the device, like pci_reset_function(), we need a mechanism to flag that PM reset (a D3hot->D0 transition) doesn't perform any significant "internal reset" of the device. Signed-off-by: Alex Williamson Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.c | 2 +- include/linux/pci.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index e9d4fd861ba1..422bc0179e90 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -3197,7 +3197,7 @@ static int pci_pm_reset(struct pci_dev *dev, int probe) { u16 csr; - if (!dev->pm_cap) + if (!dev->pm_cap || dev->dev_flags & PCI_DEV_FLAGS_NO_PM_RESET) return -ENOTTY; pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &csr); diff --git a/include/linux/pci.h b/include/linux/pci.h index 44627f1df4ca..7bed32b3fd54 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -177,6 +177,8 @@ enum pci_dev_flags { PCI_DEV_FLAG_PCIE_BRIDGE_ALIAS = (__force pci_dev_flags_t) (1 << 5), /* Do not use bus resets for device */ PCI_DEV_FLAGS_NO_BUS_RESET = (__force pci_dev_flags_t) (1 << 6), + /* Do not use PM reset even if device advertises NoSoftRst- */ + PCI_DEV_FLAGS_NO_PM_RESET = (__force pci_dev_flags_t) (1 << 7), }; enum pci_irq_reroute_variant { -- cgit v1.2.3 From 6dee60f69d48fcef021b4b53b3431797ec440764 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 16 Jan 2015 15:05:54 -0500 Subject: locks: add new struct list_head to struct file_lock ...that we can use to queue file_locks to per-ctx list_heads. Go ahead and convert locks_delete_lock and locks_dispose_list to use it instead of the fl_block list. 
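A minimal sketch of the resulting unlink-then-free pattern, with an illustrative caller around the two converted functions:

	LIST_HEAD(dispose);

	spin_lock(&inode->i_lock);
	/* Unlink under the spinlock; the lock is queued on the private
	 * dispose list through the new fl_list linkage, leaving fl_block
	 * free to keep describing blocked waiters. */
	locks_delete_lock(before, &dispose);
	spin_unlock(&inode->i_lock);

	/* Free outside the spinlock; fl_release_private callbacks may
	 * sleep, so this must not run under i_lock. */
	locks_dispose_list(&dispose);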
Signed-off-by: Jeff Layton Acked-by: Christoph Hellwig --- fs/locks.c | 8 +++++--- include/linux/fs.h | 1 + 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/locks.c b/fs/locks.c index 59e2f905e4ff..bfe5f17401de 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -207,6 +207,7 @@ static struct kmem_cache *filelock_cache __read_mostly; static void locks_init_lock_heads(struct file_lock *fl) { INIT_HLIST_NODE(&fl->fl_link); + INIT_LIST_HEAD(&fl->fl_list); INIT_LIST_HEAD(&fl->fl_block); init_waitqueue_head(&fl->fl_wait); } @@ -243,6 +244,7 @@ EXPORT_SYMBOL_GPL(locks_release_private); void locks_free_lock(struct file_lock *fl) { BUG_ON(waitqueue_active(&fl->fl_wait)); + BUG_ON(!list_empty(&fl->fl_list)); BUG_ON(!list_empty(&fl->fl_block)); BUG_ON(!hlist_unhashed(&fl->fl_link)); @@ -257,8 +259,8 @@ locks_dispose_list(struct list_head *dispose) struct file_lock *fl; while (!list_empty(dispose)) { - fl = list_first_entry(dispose, struct file_lock, fl_block); - list_del_init(&fl->fl_block); + fl = list_first_entry(dispose, struct file_lock, fl_list); + list_del_init(&fl->fl_list); locks_free_lock(fl); } } @@ -691,7 +693,7 @@ static void locks_delete_lock(struct file_lock **thisfl_p, locks_unlink_lock(thisfl_p); if (dispose) - list_add(&fl->fl_block, dispose); + list_add(&fl->fl_list, dispose); else locks_free_lock(fl); } diff --git a/include/linux/fs.h b/include/linux/fs.h index 42efe13077b6..cd6818115162 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -934,6 +934,7 @@ int locks_in_grace(struct net *); */ struct file_lock { struct file_lock *fl_next; /* singly linked list for this inode */ + struct list_head fl_list; /* link into file_lock_context */ struct hlist_node fl_link; /* node in global lists */ struct list_head fl_block; /* circular list of blocked processes */ fl_owner_t fl_owner; -- cgit v1.2.3 From 4a075e39c86490cc0f0c10ac6abe3592d1689463 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 16 Jan 2015 15:05:54 -0500 Subject: locks: add a new struct file_lock_context pointer to struct inode The current scheme of using the i_flock list is really difficult to manage. There is also a legitimate desire for a per-inode spinlock to manage these lists that isn't the i_lock. Start conversion to a new scheme to eventually replace the old i_flock list with a new "file_lock_context" object. We start by adding a new i_flctx to struct inode. For now, it lives in parallel with the i_flock list, but will eventually replace it. The idea is to allocate a structure to sit in that pointer and act as a locus for all things file locking. We allocate a file_lock_context for an inode when the first lock is added to it, and it's only freed when the inode is freed. We use the i_lock to protect the assignment, but afterward it should mostly be accessed locklessly.
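A minimal reader-side sketch of that rule; the helper is illustrative, not part of the patch:

	static bool inode_has_lock_context(const struct inode *inode)
	{
		/* i_flctx starts out NULL and is assigned at most once under
		 * i_lock; once set it stays valid until the inode is freed,
		 * so a plain load suffices for a check like this. */
		return inode->i_flctx != NULL;
	}

Later patches in the series build on this with list_empty_careful() checks on the individual per-type lists.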
Signed-off-by: Jeff Layton Acked-by: Christoph Hellwig --- fs/inode.c | 3 ++- fs/locks.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/fs.h | 11 +++++++++++ 3 files changed, 57 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/inode.c b/fs/inode.c index aa149e7262ac..f30872ade6d7 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -194,7 +194,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) #ifdef CONFIG_FSNOTIFY inode->i_fsnotify_mask = 0; #endif - + inode->i_flctx = NULL; this_cpu_inc(nr_inodes); return 0; @@ -237,6 +237,7 @@ void __destroy_inode(struct inode *inode) BUG_ON(inode_has_buffers(inode)); security_inode_free(inode); fsnotify_inode_delete(inode); + locks_free_lock_context(inode->i_flctx); if (!inode->i_nlink) { WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0); atomic_long_dec(&inode->i_sb->s_remove_count); diff --git a/fs/locks.c b/fs/locks.c index ae1e7cf721d6..526d5fca67c8 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -202,8 +202,49 @@ static DEFINE_HASHTABLE(blocked_hash, BLOCKED_HASH_BITS); */ static DEFINE_SPINLOCK(blocked_lock_lock); +static struct kmem_cache *flctx_cache __read_mostly; static struct kmem_cache *filelock_cache __read_mostly; +static struct file_lock_context * +locks_get_lock_context(struct inode *inode) +{ + struct file_lock_context *new; + + if (likely(inode->i_flctx)) + goto out; + + new = kmem_cache_alloc(flctx_cache, GFP_KERNEL); + if (!new) + goto out; + + INIT_LIST_HEAD(&new->flc_flock); + + /* + * Assign the pointer if it's not already assigned. If it is, then + * free the context we just allocated. + */ + spin_lock(&inode->i_lock); + if (likely(!inode->i_flctx)) { + inode->i_flctx = new; + new = NULL; + } + spin_unlock(&inode->i_lock); + + if (new) + kmem_cache_free(flctx_cache, new); +out: + return inode->i_flctx; +} + +void +locks_free_lock_context(struct file_lock_context *ctx) +{ + if (ctx) { + WARN_ON_ONCE(!list_empty(&ctx->flc_flock)); + kmem_cache_free(flctx_cache, ctx); + } +} + static void locks_init_lock_heads(struct file_lock *fl) { INIT_HLIST_NODE(&fl->fl_link); @@ -2636,6 +2677,9 @@ static int __init filelock_init(void) { int i; + flctx_cache = kmem_cache_create("file_lock_ctx", + sizeof(struct file_lock_context), 0, SLAB_PANIC, NULL); + filelock_cache = kmem_cache_create("file_lock_cache", sizeof(struct file_lock), 0, SLAB_PANIC, NULL); diff --git a/include/linux/fs.h b/include/linux/fs.h index cd6818115162..dec0d38b05de 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -626,6 +626,7 @@ struct inode { #endif const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ struct file_lock *i_flock; + struct file_lock_context *i_flctx; struct address_space i_data; struct list_head i_devices; union { @@ -965,6 +966,10 @@ struct file_lock { } fl_u; }; +struct file_lock_context { + struct list_head flc_flock; +}; + /* The following constant reflects the upper bound of the file/locking space */ #ifndef OFFSET_MAX #define INT_LIMIT(x) (~((x)1 << (sizeof(x)*8 - 1))) @@ -991,6 +996,7 @@ extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg); extern int fcntl_getlease(struct file *filp); /* fs/locks.c */ +void locks_free_lock_context(struct file_lock_context *ctx); void locks_free_lock(struct file_lock *fl); extern void locks_init_lock(struct file_lock *); extern struct file_lock * locks_alloc_lock(void); @@ -1048,6 +1054,11 @@ static inline int fcntl_getlease(struct file *filp) return F_UNLCK; } +static inline void 
+locks_free_lock_context(struct file_lock_context *ctx) +{ +} + static inline void locks_init_lock(struct file_lock *fl) { return; -- cgit v1.2.3 From bd61e0a9c852de2d705b6f1bb2cc54c5774db570 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 16 Jan 2015 15:05:55 -0500 Subject: locks: convert posix locks to file_lock_context Signed-off-by: Jeff Layton Acked-by: Christoph Hellwig --- fs/ceph/locks.c | 58 +++++++++++++--------------- fs/cifs/file.c | 26 +++++-------- fs/lockd/svcsubs.c | 20 ++++++---- fs/locks.c | 108 +++++++++++++++++++++++++++------------------------- fs/nfs/delegation.c | 28 +++++--------- fs/nfs/nfs4state.c | 52 +++++-------------------- fs/nfs/pagelist.c | 8 ++-- fs/nfs/write.c | 30 +++++++-------- fs/nfsd/nfs4state.c | 18 +++++---- fs/read_write.c | 2 +- include/linux/fs.h | 3 +- 11 files changed, 155 insertions(+), 198 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 917656ea8dcf..19beeed83233 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -253,18 +253,15 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) *fcntl_count = 0; *flock_count = 0; - spin_lock(&inode->i_lock); - for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { - if (lock->fl_flags & FL_POSIX) - ++(*fcntl_count); - } - ctx = inode->i_flctx; if (ctx) { + spin_lock(&inode->i_lock); + list_for_each_entry(lock, &ctx->flc_posix, fl_list) + ++(*fcntl_count); list_for_each_entry(lock, &ctx->flc_flock, fl_list) ++(*flock_count); + spin_unlock(&inode->i_lock); } - spin_unlock(&inode->i_lock); dout("counted %d flock locks and %d fcntl locks", *flock_count, *fcntl_count); } @@ -279,7 +276,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode, int num_fcntl_locks, int num_flock_locks) { struct file_lock *lock; - struct file_lock_context *ctx; + struct file_lock_context *ctx = inode->i_flctx; int err = 0; int seen_fcntl = 0; int seen_flock = 0; @@ -288,34 +285,31 @@ int ceph_encode_locks_to_buffer(struct inode *inode, dout("encoding %d flock and %d fcntl locks", num_flock_locks, num_fcntl_locks); + if (!ctx) + return 0; + spin_lock(&inode->i_lock); - for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { - if (lock->fl_flags & FL_POSIX) { - ++seen_fcntl; - if (seen_fcntl > num_fcntl_locks) { - err = -ENOSPC; - goto fail; - } - err = lock_to_ceph_filelock(lock, &flocks[l]); - if (err) - goto fail; - ++l; + list_for_each_entry(lock, &ctx->flc_posix, fl_list) { + ++seen_fcntl; + if (seen_fcntl > num_fcntl_locks) { + err = -ENOSPC; + goto fail; } + err = lock_to_ceph_filelock(lock, &flocks[l]); + if (err) + goto fail; + ++l; } - - ctx = inode->i_flctx; - if (ctx) { - list_for_each_entry(lock, &ctx->flc_flock, fl_list) { - ++seen_flock; - if (seen_flock > num_flock_locks) { - err = -ENOSPC; - goto fail; - } - err = lock_to_ceph_filelock(lock, &flocks[l]); - if (err) - goto fail; - ++l; + list_for_each_entry(lock, &ctx->flc_flock, fl_list) { + ++seen_flock; + if (seen_flock > num_flock_locks) { + err = -ENOSPC; + goto fail; } + err = lock_to_ceph_filelock(lock, &flocks[l]); + if (err) + goto fail; + ++l; } fail: spin_unlock(&inode->i_lock); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 96b7e9b7706d..ea78f6f81ce2 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1109,11 +1109,6 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) return rc; } -/* copied from fs/locks.c with a name change */ -#define cifs_for_each_lock(inode, lockp) \ - for (lockp = &inode->i_flock; *lockp != NULL; \ - lockp =
&(*lockp)->fl_next) - struct lock_to_push { struct list_head llist; __u64 offset; @@ -1128,8 +1123,9 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) { struct inode *inode = cfile->dentry->d_inode; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); - struct file_lock *flock, **before; - unsigned int count = 0, i = 0; + struct file_lock *flock; + struct file_lock_context *flctx = inode->i_flctx; + unsigned int count = 0, i; int rc = 0, xid, type; struct list_head locks_to_send, *el; struct lock_to_push *lck, *tmp; @@ -1137,10 +1133,12 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) xid = get_xid(); + if (!flctx) + goto out; + spin_lock(&inode->i_lock); - cifs_for_each_lock(inode, before) { - if ((*before)->fl_flags & FL_POSIX) - count++; + list_for_each(el, &flctx->flc_posix) { + count++; } spin_unlock(&inode->i_lock); @@ -1151,7 +1149,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) * added to the list while we are holding cinode->lock_sem that * protects locking operations of this inode. */ - for (; i < count; i++) { + for (i = 0; i < count; i++) { lck = kmalloc(sizeof(struct lock_to_push), GFP_KERNEL); if (!lck) { rc = -ENOMEM; @@ -1162,10 +1160,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) el = locks_to_send.next; spin_lock(&inode->i_lock); - cifs_for_each_lock(inode, before) { - flock = *before; - if ((flock->fl_flags & FL_POSIX) == 0) - continue; + list_for_each_entry(flock, &flctx->flc_posix, fl_list) { if (el == &locks_to_send) { /* * The list ended. We don't have enough allocated @@ -1185,7 +1180,6 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) lck->length = length; lck->type = type; lck->offset = flock->fl_start; - el = el->next; } spin_unlock(&inode->i_lock); diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index d12ff4e2dbe7..5300bb53835f 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -164,12 +164,15 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file, { struct inode *inode = nlmsvc_file_inode(file); struct file_lock *fl; + struct file_lock_context *flctx = inode->i_flctx; struct nlm_host *lockhost; + if (!flctx || list_empty_careful(&flctx->flc_posix)) + return 0; again: file->f_locks = 0; spin_lock(&inode->i_lock); - for (fl = inode->i_flock; fl; fl = fl->fl_next) { + list_for_each_entry(fl, &flctx->flc_posix, fl_list) { if (fl->fl_lmops != &nlmsvc_lock_operations) continue; @@ -223,18 +226,21 @@ nlm_file_inuse(struct nlm_file *file) { struct inode *inode = nlmsvc_file_inode(file); struct file_lock *fl; + struct file_lock_context *flctx = inode->i_flctx; if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares) return 1; - spin_lock(&inode->i_lock); - for (fl = inode->i_flock; fl; fl = fl->fl_next) { - if (fl->fl_lmops == &nlmsvc_lock_operations) { - spin_unlock(&inode->i_lock); - return 1; + if (flctx && !list_empty_careful(&flctx->flc_posix)) { + spin_lock(&inode->i_lock); + list_for_each_entry(fl, &flctx->flc_posix, fl_list) { + if (fl->fl_lmops == &nlmsvc_lock_operations) { + spin_unlock(&inode->i_lock); + return 1; + } } + spin_unlock(&inode->i_lock); } - spin_unlock(&inode->i_lock); file->f_locks = 0; return 0; } diff --git a/fs/locks.c b/fs/locks.c index 055df53f19de..e50bb4d9e757 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -157,9 +157,6 @@ static int target_leasetype(struct file_lock *fl) int leases_enable = 1; int lease_break_time = 45; -#define for_each_lock(inode, lockp) \ - for (lockp = &inode->i_flock; *lockp != NULL; lockp = &(*lockp)->fl_next) - /* * The global file_lock_list is only used 
for displaying /proc/locks, so we * keep a list on each CPU, with each list protected by its own spinlock via @@ -218,6 +215,7 @@ locks_get_lock_context(struct inode *inode) goto out; INIT_LIST_HEAD(&new->flc_flock); + INIT_LIST_HEAD(&new->flc_posix); /* * Assign the pointer if it's not already assigned. If it is, then @@ -241,6 +239,7 @@ locks_free_lock_context(struct file_lock_context *ctx) { if (ctx) { WARN_ON_ONCE(!list_empty(&ctx->flc_flock)); + WARN_ON_ONCE(!list_empty(&ctx->flc_posix)); kmem_cache_free(flctx_cache, ctx); } } @@ -809,21 +808,26 @@ void posix_test_lock(struct file *filp, struct file_lock *fl) { struct file_lock *cfl; + struct file_lock_context *ctx; struct inode *inode = file_inode(filp); + ctx = inode->i_flctx; + if (!ctx || list_empty_careful(&ctx->flc_posix)) { + fl->fl_type = F_UNLCK; + return; + } + spin_lock(&inode->i_lock); - for (cfl = file_inode(filp)->i_flock; cfl; cfl = cfl->fl_next) { - if (!IS_POSIX(cfl)) - continue; - if (posix_locks_conflict(fl, cfl)) - break; + list_for_each_entry(cfl, &ctx->flc_posix, fl_list) { + if (posix_locks_conflict(fl, cfl)) { + locks_copy_conflock(fl, cfl); + if (cfl->fl_nspid) + fl->fl_pid = pid_vnr(cfl->fl_nspid); + goto out; + } } - if (cfl) { - locks_copy_conflock(fl, cfl); - if (cfl->fl_nspid) - fl->fl_pid = pid_vnr(cfl->fl_nspid); - } else - fl->fl_type = F_UNLCK; + fl->fl_type = F_UNLCK; +out: spin_unlock(&inode->i_lock); return; } @@ -983,16 +987,20 @@ out: static int __posix_lock_file(struct inode *inode, struct file_lock *request, struct file_lock *conflock) { - struct file_lock *fl; + struct file_lock *fl, *tmp; struct file_lock *new_fl = NULL; struct file_lock *new_fl2 = NULL; struct file_lock *left = NULL; struct file_lock *right = NULL; - struct file_lock **before; + struct file_lock_context *ctx; int error; bool added = false; LIST_HEAD(dispose); + ctx = locks_get_lock_context(inode); + if (!ctx) + return -ENOMEM; + /* * We may need two file_lock structures for this operation, * so we get them in advance to avoid races. @@ -1013,8 +1021,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str * blocker's list of waiters and the global blocked_hash. */ if (request->fl_type != F_UNLCK) { - for_each_lock(inode, before) { - fl = *before; + list_for_each_entry(fl, &ctx->flc_posix, fl_list) { if (!IS_POSIX(fl)) continue; if (!posix_locks_conflict(request, fl)) @@ -1044,29 +1051,25 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str if (request->fl_flags & FL_ACCESS) goto out; - /* - * Find the first old lock with the same owner as the new lock. - */ - - before = &inode->i_flock; - - /* First skip locks owned by other processes. */ - while ((fl = *before) && (!IS_POSIX(fl) || - !posix_same_owner(request, fl))) { - before = &fl->fl_next; + /* Find the first old lock with the same owner as the new lock */ + list_for_each_entry(fl, &ctx->flc_posix, fl_list) { + if (posix_same_owner(request, fl)) + break; } /* Process locks with this owner. */ - while ((fl = *before) && posix_same_owner(request, fl)) { - /* Detect adjacent or overlapping regions (if same lock type) - */ + list_for_each_entry_safe_from(fl, tmp, &ctx->flc_posix, fl_list) { + if (!posix_same_owner(request, fl)) + break; + + /* Detect adjacent or overlapping regions (if same lock type) */ if (request->fl_type == fl->fl_type) { /* In all comparisons of start vs end, use * "start - 1" rather than "end + 1". If end * is OFFSET_MAX, end + 1 will become negative. 
*/ if (fl->fl_end < request->fl_start - 1) - goto next_lock; + continue; /* If the next lock in the list has entirely bigger * addresses than the new one, insert the lock here. */ @@ -1087,18 +1090,17 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str else request->fl_end = fl->fl_end; if (added) { - locks_delete_lock(before, &dispose); + locks_delete_lock_ctx(fl, &dispose); continue; } request = fl; added = true; - } - else { + } else { /* Processing for different lock types is a bit * more complex. */ if (fl->fl_end < request->fl_start) - goto next_lock; + continue; if (fl->fl_start > request->fl_end) break; if (request->fl_type == F_UNLCK) @@ -1117,7 +1119,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str * one (This may happen several times). */ if (added) { - locks_delete_lock(before, &dispose); + locks_delete_lock_ctx(fl, &dispose); continue; } /* @@ -1133,15 +1135,11 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str locks_copy_lock(new_fl, request); request = new_fl; new_fl = NULL; - locks_delete_lock(before, &dispose); - locks_insert_lock(before, request); + locks_insert_lock_ctx(request, &fl->fl_list); + locks_delete_lock_ctx(fl, &dispose); added = true; } } - /* Go on to next lock. - */ - next_lock: - before = &fl->fl_next; } /* @@ -1166,7 +1164,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str goto out; } locks_copy_lock(new_fl, request); - locks_insert_lock(before, new_fl); + locks_insert_lock_ctx(new_fl, &fl->fl_list); new_fl = NULL; } if (right) { @@ -1177,7 +1175,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str left = new_fl2; new_fl2 = NULL; locks_copy_lock(left, right); - locks_insert_lock(before, left); + locks_insert_lock_ctx(left, &fl->fl_list); } right->fl_start = request->fl_end + 1; locks_wake_up_blocks(right); @@ -1257,22 +1255,29 @@ EXPORT_SYMBOL(posix_lock_file_wait); */ int locks_mandatory_locked(struct file *file) { + int ret; struct inode *inode = file_inode(file); + struct file_lock_context *ctx; struct file_lock *fl; + ctx = inode->i_flctx; + if (!ctx || list_empty_careful(&ctx->flc_posix)) + return 0; + /* * Search the lock list for this inode for any POSIX locks. */ spin_lock(&inode->i_lock); - for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { - if (!IS_POSIX(fl)) - continue; + ret = 0; + list_for_each_entry(fl, &ctx->flc_posix, fl_list) { if (fl->fl_owner != current->files && - fl->fl_owner != file) + fl->fl_owner != file) { + ret = -EAGAIN; break; + } } spin_unlock(&inode->i_lock); - return fl ? -EAGAIN : 0; + return ret; } /** @@ -2389,13 +2394,14 @@ out: void locks_remove_posix(struct file *filp, fl_owner_t owner) { struct file_lock lock; + struct file_lock_context *ctx = file_inode(filp)->i_flctx; /* * If there are no locks held on this file, we don't need to call * posix_lock_file(). Another process could be setting a lock on this * file at the same time, but we wouldn't remove that lock anyway. 
*/ - if (!file_inode(filp)->i_flock) + if (!ctx || list_empty(&ctx->flc_posix)) return; lock.fl_type = F_UNLCK; diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 9f9f67b17e2b..3fb1caa3874d 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -85,17 +85,17 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_ { struct inode *inode = state->inode; struct file_lock *fl; - struct file_lock_context *flctx; + struct file_lock_context *flctx = inode->i_flctx; + struct list_head *list; int status = 0; - if (inode->i_flock == NULL && inode->i_flctx == NULL) + if (flctx == NULL) goto out; - /* Protect inode->i_flock using the i_lock */ + list = &flctx->flc_posix; spin_lock(&inode->i_lock); - for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { - if (!(fl->fl_flags & (FL_POSIX))) - continue; +restart: + list_for_each_entry(fl, list, fl_list) { if (nfs_file_open_context(fl->fl_file) != ctx) continue; spin_unlock(&inode->i_lock); @@ -104,19 +104,9 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_ goto out; spin_lock(&inode->i_lock); } - - flctx = inode->i_flctx; - if (flctx) { - list_for_each_entry(fl, &flctx->flc_flock, fl_list) { - if (nfs_file_open_context(fl->fl_file) != ctx) - continue; - spin_unlock(&inode->i_lock); - status = nfs4_lock_delegation_recall(fl, state, - stateid); - if (status < 0) - goto out; - spin_lock(&inode->i_lock); - } + if (list == &flctx->flc_posix) { + list = &flctx->flc_flock; + goto restart; } spin_unlock(&inode->i_lock); out: diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 65c404bf61ae..6084c267f3a0 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1367,53 +1367,18 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_ struct file_lock *fl; int status = 0; struct file_lock_context *flctx = inode->i_flctx; + struct list_head *list; - if (inode->i_flock == NULL && flctx == NULL) + if (flctx == NULL) return 0; + list = &flctx->flc_posix; + /* Guard against delegation returns and new lock/unlock calls */ down_write(&nfsi->rwsem); - /* Protect inode->i_flock using the BKL */ spin_lock(&inode->i_lock); - for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { - if (!(fl->fl_flags & FL_POSIX)) - continue; - if (nfs_file_open_context(fl->fl_file)->state != state) - continue; - spin_unlock(&inode->i_lock); - status = ops->recover_lock(state, fl); - switch (status) { - case 0: - break; - case -ESTALE: - case -NFS4ERR_ADMIN_REVOKED: - case -NFS4ERR_STALE_STATEID: - case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_EXPIRED: - case -NFS4ERR_NO_GRACE: - case -NFS4ERR_STALE_CLIENTID: - case -NFS4ERR_BADSESSION: - case -NFS4ERR_BADSLOT: - case -NFS4ERR_BAD_HIGH_SLOT: - case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: - goto out; - default: - printk(KERN_ERR "NFS: %s: unhandled error %d\n", - __func__, status); - case -ENOMEM: - case -NFS4ERR_DENIED: - case -NFS4ERR_RECLAIM_BAD: - case -NFS4ERR_RECLAIM_CONFLICT: - /* kill_proc(fl->fl_pid, SIGLOST, 1); */ - status = 0; - } - spin_lock(&inode->i_lock); - } - - if (!flctx) - goto out_unlock; - - list_for_each_entry(fl, &flctx->flc_flock, fl_list) { +restart: + list_for_each_entry(fl, list, fl_list) { if (nfs_file_open_context(fl->fl_file)->state != state) continue; spin_unlock(&inode->i_lock); @@ -1445,7 +1410,10 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_ } spin_lock(&inode->i_lock); } -out_unlock: + if (list == &flctx->flc_posix) { + list = &flctx->flc_flock; + goto restart; 
+ } spin_unlock(&inode->i_lock); out: up_write(&nfsi->rwsem); diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index a3b62e15b444..29c7f33c9cf1 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -831,12 +831,10 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev, if (prev) { if (!nfs_match_open_context(req->wb_context, prev->wb_context)) return false; - if (req->wb_context->dentry->d_inode->i_flock != NULL && - !nfs_match_lock_context(req->wb_lock_context, - prev->wb_lock_context)) - return false; flctx = req->wb_context->dentry->d_inode->i_flctx; - if (flctx != NULL && !list_empty_careful(&flctx->flc_flock) && + if (flctx != NULL && + !(list_empty_careful(&flctx->flc_posix) && + list_empty_careful(&flctx->flc_flock)) && !nfs_match_lock_context(req->wb_lock_context, prev->wb_lock_context)) return false; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index e072aeb34195..784c13485b3f 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1091,6 +1091,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page) { struct nfs_open_context *ctx = nfs_file_open_context(file); struct nfs_lock_context *l_ctx; + struct file_lock_context *flctx = file_inode(file)->i_flctx; struct nfs_page *req; int do_flush, status; /* @@ -1109,12 +1110,9 @@ int nfs_flush_incompatible(struct file *file, struct page *page) do_flush = req->wb_page != page || req->wb_context != ctx; /* for now, flush if more than 1 request in page_group */ do_flush |= req->wb_this_page != req; - if (l_ctx && ctx->dentry->d_inode->i_flock != NULL) { - do_flush |= l_ctx->lockowner.l_owner != current->files - || l_ctx->lockowner.l_pid != current->tgid; - } - if (l_ctx && ctx->dentry->d_inode->i_flctx && - !list_empty_careful(&ctx->dentry->d_inode->i_flctx->flc_flock)) { + if (l_ctx && flctx && + !(list_empty_careful(&flctx->flc_posix) && + list_empty_careful(&flctx->flc_flock))) { do_flush |= l_ctx->lockowner.l_owner != current->files || l_ctx->lockowner.l_pid != current->tgid; } @@ -1202,26 +1200,24 @@ static int nfs_can_extend_write(struct file *file, struct page *page, struct ino return 0; if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) return 1; - if (!inode->i_flock && !flctx) + if (!flctx || (list_empty_careful(&flctx->flc_flock) && + list_empty_careful(&flctx->flc_posix))) return 0; /* Check to see if there are whole file write locks */ - spin_lock(&inode->i_lock); ret = 0; - - fl = inode->i_flock; - if (fl && is_whole_file_wrlock(fl)) { - ret = 1; - goto out; - } - - if (!list_empty(&flctx->flc_flock)) { + spin_lock(&inode->i_lock); + if (!list_empty(&flctx->flc_posix)) { + fl = list_first_entry(&flctx->flc_posix, struct file_lock, + fl_list); + if (is_whole_file_wrlock(fl)) + ret = 1; + } else if (!list_empty(&flctx->flc_flock)) { fl = list_first_entry(&flctx->flc_flock, struct file_lock, fl_list); if (fl->fl_type == F_WRLCK) ret = 1; } -out: spin_unlock(&inode->i_lock); return ret; } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index c06a1ba80d73..fad821991369 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -5556,10 +5556,11 @@ out_nfserr: static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) { - struct file_lock **flpp; + struct file_lock *fl; int status = false; struct file *filp = find_any_file(fp); struct inode *inode; + struct file_lock_context *flctx; if (!filp) { /* Any valid lock stateid should have some sort of access */ @@ -5568,15 +5569,18 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) } inode = 
file_inode(filp); + flctx = inode->i_flctx; - spin_lock(&inode->i_lock); - for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) { - if ((*flpp)->fl_owner == (fl_owner_t)lowner) { - status = true; - break; + if (flctx && !list_empty_careful(&flctx->flc_posix)) { + spin_lock(&inode->i_lock); + list_for_each_entry(fl, &flctx->flc_posix, fl_list) { + if (fl->fl_owner == (fl_owner_t)lowner) { + status = true; + break; + } } + spin_unlock(&inode->i_lock); } - spin_unlock(&inode->i_lock); fput(filp); return status; } diff --git a/fs/read_write.c b/fs/read_write.c index c0805c93b6fa..4060691e78f7 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -358,7 +358,7 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t return retval; } - if (unlikely(inode->i_flock && mandatory_lock(inode))) { + if (unlikely(inode->i_flctx && mandatory_lock(inode))) { retval = locks_mandatory_area( read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE, inode, file, pos, count); diff --git a/include/linux/fs.h b/include/linux/fs.h index dec0d38b05de..571f113588e9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -968,6 +968,7 @@ struct file_lock { struct file_lock_context { struct list_head flc_flock; + struct list_head flc_posix; }; /* The following constant reflects the upper bound of the file/locking space */ @@ -1971,7 +1972,7 @@ static inline int locks_verify_truncate(struct inode *inode, struct file *filp, loff_t size) { - if (inode->i_flock && mandatory_lock(inode)) + if (inode->i_flctx && mandatory_lock(inode)) return locks_mandatory_area( FLOCK_VERIFY_WRITE, inode, filp, size < inode->i_size ? size : inode->i_size, -- cgit v1.2.3 From 8634b51f6ca298fb8b07aa4847340764903533ab Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 16 Jan 2015 15:05:55 -0500 Subject: locks: convert lease handling to file_lock_context Signed-off-by: Jeff Layton Acked-by: Christoph Hellwig --- fs/locks.c | 252 +++++++++++++++++++++-------------------------------- include/linux/fs.h | 5 +- 2 files changed, 102 insertions(+), 155 deletions(-) (limited to 'include/linux') diff --git a/fs/locks.c b/fs/locks.c index e50bb4d9e757..d46e70567b99 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -216,6 +216,7 @@ locks_get_lock_context(struct inode *inode) INIT_LIST_HEAD(&new->flc_flock); INIT_LIST_HEAD(&new->flc_posix); + INIT_LIST_HEAD(&new->flc_lease); /* * Assign the pointer if it's not already assigned. If it is, then @@ -240,6 +241,7 @@ locks_free_lock_context(struct file_lock_context *ctx) if (ctx) { WARN_ON_ONCE(!list_empty(&ctx->flc_flock)); WARN_ON_ONCE(!list_empty(&ctx->flc_posix)); + WARN_ON_ONCE(!list_empty(&ctx->flc_lease)); kmem_cache_free(flctx_cache, ctx); } } @@ -677,22 +679,6 @@ static void locks_wake_up_blocks(struct file_lock *blocker) spin_unlock(&blocked_lock_lock); } -/* Insert file lock fl into an inode's lock list at the position indicated - * by pos. At the same time add the lock to the global file lock list. - * - * Must be called with the i_lock held! - */ -static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl) -{ - fl->fl_nspid = get_pid(task_tgid(current)); - - /* insert into file's list */ - fl->fl_next = *pos; - *pos = fl; - - locks_insert_global_locks(fl); -} - static void locks_insert_lock_ctx(struct file_lock *fl, struct list_head *before) { @@ -701,63 +687,28 @@ locks_insert_lock_ctx(struct file_lock *fl, struct list_head *before) locks_insert_global_locks(fl); } -/** - * locks_delete_lock - Delete a lock and then free it. 
- * @thisfl_p: pointer that points to the fl_next field of the previous - * inode->i_flock list entry - * - * Unlink a lock from all lists and free the namespace reference, but don't - * free it yet. Wake up processes that are blocked waiting for this lock and - * notify the FS that the lock has been cleared. - * - * Must be called with the i_lock held! - */ -static void locks_unlink_lock(struct file_lock **thisfl_p) +static void +locks_unlink_lock_ctx(struct file_lock *fl) { - struct file_lock *fl = *thisfl_p; - locks_delete_global_locks(fl); - - *thisfl_p = fl->fl_next; - fl->fl_next = NULL; - + list_del_init(&fl->fl_list); if (fl->fl_nspid) { put_pid(fl->fl_nspid); fl->fl_nspid = NULL; } - locks_wake_up_blocks(fl); } -/* - * Unlink a lock from all lists and free it. - * - * Must be called with i_lock held! - */ -static void locks_delete_lock(struct file_lock **thisfl_p, - struct list_head *dispose) +static void +locks_delete_lock_ctx(struct file_lock *fl, struct list_head *dispose) { - struct file_lock *fl = *thisfl_p; - - locks_unlink_lock(thisfl_p); + locks_unlink_lock_ctx(fl); if (dispose) list_add(&fl->fl_list, dispose); else locks_free_lock(fl); } -static void -locks_delete_lock_ctx(struct file_lock *fl, struct list_head *dispose) -{ - locks_delete_global_locks(fl); - if (fl->fl_nspid) { - put_pid(fl->fl_nspid); - fl->fl_nspid = NULL; - } - locks_wake_up_blocks(fl); - list_move(&fl->fl_list, dispose); -} - /* Determine if lock sys_fl blocks lock caller_fl. Common functionality * checks for shared/exclusive status of overlapping locks. */ @@ -1376,7 +1327,7 @@ int lease_modify(struct file_lock **before, int arg, struct list_head *dispose) printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync); fl->fl_fasync = NULL; } - locks_delete_lock(before, dispose); + locks_delete_lock_ctx(fl, dispose); } return 0; } @@ -1392,20 +1343,17 @@ static bool past_time(unsigned long then) static void time_out_leases(struct inode *inode, struct list_head *dispose) { - struct file_lock **before; - struct file_lock *fl; + struct file_lock_context *ctx = inode->i_flctx; + struct file_lock *fl, *tmp; lockdep_assert_held(&inode->i_lock); - before = &inode->i_flock; - while ((fl = *before) && IS_LEASE(fl) && lease_breaking(fl)) { + list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) { trace_time_out_leases(inode, fl); if (past_time(fl->fl_downgrade_time)) - lease_modify(before, F_RDLCK, dispose); + lease_modify(&fl, F_RDLCK, dispose); if (past_time(fl->fl_break_time)) - lease_modify(before, F_UNLCK, dispose); - if (fl == *before) /* lease_modify may have freed fl */ - before = &fl->fl_next; + lease_modify(&fl, F_UNLCK, dispose); } } @@ -1419,11 +1367,12 @@ static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker) static bool any_leases_conflict(struct inode *inode, struct file_lock *breaker) { + struct file_lock_context *ctx = inode->i_flctx; struct file_lock *fl; lockdep_assert_held(&inode->i_lock); - for (fl = inode->i_flock ; fl && IS_LEASE(fl); fl = fl->fl_next) { + list_for_each_entry(fl, &ctx->flc_lease, fl_list) { if (leases_conflict(fl, breaker)) return true; } @@ -1447,7 +1396,8 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) { int error = 0; struct file_lock *new_fl; - struct file_lock *fl, **before; + struct file_lock_context *ctx = inode->i_flctx; + struct file_lock *fl; unsigned long break_time; int want_write = (mode & O_ACCMODE) != O_RDONLY; LIST_HEAD(dispose); @@ -1457,6 +1407,12 @@ int __break_lease(struct inode 
*inode, unsigned int mode, unsigned int type) return PTR_ERR(new_fl); new_fl->fl_flags = type; + /* typically we will check that ctx is non-NULL before calling */ + if (!ctx) { + WARN_ON_ONCE(1); + return error; + } + spin_lock(&inode->i_lock); time_out_leases(inode, &dispose); @@ -1471,9 +1427,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) break_time++; /* so that 0 means no break time */ } - for (before = &inode->i_flock; - ((fl = *before) != NULL) && IS_LEASE(fl); - before = &fl->fl_next) { + list_for_each_entry(fl, &ctx->flc_lease, fl_list) { if (!leases_conflict(fl, new_fl)) continue; if (want_write) { @@ -1482,17 +1436,16 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) fl->fl_flags |= FL_UNLOCK_PENDING; fl->fl_break_time = break_time; } else { - if (lease_breaking(inode->i_flock)) + if (lease_breaking(fl)) continue; fl->fl_flags |= FL_DOWNGRADE_PENDING; fl->fl_downgrade_time = break_time; } if (fl->fl_lmops->lm_break(fl)) - locks_delete_lock(before, &dispose); + locks_delete_lock_ctx(fl, &dispose); } - fl = inode->i_flock; - if (!fl || !IS_LEASE(fl)) + if (list_empty(&ctx->flc_lease)) goto out; if (mode & O_NONBLOCK) { @@ -1502,12 +1455,13 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) } restart: - break_time = inode->i_flock->fl_break_time; + fl = list_first_entry(&ctx->flc_lease, struct file_lock, fl_list); + break_time = fl->fl_break_time; if (break_time != 0) break_time -= jiffies; if (break_time == 0) break_time++; - locks_insert_block(inode->i_flock, new_fl); + locks_insert_block(fl, new_fl); trace_break_lease_block(inode, new_fl); spin_unlock(&inode->i_lock); locks_dispose_list(&dispose); @@ -1525,10 +1479,8 @@ restart: time_out_leases(inode, &dispose); if (any_leases_conflict(inode, new_fl)) goto restart; - error = 0; } - out: spin_unlock(&inode->i_lock); locks_dispose_list(&dispose); @@ -1550,13 +1502,17 @@ EXPORT_SYMBOL(__break_lease); void lease_get_mtime(struct inode *inode, struct timespec *time) { bool has_lease = false; - struct file_lock *flock; + struct file_lock_context *ctx = inode->i_flctx; + struct file_lock *fl; - if (inode->i_flock) { + if (ctx && !list_empty_careful(&ctx->flc_lease)) { spin_lock(&inode->i_lock); - flock = inode->i_flock; - if (flock && IS_LEASE(flock) && (flock->fl_type == F_WRLCK)) - has_lease = true; + if (!list_empty(&ctx->flc_lease)) { + fl = list_first_entry(&ctx->flc_lease, + struct file_lock, fl_list); + if (fl->fl_type == F_WRLCK) + has_lease = true; + } spin_unlock(&inode->i_lock); } @@ -1595,20 +1551,22 @@ int fcntl_getlease(struct file *filp) { struct file_lock *fl; struct inode *inode = file_inode(filp); + struct file_lock_context *ctx = inode->i_flctx; int type = F_UNLCK; LIST_HEAD(dispose); - spin_lock(&inode->i_lock); - time_out_leases(file_inode(filp), &dispose); - for (fl = file_inode(filp)->i_flock; fl && IS_LEASE(fl); - fl = fl->fl_next) { - if (fl->fl_file == filp) { + if (ctx && !list_empty_careful(&ctx->flc_lease)) { + spin_lock(&inode->i_lock); + time_out_leases(file_inode(filp), &dispose); + list_for_each_entry(fl, &ctx->flc_lease, fl_list) { + if (fl->fl_file != filp) + continue; type = target_leasetype(fl); break; } + spin_unlock(&inode->i_lock); + locks_dispose_list(&dispose); } - spin_unlock(&inode->i_lock); - locks_dispose_list(&dispose); return type; } @@ -1641,9 +1599,10 @@ check_conflicting_open(const struct dentry *dentry, const long arg) static int generic_add_lease(struct file *filp, long arg, struct file_lock 
**flp, void **priv) { - struct file_lock *fl, **before, **my_before = NULL, *lease; + struct file_lock *fl, *my_fl = NULL, *lease; struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; + struct file_lock_context *ctx; bool is_deleg = (*flp)->fl_flags & FL_DELEG; int error; LIST_HEAD(dispose); @@ -1651,6 +1610,10 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr lease = *flp; trace_generic_add_lease(inode, lease); + ctx = locks_get_lock_context(inode); + if (!ctx) + return -ENOMEM; + /* * In the delegation case we need mutual exclusion with * a number of operations that take the i_mutex. We trylock @@ -1684,13 +1647,12 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr * except for this filp. */ error = -EAGAIN; - for (before = &inode->i_flock; - ((fl = *before) != NULL) && IS_LEASE(fl); - before = &fl->fl_next) { + list_for_each_entry(fl, &ctx->flc_lease, fl_list) { if (fl->fl_file == filp) { - my_before = before; + my_fl = fl; continue; } + /* * No exclusive leases if someone else has a lease on * this file: @@ -1705,9 +1667,8 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr goto out; } - if (my_before != NULL) { - lease = *my_before; - error = lease->fl_lmops->lm_change(my_before, arg, &dispose); + if (my_fl != NULL) { + error = lease->fl_lmops->lm_change(&my_fl, arg, &dispose); if (error) goto out; goto out_setup; @@ -1717,7 +1678,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr if (!leases_enable) goto out; - locks_insert_lock(before, lease); + locks_insert_lock_ctx(lease, &ctx->flc_lease); /* * The check in break_lease() is lockless. It's possible for another * open to race in after we did the earlier check for a conflicting @@ -1729,8 +1690,10 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr */ smp_mb(); error = check_conflicting_open(dentry, arg); - if (error) - goto out_unlink; + if (error) { + locks_unlink_lock_ctx(lease); + goto out; + } out_setup: if (lease->fl_lmops->lm_setup) @@ -1740,33 +1703,35 @@ out: locks_dispose_list(&dispose); if (is_deleg) mutex_unlock(&inode->i_mutex); - if (!error && !my_before) + if (!error && !my_fl) *flp = NULL; return error; -out_unlink: - locks_unlink_lock(before); - goto out; } static int generic_delete_lease(struct file *filp) { int error = -EAGAIN; - struct file_lock *fl, **before; + struct file_lock *fl, *victim = NULL; struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; + struct file_lock_context *ctx = inode->i_flctx; LIST_HEAD(dispose); + if (!ctx) { + trace_generic_delete_lease(inode, NULL); + return error; + } + spin_lock(&inode->i_lock); - time_out_leases(inode, &dispose); - for (before = &inode->i_flock; - ((fl = *before) != NULL) && IS_LEASE(fl); - before = &fl->fl_next) { - if (fl->fl_file == filp) + list_for_each_entry(fl, &ctx->flc_lease, fl_list) { + if (fl->fl_file == filp) { + victim = fl; break; + } } trace_generic_delete_lease(inode, fl); - if (fl && IS_LEASE(fl)) - error = fl->fl_lmops->lm_change(before, F_UNLCK, &dispose); + if (victim) + error = fl->fl_lmops->lm_change(&victim, F_UNLCK, &dispose); spin_unlock(&inode->i_lock); locks_dispose_list(&dispose); return error; @@ -2447,56 +2412,37 @@ locks_remove_flock(struct file *filp) fl.fl_ops->fl_release_private(&fl); } +static void +locks_remove_lease(struct file *filp) +{ + struct inode *inode = file_inode(filp); + struct file_lock_context 
*ctx = inode->i_flctx; + struct file_lock *fl, *tmp; + LIST_HEAD(dispose); + + if (!ctx || list_empty(&ctx->flc_lease)) + return; + + spin_lock(&inode->i_lock); + list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) + lease_modify(&fl, F_UNLCK, &dispose); + spin_unlock(&inode->i_lock); + locks_dispose_list(&dispose); +} + /* * This function is called on the last close of an open file. */ void locks_remove_file(struct file *filp) { - struct inode * inode = file_inode(filp); - struct file_lock *fl; - struct file_lock **before; - LIST_HEAD(dispose); - /* remove any OFD locks */ locks_remove_posix(filp, filp); /* remove flock locks */ locks_remove_flock(filp); - if (!inode->i_flock) - return; - - spin_lock(&inode->i_lock); - before = &inode->i_flock; - - while ((fl = *before) != NULL) { - if (fl->fl_file == filp) { - if (IS_LEASE(fl)) { - lease_modify(before, F_UNLCK, &dispose); - continue; - } - - /* - * There's a leftover lock on the list of a type that - * we didn't expect to see. Most likely a classic - * POSIX lock that ended up not getting released - * properly, or that raced onto the list somehow. Log - * some info about it and then just remove it from - * the list. - */ - WARN(1, "leftover lock: dev=%u:%u ino=%lu type=%hhd flags=0x%x start=%lld end=%lld\n", - MAJOR(inode->i_sb->s_dev), - MINOR(inode->i_sb->s_dev), inode->i_ino, - fl->fl_type, fl->fl_flags, - fl->fl_start, fl->fl_end); - - locks_delete_lock(before, &dispose); - continue; - } - before = &fl->fl_next; - } - spin_unlock(&inode->i_lock); - locks_dispose_list(&dispose); + /* remove any leases */ + locks_remove_lease(filp); } /** diff --git a/include/linux/fs.h b/include/linux/fs.h index 571f113588e9..2ddec3cf81b9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -969,6 +969,7 @@ struct file_lock { struct file_lock_context { struct list_head flc_flock; struct list_head flc_posix; + struct list_head flc_lease; }; /* The following constant reflects the upper bound of the file/locking space */ @@ -1990,7 +1991,7 @@ static inline int break_lease(struct inode *inode, unsigned int mode) * end up racing with tasks trying to set a new lease on this file. */ smp_mb(); - if (inode->i_flock) + if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) return __break_lease(inode, mode, FL_LEASE); return 0; } @@ -2003,7 +2004,7 @@ static inline int break_deleg(struct inode *inode, unsigned int mode) * end up racing with tasks trying to set a new lease on this file. */ smp_mb(); - if (inode->i_flock) + if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) return __break_lease(inode, mode, FL_DELEG); return 0; } -- cgit v1.2.3 From a7231a97467d5a0c36f82f581c76c12c034e4b80 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 16 Jan 2015 15:05:56 -0500 Subject: locks: remove i_flock field from struct inode Nothing uses it anymore. Also add a forward declaration for struct file_lock to silence some compiler warnings that the removal triggers. 
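As an illustrative sketch (not part of the patch): with i_flock gone, the question "does this inode have any leases?" is answered through i_flctx, exactly as the break_lease()/break_deleg() hunks above do. The helper name inode_has_lease() below is made up for illustration; the kernel open-codes this test.

	static inline bool inode_has_lease(const struct inode *inode)
	{
		/* hypothetical helper; mirrors the open-coded break_lease() test */
		return inode->i_flctx &&
		       !list_empty_careful(&inode->i_flctx->flc_lease);
	}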
Signed-off-by: Jeff Layton Acked-by: Christoph Hellwig --- include/linux/fs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 2ddec3cf81b9..ce0873af0b97 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -625,7 +625,6 @@ struct inode { atomic_t i_readcount; /* struct files open RO */ #endif const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ - struct file_lock *i_flock; struct file_lock_context *i_flctx; struct address_space i_data; struct list_head i_devices; @@ -886,6 +885,8 @@ static inline struct file *get_file(struct file *f) /* legacy typedef, should eventually be removed */ typedef void *fl_owner_t; +struct file_lock; + struct file_lock_operations { void (*fl_copy_lock)(struct file_lock *, struct file_lock *); void (*fl_release_private)(struct file_lock *); -- cgit v1.2.3 From 6109c85037e53443f29fd39c0de69f578a1cf285 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 16 Jan 2015 15:05:57 -0500 Subject: locks: add a dedicated spinlock to protect i_flctx lists We can now add a dedicated spinlock without expanding struct inode. Change to using that to protect the various i_flctx lists. Signed-off-by: Jeff Layton Acked-by: Christoph Hellwig --- fs/ceph/locks.c | 8 ++--- fs/cifs/file.c | 8 ++--- fs/lockd/svcsubs.c | 12 ++++---- fs/locks.c | 87 +++++++++++++++++++++++++++-------------------------- fs/nfs/delegation.c | 8 ++--- fs/nfs/nfs4state.c | 8 ++--- fs/nfs/write.c | 4 +-- fs/nfsd/nfs4state.c | 4 +-- include/linux/fs.h | 1 + 9 files changed, 71 insertions(+), 69 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 19beeed83233..0303da8e3233 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -255,12 +255,12 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) ctx = inode->i_flctx; if (ctx) { - spin_lock(&inode->i_lock); + spin_lock(&ctx->flc_lock); list_for_each_entry(lock, &ctx->flc_posix, fl_list) ++(*fcntl_count); list_for_each_entry(lock, &ctx->flc_flock, fl_list) ++(*flock_count); - spin_unlock(&inode->i_lock); + spin_unlock(&ctx->flc_lock); } dout("counted %d flock locks and %d fcntl locks", *flock_count, *fcntl_count); @@ -288,7 +288,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode, if (!ctx) return 0; - spin_lock(&inode->i_lock); + spin_lock(&ctx->flc_lock); list_for_each_entry(lock, &ctx->flc_flock, fl_list) { ++seen_fcntl; if (seen_fcntl > num_fcntl_locks) { @@ -312,7 +312,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode, ++l; } fail: - spin_unlock(&inode->i_lock); + spin_unlock(&ctx->flc_lock); return err; } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index ea78f6f81ce2..b65166eb111e 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1136,11 +1136,11 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) if (!flctx) goto out; - spin_lock(&inode->i_lock); + spin_lock(&flctx->flc_lock); list_for_each(el, &flctx->flc_posix) { count++; } - spin_unlock(&inode->i_lock); + spin_unlock(&flctx->flc_lock); INIT_LIST_HEAD(&locks_to_send); @@ -1159,7 +1159,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) } el = locks_to_send.next; - spin_lock(&inode->i_lock); + spin_lock(&flctx->flc_lock); list_for_each_entry(flock, &flctx->flc_posix, fl_list) { if (el == &locks_to_send) { /* @@ -1181,7 +1181,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) lck->type = type; lck->offset = flock->fl_start; } - spin_unlock(&inode->i_lock); + spin_unlock(&flctx->flc_lock); 
list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) { int stored_rc; diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 5300bb53835f..665ef5a05183 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -171,7 +171,7 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file, return 0; again: file->f_locks = 0; - spin_lock(&inode->i_lock); + spin_lock(&flctx->flc_lock); list_for_each_entry(fl, &flctx->flc_posix, fl_list) { if (fl->fl_lmops != &nlmsvc_lock_operations) continue; @@ -183,7 +183,7 @@ again: if (match(lockhost, host)) { struct file_lock lock = *fl; - spin_unlock(&inode->i_lock); + spin_unlock(&flctx->flc_lock); lock.fl_type = F_UNLCK; lock.fl_start = 0; lock.fl_end = OFFSET_MAX; @@ -195,7 +195,7 @@ again: goto again; } } - spin_unlock(&inode->i_lock); + spin_unlock(&flctx->flc_lock); return 0; } @@ -232,14 +232,14 @@ nlm_file_inuse(struct nlm_file *file) return 1; if (flctx && !list_empty_careful(&flctx->flc_posix)) { - spin_lock(&inode->i_lock); + spin_lock(&flctx->flc_lock); list_for_each_entry(fl, &flctx->flc_posix, fl_list) { if (fl->fl_lmops == &nlmsvc_lock_operations) { - spin_unlock(&inode->i_lock); + spin_unlock(&flctx->flc_lock); return 1; } } - spin_unlock(&inode->i_lock); + spin_unlock(&flctx->flc_lock); } file->f_locks = 0; return 0; diff --git a/fs/locks.c b/fs/locks.c index d46e70567b99..a268d959ccd6 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -161,7 +161,7 @@ int lease_break_time = 45; * The global file_lock_list is only used for displaying /proc/locks, so we * keep a list on each CPU, with each list protected by its own spinlock via * the file_lock_lglock. Note that alterations to the list also require that - * the relevant i_lock is held. + * the relevant flc_lock is held. */ DEFINE_STATIC_LGLOCK(file_lock_lglock); static DEFINE_PER_CPU(struct hlist_head, file_lock_list); @@ -189,13 +189,13 @@ static DEFINE_HASHTABLE(blocked_hash, BLOCKED_HASH_BITS); * contrast to those that are acting as records of acquired locks). * * Note that when we acquire this lock in order to change the above fields, - * we often hold the i_lock as well. In certain cases, when reading the fields + * we often hold the flc_lock as well. In certain cases, when reading the fields * protected by this lock, we can skip acquiring it iff we already hold the - * i_lock. + * flc_lock. * * In particular, adding an entry to the fl_block list requires that you hold - * both the i_lock and the blocked_lock_lock (acquired in that order). Deleting - * an entry from the list however only requires the file_lock_lock. + * both the flc_lock and the blocked_lock_lock (acquired in that order). + * Deleting an entry from the list however only requires the file_lock_lock. */ static DEFINE_SPINLOCK(blocked_lock_lock); @@ -214,6 +214,7 @@ locks_get_lock_context(struct inode *inode) if (!new) goto out; + spin_lock_init(&new->flc_lock); INIT_LIST_HEAD(&new->flc_flock); INIT_LIST_HEAD(&new->flc_posix); INIT_LIST_HEAD(&new->flc_lease); @@ -557,7 +558,7 @@ static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2) return fl1->fl_owner == fl2->fl_owner; } -/* Must be called with the i_lock held! */ +/* Must be called with the flc_lock held! */ static void locks_insert_global_locks(struct file_lock *fl) { lg_local_lock(&file_lock_lglock); @@ -566,12 +567,12 @@ static void locks_insert_global_locks(struct file_lock *fl) lg_local_unlock(&file_lock_lglock); } -/* Must be called with the i_lock held! */ +/* Must be called with the flc_lock held! 
*/ static void locks_delete_global_locks(struct file_lock *fl) { /* * Avoid taking lock if already unhashed. This is safe since this check - * is done while holding the i_lock, and new insertions into the list + * is done while holding the flc_lock, and new insertions into the list * also require that it be held. */ if (hlist_unhashed(&fl->fl_link)) @@ -623,10 +624,10 @@ static void locks_delete_block(struct file_lock *waiter) * the order they blocked. The documentation doesn't require this but * it seems like the reasonable thing to do. * - * Must be called with both the i_lock and blocked_lock_lock held. The fl_block - * list itself is protected by the blocked_lock_lock, but by ensuring that the - * i_lock is also held on insertions we can avoid taking the blocked_lock_lock - * in some cases when we see that the fl_block list is empty. + * Must be called with both the flc_lock and blocked_lock_lock held. The + * fl_block list itself is protected by the blocked_lock_lock, but by ensuring + * that the flc_lock is also held on insertions we can avoid taking the + * blocked_lock_lock in some cases when we see that the fl_block list is empty. */ static void __locks_insert_block(struct file_lock *blocker, struct file_lock *waiter) @@ -638,7 +639,7 @@ static void __locks_insert_block(struct file_lock *blocker, locks_insert_global_blocked(waiter); } -/* Must be called with i_lock held. */ +/* Must be called with flc_lock held. */ static void locks_insert_block(struct file_lock *blocker, struct file_lock *waiter) { @@ -650,15 +651,15 @@ static void locks_insert_block(struct file_lock *blocker, /* * Wake up processes blocked waiting for blocker. * - * Must be called with the inode->i_lock held! + * Must be called with the inode->flc_lock held! */ static void locks_wake_up_blocks(struct file_lock *blocker) { /* * Avoid taking global lock if list is empty. This is safe since new - * blocked requests are only added to the list under the i_lock, and - * the i_lock is always held here. Note that removal from the fl_block - * list does not require the i_lock, so we must recheck list_empty() + * blocked requests are only added to the list under the flc_lock, and + * the flc_lock is always held here. Note that removal from the fl_block + * list does not require the flc_lock, so we must recheck list_empty() * after acquiring the blocked_lock_lock. */ if (list_empty(&blocker->fl_block)) @@ -768,7 +769,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl) return; } - spin_lock(&inode->i_lock); + spin_lock(&ctx->flc_lock); list_for_each_entry(cfl, &ctx->flc_posix, fl_list) { if (posix_locks_conflict(fl, cfl)) { locks_copy_conflock(fl, cfl); @@ -779,7 +780,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl) } fl->fl_type = F_UNLCK; out: - spin_unlock(&inode->i_lock); + spin_unlock(&ctx->flc_lock); return; } EXPORT_SYMBOL(posix_test_lock); @@ -880,7 +881,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request) return -ENOMEM; } - spin_lock(&inode->i_lock); + spin_lock(&ctx->flc_lock); if (request->fl_flags & FL_ACCESS) goto find_conflict; @@ -905,9 +906,9 @@ static int flock_lock_file(struct file *filp, struct file_lock *request) * give it the opportunity to lock the file. 
*/ if (found) { - spin_unlock(&inode->i_lock); + spin_unlock(&ctx->flc_lock); cond_resched(); - spin_lock(&inode->i_lock); + spin_lock(&ctx->flc_lock); } find_conflict: @@ -929,7 +930,7 @@ find_conflict: error = 0; out: - spin_unlock(&inode->i_lock); + spin_unlock(&ctx->flc_lock); if (new_fl) locks_free_lock(new_fl); locks_dispose_list(&dispose); @@ -965,7 +966,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str new_fl2 = locks_alloc_lock(); } - spin_lock(&inode->i_lock); + spin_lock(&ctx->flc_lock); /* * New lock request. Walk all POSIX locks and look for conflicts. If * there are any, either return error or put the request on the @@ -1136,7 +1137,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str locks_wake_up_blocks(left); } out: - spin_unlock(&inode->i_lock); + spin_unlock(&ctx->flc_lock); /* * Free any unused locks. */ @@ -1218,7 +1219,7 @@ int locks_mandatory_locked(struct file *file) /* * Search the lock list for this inode for any POSIX locks. */ - spin_lock(&inode->i_lock); + spin_lock(&ctx->flc_lock); ret = 0; list_for_each_entry(fl, &ctx->flc_posix, fl_list) { if (fl->fl_owner != current->files && @@ -1227,7 +1228,7 @@ int locks_mandatory_locked(struct file *file) break; } } - spin_unlock(&inode->i_lock); + spin_unlock(&ctx->flc_lock); return ret; } @@ -1346,7 +1347,7 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose) struct file_lock_context *ctx = inode->i_flctx; struct file_lock *fl, *tmp; - lockdep_assert_held(&inode->i_lock); + lockdep_assert_held(&ctx->flc_lock); list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) { trace_time_out_leases(inode, fl); @@ -1370,7 +1371,7 @@ any_leases_conflict(struct inode *inode, struct file_lock *breaker) struct file_lock_context *ctx = inode->i_flctx; struct file_lock *fl; - lockdep_assert_held(&inode->i_lock); + lockdep_assert_held(&ctx->flc_lock); list_for_each_entry(fl, &ctx->flc_lease, fl_list) { if (leases_conflict(fl, breaker)) @@ -1413,7 +1414,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) return error; } - spin_lock(&inode->i_lock); + spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); @@ -1463,11 +1464,11 @@ restart: break_time++; locks_insert_block(fl, new_fl); trace_break_lease_block(inode, new_fl); - spin_unlock(&inode->i_lock); + spin_unlock(&ctx->flc_lock); locks_dispose_list(&dispose); error = wait_event_interruptible_timeout(new_fl->fl_wait, !new_fl->fl_next, break_time); - spin_lock(&inode->i_lock); + spin_lock(&ctx->flc_lock); trace_break_lease_unblock(inode, new_fl); locks_delete_block(new_fl); if (error >= 0) { @@ -1482,7 +1483,7 @@ restart: error = 0; } out: - spin_unlock(&inode->i_lock); + spin_unlock(&ctx->flc_lock); locks_dispose_list(&dispose); locks_free_lock(new_fl); return error; @@ -1506,14 +1507,14 @@ void lease_get_mtime(struct inode *inode, struct timespec *time) struct file_lock *fl; if (ctx && !list_empty_careful(&ctx->flc_lease)) { - spin_lock(&inode->i_lock); + spin_lock(&ctx->flc_lock); if (!list_empty(&ctx->flc_lease)) { fl = list_first_entry(&ctx->flc_lease, struct file_lock, fl_list); if (fl->fl_type == F_WRLCK) has_lease = true; } - spin_unlock(&inode->i_lock); + spin_unlock(&ctx->flc_lock); } if (has_lease) @@ -1556,7 +1557,7 @@ int fcntl_getlease(struct file *filp) LIST_HEAD(dispose); if (ctx && !list_empty_careful(&ctx->flc_lease)) { - spin_lock(&inode->i_lock); + spin_lock(&ctx->flc_lock); time_out_leases(file_inode(filp), &dispose); 
list_for_each_entry(fl, &ctx->flc_lease, fl_list) { if (fl->fl_file != filp) @@ -1564,7 +1565,7 @@ int fcntl_getlease(struct file *filp) type = target_leasetype(fl); break; } - spin_unlock(&inode->i_lock); + spin_unlock(&ctx->flc_lock); locks_dispose_list(&dispose); } return type; @@ -1632,7 +1633,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr return -EINVAL; } - spin_lock(&inode->i_lock); + spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); error = check_conflicting_open(dentry, arg); if (error) @@ -1699,7 +1700,7 @@ out_setup: if (lease->fl_lmops->lm_setup) lease->fl_lmops->lm_setup(lease, priv); out: - spin_unlock(&inode->i_lock); + spin_unlock(&ctx->flc_lock); locks_dispose_list(&dispose); if (is_deleg) mutex_unlock(&inode->i_mutex); @@ -1722,7 +1723,7 @@ static int generic_delete_lease(struct file *filp) return error; } - spin_lock(&inode->i_lock); + spin_lock(&ctx->flc_lock); list_for_each_entry(fl, &ctx->flc_lease, fl_list) { if (fl->fl_file == filp) { victim = fl; @@ -1732,7 +1733,7 @@ static int generic_delete_lease(struct file *filp) trace_generic_delete_lease(inode, fl); if (victim) error = fl->fl_lmops->lm_change(&victim, F_UNLCK, &dispose); - spin_unlock(&inode->i_lock); + spin_unlock(&ctx->flc_lock); locks_dispose_list(&dispose); return error; } @@ -2423,10 +2424,10 @@ locks_remove_lease(struct file *filp) if (!ctx || list_empty(&ctx->flc_lease)) return; - spin_lock(&inode->i_lock); + spin_lock(&ctx->flc_lock); list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) lease_modify(&fl, F_UNLCK, &dispose); - spin_unlock(&inode->i_lock); + spin_unlock(&ctx->flc_lock); locks_dispose_list(&dispose); } diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 3fb1caa3874d..8cdb2b28a104 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -93,22 +93,22 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_ goto out; list = &flctx->flc_posix; - spin_lock(&inode->i_lock); + spin_lock(&flctx->flc_lock); restart: list_for_each_entry(fl, list, fl_list) { if (nfs_file_open_context(fl->fl_file) != ctx) continue; - spin_unlock(&inode->i_lock); + spin_unlock(&flctx->flc_lock); status = nfs4_lock_delegation_recall(fl, state, stateid); if (status < 0) goto out; - spin_lock(&inode->i_lock); + spin_lock(&flctx->flc_lock); } if (list == &flctx->flc_posix) { list = &flctx->flc_flock; goto restart; } - spin_unlock(&inode->i_lock); + spin_unlock(&flctx->flc_lock); out: return status; } diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 6084c267f3a0..a3bb22ab68c5 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1376,12 +1376,12 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_ /* Guard against delegation returns and new lock/unlock calls */ down_write(&nfsi->rwsem); - spin_lock(&inode->i_lock); + spin_lock(&flctx->flc_lock); restart: list_for_each_entry(fl, list, fl_list) { if (nfs_file_open_context(fl->fl_file)->state != state) continue; - spin_unlock(&inode->i_lock); + spin_unlock(&flctx->flc_lock); status = ops->recover_lock(state, fl); switch (status) { case 0: @@ -1408,13 +1408,13 @@ restart: /* kill_proc(fl->fl_pid, SIGLOST, 1); */ status = 0; } - spin_lock(&inode->i_lock); + spin_lock(&flctx->flc_lock); } if (list == &flctx->flc_posix) { list = &flctx->flc_flock; goto restart; } - spin_unlock(&inode->i_lock); + spin_unlock(&flctx->flc_lock); out: up_write(&nfsi->rwsem); return status; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 
784c13485b3f..4ae66f416eb9 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1206,7 +1206,7 @@ static int nfs_can_extend_write(struct file *file, struct page *page, struct ino /* Check to see if there are whole file write locks */ ret = 0; - spin_lock(&inode->i_lock); + spin_lock(&flctx->flc_lock); if (!list_empty(&flctx->flc_posix)) { fl = list_first_entry(&flctx->flc_posix, struct file_lock, fl_list); @@ -1218,7 +1218,7 @@ static int nfs_can_extend_write(struct file *file, struct page *page, struct ino if (fl->fl_type == F_WRLCK) ret = 1; } - spin_unlock(&inode->i_lock); + spin_unlock(&flctx->flc_lock); return ret; } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index fad821991369..80242f5bd621 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -5572,14 +5572,14 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) flctx = inode->i_flctx; if (flctx && !list_empty_careful(&flctx->flc_posix)) { - spin_lock(&inode->i_lock); + spin_lock(&flctx->flc_lock); list_for_each_entry(fl, &flctx->flc_posix, fl_list) { if (fl->fl_owner == (fl_owner_t)lowner) { status = true; break; } } - spin_unlock(&inode->i_lock); + spin_unlock(&flctx->flc_lock); } fput(filp); return status; diff --git a/include/linux/fs.h b/include/linux/fs.h index ce0873af0b97..32eafa9b5c9f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -968,6 +968,7 @@ struct file_lock { }; struct file_lock_context { + spinlock_t flc_lock; struct list_head flc_flock; struct list_head flc_posix; struct list_head flc_lease; -- cgit v1.2.3 From 7448cc37b1a6b620d948aaee3bb30960c06d5d5d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 16 Jan 2015 15:05:57 -0500 Subject: locks: clean up the lm_change prototype Now that we use standard list_heads for tracking leases, we can have lm_change take a pointer to the lease to be modified instead of a double pointer. 
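As an illustrative sketch (not part of the patch), a lock manager's lm_change callback after this cleanup receives the lease itself rather than a double pointer; this is modeled on the nfsd_change_deleg_cb() hunk below, with my_change_cb() as a made-up name:

	static int my_change_cb(struct file_lock *onlist, int arg,
				struct list_head *dispose)
	{
		/* callers now pass the lease directly, not a struct file_lock ** */
		if (arg & F_UNLCK)
			return lease_modify(onlist, arg, dispose);
		return -EAGAIN;
	}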
Signed-off-by: Jeff Layton Acked-by: Christoph Hellwig --- fs/locks.c | 13 ++++++------- fs/nfsd/nfs4state.c | 3 ++- include/linux/fs.h | 6 +++--- 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/fs/locks.c b/fs/locks.c index a268d959ccd6..864f2460a0ad 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1309,9 +1309,8 @@ static void lease_clear_pending(struct file_lock *fl, int arg) } /* We already had a lease on this file; just change its type */ -int lease_modify(struct file_lock **before, int arg, struct list_head *dispose) +int lease_modify(struct file_lock *fl, int arg, struct list_head *dispose) { - struct file_lock *fl = *before; int error = assign_type(fl, arg); if (error) @@ -1352,9 +1351,9 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose) list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) { trace_time_out_leases(inode, fl); if (past_time(fl->fl_downgrade_time)) - lease_modify(&fl, F_RDLCK, dispose); + lease_modify(fl, F_RDLCK, dispose); if (past_time(fl->fl_break_time)) - lease_modify(&fl, F_UNLCK, dispose); + lease_modify(fl, F_UNLCK, dispose); } } @@ -1669,7 +1668,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr } if (my_fl != NULL) { - error = lease->fl_lmops->lm_change(&my_fl, arg, &dispose); + error = lease->fl_lmops->lm_change(my_fl, arg, &dispose); if (error) goto out; goto out_setup; @@ -1732,7 +1731,7 @@ static int generic_delete_lease(struct file *filp) } trace_generic_delete_lease(inode, fl); if (victim) - error = fl->fl_lmops->lm_change(&victim, F_UNLCK, &dispose); + error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose); spin_unlock(&ctx->flc_lock); locks_dispose_list(&dispose); return error; @@ -2426,7 +2425,7 @@ locks_remove_lease(struct file *filp) spin_lock(&ctx->flc_lock); list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) - lease_modify(&fl, F_UNLCK, &dispose); + lease_modify(fl, F_UNLCK, &dispose); spin_unlock(&ctx->flc_lock); locks_dispose_list(&dispose); } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 80242f5bd621..532a60cca2fb 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3477,7 +3477,8 @@ nfsd_break_deleg_cb(struct file_lock *fl) } static int -nfsd_change_deleg_cb(struct file_lock **onlist, int arg, struct list_head *dispose) +nfsd_change_deleg_cb(struct file_lock *onlist, int arg, + struct list_head *dispose) { if (arg & F_UNLCK) return lease_modify(onlist, arg, dispose); diff --git a/include/linux/fs.h b/include/linux/fs.h index 32eafa9b5c9f..94e706a0a408 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -900,7 +900,7 @@ struct lock_manager_operations { void (*lm_notify)(struct file_lock *); /* unblock callback */ int (*lm_grant)(struct file_lock *, int); bool (*lm_break)(struct file_lock *); - int (*lm_change)(struct file_lock **, int, struct list_head *); + int (*lm_change)(struct file_lock *, int, struct list_head *); void (*lm_setup)(struct file_lock *, void **); }; @@ -1021,7 +1021,7 @@ extern int __break_lease(struct inode *inode, unsigned int flags, unsigned int t extern void lease_get_mtime(struct inode *, struct timespec *time); extern int generic_setlease(struct file *, long, struct file_lock **, void **priv); extern int vfs_setlease(struct file *, long, struct file_lock **, void **); -extern int lease_modify(struct file_lock **, int, struct list_head *); +extern int lease_modify(struct file_lock *, int, struct list_head *); #else /* !CONFIG_FILE_LOCKING */ static inline int 
fcntl_getlk(struct file *file, unsigned int cmd, struct flock __user *user) @@ -1153,7 +1153,7 @@ static inline int vfs_setlease(struct file *filp, long arg, return -EINVAL; } -static inline int lease_modify(struct file_lock **before, int arg, +static inline int lease_modify(struct file_lock *fl, int arg, struct list_head *dispose) { return -EINVAL; -- cgit v1.2.3 From 9bd0f45b7037fcfa8b575c7e27d0431d6e6dc3bb Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 16 Jan 2015 15:05:57 -0500 Subject: locks: keep a count of locks on the flctx lists This makes things a bit more efficient in the cifs and ceph lock pushing code. Signed-off-by: Jeff Layton Acked-by: Christoph Hellwig --- fs/ceph/locks.c | 11 ++--------- fs/cifs/file.c | 14 ++++---------- fs/locks.c | 45 +++++++++++++++++++++++++++++---------------- include/linux/fs.h | 3 +++ 4 files changed, 38 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 0303da8e3233..06ea5cd05cd9 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -242,12 +242,9 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) /* * Fills in the passed counter variables, so you can prepare pagelist metadata * before calling ceph_encode_locks. - * - * FIXME: add counters to struct file_lock_context so we don't need to do this? */ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) { - struct file_lock *lock; struct file_lock_context *ctx; *fcntl_count = 0; @@ -255,12 +252,8 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) ctx = inode->i_flctx; if (ctx) { - spin_lock(&ctx->flc_lock); - list_for_each_entry(lock, &ctx->flc_posix, fl_list) - ++(*fcntl_count); - list_for_each_entry(lock, &ctx->flc_flock, fl_list) - ++(*flock_count); - spin_unlock(&ctx->flc_lock); + *fcntl_count = ctx->flc_posix_cnt; + *flock_count = ctx->flc_flock_cnt; } dout("counted %d flock locks and %d fcntl locks", *flock_count, *fcntl_count); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index b65166eb111e..8c2ca6f62bad 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1125,7 +1125,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct file_lock *flock; struct file_lock_context *flctx = inode->i_flctx; - unsigned int count = 0, i; + unsigned int i; int rc = 0, xid, type; struct list_head locks_to_send, *el; struct lock_to_push *lck, *tmp; @@ -1136,20 +1136,14 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) if (!flctx) goto out; - spin_lock(&flctx->flc_lock); - list_for_each(el, &flctx->flc_posix) { - count++; - } - spin_unlock(&flctx->flc_lock); - INIT_LIST_HEAD(&locks_to_send); /* - * Allocating count locks is enough because no FL_POSIX locks can be - * added to the list while we are holding cinode->lock_sem that + * Allocating flc_posix_cnt locks is enough because no FL_POSIX locks + * can be added to the list while we are holding cinode->lock_sem that * protects locking operations of this inode. 
*/ - for (i = 0; i < count; i++) { + for (i = 0; i < flctx->flc_posix_cnt; i++) { lck = kmalloc(sizeof(struct lock_to_push), GFP_KERNEL); if (!lck) { rc = -ENOMEM; diff --git a/fs/locks.c b/fs/locks.c index 864f2460a0ad..bd578700342d 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -681,18 +681,21 @@ static void locks_wake_up_blocks(struct file_lock *blocker) } static void -locks_insert_lock_ctx(struct file_lock *fl, struct list_head *before) +locks_insert_lock_ctx(struct file_lock *fl, int *counter, + struct list_head *before) { fl->fl_nspid = get_pid(task_tgid(current)); list_add_tail(&fl->fl_list, before); + ++*counter; locks_insert_global_locks(fl); } static void -locks_unlink_lock_ctx(struct file_lock *fl) +locks_unlink_lock_ctx(struct file_lock *fl, int *counter) { locks_delete_global_locks(fl); list_del_init(&fl->fl_list); + --*counter; if (fl->fl_nspid) { put_pid(fl->fl_nspid); fl->fl_nspid = NULL; @@ -701,9 +704,10 @@ locks_unlink_lock_ctx(struct file_lock *fl) } static void -locks_delete_lock_ctx(struct file_lock *fl, struct list_head *dispose) +locks_delete_lock_ctx(struct file_lock *fl, int *counter, + struct list_head *dispose) { - locks_unlink_lock_ctx(fl); + locks_unlink_lock_ctx(fl, counter); if (dispose) list_add(&fl->fl_list, dispose); else @@ -891,7 +895,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request) if (request->fl_type == fl->fl_type) goto out; found = true; - locks_delete_lock_ctx(fl, &dispose); + locks_delete_lock_ctx(fl, &ctx->flc_flock_cnt, &dispose); break; } @@ -925,7 +929,7 @@ find_conflict: if (request->fl_flags & FL_ACCESS) goto out; locks_copy_lock(new_fl, request); - locks_insert_lock_ctx(new_fl, &ctx->flc_flock); + locks_insert_lock_ctx(new_fl, &ctx->flc_flock_cnt, &ctx->flc_flock); new_fl = NULL; error = 0; @@ -1042,7 +1046,8 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str else request->fl_end = fl->fl_end; if (added) { - locks_delete_lock_ctx(fl, &dispose); + locks_delete_lock_ctx(fl, &ctx->flc_posix_cnt, + &dispose); continue; } request = fl; @@ -1071,7 +1076,8 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str * one (This may happen several times). 
*/ if (added) { - locks_delete_lock_ctx(fl, &dispose); + locks_delete_lock_ctx(fl, + &ctx->flc_posix_cnt, &dispose); continue; } /* @@ -1087,8 +1093,10 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str locks_copy_lock(new_fl, request); request = new_fl; new_fl = NULL; - locks_insert_lock_ctx(request, &fl->fl_list); - locks_delete_lock_ctx(fl, &dispose); + locks_insert_lock_ctx(request, + &ctx->flc_posix_cnt, &fl->fl_list); + locks_delete_lock_ctx(fl, + &ctx->flc_posix_cnt, &dispose); added = true; } } @@ -1116,7 +1124,8 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str goto out; } locks_copy_lock(new_fl, request); - locks_insert_lock_ctx(new_fl, &fl->fl_list); + locks_insert_lock_ctx(new_fl, &ctx->flc_posix_cnt, + &fl->fl_list); new_fl = NULL; } if (right) { @@ -1127,7 +1136,8 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str left = new_fl2; new_fl2 = NULL; locks_copy_lock(left, right); - locks_insert_lock_ctx(left, &fl->fl_list); + locks_insert_lock_ctx(left, &ctx->flc_posix_cnt, + &fl->fl_list); } right->fl_start = request->fl_end + 1; locks_wake_up_blocks(right); @@ -1311,6 +1321,7 @@ static void lease_clear_pending(struct file_lock *fl, int arg) /* We already had a lease on this file; just change its type */ int lease_modify(struct file_lock *fl, int arg, struct list_head *dispose) { + struct file_lock_context *flctx; int error = assign_type(fl, arg); if (error) @@ -1320,6 +1331,7 @@ int lease_modify(struct file_lock *fl, int arg, struct list_head *dispose) if (arg == F_UNLCK) { struct file *filp = fl->fl_file; + flctx = file_inode(filp)->i_flctx; f_delown(filp); filp->f_owner.signum = 0; fasync_helper(0, fl->fl_file, 0, &fl->fl_fasync); @@ -1327,7 +1339,7 @@ int lease_modify(struct file_lock *fl, int arg, struct list_head *dispose) printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync); fl->fl_fasync = NULL; } - locks_delete_lock_ctx(fl, dispose); + locks_delete_lock_ctx(fl, &flctx->flc_lease_cnt, dispose); } return 0; } @@ -1442,7 +1454,8 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) fl->fl_downgrade_time = break_time; } if (fl->fl_lmops->lm_break(fl)) - locks_delete_lock_ctx(fl, &dispose); + locks_delete_lock_ctx(fl, &ctx->flc_lease_cnt, + &dispose); } if (list_empty(&ctx->flc_lease)) @@ -1678,7 +1691,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr if (!leases_enable) goto out; - locks_insert_lock_ctx(lease, &ctx->flc_lease); + locks_insert_lock_ctx(lease, &ctx->flc_lease_cnt, &ctx->flc_lease); /* * The check in break_lease() is lockless. 
It's possible for another * open to race in after we did the earlier check for a conflicting @@ -1691,7 +1704,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr smp_mb(); error = check_conflicting_open(dentry, arg); if (error) { - locks_unlink_lock_ctx(lease); + locks_unlink_lock_ctx(lease, &ctx->flc_lease_cnt); goto out; } diff --git a/include/linux/fs.h b/include/linux/fs.h index 94e706a0a408..f87cb2f03103 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -972,6 +972,9 @@ struct file_lock_context { struct list_head flc_flock; struct list_head flc_posix; struct list_head flc_lease; + int flc_flock_cnt; + int flc_posix_cnt; + int flc_lease_cnt; }; /* The following constant reflects the upper bound of the file/locking space */ -- cgit v1.2.3 From 9d5438f462abd6398cdb7b3211bdcec271873a3b Mon Sep 17 00:00:00 2001 From: Mark Salyzyn Date: Fri, 16 Jan 2015 16:01:10 -0800 Subject: pstore: Add pmsg - user-space accessible pstore object A secured, user-space-accessible pstore object. Writes to /dev/pmsg0 are appended to the buffer; on reboot the persistent contents are available in /sys/fs/pstore/pmsg-ramoops-[ID]. One possible use: syslogd, or another daemon, can write messages, and on reboot the saved log provides a means to triage the user-space activity leading up to a panic, as a companion to the pstore dmesg or console logs. Signed-off-by: Mark Salyzyn Acked-by: Kees Cook Signed-off-by: Tony Luck --- fs/pstore/Kconfig | 10 ++++ fs/pstore/Makefile | 2 + fs/pstore/inode.c | 3 ++ fs/pstore/internal.h | 6 +++ fs/pstore/platform.c | 1 + fs/pstore/pmsg.c | 114 +++++++++++++++++++++++++++++++++++++++++++++ fs/pstore/ram.c | 34 +++++++++++++- include/linux/pstore.h | 1 + include/linux/pstore_ram.h | 1 + 9 files changed, 170 insertions(+), 2 deletions(-) create mode 100644 fs/pstore/pmsg.c (limited to 'include/linux') diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig index 983d9510becc..916b8e23d968 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -21,6 +21,16 @@ config PSTORE_CONSOLE When the option is enabled, pstore will log all kernel messages, even if no oops or panic happened. +config PSTORE_PMSG + bool "Log user space messages" + depends on PSTORE + help + When the option is enabled, pstore will export a character + interface /dev/pmsg0 to log user space messages. On reboot + data can be retrieved from /sys/fs/pstore/pmsg-ramoops-[ID]. + + If unsure, say N.
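As an illustrative user-space sketch (not part of the patch), appending a message through the interface described above; the device node is write-only (mode 0220), so it is opened O_WRONLY, and error handling is kept minimal:

	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		const char msg[] = "pre-reboot marker from userspace\n";
		int fd = open("/dev/pmsg0", O_WRONLY);

		if (fd < 0)
			return 1;
		/* appended to the pmsg buffer; after reboot it reappears as
		 * /sys/fs/pstore/pmsg-ramoops-[ID] */
		if (write(fd, msg, sizeof(msg) - 1) < 0) {
			close(fd);
			return 1;
		}
		close(fd);
		return 0;
	}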
+ config PSTORE_FTRACE bool "Persistent function tracer" depends on PSTORE diff --git a/fs/pstore/Makefile b/fs/pstore/Makefile index 4c9095c2781e..e647d8e81712 100644 --- a/fs/pstore/Makefile +++ b/fs/pstore/Makefile @@ -7,5 +7,7 @@ obj-y += pstore.o pstore-objs += inode.o platform.o obj-$(CONFIG_PSTORE_FTRACE) += ftrace.o +obj-$(CONFIG_PSTORE_PMSG) += pmsg.o + ramoops-objs += ram.o ram_core.o obj-$(CONFIG_PSTORE_RAM) += ramoops.o diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index d69586f09ffd..b32ce53d24ee 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -361,6 +361,9 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, scnprintf(name, sizeof(name), "powerpc-common-%s-%lld", psname, id); break; + case PSTORE_TYPE_PMSG: + scnprintf(name, sizeof(name), "pmsg-%s-%lld", psname, id); + break; case PSTORE_TYPE_UNKNOWN: scnprintf(name, sizeof(name), "unknown-%s-%lld", psname, id); break; diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index 3b3d305277c4..c36ba2cd0b5d 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -45,6 +45,12 @@ extern void pstore_register_ftrace(void); static inline void pstore_register_ftrace(void) {} #endif +#ifdef CONFIG_PSTORE_PMSG +extern void pstore_register_pmsg(void); +#else +static inline void pstore_register_pmsg(void) {} +#endif + extern struct pstore_info *psinfo; extern void pstore_set_kmsg_bytes(int); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 0a9b72cdfeca..15ee78c5020b 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -447,6 +447,7 @@ int pstore_register(struct pstore_info *psi) if ((psi->flags & PSTORE_FLAGS_FRAGILE) == 0) { pstore_register_console(); pstore_register_ftrace(); + pstore_register_pmsg(); } if (pstore_update_ms >= 0) { diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c new file mode 100644 index 000000000000..feb5dd2948b4 --- /dev/null +++ b/fs/pstore/pmsg.c @@ -0,0 +1,114 @@ +/* + * Copyright 2014 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#include <linux/cdev.h> +#include <linux/device.h> +#include <linux/fs.h> +#include <linux/uaccess.h> +#include <linux/vmalloc.h> +#include "internal.h" + +static DEFINE_MUTEX(pmsg_lock); +#define PMSG_MAX_BOUNCE_BUFFER_SIZE (2*PAGE_SIZE) + +static ssize_t write_pmsg(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + size_t i, buffer_size; + char *buffer; + + if (!count) + return 0; + + if (!access_ok(VERIFY_READ, buf, count)) + return -EFAULT; + + buffer_size = count; + if (buffer_size > PMSG_MAX_BOUNCE_BUFFER_SIZE) + buffer_size = PMSG_MAX_BOUNCE_BUFFER_SIZE; + buffer = vmalloc(buffer_size); + + mutex_lock(&pmsg_lock); + for (i = 0; i < count; ) { + size_t c = min(count - i, buffer_size); + u64 id; + long ret; + + ret = __copy_from_user(buffer, buf + i, c); + if (unlikely(ret != 0)) { + mutex_unlock(&pmsg_lock); + vfree(buffer); + return -EFAULT; + } + psinfo->write_buf(PSTORE_TYPE_PMSG, 0, &id, 0, buffer, 0, c, + psinfo); + + i += c; + } + + mutex_unlock(&pmsg_lock); + vfree(buffer); + return count; +} + +static const struct file_operations pmsg_fops = { + .owner = THIS_MODULE, + .llseek = noop_llseek, + .write = write_pmsg, +}; + +static struct class *pmsg_class; +static int pmsg_major; +#define PMSG_NAME "pmsg" +#undef pr_fmt +#define pr_fmt(fmt) PMSG_NAME ": " fmt + +static char *pmsg_devnode(struct device *dev, umode_t *mode) +{ + if (mode) + *mode = 0220; + return NULL; +} + +void pstore_register_pmsg(void) +{ + struct device *pmsg_device; + + pmsg_major = register_chrdev(0, PMSG_NAME, &pmsg_fops); + if (pmsg_major < 0) { + pr_err("register_chrdev failed\n"); + goto err; + } + + pmsg_class = class_create(THIS_MODULE, PMSG_NAME); + if (IS_ERR(pmsg_class)) { + pr_err("device class file already in use\n"); + goto err_class; + } + pmsg_class->devnode = pmsg_devnode; + + pmsg_device = device_create(pmsg_class, NULL, MKDEV(pmsg_major, 0), + NULL, "%s%d", PMSG_NAME, 0); + if (IS_ERR(pmsg_device)) { + pr_err("failed to create device\n"); + goto err_device; + } + return; + +err_device: + class_destroy(pmsg_class); +err_class: + unregister_chrdev(pmsg_major, PMSG_NAME); +err: + return; +} diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 6150e54eed30..39d1373128e9 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -51,6 +51,10 @@ static ulong ramoops_ftrace_size = MIN_MEM_SIZE; module_param_named(ftrace_size, ramoops_ftrace_size, ulong, 0400); MODULE_PARM_DESC(ftrace_size, "size of ftrace log"); +static ulong ramoops_pmsg_size = MIN_MEM_SIZE; +module_param_named(pmsg_size, ramoops_pmsg_size, ulong, 0400); +MODULE_PARM_DESC(pmsg_size, "size of user space message log"); + static ulong mem_address; module_param(mem_address, ulong, 0400); MODULE_PARM_DESC(mem_address, @@ -82,12 +86,14 @@ struct ramoops_context { struct persistent_ram_zone **przs; struct persistent_ram_zone *cprz; struct persistent_ram_zone *fprz; + struct persistent_ram_zone *mprz; phys_addr_t phys_addr; unsigned long size; unsigned int memtype; size_t record_size; size_t console_size; size_t ftrace_size; + size_t pmsg_size; int dump_oops; struct persistent_ram_ecc_info ecc_info; unsigned int max_dump_cnt; @@ -96,6 +102,7 @@ struct ramoops_context { unsigned int dump_read_cnt; unsigned int console_read_cnt; unsigned int ftrace_read_cnt; + unsigned int pmsg_read_cnt; struct pstore_info pstore; }; @@ -109,6 +116,7 @@ static int ramoops_pstore_open(struct pstore_info *psi) cxt->dump_read_cnt = 0; cxt->console_read_cnt = 0; cxt->ftrace_read_cnt = 0; + cxt->pmsg_read_cnt = 0; return 0; } @@ -190,6 +198,9 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id
*type, if (!prz_ok(prz)) prz = ramoops_get_next_prz(&cxt->fprz, &cxt->ftrace_read_cnt, 1, id, type, PSTORE_TYPE_FTRACE, 0); + if (!prz_ok(prz)) + prz = ramoops_get_next_prz(&cxt->mprz, &cxt->pmsg_read_cnt, + 1, id, type, PSTORE_TYPE_PMSG, 0); if (!prz_ok(prz)) return 0; @@ -258,6 +269,11 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, return -ENOMEM; persistent_ram_write(cxt->fprz, buf, size); return 0; + } else if (type == PSTORE_TYPE_PMSG) { + if (!cxt->mprz) + return -ENOMEM; + persistent_ram_write(cxt->mprz, buf, size); + return 0; } if (type != PSTORE_TYPE_DMESG) @@ -315,6 +331,9 @@ static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count, case PSTORE_TYPE_FTRACE: prz = cxt->fprz; break; + case PSTORE_TYPE_PMSG: + prz = cxt->mprz; + break; default: return -EINVAL; } @@ -441,7 +460,7 @@ static int ramoops_probe(struct platform_device *pdev) goto fail_out; if (!pdata->mem_size || (!pdata->record_size && !pdata->console_size && - !pdata->ftrace_size)) { + !pdata->ftrace_size && !pdata->pmsg_size)) { pr_err("The memory size and the record/console size must be " "non-zero\n"); goto fail_out; @@ -453,6 +472,8 @@ static int ramoops_probe(struct platform_device *pdev) pdata->console_size = rounddown_pow_of_two(pdata->console_size); if (pdata->ftrace_size && !is_power_of_2(pdata->ftrace_size)) pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size); + if (pdata->pmsg_size && !is_power_of_2(pdata->pmsg_size)) + pdata->pmsg_size = rounddown_pow_of_two(pdata->pmsg_size); cxt->size = pdata->mem_size; cxt->phys_addr = pdata->mem_address; @@ -460,12 +481,14 @@ static int ramoops_probe(struct platform_device *pdev) cxt->record_size = pdata->record_size; cxt->console_size = pdata->console_size; cxt->ftrace_size = pdata->ftrace_size; + cxt->pmsg_size = pdata->pmsg_size; cxt->dump_oops = pdata->dump_oops; cxt->ecc_info = pdata->ecc_info; paddr = cxt->phys_addr; - dump_mem_sz = cxt->size - cxt->console_size - cxt->ftrace_size; + dump_mem_sz = cxt->size - cxt->console_size - cxt->ftrace_size + - cxt->pmsg_size; err = ramoops_init_przs(dev, cxt, &paddr, dump_mem_sz); if (err) goto fail_out; @@ -480,6 +503,10 @@ static int ramoops_probe(struct platform_device *pdev) if (err) goto fail_init_fprz; + err = ramoops_init_prz(dev, cxt, &cxt->mprz, &paddr, cxt->pmsg_size, 0); + if (err) + goto fail_init_mprz; + cxt->pstore.data = cxt; /* * Console can handle any buffer size, so prefer LOG_LINE_MAX. 
If we @@ -523,6 +550,8 @@ fail_buf: kfree(cxt->pstore.buf); fail_clear: cxt->pstore.bufsize = 0; + kfree(cxt->mprz); +fail_init_mprz: kfree(cxt->fprz); fail_init_fprz: kfree(cxt->cprz); @@ -580,6 +609,7 @@ static void ramoops_register_dummy(void) dummy_data->record_size = record_size; dummy_data->console_size = ramoops_console_size; dummy_data->ftrace_size = ramoops_ftrace_size; + dummy_data->pmsg_size = ramoops_pmsg_size; dummy_data->dump_oops = dump_oops; /* * For backwards compatibility ramoops.ecc=1 means 16 bytes ECC diff --git a/include/linux/pstore.h b/include/linux/pstore.h index ece0c6bbfcc5..8884f6e507f7 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -39,6 +39,7 @@ enum pstore_type_id { PSTORE_TYPE_PPC_RTAS = 4, PSTORE_TYPE_PPC_OF = 5, PSTORE_TYPE_PPC_COMMON = 6, + PSTORE_TYPE_PMSG = 7, PSTORE_TYPE_UNKNOWN = 255 }; diff --git a/include/linux/pstore_ram.h b/include/linux/pstore_ram.h index 4af3fdc85b01..9c9d6c154c8e 100644 --- a/include/linux/pstore_ram.h +++ b/include/linux/pstore_ram.h @@ -81,6 +81,7 @@ struct ramoops_platform_data { unsigned long record_size; unsigned long console_size; unsigned long ftrace_size; + unsigned long pmsg_size; int dump_oops; struct persistent_ram_ecc_info ecc_info; }; -- cgit v1.2.3 From b9626f32876debc356fadb6e19aebcfe9d70c5ff Mon Sep 17 00:00:00 2001 From: Christophe Ricard Date: Mon, 1 Dec 2014 19:32:51 +0100 Subject: tpm/tpm_i2c_stm_st33: Add new tpm_stm_dev structure and remove tpm_i2c_buffer[0], [1] buffer. In order to clean big buffers in st33zp24_platform_data structure, replace with tpm_stm_dev for driver internal usage. As only one buffer is really necessary replace with buf field. In the mean time move tpm_i2c_stm_st33.h to include/linux/platform_data. Reviewed-by: Jason Gunthorpe Signed-off-by: Christophe Ricard Signed-off-by: Peter Huewe --- drivers/char/tpm/tpm_i2c_stm_st33.c | 204 +++++++++++-------------- drivers/char/tpm/tpm_i2c_stm_st33.h | 44 ------ include/linux/platform_data/tpm_i2c_stm_st33.h | 40 +++++ 3 files changed, 132 insertions(+), 156 deletions(-) delete mode 100644 drivers/char/tpm/tpm_i2c_stm_st33.h create mode 100644 include/linux/platform_data/tpm_i2c_stm_st33.h (limited to 'include/linux') diff --git a/drivers/char/tpm/tpm_i2c_stm_st33.c b/drivers/char/tpm/tpm_i2c_stm_st33.c index 44a02103ad49..4f725386385f 100644 --- a/drivers/char/tpm/tpm_i2c_stm_st33.c +++ b/drivers/char/tpm/tpm_i2c_stm_st33.c @@ -40,7 +40,6 @@ #include #include #include -#include #include #include #include @@ -48,6 +47,7 @@ #include #include +#include #include "tpm.h" #define TPM_ACCESS 0x0 @@ -66,7 +66,7 @@ #define TPM_BUFSIZE 2048 #define LOCALITY0 0 -#include "tpm_i2c_stm_st33.h" + enum stm33zp24_access { TPM_ACCESS_VALID = 0x80, @@ -98,6 +98,15 @@ enum tis_defaults { TIS_LONG_TIMEOUT = 2000, }; +struct tpm_stm_dev { + struct i2c_client *client; + struct completion irq_detection; + struct tpm_chip *chip; + u8 buf[TPM_BUFSIZE + 1]; + int io_serirq; + int io_lpcpd; +}; + /* * write8_reg * Send byte to the TIS register according to the ST33ZP24 I2C protocol. @@ -106,17 +115,12 @@ enum tis_defaults { * @param: tpm_size, The length of the data * @return: Returns negative errno, or else the number of bytes written. 
*/ -static int write8_reg(struct i2c_client *client, u8 tpm_register, +static int write8_reg(struct tpm_stm_dev *tpm_dev, u8 tpm_register, u8 *tpm_data, u16 tpm_size) { - struct st33zp24_platform_data *pin_infos; - - pin_infos = client->dev.platform_data; - - pin_infos->tpm_i2c_buffer[0][0] = tpm_register; - memcpy(&pin_infos->tpm_i2c_buffer[0][1], tpm_data, tpm_size); - return i2c_master_send(client, pin_infos->tpm_i2c_buffer[0], - tpm_size + 1); + tpm_dev->buf[0] = tpm_register; + memcpy(tpm_dev->buf + 1, tpm_data, tpm_size); + return i2c_master_send(tpm_dev->client, tpm_dev->buf, tpm_size + 1); } /* write8_reg() */ /* @@ -127,56 +131,56 @@ static int write8_reg(struct i2c_client *client, u8 tpm_register, * @param: tpm_size, tpm TPM response size to read. * @return: number of byte read successfully: should be one if success. */ -static int read8_reg(struct i2c_client *client, u8 tpm_register, +static int read8_reg(struct tpm_stm_dev *tpm_dev, u8 tpm_register, u8 *tpm_data, int tpm_size) { u8 status = 0; u8 data; data = TPM_DUMMY_BYTE; - status = write8_reg(client, tpm_register, &data, 1); + status = write8_reg(tpm_dev, tpm_register, &data, 1); if (status == 2) - status = i2c_master_recv(client, tpm_data, tpm_size); + status = i2c_master_recv(tpm_dev->client, tpm_data, tpm_size); return status; } /* read8_reg() */ /* * I2C_WRITE_DATA * Send byte to the TIS register according to the ST33ZP24 I2C protocol. - * @param: client, the chip description + * @param: tpm_dev, the chip description * @param: tpm_register, the tpm tis register where the data should be written * @param: tpm_data, the tpm_data to write inside the tpm_register * @param: tpm_size, The length of the data * @return: number of byte written successfully: should be one if success. */ -#define I2C_WRITE_DATA(client, tpm_register, tpm_data, tpm_size) \ - (write8_reg(client, tpm_register | \ +#define I2C_WRITE_DATA(tpm_dev, tpm_register, tpm_data, tpm_size) \ + (write8_reg(tpm_dev, tpm_register | \ TPM_WRITE_DIRECTION, tpm_data, tpm_size)) /* * I2C_READ_DATA * Recv byte from the TIS register according to the ST33ZP24 I2C protocol. - * @param: tpm, the chip description + * @param: tpm_dev, the chip description * @param: tpm_register, the tpm tis register where the data should be read * @param: tpm_data, the TPM response * @param: tpm_size, tpm TPM response size to read. * @return: number of byte read successfully: should be one if success. */ -#define I2C_READ_DATA(client, tpm_register, tpm_data, tpm_size) \ - (read8_reg(client, tpm_register, tpm_data, tpm_size)) +#define I2C_READ_DATA(tpm_dev, tpm_register, tpm_data, tpm_size) \ + (read8_reg(tpm_dev, tpm_register, tpm_data, tpm_size)) /* * clear_interruption * clear the TPM interrupt register. 
* @param: tpm, the chip description */ -static void clear_interruption(struct i2c_client *client) +static void clear_interruption(struct tpm_stm_dev *tpm_dev) { u8 interrupt; - I2C_READ_DATA(client, TPM_INT_STATUS, &interrupt, 1); - I2C_WRITE_DATA(client, TPM_INT_STATUS, &interrupt, 1); - I2C_READ_DATA(client, TPM_INT_STATUS, &interrupt, 1); + I2C_READ_DATA(tpm_dev, TPM_INT_STATUS, &interrupt, 1); + I2C_WRITE_DATA(tpm_dev, TPM_INT_STATUS, &interrupt, 1); + I2C_READ_DATA(tpm_dev, TPM_INT_STATUS, &interrupt, 1); } /* clear_interruption() */ /* @@ -190,17 +194,16 @@ static long _wait_for_interrupt_serirq_timeout(struct tpm_chip *chip, { long status; struct i2c_client *client; - struct st33zp24_platform_data *pin_infos; + struct tpm_stm_dev *tpm_dev; - client = (struct i2c_client *)TPM_VPRIV(chip); - pin_infos = client->dev.platform_data; + tpm_dev = (struct tpm_stm_dev *)TPM_VPRIV(chip); + client = tpm_dev->client; status = wait_for_completion_interruptible_timeout( - &pin_infos->irq_detection, - timeout); + &tpm_dev->irq_detection, + timeout); if (status > 0) - enable_irq(gpio_to_irq(pin_infos->io_serirq)); - gpio_direction_input(pin_infos->io_serirq); + enable_irq(client->irq); return status; } /* wait_for_interrupt_serirq_timeout() */ @@ -209,15 +212,15 @@ static int wait_for_serirq_timeout(struct tpm_chip *chip, bool condition, unsigned long timeout) { int status = 2; - struct i2c_client *client; + struct tpm_stm_dev *tpm_dev; - client = (struct i2c_client *)TPM_VPRIV(chip); + tpm_dev = (struct tpm_stm_dev *)TPM_VPRIV(chip); status = _wait_for_interrupt_serirq_timeout(chip, timeout); if (!status) { status = -EBUSY; } else { - clear_interruption(client); + clear_interruption(tpm_dev); if (condition) status = 1; } @@ -230,13 +233,13 @@ static int wait_for_serirq_timeout(struct tpm_chip *chip, bool condition, */ static void tpm_stm_i2c_cancel(struct tpm_chip *chip) { - struct i2c_client *client; + struct tpm_stm_dev *tpm_dev; u8 data; - client = (struct i2c_client *)TPM_VPRIV(chip); + tpm_dev = (struct tpm_stm_dev *)TPM_VPRIV(chip); data = TPM_STS_COMMAND_READY; - I2C_WRITE_DATA(client, TPM_STS, &data, 1); + I2C_WRITE_DATA(tpm_dev, TPM_STS, &data, 1); if (chip->vendor.irq) wait_for_serirq_timeout(chip, 1, chip->vendor.timeout_a); } /* tpm_stm_i2c_cancel() */ @@ -248,12 +251,12 @@ static void tpm_stm_i2c_cancel(struct tpm_chip *chip) */ static u8 tpm_stm_i2c_status(struct tpm_chip *chip) { - struct i2c_client *client; + struct tpm_stm_dev *tpm_dev; u8 data; - client = (struct i2c_client *)TPM_VPRIV(chip); + tpm_dev = (struct tpm_stm_dev *)TPM_VPRIV(chip); - I2C_READ_DATA(client, TPM_STS, &data, 1); + I2C_READ_DATA(tpm_dev, TPM_STS, &data, 1); return data; } /* tpm_stm_i2c_status() */ @@ -265,13 +268,13 @@ static u8 tpm_stm_i2c_status(struct tpm_chip *chip) */ static int check_locality(struct tpm_chip *chip) { - struct i2c_client *client; + struct tpm_stm_dev *tpm_dev; u8 data; u8 status; - client = (struct i2c_client *)TPM_VPRIV(chip); + tpm_dev = (struct tpm_stm_dev *)TPM_VPRIV(chip); - status = I2C_READ_DATA(client, TPM_ACCESS, &data, 1); + status = I2C_READ_DATA(tpm_dev, TPM_ACCESS, &data, 1); if (status && (data & (TPM_ACCESS_ACTIVE_LOCALITY | TPM_ACCESS_VALID)) == (TPM_ACCESS_ACTIVE_LOCALITY | TPM_ACCESS_VALID)) @@ -290,16 +293,16 @@ static int request_locality(struct tpm_chip *chip) { unsigned long stop; long rc; - struct i2c_client *client; + struct tpm_stm_dev *tpm_dev; u8 data; - client = (struct i2c_client *)TPM_VPRIV(chip); + tpm_dev = (struct tpm_stm_dev *)TPM_VPRIV(chip); if 
(check_locality(chip) == chip->vendor.locality) return chip->vendor.locality; data = TPM_ACCESS_REQUEST_USE; - rc = I2C_WRITE_DATA(client, TPM_ACCESS, &data, 1); + rc = I2C_WRITE_DATA(tpm_dev, TPM_ACCESS, &data, 1); if (rc < 0) goto end; @@ -328,13 +331,13 @@ end: */ static void release_locality(struct tpm_chip *chip) { - struct i2c_client *client; + struct tpm_stm_dev *tpm_dev; u8 data; - client = (struct i2c_client *)TPM_VPRIV(chip); + tpm_dev = (struct tpm_stm_dev *)TPM_VPRIV(chip); data = TPM_ACCESS_ACTIVE_LOCALITY; - I2C_WRITE_DATA(client, TPM_ACCESS, &data, 1); + I2C_WRITE_DATA(tpm_dev, TPM_ACCESS, &data, 1); } /* @@ -347,19 +350,20 @@ static int get_burstcount(struct tpm_chip *chip) unsigned long stop; int burstcnt, status; u8 tpm_reg, temp; + struct tpm_stm_dev *tpm_dev; - struct i2c_client *client = (struct i2c_client *)TPM_VPRIV(chip); + tpm_dev = (struct tpm_stm_dev *)TPM_VPRIV(chip); stop = jiffies + chip->vendor.timeout_d; do { tpm_reg = TPM_STS + 1; - status = I2C_READ_DATA(client, tpm_reg, &temp, 1); + status = I2C_READ_DATA(tpm_dev, tpm_reg, &temp, 1); if (status < 0) goto end; tpm_reg = tpm_reg + 1; burstcnt = temp; - status = I2C_READ_DATA(client, tpm_reg, &temp, 1); + status = I2C_READ_DATA(tpm_dev, tpm_reg, &temp, 1); if (status < 0) goto end; @@ -416,9 +420,9 @@ static int wait_for_stat(struct tpm_chip *chip, u8 mask, unsigned long timeout, static int recv_data(struct tpm_chip *chip, u8 *buf, size_t count) { int size = 0, burstcnt, len; - struct i2c_client *client; + struct tpm_stm_dev *tpm_dev; - client = (struct i2c_client *)TPM_VPRIV(chip); + tpm_dev = (struct tpm_stm_dev *)TPM_VPRIV(chip); while (size < count && wait_for_stat(chip, @@ -430,7 +434,7 @@ static int recv_data(struct tpm_chip *chip, u8 *buf, size_t count) if (burstcnt < 0) return burstcnt; len = min_t(int, burstcnt, count - size); - I2C_READ_DATA(client, TPM_DATA_FIFO, buf + size, len); + I2C_READ_DATA(tpm_dev, TPM_DATA_FIFO, buf + size, len); size += len; } return size; @@ -446,14 +450,14 @@ static irqreturn_t tpm_ioserirq_handler(int irq, void *dev_id) { struct tpm_chip *chip = dev_id; struct i2c_client *client; - struct st33zp24_platform_data *pin_infos; + struct tpm_stm_dev *tpm_dev; disable_irq_nosync(irq); - client = (struct i2c_client *)TPM_VPRIV(chip); - pin_infos = client->dev.platform_data; + tpm_dev = (struct tpm_stm_dev *)TPM_VPRIV(chip); + client = tpm_dev->client; - complete(&pin_infos->irq_detection); + complete(&tpm_dev->irq_detection); return IRQ_HANDLED; } /* tpm_ioserirq_handler() */ @@ -475,13 +479,15 @@ static int tpm_stm_i2c_send(struct tpm_chip *chip, unsigned char *buf, int ret; u8 data; struct i2c_client *client; + struct tpm_stm_dev *tpm_dev; if (chip == NULL) return -EBUSY; if (len < TPM_HEADER_SIZE) return -EBUSY; - client = (struct i2c_client *)TPM_VPRIV(chip); + tpm_dev = (struct tpm_stm_dev *)TPM_VPRIV(chip); + client = tpm_dev->client; client->flags = 0; @@ -505,7 +511,7 @@ static int tpm_stm_i2c_send(struct tpm_chip *chip, unsigned char *buf, if (burstcnt < 0) return burstcnt; size = min_t(int, len - i - 1, burstcnt); - ret = I2C_WRITE_DATA(client, TPM_DATA_FIFO, buf + i, size); + ret = I2C_WRITE_DATA(tpm_dev, TPM_DATA_FIFO, buf + i, size); if (ret < 0) goto out_err; @@ -518,7 +524,7 @@ static int tpm_stm_i2c_send(struct tpm_chip *chip, unsigned char *buf, goto out_err; } - ret = I2C_WRITE_DATA(client, TPM_DATA_FIFO, buf + len - 1, 1); + ret = I2C_WRITE_DATA(tpm_dev, TPM_DATA_FIFO, buf + len - 1, 1); if (ret < 0) goto out_err; @@ -529,7 +535,7 @@ static int 
tpm_stm_i2c_send(struct tpm_chip *chip, unsigned char *buf, } data = TPM_STS_GO; - I2C_WRITE_DATA(client, TPM_STS, &data, 1); + I2C_WRITE_DATA(tpm_dev, TPM_STS, &data, 1); return len; out_err: @@ -623,6 +629,7 @@ tpm_st33_i2c_probe(struct i2c_client *client, const struct i2c_device_id *id) u8 intmask; struct tpm_chip *chip; struct st33zp24_platform_data *platform_data; + struct tpm_stm_dev *tpm_dev; if (client == NULL) { pr_info("%s: i2c client is NULL. Device not accessible.\n", @@ -637,11 +644,11 @@ tpm_st33_i2c_probe(struct i2c_client *client, const struct i2c_device_id *id) goto end; } - chip = tpm_register_hardware(&client->dev, &st_i2c_tpm); - if (!chip) { - dev_info(&client->dev, "fail chip\n"); - err = -ENODEV; - goto end; + tpm_dev = devm_kzalloc(&client->dev, sizeof(struct tpm_stm_dev), + GFP_KERNEL); + if (!tpm_dev) { + err = -ENOMEM; + goto _tpm_clean_answer; } platform_data = client->dev.platform_data; @@ -652,20 +659,14 @@ tpm_st33_i2c_probe(struct i2c_client *client, const struct i2c_device_id *id) goto _tpm_clean_answer; } - platform_data->tpm_i2c_buffer[0] = - kmalloc(TPM_BUFSIZE, GFP_KERNEL); - if (platform_data->tpm_i2c_buffer[0] == NULL) { - err = -ENOMEM; - goto _tpm_clean_answer; - } - platform_data->tpm_i2c_buffer[1] = - kmalloc(TPM_BUFSIZE, GFP_KERNEL); - if (platform_data->tpm_i2c_buffer[1] == NULL) { - err = -ENOMEM; - goto _tpm_clean_response1; + chip = tpm_register_hardware(&client->dev, &st_i2c_tpm); + if (!chip) { + dev_info(&client->dev, "fail chip\n"); + return -ENODEV; } - TPM_VPRIV(chip) = client; + TPM_VPRIV(chip) = tpm_dev; + tpm_dev->client = client; chip->vendor.timeout_a = msecs_to_jiffies(TIS_SHORT_TIMEOUT); chip->vendor.timeout_b = msecs_to_jiffies(TIS_LONG_TIMEOUT); @@ -682,16 +683,16 @@ tpm_st33_i2c_probe(struct i2c_client *client, const struct i2c_device_id *id) } if (interrupts) { - init_completion(&platform_data->irq_detection); + init_completion(&tpm_dev->irq_detection); if (request_locality(chip) != LOCALITY0) { err = -ENODEV; - goto _tpm_clean_response2; + goto _tpm_clean_answer; } err = gpio_request(platform_data->io_serirq, "TPM IO_SERIRQ"); if (err) goto _gpio_init2; - clear_interruption(client); + clear_interruption(tpm_dev); err = request_irq(gpio_to_irq(platform_data->io_serirq), &tpm_ioserirq_handler, IRQF_TRIGGER_HIGH, @@ -702,7 +703,7 @@ tpm_st33_i2c_probe(struct i2c_client *client, const struct i2c_device_id *id) goto _irq_set; } - err = I2C_READ_DATA(client, TPM_INT_ENABLE, &intmask, 1); + err = I2C_READ_DATA(tpm_dev, TPM_INT_ENABLE, &intmask, 1); if (err < 0) goto _irq_set; @@ -713,16 +714,17 @@ tpm_st33_i2c_probe(struct i2c_client *client, const struct i2c_device_id *id) | TPM_INTF_STS_VALID_INT | TPM_INTF_DATA_AVAIL_INT; - err = I2C_WRITE_DATA(client, TPM_INT_ENABLE, &intmask, 1); + err = I2C_WRITE_DATA(tpm_dev, TPM_INT_ENABLE, &intmask, 1); if (err < 0) goto _irq_set; intmask = TPM_GLOBAL_INT_ENABLE; - err = I2C_WRITE_DATA(client, (TPM_INT_ENABLE + 3), &intmask, 1); + err = I2C_WRITE_DATA(tpm_dev, (TPM_INT_ENABLE + 3), + &intmask, 1); if (err < 0) goto _irq_set; - err = I2C_READ_DATA(client, TPM_INT_STATUS, &intmask, 1); + err = I2C_READ_DATA(tpm_dev, TPM_INT_STATUS, &intmask, 1); if (err < 0) goto _irq_set; @@ -744,12 +746,6 @@ _gpio_init2: _gpio_init1: if (power_mgt) gpio_free(platform_data->io_lpcpd); -_tpm_clean_response2: - kzfree(platform_data->tpm_i2c_buffer[1]); - platform_data->tpm_i2c_buffer[1] = NULL; -_tpm_clean_response1: - kzfree(platform_data->tpm_i2c_buffer[0]); - platform_data->tpm_i2c_buffer[0] = NULL; 
_tpm_clean_answer: tpm_remove_hardware(chip->dev); end: @@ -765,28 +761,12 @@ end: */ static int tpm_st33_i2c_remove(struct i2c_client *client) { - struct tpm_chip *chip = (struct tpm_chip *)i2c_get_clientdata(client); - struct st33zp24_platform_data *pin_infos = - ((struct i2c_client *)TPM_VPRIV(chip))->dev.platform_data; - - if (pin_infos != NULL) { - free_irq(pin_infos->io_serirq, chip); - - gpio_free(pin_infos->io_serirq); - gpio_free(pin_infos->io_lpcpd); + struct tpm_chip *chip = + (struct tpm_chip *) i2c_get_clientdata(client); + if (chip) tpm_remove_hardware(chip->dev); - if (pin_infos->tpm_i2c_buffer[1] != NULL) { - kzfree(pin_infos->tpm_i2c_buffer[1]); - pin_infos->tpm_i2c_buffer[1] = NULL; - } - if (pin_infos->tpm_i2c_buffer[0] != NULL) { - kzfree(pin_infos->tpm_i2c_buffer[0]); - pin_infos->tpm_i2c_buffer[0] = NULL; - } - } - return 0; } diff --git a/drivers/char/tpm/tpm_i2c_stm_st33.h b/drivers/char/tpm/tpm_i2c_stm_st33.h deleted file mode 100644 index 3041271342eb..000000000000 --- a/drivers/char/tpm/tpm_i2c_stm_st33.h +++ /dev/null @@ -1,44 +0,0 @@ -/* - * STMicroelectronics TPM I2C Linux driver for TPM ST33ZP24 - * Copyright (C) 2009, 2010 STMicroelectronics - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - * - * STMicroelectronics version 1.2.0, Copyright (C) 2010 - * STMicroelectronics comes with ABSOLUTELY NO WARRANTY. - * This is free software, and you are welcome to redistribute it - * under certain conditions. - * - * @Author: Christophe RICARD tpmsupport@st.com - * - * @File: stm_st33_tpm_i2c.h - * - * @Date: 09/15/2010 - */ -#ifndef __STM_ST33_TPM_I2C_MAIN_H__ -#define __STM_ST33_TPM_I2C_MAIN_H__ - -#define TPM_ST33_I2C "st33zp24_i2c" - -struct st33zp24_platform_data { - int io_serirq; - int io_lpcpd; - struct i2c_client *client; - u8 *tpm_i2c_buffer[2]; /* 0 Request 1 Response */ - struct completion irq_detection; - struct mutex lock; -}; - -#endif /* __STM_ST33_TPM_I2C_MAIN_H__ */ diff --git a/include/linux/platform_data/tpm_i2c_stm_st33.h b/include/linux/platform_data/tpm_i2c_stm_st33.h new file mode 100644 index 000000000000..88f9cb18113a --- /dev/null +++ b/include/linux/platform_data/tpm_i2c_stm_st33.h @@ -0,0 +1,40 @@ +/* + * STMicroelectronics TPM I2C Linux driver for TPM ST33ZP24 + * Copyright (C) 2009, 2010 STMicroelectronics + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + * + * STMicroelectronics version 1.2.0, Copyright (C) 2010 + * STMicroelectronics comes with ABSOLUTELY NO WARRANTY. + * This is free software, and you are welcome to redistribute it + * under certain conditions. + * + * @Author: Christophe RICARD tpmsupport@st.com + * + * @File: stm_st33_tpm_i2c.h + * + * @Date: 09/15/2010 + */ +#ifndef __STM_ST33_TPM_I2C_MAIN_H__ +#define __STM_ST33_TPM_I2C_MAIN_H__ + + +#define TPM_ST33_I2C "st33zp24_i2c" + +struct st33zp24_platform_data { + int io_serirq; + int io_lpcpd; +}; + +#endif /* __STM_ST33_TPM_I2C_MAIN_H__ */ -- cgit v1.2.3 From 76182b6b008b694d095aec1e2eb6c07fae787128 Mon Sep 17 00:00:00 2001 From: Christophe Ricard Date: Mon, 1 Dec 2014 19:32:52 +0100 Subject: tpm/tpm_i2c_stm_st33: Remove reference to io_serirq The serirq gpio pin is used only as an interrupt. After driver initialization, the serirq signal is always used through its interrupt and never through the gpio kernel API. The irq can then be initialized when the platform data is defined, via the client->irq pin. Reviewed-by: Jason Gunthorpe Signed-off-by: Christophe Ricard Signed-off-by: Peter Huewe --- drivers/char/tpm/tpm_i2c_stm_st33.c | 13 +++---------- include/linux/platform_data/tpm_i2c_stm_st33.h | 1 - 2 files changed, 3 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/tpm/tpm_i2c_stm_st33.c b/drivers/char/tpm/tpm_i2c_stm_st33.c index 4f725386385f..728611638d1c 100644 --- a/drivers/char/tpm/tpm_i2c_stm_st33.c +++ b/drivers/char/tpm/tpm_i2c_stm_st33.c @@ -103,7 +103,6 @@ struct tpm_stm_dev { struct completion irq_detection; struct tpm_chip *chip; u8 buf[TPM_BUFSIZE + 1]; - int io_serirq; int io_lpcpd; }; @@ -688,18 +687,15 @@ tpm_st33_i2c_probe(struct i2c_client *client, const struct i2c_device_id *id) err = -ENODEV; goto _tpm_clean_answer; } - err = gpio_request(platform_data->io_serirq, "TPM IO_SERIRQ"); - if (err) - goto _gpio_init2; clear_interruption(tpm_dev); - err = request_irq(gpio_to_irq(platform_data->io_serirq), + err = request_irq(client->irq, &tpm_ioserirq_handler, IRQF_TRIGGER_HIGH, "TPM SERIRQ management", chip); if (err < 0) { dev_err(chip->dev , "TPM SERIRQ signals %d not available\n", - gpio_to_irq(platform_data->io_serirq)); + client->irq); goto _irq_set; } @@ -739,10 +735,7 @@ tpm_st33_i2c_probe(struct i2c_client *client, const struct i2c_device_id *id) dev_info(chip->dev, "TPM I2C Initialized\n"); return 0; _irq_set: - free_irq(gpio_to_irq(platform_data->io_serirq), (void *)chip); -_gpio_init2: - if (interrupts) - gpio_free(platform_data->io_serirq); _gpio_init1: if (power_mgt) gpio_free(platform_data->io_lpcpd); diff --git a/include/linux/platform_data/tpm_i2c_stm_st33.h b/include/linux/platform_data/tpm_i2c_stm_st33.h index 88f9cb18113a..85775cf5f9a5 100644 --- a/include/linux/platform_data/tpm_i2c_stm_st33.h +++ b/include/linux/platform_data/tpm_i2c_stm_st33.h @@ -33,7 +33,6 @@ #define TPM_ST33_I2C "st33zp24_i2c" struct st33zp24_platform_data { - int io_serirq; int io_lpcpd; }; -- cgit v1.2.3 From 3eda7d0ea3a0365aa72a2007f9450f314d92f065 Mon Sep 17 00:00:00 2001 From: Christophe Ricard Date: Tue, 13 Jan 2015 23:13:13 +0100 Subject: tpm/tpm_i2c_stm_st33: Change tpm_i2c_stm_st33.h to tpm_stm_st33.h include/linux/platform_data/tpm_i2c_stm_st33.h can be used by other ST33 TPM device drivers that do not use the I2C protocol.
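To make the point concrete, here is a hedged sketch of a non-I2C consumer. Only the shared header and struct st33zp24_platform_data come from this series; the SPI function name below is hypothetical and not part of any patch here:

/* Hypothetical sketch: an SPI front-end reusing the bus-agnostic
 * platform data. Function name is made up for illustration. */
#include <linux/gpio.h>
#include <linux/spi/spi.h>
#include <linux/platform_data/tpm_stm_st33.h>

static int st33zp24_spi_request_lpcpd(struct spi_device *dev)
{
	struct st33zp24_platform_data *pdata = dev->dev.platform_data;

	if (!pdata)
		return -ENODEV;

	/* io_lpcpd carries no I2C-specific state, so it is reused as-is */
	return gpio_request_one(pdata->io_lpcpd, GPIOF_OUT_INIT_HIGH,
				"TPM IO_LPCPD");
}

Keeping only bus-agnostic fields such as io_lpcpd in the platform data is what makes this kind of reuse possible.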
Reviewed-by: Jason Gunthorpe Signed-off-by: Christophe Ricard Reviewed-by: Peter Huewe Signed-off-by: Peter Huewe --- drivers/char/tpm/tpm_i2c_stm_st33.c | 2 +- include/linux/platform_data/tpm_i2c_stm_st33.h | 39 -------------------------- include/linux/platform_data/tpm_stm_st33.h | 39 ++++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 40 deletions(-) delete mode 100644 include/linux/platform_data/tpm_i2c_stm_st33.h create mode 100644 include/linux/platform_data/tpm_stm_st33.h (limited to 'include/linux') diff --git a/drivers/char/tpm/tpm_i2c_stm_st33.c b/drivers/char/tpm/tpm_i2c_stm_st33.c index 86a24ced66c0..dbab8d0d875e 100644 --- a/drivers/char/tpm/tpm_i2c_stm_st33.c +++ b/drivers/char/tpm/tpm_i2c_stm_st33.c @@ -50,7 +50,7 @@ #include #include -#include +#include #include "tpm.h" #define TPM_ACCESS 0x0 diff --git a/include/linux/platform_data/tpm_i2c_stm_st33.h b/include/linux/platform_data/tpm_i2c_stm_st33.h deleted file mode 100644 index 85775cf5f9a5..000000000000 --- a/include/linux/platform_data/tpm_i2c_stm_st33.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * STMicroelectronics TPM I2C Linux driver for TPM ST33ZP24 - * Copyright (C) 2009, 2010 STMicroelectronics - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . - * - * STMicroelectronics version 1.2.0, Copyright (C) 2010 - * STMicroelectronics comes with ABSOLUTELY NO WARRANTY. - * This is free software, and you are welcome to redistribute it - * under certain conditions. - * - * @Author: Christophe RICARD tpmsupport@st.com - * - * @File: stm_st33_tpm_i2c.h - * - * @Date: 09/15/2010 - */ -#ifndef __STM_ST33_TPM_I2C_MAIN_H__ -#define __STM_ST33_TPM_I2C_MAIN_H__ - - -#define TPM_ST33_I2C "st33zp24_i2c" - -struct st33zp24_platform_data { - int io_lpcpd; -}; - -#endif /* __STM_ST33_TPM_I2C_MAIN_H__ */ diff --git a/include/linux/platform_data/tpm_stm_st33.h b/include/linux/platform_data/tpm_stm_st33.h new file mode 100644 index 000000000000..ff75310c0f47 --- /dev/null +++ b/include/linux/platform_data/tpm_stm_st33.h @@ -0,0 +1,39 @@ +/* + * STMicroelectronics TPM I2C Linux driver for TPM ST33ZP24 + * Copyright (C) 2009, 2010 STMicroelectronics + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + * + * STMicroelectronics version 1.2.0, Copyright (C) 2010 + * STMicroelectronics comes with ABSOLUTELY NO WARRANTY. 
+ * This is free software, and you are welcome to redistribute it + * under certain conditions. + * + * @Author: Christophe RICARD tpmsupport@st.com + * + * @File: stm_st33_tpm.h + * + * @Date: 09/15/2010 + */ +#ifndef __STM_ST33_TPM_H__ +#define __STM_ST33_TPM_H__ + +#define TPM_ST33_I2C "st33zp24-i2c" +#define TPM_ST33_SPI "st33zp24-spi" + +struct st33zp24_platform_data { + int io_lpcpd; +}; + +#endif /* __STM_ST33_TPM_H__ */ -- cgit v1.2.3 From 4e3c021fb995bcbb5d1f814d00584cb80eb904a8 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 5 Jan 2015 10:52:40 +0100 Subject: clk: Add clk_unregister_{divider, gate, mux} to close memory leak The common clk_register_{divider,gate,mux} functions allocated memory for internal data which wasn't freed anywhere. Drivers using these helpers could only unregister clocks but the memory would still leak. Add corresponding unregister functions which will release all resources. Signed-off-by: Krzysztof Kozlowski Reviewed-by: Stephen Boyd Signed-off-by: Michael Turquette --- drivers/clk/clk-divider.c | 16 ++++++++++++++++ drivers/clk/clk-gate.c | 16 ++++++++++++++++ drivers/clk/clk-mux.c | 16 ++++++++++++++++ include/linux/clk-provider.h | 4 ++++ 4 files changed, 52 insertions(+) (limited to 'include/linux') diff --git a/drivers/clk/clk-divider.c b/drivers/clk/clk-divider.c index c0a842b335c5..c2bb9f679ec6 100644 --- a/drivers/clk/clk-divider.c +++ b/drivers/clk/clk-divider.c @@ -463,3 +463,19 @@ struct clk *clk_register_divider_table(struct device *dev, const char *name, width, clk_divider_flags, table, lock); } EXPORT_SYMBOL_GPL(clk_register_divider_table); + +void clk_unregister_divider(struct clk *clk) +{ + struct clk_divider *div; + struct clk_hw *hw; + + hw = __clk_get_hw(clk); + if (!hw) + return; + + div = to_clk_divider(hw); + + clk_unregister(clk); + kfree(div); +} +EXPORT_SYMBOL_GPL(clk_unregister_divider); diff --git a/drivers/clk/clk-gate.c b/drivers/clk/clk-gate.c index 51fd87fb7ba6..186b96efeebf 100644 --- a/drivers/clk/clk-gate.c +++ b/drivers/clk/clk-gate.c @@ -162,3 +162,19 @@ struct clk *clk_register_gate(struct device *dev, const char *name, return clk; } EXPORT_SYMBOL_GPL(clk_register_gate); + +void clk_unregister_gate(struct clk *clk) +{ + struct clk_gate *gate; + struct clk_hw *hw; + + hw = __clk_get_hw(clk); + if (!hw) + return; + + gate = to_clk_gate(hw); + + clk_unregister(clk); + kfree(gate); +} +EXPORT_SYMBOL_GPL(clk_unregister_gate); diff --git a/drivers/clk/clk-mux.c b/drivers/clk/clk-mux.c index 6e1ecf94bf58..69a094c3783d 100644 --- a/drivers/clk/clk-mux.c +++ b/drivers/clk/clk-mux.c @@ -177,3 +177,19 @@ struct clk *clk_register_mux(struct device *dev, const char *name, NULL, lock); } EXPORT_SYMBOL_GPL(clk_register_mux); + +void clk_unregister_mux(struct clk *clk) +{ + struct clk_mux *mux; + struct clk_hw *hw; + + hw = __clk_get_hw(clk); + if (!hw) + return; + + mux = to_clk_mux(hw); + + clk_unregister(clk); + kfree(mux); +} +EXPORT_SYMBOL_GPL(clk_unregister_mux); diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index d936409520f8..ebb7055a6d84 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -294,6 +294,7 @@ struct clk *clk_register_gate(struct device *dev, const char *name, const char *parent_name, unsigned long flags, void __iomem *reg, u8 bit_idx, u8 clk_gate_flags, spinlock_t *lock); +void clk_unregister_gate(struct clk *clk); struct clk_div_table { unsigned int val; @@ -361,6 +362,7 @@ struct clk *clk_register_divider_table(struct device *dev, const char 
*name, void __iomem *reg, u8 shift, u8 width, u8 clk_divider_flags, const struct clk_div_table *table, spinlock_t *lock); +void clk_unregister_divider(struct clk *clk); /** * struct clk_mux - multiplexer clock @@ -411,6 +413,8 @@ struct clk *clk_register_mux_table(struct device *dev, const char *name, void __iomem *reg, u8 shift, u32 mask, u8 clk_mux_flags, u32 *table, spinlock_t *lock); +void clk_unregister_mux(struct clk *clk); + void of_fixed_factor_clk_setup(struct device_node *node); /** -- cgit v1.2.3 From 3aeb66176ffa8fefd7a9f7d37bda1d8adcf469a1 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 15 Jan 2015 23:49:37 +0100 Subject: net: replace br_fdb_external_learn_* calls with switchdev notifier events This patch benefits from newly introduced switchdev notifier and uses it to propagate fdb learn events from rocker driver to bridge. That avoids direct function calls and possible use by other listeners (ovs). Suggested-by: Thomas Graf Signed-off-by: Jiri Pirko Signed-off-by: Scott Feldman Signed-off-by: David S. Miller --- drivers/net/ethernet/rocker/rocker.c | 10 +++++-- include/linux/if_bridge.h | 18 ------------- include/net/switchdev.h | 11 ++++++++ net/bridge/br.c | 52 +++++++++++++++++++++++++++++++++++- net/bridge/br_fdb.c | 38 +++----------------------- net/bridge/br_private.h | 4 +++ 6 files changed, 78 insertions(+), 55 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c index cad8cf962cdf..964d719b150f 100644 --- a/drivers/net/ethernet/rocker/rocker.c +++ b/drivers/net/ethernet/rocker/rocker.c @@ -3026,11 +3026,17 @@ static void rocker_port_fdb_learn_work(struct work_struct *work) container_of(work, struct rocker_fdb_learn_work, work); bool removing = (lw->flags & ROCKER_OP_FLAG_REMOVE); bool learned = (lw->flags & ROCKER_OP_FLAG_LEARNED); + struct netdev_switch_notifier_fdb_info info; + + info.addr = lw->addr; + info.vid = lw->vid; if (learned && removing) - br_fdb_external_learn_del(lw->dev, lw->addr, lw->vid); + call_netdev_switch_notifiers(NETDEV_SWITCH_FDB_DEL, + lw->dev, &info.info); else if (learned && !removing) - br_fdb_external_learn_add(lw->dev, lw->addr, lw->vid); + call_netdev_switch_notifiers(NETDEV_SWITCH_FDB_ADD, + lw->dev, &info.info); kfree(work); } diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 0a8ce762a47f..a57bca2ea97e 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -50,24 +50,6 @@ extern void brioctl_set(int (*ioctl_hook)(struct net *, unsigned int, void __use typedef int br_should_route_hook_t(struct sk_buff *skb); extern br_should_route_hook_t __rcu *br_should_route_hook; -#if IS_ENABLED(CONFIG_BRIDGE) -int br_fdb_external_learn_add(struct net_device *dev, - const unsigned char *addr, u16 vid); -int br_fdb_external_learn_del(struct net_device *dev, - const unsigned char *addr, u16 vid); -#else -static inline int br_fdb_external_learn_add(struct net_device *dev, - const unsigned char *addr, u16 vid) -{ - return 0; -} -static inline int br_fdb_external_learn_del(struct net_device *dev, - const unsigned char *addr, u16 vid) -{ - return 0; -} -#endif - #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_BRIDGE_IGMP_SNOOPING) int br_multicast_list_adjacent(struct net_device *dev, struct list_head *br_ip_list); diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 7f8d74372d87..201120e18e4d 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -13,10 +13,21 @@ #include #include +enum 
netdev_switch_notifier_type { + NETDEV_SWITCH_FDB_ADD = 1, + NETDEV_SWITCH_FDB_DEL, +}; + struct netdev_switch_notifier_info { struct net_device *dev; }; +struct netdev_switch_notifier_fdb_info { + struct netdev_switch_notifier_info info; /* must be first */ + const unsigned char *addr; + u16 vid; +}; + static inline struct net_device * netdev_switch_notifier_info_to_dev(const struct netdev_switch_notifier_info *info) { diff --git a/net/bridge/br.c b/net/bridge/br.c index 44425aff7cba..fb57ab6b24f9 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "br_private.h" @@ -120,6 +121,48 @@ static struct notifier_block br_device_notifier = { .notifier_call = br_device_event }; +static int br_netdev_switch_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_switch_notifier_info_to_dev(ptr); + struct net_bridge_port *p; + struct net_bridge *br; + struct netdev_switch_notifier_fdb_info *fdb_info; + int err = NOTIFY_DONE; + + rtnl_lock(); + p = br_port_get_rtnl(dev); + if (!p) + goto out; + + br = p->br; + + switch (event) { + case NETDEV_SWITCH_FDB_ADD: + fdb_info = ptr; + err = br_fdb_external_learn_add(br, p, fdb_info->addr, + fdb_info->vid); + if (err) + err = notifier_from_errno(err); + break; + case NETDEV_SWITCH_FDB_DEL: + fdb_info = ptr; + err = br_fdb_external_learn_del(br, p, fdb_info->addr, + fdb_info->vid); + if (err) + err = notifier_from_errno(err); + break; + } + +out: + rtnl_unlock(); + return err; +} + +static struct notifier_block br_netdev_switch_notifier = { + .notifier_call = br_netdev_switch_event, +}; + static void __net_exit br_net_exit(struct net *net) { struct net_device *dev; @@ -169,10 +212,14 @@ static int __init br_init(void) if (err) goto err_out3; - err = br_netlink_init(); + err = register_netdev_switch_notifier(&br_netdev_switch_notifier); if (err) goto err_out4; + err = br_netlink_init(); + if (err) + goto err_out5; + brioctl_set(br_ioctl_deviceless_stub); #if IS_ENABLED(CONFIG_ATM_LANE) @@ -185,6 +232,8 @@ static int __init br_init(void) return 0; +err_out5: + unregister_netdev_switch_notifier(&br_netdev_switch_notifier); err_out4: unregister_netdevice_notifier(&br_device_notifier); err_out3: @@ -202,6 +251,7 @@ static void __exit br_deinit(void) { stp_proto_unregister(&br_stp_proto); br_netlink_fini(); + unregister_netdev_switch_notifier(&br_netdev_switch_notifier); unregister_netdevice_notifier(&br_device_notifier); brioctl_set(NULL); unregister_pernet_subsys(&br_net_ops); diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index e6e0372bc3cd..03667e65cc29 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -990,26 +990,14 @@ void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p) } } -int br_fdb_external_learn_add(struct net_device *dev, +int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 vid) { - struct net_bridge_port *p; - struct net_bridge *br; struct hlist_head *head; struct net_bridge_fdb_entry *fdb; int err = 0; - rtnl_lock(); - - p = br_port_get_rtnl(dev); - if (!p) { - pr_info("bridge: %s not a bridge port\n", dev->name); - err = -EINVAL; - goto err_rtnl_unlock; - } - - br = p->br; - + ASSERT_RTNL(); spin_lock_bh(&br->hash_lock); head = &br->hash[br_mac_hash(addr, vid)]; @@ -1034,33 +1022,18 @@ int br_fdb_external_learn_add(struct net_device *dev, err_unlock: spin_unlock_bh(&br->hash_lock); -err_rtnl_unlock: - rtnl_unlock(); return err; } 
-EXPORT_SYMBOL(br_fdb_external_learn_add); -int br_fdb_external_learn_del(struct net_device *dev, +int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 vid) { - struct net_bridge_port *p; - struct net_bridge *br; struct hlist_head *head; struct net_bridge_fdb_entry *fdb; int err = 0; - rtnl_lock(); - - p = br_port_get_rtnl(dev); - if (!p) { - pr_info("bridge: %s not a bridge port\n", dev->name); - err = -EINVAL; - goto err_rtnl_unlock; - } - - br = p->br; - + ASSERT_RTNL(); spin_lock_bh(&br->hash_lock); head = &br->hash[br_mac_hash(addr, vid)]; @@ -1071,9 +1044,6 @@ int br_fdb_external_learn_del(struct net_device *dev, err = -ENOENT; spin_unlock_bh(&br->hash_lock); -err_rtnl_unlock: - rtnl_unlock(); return err; } -EXPORT_SYMBOL(br_fdb_external_learn_del); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index d808d766334d..e8e3f3681680 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -402,6 +402,10 @@ int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *fdev, int idx); int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p); void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p); +int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, + const unsigned char *addr, u16 vid); +int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p, + const unsigned char *addr, u16 vid); /* br_forward.c */ void br_deliver(const struct net_bridge_port *to, struct sk_buff *skb); -- cgit v1.2.3 From 0d5484b1c3db8a3870c6100deeb4678594433b2c Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Wed, 29 Oct 2014 00:30:58 +0200 Subject: dmaengine: Move dma_get_slave_caps() implementation to dmaengine.c The function is too big to be a static inline. Signed-off-by: Laurent Pinchart Signed-off-by: Vinod Koul --- drivers/dma/dmaengine.c | 33 +++++++++++++++++++++++++++++++++ include/linux/dmaengine.h | 32 +------------------------------- 2 files changed, 34 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 30211f9791b7..f15712f2fec6 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -479,6 +479,39 @@ static void dma_channel_rebalance(void) } } +int dma_get_slave_caps(struct dma_chan *chan, struct dma_slave_caps *caps) +{ + struct dma_device *device; + + if (!chan || !caps) + return -EINVAL; + + device = chan->device; + + /* check if the channel supports slave transactions */ + if (!test_bit(DMA_SLAVE, device->cap_mask.bits)) + return -ENXIO; + + /* + * Check whether it reports it uses the generic slave + * capabilities, if not, that means it doesn't support any + * kind of slave capabilities reporting. 
+ */ + if (!device->directions) + return -ENXIO; + + caps->src_addr_widths = device->src_addr_widths; + caps->dst_addr_widths = device->dst_addr_widths; + caps->directions = device->directions; + caps->residue_granularity = device->residue_granularity; + + caps->cmd_pause = !!device->device_pause; + caps->cmd_terminate = !!device->device_terminate_all; + + return 0; +} +EXPORT_SYMBOL_GPL(dma_get_slave_caps); + static struct dma_chan *private_candidate(const dma_cap_mask_t *mask, struct dma_device *dev, dma_filter_fn fn, void *fn_param) diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 6d34ce91036c..1b4842bb3890 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -758,37 +758,7 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_sg( src_sg, src_nents, flags); } -static inline int dma_get_slave_caps(struct dma_chan *chan, struct dma_slave_caps *caps) -{ - struct dma_device *device; - - if (!chan || !caps) - return -EINVAL; - - device = chan->device; - - /* check if the channel supports slave transactions */ - if (!test_bit(DMA_SLAVE, device->cap_mask.bits)) - return -ENXIO; - - /* - * Check whether it reports it uses the generic slave - * capabilities, if not, that means it doesn't support any - * kind of slave capabilities reporting. - */ - if (!device->directions) - return -ENXIO; - - caps->src_addr_widths = device->src_addr_widths; - caps->dst_addr_widths = device->dst_addr_widths; - caps->directions = device->directions; - caps->residue_granularity = device->residue_granularity; - - caps->cmd_pause = !!device->device_pause; - caps->cmd_terminate = !!device->device_terminate_all; - - return 0; -} +int dma_get_slave_caps(struct dma_chan *chan, struct dma_slave_caps *caps); static inline int dmaengine_terminate_all(struct dma_chan *chan) { -- cgit v1.2.3 From f8c58c1136349fdfa9b605c501f2f911622d3a9a Mon Sep 17 00:00:00 2001 From: Doug Anderson Date: Tue, 2 Dec 2014 15:42:47 -0800 Subject: mmc: dw_mmc: Protect read-modify-write of INTMASK with a lock We're running into cases where our enabling of the SDIO interrupt in dw_mmc doesn't actually take effect. Specifically, adding patch like this: +++ b/drivers/mmc/host/dw_mmc.c @@ -1076,6 +1076,9 @@ static void dw_mci_enable_sdio_irq(struct mmc_host *mmc, int enb) mci_writel(host, INTMASK, (int_mask | SDMMC_INT_SDIO(slot->id))); + int_mask = mci_readl(host, INTMASK); + if (!(int_mask & SDMMC_INT_SDIO(slot->id))) + dev_err(&mmc->class_dev, "failed to enable sdio irq\n"); } else { ...actually triggers the error message. That's because the dw_mci_enable_sdio_irq() unsafely does a read-modify-write of the INTMASK register. We can't just use the standard host->lock since that lock is not irq safe and mmc_signal_sdio_irq() (called from interrupt context) calls dw_mci_enable_sdio_irq(). Add a new irq-safe lock to protect INTMASK. An alternate solution to this is to punt mmc_signal_sdio_irq() to the tasklet and then protect INTMASK modifications by the standard host lock. This seemed like a bit more of a high-latency change. 
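Reduced to its essence, the bug is a lost update on a shared register. A minimal sketch of the irq-safe read-modify-write pattern the patch adopts (generic names, not actual dw_mmc code):

/* Generic sketch of an irq-safe read-modify-write on a shared register. */
#include <linux/io.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(intmask_lock);

static void set_reg_bits(void __iomem *reg, u32 bits)
{
	unsigned long flags;
	u32 val;

	/* irqsave form: callers may run in both task and irq context */
	spin_lock_irqsave(&intmask_lock, flags);
	val = readl(reg);		/* read			*/
	writel(val | bits, reg);	/* modify + write under the lock */
	spin_unlock_irqrestore(&intmask_lock, flags);
}

Without the lock, a writer racing in between the readl() and the writel() has its update silently overwritten, which is exactly the missing-SDIO-interrupt symptom described above.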
Reported-by: Bing Zhao Signed-off-by: Doug Anderson Reviewed-by: James Hogan Signed-off-by: Ulf Hansson --- drivers/mmc/host/dw_mmc.c | 13 +++++++++++++ include/linux/mmc/dw_mmc.h | 6 ++++++ 2 files changed, 19 insertions(+) (limited to 'include/linux') diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c index ae10a021765c..64ea04274913 100644 --- a/drivers/mmc/host/dw_mmc.c +++ b/drivers/mmc/host/dw_mmc.c @@ -759,6 +759,7 @@ disable: static int dw_mci_submit_data_dma(struct dw_mci *host, struct mmc_data *data) { + unsigned long irqflags; int sg_len; u32 temp; @@ -795,9 +796,11 @@ static int dw_mci_submit_data_dma(struct dw_mci *host, struct mmc_data *data) mci_writel(host, CTRL, temp); /* Disable RX/TX IRQs, let DMA handle it */ + spin_lock_irqsave(&host->irq_lock, irqflags); temp = mci_readl(host, INTMASK); temp &= ~(SDMMC_INT_RXDR | SDMMC_INT_TXDR); mci_writel(host, INTMASK, temp); + spin_unlock_irqrestore(&host->irq_lock, irqflags); host->dma_ops->start(host, sg_len); @@ -806,6 +809,7 @@ static int dw_mci_submit_data_dma(struct dw_mci *host, struct mmc_data *data) static void dw_mci_submit_data(struct dw_mci *host, struct mmc_data *data) { + unsigned long irqflags; u32 temp; data->error = -EINPROGRESS; @@ -834,9 +838,12 @@ static void dw_mci_submit_data(struct dw_mci *host, struct mmc_data *data) host->part_buf_count = 0; mci_writel(host, RINTSTS, SDMMC_INT_TXDR | SDMMC_INT_RXDR); + + spin_lock_irqsave(&host->irq_lock, irqflags); temp = mci_readl(host, INTMASK); temp |= SDMMC_INT_TXDR | SDMMC_INT_RXDR; mci_writel(host, INTMASK, temp); + spin_unlock_irqrestore(&host->irq_lock, irqflags); temp = mci_readl(host, CTRL); temp &= ~SDMMC_CTRL_DMA_ENABLE; @@ -1284,8 +1291,11 @@ static void dw_mci_enable_sdio_irq(struct mmc_host *mmc, int enb) { struct dw_mci_slot *slot = mmc_priv(mmc); struct dw_mci *host = slot->host; + unsigned long irqflags; u32 int_mask; + spin_lock_irqsave(&host->irq_lock, irqflags); + /* Enable/disable Slot Specific SDIO interrupt */ int_mask = mci_readl(host, INTMASK); if (enb) @@ -1293,6 +1303,8 @@ static void dw_mci_enable_sdio_irq(struct mmc_host *mmc, int enb) else int_mask &= ~SDMMC_INT_SDIO(slot->sdio_id); mci_writel(host, INTMASK, int_mask); + + spin_unlock_irqrestore(&host->irq_lock, irqflags); } static int dw_mci_execute_tuning(struct mmc_host *mmc, u32 opcode) @@ -2661,6 +2673,7 @@ int dw_mci_probe(struct dw_mci *host) host->quirks = host->pdata->quirks; spin_lock_init(&host->lock); + spin_lock_init(&host->irq_lock); INIT_LIST_HEAD(&host->queue); /* diff --git a/include/linux/mmc/dw_mmc.h b/include/linux/mmc/dw_mmc.h index 42b724e8d503..471fb3116dbe 100644 --- a/include/linux/mmc/dw_mmc.h +++ b/include/linux/mmc/dw_mmc.h @@ -106,6 +106,11 @@ struct mmc_data; * @cur_slot, @mrq and @state. These must always be updated * at the same time while holding @lock. * + * @irq_lock is an irq-safe spinlock protecting the INTMASK register + * to allow the interrupt handler to modify it directly. Held for only long + * enough to read-modify-write INTMASK and no other locks are grabbed when + * holding this one. + * * The @mrq field of struct dw_mci_slot is also protected by @lock, * and must always be written at the same time as the slot is added to * @queue. 
@@ -125,6 +130,7 @@ struct mmc_data; */ struct dw_mci { spinlock_t lock; + spinlock_t irq_lock; void __iomem *regs; struct scatterlist *sg; -- cgit v1.2.3 From 61bd8a04b379e4fefbc0b0d06941d3f91071bb8c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 12 Dec 2014 08:43:08 +1100 Subject: mmc: omap_hsmmc: remove prepare/complete system suspend support. The only function of these 'prepare' and 'complete' is to disable the 'card detect' irq during suspend. The commit which added this, commit a48ce884d5819d5df2cf1139ab3c43f8e9e419b3 mmc: omap_hsmmc: Introduce omap_hsmmc_prepare/complete justified it by the need to avoid the registration of new devices during suspend. However mmc_pm_notify will set ->rescan_disable in the 'prepare' stage and clear it in the 'complete' stage, so no card detection will actually happen. Also the interrupt will be disabled before final suspend as part of common suspend processing. So this disabling of the interrupt is unnecessary, and interferes with a transition to using common code for card-detect management. Cc: Felipe Balbi Cc: Venkatraman S Cc: Chris Ball Signed-off-by: NeilBrown Signed-off-by: Ulf Hansson --- drivers/mmc/host/omap_hsmmc.c | 50 ---------------------------------- include/linux/platform_data/mmc-omap.h | 4 --- 2 files changed, 54 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c index 7c71dcdcba8b..537cba8f1de1 100644 --- a/drivers/mmc/host/omap_hsmmc.c +++ b/drivers/mmc/host/omap_hsmmc.c @@ -275,31 +275,6 @@ static int omap_hsmmc_get_cover_state(struct device *dev) return !gpio_get_value_cansleep(mmc->switch_pin); } -#ifdef CONFIG_PM - -static int omap_hsmmc_suspend_cdirq(struct device *dev) -{ - struct omap_hsmmc_host *host = dev_get_drvdata(dev); - - disable_irq(host->card_detect_irq); - return 0; -} - -static int omap_hsmmc_resume_cdirq(struct device *dev) -{ - struct omap_hsmmc_host *host = dev_get_drvdata(dev); - - enable_irq(host->card_detect_irq); - return 0; -} - -#else - -#define omap_hsmmc_suspend_cdirq NULL -#define omap_hsmmc_resume_cdirq NULL - -#endif - #ifdef CONFIG_REGULATOR static int omap_hsmmc_set_power(struct device *dev, int power_on, int vdd) @@ -2234,8 +2209,6 @@ static int omap_hsmmc_probe(struct platform_device *pdev) "Unable to grab MMC CD IRQ\n"); goto err_irq_cd; } - host->suspend = omap_hsmmc_suspend_cdirq; - host->resume = omap_hsmmc_resume_cdirq; } omap_hsmmc_disable_irq(host); @@ -2322,25 +2295,6 @@ static int omap_hsmmc_remove(struct platform_device *pdev) } #ifdef CONFIG_PM -static int omap_hsmmc_prepare(struct device *dev) -{ - struct omap_hsmmc_host *host = dev_get_drvdata(dev); - - if (host->suspend) - return host->suspend(dev); - - return 0; -} - -static void omap_hsmmc_complete(struct device *dev) -{ - struct omap_hsmmc_host *host = dev_get_drvdata(dev); - - if (host->resume) - host->resume(dev); - -} - static int omap_hsmmc_suspend(struct device *dev) { struct omap_hsmmc_host *host = dev_get_drvdata(dev); @@ -2398,8 +2352,6 @@ static int omap_hsmmc_resume(struct device *dev) } #else -#define omap_hsmmc_prepare NULL -#define omap_hsmmc_complete NULL #define omap_hsmmc_suspend NULL #define omap_hsmmc_resume NULL #endif @@ -2484,8 +2436,6 @@ static int omap_hsmmc_runtime_resume(struct device *dev) static struct dev_pm_ops omap_hsmmc_dev_pm_ops = { .suspend = omap_hsmmc_suspend, .resume = omap_hsmmc_resume, - .prepare = omap_hsmmc_prepare, - .complete = omap_hsmmc_complete, .runtime_suspend = omap_hsmmc_runtime_suspend, .runtime_resume = 
omap_hsmmc_runtime_resume, }; diff --git a/include/linux/platform_data/mmc-omap.h b/include/linux/platform_data/mmc-omap.h index 5c188f4e9bec..929469291406 100644 --- a/include/linux/platform_data/mmc-omap.h +++ b/include/linux/platform_data/mmc-omap.h @@ -31,10 +31,6 @@ struct omap_mmc_platform_data { void (*cleanup)(struct device *dev); void (*shutdown)(struct device *dev); - /* To handle board related suspend/resume functionality for MMC */ - int (*suspend)(struct device *dev, int slot); - int (*resume)(struct device *dev, int slot); - /* Return context loss count due to PM states changing */ int (*get_context_loss_count)(struct device *dev); -- cgit v1.2.3 From eddbc3abc5bf11bdfc92ef84fd97ec4d379b7278 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Thu, 18 Dec 2014 15:44:32 +0100 Subject: mmc: slot-gpio: Remove option to explicitly free requested CD/WP GPIOs The slot-gpio code uses the devm_* managed functions. Still, it provides APIs to explicitly free requested CD/WP GPIOs, but these APIs aren't being used. Therefore let's simplify slot-gpio by removing these unused APIs. If it later turns out we need some of them, we can always consider restoring the code. Signed-off-by: Ulf Hansson --- drivers/mmc/core/slot-gpio.c | 92 ++----------------------------------------- include/linux/mmc/slot-gpio.h | 3 -- 2 files changed, 4 insertions(+), 91 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/core/slot-gpio.c b/drivers/mmc/core/slot-gpio.c index 69bbf2adb329..a428f10747d2 100644 --- a/drivers/mmc/core/slot-gpio.c +++ b/drivers/mmc/core/slot-gpio.c @@ -103,11 +103,7 @@ EXPORT_SYMBOL(mmc_gpio_get_cd); * @gpio: gpio number requested * * As devm_* managed functions are used in mmc_gpio_request_ro(), client - * drivers do not need to explicitly call mmc_gpio_free_ro() for freeing up, - * if the requesting and freeing are only needed at probing and unbinding time - * for once. However, if client drivers do something special like runtime - * switching for write-protection, they are responsible for calling - * mmc_gpio_request_ro() and mmc_gpio_free_ro() as a pair on their own. + * drivers do not need to worry about freeing up memory. * * Returns zero on success, else an error. */ @@ -178,11 +174,7 @@ EXPORT_SYMBOL(mmc_gpiod_request_cd_irq); * @debounce: debounce time in microseconds * * As devm_* managed functions are used in mmc_gpio_request_cd(), client - * drivers do not need to explicitly call mmc_gpio_free_cd() for freeing up, - * if the requesting and freeing are only needed at probing and unbinding time - * for once. However, if client drivers do something special like runtime - * switching for card-detection, they are responsible for calling - * mmc_gpio_request_cd() and mmc_gpio_free_cd() as a pair on their own. + * drivers do not need to worry about freeing up memory. * * If GPIO debouncing is desired, set the debounce parameter to a non-zero * value. The caller is responsible for ensuring that the GPIO driver associated @@ -225,55 +217,6 @@ int mmc_gpio_request_cd(struct mmc_host *host, unsigned int gpio, } EXPORT_SYMBOL(mmc_gpio_request_cd); -/** - * mmc_gpio_free_ro - free the write-protection gpio - * @host: mmc host - * - * It's provided only for cases that client drivers need to manually free - * up the write-protection gpio requested by mmc_gpio_request_ro().
- */ -void mmc_gpio_free_ro(struct mmc_host *host) -{ - struct mmc_gpio *ctx = host->slot.handler_priv; - int gpio; - - if (!ctx || !ctx->ro_gpio) - return; - - gpio = desc_to_gpio(ctx->ro_gpio); - ctx->ro_gpio = NULL; - - devm_gpio_free(&host->class_dev, gpio); -} -EXPORT_SYMBOL(mmc_gpio_free_ro); - -/** - * mmc_gpio_free_cd - free the card-detection gpio - * @host: mmc host - * - * It's provided only for cases that client drivers need to manually free - * up the card-detection gpio requested by mmc_gpio_request_cd(). - */ -void mmc_gpio_free_cd(struct mmc_host *host) -{ - struct mmc_gpio *ctx = host->slot.handler_priv; - int gpio; - - if (!ctx || !ctx->cd_gpio) - return; - - if (host->slot.cd_irq >= 0) { - devm_free_irq(&host->class_dev, host->slot.cd_irq, host); - host->slot.cd_irq = -EINVAL; - } - - gpio = desc_to_gpio(ctx->cd_gpio); - ctx->cd_gpio = NULL; - - devm_gpio_free(&host->class_dev, gpio); -} -EXPORT_SYMBOL(mmc_gpio_free_cd); - /** * mmc_gpiod_request_cd - request a gpio descriptor for card-detection * @host: mmc host @@ -285,8 +228,7 @@ EXPORT_SYMBOL(mmc_gpio_free_cd); * to NULL to ignore * * Use this function in place of mmc_gpio_request_cd() to use the GPIO - * descriptor API. Note that it is paired with mmc_gpiod_free_cd() not - * mmc_gpio_free_cd(). Note also that it must be called prior to mmc_add_host() + * descriptor API. Note that it must be called prior to mmc_add_host() * otherwise the caller must also call mmc_gpiod_request_cd_irq(). * * Returns zero on success, else an error. @@ -339,8 +281,7 @@ EXPORT_SYMBOL(mmc_gpiod_request_cd); * set to NULL to ignore * * Use this function in place of mmc_gpio_request_ro() to use the GPIO - * descriptor API. Note that it is paired with mmc_gpiod_free_ro() not - * mmc_gpio_free_ro(). + * descriptor API. * * Returns zero on success, else an error. */ @@ -380,28 +321,3 @@ int mmc_gpiod_request_ro(struct mmc_host *host, const char *con_id, return 0; } EXPORT_SYMBOL(mmc_gpiod_request_ro); - -/** - * mmc_gpiod_free_cd - free the card-detection gpio descriptor - * @host: mmc host - * - * It's provided only for cases that client drivers need to manually free - * up the card-detection gpio requested by mmc_gpiod_request_cd(). 
- */ -void mmc_gpiod_free_cd(struct mmc_host *host) -{ - struct mmc_gpio *ctx = host->slot.handler_priv; - - if (!ctx || !ctx->cd_gpio) - return; - - if (host->slot.cd_irq >= 0) { - devm_free_irq(&host->class_dev, host->slot.cd_irq, host); - host->slot.cd_irq = -EINVAL; - } - - devm_gpiod_put(host->parent, ctx->cd_gpio); - - ctx->cd_gpio = NULL; -} -EXPORT_SYMBOL(mmc_gpiod_free_cd); diff --git a/include/linux/mmc/slot-gpio.h b/include/linux/mmc/slot-gpio.h index e56fa24c9322..4a36d6954631 100644 --- a/include/linux/mmc/slot-gpio.h +++ b/include/linux/mmc/slot-gpio.h @@ -15,12 +15,10 @@ struct mmc_host; int mmc_gpio_get_ro(struct mmc_host *host); int mmc_gpio_request_ro(struct mmc_host *host, unsigned int gpio); -void mmc_gpio_free_ro(struct mmc_host *host); int mmc_gpio_get_cd(struct mmc_host *host); int mmc_gpio_request_cd(struct mmc_host *host, unsigned int gpio, unsigned int debounce); -void mmc_gpio_free_cd(struct mmc_host *host); int mmc_gpiod_request_cd(struct mmc_host *host, const char *con_id, unsigned int idx, bool override_active_level, @@ -28,7 +26,6 @@ int mmc_gpiod_request_cd(struct mmc_host *host, const char *con_id, int mmc_gpiod_request_ro(struct mmc_host *host, const char *con_id, unsigned int idx, bool override_active_level, unsigned int debounce, bool *gpio_invert); -void mmc_gpiod_free_cd(struct mmc_host *host); void mmc_gpiod_request_cd_irq(struct mmc_host *host); #endif -- cgit v1.2.3 From df8aca162e5ff2b20c7a4de3e64e5b96ff838ab0 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Thu, 18 Dec 2014 15:44:36 +0100 Subject: mmc: slot-gpio: Rework how to handle allocation of slot-gpio data By moving the allocation of the slot-gpio data into mmc_alloc_host(), we can remove the slot-gpio internal calls to mmc_gpio_alloc(). This means mmc_gpio_alloc() now has only one caller left, which consequently allows us to simplify and remove some of the slot-gpio code. Additionally, this makes the slot-gpio mutex redundant, so let's remove it.
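The shape of the rework is allocate-once-at-creation. A hedged, generic sketch of that pattern (hypothetical names, heavily simplified from the actual mmc_gpio_alloc()/mmc_alloc_host() changes below):

/* Hypothetical sketch: allocate per-slot data once, at host creation. */
#include <linux/device.h>
#include <linux/slab.h>

struct slot_ctx {
	int cd_irq;
	char labels[];		/* room for cd/ro label strings */
};

static struct slot_ctx *slot_ctx_alloc(struct device *parent, size_t label_len)
{
	/*
	 * One allocation up front: the later request functions can assume
	 * the context already exists, so no mutex is needed to serialize
	 * its creation.
	 */
	struct slot_ctx *ctx = devm_kzalloc(parent,
					    sizeof(*ctx) + label_len,
					    GFP_KERNEL);

	if (ctx)
		ctx->cd_irq = -EINVAL;
	return ctx;
}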
Signed-off-by: Ulf Hansson --- drivers/mmc/core/host.c | 20 +++++++------- drivers/mmc/core/slot-gpio.c | 62 ++++++++++---------------------------------- include/linux/mmc/host.h | 2 -- 3 files changed, 23 insertions(+), 61 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index fcb7f06373cf..07636449b4de 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -29,6 +29,7 @@ #include "core.h" #include "host.h" +#include "slot-gpio.h" #define cls_dev_to_mmc_host(d) container_of(d, struct mmc_host, class_dev) @@ -38,7 +39,6 @@ static DEFINE_SPINLOCK(mmc_host_lock); static void mmc_host_classdev_release(struct device *dev) { struct mmc_host *host = cls_dev_to_mmc_host(dev); - mutex_destroy(&host->slot.lock); spin_lock(&mmc_host_lock); idr_remove(&mmc_host_idr, host->index); spin_unlock(&mmc_host_lock); @@ -478,8 +478,10 @@ struct mmc_host *mmc_alloc_host(int extra, struct device *dev) host->index = err; spin_unlock(&mmc_host_lock); idr_preload_end(); - if (err < 0) - goto free; + if (err < 0) { + kfree(host); + return NULL; + } dev_set_name(&host->class_dev, "mmc%d", host->index); @@ -488,10 +490,12 @@ struct mmc_host *mmc_alloc_host(int extra, struct device *dev) host->class_dev.class = &mmc_host_class; device_initialize(&host->class_dev); - mmc_host_clk_init(host); + if (mmc_gpio_alloc(host)) { + put_device(&host->class_dev); + return NULL; + } - mutex_init(&host->slot.lock); - host->slot.cd_irq = -EINVAL; + mmc_host_clk_init(host); spin_lock_init(&host->lock); init_waitqueue_head(&host->wq); @@ -512,10 +516,6 @@ struct mmc_host *mmc_alloc_host(int extra, struct device *dev) host->max_blk_count = PAGE_CACHE_SIZE / 512; return host; - -free: - kfree(host); - return NULL; } EXPORT_SYMBOL(mmc_alloc_host); diff --git a/drivers/mmc/core/slot-gpio.c b/drivers/mmc/core/slot-gpio.c index ec918c27e77f..1a3edbd47719 100644 --- a/drivers/mmc/core/slot-gpio.c +++ b/drivers/mmc/core/slot-gpio.c @@ -43,29 +43,17 @@ static irqreturn_t mmc_gpio_cd_irqt(int irq, void *dev_id) int mmc_gpio_alloc(struct mmc_host *host) { size_t len = strlen(dev_name(host->parent)) + 4; - struct mmc_gpio *ctx; - - mutex_lock(&host->slot.lock); - - ctx = host->slot.handler_priv; - if (!ctx) { - /* - * devm_kzalloc() can be called after device_initialize(), even - * before device_add(), i.e., between mmc_alloc_host() and - * mmc_add_host() - */ - ctx = devm_kzalloc(host->parent, sizeof(*ctx) + 2 * len, - GFP_KERNEL); - if (ctx) { - ctx->ro_label = ctx->cd_label + len; - snprintf(ctx->cd_label, len, "%s cd", dev_name(host->parent)); - snprintf(ctx->ro_label, len, "%s ro", dev_name(host->parent)); - host->slot.handler_priv = ctx; - } + struct mmc_gpio *ctx = devm_kzalloc(host->parent, + sizeof(*ctx) + 2 * len, GFP_KERNEL); + + if (ctx) { + ctx->ro_label = ctx->cd_label + len; + snprintf(ctx->cd_label, len, "%s cd", dev_name(host->parent)); + snprintf(ctx->ro_label, len, "%s ro", dev_name(host->parent)); + host->slot.handler_priv = ctx; + host->slot.cd_irq = -EINVAL; } - mutex_unlock(&host->slot.lock); - return ctx ? 
0 : -ENOMEM; } @@ -111,18 +99,12 @@ EXPORT_SYMBOL(mmc_gpio_get_cd); */ int mmc_gpio_request_ro(struct mmc_host *host, unsigned int gpio) { - struct mmc_gpio *ctx; + struct mmc_gpio *ctx = host->slot.handler_priv; int ret; if (!gpio_is_valid(gpio)) return -EINVAL; - ret = mmc_gpio_alloc(host); - if (ret < 0) - return ret; - - ctx = host->slot.handler_priv; - ret = devm_gpio_request_one(host->parent, gpio, GPIOF_DIR_IN, ctx->ro_label); if (ret < 0) @@ -187,15 +169,9 @@ EXPORT_SYMBOL(mmc_gpiod_request_cd_irq); int mmc_gpio_request_cd(struct mmc_host *host, unsigned int gpio, unsigned int debounce) { - struct mmc_gpio *ctx; + struct mmc_gpio *ctx = host->slot.handler_priv; int ret; - ret = mmc_gpio_alloc(host); - if (ret < 0) - return ret; - - ctx = host->slot.handler_priv; - ret = devm_gpio_request_one(host->parent, gpio, GPIOF_DIR_IN, ctx->cd_label); if (ret < 0) @@ -239,16 +215,10 @@ int mmc_gpiod_request_cd(struct mmc_host *host, const char *con_id, unsigned int idx, bool override_active_level, unsigned int debounce, bool *gpio_invert) { - struct mmc_gpio *ctx; + struct mmc_gpio *ctx = host->slot.handler_priv; struct gpio_desc *desc; int ret; - ret = mmc_gpio_alloc(host); - if (ret < 0) - return ret; - - ctx = host->slot.handler_priv; - if (!con_id) con_id = ctx->cd_label; @@ -291,16 +261,10 @@ int mmc_gpiod_request_ro(struct mmc_host *host, const char *con_id, unsigned int idx, bool override_active_level, unsigned int debounce, bool *gpio_invert) { - struct mmc_gpio *ctx; + struct mmc_gpio *ctx = host->slot.handler_priv; struct gpio_desc *desc; int ret; - ret = mmc_gpio_alloc(host); - if (ret < 0) - return ret; - - ctx = host->slot.handler_priv; - if (!con_id) con_id = ctx->ro_label; diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 9f322706f7cb..b6bf718c3498 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -166,7 +166,6 @@ struct mmc_async_req { * struct mmc_slot - MMC slot functions * * @cd_irq: MMC/SD-card slot hotplug detection IRQ or -EINVAL - * @lock: protect the @handler_priv pointer * @handler_priv: MMC/SD-card slot context * * Some MMC/SD host controllers implement slot-functions like card and @@ -176,7 +175,6 @@ struct mmc_async_req { */ struct mmc_slot { int cd_irq; - struct mutex lock; void *handler_priv; }; -- cgit v1.2.3 From 04cdbbfa73ebac57a30ec2ebebfd7e9342bbdc44 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Mon, 1 Dec 2014 16:53:34 +0100 Subject: mmc: core: Make tuning block patterns static Since previous patches removed the need for the tuning block patterns to be exported, let's move them close to the mmc_send_tuning() API. Those are now intended to be used only by the mmc core. 
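With the patterns private to the core, host drivers are expected to go through mmc_send_tuning() rather than compare buffers themselves. A hedged sketch of such a tuning loop follows (driver names are hypothetical; this assumes the single-argument mmc_send_tuning(host) form this series builds on):

/* Hypothetical host driver tuning loop built on the core helper. */
#include <linux/mmc/core.h>
#include <linux/mmc/host.h>

#define MY_SDHC_NUM_PHASES 8	/* hypothetical controller constant */

static void my_sdhc_set_sample_phase(struct mmc_host *mmc, int phase)
{
	/* hypothetical: program the controller's sample clock phase */
}

static int my_sdhc_execute_tuning(struct mmc_host *mmc, u32 opcode)
{
	int phase, err = -EIO;

	for (phase = 0; phase < MY_SDHC_NUM_PHASES; phase++) {
		my_sdhc_set_sample_phase(mmc, phase);
		/* the core sends the tuning command and matches the pattern */
		err = mmc_send_tuning(mmc);
		if (!err)
			break;
	}
	return err;
}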
Signed-off-by: Ulf Hansson Reviewed-by: Stephen Boyd Acked-by: Jaehoon Chung --- drivers/mmc/core/mmc.c | 32 -------------------------------- drivers/mmc/core/mmc_ops.c | 30 ++++++++++++++++++++++++++++++ include/linux/mmc/mmc.h | 5 ----- 3 files changed, 30 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index d854bff037a2..0b8ec87fc517 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -1155,38 +1155,6 @@ bus_speed: return err; } -const u8 tuning_blk_pattern_4bit[MMC_TUNING_BLK_PATTERN_4BIT_SIZE] = { - 0xff, 0x0f, 0xff, 0x00, 0xff, 0xcc, 0xc3, 0xcc, - 0xc3, 0x3c, 0xcc, 0xff, 0xfe, 0xff, 0xfe, 0xef, - 0xff, 0xdf, 0xff, 0xdd, 0xff, 0xfb, 0xff, 0xfb, - 0xbf, 0xff, 0x7f, 0xff, 0x77, 0xf7, 0xbd, 0xef, - 0xff, 0xf0, 0xff, 0xf0, 0x0f, 0xfc, 0xcc, 0x3c, - 0xcc, 0x33, 0xcc, 0xcf, 0xff, 0xef, 0xff, 0xee, - 0xff, 0xfd, 0xff, 0xfd, 0xdf, 0xff, 0xbf, 0xff, - 0xbb, 0xff, 0xf7, 0xff, 0xf7, 0x7f, 0x7b, 0xde, -}; -EXPORT_SYMBOL(tuning_blk_pattern_4bit); - -const u8 tuning_blk_pattern_8bit[MMC_TUNING_BLK_PATTERN_8BIT_SIZE] = { - 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x00, 0x00, - 0xff, 0xff, 0xcc, 0xcc, 0xcc, 0x33, 0xcc, 0xcc, - 0xcc, 0x33, 0x33, 0xcc, 0xcc, 0xcc, 0xff, 0xff, - 0xff, 0xee, 0xff, 0xff, 0xff, 0xee, 0xee, 0xff, - 0xff, 0xff, 0xdd, 0xff, 0xff, 0xff, 0xdd, 0xdd, - 0xff, 0xff, 0xff, 0xbb, 0xff, 0xff, 0xff, 0xbb, - 0xbb, 0xff, 0xff, 0xff, 0x77, 0xff, 0xff, 0xff, - 0x77, 0x77, 0xff, 0x77, 0xbb, 0xdd, 0xee, 0xff, - 0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x00, - 0x00, 0xff, 0xff, 0xcc, 0xcc, 0xcc, 0x33, 0xcc, - 0xcc, 0xcc, 0x33, 0x33, 0xcc, 0xcc, 0xcc, 0xff, - 0xff, 0xff, 0xee, 0xff, 0xff, 0xff, 0xee, 0xee, - 0xff, 0xff, 0xff, 0xdd, 0xff, 0xff, 0xff, 0xdd, - 0xdd, 0xff, 0xff, 0xff, 0xbb, 0xff, 0xff, 0xff, - 0xbb, 0xbb, 0xff, 0xff, 0xff, 0x77, 0xff, 0xff, - 0xff, 0x77, 0x77, 0xff, 0x77, 0xbb, 0xdd, 0xee, -}; -EXPORT_SYMBOL(tuning_blk_pattern_8bit); - /* * Execute tuning sequence to seek the proper bus operating * conditions for HS200 and HS400, which sends CMD21 to the device. 
diff --git a/drivers/mmc/core/mmc_ops.c b/drivers/mmc/core/mmc_ops.c index 3b044c5b029c..0ea042dc7443 100644 --- a/drivers/mmc/core/mmc_ops.c +++ b/drivers/mmc/core/mmc_ops.c @@ -23,6 +23,36 @@ #define MMC_OPS_TIMEOUT_MS (10 * 60 * 1000) /* 10 minute timeout */ +static const u8 tuning_blk_pattern_4bit[] = { + 0xff, 0x0f, 0xff, 0x00, 0xff, 0xcc, 0xc3, 0xcc, + 0xc3, 0x3c, 0xcc, 0xff, 0xfe, 0xff, 0xfe, 0xef, + 0xff, 0xdf, 0xff, 0xdd, 0xff, 0xfb, 0xff, 0xfb, + 0xbf, 0xff, 0x7f, 0xff, 0x77, 0xf7, 0xbd, 0xef, + 0xff, 0xf0, 0xff, 0xf0, 0x0f, 0xfc, 0xcc, 0x3c, + 0xcc, 0x33, 0xcc, 0xcf, 0xff, 0xef, 0xff, 0xee, + 0xff, 0xfd, 0xff, 0xfd, 0xdf, 0xff, 0xbf, 0xff, + 0xbb, 0xff, 0xf7, 0xff, 0xf7, 0x7f, 0x7b, 0xde, +}; + +static const u8 tuning_blk_pattern_8bit[] = { + 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x00, 0x00, + 0xff, 0xff, 0xcc, 0xcc, 0xcc, 0x33, 0xcc, 0xcc, + 0xcc, 0x33, 0x33, 0xcc, 0xcc, 0xcc, 0xff, 0xff, + 0xff, 0xee, 0xff, 0xff, 0xff, 0xee, 0xee, 0xff, + 0xff, 0xff, 0xdd, 0xff, 0xff, 0xff, 0xdd, 0xdd, + 0xff, 0xff, 0xff, 0xbb, 0xff, 0xff, 0xff, 0xbb, + 0xbb, 0xff, 0xff, 0xff, 0x77, 0xff, 0xff, 0xff, + 0x77, 0x77, 0xff, 0x77, 0xbb, 0xdd, 0xee, 0xff, + 0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x00, + 0x00, 0xff, 0xff, 0xcc, 0xcc, 0xcc, 0x33, 0xcc, + 0xcc, 0xcc, 0x33, 0x33, 0xcc, 0xcc, 0xcc, 0xff, + 0xff, 0xff, 0xee, 0xff, 0xff, 0xff, 0xee, 0xee, + 0xff, 0xff, 0xff, 0xdd, 0xff, 0xff, 0xff, 0xdd, + 0xdd, 0xff, 0xff, 0xff, 0xbb, 0xff, 0xff, 0xff, + 0xbb, 0xbb, 0xff, 0xff, 0xff, 0x77, 0xff, 0xff, + 0xff, 0x77, 0x77, 0xff, 0x77, 0xbb, 0xdd, 0xee, +}; + static inline int __mmc_send_status(struct mmc_card *card, u32 *status, bool ignore_crc) { diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h index 49ad7a943638..fb97b5cc91cd 100644 --- a/include/linux/mmc/mmc.h +++ b/include/linux/mmc/mmc.h @@ -53,11 +53,6 @@ #define MMC_SEND_TUNING_BLOCK 19 /* adtc R1 */ #define MMC_SEND_TUNING_BLOCK_HS200 21 /* adtc R1 */ -#define MMC_TUNING_BLK_PATTERN_4BIT_SIZE 64 -#define MMC_TUNING_BLK_PATTERN_8BIT_SIZE 128 -extern const u8 tuning_blk_pattern_4bit[MMC_TUNING_BLK_PATTERN_4BIT_SIZE]; -extern const u8 tuning_blk_pattern_8bit[MMC_TUNING_BLK_PATTERN_8BIT_SIZE]; - /* class 3 */ #define MMC_WRITE_DAT_UNTIL_STOP 20 /* adtc [31:0] data addr R1 */ -- cgit v1.2.3 From 348487cb28e66b032bae1b38424d81bf5b444408 Mon Sep 17 00:00:00 2001 From: Haibo Chen Date: Tue, 9 Dec 2014 17:04:05 +0800 Subject: mmc: sdhci: use pipeline mmc requests to improve performance This patch is based on the patches by Per Forlin, Tony Lin and Ryan QIAN. This patch complete the API 'post_req' and 'pre_req' in sdhci host side, Test Env: 1. i.MX6Q-SABREAUTO board, CPU @ 996MHz, use ADMA in uSDHC controller. 2. Test command: $ echo 1 > /proc/sys/vm/drop_caches write to sd card: $ dd if=/dev/zero of=/dev/mmcblk0 bs=1M count=2000 conv=fsync read the sd card: $ dd if=/dev/mmcblk0 of=/dev/null bs=1M count=2000 3. TOSHIBA 16GB SD3.0 card, running at 4 bit, SDR104 @ 198MHZ Performance with and without this patch: ------------------------------------------------- | | read speed | write speed | |------------------------------------------------ | with this patch | ~76.7 MB/s | ~23.3 MB/s | |------------------------------------------------ |without this patch | ~60.5 MB/s | ~22.5 MB/s | ------------------------------------------------- 4. 
SanDisk 8GB SD3.0 card, running at 4 bit, DDR50 @ 50MHz Performance with and without this patch: ------------------------------------------------- | | read speed | write speed | |------------------------------------------------ | with this patch | ~40.5 MB/s | ~15.6 MB/s | |------------------------------------------------ |without this patch | ~36.1 MB/s | ~14.1 MB/s | ------------------------------------------------- 5. Kingston 8GB SD2.0 card, running at 4 bit, High-speed @ 50MHz Performance with and without this patch: ------------------------------------------------- | | read speed | write speed | |------------------------------------------------ | with this patch | ~22.7 MB/s | ~8.2 MB/s | |------------------------------------------------ |without this patch | ~21.3 MB/s | ~8.0 MB/s | ------------------------------------------------- 6. SanDisk 8GB eMMC on i.MX6DL-sabresd board, CPU @ 792MHz, eMMC running at 8 bit, DDR52 @ 52MHz. Performance with and without this patch: ------------------------------------------------- | | read speed | write speed | |------------------------------------------------ | with this patch | ~37.3 MB/s | ~10.5 MB/s | |------------------------------------------------ |without this patch | ~33.4 MB/s | ~10.5 MB/s | ------------------------------------------------- Signed-off-by: Haibo Chen Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci.c | 99 +++++++++++++++++++++++++++++++++++++++++------ include/linux/mmc/sdhci.h | 6 +++ 2 files changed, 93 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 2558d705c92e..1aab8a120d02 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -53,6 +53,9 @@ static void sdhci_finish_command(struct sdhci_host *); static int sdhci_execute_tuning(struct mmc_host *mmc, u32 opcode); static void sdhci_tuning_timer(unsigned long data); static void sdhci_enable_preset_value(struct sdhci_host *host, bool enable); +static int sdhci_pre_dma_transfer(struct sdhci_host *host, + struct mmc_data *data, + struct sdhci_host_next *next); #ifdef CONFIG_PM static int sdhci_runtime_pm_get(struct sdhci_host *host); @@ -505,9 +508,8 @@ static int sdhci_adma_table_pre(struct sdhci_host *host, goto fail; BUG_ON(host->align_addr & host->align_mask); - host->sg_count = dma_map_sg(mmc_dev(host->mmc), - data->sg, data->sg_len, direction); - if (host->sg_count == 0) + host->sg_count = sdhci_pre_dma_transfer(host, data, NULL); + if (host->sg_count < 0) goto unmap_align; desc = host->adma_table; @@ -645,8 +647,9 @@ static void sdhci_adma_table_post(struct sdhci_host *host, } } - dma_unmap_sg(mmc_dev(host->mmc), data->sg, - data->sg_len, direction); + if (!data->host_cookie) + dma_unmap_sg(mmc_dev(host->mmc), data->sg, + data->sg_len, direction); } static u8 sdhci_calc_timeout(struct sdhci_host *host, struct mmc_command *cmd) @@ -842,11 +845,7 @@ static void sdhci_prepare_data(struct sdhci_host *host, struct mmc_command *cmd) } else { int sg_cnt; - sg_cnt = dma_map_sg(mmc_dev(host->mmc), - data->sg, data->sg_len, - (data->flags & MMC_DATA_READ) ? - DMA_FROM_DEVICE : - DMA_TO_DEVICE); + sg_cnt = sdhci_pre_dma_transfer(host, data, NULL); if (sg_cnt == 0) { /* * This only happens when someone fed @@ -959,8 +958,10 @@ static void sdhci_finish_data(struct sdhci_host *host) if (host->flags & SDHCI_USE_ADMA) sdhci_adma_table_post(host, data); else { - dma_unmap_sg(mmc_dev(host->mmc), data->sg, - data->sg_len, (data->flags & MMC_DATA_READ) ?
+ if (!data->host_cookie) + dma_unmap_sg(mmc_dev(host->mmc), + data->sg, data->sg_len, + (data->flags & MMC_DATA_READ) ? DMA_FROM_DEVICE : DMA_TO_DEVICE); } } @@ -2125,6 +2126,77 @@ static void sdhci_enable_preset_value(struct sdhci_host *host, bool enable) } } +static void sdhci_post_req(struct mmc_host *mmc, struct mmc_request *mrq, + int err) +{ + struct sdhci_host *host = mmc_priv(mmc); + struct mmc_data *data = mrq->data; + + if (host->flags & SDHCI_REQ_USE_DMA) { + if (data->host_cookie) + dma_unmap_sg(mmc_dev(host->mmc), data->sg, data->sg_len, + data->flags & MMC_DATA_WRITE ? + DMA_TO_DEVICE : DMA_FROM_DEVICE); + mrq->data->host_cookie = 0; + } +} + +static int sdhci_pre_dma_transfer(struct sdhci_host *host, + struct mmc_data *data, + struct sdhci_host_next *next) +{ + int sg_count; + + if (!next && data->host_cookie && + data->host_cookie != host->next_data.cookie) { + pr_debug(DRIVER_NAME "[%s] invalid cookie: %d, next-cookie %d\n", + __func__, data->host_cookie, host->next_data.cookie); + data->host_cookie = 0; + } + + /* Check if next job is already prepared */ + if (next || + (!next && data->host_cookie != host->next_data.cookie)) { + sg_count = dma_map_sg(mmc_dev(host->mmc), data->sg, + data->sg_len, + data->flags & MMC_DATA_WRITE ? + DMA_TO_DEVICE : DMA_FROM_DEVICE); + + } else { + sg_count = host->next_data.sg_count; + host->next_data.sg_count = 0; + } + + + if (sg_count == 0) + return -EINVAL; + + if (next) { + next->sg_count = sg_count; + data->host_cookie = ++next->cookie < 0 ? 1 : next->cookie; + } else + host->sg_count = sg_count; + + return sg_count; +} + +static void sdhci_pre_req(struct mmc_host *mmc, struct mmc_request *mrq, + bool is_first_req) +{ + struct sdhci_host *host = mmc_priv(mmc); + + if (mrq->data->host_cookie) { + mrq->data->host_cookie = 0; + return; + } + + if (host->flags & SDHCI_REQ_USE_DMA) + if (sdhci_pre_dma_transfer(host, + mrq->data, + &host->next_data) < 0) + mrq->data->host_cookie = 0; +} + static void sdhci_card_event(struct mmc_host *mmc) { struct sdhci_host *host = mmc_priv(mmc); @@ -2158,6 +2230,8 @@ static void sdhci_card_event(struct mmc_host *mmc) static const struct mmc_host_ops sdhci_ops = { .request = sdhci_request, + .post_req = sdhci_post_req, + .pre_req = sdhci_pre_req, .set_ios = sdhci_set_ios, .get_cd = sdhci_get_cd, .get_ro = sdhci_get_ro, @@ -3015,6 +3089,7 @@ int sdhci_add_host(struct sdhci_host *host) host->max_clk = host->ops->get_max_clock(host); } + host->next_data.cookie = 1; /* * In case of Host Controller v3.00, find out whether clock * multiplier is supported. diff --git a/include/linux/mmc/sdhci.h b/include/linux/mmc/sdhci.h index f767a0de611f..cb8b94ff6a26 100644 --- a/include/linux/mmc/sdhci.h +++ b/include/linux/mmc/sdhci.h @@ -17,6 +17,11 @@ #include #include +struct sdhci_host_next { + unsigned int sg_count; + s32 cookie; +}; + struct sdhci_host { /* Data set by hardware interface driver */ const char *hw_name; /* Hardware bus name */ @@ -203,6 +208,7 @@ struct sdhci_host { #define SDHCI_TUNING_MODE_1 0 struct timer_list tuning_timer; /* Timer for tuning */ + struct sdhci_host_next next_data; unsigned long private[0] ____cacheline_aligned; }; #endif /* LINUX_MMC_SDHCI_H */ -- cgit v1.2.3 From 83533ab28380f6957af39a7b322e639e42dbdaf1 Mon Sep 17 00:00:00 2001 From: Johan Rudholm Date: Mon, 12 Jan 2015 15:38:04 +0100 Subject: mmc: core: always check status after reset Always check if the card is alive after a successful reset. 
This allows us to remove mmc_hw_reset_check(), leaving mmc_hw_reset() as the only card reset interface. Signed-off-by: Johan Rudholm Signed-off-by: Ulf Hansson --- drivers/mmc/card/mmc_test.c | 18 +++++++----------- drivers/mmc/core/core.c | 24 +++++------------------- include/linux/mmc/core.h | 1 - 3 files changed, 12 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/card/mmc_test.c b/drivers/mmc/card/mmc_test.c index 0a7430f94d29..7dac4695163b 100644 --- a/drivers/mmc/card/mmc_test.c +++ b/drivers/mmc/card/mmc_test.c @@ -2342,20 +2342,16 @@ static int mmc_test_hw_reset(struct mmc_test_card *test) struct mmc_host *host = card->host; int err; - err = mmc_hw_reset_check(host); + if (!mmc_card_mmc(card) || !mmc_can_reset(card)) + return RESULT_UNSUP_CARD; + + err = mmc_hw_reset(host); if (!err) return RESULT_OK; + else if (err == -EOPNOTSUPP) + return RESULT_UNSUP_HOST; - if (err == -ENOSYS) - return RESULT_FAIL; - - if (err != -EOPNOTSUPP) - return err; - - if (!mmc_can_reset(card)) - return RESULT_UNSUP_CARD; - - return RESULT_UNSUP_HOST; + return RESULT_FAIL; } static const struct mmc_test_case mmc_test_cases[] = { diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index d3bfbdfab052..72070f188cc5 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -2286,9 +2286,10 @@ int mmc_can_reset(struct mmc_card *card) } EXPORT_SYMBOL(mmc_can_reset); -static int mmc_do_hw_reset(struct mmc_host *host, int check) +int mmc_hw_reset(struct mmc_host *host) { struct mmc_card *card = host->card; + u32 status; if (!(host->caps & MMC_CAP_HW_RESET) || !host->ops->hw_reset) return -EOPNOTSUPP; @@ -2305,13 +2306,9 @@ static int mmc_do_hw_reset(struct mmc_host *host, int check) host->ops->hw_reset(host); /* If the reset has happened, then a status command will fail */ - if (check) { - u32 status; - - if (!mmc_send_status(card, &status)) { - mmc_host_clk_release(host); - return -ENOSYS; - } + if (!mmc_send_status(card, &status)) { + mmc_host_clk_release(host); + return -ENOSYS; } /* Set initial state and call mmc_set_ios */ @@ -2321,19 +2318,8 @@ static int mmc_do_hw_reset(struct mmc_host *host, int check) return host->bus_ops->power_restore(host); } - -int mmc_hw_reset(struct mmc_host *host) -{ - return mmc_do_hw_reset(host, 0); -} EXPORT_SYMBOL(mmc_hw_reset); -int mmc_hw_reset_check(struct mmc_host *host) -{ - return mmc_do_hw_reset(host, 1); -} -EXPORT_SYMBOL(mmc_hw_reset_check); - static int mmc_rescan_try_freq(struct mmc_host *host, unsigned freq) { host->f_init = freq; diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h index cb2b0400d284..160448f920ac 100644 --- a/include/linux/mmc/core.h +++ b/include/linux/mmc/core.h @@ -182,7 +182,6 @@ extern int mmc_set_blocklen(struct mmc_card *card, unsigned int blocklen); extern int mmc_set_blockcount(struct mmc_card *card, unsigned int blockcount, bool is_rel_write); extern int mmc_hw_reset(struct mmc_host *host); -extern int mmc_hw_reset_check(struct mmc_host *host); extern int mmc_can_reset(struct mmc_card *card); extern void mmc_set_data_timeout(struct mmc_data *, const struct mmc_card *); -- cgit v1.2.3 From c7ea834d81904b71505093f7ec50d036132cf628 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 13 Jan 2015 08:23:18 +1300 Subject: mmc: slot-gpio: Allow host driver to provide isr for card-detect interrupts One of the reasons omap_hsmmc doesn't use the slot-gpio library is that it has some non-standard functionality in the card-detect interrupt service routine. 
To make it possible for omap_hsmmc (and maybe others) to be converted to use slot-gpio, add 'mmc_gpio_set_cd_isr', which provides an alternate ISR to be registered by the slot-gpio code. Signed-off-by: NeilBrown Signed-off-by: Ulf Hansson --- drivers/mmc/core/slot-gpio.c | 18 +++++++++++++++++- include/linux/mmc/slot-gpio.h | 2 ++ 2 files changed, 19 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/mmc/core/slot-gpio.c b/drivers/mmc/core/slot-gpio.c index 1a3edbd47719..27117ba47073 100644 --- a/drivers/mmc/core/slot-gpio.c +++ b/drivers/mmc/core/slot-gpio.c @@ -25,6 +25,7 @@ struct mmc_gpio { struct gpio_desc *cd_gpio; bool override_ro_active_level; bool override_cd_active_level; + irqreturn_t (*cd_gpio_isr)(int irq, void *dev_id); char *ro_label; char cd_label[0]; }; @@ -136,8 +137,10 @@ void mmc_gpiod_request_cd_irq(struct mmc_host *host) irq = -EINVAL; if (irq >= 0) { + if (!ctx->cd_gpio_isr) + ctx->cd_gpio_isr = mmc_gpio_cd_irqt; ret = devm_request_threaded_irq(host->parent, irq, - NULL, mmc_gpio_cd_irqt, + NULL, ctx->cd_gpio_isr, IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING | IRQF_ONESHOT, ctx->cd_label, host); if (ret < 0) @@ -151,6 +154,19 @@ void mmc_gpiod_request_cd_irq(struct mmc_host *host) } EXPORT_SYMBOL(mmc_gpiod_request_cd_irq); +/* Register an alternate interrupt service routine for + * the card-detect GPIO. + */ +void mmc_gpio_set_cd_isr(struct mmc_host *host, + irqreturn_t (*isr)(int irq, void *dev_id)) +{ + struct mmc_gpio *ctx = host->slot.handler_priv; + + WARN_ON(ctx->cd_gpio_isr); + ctx->cd_gpio_isr = isr; +} +EXPORT_SYMBOL(mmc_gpio_set_cd_isr); + /** * mmc_gpio_request_cd - request a gpio for card-detection * @host: mmc host diff --git a/include/linux/mmc/slot-gpio.h b/include/linux/mmc/slot-gpio.h index 4a36d6954631..3945a8c9d3cb 100644 --- a/include/linux/mmc/slot-gpio.h +++ b/include/linux/mmc/slot-gpio.h @@ -26,6 +26,8 @@ int mmc_gpiod_request_cd(struct mmc_host *host, const char *con_id, int mmc_gpiod_request_ro(struct mmc_host *host, const char *con_id, unsigned int idx, bool override_active_level, unsigned int debounce, bool *gpio_invert); +void mmc_gpio_set_cd_isr(struct mmc_host *host, + irqreturn_t (*isr)(int irq, void *dev_id)); void mmc_gpiod_request_cd_irq(struct mmc_host *host); #endif -- cgit v1.2.3 From fdb8df9933632e177621daf60da74fc693a8c7d1 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Mon, 19 Jan 2015 13:54:27 +0200 Subject: dmaengine: Add dma_get_slave_caps() inline stub when !CONFIG_DMA_ENGINE Commit 0d5484b1c3db8a38 ("dmaengine: Move dma_get_slave_caps() implementation to dmaengine.c") turned the inline dma_get_slave_caps() function into an external function without adding an inline stub for the cases where CONFIG_DMA_ENGINE isn't set. This breaks compilation of drivers using the DMA engine API when CONFIG_DMA_ENGINE isn't set. Add an inline stub to fix compilation.
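As a usage sketch (an editorial illustration, not part of the patch): foo_check_pause() and its error policy below are hypothetical, while dma_get_slave_caps() and struct dma_slave_caps are real dmaengine API. With CONFIG_DMA_ENGINE=n the call now resolves to the new stub instead of breaking the build.

#include <linux/dmaengine.h>

static int foo_check_pause(struct dma_chan *chan)
{
	struct dma_slave_caps caps;
	int ret;

	/* Stubbed to return -ENXIO when CONFIG_DMA_ENGINE is not set */
	ret = dma_get_slave_caps(chan, &caps);
	if (ret)
		return ret;

	return caps.cmd_pause ? 0 : -EINVAL;
}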
Signed-off-by: Laurent Pinchart Fixes: 0d5484b1c3db ("dmaengine: Move dma_get_slave_caps() implementation to dmaengine.c") Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 1b4842bb3890..50745e3a8a3f 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -758,8 +758,6 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_sg( src_sg, src_nents, flags); } -int dma_get_slave_caps(struct dma_chan *chan, struct dma_slave_caps *caps); - static inline int dmaengine_terminate_all(struct dma_chan *chan) { if (chan->device->device_terminate_all) @@ -1048,6 +1046,7 @@ struct dma_chan *dma_request_slave_channel_reason(struct device *dev, const char *name); struct dma_chan *dma_request_slave_channel(struct device *dev, const char *name); void dma_release_channel(struct dma_chan *chan); +int dma_get_slave_caps(struct dma_chan *chan, struct dma_slave_caps *caps); #else static inline struct dma_chan *dma_find_channel(enum dma_transaction_type tx_type) { @@ -1082,6 +1081,11 @@ static inline struct dma_chan *dma_request_slave_channel(struct device *dev, static inline void dma_release_channel(struct dma_chan *chan) { } +static inline int dma_get_slave_caps(struct dma_chan *chan, + struct dma_slave_caps *caps) +{ + return -ENXIO; +} #endif /* --- DMA device --- */ -- cgit v1.2.3 From 927609d622a3773995f84bc03b4564f873cf0e22 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 25 Nov 2014 10:16:39 +0100 Subject: kernel: tighten rules for ACCESS ONCE Now that all non-scalar users of ACCESS_ONCE have been converted to READ_ONCE or ASSIGN_ONCE, let's tighten ACCESS_ONCE to only work on scalar types. This variant was proposed by Alexei Starovoitov. Signed-off-by: Christian Borntraeger Reviewed-by: Paul E. McKenney --- include/linux/compiler.h | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index a1c81f80978e..5e186bf6f6c1 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -447,12 +447,23 @@ static __always_inline void __assign_once_size(volatile void *p, void *res, int * to make the compiler aware of ordering is to put the two invocations of * ACCESS_ONCE() in different C statements. * - * This macro does absolutely -nothing- to prevent the CPU from reordering, - * merging, or refetching absolutely anything at any time. Its main intended - * use is to mediate communication between process-level code and irq/NMI - * handlers, all running on the same CPU. + * ACCESS_ONCE will only work on scalar types. For union types, ACCESS_ONCE + * on a union member will work as long as the size of the member matches the + * size of the union and the size is smaller than word size. + * + * The major use cases of ACCESS_ONCE used to be (1) Mediating communication + * between process-level code and irq/NMI handlers, all running on the same CPU, + * and (2) Ensuring that the compiler does not fold, spindle, or otherwise + * mutilate accesses that either do not require ordering or that interact + * with an explicit memory barrier or atomic instruction that provides the + * required ordering. + + * If possible use READ_ONCE/ASSIGN_ONCE instead.
*/ -#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x)) +#define __ACCESS_ONCE(x) ({ \ + __maybe_unused typeof(x) __var = 0; \ + (volatile typeof(x) *)&(x); }) +#define ACCESS_ONCE(x) (*__ACCESS_ONCE(x)) /* Ignore/forbid kprobes attach on very low level functions marked by this attribute: */ #ifdef CONFIG_KPROBES -- cgit v1.2.3 From c5b19946eb76c67566aae6a84bf2b10ad59295ea Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Mon, 12 Jan 2015 12:13:39 +0100 Subject: kernel: Fix sparse warning for ACCESS_ONCE Commit 927609d622a3 ("kernel: tighten rules for ACCESS ONCE") results in sparse warnings like "Using plain integer as NULL pointer" - Let's add a type cast to the dummy assignment. To avoid warnings like "sparse: warning: cast to restricted __hc32" we also use __force on that cast. Fixes: 927609d622a3 ("kernel: tighten rules for ACCESS ONCE") Signed-off-by: Christian Borntraeger --- include/linux/compiler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 5e186bf6f6c1..7bebf0563d18 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -461,7 +461,7 @@ static __always_inline void __assign_once_size(volatile void *p, void *res, int * If possible use READ_ONCE/ASSIGN_ONCE instead. */ #define __ACCESS_ONCE(x) ({ \ - __maybe_unused typeof(x) __var = 0; \ + __maybe_unused typeof(x) __var = (__force typeof(x)) 0; \ (volatile typeof(x) *)&(x); }) #define ACCESS_ONCE(x) (*__ACCESS_ONCE(x)) -- cgit v1.2.3 From 85b4545629663486b7f71047ce3b54fa0ad3eb28 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 12 Jan 2015 17:51:14 +0000 Subject: iommu: Consolidate IOVA allocator code In order to share the IOVA allocator with other architectures, break the unnecessary dependency on the Intel IOMMU driver and move the remaining IOVA internals to iova.c. Signed-off-by: Robin Murphy Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 33 ++------------------------------- drivers/iommu/iova.c | 35 +++++++++++++++++++++++++++++++++++ include/linux/iova.h | 3 +++ 3 files changed, 40 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 40dfbc0444c0..e758d8ed8fb5 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -485,7 +485,6 @@ __setup("intel_iommu=", intel_iommu_setup); static struct kmem_cache *iommu_domain_cache; static struct kmem_cache *iommu_devinfo_cache; -static struct kmem_cache *iommu_iova_cache; static inline void *alloc_pgtable_page(int node) { @@ -523,16 +522,6 @@ static inline void free_devinfo_mem(void *vaddr) kmem_cache_free(iommu_devinfo_cache, vaddr); } -struct iova *alloc_iova_mem(void) -{ - return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC); -} - -void free_iova_mem(struct iova *iova) -{ - kmem_cache_free(iommu_iova_cache, iova); -} - static inline int domain_type_is_vm(struct dmar_domain *domain) { return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE; @@ -3427,23 +3416,6 @@ static inline int iommu_devinfo_cache_init(void) return ret; } -static inline int iommu_iova_cache_init(void) -{ - int ret = 0; - - iommu_iova_cache = kmem_cache_create("iommu_iova", - sizeof(struct iova), - 0, - SLAB_HWCACHE_ALIGN, - NULL); - if (!iommu_iova_cache) { - printk(KERN_ERR "Couldn't create iova cache\n"); - ret = -ENOMEM; - } - - return ret; -} - static int __init iommu_init_mempool(void) { int ret; @@ -3461,7 +3433,7 @@ static int __init iommu_init_mempool(void)
kmem_cache_destroy(iommu_domain_cache); domain_error: - kmem_cache_destroy(iommu_iova_cache); + iommu_iova_cache_destroy(); return -ENOMEM; } @@ -3470,8 +3442,7 @@ static void __init iommu_exit_mempool(void) { kmem_cache_destroy(iommu_devinfo_cache); kmem_cache_destroy(iommu_domain_cache); - kmem_cache_destroy(iommu_iova_cache); - + iommu_iova_cache_destroy(); } static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev) diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index f6b17e6af2fb..520b8c8ae0c4 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -18,6 +18,41 @@ */ #include +#include + +static struct kmem_cache *iommu_iova_cache; + +int iommu_iova_cache_init(void) +{ + int ret = 0; + + iommu_iova_cache = kmem_cache_create("iommu_iova", + sizeof(struct iova), + 0, + SLAB_HWCACHE_ALIGN, + NULL); + if (!iommu_iova_cache) { + pr_err("Couldn't create iova cache\n"); + ret = -ENOMEM; + } + + return ret; +} + +void iommu_iova_cache_destroy(void) +{ + kmem_cache_destroy(iommu_iova_cache); +} + +struct iova *alloc_iova_mem(void) +{ + return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC); +} + +void free_iova_mem(struct iova *iova) +{ + kmem_cache_free(iommu_iova_cache, iova); +} void init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit) diff --git a/include/linux/iova.h b/include/linux/iova.h index 19e81d5ccb6d..ad0507c61cc7 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -39,6 +39,9 @@ static inline unsigned long iova_size(struct iova *iova) return iova->pfn_hi - iova->pfn_lo + 1; } +int iommu_iova_cache_init(void); +void iommu_iova_cache_destroy(void); + struct iova *alloc_iova_mem(void); void free_iova_mem(struct iova *iova); void free_iova(struct iova_domain *iovad, unsigned long pfn); -- cgit v1.2.3 From 1b72250076dde4276acecf3a7da722b185703e78 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 12 Jan 2015 17:51:15 +0000 Subject: iommu: Make IOVA domain low limit flexible To share the IOVA allocator with other architectures, it needs to accommodate more general aperture restrictions; move the lower limit from a compile-time constant to a runtime domain property to allow IOVA domains with different requirements to co-exist. Also reword the slightly unclear description of alloc_iova since we're touching it anyway. 
Signed-off-by: Robin Murphy Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 9 ++++++--- drivers/iommu/iova.c | 10 ++++++---- include/linux/iova.h | 7 +++---- 3 files changed, 15 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index e758d8ed8fb5..86f9e82b015b 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -71,6 +71,9 @@ __DOMAIN_MAX_PFN(gaw), (unsigned long)-1)) #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT) +/* IO virtual address start page frame number */ +#define IOVA_START_PFN (1) + #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT) #define DMA_32BIT_PFN IOVA_PFN(DMA_BIT_MASK(32)) #define DMA_64BIT_PFN IOVA_PFN(DMA_BIT_MASK(64)) @@ -1632,7 +1635,7 @@ static int dmar_init_reserved_ranges(void) struct iova *iova; int i; - init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN); + init_iova_domain(&reserved_iova_list, IOVA_START_PFN, DMA_32BIT_PFN); lockdep_set_class(&reserved_iova_list.iova_rbtree_lock, &reserved_rbtree_key); @@ -1690,7 +1693,7 @@ static int domain_init(struct dmar_domain *domain, int guest_width) int adjust_width, agaw; unsigned long sagaw; - init_iova_domain(&domain->iovad, DMA_32BIT_PFN); + init_iova_domain(&domain->iovad, IOVA_START_PFN, DMA_32BIT_PFN); domain_reserve_special_ranges(domain); /* calculate AGAW */ @@ -4313,7 +4316,7 @@ static int md_domain_init(struct dmar_domain *domain, int guest_width) { int adjust_width; - init_iova_domain(&domain->iovad, DMA_32BIT_PFN); + init_iova_domain(&domain->iovad, IOVA_START_PFN, DMA_32BIT_PFN); domain_reserve_special_ranges(domain); /* calculate AGAW */ diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index 520b8c8ae0c4..a3dbba8caa19 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -55,11 +55,13 @@ void free_iova_mem(struct iova *iova) } void -init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit) +init_iova_domain(struct iova_domain *iovad, unsigned long start_pfn, + unsigned long pfn_32bit) { spin_lock_init(&iovad->iova_rbtree_lock); iovad->rbroot = RB_ROOT; iovad->cached32_node = NULL; + iovad->start_pfn = start_pfn; iovad->dma_32bit_pfn = pfn_32bit; } @@ -162,7 +164,7 @@ move_left: if (!curr) { if (size_aligned) pad_size = iova_get_pad_size(size, limit_pfn); - if ((IOVA_START_PFN + size + pad_size) > limit_pfn) { + if ((iovad->start_pfn + size + pad_size) > limit_pfn) { spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags); return -ENOMEM; } @@ -237,8 +239,8 @@ iova_insert_rbtree(struct rb_root *root, struct iova *iova) * @size: - size of page frames to allocate * @limit_pfn: - max limit address * @size_aligned: - set if size_aligned address range is required - * This function allocates an iova in the range limit_pfn to IOVA_START_PFN - * looking from limit_pfn instead from IOVA_START_PFN. If the size_aligned + * This function allocates an iova in the range iovad->start_pfn to limit_pfn, + * searching top-down from limit_pfn to iovad->start_pfn. If the size_aligned * flag is set then the allocated address iova->pfn_lo will be naturally * aligned on roundup_power_of_two(size). 
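A minimal caller-side sketch of the new interface (the foo_* wrappers below are hypothetical; init_iova_domain() and alloc_iova() are the functions this patch changes):

#include <linux/dma-mapping.h>
#include <linux/iova.h>

static struct iova_domain foo_iovad;

static void foo_iova_setup(void)
{
	/* Start the usable aperture at PFN 1, keeping PFN 0 reserved,
	 * as the Intel driver now does via its local IOVA_START_PFN */
	init_iova_domain(&foo_iovad, 1, DMA_BIT_MASK(32) >> PAGE_SHIFT);
}

static struct iova *foo_iova_get(unsigned long nr_pfns)
{
	/* Allocation searches top-down from limit_pfn to start_pfn */
	return alloc_iova(&foo_iovad, nr_pfns,
			  DMA_BIT_MASK(32) >> PAGE_SHIFT, true);
}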
*/ diff --git a/include/linux/iova.h b/include/linux/iova.h index ad0507c61cc7..591b19626b46 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -16,9 +16,6 @@ #include #include -/* IO virtual address start page frame number */ -#define IOVA_START_PFN (1) - /* iova structure */ struct iova { struct rb_node node; @@ -31,6 +28,7 @@ struct iova_domain { spinlock_t iova_rbtree_lock; /* Lock to protect update of rbtree */ struct rb_root rbroot; /* iova domain rbtree root */ struct rb_node *cached32_node; /* Save last alloced node */ + unsigned long start_pfn; /* Lower limit for this domain */ unsigned long dma_32bit_pfn; }; @@ -52,7 +50,8 @@ struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size, struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo, unsigned long pfn_hi); void copy_reserved_iova(struct iova_domain *from, struct iova_domain *to); -void init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit); +void init_iova_domain(struct iova_domain *iovad, unsigned long start_pfn, + unsigned long pfn_32bit); struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn); void put_iova_domain(struct iova_domain *iovad); struct iova *split_and_remove_iova(struct iova_domain *iovad, -- cgit v1.2.3 From 0fb5fe874c42942e16c450ae05da453e13a1c09e Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 12 Jan 2015 17:51:16 +0000 Subject: iommu: Make IOVA domain page size explicit Systems may contain heterogeneous IOMMUs supporting differing minimum page sizes, which also need not match the CPU page size. Thus it is practical to have an explicit notion of IOVA granularity to simplify handling of mapping and allocation constraints. As an initial step, move the IOVA page granularity from an implicit compile-time constant to a per-domain property so we can make use of it in IOVA domain context at runtime. To keep the abstraction tidy, extend the little API of inline iova_* helpers to parallel some of the equivalent PAGE_* macros.
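A sketch of the mapping-path arithmetic the new helpers enable (foo_map() is hypothetical; iova_align(), iova_offset(), iova_shift() and iova_dma_addr() are the helpers this patch adds):

static dma_addr_t foo_map(struct iova_domain *iovad,
			  phys_addr_t phys, size_t size)
{
	struct iova *iova;
	/* Round up to whole IOMMU pages, including the sub-page
	 * offset of the physical address */
	size_t len = iova_align(iovad, size + iova_offset(iovad, phys));

	iova = alloc_iova(iovad, len >> iova_shift(iovad),
			  DMA_BIT_MASK(32) >> iova_shift(iovad), true);
	if (!iova)
		return 0;

	/* Convert the allocated page frame back to a bus address */
	return iova_dma_addr(iovad, iova) + iova_offset(iovad, phys);
}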
Signed-off-by: Robin Murphy Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 9 ++++++--- drivers/iommu/iova.c | 12 ++++++++++-- include/linux/iova.h | 35 +++++++++++++++++++++++++++++++++-- 3 files changed, 49 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 86f9e82b015b..ae4c1a854e57 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -1635,7 +1635,8 @@ static int dmar_init_reserved_ranges(void) struct iova *iova; int i; - init_iova_domain(&reserved_iova_list, IOVA_START_PFN, DMA_32BIT_PFN); + init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN, + DMA_32BIT_PFN); lockdep_set_class(&reserved_iova_list.iova_rbtree_lock, &reserved_rbtree_key); @@ -1693,7 +1694,8 @@ static int domain_init(struct dmar_domain *domain, int guest_width) int adjust_width, agaw; unsigned long sagaw; - init_iova_domain(&domain->iovad, IOVA_START_PFN, DMA_32BIT_PFN); + init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN, + DMA_32BIT_PFN); domain_reserve_special_ranges(domain); /* calculate AGAW */ @@ -4316,7 +4318,8 @@ static int md_domain_init(struct dmar_domain *domain, int guest_width) { int adjust_width; - init_iova_domain(&domain->iovad, IOVA_START_PFN, DMA_32BIT_PFN); + init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN, + DMA_32BIT_PFN); domain_reserve_special_ranges(domain); /* calculate AGAW */ diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index a3dbba8caa19..9dd8208312c2 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -55,12 +55,20 @@ void free_iova_mem(struct iova *iova) } void -init_iova_domain(struct iova_domain *iovad, unsigned long start_pfn, - unsigned long pfn_32bit) +init_iova_domain(struct iova_domain *iovad, unsigned long granule, + unsigned long start_pfn, unsigned long pfn_32bit) { + /* + * IOVA granularity will normally be equal to the smallest + * supported IOMMU page size; both *must* be capable of + * representing individual CPU pages exactly. 
+ */ + BUG_ON((granule > PAGE_SIZE) || !is_power_of_2(granule)); + spin_lock_init(&iovad->iova_rbtree_lock); iovad->rbroot = RB_ROOT; iovad->cached32_node = NULL; + iovad->granule = granule; iovad->start_pfn = start_pfn; iovad->dma_32bit_pfn = pfn_32bit; } diff --git a/include/linux/iova.h b/include/linux/iova.h index 591b19626b46..3920a19d8194 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -28,6 +28,7 @@ struct iova_domain { spinlock_t iova_rbtree_lock; /* Lock to protect update of rbtree */ struct rb_root rbroot; /* iova domain rbtree root */ struct rb_node *cached32_node; /* Save last alloced node */ + unsigned long granule; /* pfn granularity for this domain */ unsigned long start_pfn; /* Lower limit for this domain */ unsigned long dma_32bit_pfn; }; @@ -37,6 +38,36 @@ static inline unsigned long iova_size(struct iova *iova) return iova->pfn_hi - iova->pfn_lo + 1; } +static inline unsigned long iova_shift(struct iova_domain *iovad) +{ + return __ffs(iovad->granule); +} + +static inline unsigned long iova_mask(struct iova_domain *iovad) +{ + return iovad->granule - 1; +} + +static inline size_t iova_offset(struct iova_domain *iovad, dma_addr_t iova) +{ + return iova & iova_mask(iovad); +} + +static inline size_t iova_align(struct iova_domain *iovad, size_t size) +{ + return ALIGN(size, iovad->granule); +} + +static inline dma_addr_t iova_dma_addr(struct iova_domain *iovad, struct iova *iova) +{ + return (dma_addr_t)iova->pfn_lo << iova_shift(iovad); +} + +static inline unsigned long iova_pfn(struct iova_domain *iovad, dma_addr_t iova) +{ + return iova >> iova_shift(iovad); +} + int iommu_iova_cache_init(void); void iommu_iova_cache_destroy(void); @@ -50,8 +81,8 @@ struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size, struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo, unsigned long pfn_hi); void copy_reserved_iova(struct iova_domain *from, struct iova_domain *to); -void init_iova_domain(struct iova_domain *iovad, unsigned long start_pfn, - unsigned long pfn_32bit); +void init_iova_domain(struct iova_domain *iovad, unsigned long granule, + unsigned long start_pfn, unsigned long pfn_32bit); struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn); void put_iova_domain(struct iova_domain *iovad); struct iova *split_and_remove_iova(struct iova_domain *iovad, -- cgit v1.2.3 From c7d7ddee7e24eedde6149eefbcfbfbc7125b9ff0 Mon Sep 17 00:00:00 2001 From: Gregory CLEMENT Date: Thu, 15 Jan 2015 15:09:37 +0100 Subject: ata: libahci: Allow using multiple regulators The current implementation of libahci allows using multiple PHYs but not multiple regulators. This patch adds support for multiple regulators. Until now it was mandatory to have a PHY under a subnode; now a port subnode can contain either a regulator or a PHY (or both). In order to be able to associate a port with a regulator, the ports are now platform devices in the device tree case. Only one driver used the regulator field of the ahci_host_priv structure directly; to preserve bisectability, the change in the ahci_imx driver is done in the same patch.
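A hedged sketch of the intended call pattern (foo_ahci_power_up() is invented; the two regulator helpers are the ones this patch introduces, and ahci_platform_enable_clks() already exists in libahci_platform):

static int foo_ahci_power_up(struct ahci_host_priv *hpriv)
{
	int rc;

	/* Enables every regulator in hpriv->target_pwrs; on failure
	 * the helper has already rolled back in reverse order */
	rc = ahci_platform_enable_regulators(hpriv);
	if (rc)
		return rc;

	rc = ahci_platform_enable_clks(hpriv);
	if (rc)
		ahci_platform_disable_regulators(hpriv);

	return rc;
}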
Signed-off-by: Gregory CLEMENT Acked-by: Hans de Goede Signed-off-by: Tejun Heo --- drivers/ata/ahci.h | 2 +- drivers/ata/ahci_imx.c | 14 +-- drivers/ata/libahci_platform.c | 230 +++++++++++++++++++++++++++++------------ include/linux/ahci_platform.h | 2 + 4 files changed, 173 insertions(+), 75 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/ahci.h b/drivers/ata/ahci.h index 40f0e34f17af..275358ae0b3f 100644 --- a/drivers/ata/ahci.h +++ b/drivers/ata/ahci.h @@ -333,7 +333,7 @@ struct ahci_host_priv { u32 em_msg_type; /* EM message type */ bool got_runtime_pm; /* Did we do pm_runtime_get? */ struct clk *clks[AHCI_MAX_CLKS]; /* Optional */ - struct regulator *target_pwr; /* Optional */ + struct regulator **target_pwrs; /* Optional */ /* * If platform uses PHYs. There is a 1:1 relation between the port number and * the PHY position in this array. diff --git a/drivers/ata/ahci_imx.c b/drivers/ata/ahci_imx.c index 35d51c59a370..41632e57d46f 100644 --- a/drivers/ata/ahci_imx.c +++ b/drivers/ata/ahci_imx.c @@ -221,11 +221,9 @@ static int imx_sata_enable(struct ahci_host_priv *hpriv) if (imxpriv->no_device) return 0; - if (hpriv->target_pwr) { - ret = regulator_enable(hpriv->target_pwr); - if (ret) - return ret; - } + ret = ahci_platform_enable_regulators(hpriv); + if (ret) + return ret; ret = clk_prepare_enable(imxpriv->sata_ref_clk); if (ret < 0) @@ -270,8 +268,7 @@ static int imx_sata_enable(struct ahci_host_priv *hpriv) disable_clk: clk_disable_unprepare(imxpriv->sata_ref_clk); disable_regulator: - if (hpriv->target_pwr) - regulator_disable(hpriv->target_pwr); + ahci_platform_disable_regulators(hpriv); return ret; } @@ -291,8 +288,7 @@ static void imx_sata_disable(struct ahci_host_priv *hpriv) clk_disable_unprepare(imxpriv->sata_ref_clk); - if (hpriv->target_pwr) - regulator_disable(hpriv->target_pwr); + ahci_platform_disable_regulators(hpriv); } static void ahci_imx_error_handler(struct ata_port *ap) diff --git a/drivers/ata/libahci_platform.c b/drivers/ata/libahci_platform.c index a147aaadca85..73a086664ee7 100644 --- a/drivers/ata/libahci_platform.c +++ b/drivers/ata/libahci_platform.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "ahci.h" static void ahci_host_stop(struct ata_host *host); @@ -137,6 +138,59 @@ void ahci_platform_disable_clks(struct ahci_host_priv *hpriv) } EXPORT_SYMBOL_GPL(ahci_platform_disable_clks); +/** + * ahci_platform_enable_regulators - Enable regulators + * @hpriv: host private area to store config values + * + * This function enables all the regulators found in + * hpriv->target_pwrs, if any. If a regulator fails to be enabled, it + * disables all the regulators already enabled in reverse order and + * returns an error. + * + * RETURNS: + * 0 on success otherwise a negative error code + */ +int ahci_platform_enable_regulators(struct ahci_host_priv *hpriv) +{ + int rc, i; + + for (i = 0; i < hpriv->nports; i++) { + if (!hpriv->target_pwrs[i]) + continue; + + rc = regulator_enable(hpriv->target_pwrs[i]); + if (rc) + goto disable_target_pwrs; + } + + return 0; + +disable_target_pwrs: + while (--i >= 0) + if (hpriv->target_pwrs[i]) + regulator_disable(hpriv->target_pwrs[i]); + + return rc; +} +EXPORT_SYMBOL_GPL(ahci_platform_enable_regulators); + +/** + * ahci_platform_disable_regulators - Disable regulators + * @hpriv: host private area to store config values + * + * This function disables all regulators found in hpriv->target_pwrs. 
+ */ +void ahci_platform_disable_regulators(struct ahci_host_priv *hpriv) +{ + int i; + + for (i = 0; i < hpriv->nports; i++) { + if (!hpriv->target_pwrs[i]) + continue; + regulator_disable(hpriv->target_pwrs[i]); + } +} +EXPORT_SYMBOL_GPL(ahci_platform_disable_regulators); /** * ahci_platform_enable_resources - Enable platform resources * @hpriv: host private area to store config values @@ -157,11 +211,9 @@ int ahci_platform_enable_resources(struct ahci_host_priv *hpriv) { int rc; - if (hpriv->target_pwr) { - rc = regulator_enable(hpriv->target_pwr); - if (rc) - return rc; - } + rc = ahci_platform_enable_regulators(hpriv); + if (rc) + return rc; rc = ahci_platform_enable_clks(hpriv); if (rc) @@ -177,8 +229,8 @@ disable_clks: ahci_platform_disable_clks(hpriv); disable_regulator: - if (hpriv->target_pwr) - regulator_disable(hpriv->target_pwr); + ahci_platform_disable_regulators(hpriv); + return rc; } EXPORT_SYMBOL_GPL(ahci_platform_enable_resources); @@ -199,8 +251,7 @@ void ahci_platform_disable_resources(struct ahci_host_priv *hpriv) ahci_platform_disable_clks(hpriv); - if (hpriv->target_pwr) - regulator_disable(hpriv->target_pwr); + ahci_platform_disable_regulators(hpriv); } EXPORT_SYMBOL_GPL(ahci_platform_disable_resources); @@ -216,6 +267,68 @@ static void ahci_platform_put_resources(struct device *dev, void *res) for (c = 0; c < AHCI_MAX_CLKS && hpriv->clks[c]; c++) clk_put(hpriv->clks[c]); + /* + * The regulators are tied to child node device and not to the + * SATA device itself. So we can't use devm for automatically + * releasing them. We have to do it manually here. + */ + for (c = 0; c < hpriv->nports; c++) + if (hpriv->target_pwrs && hpriv->target_pwrs[c]) + regulator_put(hpriv->target_pwrs[c]); + +} + +static int ahci_platform_get_phy(struct ahci_host_priv *hpriv, u32 port, + struct device *dev, struct device_node *node) +{ + int rc; + + hpriv->phys[port] = devm_of_phy_get(dev, node, NULL); + + if (!IS_ERR(hpriv->phys[port])) + return 0; + + rc = PTR_ERR(hpriv->phys[port]); + switch (rc) { + case -ENOSYS: + /* No PHY support. Check if PHY is required. 
*/ + if (of_find_property(node, "phys", NULL)) { + dev_err(dev, + "couldn't get PHY in node %s: ENOSYS\n", + node->name); + break; + } + case -ENODEV: + /* continue normally */ + hpriv->phys[port] = NULL; + rc = 0; + break; + + default: + dev_err(dev, + "couldn't get PHY in node %s: %d\n", + node->name, rc); + + break; + } + + return rc; +} + +static int ahci_platform_get_regulator(struct ahci_host_priv *hpriv, u32 port, + struct device *dev) +{ + struct regulator *target_pwr; + int rc = 0; + + target_pwr = regulator_get_optional(dev, "target"); + + if (!IS_ERR(target_pwr)) + hpriv->target_pwrs[port] = target_pwr; + else + rc = PTR_ERR(target_pwr); + + return rc; } /** @@ -240,7 +353,7 @@ struct ahci_host_priv *ahci_platform_get_resources(struct platform_device *pdev) struct ahci_host_priv *hpriv; struct clk *clk; struct device_node *child; - int i, enabled_ports = 0, rc = -ENOMEM; + int i, sz, enabled_ports = 0, rc = -ENOMEM, child_nodes; u32 mask_port_map = 0; if (!devres_open_group(dev, NULL, GFP_KERNEL)) @@ -261,14 +374,6 @@ struct ahci_host_priv *ahci_platform_get_resources(struct platform_device *pdev) goto err_out; } - hpriv->target_pwr = devm_regulator_get_optional(dev, "target"); - if (IS_ERR(hpriv->target_pwr)) { - rc = PTR_ERR(hpriv->target_pwr); - if (rc == -EPROBE_DEFER) - goto err_out; - hpriv->target_pwr = NULL; - } - for (i = 0; i < AHCI_MAX_CLKS; i++) { /* * For now we must use clk_get(dev, NULL) for the first clock, @@ -290,19 +395,33 @@ struct ahci_host_priv *ahci_platform_get_resources(struct platform_device *pdev) hpriv->clks[i] = clk; } - hpriv->nports = of_get_child_count(dev->of_node); + hpriv->nports = child_nodes = of_get_child_count(dev->of_node); - if (hpriv->nports) { - hpriv->phys = devm_kzalloc(dev, - hpriv->nports * sizeof(*hpriv->phys), - GFP_KERNEL); - if (!hpriv->phys) { - rc = -ENOMEM; - goto err_out; - } + /* + * If no sub-node was found, we still need to set nports to + * one in order to be able to use the + * ahci_platform_[en|dis]able_[phys|regulators] functions. 
+ */ + if (!child_nodes) + hpriv->nports = 1; + sz = hpriv->nports * sizeof(*hpriv->phys); + hpriv->phys = devm_kzalloc(dev, sz, GFP_KERNEL); + if (!hpriv->phys) { + rc = -ENOMEM; + goto err_out; + } + sz = hpriv->nports * sizeof(*hpriv->target_pwrs); + hpriv->target_pwrs = devm_kzalloc(dev, sz, GFP_KERNEL); + if (!hpriv->target_pwrs) { + rc = -ENOMEM; + goto err_out; + } + + if (child_nodes) { for_each_child_of_node(dev->of_node, child) { u32 port; + struct platform_device *port_dev; if (!of_device_is_available(child)) continue; @@ -316,18 +435,23 @@ struct ahci_host_priv *ahci_platform_get_resources(struct platform_device *pdev) dev_warn(dev, "invalid port number %d\n", port); continue; } - mask_port_map |= BIT(port); - hpriv->phys[port] = devm_of_phy_get(dev, child, NULL); - if (IS_ERR(hpriv->phys[port])) { - rc = PTR_ERR(hpriv->phys[port]); - dev_err(dev, - "couldn't get PHY in node %s: %d\n", - child->name, rc); - goto err_out; + of_platform_device_create(child, NULL, NULL); + + port_dev = of_find_device_by_node(child); + + if (port_dev) { + rc = ahci_platform_get_regulator(hpriv, port, + &port_dev->dev); + if (rc == -EPROBE_DEFER) + goto err_out; } + rc = ahci_platform_get_phy(hpriv, port, dev, child); + if (rc) + goto err_out; + enabled_ports++; } if (!enabled_ports) { @@ -343,38 +467,14 @@ struct ahci_host_priv *ahci_platform_get_resources(struct platform_device *pdev) * If no sub-node was found, keep this for device tree * compatibility */ - struct phy *phy = devm_phy_get(dev, "sata-phy"); - if (!IS_ERR(phy)) { - hpriv->phys = devm_kzalloc(dev, sizeof(*hpriv->phys), - GFP_KERNEL); - if (!hpriv->phys) { - rc = -ENOMEM; - goto err_out; - } - - hpriv->phys[0] = phy; - hpriv->nports = 1; - } else { - rc = PTR_ERR(phy); - switch (rc) { - case -ENOSYS: - /* No PHY support. Check if PHY is required. */ - if (of_find_property(dev->of_node, "phys", NULL)) { - dev_err(dev, "couldn't get sata-phy: ENOSYS\n"); - goto err_out; - } - case -ENODEV: - /* continue normally */ - hpriv->phys = NULL; - break; - - default: - goto err_out; + rc = ahci_platform_get_phy(hpriv, 0, dev, dev->of_node); + if (rc) + goto err_out; - } - } + rc = ahci_platform_get_regulator(hpriv, 0, dev); + if (rc == -EPROBE_DEFER) + goto err_out; } - pm_runtime_enable(dev); pm_runtime_get_sync(dev); hpriv->got_runtime_pm = true; diff --git a/include/linux/ahci_platform.h b/include/linux/ahci_platform.h index 642d6ae4030c..f65b33809170 100644 --- a/include/linux/ahci_platform.h +++ b/include/linux/ahci_platform.h @@ -24,6 +24,8 @@ struct platform_device; int ahci_platform_enable_clks(struct ahci_host_priv *hpriv); void ahci_platform_disable_clks(struct ahci_host_priv *hpriv); +int ahci_platform_enable_regulators(struct ahci_host_priv *hpriv); +void ahci_platform_disable_regulators(struct ahci_host_priv *hpriv); int ahci_platform_enable_resources(struct ahci_host_priv *hpriv); void ahci_platform_disable_resources(struct ahci_host_priv *hpriv); struct ahci_host_priv *ahci_platform_get_resources( -- cgit v1.2.3 From 54c523127bcca986c6f9b04c7b56a949ea011899 Mon Sep 17 00:00:00 2001 From: Matt Wagantall Date: Mon, 15 Dec 2014 23:47:23 +0000 Subject: iopoll: Introduce memory-mapped IO polling macros It is sometimes necessary to poll a memory-mapped register until its value satisfies some condition. Introduce a family of convenience macros that do this. Tight-looping, sleeping, and timing out can all be accomplished using these macros. 
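A short usage sketch, assuming a hypothetical device with a ready bit in a status register (the FOO_* names are invented; readl_poll_timeout() is one of the macros defined in the new header below):

#include <linux/bitops.h>
#include <linux/iopoll.h>

#define FOO_STATUS	0x04
#define FOO_READY	BIT(0)

static int foo_wait_ready(void __iomem *base)
{
	u32 val;

	/* Sleep roughly 10 us between reads and give up after 100 ms;
	 * on timeout the last value read is still left in 'val' */
	return readl_poll_timeout(base + FOO_STATUS, val,
				  val & FOO_READY, 10, 100 * 1000);
}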
Cc: Thierry Reding Cc: Andrew Morton Cc: Robert Elliott Acked-by: Arnd Bergmann Acked-by: Will Deacon Signed-off-by: Matt Wagantall Signed-off-by: Mitchel Humpherys Signed-off-by: Will Deacon --- include/linux/iopoll.h | 144 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 include/linux/iopoll.h (limited to 'include/linux') diff --git a/include/linux/iopoll.h b/include/linux/iopoll.h new file mode 100644 index 000000000000..1c30014ed176 --- /dev/null +++ b/include/linux/iopoll.h @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2012-2014 The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#ifndef _LINUX_IOPOLL_H +#define _LINUX_IOPOLL_H + +#include +#include +#include +#include +#include +#include + +/** + * readx_poll_timeout - Periodically poll an address until a condition is met or a timeout occurs + * @op: accessor function (takes @addr as its only argument) + * @addr: Address to poll + * @val: Variable to read the value into + * @cond: Break condition (usually involving @val) + * @sleep_us: Maximum time to sleep between reads in us (0 + * tight-loops). Should be less than ~20ms since usleep_range + * is used (see Documentation/timers/timers-howto.txt). + * @timeout_us: Timeout in us, 0 means never timeout + * + * Returns 0 on success and -ETIMEDOUT upon a timeout. In either + * case, the last read value at @addr is stored in @val. Must not + * be called from atomic context if sleep_us or timeout_us are used. + * + * When available, you'll probably want to use one of the specialized + * macros defined below rather than this macro directly. + */ +#define readx_poll_timeout(op, addr, val, cond, sleep_us, timeout_us) \ +({ \ + ktime_t timeout = ktime_add_us(ktime_get(), timeout_us); \ + might_sleep_if(sleep_us); \ + for (;;) { \ + (val) = op(addr); \ + if (cond) \ + break; \ + if (timeout_us && ktime_compare(ktime_get(), timeout) > 0) { \ + (val) = op(addr); \ + break; \ + } \ + if (sleep_us) \ + usleep_range((sleep_us >> 2) + 1, sleep_us); \ + } \ + (cond) ? 0 : -ETIMEDOUT; \ +}) + +/** + * readx_poll_timeout_atomic - Periodically poll an address until a condition is met or a timeout occurs + * @op: accessor function (takes @addr as its only argument) + * @addr: Address to poll + * @val: Variable to read the value into + * @cond: Break condition (usually involving @val) + * @delay_us: Time to udelay between reads in us (0 tight-loops). Should + * be less than ~10us since udelay is used (see + * Documentation/timers/timers-howto.txt). + * @timeout_us: Timeout in us, 0 means never timeout + * + * Returns 0 on success and -ETIMEDOUT upon a timeout. In either + * case, the last read value at @addr is stored in @val. + * + * When available, you'll probably want to use one of the specialized + * macros defined below rather than this macro directly. 
+ */ +#define readx_poll_timeout_atomic(op, addr, val, cond, delay_us, timeout_us) \ +({ \ + ktime_t timeout = ktime_add_us(ktime_get(), timeout_us); \ + for (;;) { \ + (val) = op(addr); \ + if (cond) \ + break; \ + if (timeout_us && ktime_compare(ktime_get(), timeout) > 0) { \ + (val) = op(addr); \ + break; \ + } \ + if (delay_us) \ + udelay(delay_us); \ + } \ + (cond) ? 0 : -ETIMEDOUT; \ +}) + + +#define readb_poll_timeout(addr, val, cond, delay_us, timeout_us) \ + readx_poll_timeout(readb, addr, val, cond, delay_us, timeout_us) + +#define readb_poll_timeout_atomic(addr, val, cond, delay_us, timeout_us) \ + readx_poll_timeout_atomic(readb, addr, val, cond, delay_us, timeout_us) + +#define readw_poll_timeout(addr, val, cond, delay_us, timeout_us) \ + readx_poll_timeout(readw, addr, val, cond, delay_us, timeout_us) + +#define readw_poll_timeout_atomic(addr, val, cond, delay_us, timeout_us) \ + readx_poll_timeout_atomic(readw, addr, val, cond, delay_us, timeout_us) + +#define readl_poll_timeout(addr, val, cond, delay_us, timeout_us) \ + readx_poll_timeout(readl, addr, val, cond, delay_us, timeout_us) + +#define readl_poll_timeout_atomic(addr, val, cond, delay_us, timeout_us) \ + readx_poll_timeout_atomic(readl, addr, val, cond, delay_us, timeout_us) + +#define readq_poll_timeout(addr, val, cond, delay_us, timeout_us) \ + readx_poll_timeout(readq, addr, val, cond, delay_us, timeout_us) + +#define readq_poll_timeout_atomic(addr, val, cond, delay_us, timeout_us) \ + readx_poll_timeout_atomic(readq, addr, val, cond, delay_us, timeout_us) + +#define readb_relaxed_poll_timeout(addr, val, cond, delay_us, timeout_us) \ + readx_poll_timeout(readb_relaxed, addr, val, cond, delay_us, timeout_us) + +#define readb_relaxed_poll_timeout_atomic(addr, val, cond, delay_us, timeout_us) \ + readx_poll_timeout_atomic(readb_relaxed, addr, val, cond, delay_us, timeout_us) + +#define readw_relaxed_poll_timeout(addr, val, cond, delay_us, timeout_us) \ + readx_poll_timeout(readw_relaxed, addr, val, cond, delay_us, timeout_us) + +#define readw_relaxed_poll_timeout_atomic(addr, val, cond, delay_us, timeout_us) \ + readx_poll_timeout_atomic(readw_relaxed, addr, val, cond, delay_us, timeout_us) + +#define readl_relaxed_poll_timeout(addr, val, cond, delay_us, timeout_us) \ + readx_poll_timeout(readl_relaxed, addr, val, cond, delay_us, timeout_us) + +#define readl_relaxed_poll_timeout_atomic(addr, val, cond, delay_us, timeout_us) \ + readx_poll_timeout_atomic(readl_relaxed, addr, val, cond, delay_us, timeout_us) + +#define readq_relaxed_poll_timeout(addr, val, cond, delay_us, timeout_us) \ + readx_poll_timeout(readq_relaxed, addr, val, cond, delay_us, timeout_us) + +#define readq_relaxed_poll_timeout_atomic(addr, val, cond, delay_us, timeout_us) \ + readx_poll_timeout_atomic(readq_relaxed, addr, val, cond, delay_us, timeout_us) + +#endif /* _LINUX_IOPOLL_H */ -- cgit v1.2.3 From 379dcfb406002d855fa56f9c3d290c4048f44e9c Mon Sep 17 00:00:00 2001 From: Stephan Mueller Date: Mon, 19 Jan 2015 00:13:39 +0100 Subject: crypto: doc - remove colons in comments As documented in Documentation/kernel-doc-nano-HOWTO.txt, lines terminated with a colon are treated as headings. The current layout of the documentation when compiling the kernel crypto API DocBook documentation is messed up by treating some lines as headings. The patch removes colons from comments that shall not be treated as headings.
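To make the pitfall concrete, a hypothetical kernel-doc comment (not taken from the patch) showing the effect of the trailing colon:

/**
 * foo_set_callback - hypothetical example of the colon pitfall
 * @req: request handle
 *
 * A body line such as "must comply with the following template:" is
 * promoted to a DocBook heading because of the trailing colon;
 * dropping the colon or rewording with a dash keeps it as body text.
 */
static inline void foo_set_callback(void *req) { }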
Signed-off-by: Stephan Mueller Signed-off-by: Herbert Xu --- include/linux/crypto.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 90998348e564..fb5ef16d6a12 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -1147,7 +1147,7 @@ static inline void ablkcipher_request_free(struct ablkcipher_request *req) * cipher operation completes. * * The callback function is registered with the ablkcipher_request handle and - * must comply with the following template: + * must comply with the following template * * void callback_function(struct crypto_async_request *req, int error) */ @@ -1174,7 +1174,7 @@ static inline void ablkcipher_request_set_callback( * * For encryption, the source is treated as the plaintext and the * destination is the ciphertext. For a decryption operation, the use is - * reversed: the source is the ciphertext and the destination is the plaintext. + * reversed - the source is the ciphertext and the destination is the plaintext. */ static inline void ablkcipher_request_set_crypt( struct ablkcipher_request *req, @@ -1509,7 +1509,7 @@ static inline void aead_request_free(struct aead_request *req) * completes * * The callback function is registered with the aead_request handle and - * must comply with the following template: + * must comply with the following template * * void callback_function(struct crypto_async_request *req, int error) */ @@ -1536,7 +1536,7 @@ static inline void aead_request_set_callback(struct aead_request *req, * * For encryption, the source is treated as the plaintext and the * destination is the ciphertext. For a decryption operation, the use is - * reversed: the source is the ciphertext and the destination is the plaintext. + * reversed - the source is the ciphertext and the destination is the plaintext. * * IMPORTANT NOTE AEAD requires an authentication tag (MAC). For decryption, * the caller must concatenate the ciphertext followed by the -- cgit v1.2.3 From 67d0d04a762db4bd610fd628ad683b5d7dc905e7 Mon Sep 17 00:00:00 2001 From: Vincent Yang Date: Tue, 20 Jan 2015 16:05:16 +0800 Subject: mmc: sdhci: add a quirk for a tuning workaround This patch defines a quirk for a tuning workaround needed by some SDHCI host controllers. It sets both SDHCI_CTRL_EXEC_TUNING and SDHCI_CTRL_TUNED_CLK for tuning. It is a preparation and will be used by the Fujitsu SDHCI controller (f_sdh30) driver.
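A minimal sketch of how a host driver is expected to opt in (foo_sdhci_init() is hypothetical; the quirk flag is the one this patch defines):

static void foo_sdhci_init(struct sdhci_host *host)
{
	/* Makes sdhci_execute_tuning() set SDHCI_CTRL_TUNED_CLK
	 * together with SDHCI_CTRL_EXEC_TUNING */
	host->quirks2 |= SDHCI_QUIRK2_TUNING_WORK_AROUND;
}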
Signed-off-by: Vincent Yang Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci.c | 2 ++ include/linux/mmc/sdhci.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 3e4fe967d7e5..a3ecd20e5510 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -1961,6 +1961,8 @@ static int sdhci_execute_tuning(struct mmc_host *mmc, u32 opcode) ctrl = sdhci_readw(host, SDHCI_HOST_CONTROL2); ctrl |= SDHCI_CTRL_EXEC_TUNING; + if (host->quirks2 & SDHCI_QUIRK2_TUNING_WORK_AROUND) + ctrl |= SDHCI_CTRL_TUNED_CLK; sdhci_writew(host, ctrl, SDHCI_HOST_CONTROL2); /* diff --git a/include/linux/mmc/sdhci.h b/include/linux/mmc/sdhci.h index cb8b94ff6a26..933b897ca095 100644 --- a/include/linux/mmc/sdhci.h +++ b/include/linux/mmc/sdhci.h @@ -111,6 +111,8 @@ struct sdhci_host { #define SDHCI_QUIRK2_CLEAR_TRANSFERMODE_REG_BEFORE_CMD (1<<10) /* Capability register bit-63 indicates HS400 support */ #define SDHCI_QUIRK2_CAPS_BIT63_FOR_HS400 (1<<11) +/* forced tuned clock */ +#define SDHCI_QUIRK2_TUNING_WORK_AROUND (1<<12) int irq; /* Device IRQ */ void __iomem *ioaddr; /* Mapped address */ -- cgit v1.2.3 From d3fc5d71ac4dfd28a66689cfd1eea84c4dba8bde Mon Sep 17 00:00:00 2001 From: Vincent Yang Date: Tue, 20 Jan 2015 16:05:17 +0800 Subject: mmc: sdhci: add a quirk for single block transactions This patch defines a quirk to disable the block count for single block transactions. It is a preparation and will be used by the Fujitsu SDHCI controller (f_sdh30) driver. Signed-off-by: Vincent Yang Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci.c | 8 +++++--- include/linux/mmc/sdhci.h | 2 ++ 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index a3ecd20e5510..c9881ca131d5 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -904,7 +904,7 @@ static void sdhci_prepare_data(struct sdhci_host *host, struct mmc_command *cmd) static void sdhci_set_transfer_mode(struct sdhci_host *host, struct mmc_command *cmd) { - u16 mode; + u16 mode = 0; struct mmc_data *data = cmd->data; if (data == NULL) { @@ -922,9 +922,11 @@ static void sdhci_set_transfer_mode(struct sdhci_host *host, WARN_ON(!host->data); - mode = SDHCI_TRNS_BLK_CNT_EN; + if (!(host->quirks2 & SDHCI_QUIRK2_SUPPORT_SINGLE)) + mode = SDHCI_TRNS_BLK_CNT_EN; + if (mmc_op_multi(cmd->opcode) || data->blocks > 1) { - mode |= SDHCI_TRNS_MULTI; + mode = SDHCI_TRNS_BLK_CNT_EN | SDHCI_TRNS_MULTI; /* * If we are sending CMD23, CMD12 never gets sent * on successful completion (so no Auto-CMD12). diff --git a/include/linux/mmc/sdhci.h b/include/linux/mmc/sdhci.h index 933b897ca095..c3e3db196738 100644 --- a/include/linux/mmc/sdhci.h +++ b/include/linux/mmc/sdhci.h @@ -113,6 +113,8 @@ struct sdhci_host { #define SDHCI_QUIRK2_CAPS_BIT63_FOR_HS400 (1<<11) /* forced tuned clock */ #define SDHCI_QUIRK2_TUNING_WORK_AROUND (1<<12) +/* disable the block count for single block transactions */ +#define SDHCI_QUIRK2_SUPPORT_SINGLE (1<<13) int irq; /* Device IRQ */ void __iomem *ioaddr; /* Mapped address */ -- cgit v1.2.3 From 94b110aff8679b14f46fd6653ea87b42fe1555be Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Tue, 13 Jan 2015 04:57:22 +0000 Subject: mmc: tmio: add tmio_mmc_host_alloc/free() The current tmio_mmc driver uses tmio_mmc_data for driver/platform specific data and callbacks, which the tmio_mmc_host_probe() function needs.
Because of this style, include/linux/mfd/tmio.h header has tmio driver/framework specific data which is not needed from platform. This patch adds new tmio_mmc_host_alloc/free() as cleanup preparation. tmio driver specific data/callback will be implemented in tmio_mmc_host, and platform specific data/callback will be implemented in tmio_mmc_data in this cleanup. Signed-off-by: Kuninori Morimoto Acked-by: Lee Jones Signed-off-by: Ulf Hansson --- drivers/mmc/host/sh_mobile_sdhi.c | 14 +++++++++--- drivers/mmc/host/tmio_mmc.c | 10 +++++++-- drivers/mmc/host/tmio_mmc.h | 5 +++-- drivers/mmc/host/tmio_mmc_pio.c | 45 +++++++++++++++++++++++++-------------- include/linux/mfd/tmio.h | 1 - 5 files changed, 51 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/sh_mobile_sdhi.c b/drivers/mmc/host/sh_mobile_sdhi.c index 00c8ebdf8ec7..cf062c4d87a9 100644 --- a/drivers/mmc/host/sh_mobile_sdhi.c +++ b/drivers/mmc/host/sh_mobile_sdhi.c @@ -113,7 +113,7 @@ static int sh_mobile_sdhi_wait_idle(struct tmio_mmc_host *host) udelay(1); if (!timeout) { - dev_warn(host->pdata->dev, "timeout waiting for SD bus idle\n"); + dev_warn(&host->pdev->dev, "timeout waiting for SD bus idle\n"); return -EBUSY; } @@ -207,6 +207,12 @@ static int sh_mobile_sdhi_probe(struct platform_device *pdev) goto eclkget; } + host = tmio_mmc_host_alloc(pdev); + if (!host) { + ret = -ENOMEM; + goto eprobe; + } + mmc_data->clk_enable = sh_mobile_sdhi_clk_enable; mmc_data->clk_disable = sh_mobile_sdhi_clk_disable; mmc_data->capabilities = MMC_CAP_MMC_HIGHSPEED; @@ -274,9 +280,9 @@ static int sh_mobile_sdhi_probe(struct platform_device *pdev) /* SD control register space size is 0x100, 0x200 for bus_shift=1 */ mmc_data->bus_shift = resource_size(res) >> 9; - ret = tmio_mmc_host_probe(&host, pdev, mmc_data); + ret = tmio_mmc_host_probe(host, mmc_data); if (ret < 0) - goto eprobe; + goto efree; /* * FIXME: @@ -351,6 +357,8 @@ static int sh_mobile_sdhi_probe(struct platform_device *pdev) eirq: tmio_mmc_host_remove(host); +efree: + tmio_mmc_host_free(host); eprobe: eclkget: if (p && p->cleanup) diff --git a/drivers/mmc/host/tmio_mmc.c b/drivers/mmc/host/tmio_mmc.c index 2ca0afaab792..a7c2e459187c 100644 --- a/drivers/mmc/host/tmio_mmc.c +++ b/drivers/mmc/host/tmio_mmc.c @@ -92,10 +92,14 @@ static int tmio_mmc_probe(struct platform_device *pdev) pdata->bus_shift = resource_size(res) >> 10; pdata->flags |= TMIO_MMC_HAVE_HIGH_REG; - ret = tmio_mmc_host_probe(&host, pdev, pdata); - if (ret) + host = tmio_mmc_host_alloc(pdev); + if (!host) goto cell_disable; + ret = tmio_mmc_host_probe(host, pdata); + if (ret) + goto host_free; + ret = request_irq(irq, tmio_mmc_irq, IRQF_TRIGGER_FALLING, dev_name(&pdev->dev), host); if (ret) @@ -108,6 +112,8 @@ static int tmio_mmc_probe(struct platform_device *pdev) host_remove: tmio_mmc_host_remove(host); +host_free: + tmio_mmc_host_free(host); cell_disable: if (cell->disable) cell->disable(pdev); diff --git a/drivers/mmc/host/tmio_mmc.h b/drivers/mmc/host/tmio_mmc.h index a34ecbe1c1ad..60d6747f0df5 100644 --- a/drivers/mmc/host/tmio_mmc.h +++ b/drivers/mmc/host/tmio_mmc.h @@ -85,8 +85,9 @@ struct tmio_mmc_host { bool sdio_irq_enabled; }; -int tmio_mmc_host_probe(struct tmio_mmc_host **host, - struct platform_device *pdev, +struct tmio_mmc_host *tmio_mmc_host_alloc(struct platform_device *pdev); +void tmio_mmc_host_free(struct tmio_mmc_host *host); +int tmio_mmc_host_probe(struct tmio_mmc_host *host, struct tmio_mmc_data *pdata); void tmio_mmc_host_remove(struct tmio_mmc_host 
*host); void tmio_mmc_do_data_irq(struct tmio_mmc_host *host); diff --git a/drivers/mmc/host/tmio_mmc_pio.c b/drivers/mmc/host/tmio_mmc_pio.c index 250bf8c9f998..396be0e95a98 100644 --- a/drivers/mmc/host/tmio_mmc_pio.c +++ b/drivers/mmc/host/tmio_mmc_pio.c @@ -1054,12 +1054,37 @@ static void tmio_mmc_of_parse(struct platform_device *pdev, pdata->flags |= TMIO_MMC_WRPROTECT_DISABLE; } -int tmio_mmc_host_probe(struct tmio_mmc_host **host, - struct platform_device *pdev, - struct tmio_mmc_data *pdata) +struct tmio_mmc_host* +tmio_mmc_host_alloc(struct platform_device *pdev) { - struct tmio_mmc_host *_host; + struct tmio_mmc_host *host; struct mmc_host *mmc; + + mmc = mmc_alloc_host(sizeof(struct tmio_mmc_host), &pdev->dev); + if (!mmc) + return NULL; + + host = mmc_priv(mmc); + host->mmc = mmc; + host->pdev = pdev; + + return host; +} +EXPORT_SYMBOL(tmio_mmc_host_alloc); + +void tmio_mmc_host_free(struct tmio_mmc_host *host) +{ + mmc_free_host(host->mmc); + + host->mmc = NULL; +} +EXPORT_SYMBOL(tmio_mmc_host_free); + +int tmio_mmc_host_probe(struct tmio_mmc_host *_host, + struct tmio_mmc_data *pdata) +{ + struct platform_device *pdev = _host->pdev; + struct mmc_host *mmc = _host->mmc; struct resource *res_ctl; int ret; u32 irq_mask = TMIO_MASK_CMD; @@ -1073,19 +1098,11 @@ int tmio_mmc_host_probe(struct tmio_mmc_host **host, if (!res_ctl) return -EINVAL; - mmc = mmc_alloc_host(sizeof(struct tmio_mmc_host), &pdev->dev); - if (!mmc) - return -ENOMEM; - ret = mmc_of_parse(mmc); if (ret < 0) goto host_free; - pdata->dev = &pdev->dev; - _host = mmc_priv(mmc); _host->pdata = pdata; - _host->mmc = mmc; - _host->pdev = pdev; platform_set_drvdata(pdev, mmc); _host->set_pwr = pdata->set_pwr; @@ -1192,12 +1209,9 @@ int tmio_mmc_host_probe(struct tmio_mmc_host **host, mmc_gpiod_request_cd_irq(mmc); } - *host = _host; - return 0; host_free: - mmc_free_host(mmc); return ret; } @@ -1222,7 +1236,6 @@ void tmio_mmc_host_remove(struct tmio_mmc_host *host) pm_runtime_disable(&pdev->dev); iounmap(host->ctl); - mmc_free_host(mmc); } EXPORT_SYMBOL(tmio_mmc_host_remove); diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index 57388171610d..c7d9af042d09 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -135,7 +135,6 @@ struct tmio_mmc_data { unsigned long bus_shift; u32 ocr_mask; /* available voltages */ struct tmio_mmc_dma *dma; - struct device *dev; unsigned int cd_gpio; void (*set_pwr)(struct platform_device *host, int state); void (*set_clk_div)(struct platform_device *host, int state); -- cgit v1.2.3 From 7ecc09bab1e856e6730a4dd8a3bc1c28bb6ab3be Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Tue, 13 Jan 2015 04:57:33 +0000 Subject: mmc: tmio: tmio_mmc_host has .dma Current .dma is implemented under tmio_mmc_data. It goes to tmio_mmc_host by this patch. 
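The resulting probe-time pattern, condensed from the sh_mobile_sdhi hunks above and below (error handling simplified; the "foo" names are illustrative):

	static struct tmio_mmc_data foo_mmc_data;	/* illustrative */
	static struct tmio_mmc_dma foo_dma_priv;	/* illustrative */

	static int foo_probe(struct platform_device *pdev)
	{
		struct tmio_mmc_host *host;
		int ret;

		host = tmio_mmc_host_alloc(pdev);
		if (!host)
			return -ENOMEM;

		/* was: foo_mmc_data.dma = &foo_dma_priv */
		host->dma = &foo_dma_priv;

		ret = tmio_mmc_host_probe(host, &foo_mmc_data);
		if (ret < 0) {
			tmio_mmc_host_free(host);
			return ret;
		}

		return 0;
	}
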
Signed-off-by: Kuninori Morimoto Acked-by: Lee Jones Signed-off-by: Ulf Hansson --- drivers/mmc/host/sh_mobile_sdhi.c | 4 ++-- drivers/mmc/host/tmio_mmc.h | 11 +++++++++++ drivers/mmc/host/tmio_mmc_dma.c | 24 +++++++++++------------- include/linux/mfd/tmio.h | 11 ----------- 4 files changed, 24 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/sh_mobile_sdhi.c b/drivers/mmc/host/sh_mobile_sdhi.c index cf062c4d87a9..288e78d2c7a1 100644 --- a/drivers/mmc/host/sh_mobile_sdhi.c +++ b/drivers/mmc/host/sh_mobile_sdhi.c @@ -213,6 +213,8 @@ static int sh_mobile_sdhi_probe(struct platform_device *pdev) goto eprobe; } + host->dma = dma_priv; + mmc_data->clk_enable = sh_mobile_sdhi_clk_enable; mmc_data->clk_disable = sh_mobile_sdhi_clk_disable; mmc_data->capabilities = MMC_CAP_MMC_HIGHSPEED; @@ -241,8 +243,6 @@ static int sh_mobile_sdhi_probe(struct platform_device *pdev) dma_priv->alignment_shift = 1; /* 2-byte alignment */ dma_priv->filter = shdma_chan_filter; - mmc_data->dma = dma_priv; - /* * All SDHI blocks support 2-byte and larger block sizes in 4-bit * bus width mode. diff --git a/drivers/mmc/host/tmio_mmc.h b/drivers/mmc/host/tmio_mmc.h index 60d6747f0df5..49a2559ff489 100644 --- a/drivers/mmc/host/tmio_mmc.h +++ b/drivers/mmc/host/tmio_mmc.h @@ -40,6 +40,16 @@ struct tmio_mmc_data; +struct tmio_mmc_dma { + void *chan_priv_tx; + void *chan_priv_rx; + int slave_id_tx; + int slave_id_rx; + int alignment_shift; + dma_addr_t dma_rx_offset; + bool (*filter)(struct dma_chan *chan, void *arg); +}; + struct tmio_mmc_host { void __iomem *ctl; struct mmc_command *cmd; @@ -59,6 +69,7 @@ struct tmio_mmc_host { struct platform_device *pdev; struct tmio_mmc_data *pdata; + struct tmio_mmc_dma *dma; /* DMA support */ bool force_pio; diff --git a/drivers/mmc/host/tmio_mmc_dma.c b/drivers/mmc/host/tmio_mmc_dma.c index 7d077388b9eb..6c214d60bbb6 100644 --- a/drivers/mmc/host/tmio_mmc_dma.c +++ b/drivers/mmc/host/tmio_mmc_dma.c @@ -49,11 +49,10 @@ static void tmio_mmc_start_dma_rx(struct tmio_mmc_host *host) struct scatterlist *sg = host->sg_ptr, *sg_tmp; struct dma_async_tx_descriptor *desc = NULL; struct dma_chan *chan = host->chan_rx; - struct tmio_mmc_data *pdata = host->pdata; dma_cookie_t cookie; int ret, i; bool aligned = true, multiple = true; - unsigned int align = (1 << pdata->dma->alignment_shift) - 1; + unsigned int align = (1 << host->dma->alignment_shift) - 1; for_each_sg(sg, sg_tmp, host->sg_len, i) { if (sg_tmp->offset & align) @@ -126,11 +125,10 @@ static void tmio_mmc_start_dma_tx(struct tmio_mmc_host *host) struct scatterlist *sg = host->sg_ptr, *sg_tmp; struct dma_async_tx_descriptor *desc = NULL; struct dma_chan *chan = host->chan_tx; - struct tmio_mmc_data *pdata = host->pdata; dma_cookie_t cookie; int ret, i; bool aligned = true, multiple = true; - unsigned int align = (1 << pdata->dma->alignment_shift) - 1; + unsigned int align = (1 << host->dma->alignment_shift) - 1; for_each_sg(sg, sg_tmp, host->sg_len, i) { if (sg_tmp->offset & align) @@ -262,8 +260,8 @@ out: void tmio_mmc_request_dma(struct tmio_mmc_host *host, struct tmio_mmc_data *pdata) { /* We can only either use DMA for both Tx and Rx or not use it at all */ - if (!pdata->dma || (!host->pdev->dev.of_node && - (!pdata->dma->chan_priv_tx || !pdata->dma->chan_priv_rx))) + if (!host->dma || (!host->pdev->dev.of_node && + (!host->dma->chan_priv_tx || !host->dma->chan_priv_rx))) return; if (!host->chan_tx && !host->chan_rx) { @@ -280,7 +278,7 @@ void tmio_mmc_request_dma(struct tmio_mmc_host 
*host, struct tmio_mmc_data *pdat dma_cap_set(DMA_SLAVE, mask); host->chan_tx = dma_request_slave_channel_compat(mask, - pdata->dma->filter, pdata->dma->chan_priv_tx, + host->dma->filter, host->dma->chan_priv_tx, &host->pdev->dev, "tx"); dev_dbg(&host->pdev->dev, "%s: TX: got channel %p\n", __func__, host->chan_tx); @@ -288,8 +286,8 @@ void tmio_mmc_request_dma(struct tmio_mmc_host *host, struct tmio_mmc_data *pdat if (!host->chan_tx) return; - if (pdata->dma->chan_priv_tx) - cfg.slave_id = pdata->dma->slave_id_tx; + if (host->dma->chan_priv_tx) + cfg.slave_id = host->dma->slave_id_tx; cfg.direction = DMA_MEM_TO_DEV; cfg.dst_addr = res->start + (CTL_SD_DATA_PORT << host->pdata->bus_shift); cfg.dst_addr_width = DMA_SLAVE_BUSWIDTH_2_BYTES; @@ -299,7 +297,7 @@ void tmio_mmc_request_dma(struct tmio_mmc_host *host, struct tmio_mmc_data *pdat goto ecfgtx; host->chan_rx = dma_request_slave_channel_compat(mask, - pdata->dma->filter, pdata->dma->chan_priv_rx, + host->dma->filter, host->dma->chan_priv_rx, &host->pdev->dev, "rx"); dev_dbg(&host->pdev->dev, "%s: RX: got channel %p\n", __func__, host->chan_rx); @@ -307,10 +305,10 @@ void tmio_mmc_request_dma(struct tmio_mmc_host *host, struct tmio_mmc_data *pdat if (!host->chan_rx) goto ereqrx; - if (pdata->dma->chan_priv_rx) - cfg.slave_id = pdata->dma->slave_id_rx; + if (host->dma->chan_priv_rx) + cfg.slave_id = host->dma->slave_id_rx; cfg.direction = DMA_DEV_TO_MEM; - cfg.src_addr = cfg.dst_addr + pdata->dma->dma_rx_offset; + cfg.src_addr = cfg.dst_addr + host->dma->dma_rx_offset; cfg.src_addr_width = DMA_SLAVE_BUSWIDTH_2_BYTES; cfg.dst_addr = 0; ret = dmaengine_slave_config(host->chan_rx, &cfg); diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index c7d9af042d09..8d708c7cf681 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -112,16 +112,6 @@ void tmio_core_mmc_clk_div(void __iomem *cnf, int shift, int state); struct dma_chan; -struct tmio_mmc_dma { - void *chan_priv_tx; - void *chan_priv_rx; - int slave_id_tx; - int slave_id_rx; - int alignment_shift; - dma_addr_t dma_rx_offset; - bool (*filter)(struct dma_chan *chan, void *arg); -}; - struct tmio_mmc_host; /* @@ -134,7 +124,6 @@ struct tmio_mmc_data { unsigned long flags; unsigned long bus_shift; u32 ocr_mask; /* available voltages */ - struct tmio_mmc_dma *dma; unsigned int cd_gpio; void (*set_pwr)(struct platform_device *host, int state); void (*set_clk_div)(struct platform_device *host, int state); -- cgit v1.2.3 From dfe9a229e0a66b6a00439cea2885ad3b5d3e0840 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Tue, 13 Jan 2015 04:57:42 +0000 Subject: mmc: tmio: tmio_mmc_host has .write16_hook Current .write16_hook is implemented under tmio_mmc_data. It goes to tmio_mmc_host by this patch. 
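The hook's contract is unchanged by the move: a non-zero return tells sd_ctrl_write16() (see the tmio_mmc.h hunk below) to skip the write. A sketch in the spirit of sh_mobile_sdhi's hook, which waits for the SD bus to go idle before command writes; the real implementation covers more registers, and foo_wait_idle() is a hypothetical helper:

	static int foo_write16_hook(struct tmio_mmc_host *host, int addr)
	{
		switch (addr) {
		case CTL_SD_CMD:
		case CTL_STOP_INTERNAL_ACTION:
			/* a non-zero return (e.g. -EBUSY) skips the write */
			return foo_wait_idle(host);	/* hypothetical helper */
		default:
			return 0;
		}
	}

	/* in probe, after tmio_mmc_host_alloc(): */
	host->write16_hook = foo_write16_hook;
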
Signed-off-by: Kuninori Morimoto Acked-by: Lee Jones Signed-off-by: Ulf Hansson --- drivers/mmc/host/sh_mobile_sdhi.c | 2 +- drivers/mmc/host/tmio_mmc.h | 4 +++- drivers/mmc/host/tmio_mmc_pio.c | 2 +- include/linux/mfd/tmio.h | 3 --- 4 files changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/sh_mobile_sdhi.c b/drivers/mmc/host/sh_mobile_sdhi.c index 288e78d2c7a1..77ff0620912e 100644 --- a/drivers/mmc/host/sh_mobile_sdhi.c +++ b/drivers/mmc/host/sh_mobile_sdhi.c @@ -214,11 +214,11 @@ static int sh_mobile_sdhi_probe(struct platform_device *pdev) } host->dma = dma_priv; + host->write16_hook = sh_mobile_sdhi_write16_hook; mmc_data->clk_enable = sh_mobile_sdhi_clk_enable; mmc_data->clk_disable = sh_mobile_sdhi_clk_disable; mmc_data->capabilities = MMC_CAP_MMC_HIGHSPEED; - mmc_data->write16_hook = sh_mobile_sdhi_write16_hook; mmc_data->multi_io_quirk = sh_mobile_sdhi_multi_io_quirk; if (p) { mmc_data->flags = p->tmio_flags; diff --git a/drivers/mmc/host/tmio_mmc.h b/drivers/mmc/host/tmio_mmc.h index 49a2559ff489..3fa96b11f76c 100644 --- a/drivers/mmc/host/tmio_mmc.h +++ b/drivers/mmc/host/tmio_mmc.h @@ -94,6 +94,8 @@ struct tmio_mmc_host { struct mutex ios_lock; /* protect set_ios() context */ bool native_hotplug; bool sdio_irq_enabled; + + int (*write16_hook)(struct tmio_mmc_host *host, int addr); }; struct tmio_mmc_host *tmio_mmc_host_alloc(struct platform_device *pdev); @@ -183,7 +185,7 @@ static inline void sd_ctrl_write16(struct tmio_mmc_host *host, int addr, u16 val /* If there is a hook and it returns non-zero then there * is an error and the write should be skipped */ - if (host->pdata->write16_hook && host->pdata->write16_hook(host, addr)) + if (host->write16_hook && host->write16_hook(host, addr)) return; writew(val, host->ctl + (addr << host->pdata->bus_shift)); } diff --git a/drivers/mmc/host/tmio_mmc_pio.c b/drivers/mmc/host/tmio_mmc_pio.c index 396be0e95a98..58f4e47aac8c 100644 --- a/drivers/mmc/host/tmio_mmc_pio.c +++ b/drivers/mmc/host/tmio_mmc_pio.c @@ -1092,7 +1092,7 @@ int tmio_mmc_host_probe(struct tmio_mmc_host *_host, tmio_mmc_of_parse(pdev, pdata); if (!(pdata->flags & TMIO_MMC_HAS_IDLE_WAIT)) - pdata->write16_hook = NULL; + _host->write16_hook = NULL; res_ctl = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!res_ctl) diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index 8d708c7cf681..0872ca1e5dc2 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -112,8 +112,6 @@ void tmio_core_mmc_clk_div(void __iomem *cnf, int shift, int state); struct dma_chan; -struct tmio_mmc_host; - /* * data for the MMC controller */ @@ -127,7 +125,6 @@ struct tmio_mmc_data { unsigned int cd_gpio; void (*set_pwr)(struct platform_device *host, int state); void (*set_clk_div)(struct platform_device *host, int state); - int (*write16_hook)(struct tmio_mmc_host *host, int addr); /* clock management callbacks */ int (*clk_enable)(struct platform_device *pdev, unsigned int *f); void (*clk_disable)(struct platform_device *pdev); -- cgit v1.2.3 From 4fe2ec57a15f98c232536cf04e7c139d830955d4 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Tue, 13 Jan 2015 04:57:52 +0000 Subject: mmc: tmio: tmio_mmc_host has .clk_enable Current .clk_enable is implemented under tmio_mmc_data. It goes to tmio_mmc_host by this patch. 
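As the tmio_mmc_clk_update() hunk below shows, the callback's contract is to enable the clock and report its rate through *f, from which the core derives mmc->f_max and f_min = f_max / 512. Roughly what a glue driver's callback looks like; this is a paraphrase with a hypothetical foo_priv_from() lookup, not a verbatim copy of sh_mobile_sdhi:

	static int foo_clk_enable(struct platform_device *pdev, unsigned int *f)
	{
		struct mmc_host *mmc = platform_get_drvdata(pdev);
		struct tmio_mmc_host *host = mmc_priv(mmc);
		struct foo_priv *priv = foo_priv_from(host);	/* hypothetical */
		int ret = clk_prepare_enable(priv->clk);

		if (ret < 0)
			return ret;

		*f = clk_get_rate(priv->clk);	/* core sets mmc->f_max from this */
		return 0;
	}
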
Signed-off-by: Kuninori Morimoto Acked-by: Lee Jones Signed-off-by: Ulf Hansson --- drivers/mmc/host/sh_mobile_sdhi.c | 2 +- drivers/mmc/host/tmio_mmc.h | 1 + drivers/mmc/host/tmio_mmc_pio.c | 5 ++--- include/linux/mfd/tmio.h | 1 - 4 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/sh_mobile_sdhi.c b/drivers/mmc/host/sh_mobile_sdhi.c index 77ff0620912e..87af77dd660b 100644 --- a/drivers/mmc/host/sh_mobile_sdhi.c +++ b/drivers/mmc/host/sh_mobile_sdhi.c @@ -215,8 +215,8 @@ static int sh_mobile_sdhi_probe(struct platform_device *pdev) host->dma = dma_priv; host->write16_hook = sh_mobile_sdhi_write16_hook; + host->clk_enable = sh_mobile_sdhi_clk_enable; - mmc_data->clk_enable = sh_mobile_sdhi_clk_enable; mmc_data->clk_disable = sh_mobile_sdhi_clk_disable; mmc_data->capabilities = MMC_CAP_MMC_HIGHSPEED; mmc_data->multi_io_quirk = sh_mobile_sdhi_multi_io_quirk; diff --git a/drivers/mmc/host/tmio_mmc.h b/drivers/mmc/host/tmio_mmc.h index 3fa96b11f76c..5cd298665b94 100644 --- a/drivers/mmc/host/tmio_mmc.h +++ b/drivers/mmc/host/tmio_mmc.h @@ -96,6 +96,7 @@ struct tmio_mmc_host { bool sdio_irq_enabled; int (*write16_hook)(struct tmio_mmc_host *host, int addr); + int (*clk_enable)(struct platform_device *pdev, unsigned int *f); }; struct tmio_mmc_host *tmio_mmc_host_alloc(struct platform_device *pdev); diff --git a/drivers/mmc/host/tmio_mmc_pio.c b/drivers/mmc/host/tmio_mmc_pio.c index 58f4e47aac8c..38bcf0ca31c5 100644 --- a/drivers/mmc/host/tmio_mmc_pio.c +++ b/drivers/mmc/host/tmio_mmc_pio.c @@ -835,13 +835,12 @@ fail: static int tmio_mmc_clk_update(struct tmio_mmc_host *host) { struct mmc_host *mmc = host->mmc; - struct tmio_mmc_data *pdata = host->pdata; int ret; - if (!pdata->clk_enable) + if (!host->clk_enable) return -ENOTSUPP; - ret = pdata->clk_enable(host->pdev, &mmc->f_max); + ret = host->clk_enable(host->pdev, &mmc->f_max); if (!ret) mmc->f_min = mmc->f_max / 512; diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index 0872ca1e5dc2..472587a02a42 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -126,7 +126,6 @@ struct tmio_mmc_data { void (*set_pwr)(struct platform_device *host, int state); void (*set_clk_div)(struct platform_device *host, int state); /* clock management callbacks */ - int (*clk_enable)(struct platform_device *pdev, unsigned int *f); void (*clk_disable)(struct platform_device *pdev); int (*multi_io_quirk)(struct mmc_card *card, unsigned int direction, int blk_size); -- cgit v1.2.3 From 00452c11ea0e4e5822edf0ac46853933860d0f53 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Tue, 13 Jan 2015 04:58:01 +0000 Subject: mmc: tmio: tmio_mmc_host has .clk_disable Current .clk_disable is implemented under tmio_mmc_data. It goes to tmio_mmc_host by this patch. 
Signed-off-by: Kuninori Morimoto Acked-by: Lee Jones Signed-off-by: Ulf Hansson --- drivers/mmc/host/sh_mobile_sdhi.c | 2 +- drivers/mmc/host/tmio_mmc.h | 1 + drivers/mmc/host/tmio_mmc_pio.c | 4 ++-- include/linux/mfd/tmio.h | 3 +-- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/sh_mobile_sdhi.c b/drivers/mmc/host/sh_mobile_sdhi.c index 87af77dd660b..5e02a6a302e7 100644 --- a/drivers/mmc/host/sh_mobile_sdhi.c +++ b/drivers/mmc/host/sh_mobile_sdhi.c @@ -216,8 +216,8 @@ static int sh_mobile_sdhi_probe(struct platform_device *pdev) host->dma = dma_priv; host->write16_hook = sh_mobile_sdhi_write16_hook; host->clk_enable = sh_mobile_sdhi_clk_enable; + host->clk_disable = sh_mobile_sdhi_clk_disable; - mmc_data->clk_disable = sh_mobile_sdhi_clk_disable; mmc_data->capabilities = MMC_CAP_MMC_HIGHSPEED; mmc_data->multi_io_quirk = sh_mobile_sdhi_multi_io_quirk; if (p) { diff --git a/drivers/mmc/host/tmio_mmc.h b/drivers/mmc/host/tmio_mmc.h index 5cd298665b94..28234ec5caa3 100644 --- a/drivers/mmc/host/tmio_mmc.h +++ b/drivers/mmc/host/tmio_mmc.h @@ -97,6 +97,7 @@ struct tmio_mmc_host { int (*write16_hook)(struct tmio_mmc_host *host, int addr); int (*clk_enable)(struct platform_device *pdev, unsigned int *f); + void (*clk_disable)(struct platform_device *pdev); }; struct tmio_mmc_host *tmio_mmc_host_alloc(struct platform_device *pdev); diff --git a/drivers/mmc/host/tmio_mmc_pio.c b/drivers/mmc/host/tmio_mmc_pio.c index 38bcf0ca31c5..e33eb8ee5eb5 100644 --- a/drivers/mmc/host/tmio_mmc_pio.c +++ b/drivers/mmc/host/tmio_mmc_pio.c @@ -1249,8 +1249,8 @@ int tmio_mmc_host_runtime_suspend(struct device *dev) if (host->clk_cache) tmio_mmc_clk_stop(host); - if (host->pdata->clk_disable) - host->pdata->clk_disable(host->pdev); + if (host->clk_disable) + host->clk_disable(host->pdev); return 0; } diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index 472587a02a42..a3f78da70af7 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -125,8 +125,7 @@ struct tmio_mmc_data { unsigned int cd_gpio; void (*set_pwr)(struct platform_device *host, int state); void (*set_clk_div)(struct platform_device *host, int state); - /* clock management callbacks */ - void (*clk_disable)(struct platform_device *pdev); + int (*multi_io_quirk)(struct mmc_card *card, unsigned int direction, int blk_size); }; -- cgit v1.2.3 From 85c02ddd591e5252eb1cbe8743a839638d7415fd Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Tue, 13 Jan 2015 04:58:10 +0000 Subject: mmc: tmio: tmio_mmc_host has .multi_io_quirk Current .multi_io_quirk is implemented under tmio_mmc_data. It goes to tmio_mmc_host by this patch. 
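The quirk's contract, visible in the tmio_multi_io_quirk() consumer below: given a transfer direction and a block count, return the (possibly reduced) number of blocks the controller can safely handle in one request. Roughly what the sh_mobile_sdhi implementation does (paraphrased from the driver, not part of this diff):

	static int foo_multi_io_quirk(struct mmc_card *card,
				      unsigned int direction, int blk_size)
	{
		/*
		 * Work-around in the style of sh_mobile_sdhi: some
		 * controllers cannot reliably do a two-block multiple
		 * read, so fall back to single-block transfers there.
		 */
		if (direction == MMC_DATA_READ && blk_size == 2)
			return 1;

		return blk_size;
	}
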
Signed-off-by: Kuninori Morimoto Acked-by: Lee Jones Signed-off-by: Ulf Hansson --- drivers/mmc/host/sh_mobile_sdhi.c | 2 +- drivers/mmc/host/tmio_mmc.h | 2 ++ drivers/mmc/host/tmio_mmc_pio.c | 5 ++--- include/linux/mfd/tmio.h | 3 --- 4 files changed, 5 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/sh_mobile_sdhi.c b/drivers/mmc/host/sh_mobile_sdhi.c index 5e02a6a302e7..c92efe59284d 100644 --- a/drivers/mmc/host/sh_mobile_sdhi.c +++ b/drivers/mmc/host/sh_mobile_sdhi.c @@ -217,9 +217,9 @@ static int sh_mobile_sdhi_probe(struct platform_device *pdev) host->write16_hook = sh_mobile_sdhi_write16_hook; host->clk_enable = sh_mobile_sdhi_clk_enable; host->clk_disable = sh_mobile_sdhi_clk_disable; + host->multi_io_quirk = sh_mobile_sdhi_multi_io_quirk; mmc_data->capabilities = MMC_CAP_MMC_HIGHSPEED; - mmc_data->multi_io_quirk = sh_mobile_sdhi_multi_io_quirk; if (p) { mmc_data->flags = p->tmio_flags; mmc_data->ocr_mask = p->tmio_ocr_mask; diff --git a/drivers/mmc/host/tmio_mmc.h b/drivers/mmc/host/tmio_mmc.h index 28234ec5caa3..263256c897ed 100644 --- a/drivers/mmc/host/tmio_mmc.h +++ b/drivers/mmc/host/tmio_mmc.h @@ -98,6 +98,8 @@ struct tmio_mmc_host { int (*write16_hook)(struct tmio_mmc_host *host, int addr); int (*clk_enable)(struct platform_device *pdev, unsigned int *f); void (*clk_disable)(struct platform_device *pdev); + int (*multi_io_quirk)(struct mmc_card *card, + unsigned int direction, int blk_size); }; struct tmio_mmc_host *tmio_mmc_host_alloc(struct platform_device *pdev); diff --git a/drivers/mmc/host/tmio_mmc_pio.c b/drivers/mmc/host/tmio_mmc_pio.c index e33eb8ee5eb5..a31c3573d386 100644 --- a/drivers/mmc/host/tmio_mmc_pio.c +++ b/drivers/mmc/host/tmio_mmc_pio.c @@ -1004,10 +1004,9 @@ static int tmio_multi_io_quirk(struct mmc_card *card, unsigned int direction, int blk_size) { struct tmio_mmc_host *host = mmc_priv(card->host); - struct tmio_mmc_data *pdata = host->pdata; - if (pdata->multi_io_quirk) - return pdata->multi_io_quirk(card, direction, blk_size); + if (host->multi_io_quirk) + return host->multi_io_quirk(card, direction, blk_size); return blk_size; } diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index a3f78da70af7..3edaa17d1ccd 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -125,9 +125,6 @@ struct tmio_mmc_data { unsigned int cd_gpio; void (*set_pwr)(struct platform_device *host, int state); void (*set_clk_div)(struct platform_device *host, int state); - - int (*multi_io_quirk)(struct mmc_card *card, - unsigned int direction, int blk_size); }; /* -- cgit v1.2.3 From 7445bf9e6f4e5d7755e22c7c9b06f4ae0d6160c6 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Tue, 13 Jan 2015 04:58:20 +0000 Subject: mmc: tmio: tmio_mmc_host has .bus_shift Current .bus_shift is implemented under tmio_mmc_data. It goes to tmio_mmc_host by this patch. 
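Since bus_shift scales every register offset in the sd_ctrl_* accessors below, deriving it from the size of the mapped window is a small log2 trick. Worked examples matching the comments in the hunks above:

	/*
	 * sh_mobile_sdhi: window 0x100 -> 0x100 >> 9  == 0 (bus_shift 0)
	 *                 window 0x200 -> 0x200 >> 9  == 1 (bus_shift 1)
	 * tmio_mmc:       window 0x200 -> 0x200 >> 10 == 0
	 *                 window 0x400 -> 0x400 >> 10 == 1
	 * With bus_shift == 1, a 16-bit register at nominal offset 0x04
	 * is therefore accessed at ctl + 0x08:
	 */
	val = readw(host->ctl + (0x04 << host->bus_shift));
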
Signed-off-by: Kuninori Morimoto Acked-by: Lee Jones Signed-off-by: Ulf Hansson --- drivers/mmc/host/sh_mobile_sdhi.c | 5 ++--- drivers/mmc/host/tmio_mmc.c | 5 +++-- drivers/mmc/host/tmio_mmc.h | 17 +++++++++-------- drivers/mmc/host/tmio_mmc_dma.c | 2 +- include/linux/mfd/tmio.h | 1 - 5 files changed, 15 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/sh_mobile_sdhi.c b/drivers/mmc/host/sh_mobile_sdhi.c index c92efe59284d..0db2a0a0376b 100644 --- a/drivers/mmc/host/sh_mobile_sdhi.c +++ b/drivers/mmc/host/sh_mobile_sdhi.c @@ -218,6 +218,8 @@ static int sh_mobile_sdhi_probe(struct platform_device *pdev) host->clk_enable = sh_mobile_sdhi_clk_enable; host->clk_disable = sh_mobile_sdhi_clk_disable; host->multi_io_quirk = sh_mobile_sdhi_multi_io_quirk; + /* SD control register space size is 0x100, 0x200 for bus_shift=1 */ + host->bus_shift = resource_size(res) >> 9; mmc_data->capabilities = MMC_CAP_MMC_HIGHSPEED; if (p) { @@ -277,9 +279,6 @@ static int sh_mobile_sdhi_probe(struct platform_device *pdev) dma_priv->dma_rx_offset = of_data->dma_rx_offset; } - /* SD control register space size is 0x100, 0x200 for bus_shift=1 */ - mmc_data->bus_shift = resource_size(res) >> 9; - ret = tmio_mmc_host_probe(host, mmc_data); if (ret < 0) goto efree; diff --git a/drivers/mmc/host/tmio_mmc.c b/drivers/mmc/host/tmio_mmc.c index a7c2e459187c..f746df493892 100644 --- a/drivers/mmc/host/tmio_mmc.c +++ b/drivers/mmc/host/tmio_mmc.c @@ -88,14 +88,15 @@ static int tmio_mmc_probe(struct platform_device *pdev) if (!res) return -EINVAL; - /* SD control register space size is 0x200, 0x400 for bus_shift=1 */ - pdata->bus_shift = resource_size(res) >> 10; pdata->flags |= TMIO_MMC_HAVE_HIGH_REG; host = tmio_mmc_host_alloc(pdev); if (!host) goto cell_disable; + /* SD control register space size is 0x200, 0x400 for bus_shift=1 */ + host->bus_shift = resource_size(res) >> 10; + ret = tmio_mmc_host_probe(host, pdata); if (ret) goto host_free; diff --git a/drivers/mmc/host/tmio_mmc.h b/drivers/mmc/host/tmio_mmc.h index 263256c897ed..a5d30bfa7e64 100644 --- a/drivers/mmc/host/tmio_mmc.h +++ b/drivers/mmc/host/tmio_mmc.h @@ -66,6 +66,7 @@ struct tmio_mmc_host { struct scatterlist *sg_orig; unsigned int sg_len; unsigned int sg_off; + unsigned long bus_shift; struct platform_device *pdev; struct tmio_mmc_data *pdata; @@ -169,19 +170,19 @@ int tmio_mmc_host_runtime_resume(struct device *dev); static inline u16 sd_ctrl_read16(struct tmio_mmc_host *host, int addr) { - return readw(host->ctl + (addr << host->pdata->bus_shift)); + return readw(host->ctl + (addr << host->bus_shift)); } static inline void sd_ctrl_read16_rep(struct tmio_mmc_host *host, int addr, u16 *buf, int count) { - readsw(host->ctl + (addr << host->pdata->bus_shift), buf, count); + readsw(host->ctl + (addr << host->bus_shift), buf, count); } static inline u32 sd_ctrl_read32(struct tmio_mmc_host *host, int addr) { - return readw(host->ctl + (addr << host->pdata->bus_shift)) | - readw(host->ctl + ((addr + 2) << host->pdata->bus_shift)) << 16; + return readw(host->ctl + (addr << host->bus_shift)) | + readw(host->ctl + ((addr + 2) << host->bus_shift)) << 16; } static inline void sd_ctrl_write16(struct tmio_mmc_host *host, int addr, u16 val) @@ -191,19 +192,19 @@ static inline void sd_ctrl_write16(struct tmio_mmc_host *host, int addr, u16 val */ if (host->write16_hook && host->write16_hook(host, addr)) return; - writew(val, host->ctl + (addr << host->pdata->bus_shift)); + writew(val, host->ctl + (addr << host->bus_shift)); } 
static inline void sd_ctrl_write16_rep(struct tmio_mmc_host *host, int addr, u16 *buf, int count) { - writesw(host->ctl + (addr << host->pdata->bus_shift), buf, count); + writesw(host->ctl + (addr << host->bus_shift), buf, count); } static inline void sd_ctrl_write32(struct tmio_mmc_host *host, int addr, u32 val) { - writew(val, host->ctl + (addr << host->pdata->bus_shift)); - writew(val >> 16, host->ctl + ((addr + 2) << host->pdata->bus_shift)); + writew(val, host->ctl + (addr << host->bus_shift)); + writew(val >> 16, host->ctl + ((addr + 2) << host->bus_shift)); } diff --git a/drivers/mmc/host/tmio_mmc_dma.c b/drivers/mmc/host/tmio_mmc_dma.c index 6c214d60bbb6..ee0131eb078d 100644 --- a/drivers/mmc/host/tmio_mmc_dma.c +++ b/drivers/mmc/host/tmio_mmc_dma.c @@ -289,7 +289,7 @@ void tmio_mmc_request_dma(struct tmio_mmc_host *host, struct tmio_mmc_data *pdat if (host->dma->chan_priv_tx) cfg.slave_id = host->dma->slave_id_tx; cfg.direction = DMA_MEM_TO_DEV; - cfg.dst_addr = res->start + (CTL_SD_DATA_PORT << host->pdata->bus_shift); + cfg.dst_addr = res->start + (CTL_SD_DATA_PORT << host->bus_shift); cfg.dst_addr_width = DMA_SLAVE_BUSWIDTH_2_BYTES; cfg.src_addr = 0; ret = dmaengine_slave_config(host->chan_tx, &cfg); diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index 3edaa17d1ccd..7a5c27948315 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -120,7 +120,6 @@ struct tmio_mmc_data { unsigned long capabilities; unsigned long capabilities2; unsigned long flags; - unsigned long bus_shift; u32 ocr_mask; /* available voltages */ unsigned int cd_gpio; void (*set_pwr)(struct platform_device *host, int state); -- cgit v1.2.3 From e471df0bcaa137f1bbe7c5f75db6ce7566caa292 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Tue, 13 Jan 2015 04:58:46 +0000 Subject: mmc: tmio: tmio_mmc_data has .alignment_shift Current .alignment_shift is implemented under tmio_mmc_dma. It goes to tmio_mmc_data by this patch. Signed-off-by: Kuninori Morimoto Acked-by: Lee Jones Signed-off-by: Ulf Hansson --- drivers/mmc/host/sh_mobile_sdhi.c | 4 ++-- drivers/mmc/host/tmio_mmc.h | 1 - drivers/mmc/host/tmio_mmc_dma.c | 4 ++-- include/linux/mfd/tmio.h | 1 + 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/sh_mobile_sdhi.c b/drivers/mmc/host/sh_mobile_sdhi.c index 0db2a0a0376b..782d8cf942a7 100644 --- a/drivers/mmc/host/sh_mobile_sdhi.c +++ b/drivers/mmc/host/sh_mobile_sdhi.c @@ -241,10 +241,10 @@ static int sh_mobile_sdhi_probe(struct platform_device *pdev) dma_priv->slave_id_rx = p->dma_slave_rx; } } - - dma_priv->alignment_shift = 1; /* 2-byte alignment */ dma_priv->filter = shdma_chan_filter; + mmc_data->alignment_shift = 1; /* 2-byte alignment */ + /* * All SDHI blocks support 2-byte and larger block sizes in 4-bit * bus width mode. 
diff --git a/drivers/mmc/host/tmio_mmc.h b/drivers/mmc/host/tmio_mmc.h index a5d30bfa7e64..9482abceeb9c 100644 --- a/drivers/mmc/host/tmio_mmc.h +++ b/drivers/mmc/host/tmio_mmc.h @@ -45,7 +45,6 @@ struct tmio_mmc_dma { void *chan_priv_rx; int slave_id_tx; int slave_id_rx; - int alignment_shift; dma_addr_t dma_rx_offset; bool (*filter)(struct dma_chan *chan, void *arg); }; diff --git a/drivers/mmc/host/tmio_mmc_dma.c b/drivers/mmc/host/tmio_mmc_dma.c index ee0131eb078d..d2b02de2f378 100644 --- a/drivers/mmc/host/tmio_mmc_dma.c +++ b/drivers/mmc/host/tmio_mmc_dma.c @@ -52,7 +52,7 @@ static void tmio_mmc_start_dma_rx(struct tmio_mmc_host *host) dma_cookie_t cookie; int ret, i; bool aligned = true, multiple = true; - unsigned int align = (1 << host->dma->alignment_shift) - 1; + unsigned int align = (1 << host->pdata->alignment_shift) - 1; for_each_sg(sg, sg_tmp, host->sg_len, i) { if (sg_tmp->offset & align) @@ -128,7 +128,7 @@ static void tmio_mmc_start_dma_tx(struct tmio_mmc_host *host) dma_cookie_t cookie; int ret, i; bool aligned = true, multiple = true; - unsigned int align = (1 << host->dma->alignment_shift) - 1; + unsigned int align = (1 << host->pdata->alignment_shift) - 1; for_each_sg(sg, sg_tmp, host->sg_len, i) { if (sg_tmp->offset & align) diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index 7a5c27948315..28a12d10e509 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -122,6 +122,7 @@ struct tmio_mmc_data { unsigned long flags; u32 ocr_mask; /* available voltages */ unsigned int cd_gpio; + int alignment_shift; void (*set_pwr)(struct platform_device *host, int state); void (*set_clk_div)(struct platform_device *host, int state); }; -- cgit v1.2.3 From 8b4c8f32da91681c0dcd321c9e3cd14f866c5517 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Tue, 13 Jan 2015 04:58:56 +0000 Subject: mmc: tmio: tmio_mmc_data has .dma_rx_offset Current .dma_rx_offset is implemented under tmio_mmc_dma. It goes to tmio_mmc_data by this patch. 
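For context on what this field encodes: on SoCs where the DMA engine must fetch RX data at an address offset from the TX data port, the slave configuration (see the tmio_mmc_dma.c hunk below) adds the platform-provided offset. The 0x2000 value here is purely illustrative:

	/* from the tmio_mmc_dma.c hunk below; 0x2000 is an illustrative
	 * offset, not a value taken from this patch */
	cfg.dst_addr = res->start + (CTL_SD_DATA_PORT << host->bus_shift);
	cfg.src_addr = cfg.dst_addr + host->pdata->dma_rx_offset;	/* e.g. + 0x2000 */
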
Signed-off-by: Kuninori Morimoto Acked-by: Lee Jones Signed-off-by: Ulf Hansson --- drivers/mmc/host/sh_mobile_sdhi.c | 2 +- drivers/mmc/host/tmio_mmc.h | 1 - drivers/mmc/host/tmio_mmc_dma.c | 2 +- include/linux/mfd/tmio.h | 1 + 4 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/sh_mobile_sdhi.c b/drivers/mmc/host/sh_mobile_sdhi.c index 782d8cf942a7..c06e93b77995 100644 --- a/drivers/mmc/host/sh_mobile_sdhi.c +++ b/drivers/mmc/host/sh_mobile_sdhi.c @@ -276,7 +276,7 @@ static int sh_mobile_sdhi_probe(struct platform_device *pdev) mmc_data->flags |= of_data->tmio_flags; mmc_data->capabilities |= of_data->capabilities; mmc_data->capabilities2 |= of_data->capabilities2; - dma_priv->dma_rx_offset = of_data->dma_rx_offset; + mmc_data->dma_rx_offset = of_data->dma_rx_offset; } ret = tmio_mmc_host_probe(host, mmc_data); diff --git a/drivers/mmc/host/tmio_mmc.h b/drivers/mmc/host/tmio_mmc.h index 9482abceeb9c..c9a84695cff2 100644 --- a/drivers/mmc/host/tmio_mmc.h +++ b/drivers/mmc/host/tmio_mmc.h @@ -45,7 +45,6 @@ struct tmio_mmc_dma { void *chan_priv_rx; int slave_id_tx; int slave_id_rx; - dma_addr_t dma_rx_offset; bool (*filter)(struct dma_chan *chan, void *arg); }; diff --git a/drivers/mmc/host/tmio_mmc_dma.c b/drivers/mmc/host/tmio_mmc_dma.c index d2b02de2f378..634b2700cb77 100644 --- a/drivers/mmc/host/tmio_mmc_dma.c +++ b/drivers/mmc/host/tmio_mmc_dma.c @@ -308,7 +308,7 @@ void tmio_mmc_request_dma(struct tmio_mmc_host *host, struct tmio_mmc_data *pdat if (host->dma->chan_priv_rx) cfg.slave_id = host->dma->slave_id_rx; cfg.direction = DMA_DEV_TO_MEM; - cfg.src_addr = cfg.dst_addr + host->dma->dma_rx_offset; + cfg.src_addr = cfg.dst_addr + host->pdata->dma_rx_offset; cfg.src_addr_width = DMA_SLAVE_BUSWIDTH_2_BYTES; cfg.dst_addr = 0; ret = dmaengine_slave_config(host->chan_rx, &cfg); diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index 28a12d10e509..807ecfb162d7 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -123,6 +123,7 @@ struct tmio_mmc_data { u32 ocr_mask; /* available voltages */ unsigned int cd_gpio; int alignment_shift; + dma_addr_t dma_rx_offset; void (*set_pwr)(struct platform_device *host, int state); void (*set_clk_div)(struct platform_device *host, int state); }; -- cgit v1.2.3 From 010f4aa758f437647799b1fd677a5e2cf31714e9 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Tue, 13 Jan 2015 04:59:24 +0000 Subject: mmc: sh_mobile_sdhi: remove .init/.cleanup No one is using .init/.cleanup callback function. Let's remove these. 
sdhi_ops and .cd_wakeup are also removed Signed-off-by: Kuninori Morimoto Signed-off-by: Ulf Hansson --- drivers/mmc/host/sh_mobile_sdhi.c | 26 +------------------------- include/linux/mmc/sh_mobile_sdhi.h | 15 --------------- 2 files changed, 1 insertion(+), 40 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/sh_mobile_sdhi.c b/drivers/mmc/host/sh_mobile_sdhi.c index c06e93b77995..48a3be804cc0 100644 --- a/drivers/mmc/host/sh_mobile_sdhi.c +++ b/drivers/mmc/host/sh_mobile_sdhi.c @@ -156,15 +156,6 @@ static int sh_mobile_sdhi_multi_io_quirk(struct mmc_card *card, return blk_size; } -static void sh_mobile_sdhi_cd_wakeup(const struct platform_device *pdev) -{ - mmc_detect_change(platform_get_drvdata(pdev), msecs_to_jiffies(100)); -} - -static const struct sh_mobile_sdhi_ops sdhi_ops = { - .cd_wakeup = sh_mobile_sdhi_cd_wakeup, -}; - static int sh_mobile_sdhi_probe(struct platform_device *pdev) { const struct of_device_id *of_id = @@ -192,19 +183,11 @@ static int sh_mobile_sdhi_probe(struct platform_device *pdev) mmc_data = &priv->mmc_data; dma_priv = &priv->dma_priv; - if (p) { - if (p->init) { - ret = p->init(pdev, &sdhi_ops); - if (ret) - return ret; - } - } - priv->clk = devm_clk_get(&pdev->dev, NULL); if (IS_ERR(priv->clk)) { ret = PTR_ERR(priv->clk); dev_err(&pdev->dev, "cannot get clock: %d\n", ret); - goto eclkget; + goto eprobe; } host = tmio_mmc_host_alloc(pdev); @@ -359,9 +342,6 @@ eirq: efree: tmio_mmc_host_free(host); eprobe: -eclkget: - if (p && p->cleanup) - p->cleanup(pdev); return ret; } @@ -369,13 +349,9 @@ static int sh_mobile_sdhi_remove(struct platform_device *pdev) { struct mmc_host *mmc = platform_get_drvdata(pdev); struct tmio_mmc_host *host = mmc_priv(mmc); - struct sh_mobile_sdhi_info *p = pdev->dev.platform_data; tmio_mmc_host_remove(host); - if (p && p->cleanup) - p->cleanup(pdev); - return 0; } diff --git a/include/linux/mmc/sh_mobile_sdhi.h b/include/linux/mmc/sh_mobile_sdhi.h index 68927ae50845..da77e5e2041d 100644 --- a/include/linux/mmc/sh_mobile_sdhi.h +++ b/include/linux/mmc/sh_mobile_sdhi.h @@ -3,20 +3,10 @@ #include -struct platform_device; - #define SH_MOBILE_SDHI_IRQ_CARD_DETECT "card_detect" #define SH_MOBILE_SDHI_IRQ_SDCARD "sdcard" #define SH_MOBILE_SDHI_IRQ_SDIO "sdio" -/** - * struct sh_mobile_sdhi_ops - SDHI driver callbacks - * @cd_wakeup: trigger a card-detection run - */ -struct sh_mobile_sdhi_ops { - void (*cd_wakeup)(const struct platform_device *pdev); -}; - struct sh_mobile_sdhi_info { int dma_slave_tx; int dma_slave_rx; @@ -25,11 +15,6 @@ struct sh_mobile_sdhi_info { unsigned long tmio_caps2; u32 tmio_ocr_mask; /* available MMC voltages */ unsigned int cd_gpio; - - /* callbacks for board specific setup code */ - int (*init)(struct platform_device *pdev, - const struct sh_mobile_sdhi_ops *ops); - void (*cleanup)(struct platform_device *pdev); }; #endif /* LINUX_MMC_SH_MOBILE_SDHI_H */ -- cgit v1.2.3 From de122cb1745313f331dc7c7923b484343d455e64 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Tue, 13 Jan 2015 05:00:39 +0000 Subject: mmc: tmio: remove TMIO_MMC_HAVE_CTL_DMA_REG flag tmio_mmc_host has .enable_dma callback now. We don't need TMIO_MMC_HAVE_CTL_DMA_REG anymore. 
Let's remove it Signed-off-by: Kuninori Morimoto Acked-by: Lee Jones Signed-off-by: Ulf Hansson --- drivers/mmc/host/sh_mobile_sdhi.c | 5 ----- drivers/mmc/host/tmio_mmc_dma.c | 3 --- include/linux/mfd/tmio.h | 5 ----- 3 files changed, 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/sh_mobile_sdhi.c b/drivers/mmc/host/sh_mobile_sdhi.c index 7dbcb577f596..6906a905cd54 100644 --- a/drivers/mmc/host/sh_mobile_sdhi.c +++ b/drivers/mmc/host/sh_mobile_sdhi.c @@ -291,11 +291,6 @@ static int sh_mobile_sdhi_probe(struct platform_device *pdev) */ mmc_data->flags |= TMIO_MMC_SDIO_STATUS_QUIRK; - /* - * All SDHI have DMA control register - */ - mmc_data->flags |= TMIO_MMC_HAVE_CTL_DMA_REG; - if (of_id && of_id->data) { const struct sh_mobile_sdhi_of_data *of_data = of_id->data; mmc_data->flags |= of_data->tmio_flags; diff --git a/drivers/mmc/host/tmio_mmc_dma.c b/drivers/mmc/host/tmio_mmc_dma.c index aa5f4b6e790d..331bb618e398 100644 --- a/drivers/mmc/host/tmio_mmc_dma.c +++ b/drivers/mmc/host/tmio_mmc_dma.c @@ -28,9 +28,6 @@ void tmio_mmc_enable_dma(struct tmio_mmc_host *host, bool enable) if (!host->chan_tx || !host->chan_rx) return; - if (host->pdata->flags & TMIO_MMC_HAVE_CTL_DMA_REG) - sd_ctrl_write16(host, CTL_DMA_ENABLE, enable ? 2 : 0); - if (host->dma->enable) host->dma->enable(host, enable); } diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index 807ecfb162d7..605812820e48 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -95,11 +95,6 @@ */ #define TMIO_MMC_SDIO_STATUS_QUIRK (1 << 8) -/* - * Some controllers have DMA enable/disable register - */ -#define TMIO_MMC_HAVE_CTL_DMA_REG (1 << 9) - /* * Some controllers allows to set SDx actual clock */ -- cgit v1.2.3 From 1dfb4a0d7615811ec4a61b0a7631c8ddc0baf335 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 13 Jan 2015 08:00:29 +0100 Subject: gpio: stmpe: enforce device tree only mode Require that device tree be used with STMPE (all platforms use this) and enforce OF_GPIO, then delete the platform data. 
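With the platform data gone, the only way to mark GPIOs as unavailable is the device tree property read in the hunk below; a board that used to pass norequest_mask now expresses it in its DT. The fragment in the comment is illustrative:

	/*
	 * Illustrative DT fragment; the mask value matches the
	 * STMPE_GPIO_NOREQ_811_TOUCH define:
	 *
	 *	stmpe_gpio {
	 *		compatible = "st,stmpe-gpio";
	 *		st,norequest-mask = <0xf0>;
	 *	};
	 *
	 * consumed in probe by:
	 */
	of_property_read_u32(np, "st,norequest-mask",
			     &stmpe_gpio->norequest_mask);
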
Signed-off-by: Linus Walleij --- drivers/gpio/Kconfig | 1 + drivers/gpio/gpio-stmpe.c | 23 +++-------------------- include/linux/mfd/stmpe.h | 16 ---------------- 3 files changed, 4 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index 4187fcbc88fd..22b46567b3bb 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -644,6 +644,7 @@ config GPIO_SX150X config GPIO_STMPE bool "STMPE GPIOs" depends on MFD_STMPE + depends on OF_GPIO select GPIOLIB_IRQCHIP help This enables support for the GPIOs found on the STMPE I/O diff --git a/drivers/gpio/gpio-stmpe.c b/drivers/gpio/gpio-stmpe.c index 85c5b1974294..dabfb99dddef 100644 --- a/drivers/gpio/gpio-stmpe.c +++ b/drivers/gpio/gpio-stmpe.c @@ -30,7 +30,7 @@ struct stmpe_gpio { struct stmpe *stmpe; struct device *dev; struct mutex irq_lock; - unsigned norequest_mask; + u32 norequest_mask; /* Caches of interrupt control registers for bus_lock */ u8 regs[CACHE_NR_REGS][CACHE_NR_BANKS]; u8 oldregs[CACHE_NR_REGS][CACHE_NR_BANKS]; @@ -340,13 +340,10 @@ static int stmpe_gpio_probe(struct platform_device *pdev) { struct stmpe *stmpe = dev_get_drvdata(pdev->dev.parent); struct device_node *np = pdev->dev.of_node; - struct stmpe_gpio_platform_data *pdata; struct stmpe_gpio *stmpe_gpio; int ret; int irq = 0; - pdata = stmpe->pdata->gpio; - irq = platform_get_irq(pdev, 0); stmpe_gpio = kzalloc(sizeof(struct stmpe_gpio), GFP_KERNEL); @@ -360,19 +357,14 @@ static int stmpe_gpio_probe(struct platform_device *pdev) stmpe_gpio->chip = template_chip; stmpe_gpio->chip.ngpio = stmpe->num_gpios; stmpe_gpio->chip.dev = &pdev->dev; -#ifdef CONFIG_OF stmpe_gpio->chip.of_node = np; -#endif stmpe_gpio->chip.base = -1; if (IS_ENABLED(CONFIG_DEBUG_FS)) stmpe_gpio->chip.dbg_show = stmpe_dbg_show; - if (pdata) - stmpe_gpio->norequest_mask = pdata->norequest_mask; - else if (np) - of_property_read_u32(np, "st,norequest-mask", - &stmpe_gpio->norequest_mask); + of_property_read_u32(np, "st,norequest-mask", + &stmpe_gpio->norequest_mask); if (irq < 0) dev_info(&pdev->dev, @@ -414,9 +406,6 @@ static int stmpe_gpio_probe(struct platform_device *pdev) NULL); } - if (pdata && pdata->setup) - pdata->setup(stmpe, stmpe_gpio->chip.base); - platform_set_drvdata(pdev, stmpe_gpio); return 0; @@ -433,15 +422,9 @@ static int stmpe_gpio_remove(struct platform_device *pdev) { struct stmpe_gpio *stmpe_gpio = platform_get_drvdata(pdev); struct stmpe *stmpe = stmpe_gpio->stmpe; - struct stmpe_gpio_platform_data *pdata = stmpe->pdata->gpio; - - if (pdata && pdata->remove) - pdata->remove(stmpe, stmpe_gpio->chip.base); gpiochip_remove(&stmpe_gpio->chip); - stmpe_disable(stmpe, STMPE_BLOCK_GPIO); - kfree(stmpe_gpio); return 0; diff --git a/include/linux/mfd/stmpe.h b/include/linux/mfd/stmpe.h index f742b6717d52..c9d869027300 100644 --- a/include/linux/mfd/stmpe.h +++ b/include/linux/mfd/stmpe.h @@ -117,20 +117,6 @@ extern int stmpe_disable(struct stmpe *stmpe, unsigned int blocks); #define STMPE_GPIO_NOREQ_811_TOUCH (0xf0) -/** - * struct stmpe_gpio_platform_data - STMPE GPIO platform data - * @norequest_mask: bitmask specifying which GPIOs should _not_ be - * requestable due to different usage (e.g. touch, keypad) - * STMPE_GPIO_NOREQ_* macros can be used here. - * @setup: board specific setup callback. 
- * @remove: board specific remove callback - */ -struct stmpe_gpio_platform_data { - unsigned norequest_mask; - void (*setup)(struct stmpe *stmpe, unsigned gpio_base); - void (*remove)(struct stmpe *stmpe, unsigned gpio_base); -}; - /** * struct stmpe_ts_platform_data - stmpe811 touch screen controller platform * data @@ -182,7 +168,6 @@ struct stmpe_ts_platform_data { * @irq_over_gpio: true if gpio is used to get irq * @irq_gpio: gpio number over which irq will be requested (significant only if * irq_over_gpio is true) - * @gpio: GPIO-specific platform data * @ts: touchscreen-specific platform data */ struct stmpe_platform_data { @@ -194,7 +179,6 @@ struct stmpe_platform_data { int irq_gpio; int autosleep_timeout; - struct stmpe_gpio_platform_data *gpio; struct stmpe_ts_platform_data *ts; }; -- cgit v1.2.3 From 4b6eade76ad19183464b739e9af1efacdb1bbda8 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 20 Jan 2015 11:00:53 +0100 Subject: mfd: max77693: Add defines for MAX77693 charger driver Prepare for adding support for Maxim 77693 charger by adding necessary new defines. Signed-off-by: Krzysztof Kozlowski Acked-by: Lee Jones Signed-off-by: Sebastian Reichel --- include/linux/mfd/max77693-private.h | 108 +++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/max77693-private.h b/include/linux/mfd/max77693-private.h index 08dae01258b9..955dd990beaf 100644 --- a/include/linux/mfd/max77693-private.h +++ b/include/linux/mfd/max77693-private.h @@ -143,10 +143,118 @@ enum max77693_pmic_reg { #define FLASH_INT_FLED1_SHORT BIT(3) #define FLASH_INT_OVER_CURRENT BIT(4) +/* Fast charge timer in in hours */ +#define DEFAULT_FAST_CHARGE_TIMER 4 +/* microamps */ +#define DEFAULT_TOP_OFF_THRESHOLD_CURRENT 150000 +/* minutes */ +#define DEFAULT_TOP_OFF_TIMER 30 +/* microvolts */ +#define DEFAULT_CONSTANT_VOLT 4200000 +/* microvolts */ +#define DEFAULT_MIN_SYSTEM_VOLT 3600000 +/* celsius */ +#define DEFAULT_THERMAL_REGULATION_TEMP 100 +/* microamps */ +#define DEFAULT_BATTERY_OVERCURRENT 3500000 +/* microvolts */ +#define DEFAULT_CHARGER_INPUT_THRESHOLD_VOLT 4300000 + +/* MAX77693_CHG_REG_CHG_INT_OK register */ +#define CHG_INT_OK_BYP_SHIFT 0 +#define CHG_INT_OK_BAT_SHIFT 3 +#define CHG_INT_OK_CHG_SHIFT 4 +#define CHG_INT_OK_CHGIN_SHIFT 6 +#define CHG_INT_OK_DETBAT_SHIFT 7 +#define CHG_INT_OK_BYP_MASK BIT(CHG_INT_OK_BYP_SHIFT) +#define CHG_INT_OK_BAT_MASK BIT(CHG_INT_OK_BAT_SHIFT) +#define CHG_INT_OK_CHG_MASK BIT(CHG_INT_OK_CHG_SHIFT) +#define CHG_INT_OK_CHGIN_MASK BIT(CHG_INT_OK_CHGIN_SHIFT) +#define CHG_INT_OK_DETBAT_MASK BIT(CHG_INT_OK_DETBAT_SHIFT) + +/* MAX77693_CHG_REG_CHG_DETAILS_00 register */ +#define CHG_DETAILS_00_CHGIN_SHIFT 5 +#define CHG_DETAILS_00_CHGIN_MASK (0x3 << CHG_DETAILS_00_CHGIN_SHIFT) + +/* MAX77693_CHG_REG_CHG_DETAILS_01 register */ +#define CHG_DETAILS_01_CHG_SHIFT 0 +#define CHG_DETAILS_01_BAT_SHIFT 4 +#define CHG_DETAILS_01_TREG_SHIFT 7 +#define CHG_DETAILS_01_CHG_MASK (0xf << CHG_DETAILS_01_CHG_SHIFT) +#define CHG_DETAILS_01_BAT_MASK (0x7 << CHG_DETAILS_01_BAT_SHIFT) +#define CHG_DETAILS_01_TREG_MASK BIT(7) + +/* MAX77693_CHG_REG_CHG_DETAILS_01/CHG field */ +enum max77693_charger_charging_state { + MAX77693_CHARGING_PREQUALIFICATION = 0x0, + MAX77693_CHARGING_FAST_CONST_CURRENT, + MAX77693_CHARGING_FAST_CONST_VOLTAGE, + MAX77693_CHARGING_TOP_OFF, + MAX77693_CHARGING_DONE, + MAX77693_CHARGING_HIGH_TEMP, + MAX77693_CHARGING_TIMER_EXPIRED, + MAX77693_CHARGING_THERMISTOR_SUSPEND, + 
MAX77693_CHARGING_OFF, + MAX77693_CHARGING_RESERVED, + MAX77693_CHARGING_OVER_TEMP, + MAX77693_CHARGING_WATCHDOG_EXPIRED, +}; + +/* MAX77693_CHG_REG_CHG_DETAILS_01/BAT field */ +enum max77693_charger_battery_state { + MAX77693_BATTERY_NOBAT = 0x0, + /* Dead-battery or low-battery prequalification */ + MAX77693_BATTERY_PREQUALIFICATION, + MAX77693_BATTERY_TIMER_EXPIRED, + MAX77693_BATTERY_GOOD, + MAX77693_BATTERY_LOWVOLTAGE, + MAX77693_BATTERY_OVERVOLTAGE, + MAX77693_BATTERY_OVERCURRENT, + MAX77693_BATTERY_RESERVED, +}; + +/* MAX77693_CHG_REG_CHG_DETAILS_02 register */ +#define CHG_DETAILS_02_BYP_SHIFT 0 +#define CHG_DETAILS_02_BYP_MASK (0xf << CHG_DETAILS_02_BYP_SHIFT) + /* MAX77693 CHG_CNFG_00 register */ #define CHG_CNFG_00_CHG_MASK 0x1 #define CHG_CNFG_00_BUCK_MASK 0x4 +/* MAX77693_CHG_REG_CHG_CNFG_01 register */ +#define CHG_CNFG_01_FCHGTIME_SHIFT 0 +#define CHG_CNFG_01_CHGRSTRT_SHIFT 4 +#define CHG_CNFG_01_PQEN_SHIFT 7 +#define CHG_CNFG_01_FCHGTIME_MASK (0x7 << CHG_CNFG_01_FCHGTIME_SHIFT) +#define CHG_CNFG_01_CHGRSTRT_MASK (0x3 << CHG_CNFG_01_CHGRSTRT_SHIFT) +#define CHG_CNFG_01_PQEN_MAKS BIT(CHG_CNFG_01_PQEN_SHIFT) + +/* MAX77693_CHG_REG_CHG_CNFG_03 register */ +#define CHG_CNFG_03_TOITH_SHIFT 0 +#define CHG_CNFG_03_TOTIME_SHIFT 3 +#define CHG_CNFG_03_TOITH_MASK (0x7 << CHG_CNFG_03_TOITH_SHIFT) +#define CHG_CNFG_03_TOTIME_MASK (0x7 << CHG_CNFG_03_TOTIME_SHIFT) + +/* MAX77693_CHG_REG_CHG_CNFG_04 register */ +#define CHG_CNFG_04_CHGCVPRM_SHIFT 0 +#define CHG_CNFG_04_MINVSYS_SHIFT 5 +#define CHG_CNFG_04_CHGCVPRM_MASK (0x1f << CHG_CNFG_04_CHGCVPRM_SHIFT) +#define CHG_CNFG_04_MINVSYS_MASK (0x7 << CHG_CNFG_04_MINVSYS_SHIFT) + +/* MAX77693_CHG_REG_CHG_CNFG_06 register */ +#define CHG_CNFG_06_CHGPROT_SHIFT 2 +#define CHG_CNFG_06_CHGPROT_MASK (0x3 << CHG_CNFG_06_CHGPROT_SHIFT) + +/* MAX77693_CHG_REG_CHG_CNFG_07 register */ +#define CHG_CNFG_07_REGTEMP_SHIFT 5 +#define CHG_CNFG_07_REGTEMP_MASK (0x3 << CHG_CNFG_07_REGTEMP_SHIFT) + +/* MAX77693_CHG_REG_CHG_CNFG_12 register */ +#define CHG_CNFG_12_B2SOVRC_SHIFT 0 +#define CHG_CNFG_12_VCHGINREG_SHIFT 3 +#define CHG_CNFG_12_B2SOVRC_MASK (0x7 << CHG_CNFG_12_B2SOVRC_SHIFT) +#define CHG_CNFG_12_VCHGINREG_MASK (0x3 << CHG_CNFG_12_VCHGINREG_SHIFT) + /* MAX77693 CHG_CNFG_09 Register */ #define CHG_CNFG_09_CHGIN_ILIM_MASK 0x7F -- cgit v1.2.3 From 2fded7f44b8fcf79e274c3f0cfbd0298f95308f3 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Tue, 23 Dec 2014 16:39:54 -0500 Subject: audit: remove vestiges of vers_ops Should have been removed with commit 18900909 ("audit: remove the old depricated kernel interface"). 
Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- include/linux/audit.h | 1 - kernel/auditfilter.c | 2 -- 2 files changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/audit.h b/include/linux/audit.h index 93331929d643..b481779a8dee 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -46,7 +46,6 @@ struct audit_tree; struct sk_buff; struct audit_krule { - int vers_ops; u32 pflags; u32 flags; u32 listnr; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 103586e239a2..81c94d739e3f 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -425,7 +425,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, goto exit_nofree; bufp = data->buf; - entry->rule.vers_ops = 2; for (i = 0; i < data->field_count; i++) { struct audit_field *f = &entry->rule.fields[i]; @@ -758,7 +757,6 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old) return ERR_PTR(-ENOMEM); new = &entry->rule; - new->vers_ops = old->vers_ops; new->flags = old->flags; new->pflags = old->pflags; new->listnr = old->listnr; -- cgit v1.2.3 From ea2f83a7de9d0abbd145e37177905aab57fdb835 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Sun, 26 Oct 2014 23:17:00 +0000 Subject: arm/arm64: KVM: move kvm_register_device_ops() into vGIC probing Currently we unconditionally register the GICv2 emulation device during the host's KVM initialization. Since with GICv3 support we may end up with only v2 or only v3 or both supported, we move the registration into the GIC probing function, where we will later know which combination is valid. Signed-off-by: Andre Przywara Acked-by: Christoffer Dall Acked-by: Marc Zyngier Signed-off-by: Christoffer Dall --- include/linux/kvm_host.h | 1 + virt/kvm/arm/vgic-v2.c | 2 ++ virt/kvm/arm/vgic-v3.c | 1 + virt/kvm/arm/vgic.c | 5 ++--- 4 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3b934cc94cc8..25d7ce31a5d4 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1051,6 +1051,7 @@ void kvm_unregister_device_ops(u32 type); extern struct kvm_device_ops kvm_mpic_ops; extern struct kvm_device_ops kvm_xics_ops; +extern struct kvm_device_ops kvm_arm_vgic_v2_ops; #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT diff --git a/virt/kvm/arm/vgic-v2.c b/virt/kvm/arm/vgic-v2.c index 2935405ad22f..e1cd3cb95903 100644 --- a/virt/kvm/arm/vgic-v2.c +++ b/virt/kvm/arm/vgic-v2.c @@ -229,6 +229,8 @@ int vgic_v2_probe(struct device_node *vgic_node, goto out_unmap; } + kvm_register_device_ops(&kvm_arm_vgic_v2_ops, KVM_DEV_TYPE_ARM_VGIC_V2); + vgic->vcpu_base = vcpu_res.start; kvm_info("%s@%llx IRQ%d\n", vgic_node->name, diff --git a/virt/kvm/arm/vgic-v3.c b/virt/kvm/arm/vgic-v3.c index 1c2c8eef0599..d14c75f4a33b 100644 --- a/virt/kvm/arm/vgic-v3.c +++ b/virt/kvm/arm/vgic-v3.c @@ -230,6 +230,7 @@ int vgic_v3_probe(struct device_node *vgic_node, ret = -ENXIO; goto out; } + kvm_register_device_ops(&kvm_arm_vgic_v2_ops, KVM_DEV_TYPE_ARM_VGIC_V2); vgic->vcpu_base = vcpu_res.start; vgic->vctrl_base = NULL; diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index 9b63141a599d..69f6e7aa573e 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -2564,7 +2564,7 @@ static int vgic_create(struct kvm_device *dev, u32 type) return kvm_vgic_create(dev->kvm, type); } -static struct kvm_device_ops kvm_arm_vgic_v2_ops = { +struct kvm_device_ops kvm_arm_vgic_v2_ops = { .name = "kvm-arm-vgic", .create = vgic_create, .destroy 
= vgic_destroy, @@ -2643,8 +2643,7 @@ int kvm_vgic_hyp_init(void) on_each_cpu(vgic_init_maintenance_interrupt, NULL, 1); - return kvm_register_device_ops(&kvm_arm_vgic_v2_ops, - KVM_DEV_TYPE_ARM_VGIC_V2); + return 0; out_free_irq: free_percpu_irq(vgic->maint_irq, kvm_get_running_vcpus()); -- cgit v1.2.3 From a0675c25d6392c2197b796a60c4a2a0138c86355 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Sat, 7 Jun 2014 00:54:51 +0200 Subject: arm/arm64: KVM: add virtual GICv3 distributor emulation With everything separated and prepared, we implement a model of a GICv3 distributor and redistributors by using the existing framework to provide handler functions for each register group. Currently we limit the emulation to a model enforcing a single security state, with SRE==1 (forcing system register access) and ARE==1 (allowing more than 8 VCPUs). We share some of the functions provided for GICv2 emulation, but take the different ways of addressing (v)CPUs into account. Save and restore is currently not implemented. Similar to the split-off of the GICv2 specific code, the new emulation code goes into a new file (vgic-v3-emul.c). Signed-off-by: Andre Przywara Signed-off-by: Christoffer Dall --- arch/arm64/kvm/Makefile | 1 + include/kvm/arm_vgic.h | 9 +- include/linux/irqchip/arm-gic-v3.h | 32 ++ include/linux/kvm_host.h | 1 + include/uapi/linux/kvm.h | 2 + virt/kvm/arm/vgic-v3-emul.c | 922 +++++++++++++++++++++++++++++++++++++ virt/kvm/arm/vgic.c | 11 +- virt/kvm/arm/vgic.h | 3 + 8 files changed, 978 insertions(+), 3 deletions(-) create mode 100644 virt/kvm/arm/vgic-v3-emul.c (limited to 'include/linux') diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index d9573533a873..4e6e09ee4033 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -24,5 +24,6 @@ kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v2.o kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v2-emul.o kvm-$(CONFIG_KVM_ARM_VGIC) += vgic-v2-switch.o kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v3.o +kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v3-emul.o kvm-$(CONFIG_KVM_ARM_VGIC) += vgic-v3-switch.o kvm-$(CONFIG_KVM_ARM_TIMER) += $(KVM)/arm/arch_timer.o diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index ff04afd0d901..98c30168bce4 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -162,7 +162,11 @@ struct vgic_dist { /* Distributor and vcpu interface mapping in the guest */ phys_addr_t vgic_dist_base; - phys_addr_t vgic_cpu_base; + /* GICv2 and GICv3 use different mapped register blocks */ + union { + phys_addr_t vgic_cpu_base; + phys_addr_t vgic_redist_base; + }; /* Distributor enabled */ u32 enabled; @@ -224,6 +228,9 @@ struct vgic_dist { */ struct vgic_bitmap *irq_spi_target; + /* Target MPIDR for each IRQ (needed for GICv3 IROUTERn) only */ + u32 *irq_spi_mpidr; + /* Bitmap indicating which CPU has something pending */ unsigned long *irq_pending_on_cpu; diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 1e8b0cf30792..3fb4d8588a26 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -33,6 +33,7 @@ #define GICD_SETSPI_SR 0x0050 #define GICD_CLRSPI_SR 0x0058 #define GICD_SEIR 0x0068 +#define GICD_IGROUPR 0x0080 #define GICD_ISENABLER 0x0100 #define GICD_ICENABLER 0x0180 #define GICD_ISPENDR 0x0200 @@ -41,14 +42,37 @@ #define GICD_ICACTIVER 0x0380 #define GICD_IPRIORITYR 0x0400 #define GICD_ICFGR 0x0C00 +#define GICD_IGRPMODR 0x0D00 +#define GICD_NSACR 0x0E00 #define GICD_IROUTER 0x6000 +#define 
GICD_IDREGS 0xFFD0 #define GICD_PIDR2 0xFFE8 +/* + * Those registers are actually from GICv2, but the spec demands that they + * are implemented as RES0 if ARE is 1 (which we do in KVM's emulated GICv3). + */ +#define GICD_ITARGETSR 0x0800 +#define GICD_SGIR 0x0F00 +#define GICD_CPENDSGIR 0x0F10 +#define GICD_SPENDSGIR 0x0F20 + #define GICD_CTLR_RWP (1U << 31) +#define GICD_CTLR_DS (1U << 6) #define GICD_CTLR_ARE_NS (1U << 4) #define GICD_CTLR_ENABLE_G1A (1U << 1) #define GICD_CTLR_ENABLE_G1 (1U << 0) +/* + * In systems with a single security state (what we emulate in KVM) + * the meaning of the interrupt group enable bits is slightly different. + */ +#define GICD_CTLR_ENABLE_SS_G1 (1U << 1) +#define GICD_CTLR_ENABLE_SS_G0 (1U << 0) + +#define GICD_TYPER_MBIS (1U << 16) + #define GICD_TYPER_ID_BITS(typer) ((((typer) >> 19) & 0x1f) + 1) #define GICD_TYPER_IRQS(typer) ((((typer) & 0x1f) + 1) * 32) #define GICD_TYPER_LPIS (1U << 17) @@ -60,6 +84,8 @@ #define GIC_PIDR2_ARCH_GICv3 0x30 #define GIC_PIDR2_ARCH_GICv4 0x40 +#define GIC_V3_DIST_SIZE 0x10000 + /* * Re-Distributor registers, offsets from RD_base */ @@ -78,6 +104,7 @@ #define GICR_SYNCR 0x00C0 #define GICR_MOVLPIR 0x0100 #define GICR_MOVALLR 0x0110 +#define GICR_IDREGS GICD_IDREGS #define GICR_PIDR2 GICD_PIDR2 #define GICR_CTLR_ENABLE_LPIS (1UL << 0) @@ -104,6 +131,7 @@ /* * Re-Distributor registers, offsets from SGI_base */ +#define GICR_IGROUPR0 GICD_IGROUPR #define GICR_ISENABLER0 GICD_ISENABLER #define GICR_ICENABLER0 GICD_ICENABLER #define GICR_ISPENDR0 GICD_ISPENDR @@ -112,11 +140,15 @@ #define GICR_ICACTIVER0 GICD_ICACTIVER #define GICR_IPRIORITYR0 GICD_IPRIORITYR #define GICR_ICFGR0 GICD_ICFGR +#define GICR_IGRPMODR0 GICD_IGRPMODR +#define GICR_NSACR GICD_NSACR #define GICR_TYPER_PLPIS (1U << 0) #define GICR_TYPER_VLPIS (1U << 1) #define GICR_TYPER_LAST (1U << 4) +#define GIC_V3_REDIST_SIZE 0x20000 + #define LPI_PROP_GROUP1 (1 << 1) #define LPI_PROP_ENABLED (1 << 0) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 25d7ce31a5d4..0ef2daa199d8 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1052,6 +1052,7 @@ void kvm_unregister_device_ops(u32 type); extern struct kvm_device_ops kvm_mpic_ops; extern struct kvm_device_ops kvm_xics_ops; extern struct kvm_device_ops kvm_arm_vgic_v2_ops; +extern struct kvm_device_ops kvm_arm_vgic_v3_ops; #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index a37fd1224f36..b4e6f1e70f03 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -952,6 +952,8 @@ enum kvm_device_type { #define KVM_DEV_TYPE_ARM_VGIC_V2 KVM_DEV_TYPE_ARM_VGIC_V2 KVM_DEV_TYPE_FLIC, #define KVM_DEV_TYPE_FLIC KVM_DEV_TYPE_FLIC + KVM_DEV_TYPE_ARM_VGIC_V3, +#define KVM_DEV_TYPE_ARM_VGIC_V3 KVM_DEV_TYPE_ARM_VGIC_V3 KVM_DEV_TYPE_MAX, }; diff --git a/virt/kvm/arm/vgic-v3-emul.c b/virt/kvm/arm/vgic-v3-emul.c new file mode 100644 index 000000000000..8db1db597223 --- /dev/null +++ b/virt/kvm/arm/vgic-v3-emul.c @@ -0,0 +1,922 @@ +/* + * GICv3 distributor and redistributor emulation + * + * GICv3 emulation is currently only supported on a GICv3 host (because + * we rely on the hardware's CPU interface virtualization support), but + * it works on hardware both with and without the optional GICv2 backwards + * compatibility features. + * + * Limitations of the emulation: + * (RAZ/WI: read as zero, write ignore, RAO/WI: read as one, write ignore) + * - We do not support LPIs (yet).
TYPER.LPIS is reported as 0 and is RAZ/WI. + * - We do not support the message based interrupts (MBIs) triggered by + * writes to the GICD_{SET,CLR}SPI_* registers. TYPER.MBIS is reported as 0. + * - We do not support the (optional) backwards compatibility feature. + * GICD_CTLR.ARE resets to 1 and is RAO/WI. If the _host_ GIC supports + * the compatibility feature, you can use a GICv2 in the guest, though. + * - We only support a single security state. GICD_CTLR.DS is 1 and is RAO/WI. + * - Priorities are not emulated (same as the GICv2 emulation). Linux + * as a guest is fine with this, because it does not use priorities. + * - We only support Group1 interrupts. Again, Linux uses only those. + * + * Copyright (C) 2014 ARM Ltd. + * Author: Andre Przywara + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/cpu.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/interrupt.h> + +#include <linux/irqchip/arm-gic-v3.h> +#include <kvm/arm_vgic.h> + +#include <asm/kvm_emulate.h> +#include <asm/kvm_arm.h> +#include <asm/kvm_mmu.h> + +#include "vgic.h" + +static bool handle_mmio_rao_wi(struct kvm_vcpu *vcpu, + struct kvm_exit_mmio *mmio, phys_addr_t offset) +{ + u32 reg = 0xffffffff; + + vgic_reg_access(mmio, &reg, offset, + ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED); + + return false; +} + +static bool handle_mmio_ctlr(struct kvm_vcpu *vcpu, + struct kvm_exit_mmio *mmio, phys_addr_t offset) +{ + u32 reg = 0; + + /* + * Force ARE and DS to 1, the guest cannot change this. + * For the time being we only support Group1 interrupts. + */ + if (vcpu->kvm->arch.vgic.enabled) + reg = GICD_CTLR_ENABLE_SS_G1; + reg |= GICD_CTLR_ARE_NS | GICD_CTLR_DS; + + vgic_reg_access(mmio, &reg, offset, + ACCESS_READ_VALUE | ACCESS_WRITE_VALUE); + if (mmio->is_write) { + if (reg & GICD_CTLR_ENABLE_SS_G0) + kvm_info("guest tried to enable unsupported Group0 interrupts\n"); + vcpu->kvm->arch.vgic.enabled = !!(reg & GICD_CTLR_ENABLE_SS_G1); + vgic_update_state(vcpu->kvm); + return true; + } + return false; +} + +/* + * As this implementation does not provide compatibility + * with GICv2 (ARE==1), we report zero CPUs in bits [5..7]. + * LPIs and MBIs are not supported, so we set the respective bits to 0. + * We also report at most 2**10=1024 interrupt IDs (to match 1024 SPIs).
+ */ +#define INTERRUPT_ID_BITS 10 +static bool handle_mmio_typer(struct kvm_vcpu *vcpu, + struct kvm_exit_mmio *mmio, phys_addr_t offset) +{ + u32 reg; + + reg = (min(vcpu->kvm->arch.vgic.nr_irqs, 1024) >> 5) - 1; + + reg |= (INTERRUPT_ID_BITS - 1) << 19; + + vgic_reg_access(mmio, &reg, offset, + ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED); + + return false; +} + +static bool handle_mmio_iidr(struct kvm_vcpu *vcpu, + struct kvm_exit_mmio *mmio, phys_addr_t offset) +{ + u32 reg; + + reg = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0); + vgic_reg_access(mmio, &reg, offset, + ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED); + + return false; +} + +static bool handle_mmio_set_enable_reg_dist(struct kvm_vcpu *vcpu, + struct kvm_exit_mmio *mmio, + phys_addr_t offset) +{ + if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8)) + return vgic_handle_enable_reg(vcpu->kvm, mmio, offset, + vcpu->vcpu_id, + ACCESS_WRITE_SETBIT); + + vgic_reg_access(mmio, NULL, offset, + ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED); + return false; +} + +static bool handle_mmio_clear_enable_reg_dist(struct kvm_vcpu *vcpu, + struct kvm_exit_mmio *mmio, + phys_addr_t offset) +{ + if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8)) + return vgic_handle_enable_reg(vcpu->kvm, mmio, offset, + vcpu->vcpu_id, + ACCESS_WRITE_CLEARBIT); + + vgic_reg_access(mmio, NULL, offset, + ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED); + return false; +} + +static bool handle_mmio_set_pending_reg_dist(struct kvm_vcpu *vcpu, + struct kvm_exit_mmio *mmio, + phys_addr_t offset) +{ + if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8)) + return vgic_handle_set_pending_reg(vcpu->kvm, mmio, offset, + vcpu->vcpu_id); + + vgic_reg_access(mmio, NULL, offset, + ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED); + return false; +} + +static bool handle_mmio_clear_pending_reg_dist(struct kvm_vcpu *vcpu, + struct kvm_exit_mmio *mmio, + phys_addr_t offset) +{ + if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8)) + return vgic_handle_clear_pending_reg(vcpu->kvm, mmio, offset, + vcpu->vcpu_id); + + vgic_reg_access(mmio, NULL, offset, + ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED); + return false; +} + +static bool handle_mmio_priority_reg_dist(struct kvm_vcpu *vcpu, + struct kvm_exit_mmio *mmio, + phys_addr_t offset) +{ + u32 *reg; + + if (unlikely(offset < VGIC_NR_PRIVATE_IRQS)) { + vgic_reg_access(mmio, NULL, offset, + ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED); + return false; + } + + reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority, + vcpu->vcpu_id, offset); + vgic_reg_access(mmio, reg, offset, + ACCESS_READ_VALUE | ACCESS_WRITE_VALUE); + return false; +} + +static bool handle_mmio_cfg_reg_dist(struct kvm_vcpu *vcpu, + struct kvm_exit_mmio *mmio, + phys_addr_t offset) +{ + u32 *reg; + + if (unlikely(offset < VGIC_NR_PRIVATE_IRQS / 4)) { + vgic_reg_access(mmio, NULL, offset, + ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED); + return false; + } + + reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg, + vcpu->vcpu_id, offset >> 1); + + return vgic_handle_cfg_reg(reg, mmio, offset); +} + +/* + * We use a compressed version of the MPIDR (all 32 bits in one 32-bit word) + * when we store the target MPIDR written by the guest.
+ */ +static u32 compress_mpidr(unsigned long mpidr) +{ + u32 ret; + + ret = MPIDR_AFFINITY_LEVEL(mpidr, 0); + ret |= MPIDR_AFFINITY_LEVEL(mpidr, 1) << 8; + ret |= MPIDR_AFFINITY_LEVEL(mpidr, 2) << 16; + ret |= MPIDR_AFFINITY_LEVEL(mpidr, 3) << 24; + + return ret; +} + +static unsigned long uncompress_mpidr(u32 value) +{ + unsigned long mpidr; + + mpidr = ((value >> 0) & 0xFF) << MPIDR_LEVEL_SHIFT(0); + mpidr |= ((value >> 8) & 0xFF) << MPIDR_LEVEL_SHIFT(1); + mpidr |= ((value >> 16) & 0xFF) << MPIDR_LEVEL_SHIFT(2); + mpidr |= (u64)((value >> 24) & 0xFF) << MPIDR_LEVEL_SHIFT(3); + + return mpidr; +} + +/* + * Look up the given MPIDR value to get the vcpu_id (if there is one) + * and store that in the irq_spi_cpu[] array. + * This limits the number of VCPUs to 255 for now; extending the data + * type (or storing kvm_vcpu pointers) should lift the limit. + * Store the original MPIDR value in an extra array to support read-as-written. + * Unallocated MPIDRs are translated to a special value and caught + * before any array accesses. + */ +static bool handle_mmio_route_reg(struct kvm_vcpu *vcpu, + struct kvm_exit_mmio *mmio, + phys_addr_t offset) +{ + struct kvm *kvm = vcpu->kvm; + struct vgic_dist *dist = &kvm->arch.vgic; + int spi; + u32 reg; + int vcpu_id; + unsigned long *bmap, mpidr; + + /* + * The upper 32 bits of each 64 bit register are zero, + * as we don't support Aff3. + */ + if ((offset & 4)) { + vgic_reg_access(mmio, NULL, offset, + ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED); + return false; + } + + /* This region only covers SPIs, so no handling of private IRQs here. */ + spi = offset / 8; + + /* get the stored MPIDR for this IRQ */ + mpidr = uncompress_mpidr(dist->irq_spi_mpidr[spi]); + reg = mpidr; + + vgic_reg_access(mmio, &reg, offset, + ACCESS_READ_VALUE | ACCESS_WRITE_VALUE); + + if (!mmio->is_write) + return false; + + /* + * Now clear the currently assigned vCPU from the map, making room + * for the new one to be written below. + */ + vcpu = kvm_mpidr_to_vcpu(kvm, mpidr); + if (likely(vcpu)) { + vcpu_id = vcpu->vcpu_id; + bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]); + __clear_bit(spi, bmap); + } + + dist->irq_spi_mpidr[spi] = compress_mpidr(reg); + vcpu = kvm_mpidr_to_vcpu(kvm, reg & MPIDR_HWID_BITMASK); + + /* + * The spec says that non-existent MPIDR values should not be + * forwarded to any existent (v)CPU, but should be able to become + * pending anyway. We simply keep the irq_spi_target[] array empty, so + * the interrupt will never be injected. + * irq_spi_cpu[irq] gets a magic value in this case. + */ + if (likely(vcpu)) { + vcpu_id = vcpu->vcpu_id; + dist->irq_spi_cpu[spi] = vcpu_id; + bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]); + __set_bit(spi, bmap); + } else { + dist->irq_spi_cpu[spi] = VCPU_NOT_ALLOCATED; + } + + vgic_update_state(kvm); + + return true; +} + +/* + * We should be careful about promising too much when a guest reads + * this register. Don't claim to be like any hardware implementation, + * but just report the GIC as version 3 - which is what a Linux guest + * would check.
+ */ +static bool handle_mmio_idregs(struct kvm_vcpu *vcpu, + struct kvm_exit_mmio *mmio, + phys_addr_t offset) +{ + u32 reg = 0; + + switch (offset + GICD_IDREGS) { + case GICD_PIDR2: + reg = 0x3b; + break; + } + + vgic_reg_access(mmio, &reg, offset, + ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED); + + return false; +} + +static const struct kvm_mmio_range vgic_v3_dist_ranges[] = { + { + .base = GICD_CTLR, + .len = 0x04, + .bits_per_irq = 0, + .handle_mmio = handle_mmio_ctlr, + }, + { + .base = GICD_TYPER, + .len = 0x04, + .bits_per_irq = 0, + .handle_mmio = handle_mmio_typer, + }, + { + .base = GICD_IIDR, + .len = 0x04, + .bits_per_irq = 0, + .handle_mmio = handle_mmio_iidr, + }, + { + /* this register is optional, it is RAZ/WI if not implemented */ + .base = GICD_STATUSR, + .len = 0x04, + .bits_per_irq = 0, + .handle_mmio = handle_mmio_raz_wi, + }, + { + /* this write only register is WI when TYPER.MBIS=0 */ + .base = GICD_SETSPI_NSR, + .len = 0x04, + .bits_per_irq = 0, + .handle_mmio = handle_mmio_raz_wi, + }, + { + /* this write only register is WI when TYPER.MBIS=0 */ + .base = GICD_CLRSPI_NSR, + .len = 0x04, + .bits_per_irq = 0, + .handle_mmio = handle_mmio_raz_wi, + }, + { + /* this is RAZ/WI when DS=1 */ + .base = GICD_SETSPI_SR, + .len = 0x04, + .bits_per_irq = 0, + .handle_mmio = handle_mmio_raz_wi, + }, + { + /* this is RAZ/WI when DS=1 */ + .base = GICD_CLRSPI_SR, + .len = 0x04, + .bits_per_irq = 0, + .handle_mmio = handle_mmio_raz_wi, + }, + { + .base = GICD_IGROUPR, + .len = 0x80, + .bits_per_irq = 1, + .handle_mmio = handle_mmio_rao_wi, + }, + { + .base = GICD_ISENABLER, + .len = 0x80, + .bits_per_irq = 1, + .handle_mmio = handle_mmio_set_enable_reg_dist, + }, + { + .base = GICD_ICENABLER, + .len = 0x80, + .bits_per_irq = 1, + .handle_mmio = handle_mmio_clear_enable_reg_dist, + }, + { + .base = GICD_ISPENDR, + .len = 0x80, + .bits_per_irq = 1, + .handle_mmio = handle_mmio_set_pending_reg_dist, + }, + { + .base = GICD_ICPENDR, + .len = 0x80, + .bits_per_irq = 1, + .handle_mmio = handle_mmio_clear_pending_reg_dist, + }, + { + .base = GICD_ISACTIVER, + .len = 0x80, + .bits_per_irq = 1, + .handle_mmio = handle_mmio_raz_wi, + }, + { + .base = GICD_ICACTIVER, + .len = 0x80, + .bits_per_irq = 1, + .handle_mmio = handle_mmio_raz_wi, + }, + { + .base = GICD_IPRIORITYR, + .len = 0x400, + .bits_per_irq = 8, + .handle_mmio = handle_mmio_priority_reg_dist, + }, + { + /* TARGETSRn is RES0 when ARE=1 */ + .base = GICD_ITARGETSR, + .len = 0x400, + .bits_per_irq = 8, + .handle_mmio = handle_mmio_raz_wi, + }, + { + .base = GICD_ICFGR, + .len = 0x100, + .bits_per_irq = 2, + .handle_mmio = handle_mmio_cfg_reg_dist, + }, + { + /* this is RAZ/WI when DS=1 */ + .base = GICD_IGRPMODR, + .len = 0x80, + .bits_per_irq = 1, + .handle_mmio = handle_mmio_raz_wi, + }, + { + /* this is RAZ/WI when DS=1 */ + .base = GICD_NSACR, + .len = 0x100, + .bits_per_irq = 2, + .handle_mmio = handle_mmio_raz_wi, + }, + { + /* this is RAZ/WI when ARE=1 */ + .base = GICD_SGIR, + .len = 0x04, + .handle_mmio = handle_mmio_raz_wi, + }, + { + /* this is RAZ/WI when ARE=1 */ + .base = GICD_CPENDSGIR, + .len = 0x10, + .handle_mmio = handle_mmio_raz_wi, + }, + { + /* this is RAZ/WI when ARE=1 */ + .base = GICD_SPENDSGIR, + .len = 0x10, + .handle_mmio = handle_mmio_raz_wi, + }, + { + .base = GICD_IROUTER + 0x100, + .len = 0x1ee0, + .bits_per_irq = 64, + .handle_mmio = handle_mmio_route_reg, + }, + { + .base = GICD_IDREGS, + .len = 0x30, + .bits_per_irq = 0, + .handle_mmio = handle_mmio_idregs, + }, + {}, +}; + +static bool
handle_mmio_set_enable_reg_redist(struct kvm_vcpu *vcpu, + struct kvm_exit_mmio *mmio, + phys_addr_t offset) +{ + struct kvm_vcpu *redist_vcpu = mmio->private; + + return vgic_handle_enable_reg(vcpu->kvm, mmio, offset, + redist_vcpu->vcpu_id, + ACCESS_WRITE_SETBIT); +} + +static bool handle_mmio_clear_enable_reg_redist(struct kvm_vcpu *vcpu, + struct kvm_exit_mmio *mmio, + phys_addr_t offset) +{ + struct kvm_vcpu *redist_vcpu = mmio->private; + + return vgic_handle_enable_reg(vcpu->kvm, mmio, offset, + redist_vcpu->vcpu_id, + ACCESS_WRITE_CLEARBIT); +} + +static bool handle_mmio_set_pending_reg_redist(struct kvm_vcpu *vcpu, + struct kvm_exit_mmio *mmio, + phys_addr_t offset) +{ + struct kvm_vcpu *redist_vcpu = mmio->private; + + return vgic_handle_set_pending_reg(vcpu->kvm, mmio, offset, + redist_vcpu->vcpu_id); +} + +static bool handle_mmio_clear_pending_reg_redist(struct kvm_vcpu *vcpu, + struct kvm_exit_mmio *mmio, + phys_addr_t offset) +{ + struct kvm_vcpu *redist_vcpu = mmio->private; + + return vgic_handle_clear_pending_reg(vcpu->kvm, mmio, offset, + redist_vcpu->vcpu_id); +} + +static bool handle_mmio_priority_reg_redist(struct kvm_vcpu *vcpu, + struct kvm_exit_mmio *mmio, + phys_addr_t offset) +{ + struct kvm_vcpu *redist_vcpu = mmio->private; + u32 *reg; + + reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority, + redist_vcpu->vcpu_id, offset); + vgic_reg_access(mmio, reg, offset, + ACCESS_READ_VALUE | ACCESS_WRITE_VALUE); + return false; +} + +static bool handle_mmio_cfg_reg_redist(struct kvm_vcpu *vcpu, + struct kvm_exit_mmio *mmio, + phys_addr_t offset) +{ + struct kvm_vcpu *redist_vcpu = mmio->private; + + u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg, + redist_vcpu->vcpu_id, offset >> 1); + + return vgic_handle_cfg_reg(reg, mmio, offset); +} + +static const struct kvm_mmio_range vgic_redist_sgi_ranges[] = { + { + .base = GICR_IGROUPR0, + .len = 0x04, + .bits_per_irq = 1, + .handle_mmio = handle_mmio_rao_wi, + }, + { + .base = GICR_ISENABLER0, + .len = 0x04, + .bits_per_irq = 1, + .handle_mmio = handle_mmio_set_enable_reg_redist, + }, + { + .base = GICR_ICENABLER0, + .len = 0x04, + .bits_per_irq = 1, + .handle_mmio = handle_mmio_clear_enable_reg_redist, + }, + { + .base = GICR_ISPENDR0, + .len = 0x04, + .bits_per_irq = 1, + .handle_mmio = handle_mmio_set_pending_reg_redist, + }, + { + .base = GICR_ICPENDR0, + .len = 0x04, + .bits_per_irq = 1, + .handle_mmio = handle_mmio_clear_pending_reg_redist, + }, + { + .base = GICR_ISACTIVER0, + .len = 0x04, + .bits_per_irq = 1, + .handle_mmio = handle_mmio_raz_wi, + }, + { + .base = GICR_ICACTIVER0, + .len = 0x04, + .bits_per_irq = 1, + .handle_mmio = handle_mmio_raz_wi, + }, + { + .base = GICR_IPRIORITYR0, + .len = 0x20, + .bits_per_irq = 8, + .handle_mmio = handle_mmio_priority_reg_redist, + }, + { + .base = GICR_ICFGR0, + .len = 0x08, + .bits_per_irq = 2, + .handle_mmio = handle_mmio_cfg_reg_redist, + }, + { + .base = GICR_IGRPMODR0, + .len = 0x04, + .bits_per_irq = 1, + .handle_mmio = handle_mmio_raz_wi, + }, + { + .base = GICR_NSACR, + .len = 0x04, + .handle_mmio = handle_mmio_raz_wi, + }, + {}, +}; + +static bool handle_mmio_ctlr_redist(struct kvm_vcpu *vcpu, + struct kvm_exit_mmio *mmio, + phys_addr_t offset) +{ + /* since we don't support LPIs, this register is zero for now */ + vgic_reg_access(mmio, NULL, offset, + ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED); + return false; +} + +static bool handle_mmio_typer_redist(struct kvm_vcpu *vcpu, + struct kvm_exit_mmio *mmio, + phys_addr_t offset) +{ + u32 reg; + u64 
mpidr; + struct kvm_vcpu *redist_vcpu = mmio->private; + int target_vcpu_id = redist_vcpu->vcpu_id; + + /* the upper 32 bits contain the affinity value */ + if ((offset & ~3) == 4) { + mpidr = kvm_vcpu_get_mpidr_aff(redist_vcpu); + reg = compress_mpidr(mpidr); + + vgic_reg_access(mmio, &reg, offset, + ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED); + return false; + } + + reg = redist_vcpu->vcpu_id << 8; + if (target_vcpu_id == atomic_read(&vcpu->kvm->online_vcpus) - 1) + reg |= GICR_TYPER_LAST; + vgic_reg_access(mmio, &reg, offset, + ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED); + return false; +} + +static const struct kvm_mmio_range vgic_redist_ranges[] = { + { + .base = GICR_CTLR, + .len = 0x04, + .bits_per_irq = 0, + .handle_mmio = handle_mmio_ctlr_redist, + }, + { + .base = GICR_TYPER, + .len = 0x08, + .bits_per_irq = 0, + .handle_mmio = handle_mmio_typer_redist, + }, + { + .base = GICR_IIDR, + .len = 0x04, + .bits_per_irq = 0, + .handle_mmio = handle_mmio_iidr, + }, + { + .base = GICR_WAKER, + .len = 0x04, + .bits_per_irq = 0, + .handle_mmio = handle_mmio_raz_wi, + }, + { + .base = GICR_IDREGS, + .len = 0x30, + .bits_per_irq = 0, + .handle_mmio = handle_mmio_idregs, + }, + {}, +}; + +/* + * This function splits accesses between the distributor and the two + * redistributor parts (private/SPI). As each redistributor is accessible + * from any CPU, we have to determine the affected VCPU by taking the faulting + * address into account. We then pass this VCPU to the handler function via + * the private parameter. + */ +#define SGI_BASE_OFFSET SZ_64K +static bool vgic_v3_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run, + struct kvm_exit_mmio *mmio) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + unsigned long dbase = dist->vgic_dist_base; + unsigned long rdbase = dist->vgic_redist_base; + int nrcpus = atomic_read(&vcpu->kvm->online_vcpus); + int vcpu_id; + const struct kvm_mmio_range *mmio_range; + + if (is_in_range(mmio->phys_addr, mmio->len, dbase, GIC_V3_DIST_SIZE)) { + return vgic_handle_mmio_range(vcpu, run, mmio, + vgic_v3_dist_ranges, dbase); + } + + if (!is_in_range(mmio->phys_addr, mmio->len, rdbase, + GIC_V3_REDIST_SIZE * nrcpus)) + return false; + + vcpu_id = (mmio->phys_addr - rdbase) / GIC_V3_REDIST_SIZE; + rdbase += (vcpu_id * GIC_V3_REDIST_SIZE); + mmio->private = kvm_get_vcpu(vcpu->kvm, vcpu_id); + + if (mmio->phys_addr >= rdbase + SGI_BASE_OFFSET) { + rdbase += SGI_BASE_OFFSET; + mmio_range = vgic_redist_sgi_ranges; + } else { + mmio_range = vgic_redist_ranges; + } + return vgic_handle_mmio_range(vcpu, run, mmio, mmio_range, rdbase); +} + +static bool vgic_v3_queue_sgi(struct kvm_vcpu *vcpu, int irq) +{ + if (vgic_queue_irq(vcpu, 0, irq)) { + vgic_dist_irq_clear_pending(vcpu, irq); + vgic_cpu_irq_clear(vcpu, irq); + return true; + } + + return false; +} + +static int vgic_v3_map_resources(struct kvm *kvm, + const struct vgic_params *params) +{ + int ret = 0; + struct vgic_dist *dist = &kvm->arch.vgic; + + if (!irqchip_in_kernel(kvm)) + return 0; + + mutex_lock(&kvm->lock); + + if (vgic_ready(kvm)) + goto out; + + if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) || + IS_VGIC_ADDR_UNDEF(dist->vgic_redist_base)) { + kvm_err("Need to set vgic distributor addresses first\n"); + ret = -ENXIO; + goto out; + } + + /* + * For a VGICv3 we require the userland to explicitly initialize + * the VGIC before we need to use it.
+ */ + if (!vgic_initialized(kvm)) { + ret = -EBUSY; + goto out; + } + + kvm->arch.vgic.ready = true; +out: + if (ret) + kvm_vgic_destroy(kvm); + mutex_unlock(&kvm->lock); + return ret; +} + +static int vgic_v3_init_model(struct kvm *kvm) +{ + int i; + u32 mpidr; + struct vgic_dist *dist = &kvm->arch.vgic; + int nr_spis = dist->nr_irqs - VGIC_NR_PRIVATE_IRQS; + + dist->irq_spi_mpidr = kcalloc(nr_spis, sizeof(dist->irq_spi_mpidr[0]), + GFP_KERNEL); + + if (!dist->irq_spi_mpidr) + return -ENOMEM; + + /* Initialize the target VCPUs for each IRQ to VCPU 0 */ + mpidr = compress_mpidr(kvm_vcpu_get_mpidr_aff(kvm_get_vcpu(kvm, 0))); + for (i = VGIC_NR_PRIVATE_IRQS; i < dist->nr_irqs; i++) { + dist->irq_spi_cpu[i - VGIC_NR_PRIVATE_IRQS] = 0; + dist->irq_spi_mpidr[i - VGIC_NR_PRIVATE_IRQS] = mpidr; + vgic_bitmap_set_irq_val(dist->irq_spi_target, 0, i, 1); + } + + return 0; +} + +/* GICv3 does not keep track of SGI sources anymore. */ +static void vgic_v3_add_sgi_source(struct kvm_vcpu *vcpu, int irq, int source) +{ +} + +void vgic_v3_init_emulation(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + + dist->vm_ops.handle_mmio = vgic_v3_handle_mmio; + dist->vm_ops.queue_sgi = vgic_v3_queue_sgi; + dist->vm_ops.add_sgi_source = vgic_v3_add_sgi_source; + dist->vm_ops.init_model = vgic_v3_init_model; + dist->vm_ops.map_resources = vgic_v3_map_resources; + + kvm->arch.max_vcpus = KVM_MAX_VCPUS; +} + +static int vgic_v3_create(struct kvm_device *dev, u32 type) +{ + return kvm_vgic_create(dev->kvm, type); +} + +static void vgic_v3_destroy(struct kvm_device *dev) +{ + kfree(dev); +} + +static int vgic_v3_set_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + int ret; + + ret = vgic_set_common_attr(dev, attr); + if (ret != -ENXIO) + return ret; + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: + return -ENXIO; + } + + return -ENXIO; +} + +static int vgic_v3_get_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + int ret; + + ret = vgic_get_common_attr(dev, attr); + if (ret != -ENXIO) + return ret; + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: + return -ENXIO; + } + + return -ENXIO; +} + +static int vgic_v3_has_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_ADDR: + switch (attr->attr) { + case KVM_VGIC_V2_ADDR_TYPE_DIST: + case KVM_VGIC_V2_ADDR_TYPE_CPU: + return -ENXIO; + } + break; + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: + return -ENXIO; + case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: + return 0; + case KVM_DEV_ARM_VGIC_GRP_CTRL: + switch (attr->attr) { + case KVM_DEV_ARM_VGIC_CTRL_INIT: + return 0; + } + } + return -ENXIO; +} + +struct kvm_device_ops kvm_arm_vgic_v3_ops = { + .name = "kvm-arm-vgic-v3", + .create = vgic_v3_create, + .destroy = vgic_v3_destroy, + .set_attr = vgic_v3_set_attr, + .get_attr = vgic_v3_get_attr, + .has_attr = vgic_v3_has_attr, +}; diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index b6e17c886ce2..6d23e57c3561 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -1249,7 +1249,7 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid, struct kvm_vcpu *vcpu; int edge_triggered, level_triggered; int enabled; - bool ret = true; + bool ret = true, can_inject = true; spin_lock(&dist->lock); @@ -1264,6 +1264,11 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid, if (irq_num >= VGIC_NR_PRIVATE_IRQS) { 
cpuid = dist->irq_spi_cpu[irq_num - VGIC_NR_PRIVATE_IRQS]; + if (cpuid == VCPU_NOT_ALLOCATED) { + /* Pretend we use CPU0, and prevent injection */ + cpuid = 0; + can_inject = false; + } vcpu = kvm_get_vcpu(kvm, cpuid); } @@ -1286,7 +1291,7 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid, enabled = vgic_irq_is_enabled(vcpu, irq_num); - if (!enabled) { + if (!enabled || !can_inject) { ret = false; goto out; } @@ -1439,6 +1444,7 @@ void kvm_vgic_destroy(struct kvm *kvm) } kfree(dist->irq_sgi_sources); kfree(dist->irq_spi_cpu); + kfree(dist->irq_spi_mpidr); kfree(dist->irq_spi_target); kfree(dist->irq_pending_on_cpu); dist->irq_sgi_sources = NULL; @@ -1594,6 +1600,7 @@ int kvm_vgic_create(struct kvm *kvm, u32 type) kvm->arch.vgic.vctrl_base = vgic->vctrl_base; kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF; kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF; + kvm->arch.vgic.vgic_redist_base = VGIC_ADDR_UNDEF; out_unlock: for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) { diff --git a/virt/kvm/arm/vgic.h b/virt/kvm/arm/vgic.h index e363b9341873..1e83bdf5f499 100644 --- a/virt/kvm/arm/vgic.h +++ b/virt/kvm/arm/vgic.h @@ -35,6 +35,8 @@ #define ACCESS_WRITE_VALUE (3 << 1) #define ACCESS_WRITE_MASK(x) ((x) & (3 << 1)) +#define VCPU_NOT_ALLOCATED ((u8)-1) + unsigned long *vgic_bitmap_get_shared_map(struct vgic_bitmap *x); void vgic_update_state(struct kvm *kvm); @@ -116,5 +118,6 @@ int vgic_get_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr); int vgic_init(struct kvm *kvm); void vgic_v2_init_emulation(struct kvm *kvm); +void vgic_v3_init_emulation(struct kvm *kvm); #endif -- cgit v1.2.3 From 7e5802781c3e109558ddfd8b02155ad24d872ee7 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Wed, 12 Nov 2014 13:46:06 +0000 Subject: arm64: GICv3: introduce symbolic names for GICv3 ICC_SGI1R_EL1 fields The gic_send_sgi() function used hardcoded bit shift values to generate the ICC_SGI1R_EL1 register value. Replace this with symbolic names to allow reusing them later. 
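To illustrate the encoding (an editor's sketch under assumptions, not part of the patch): given the shift constants introduced below, an ICC_SGI1R_EL1 value can be composed field by field; the helper name sgi1r_compose() is hypothetical.

#include <stdint.h>

/* shift values as defined in the arm-gic-v3.h hunk below */
#define ICC_SGI1R_TARGET_LIST_SHIFT	0
#define ICC_SGI1R_AFFINITY_1_SHIFT	16
#define ICC_SGI1R_SGI_ID_SHIFT		24
#define ICC_SGI1R_AFFINITY_2_SHIFT	32
#define ICC_SGI1R_AFFINITY_3_SHIFT	48

/* hypothetical helper: build the register value for one SGI */
static uint64_t sgi1r_compose(uint8_t aff3, uint8_t aff2, uint8_t aff1,
			      uint8_t sgi_id, uint16_t target_list)
{
	return ((uint64_t)aff3 << ICC_SGI1R_AFFINITY_3_SHIFT) |
	       ((uint64_t)aff2 << ICC_SGI1R_AFFINITY_2_SHIFT) |
	       ((uint64_t)sgi_id << ICC_SGI1R_SGI_ID_SHIFT) |
	       ((uint64_t)aff1 << ICC_SGI1R_AFFINITY_1_SHIFT) |
	       ((uint64_t)target_list << ICC_SGI1R_TARGET_LIST_SHIFT);
}

This mirrors what gic_send_sgi() does after the change, with the MPIDR_TO_SGI_AFFINITY() macro supplying the affinity fields.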
Signed-off-by: Andre Przywara Reviewed-by: Christoffer Dall Signed-off-by: Christoffer Dall --- drivers/irqchip/irq-gic-v3.c | 14 +++++++++----- include/linux/irqchip/arm-gic-v3.h | 12 ++++++++++++ 2 files changed, 21 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index 1a146ccee701..2ab290bec655 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -481,15 +481,19 @@ out: return tlist; } +#define MPIDR_TO_SGI_AFFINITY(cluster_id, level) \ + (MPIDR_AFFINITY_LEVEL(cluster_id, level) \ + << ICC_SGI1R_AFFINITY_## level ##_SHIFT) + static void gic_send_sgi(u64 cluster_id, u16 tlist, unsigned int irq) { u64 val; - val = (MPIDR_AFFINITY_LEVEL(cluster_id, 3) << 48 | - MPIDR_AFFINITY_LEVEL(cluster_id, 2) << 32 | - irq << 24 | - MPIDR_AFFINITY_LEVEL(cluster_id, 1) << 16 | - tlist); + val = (MPIDR_TO_SGI_AFFINITY(cluster_id, 3) | + MPIDR_TO_SGI_AFFINITY(cluster_id, 2) | + irq << ICC_SGI1R_SGI_ID_SHIFT | + MPIDR_TO_SGI_AFFINITY(cluster_id, 1) | + tlist << ICC_SGI1R_TARGET_LIST_SHIFT); pr_debug("CPU%d: ICC_SGI1R_EL1 %llx\n", smp_processor_id(), val); gic_write_sgi1r(val); diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 3fb4d8588a26..800544bc7bfd 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -280,6 +280,18 @@ #define ICC_SRE_EL2_SRE (1 << 0) #define ICC_SRE_EL2_ENABLE (1 << 3) +#define ICC_SGI1R_TARGET_LIST_SHIFT 0 +#define ICC_SGI1R_TARGET_LIST_MASK (0xffff << ICC_SGI1R_TARGET_LIST_SHIFT) +#define ICC_SGI1R_AFFINITY_1_SHIFT 16 +#define ICC_SGI1R_AFFINITY_1_MASK (0xff << ICC_SGI1R_AFFINITY_1_SHIFT) +#define ICC_SGI1R_SGI_ID_SHIFT 24 +#define ICC_SGI1R_SGI_ID_MASK (0xff << ICC_SGI1R_SGI_ID_SHIFT) +#define ICC_SGI1R_AFFINITY_2_SHIFT 32 +#define ICC_SGI1R_AFFINITY_2_MASK (0xffULL << ICC_SGI1R_AFFINITY_2_SHIFT) +#define ICC_SGI1R_IRQ_ROUTING_MODE_BIT 40 +#define ICC_SGI1R_AFFINITY_3_SHIFT 48 +#define ICC_SGI1R_AFFINITY_3_MASK (0xffULL << ICC_SGI1R_AFFINITY_3_SHIFT) + /* * System register definitions */ -- cgit v1.2.3 From 39e794bff718188cfb9ace2032cbe4fd86048dc6 Mon Sep 17 00:00:00 2001 From: Yaniv Gardi Date: Thu, 15 Jan 2015 16:32:36 +0200 Subject: phy: qcom-ufs: add support for 20nm phy This change adds support for the 20nm qcom-ufs phy that is required on platforms that use the ufs-qcom controller.
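As background for the calibration tables this patch introduces (an illustrative sketch under assumptions, not the driver's actual code): calibration boils down to walking an array of register offset/value pairs and programming each one; the real work is done by the generic ufs_qcom_phy_calibrate() helper, whose implementation may differ from this outline.

/* Sketch only; apply_cal_table_sketch() is a hypothetical name. The
 * struct ufs_qcom_phy_calibration type is the one declared in
 * phy-qcom-ufs-i.h below (a reg_offset/cfg_value pair). */
static void apply_cal_table_sketch(void __iomem *mmio,
				   const struct ufs_qcom_phy_calibration *tbl,
				   int n)
{
	int i;

	/* program each offset/value pair from the table */
	for (i = 0; i < n; i++)
		writel_relaxed(tbl[i].cfg_value, mmio + tbl[i].reg_offset);
	mb();	/* order the register writes before starting the SerDes */
}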
Signed-off-by: Yaniv Gardi Reviewed-by: Dov Levenglick Signed-off-by: Christoph Hellwig --- drivers/phy/Makefile | 1 + drivers/phy/phy-qcom-ufs-i.h | 43 +++++- drivers/phy/phy-qcom-ufs-qmp-20nm.c | 257 ++++++++++++++++++++++++++++++++++++ drivers/phy/phy-qcom-ufs-qmp-20nm.h | 235 +++++++++++++++++++++++++++++++++ include/linux/phy/phy-qcom-ufs.h | 59 +++++++++ 5 files changed, 594 insertions(+), 1 deletion(-) create mode 100644 drivers/phy/phy-qcom-ufs-qmp-20nm.c create mode 100644 drivers/phy/phy-qcom-ufs-qmp-20nm.h create mode 100644 include/linux/phy/phy-qcom-ufs.h (limited to 'include/linux') diff --git a/drivers/phy/Makefile b/drivers/phy/Makefile index 335965d3a9b7..781b2fa46d09 100644 --- a/drivers/phy/Makefile +++ b/drivers/phy/Makefile @@ -35,3 +35,4 @@ obj-$(CONFIG_PHY_XGENE) += phy-xgene.o obj-$(CONFIG_PHY_STIH407_USB) += phy-stih407-usb.o obj-$(CONFIG_PHY_STIH41X_USB) += phy-stih41x-usb.o obj-$(CONFIG_PHY_QCOM_UFS) += phy-qcom-ufs.o +obj-$(CONFIG_PHY_QCOM_UFS) += phy-qcom-ufs-qmp-20nm.o diff --git a/drivers/phy/phy-qcom-ufs-i.h b/drivers/phy/phy-qcom-ufs-i.h index dac200f4d639..591a39175e8a 100644 --- a/drivers/phy/phy-qcom-ufs-i.h +++ b/drivers/phy/phy-qcom-ufs-i.h @@ -15,15 +15,56 @@ #ifndef UFS_QCOM_PHY_I_H_ #define UFS_QCOM_PHY_I_H_ +#include #include +#include #include -#include +#include #include #include #include +#define readl_poll_timeout(addr, val, cond, sleep_us, timeout_us) \ +({ \ + ktime_t timeout = ktime_add_us(ktime_get(), timeout_us); \ + might_sleep_if(timeout_us); \ + for (;;) { \ + (val) = readl(addr); \ + if (cond) \ + break; \ + if (timeout_us && ktime_compare(ktime_get(), timeout) > 0) { \ + (val) = readl(addr); \ + break; \ + } \ + if (sleep_us) \ + usleep_range(DIV_ROUND_UP(sleep_us, 4), sleep_us); \ + } \ + (cond) ? 0 : -ETIMEDOUT; \ +}) + +#define UFS_QCOM_PHY_CAL_ENTRY(reg, val) \ + { \ + .reg_offset = reg, \ + .cfg_value = val, \ + } + #define UFS_QCOM_PHY_NAME_LEN 30 +enum { + MASK_SERDES_START = 0x1, + MASK_PCS_READY = 0x1, +}; + +enum { + OFFSET_SERDES_START = 0x0, +}; + +struct ufs_qcom_phy_stored_attributes { + u32 att; + u32 value; +}; + + struct ufs_qcom_phy_calibration { u32 reg_offset; u32 cfg_value; diff --git a/drivers/phy/phy-qcom-ufs-qmp-20nm.c b/drivers/phy/phy-qcom-ufs-qmp-20nm.c new file mode 100644 index 000000000000..8332f96b2c4a --- /dev/null +++ b/drivers/phy/phy-qcom-ufs-qmp-20nm.c @@ -0,0 +1,257 @@ +/* + * Copyright (c) 2013-2015, Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + */ + +#include "phy-qcom-ufs-qmp-20nm.h" + +#define UFS_PHY_NAME "ufs_phy_qmp_20nm" + +static +int ufs_qcom_phy_qmp_20nm_phy_calibrate(struct ufs_qcom_phy *ufs_qcom_phy, + bool is_rate_B) +{ + struct ufs_qcom_phy_calibration *tbl_A, *tbl_B; + int tbl_size_A, tbl_size_B; + u8 major = ufs_qcom_phy->host_ctrl_rev_major; + u16 minor = ufs_qcom_phy->host_ctrl_rev_minor; + u16 step = ufs_qcom_phy->host_ctrl_rev_step; + int err; + + if ((major == 0x1) && (minor == 0x002) && (step == 0x0000)) { + tbl_size_A = ARRAY_SIZE(phy_cal_table_rate_A_1_2_0); + tbl_A = phy_cal_table_rate_A_1_2_0; + } else if ((major == 0x1) && (minor == 0x003) && (step == 0x0000)) { + tbl_size_A = ARRAY_SIZE(phy_cal_table_rate_A_1_3_0); + tbl_A = phy_cal_table_rate_A_1_3_0; + } else { + dev_err(ufs_qcom_phy->dev, "%s: Unknown UFS-PHY version, no calibration values\n", + __func__); + err = -ENODEV; + goto out; + } + + tbl_size_B = ARRAY_SIZE(phy_cal_table_rate_B); + tbl_B = phy_cal_table_rate_B; + + err = ufs_qcom_phy_calibrate(ufs_qcom_phy, tbl_A, tbl_size_A, + tbl_B, tbl_size_B, is_rate_B); + + if (err) + dev_err(ufs_qcom_phy->dev, "%s: ufs_qcom_phy_calibrate() failed %d\n", + __func__, err); + +out: + return err; +} + +static +void ufs_qcom_phy_qmp_20nm_advertise_quirks(struct ufs_qcom_phy *phy_common) +{ + phy_common->quirks = + UFS_QCOM_PHY_QUIRK_HIBERN8_EXIT_AFTER_PHY_PWR_COLLAPSE; +} + +static int ufs_qcom_phy_qmp_20nm_init(struct phy *generic_phy) +{ + struct ufs_qcom_phy_qmp_20nm *phy = phy_get_drvdata(generic_phy); + struct ufs_qcom_phy *phy_common = &phy->common_cfg; + int err = 0; + + err = ufs_qcom_phy_init_clks(generic_phy, phy_common); + if (err) { + dev_err(phy_common->dev, "%s: ufs_qcom_phy_init_clks() failed %d\n", + __func__, err); + goto out; + } + + err = ufs_qcom_phy_init_vregulators(generic_phy, phy_common); + if (err) { + dev_err(phy_common->dev, "%s: ufs_qcom_phy_init_vregulators() failed %d\n", + __func__, err); + goto out; + } + + ufs_qcom_phy_qmp_20nm_advertise_quirks(phy_common); + +out: + return err; +} + +static +void ufs_qcom_phy_qmp_20nm_power_control(struct ufs_qcom_phy *phy, bool val) +{ + bool hibern8_exit_after_pwr_collapse = phy->quirks & + UFS_QCOM_PHY_QUIRK_HIBERN8_EXIT_AFTER_PHY_PWR_COLLAPSE; + + if (val) { + writel_relaxed(0x1, phy->mmio + UFS_PHY_POWER_DOWN_CONTROL); + /* + * Before any transactions involving PHY, ensure PHY knows + * that its analog rail is powered ON. + */ + mb(); + + if (hibern8_exit_after_pwr_collapse) { + /* + * Give at least a 1us delay after restoring PHY analog + * power. + */ + usleep_range(1, 2); + writel_relaxed(0x0A, phy->mmio + + QSERDES_COM_SYSCLK_EN_SEL_TXBAND); + writel_relaxed(0x08, phy->mmio + + QSERDES_COM_SYSCLK_EN_SEL_TXBAND); + /* + * Make sure workaround is deactivated before proceeding + * with normal PHY operations. + */ + mb(); + } + } else { + if (hibern8_exit_after_pwr_collapse) { + writel_relaxed(0x0A, phy->mmio + + QSERDES_COM_SYSCLK_EN_SEL_TXBAND); + writel_relaxed(0x02, phy->mmio + + QSERDES_COM_SYSCLK_EN_SEL_TXBAND); + /* + * Make sure that the above workaround is activated before + * PHY analog power collapse.
+ */ + mb(); + } + + writel_relaxed(0x0, phy->mmio + UFS_PHY_POWER_DOWN_CONTROL); + /* + * ensure that PHY knows its PHY analog rail is going + * to be powered down + */ + mb(); + } +} + +static +void ufs_qcom_phy_qmp_20nm_set_tx_lane_enable(struct ufs_qcom_phy *phy, u32 val) +{ + writel_relaxed(val & UFS_PHY_TX_LANE_ENABLE_MASK, + phy->mmio + UFS_PHY_TX_LANE_ENABLE); + mb(); +} + +static inline void ufs_qcom_phy_qmp_20nm_start_serdes(struct ufs_qcom_phy *phy) +{ + u32 tmp; + + tmp = readl_relaxed(phy->mmio + UFS_PHY_PHY_START); + tmp &= ~MASK_SERDES_START; + tmp |= (1 << OFFSET_SERDES_START); + writel_relaxed(tmp, phy->mmio + UFS_PHY_PHY_START); + mb(); +} + +static int ufs_qcom_phy_qmp_20nm_is_pcs_ready(struct ufs_qcom_phy *phy_common) +{ + int err = 0; + u32 val; + + err = readl_poll_timeout(phy_common->mmio + UFS_PHY_PCS_READY_STATUS, + val, (val & MASK_PCS_READY), 10, 1000000); + if (err) + dev_err(phy_common->dev, "%s: poll for pcs failed err = %d\n", + __func__, err); + return err; +} + +static struct phy_ops ufs_qcom_phy_qmp_20nm_phy_ops = { + .init = ufs_qcom_phy_qmp_20nm_init, + .exit = ufs_qcom_phy_exit, + .power_on = ufs_qcom_phy_power_on, + .power_off = ufs_qcom_phy_power_off, + .owner = THIS_MODULE, +}; + +static struct ufs_qcom_phy_specific_ops phy_20nm_ops = { + .calibrate_phy = ufs_qcom_phy_qmp_20nm_phy_calibrate, + .start_serdes = ufs_qcom_phy_qmp_20nm_start_serdes, + .is_physical_coding_sublayer_ready = ufs_qcom_phy_qmp_20nm_is_pcs_ready, + .set_tx_lane_enable = ufs_qcom_phy_qmp_20nm_set_tx_lane_enable, + .power_control = ufs_qcom_phy_qmp_20nm_power_control, +}; + +static int ufs_qcom_phy_qmp_20nm_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct phy *generic_phy; + struct ufs_qcom_phy_qmp_20nm *phy; + int err = 0; + + phy = devm_kzalloc(dev, sizeof(*phy), GFP_KERNEL); + if (!phy) { + dev_err(dev, "%s: failed to allocate phy\n", __func__); + err = -ENOMEM; + goto out; + } + + generic_phy = ufs_qcom_phy_generic_probe(pdev, &phy->common_cfg, + &ufs_qcom_phy_qmp_20nm_phy_ops, &phy_20nm_ops); + + if (!generic_phy) { + dev_err(dev, "%s: ufs_qcom_phy_generic_probe() failed\n", + __func__); + err = -EIO; + goto out; + } + + phy_set_drvdata(generic_phy, phy); + + strlcpy(phy->common_cfg.name, UFS_PHY_NAME, + sizeof(phy->common_cfg.name)); + +out: + return err; +} + +static int ufs_qcom_phy_qmp_20nm_remove(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct phy *generic_phy = to_phy(dev); + struct ufs_qcom_phy *ufs_qcom_phy = get_ufs_qcom_phy(generic_phy); + int err = 0; + + err = ufs_qcom_phy_remove(generic_phy, ufs_qcom_phy); + if (err) + dev_err(dev, "%s: ufs_qcom_phy_remove failed = %d\n", + __func__, err); + + return err; +} + +static const struct of_device_id ufs_qcom_phy_qmp_20nm_of_match[] = { + {.compatible = "qcom,ufs-phy-qmp-20nm"}, + {}, +}; +MODULE_DEVICE_TABLE(of, ufs_qcom_phy_qmp_20nm_of_match); + +static struct platform_driver ufs_qcom_phy_qmp_20nm_driver = { + .probe = ufs_qcom_phy_qmp_20nm_probe, + .remove = ufs_qcom_phy_qmp_20nm_remove, + .driver = { + .of_match_table = ufs_qcom_phy_qmp_20nm_of_match, + .name = "ufs_qcom_phy_qmp_20nm", + .owner = THIS_MODULE, + }, +}; + +module_platform_driver(ufs_qcom_phy_qmp_20nm_driver); + +MODULE_DESCRIPTION("Universal Flash Storage (UFS) QCOM PHY QMP 20nm"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/phy/phy-qcom-ufs-qmp-20nm.h b/drivers/phy/phy-qcom-ufs-qmp-20nm.h new file mode 100644 index 000000000000..4f3076bb3d71 --- /dev/null +++ 
b/drivers/phy/phy-qcom-ufs-qmp-20nm.h @@ -0,0 +1,235 @@ +/* + * Copyright (c) 2013-2015, Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#ifndef UFS_QCOM_PHY_QMP_20NM_H_ +#define UFS_QCOM_PHY_QMP_20NM_H_ + +#include "phy-qcom-ufs-i.h" + +/* QCOM UFS PHY control registers */ + +#define COM_OFF(x) (0x000 + x) +#define PHY_OFF(x) (0xC00 + x) +#define TX_OFF(n, x) (0x400 + (0x400 * n) + x) +#define RX_OFF(n, x) (0x600 + (0x400 * n) + x) + +/* UFS PHY PLL block registers */ +#define QSERDES_COM_SYS_CLK_CTRL COM_OFF(0x0) +#define QSERDES_COM_PLL_VCOTAIL_EN COM_OFF(0x04) +#define QSERDES_COM_PLL_CNTRL COM_OFF(0x14) +#define QSERDES_COM_PLL_IP_SETI COM_OFF(0x24) +#define QSERDES_COM_CORE_CLK_IN_SYNC_SEL COM_OFF(0x28) +#define QSERDES_COM_BIAS_EN_CLKBUFLR_EN COM_OFF(0x30) +#define QSERDES_COM_PLL_CP_SETI COM_OFF(0x34) +#define QSERDES_COM_PLL_IP_SETP COM_OFF(0x38) +#define QSERDES_COM_PLL_CP_SETP COM_OFF(0x3C) +#define QSERDES_COM_SYSCLK_EN_SEL_TXBAND COM_OFF(0x48) +#define QSERDES_COM_RESETSM_CNTRL COM_OFF(0x4C) +#define QSERDES_COM_RESETSM_CNTRL2 COM_OFF(0x50) +#define QSERDES_COM_PLLLOCK_CMP1 COM_OFF(0x90) +#define QSERDES_COM_PLLLOCK_CMP2 COM_OFF(0x94) +#define QSERDES_COM_PLLLOCK_CMP3 COM_OFF(0x98) +#define QSERDES_COM_PLLLOCK_CMP_EN COM_OFF(0x9C) +#define QSERDES_COM_BGTC COM_OFF(0xA0) +#define QSERDES_COM_DEC_START1 COM_OFF(0xAC) +#define QSERDES_COM_PLL_AMP_OS COM_OFF(0xB0) +#define QSERDES_COM_RES_CODE_UP_OFFSET COM_OFF(0xD8) +#define QSERDES_COM_RES_CODE_DN_OFFSET COM_OFF(0xDC) +#define QSERDES_COM_DIV_FRAC_START1 COM_OFF(0x100) +#define QSERDES_COM_DIV_FRAC_START2 COM_OFF(0x104) +#define QSERDES_COM_DIV_FRAC_START3 COM_OFF(0x108) +#define QSERDES_COM_DEC_START2 COM_OFF(0x10C) +#define QSERDES_COM_PLL_RXTXEPCLK_EN COM_OFF(0x110) +#define QSERDES_COM_PLL_CRCTRL COM_OFF(0x114) +#define QSERDES_COM_PLL_CLKEPDIV COM_OFF(0x118) + +/* TX LANE n (0, 1) registers */ +#define QSERDES_TX_EMP_POST1_LVL(n) TX_OFF(n, 0x08) +#define QSERDES_TX_DRV_LVL(n) TX_OFF(n, 0x0C) +#define QSERDES_TX_LANE_MODE(n) TX_OFF(n, 0x54) + +/* RX LANE n (0, 1) registers */ +#define QSERDES_RX_CDR_CONTROL1(n) RX_OFF(n, 0x0) +#define QSERDES_RX_CDR_CONTROL_HALF(n) RX_OFF(n, 0x8) +#define QSERDES_RX_RX_EQ_GAIN1_LSB(n) RX_OFF(n, 0xA8) +#define QSERDES_RX_RX_EQ_GAIN1_MSB(n) RX_OFF(n, 0xAC) +#define QSERDES_RX_RX_EQ_GAIN2_LSB(n) RX_OFF(n, 0xB0) +#define QSERDES_RX_RX_EQ_GAIN2_MSB(n) RX_OFF(n, 0xB4) +#define QSERDES_RX_RX_EQU_ADAPTOR_CNTRL2(n) RX_OFF(n, 0xBC) +#define QSERDES_RX_CDR_CONTROL_QUARTER(n) RX_OFF(n, 0xC) +#define QSERDES_RX_SIGDET_CNTRL(n) RX_OFF(n, 0x100) + +/* UFS PHY registers */ +#define UFS_PHY_PHY_START PHY_OFF(0x00) +#define UFS_PHY_POWER_DOWN_CONTROL PHY_OFF(0x4) +#define UFS_PHY_TX_LANE_ENABLE PHY_OFF(0x44) +#define UFS_PHY_PWM_G1_CLK_DIVIDER PHY_OFF(0x08) +#define UFS_PHY_PWM_G2_CLK_DIVIDER PHY_OFF(0x0C) +#define UFS_PHY_PWM_G3_CLK_DIVIDER PHY_OFF(0x10) +#define UFS_PHY_PWM_G4_CLK_DIVIDER PHY_OFF(0x14) +#define UFS_PHY_CORECLK_PWM_G1_CLK_DIVIDER PHY_OFF(0x34) +#define UFS_PHY_CORECLK_PWM_G2_CLK_DIVIDER PHY_OFF(0x38) +#define 
UFS_PHY_CORECLK_PWM_G3_CLK_DIVIDER PHY_OFF(0x3C) +#define UFS_PHY_CORECLK_PWM_G4_CLK_DIVIDER PHY_OFF(0x40) +#define UFS_PHY_OMC_STATUS_RDVAL PHY_OFF(0x68) +#define UFS_PHY_LINE_RESET_TIME PHY_OFF(0x28) +#define UFS_PHY_LINE_RESET_GRANULARITY PHY_OFF(0x2C) +#define UFS_PHY_TSYNC_RSYNC_CNTL PHY_OFF(0x48) +#define UFS_PHY_PLL_CNTL PHY_OFF(0x50) +#define UFS_PHY_TX_LARGE_AMP_DRV_LVL PHY_OFF(0x54) +#define UFS_PHY_TX_SMALL_AMP_DRV_LVL PHY_OFF(0x5C) +#define UFS_PHY_TX_LARGE_AMP_POST_EMP_LVL PHY_OFF(0x58) +#define UFS_PHY_TX_SMALL_AMP_POST_EMP_LVL PHY_OFF(0x60) +#define UFS_PHY_CFG_CHANGE_CNT_VAL PHY_OFF(0x64) +#define UFS_PHY_RX_SYNC_WAIT_TIME PHY_OFF(0x6C) +#define UFS_PHY_TX_MIN_SLEEP_NOCONFIG_TIME_CAPABILITY PHY_OFF(0xB4) +#define UFS_PHY_RX_MIN_SLEEP_NOCONFIG_TIME_CAPABILITY PHY_OFF(0xE0) +#define UFS_PHY_TX_MIN_STALL_NOCONFIG_TIME_CAPABILITY PHY_OFF(0xB8) +#define UFS_PHY_RX_MIN_STALL_NOCONFIG_TIME_CAPABILITY PHY_OFF(0xE4) +#define UFS_PHY_TX_MIN_SAVE_CONFIG_TIME_CAPABILITY PHY_OFF(0xBC) +#define UFS_PHY_RX_MIN_SAVE_CONFIG_TIME_CAPABILITY PHY_OFF(0xE8) +#define UFS_PHY_RX_PWM_BURST_CLOSURE_LENGTH_CAPABILITY PHY_OFF(0xFC) +#define UFS_PHY_RX_MIN_ACTIVATETIME_CAPABILITY PHY_OFF(0x100) +#define UFS_PHY_RX_SIGDET_CTRL3 PHY_OFF(0x14c) +#define UFS_PHY_RMMI_ATTR_CTRL PHY_OFF(0x160) +#define UFS_PHY_RMMI_RX_CFGUPDT_L1 (1 << 7) +#define UFS_PHY_RMMI_TX_CFGUPDT_L1 (1 << 6) +#define UFS_PHY_RMMI_CFGWR_L1 (1 << 5) +#define UFS_PHY_RMMI_CFGRD_L1 (1 << 4) +#define UFS_PHY_RMMI_RX_CFGUPDT_L0 (1 << 3) +#define UFS_PHY_RMMI_TX_CFGUPDT_L0 (1 << 2) +#define UFS_PHY_RMMI_CFGWR_L0 (1 << 1) +#define UFS_PHY_RMMI_CFGRD_L0 (1 << 0) +#define UFS_PHY_RMMI_ATTRID PHY_OFF(0x164) +#define UFS_PHY_RMMI_ATTRWRVAL PHY_OFF(0x168) +#define UFS_PHY_RMMI_ATTRRDVAL_L0_STATUS PHY_OFF(0x16C) +#define UFS_PHY_RMMI_ATTRRDVAL_L1_STATUS PHY_OFF(0x170) +#define UFS_PHY_PCS_READY_STATUS PHY_OFF(0x174) + +#define UFS_PHY_TX_LANE_ENABLE_MASK 0x3 + +/* + * This structure represents the 20nm specific phy. + * common_cfg MUST remain the first field in this structure + * in case extra fields are added. This way, when calling + * get_ufs_qcom_phy() of generic phy, we can extract the + * common phy structure (struct ufs_qcom_phy) out of it + * regardless of the relevant specific phy. 
+ */ +struct ufs_qcom_phy_qmp_20nm { + struct ufs_qcom_phy common_cfg; +}; + +static struct ufs_qcom_phy_calibration phy_cal_table_rate_A_1_2_0[] = { + UFS_QCOM_PHY_CAL_ENTRY(UFS_PHY_POWER_DOWN_CONTROL, 0x01), + UFS_QCOM_PHY_CAL_ENTRY(UFS_PHY_RX_SIGDET_CTRL3, 0x0D), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLL_VCOTAIL_EN, 0xe1), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLL_CRCTRL, 0xcc), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_SYSCLK_EN_SEL_TXBAND, 0x08), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLL_CLKEPDIV, 0x03), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLL_RXTXEPCLK_EN, 0x10), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_DEC_START1, 0x82), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_DEC_START2, 0x03), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_DIV_FRAC_START1, 0x80), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_DIV_FRAC_START2, 0x80), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_DIV_FRAC_START3, 0x40), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLLLOCK_CMP1, 0xff), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLLLOCK_CMP2, 0x19), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLLLOCK_CMP3, 0x00), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLLLOCK_CMP_EN, 0x03), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_RESETSM_CNTRL, 0x90), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_RESETSM_CNTRL2, 0x03), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_CDR_CONTROL1(0), 0xf2), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_CDR_CONTROL_HALF(0), 0x0c), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_CDR_CONTROL_QUARTER(0), 0x12), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_CDR_CONTROL1(1), 0xf2), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_CDR_CONTROL_HALF(1), 0x0c), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_CDR_CONTROL_QUARTER(1), 0x12), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_RX_EQ_GAIN1_LSB(0), 0xff), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_RX_EQ_GAIN1_MSB(0), 0xff), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_RX_EQ_GAIN2_LSB(0), 0xff), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_RX_EQ_GAIN2_MSB(0), 0x00), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_RX_EQ_GAIN1_LSB(1), 0xff), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_RX_EQ_GAIN1_MSB(1), 0xff), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_RX_EQ_GAIN2_LSB(1), 0xff), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_RX_EQ_GAIN2_MSB(1), 0x00), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLL_CP_SETI, 0x3f), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLL_IP_SETP, 0x1b), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLL_CP_SETP, 0x0f), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLL_IP_SETI, 0x01), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_TX_EMP_POST1_LVL(0), 0x2F), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_TX_DRV_LVL(0), 0x20), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_TX_EMP_POST1_LVL(1), 0x2F), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_TX_DRV_LVL(1), 0x20), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_TX_LANE_MODE(0), 0x68), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_TX_LANE_MODE(1), 0x68), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_RX_EQU_ADAPTOR_CNTRL2(1), 0xdc), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_RX_EQU_ADAPTOR_CNTRL2(0), 0xdc), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_BIAS_EN_CLKBUFLR_EN, 0x3), +}; + +static struct ufs_qcom_phy_calibration phy_cal_table_rate_A_1_3_0[] = { + UFS_QCOM_PHY_CAL_ENTRY(UFS_PHY_POWER_DOWN_CONTROL, 0x01), + UFS_QCOM_PHY_CAL_ENTRY(UFS_PHY_RX_SIGDET_CTRL3, 0x0D), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLL_VCOTAIL_EN, 0xe1), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLL_CRCTRL, 0xcc), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_SYSCLK_EN_SEL_TXBAND, 0x08), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLL_CLKEPDIV, 0x03), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLL_RXTXEPCLK_EN, 0x10), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_DEC_START1, 0x82), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_DEC_START2, 0x03), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_DIV_FRAC_START1, 0x80), + 
UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_DIV_FRAC_START2, 0x80), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_DIV_FRAC_START3, 0x40), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLLLOCK_CMP1, 0xff), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLLLOCK_CMP2, 0x19), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLLLOCK_CMP3, 0x00), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLLLOCK_CMP_EN, 0x03), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_RESETSM_CNTRL, 0x90), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_RESETSM_CNTRL2, 0x03), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_CDR_CONTROL1(0), 0xf2), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_CDR_CONTROL_HALF(0), 0x0c), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_CDR_CONTROL_QUARTER(0), 0x12), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_CDR_CONTROL1(1), 0xf2), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_CDR_CONTROL_HALF(1), 0x0c), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_CDR_CONTROL_QUARTER(1), 0x12), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_RX_EQ_GAIN1_LSB(0), 0xff), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_RX_EQ_GAIN1_MSB(0), 0xff), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_RX_EQ_GAIN2_LSB(0), 0xff), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_RX_EQ_GAIN2_MSB(0), 0x00), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_RX_EQ_GAIN1_LSB(1), 0xff), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_RX_EQ_GAIN1_MSB(1), 0xff), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_RX_EQ_GAIN2_LSB(1), 0xff), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_RX_EQ_GAIN2_MSB(1), 0x00), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLL_CP_SETI, 0x2b), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLL_IP_SETP, 0x38), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLL_CP_SETP, 0x3c), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_RES_CODE_UP_OFFSET, 0x02), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_RES_CODE_DN_OFFSET, 0x02), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLL_IP_SETI, 0x01), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLL_CNTRL, 0x40), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_TX_LANE_MODE(0), 0x68), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_TX_LANE_MODE(1), 0x68), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_RX_EQU_ADAPTOR_CNTRL2(1), 0xdc), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_RX_EQU_ADAPTOR_CNTRL2(0), 0xdc), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_BIAS_EN_CLKBUFLR_EN, 0x3), +}; + +static struct ufs_qcom_phy_calibration phy_cal_table_rate_B[] = { + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_DEC_START1, 0x98), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLLLOCK_CMP1, 0x65), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLLLOCK_CMP2, 0x1e), +}; + +#endif diff --git a/include/linux/phy/phy-qcom-ufs.h b/include/linux/phy/phy-qcom-ufs.h new file mode 100644 index 000000000000..9d18e9f948e9 --- /dev/null +++ b/include/linux/phy/phy-qcom-ufs.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2013-2015, Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#ifndef PHY_QCOM_UFS_H_ +#define PHY_QCOM_UFS_H_ + +#include "phy.h" + +/** + * ufs_qcom_phy_enable_ref_clk() - Enable the phy + * ref clock. + * @phy: reference to a generic phy + * + * returns 0 for success, and non-zero for error. + */ +int ufs_qcom_phy_enable_ref_clk(struct phy *phy); + +/** + * ufs_qcom_phy_disable_ref_clk() - Disable the phy + * ref clock. + * @phy: reference to a generic phy. 
+ */ +void ufs_qcom_phy_disable_ref_clk(struct phy *phy); + +/** + * ufs_qcom_phy_enable_dev_ref_clk() - Enable the device + * ref clock. + * @phy: reference to a generic phy. + */ +void ufs_qcom_phy_enable_dev_ref_clk(struct phy *phy); + +/** + * ufs_qcom_phy_disable_dev_ref_clk() - Disable the device + * ref clock. + * @phy: reference to a generic phy. + */ +void ufs_qcom_phy_disable_dev_ref_clk(struct phy *phy); + +int ufs_qcom_phy_enable_iface_clk(struct phy *phy); +void ufs_qcom_phy_disable_iface_clk(struct phy *phy); +int ufs_qcom_phy_start_serdes(struct phy *phy); +int ufs_qcom_phy_set_tx_lane_enable(struct phy *phy, u32 tx_lanes); +int ufs_qcom_phy_calibrate_phy(struct phy *phy, bool is_rate_B); +int ufs_qcom_phy_is_pcs_ready(struct phy *phy); +void ufs_qcom_phy_save_controller_version(struct phy *phy, + u8 major, u16 minor, u16 step); + +#endif /* PHY_QCOM_UFS_H_ */ -- cgit v1.2.3 From 3c33f5b99d688deafd21d4a770303691c7c3a320 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 20 Jan 2015 09:26:19 -0600 Subject: livepatch: support for repatching a function Add support for patching a function multiple times. If multiple patches affect a function, the function in the most recently enabled patch "wins". This enables a cumulative patch upgrade path, where each patch is a superset of previous patches. This requires restructuring the data a little bit. With the current design, where each klp_func struct has its own ftrace_ops, we'd have to unregister the old ops and then register the new ops, because FTRACE_OPS_FL_IPMODIFY prevents us from having two ops registered for the same function at the same time. That would leave a regression window where the function isn't patched at all (not good for a patch upgrade path). This patch replaces the per-klp_func ftrace_ops with a global klp_ops list, with one ftrace_ops per original function. A single ftrace_ops is shared between all klp_funcs which have the same old_addr. This allows the switch between function versions to happen instantaneously by updating the klp_ops struct's func_stack list. The winner is the klp_func at the top of the func_stack (front of the list). 
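For intuition (an editor's toy model with invented names, not the kernel code; the real mechanism is the list_add_rcu()/list_first_or_null_rcu() handling of ops->func_stack shown below): enabling a patch pushes a new function version onto a per-old_addr stack, and the ftrace handler always redirects to whatever is on top, so switching versions is a single list update with no unpatched window.

struct func_version {
	struct func_version *older;	/* rest of the stack */
	void *new_func;			/* replacement code for this version */
};

/* enabling a patch pushes a version; the top of the stack wins */
static struct func_version *push_version(struct func_version *top,
					 struct func_version *v)
{
	v->older = top;
	return v;	/* new top of the stack */
}

/* the handler resolves to the top entry, or the original function */
static void *resolve(struct func_version *top, void *orig)
{
	return top ? top->new_func : orig;	/* empty stack: unpatched */
}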
[ jkosina@suse.cz: turn WARN_ON() into WARN_ON_ONCE() in ftrace handler to avoid storm in pathological cases ] Signed-off-by: Josh Poimboeuf Reviewed-by: Jiri Slaby Signed-off-by: Jiri Kosina --- include/linux/livepatch.h | 4 +- kernel/livepatch/core.c | 170 ++++++++++++++++++++++++++++++++-------------- 2 files changed, 121 insertions(+), 53 deletions(-) (limited to 'include/linux') diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h index 950bc615842f..f14c6fb262b4 100644 --- a/include/linux/livepatch.h +++ b/include/linux/livepatch.h @@ -40,8 +40,8 @@ enum klp_state { * @old_addr: a hint conveying at what address the old function * can be found (optional, vmlinux patches only) * @kobj: kobject for sysfs resources - * @fops: ftrace operations structure * @state: tracks function-level patch application state + * @stack_node: list node for klp_ops func_stack list */ struct klp_func { /* external */ @@ -59,8 +59,8 @@ struct klp_func { /* internal */ struct kobject kobj; - struct ftrace_ops *fops; enum klp_state state; + struct list_head stack_node; }; /** diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 2401e7f955d3..bc05d390ce85 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -29,17 +29,53 @@ #include #include -/* - * The klp_mutex protects the klp_patches list and state transitions of any - * structure reachable from the patches list. References to any structure must - * be obtained under mutex protection. +/** + * struct klp_ops - structure for tracking registered ftrace ops structs + * + * A single ftrace_ops is shared between all enabled replacement functions + * (klp_func structs) which have the same old_addr. This allows the switch + * between function versions to happen instantaneously by updating the klp_ops + * struct's func_stack list. The winner is the klp_func at the top of the + * func_stack (front of the list). + * + * @node: node for the global klp_ops list + * @func_stack: list head for the stack of klp_func's (active func is on top) + * @fops: registered ftrace ops struct */ +struct klp_ops { + struct list_head node; + struct list_head func_stack; + struct ftrace_ops fops; +}; +/* + * The klp_mutex protects the global lists and state transitions of any + * structure reachable from them. References to any structure must be obtained + * under mutex protection (except in klp_ftrace_handler(), which uses RCU to + * ensure it gets consistent data). 
+ */ static DEFINE_MUTEX(klp_mutex); + static LIST_HEAD(klp_patches); +static LIST_HEAD(klp_ops); static struct kobject *klp_root_kobj; +static struct klp_ops *klp_find_ops(unsigned long old_addr) +{ + struct klp_ops *ops; + struct klp_func *func; + + list_for_each_entry(ops, &klp_ops, node) { + func = list_first_entry(&ops->func_stack, struct klp_func, + stack_node); + if (func->old_addr == old_addr) + return ops; + } + + return NULL; +} + static bool klp_is_module(struct klp_object *obj) { return obj->name; @@ -267,16 +303,28 @@ static int klp_write_object_relocations(struct module *pmod, static void notrace klp_ftrace_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, + struct ftrace_ops *fops, struct pt_regs *regs) { - struct klp_func *func = ops->private; + struct klp_ops *ops; + struct klp_func *func; + + ops = container_of(fops, struct klp_ops, fops); + + rcu_read_lock(); + func = list_first_or_null_rcu(&ops->func_stack, struct klp_func, + stack_node); + rcu_read_unlock(); + + if (WARN_ON_ONCE(!func)) + return; klp_arch_set_pc(regs, (unsigned long)func->new_func); } static int klp_disable_func(struct klp_func *func) { + struct klp_ops *ops; int ret; if (WARN_ON(func->state != KLP_ENABLED)) @@ -285,16 +333,28 @@ static int klp_disable_func(struct klp_func *func) if (WARN_ON(!func->old_addr)) return -EINVAL; - ret = unregister_ftrace_function(func->fops); - if (ret) { - pr_err("failed to unregister ftrace handler for function '%s' (%d)\n", - func->old_name, ret); - return ret; - } + ops = klp_find_ops(func->old_addr); + if (WARN_ON(!ops)) + return -EINVAL; - ret = ftrace_set_filter_ip(func->fops, func->old_addr, 1, 0); - if (ret) - pr_warn("function unregister succeeded but failed to clear the filter\n"); + if (list_is_singular(&ops->func_stack)) { + ret = unregister_ftrace_function(&ops->fops); + if (ret) { + pr_err("failed to unregister ftrace handler for function '%s' (%d)\n", + func->old_name, ret); + return ret; + } + + ret = ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0); + if (ret) + pr_warn("function unregister succeeded but failed to clear the filter\n"); + + list_del_rcu(&func->stack_node); + list_del(&ops->node); + kfree(ops); + } else { + list_del_rcu(&func->stack_node); + } func->state = KLP_DISABLED; @@ -303,6 +363,7 @@ static int klp_disable_func(struct klp_func *func) static int klp_enable_func(struct klp_func *func) { + struct klp_ops *ops; int ret; if (WARN_ON(!func->old_addr)) @@ -311,22 +372,50 @@ static int klp_enable_func(struct klp_func *func) if (WARN_ON(func->state != KLP_DISABLED)) return -EINVAL; - ret = ftrace_set_filter_ip(func->fops, func->old_addr, 0, 0); - if (ret) { - pr_err("failed to set ftrace filter for function '%s' (%d)\n", - func->old_name, ret); - return ret; - } + ops = klp_find_ops(func->old_addr); + if (!ops) { + ops = kzalloc(sizeof(*ops), GFP_KERNEL); + if (!ops) + return -ENOMEM; + + ops->fops.func = klp_ftrace_handler; + ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS | + FTRACE_OPS_FL_DYNAMIC | + FTRACE_OPS_FL_IPMODIFY; + + list_add(&ops->node, &klp_ops); + + INIT_LIST_HEAD(&ops->func_stack); + list_add_rcu(&func->stack_node, &ops->func_stack); + + ret = ftrace_set_filter_ip(&ops->fops, func->old_addr, 0, 0); + if (ret) { + pr_err("failed to set ftrace filter for function '%s' (%d)\n", + func->old_name, ret); + goto err; + } + + ret = register_ftrace_function(&ops->fops); + if (ret) { + pr_err("failed to register ftrace handler for function '%s' (%d)\n", + func->old_name, ret); + 
ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0); + goto err; + } + - ret = register_ftrace_function(func->fops); - if (ret) { - pr_err("failed to register ftrace handler for function '%s' (%d)\n", - func->old_name, ret); - ftrace_set_filter_ip(func->fops, func->old_addr, 1, 0); } else { - func->state = KLP_ENABLED; + list_add_rcu(&func->stack_node, &ops->func_stack); } + func->state = KLP_ENABLED; + + return ret; + +err: + list_del_rcu(&func->stack_node); + list_del(&ops->node); + kfree(ops); return ret; } @@ -582,10 +671,6 @@ static struct kobj_type klp_ktype_patch = { static void klp_kobj_release_func(struct kobject *kobj) { - struct klp_func *func; - - func = container_of(kobj, struct klp_func, kobj); - kfree(func->fops); } static struct kobj_type klp_ktype_func = { @@ -642,28 +727,11 @@ static void klp_free_patch(struct klp_patch *patch) static int klp_init_func(struct klp_object *obj, struct klp_func *func) { - struct ftrace_ops *ops; - int ret; - - ops = kzalloc(sizeof(*ops), GFP_KERNEL); - if (!ops) - return -ENOMEM; - - ops->private = func; - ops->func = klp_ftrace_handler; - ops->flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_DYNAMIC | - FTRACE_OPS_FL_IPMODIFY; - func->fops = ops; + INIT_LIST_HEAD(&func->stack_node); func->state = KLP_DISABLED; - ret = kobject_init_and_add(&func->kobj, &klp_ktype_func, - obj->kobj, func->old_name); - if (ret) { - kfree(func->fops); - return ret; - } - - return 0; + return kobject_init_and_add(&func->kobj, &klp_ktype_func, + obj->kobj, func->old_name); } /* parts of the initialization that is done only when the object is loaded */ -- cgit v1.2.3 From 97b713ba3ebaa6c8d84c2c720f5468a7c6a6eb4e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Jan 2015 10:42:31 +0100 Subject: fs: kill BDI_CAP_SWAP_BACKED This bdi flag isn't too useful - we can determine that a vma is backed by either swap or shmem trivially in the caller. This also allows removing the backing_dev_info instaces for swap and shmem in favor of noop_backing_dev_info. Signed-off-by: Christoph Hellwig Reviewed-by: Tejun Heo Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 13 ------------- mm/madvise.c | 17 ++++++++++------- mm/shmem.c | 25 +++++++------------------ mm/swap.c | 2 -- mm/swap_state.c | 7 +------ 5 files changed, 18 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 5da6012b7a14..e936cea856dc 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -238,8 +238,6 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); * BDI_CAP_WRITE_MAP: Can be mapped for writing * BDI_CAP_EXEC_MAP: Can be mapped for execution * - * BDI_CAP_SWAP_BACKED: Count shmem/tmpfs objects as swap-backed. - * * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold. 
*/ #define BDI_CAP_NO_ACCT_DIRTY 0x00000001 @@ -250,7 +248,6 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); #define BDI_CAP_WRITE_MAP 0x00000020 #define BDI_CAP_EXEC_MAP 0x00000040 #define BDI_CAP_NO_ACCT_WB 0x00000080 -#define BDI_CAP_SWAP_BACKED 0x00000100 #define BDI_CAP_STABLE_WRITES 0x00000200 #define BDI_CAP_STRICTLIMIT 0x00000400 @@ -329,11 +326,6 @@ static inline bool bdi_cap_account_writeback(struct backing_dev_info *bdi) BDI_CAP_NO_WRITEBACK)); } -static inline bool bdi_cap_swap_backed(struct backing_dev_info *bdi) -{ - return bdi->capabilities & BDI_CAP_SWAP_BACKED; -} - static inline bool mapping_cap_writeback_dirty(struct address_space *mapping) { return bdi_cap_writeback_dirty(mapping->backing_dev_info); @@ -344,11 +336,6 @@ static inline bool mapping_cap_account_dirty(struct address_space *mapping) return bdi_cap_account_dirty(mapping->backing_dev_info); } -static inline bool mapping_cap_swap_backed(struct address_space *mapping) -{ - return bdi_cap_swap_backed(mapping->backing_dev_info); -} - static inline int bdi_sched_wait(void *word) { schedule(); diff --git a/mm/madvise.c b/mm/madvise.c index a271adc93289..1383a8916bc3 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -222,19 +222,22 @@ static long madvise_willneed(struct vm_area_struct *vma, struct file *file = vma->vm_file; #ifdef CONFIG_SWAP - if (!file || mapping_cap_swap_backed(file->f_mapping)) { + if (!file) { *prev = vma; - if (!file) - force_swapin_readahead(vma, start, end); - else - force_shm_swapin_readahead(vma, start, end, - file->f_mapping); + force_swapin_readahead(vma, start, end); return 0; } -#endif + if (shmem_mapping(file->f_mapping)) { + *prev = vma; + force_shm_swapin_readahead(vma, start, end, + file->f_mapping); + return 0; + } +#else if (!file) return -EBADF; +#endif if (file->f_mapping->a_ops->get_xip_mem) { /* no bad return value, but ignore advice */ diff --git a/mm/shmem.c b/mm/shmem.c index 73ba1df7c8ba..1b77eaf589fd 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -191,11 +191,6 @@ static const struct inode_operations shmem_dir_inode_operations; static const struct inode_operations shmem_special_inode_operations; static const struct vm_operations_struct shmem_vm_ops; -static struct backing_dev_info shmem_backing_dev_info __read_mostly = { - .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, -}; - static LIST_HEAD(shmem_swaplist); static DEFINE_MUTEX(shmem_swaplist_mutex); @@ -765,11 +760,11 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) goto redirty; /* - * shmem_backing_dev_info's capabilities prevent regular writeback or - * sync from ever calling shmem_writepage; but a stacking filesystem - * might use ->writepage of its underlying filesystem, in which case - * tmpfs should write out to swap only in response to memory pressure, - * and not for the writeback threads or sync. + * Our capabilities prevent regular writeback or sync from ever calling + * shmem_writepage; but a stacking filesystem might use ->writepage of + * its underlying filesystem, in which case tmpfs should write out to + * swap only in response to memory pressure, and not for the writeback + * threads or sync. */ if (!wbc->for_reclaim) { WARN_ON_ONCE(1); /* Still happens? Tell us about it! 
*/ @@ -1415,7 +1410,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode inode->i_ino = get_next_ino(); inode_init_owner(inode, dir, mode); inode->i_blocks = 0; - inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; + inode->i_mapping->backing_dev_info = &noop_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_generation = get_seconds(); info = SHMEM_I(inode); @@ -1461,7 +1456,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode bool shmem_mapping(struct address_space *mapping) { - return mapping->backing_dev_info == &shmem_backing_dev_info; + return mapping->host->i_sb->s_op == &shmem_ops; } #ifdef CONFIG_TMPFS @@ -3226,10 +3221,6 @@ int __init shmem_init(void) if (shmem_inode_cachep) return 0; - error = bdi_init(&shmem_backing_dev_info); - if (error) - goto out4; - error = shmem_init_inodecache(); if (error) goto out3; @@ -3253,8 +3244,6 @@ out1: out2: shmem_destroy_inodecache(); out3: - bdi_destroy(&shmem_backing_dev_info); -out4: shm_mnt = ERR_PTR(error); return error; } diff --git a/mm/swap.c b/mm/swap.c index 8a12b33936b4..4e0109a2f37b 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1138,8 +1138,6 @@ void __init swap_setup(void) #ifdef CONFIG_SWAP int i; - if (bdi_init(swapper_spaces[0].backing_dev_info)) - panic("Failed to init swap bdi"); for (i = 0; i < MAX_SWAPFILES; i++) { spin_lock_init(&swapper_spaces[i].tree_lock); INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear); diff --git a/mm/swap_state.c b/mm/swap_state.c index 9711342987a0..1c137b69ecde 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -32,17 +32,12 @@ static const struct address_space_operations swap_aops = { #endif }; -static struct backing_dev_info swap_backing_dev_info = { - .name = "swap", - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, -}; - struct address_space swapper_spaces[MAX_SWAPFILES] = { [0 ... MAX_SWAPFILES - 1] = { .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), .i_mmap_writable = ATOMIC_INIT(0), .a_ops = &swap_aops, - .backing_dev_info = &swap_backing_dev_info, + .backing_dev_info = &noop_backing_dev_info, } }; -- cgit v1.2.3 From b4caecd48005fbed3949dde6c1cb233142fd69e9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Jan 2015 10:42:32 +0100 Subject: fs: introduce f_op->mmap_capabilities for nommu mmap support Since "BDI: Provide backing device capability information [try #3]" the backing_dev_info structure also provides flags for the kind of mmap operation available in a nommu environment, which is entirely unrelated to its original purpose. Introduce a new nommu-only file operation to provide this information to the nommu mmap code instead. Splitting this from the backing_dev_info structure allows removing many backing_dev_info instances that aren't otherwise needed, and entirely gets rid of the concept of providing a backing_dev_info for a character device. It also removes the need for the mtd_inodefs filesystem.
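As a rough sketch of how a per-file capability mask of this kind is consumed, the userspace model below mirrors the shape of the checks done on the nommu mmap path (the flag values and the check_request() helper are illustrative assumptions, not the kernel API): shared mappings need direct-map capability plus the matching access right, while private mappings only need the ability to take a copy.

/*
 * Rough userspace sketch of capability-driven nommu mmap checks --
 * not the kernel implementation; flags and helper are illustrative.
 */
#include <stdio.h>

#define MAP_COPY   0x1  /* a private copy can be made (like NOMMU_MAP_COPY) */
#define MAP_DIRECT 0x2  /* backing store can be mapped directly (MAP_SHARED) */
#define MAP_WRITE  0x4  /* a direct mapping may be written */

static int check_request(unsigned caps, int shared, int want_write)
{
    if (shared) {
        if (!(caps & MAP_DIRECT))
            return -1;  /* sharing impossible on this backing store */
        if (want_write && !(caps & MAP_WRITE))
            return -1;  /* read-only medium: no writable shared mapping */
    } else {
        if (!(caps & MAP_COPY))
            return -1;  /* can't even read a private copy */
    }
    return 0;
}

int main(void)
{
    unsigned romfs_caps = MAP_COPY | MAP_DIRECT;    /* read-only medium */

    printf("shared ro: %d\n", check_request(romfs_caps, 1, 0)); /* 0  */
    printf("shared rw: %d\n", check_request(romfs_caps, 1, 1)); /* -1 */
    printf("private:   %d\n", check_request(romfs_caps, 0, 1)); /* 0: writes hit the copy */
    return 0;
}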
Signed-off-by: Christoph Hellwig Reviewed-by: Tejun Heo Acked-by: Brian Norris Signed-off-by: Jens Axboe --- Documentation/nommu-mmap.txt | 8 +-- block/blk-core.c | 2 +- drivers/char/mem.c | 64 ++++++++++---------- drivers/mtd/mtdchar.c | 72 ++++------------------ drivers/mtd/mtdconcat.c | 10 ---- drivers/mtd/mtdcore.c | 80 +++++++------------------ drivers/mtd/mtdpart.c | 1 - drivers/staging/lustre/lustre/llite/llite_lib.c | 2 +- fs/9p/v9fs.c | 2 +- fs/afs/volume.c | 2 +- fs/aio.c | 14 +---- fs/btrfs/disk-io.c | 3 +- fs/char_dev.c | 24 -------- fs/cifs/connect.c | 2 +- fs/coda/inode.c | 2 +- fs/configfs/configfs_internal.h | 2 - fs/configfs/inode.c | 18 +----- fs/configfs/mount.c | 11 +--- fs/ecryptfs/main.c | 2 +- fs/exofs/super.c | 2 +- fs/ncpfs/inode.c | 2 +- fs/ramfs/file-nommu.c | 7 +++ fs/ramfs/inode.c | 22 +------ fs/romfs/mmap-nommu.c | 10 ++++ fs/ubifs/super.c | 2 +- include/linux/backing-dev.h | 33 ++-------- include/linux/cdev.h | 2 - include/linux/fs.h | 23 +++++++ include/linux/mtd/mtd.h | 2 + mm/backing-dev.c | 7 +-- mm/nommu.c | 69 ++++++++++----------- security/security.c | 13 ++-- 32 files changed, 169 insertions(+), 346 deletions(-) (limited to 'include/linux') diff --git a/Documentation/nommu-mmap.txt b/Documentation/nommu-mmap.txt index 8e1ddec2c78a..ae57b9ea0d41 100644 --- a/Documentation/nommu-mmap.txt +++ b/Documentation/nommu-mmap.txt @@ -43,12 +43,12 @@ and it's also much more restricted in the latter case: even if this was created by another process. - If possible, the file mapping will be directly on the backing device - if the backing device has the BDI_CAP_MAP_DIRECT capability and + if the backing device has the NOMMU_MAP_DIRECT capability and appropriate mapping protection capabilities. Ramfs, romfs, cramfs and mtd might all permit this. - If the backing device device can't or won't permit direct sharing, - but does have the BDI_CAP_MAP_COPY capability, then a copy of the + but does have the NOMMU_MAP_COPY capability, then a copy of the appropriate bit of the file will be read into a contiguous bit of memory and any extraneous space beyond the EOF will be cleared @@ -220,7 +220,7 @@ directly (can't be copied). The file->f_op->mmap() operation will be called to actually inaugurate the mapping. It can be rejected at that point. Returning the ENOSYS error will -cause the mapping to be copied instead if BDI_CAP_MAP_COPY is specified. +cause the mapping to be copied instead if NOMMU_MAP_COPY is specified. The vm_ops->close() routine will be invoked when the last mapping on a chardev is removed. An existing mapping will be shared, partially or not, if possible @@ -232,7 +232,7 @@ want to handle it, despite the fact it's got an operation. For instance, it might try directing the call to a secondary driver which turns out not to implement it. Such is the case for the framebuffer driver which attempts to direct the call to the device-specific driver. Under such circumstances, the -mapping request will be rejected if BDI_CAP_MAP_COPY is not specified, and a +mapping request will be rejected if NOMMU_MAP_COPY is not specified, and a copy mapped otherwise. 
IMPORTANT NOTE: diff --git a/block/blk-core.c b/block/blk-core.c index 3ad405571dcc..928aac29bccd 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -607,7 +607,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; q->backing_dev_info.state = 0; - q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY; + q->backing_dev_info.capabilities = 0; q->backing_dev_info.name = "block"; q->node = node_id; diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 4c58333b4257..9a6b63783a94 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -287,13 +287,24 @@ static unsigned long get_unmapped_area_mem(struct file *file, return pgoff << PAGE_SHIFT; } +/* permit direct mmap, for read, write or exec */ +static unsigned memory_mmap_capabilities(struct file *file) +{ + return NOMMU_MAP_DIRECT | + NOMMU_MAP_READ | NOMMU_MAP_WRITE | NOMMU_MAP_EXEC; +} + +static unsigned zero_mmap_capabilities(struct file *file) +{ + return NOMMU_MAP_COPY; +} + /* can't do an in-place private mapping if there's no MMU */ static inline int private_mapping_ok(struct vm_area_struct *vma) { return vma->vm_flags & VM_MAYSHARE; } #else -#define get_unmapped_area_mem NULL static inline int private_mapping_ok(struct vm_area_struct *vma) { @@ -721,7 +732,10 @@ static const struct file_operations mem_fops = { .write = write_mem, .mmap = mmap_mem, .open = open_mem, +#ifndef CONFIG_MMU .get_unmapped_area = get_unmapped_area_mem, + .mmap_capabilities = memory_mmap_capabilities, +#endif }; #ifdef CONFIG_DEVKMEM @@ -731,7 +745,10 @@ static const struct file_operations kmem_fops = { .write = write_kmem, .mmap = mmap_kmem, .open = open_kmem, +#ifndef CONFIG_MMU .get_unmapped_area = get_unmapped_area_mem, + .mmap_capabilities = memory_mmap_capabilities, +#endif }; #endif @@ -760,16 +777,9 @@ static const struct file_operations zero_fops = { .read_iter = read_iter_zero, .aio_write = aio_write_zero, .mmap = mmap_zero, -}; - -/* - * capabilities for /dev/zero - * - permits private mappings, "copies" are taken of the source of zeros - * - no writeback happens - */ -static struct backing_dev_info zero_bdi = { - .name = "char/mem", - .capabilities = BDI_CAP_MAP_COPY | BDI_CAP_NO_ACCT_AND_WRITEBACK, +#ifndef CONFIG_MMU + .mmap_capabilities = zero_mmap_capabilities, +#endif }; static const struct file_operations full_fops = { @@ -783,22 +793,22 @@ static const struct memdev { const char *name; umode_t mode; const struct file_operations *fops; - struct backing_dev_info *dev_info; + fmode_t fmode; } devlist[] = { - [1] = { "mem", 0, &mem_fops, &directly_mappable_cdev_bdi }, + [1] = { "mem", 0, &mem_fops, FMODE_UNSIGNED_OFFSET }, #ifdef CONFIG_DEVKMEM - [2] = { "kmem", 0, &kmem_fops, &directly_mappable_cdev_bdi }, + [2] = { "kmem", 0, &kmem_fops, FMODE_UNSIGNED_OFFSET }, #endif - [3] = { "null", 0666, &null_fops, NULL }, + [3] = { "null", 0666, &null_fops, 0 }, #ifdef CONFIG_DEVPORT - [4] = { "port", 0, &port_fops, NULL }, + [4] = { "port", 0, &port_fops, 0 }, #endif - [5] = { "zero", 0666, &zero_fops, &zero_bdi }, - [7] = { "full", 0666, &full_fops, NULL }, - [8] = { "random", 0666, &random_fops, NULL }, - [9] = { "urandom", 0666, &urandom_fops, NULL }, + [5] = { "zero", 0666, &zero_fops, 0 }, + [7] = { "full", 0666, &full_fops, 0 }, + [8] = { "random", 0666, &random_fops, 0 }, + [9] = { "urandom", 0666, &urandom_fops, 0 }, #ifdef CONFIG_PRINTK - [11] = { "kmsg", 0644, &kmsg_fops, NULL }, + [11] = { "kmsg", 0644, &kmsg_fops, 0 }, #endif }; @@ 
-816,12 +826,7 @@ static int memory_open(struct inode *inode, struct file *filp) return -ENXIO; filp->f_op = dev->fops; - if (dev->dev_info) - filp->f_mapping->backing_dev_info = dev->dev_info; - - /* Is /dev/mem or /dev/kmem ? */ - if (dev->dev_info == &directly_mappable_cdev_bdi) - filp->f_mode |= FMODE_UNSIGNED_OFFSET; + filp->f_mode |= dev->fmode; if (dev->fops->open) return dev->fops->open(inode, filp); @@ -846,11 +851,6 @@ static struct class *mem_class; static int __init chr_dev_init(void) { int minor; - int err; - - err = bdi_init(&zero_bdi); - if (err) - return err; if (register_chrdev(MEM_MAJOR, "mem", &memory_fops)) printk("unable to get major %d for memory devs\n", MEM_MAJOR); diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index 53563955931b..55fa27ecf4e1 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -49,7 +49,6 @@ static DEFINE_MUTEX(mtd_mutex); */ struct mtd_file_info { struct mtd_info *mtd; - struct inode *ino; enum mtd_file_modes mode; }; @@ -59,10 +58,6 @@ static loff_t mtdchar_lseek(struct file *file, loff_t offset, int orig) return fixed_size_llseek(file, offset, orig, mfi->mtd->size); } -static int count; -static struct vfsmount *mnt; -static struct file_system_type mtd_inodefs_type; - static int mtdchar_open(struct inode *inode, struct file *file) { int minor = iminor(inode); @@ -70,7 +65,6 @@ static int mtdchar_open(struct inode *inode, struct file *file) int ret = 0; struct mtd_info *mtd; struct mtd_file_info *mfi; - struct inode *mtd_ino; pr_debug("MTD_open\n"); @@ -78,10 +72,6 @@ static int mtdchar_open(struct inode *inode, struct file *file) if ((file->f_mode & FMODE_WRITE) && (minor & 1)) return -EACCES; - ret = simple_pin_fs(&mtd_inodefs_type, &mnt, &count); - if (ret) - return ret; - mutex_lock(&mtd_mutex); mtd = get_mtd_device(NULL, devnum); @@ -95,43 +85,26 @@ static int mtdchar_open(struct inode *inode, struct file *file) goto out1; } - mtd_ino = iget_locked(mnt->mnt_sb, devnum); - if (!mtd_ino) { - ret = -ENOMEM; - goto out1; - } - if (mtd_ino->i_state & I_NEW) { - mtd_ino->i_private = mtd; - mtd_ino->i_mode = S_IFCHR; - mtd_ino->i_data.backing_dev_info = mtd->backing_dev_info; - unlock_new_inode(mtd_ino); - } - file->f_mapping = mtd_ino->i_mapping; - /* You can't open it RW if it's not a writeable device */ if ((file->f_mode & FMODE_WRITE) && !(mtd->flags & MTD_WRITEABLE)) { ret = -EACCES; - goto out2; + goto out1; } mfi = kzalloc(sizeof(*mfi), GFP_KERNEL); if (!mfi) { ret = -ENOMEM; - goto out2; + goto out1; } - mfi->ino = mtd_ino; mfi->mtd = mtd; file->private_data = mfi; mutex_unlock(&mtd_mutex); return 0; -out2: - iput(mtd_ino); out1: put_mtd_device(mtd); out: mutex_unlock(&mtd_mutex); - simple_release_fs(&mnt, &count); return ret; } /* mtdchar_open */ @@ -148,12 +121,9 @@ static int mtdchar_close(struct inode *inode, struct file *file) if ((file->f_mode & FMODE_WRITE)) mtd_sync(mtd); - iput(mfi->ino); - put_mtd_device(mtd); file->private_data = NULL; kfree(mfi); - simple_release_fs(&mnt, &count); return 0; } /* mtdchar_close */ @@ -1117,6 +1087,13 @@ static unsigned long mtdchar_get_unmapped_area(struct file *file, ret = mtd_get_unmapped_area(mtd, len, offset, flags); return ret == -EOPNOTSUPP ? 
-ENODEV : ret; } + +static unsigned mtdchar_mmap_capabilities(struct file *file) +{ + struct mtd_file_info *mfi = file->private_data; + + return mtd_mmap_capabilities(mfi->mtd); +} #endif /* @@ -1160,27 +1137,10 @@ static const struct file_operations mtd_fops = { .mmap = mtdchar_mmap, #ifndef CONFIG_MMU .get_unmapped_area = mtdchar_get_unmapped_area, + .mmap_capabilities = mtdchar_mmap_capabilities, #endif }; -static const struct super_operations mtd_ops = { - .drop_inode = generic_delete_inode, - .statfs = simple_statfs, -}; - -static struct dentry *mtd_inodefs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) -{ - return mount_pseudo(fs_type, "mtd_inode:", &mtd_ops, NULL, MTD_INODE_FS_MAGIC); -} - -static struct file_system_type mtd_inodefs_type = { - .name = "mtd_inodefs", - .mount = mtd_inodefs_mount, - .kill_sb = kill_anon_super, -}; -MODULE_ALIAS_FS("mtd_inodefs"); - int __init init_mtdchar(void) { int ret; @@ -1193,23 +1153,11 @@ int __init init_mtdchar(void) return ret; } - ret = register_filesystem(&mtd_inodefs_type); - if (ret) { - pr_err("Can't register mtd_inodefs filesystem, error %d\n", - ret); - goto err_unregister_chdev; - } - - return ret; - -err_unregister_chdev: - __unregister_chrdev(MTD_CHAR_MAJOR, 0, 1 << MINORBITS, "mtd"); return ret; } void __exit cleanup_mtdchar(void) { - unregister_filesystem(&mtd_inodefs_type); __unregister_chrdev(MTD_CHAR_MAJOR, 0, 1 << MINORBITS, "mtd"); } diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c index b9000563b9f4..eacc3aac7327 100644 --- a/drivers/mtd/mtdconcat.c +++ b/drivers/mtd/mtdconcat.c @@ -732,8 +732,6 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[], /* subdevices to c concat->mtd.ecc_stats.badblocks = subdev[0]->ecc_stats.badblocks; - concat->mtd.backing_dev_info = subdev[0]->backing_dev_info; - concat->subdev[0] = subdev[0]; for (i = 1; i < num_devs; i++) { @@ -761,14 +759,6 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[], /* subdevices to c subdev[i]->flags & MTD_WRITEABLE; } - /* only permit direct mapping if the BDIs are all the same - * - copy-mapping is still permitted - */ - if (concat->mtd.backing_dev_info != - subdev[i]->backing_dev_info) - concat->mtd.backing_dev_info = - &default_backing_dev_info; - concat->mtd.size += subdev[i]->size; concat->mtd.ecc_stats.badblocks += subdev[i]->ecc_stats.badblocks; diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index 4c611871d7e6..ff38a1df22f7 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -43,33 +43,7 @@ #include "mtdcore.h" -/* - * backing device capabilities for non-mappable devices (such as NAND flash) - * - permits private mappings, copies are taken of the data - */ -static struct backing_dev_info mtd_bdi_unmappable = { - .capabilities = BDI_CAP_MAP_COPY, -}; - -/* - * backing device capabilities for R/O mappable devices (such as ROM) - * - permits private mappings, copies are taken of the data - * - permits non-writable shared mappings - */ -static struct backing_dev_info mtd_bdi_ro_mappable = { - .capabilities = (BDI_CAP_MAP_COPY | BDI_CAP_MAP_DIRECT | - BDI_CAP_EXEC_MAP | BDI_CAP_READ_MAP), -}; - -/* - * backing device capabilities for writable mappable devices (such as RAM) - * - permits private mappings, copies are taken of the data - * - permits non-writable shared mappings - */ -static struct backing_dev_info mtd_bdi_rw_mappable = { - .capabilities = (BDI_CAP_MAP_COPY | BDI_CAP_MAP_DIRECT | - BDI_CAP_EXEC_MAP | BDI_CAP_READ_MAP | - BDI_CAP_WRITE_MAP), 
+static struct backing_dev_info mtd_bdi = { }; static int mtd_cls_suspend(struct device *dev, pm_message_t state); @@ -365,6 +339,22 @@ static struct device_type mtd_devtype = { .release = mtd_release, }; +#ifndef CONFIG_MMU +unsigned mtd_mmap_capabilities(struct mtd_info *mtd) +{ + switch (mtd->type) { + case MTD_RAM: + return NOMMU_MAP_COPY | NOMMU_MAP_DIRECT | NOMMU_MAP_EXEC | + NOMMU_MAP_READ | NOMMU_MAP_WRITE; + case MTD_ROM: + return NOMMU_MAP_COPY | NOMMU_MAP_DIRECT | NOMMU_MAP_EXEC | + NOMMU_MAP_READ; + default: + return NOMMU_MAP_COPY; + } +} +#endif + /** * add_mtd_device - register an MTD device * @mtd: pointer to new MTD device info structure @@ -380,19 +370,7 @@ int add_mtd_device(struct mtd_info *mtd) struct mtd_notifier *not; int i, error; - if (!mtd->backing_dev_info) { - switch (mtd->type) { - case MTD_RAM: - mtd->backing_dev_info = &mtd_bdi_rw_mappable; - break; - case MTD_ROM: - mtd->backing_dev_info = &mtd_bdi_ro_mappable; - break; - default: - mtd->backing_dev_info = &mtd_bdi_unmappable; - break; - } - } + mtd->backing_dev_info = &mtd_bdi; BUG_ON(mtd->writesize == 0); mutex_lock(&mtd_table_mutex); @@ -1237,17 +1215,9 @@ static int __init init_mtd(void) if (ret) goto err_reg; - ret = mtd_bdi_init(&mtd_bdi_unmappable, "mtd-unmap"); - if (ret) - goto err_bdi1; - - ret = mtd_bdi_init(&mtd_bdi_ro_mappable, "mtd-romap"); - if (ret) - goto err_bdi2; - - ret = mtd_bdi_init(&mtd_bdi_rw_mappable, "mtd-rwmap"); + ret = mtd_bdi_init(&mtd_bdi, "mtd"); if (ret) - goto err_bdi3; + goto err_bdi; proc_mtd = proc_create("mtd", 0, NULL, &mtd_proc_ops); @@ -1260,11 +1230,7 @@ static int __init init_mtd(void) out_procfs: if (proc_mtd) remove_proc_entry("mtd", NULL); -err_bdi3: - bdi_destroy(&mtd_bdi_ro_mappable); -err_bdi2: - bdi_destroy(&mtd_bdi_unmappable); -err_bdi1: +err_bdi: class_unregister(&mtd_class); err_reg: pr_err("Error registering mtd class or bdi: %d\n", ret); @@ -1277,9 +1243,7 @@ static void __exit cleanup_mtd(void) if (proc_mtd) remove_proc_entry("mtd", NULL); class_unregister(&mtd_class); - bdi_destroy(&mtd_bdi_unmappable); - bdi_destroy(&mtd_bdi_ro_mappable); - bdi_destroy(&mtd_bdi_rw_mappable); + bdi_destroy(&mtd_bdi); } module_init(init_mtd); diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c index a3e3a7d074d5..e779de315ade 100644 --- a/drivers/mtd/mtdpart.c +++ b/drivers/mtd/mtdpart.c @@ -378,7 +378,6 @@ static struct mtd_part *allocate_partition(struct mtd_info *master, slave->mtd.name = name; slave->mtd.owner = master->owner; - slave->mtd.backing_dev_info = master->backing_dev_info; /* NOTE: we don't arrange MTDs as a tree; it'd be error-prone * to have the same data be in two different partitions. 
diff --git a/drivers/staging/lustre/lustre/llite/llite_lib.c b/drivers/staging/lustre/lustre/llite/llite_lib.c index a3367bfb1456..d5b149c5542d 100644 --- a/drivers/staging/lustre/lustre/llite/llite_lib.c +++ b/drivers/staging/lustre/lustre/llite/llite_lib.c @@ -987,7 +987,7 @@ int ll_fill_super(struct super_block *sb, struct vfsmount *mnt) if (err) goto out_free; lsi->lsi_flags |= LSI_BDI_INITIALIZED; - lsi->lsi_bdi.capabilities = BDI_CAP_MAP_COPY; + lsi->lsi_bdi.capabilities = 0; err = ll_bdi_register(&lsi->lsi_bdi); if (err) goto out_free; diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 6894b085f0ee..620d93489539 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -335,7 +335,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, } init_rwsem(&v9ses->rename_sem); - rc = bdi_setup_and_register(&v9ses->bdi, "9p", BDI_CAP_MAP_COPY); + rc = bdi_setup_and_register(&v9ses->bdi, "9p"); if (rc) { kfree(v9ses->aname); kfree(v9ses->uname); diff --git a/fs/afs/volume.c b/fs/afs/volume.c index 2b607257820c..d142a2449e65 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -106,7 +106,7 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params) volume->cell = params->cell; volume->vid = vlocation->vldb.vid[params->type]; - ret = bdi_setup_and_register(&volume->bdi, "afs", BDI_CAP_MAP_COPY); + ret = bdi_setup_and_register(&volume->bdi, "afs"); if (ret) goto error_bdi; diff --git a/fs/aio.c b/fs/aio.c index 1b7893ecc296..6f13d3fab07f 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -165,15 +165,6 @@ static struct vfsmount *aio_mnt; static const struct file_operations aio_ring_fops; static const struct address_space_operations aio_ctx_aops; -/* Backing dev info for aio fs. - * -no dirty page accounting or writeback happens - */ -static struct backing_dev_info aio_fs_backing_dev_info = { - .name = "aiofs", - .state = 0, - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_MAP_COPY, -}; - static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages) { struct qstr this = QSTR_INIT("[aio]", 5); @@ -185,7 +176,7 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages) inode->i_mapping->a_ops = &aio_ctx_aops; inode->i_mapping->private_data = ctx; - inode->i_mapping->backing_dev_info = &aio_fs_backing_dev_info; + inode->i_mapping->backing_dev_info = &noop_backing_dev_info; inode->i_size = PAGE_SIZE * nr_pages; path.dentry = d_alloc_pseudo(aio_mnt->mnt_sb, &this); @@ -230,9 +221,6 @@ static int __init aio_setup(void) if (IS_ERR(aio_mnt)) panic("Failed to create aio fs mount."); - if (bdi_init(&aio_fs_backing_dev_info)) - panic("Failed to init aio fs backing dev info."); - kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8c63419a7f70..afc4092989cd 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1715,8 +1715,7 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) { int err; - bdi->capabilities = BDI_CAP_MAP_COPY; - err = bdi_setup_and_register(bdi, "btrfs", BDI_CAP_MAP_COPY); + err = bdi_setup_and_register(bdi, "btrfs"); if (err) return err; diff --git a/fs/char_dev.c b/fs/char_dev.c index 67b2007f10fe..ea06a3d0364c 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -24,27 +24,6 @@ #include "internal.h" -/* - * capabilities for /dev/mem, /dev/kmem and similar directly mappable character - * devices - * - permits shared-mmap for read, write and/or exec - * - does not permit private 
mmap in NOMMU mode (can't do COW) - * - no readahead or I/O queue unplugging required - */ -struct backing_dev_info directly_mappable_cdev_bdi = { - .name = "char", - .capabilities = ( -#ifdef CONFIG_MMU - /* permit private copies of the data to be taken */ - BDI_CAP_MAP_COPY | -#endif - /* permit direct mmap, for read, write or exec */ - BDI_CAP_MAP_DIRECT | - BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP | - /* no writeback happens */ - BDI_CAP_NO_ACCT_AND_WRITEBACK), -}; - static struct kobj_map *cdev_map; static DEFINE_MUTEX(chrdevs_lock); @@ -575,8 +554,6 @@ static struct kobject *base_probe(dev_t dev, int *part, void *data) void __init chrdev_init(void) { cdev_map = kobj_map_init(base_probe, &chrdevs_lock); - if (bdi_init(&directly_mappable_cdev_bdi)) - panic("Failed to init directly mappable cdev bdi"); } @@ -590,4 +567,3 @@ EXPORT_SYMBOL(cdev_del); EXPORT_SYMBOL(cdev_add); EXPORT_SYMBOL(__register_chrdev); EXPORT_SYMBOL(__unregister_chrdev); -EXPORT_SYMBOL(directly_mappable_cdev_bdi); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 2a772da16b83..d3aa999ab785 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3446,7 +3446,7 @@ cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info) int referral_walks_count = 0; #endif - rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY); + rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs"); if (rc) return rc; diff --git a/fs/coda/inode.c b/fs/coda/inode.c index b945410bfcd5..82ec68b59208 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -183,7 +183,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) goto unlock_out; } - error = bdi_setup_and_register(&vc->bdi, "coda", BDI_CAP_MAP_COPY); + error = bdi_setup_and_register(&vc->bdi, "coda"); if (error) goto unlock_out; diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index bd4a3c167091..a315677e44d3 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h @@ -70,8 +70,6 @@ extern int configfs_is_root(struct config_item *item); extern struct inode * configfs_new_inode(umode_t mode, struct configfs_dirent *, struct super_block *); extern int configfs_create(struct dentry *, umode_t mode, int (*init)(struct inode *)); -extern int configfs_inode_init(void); -extern void configfs_inode_exit(void); extern int configfs_create_file(struct config_item *, const struct configfs_attribute *); extern int configfs_make_dirent(struct configfs_dirent *, diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 5946ad98053f..0ad6b4d6de00 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -50,12 +50,6 @@ static const struct address_space_operations configfs_aops = { .write_end = simple_write_end, }; -static struct backing_dev_info configfs_backing_dev_info = { - .name = "configfs", - .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, -}; - static const struct inode_operations configfs_inode_operations ={ .setattr = configfs_setattr, }; @@ -137,7 +131,7 @@ struct inode *configfs_new_inode(umode_t mode, struct configfs_dirent *sd, if (inode) { inode->i_ino = get_next_ino(); inode->i_mapping->a_ops = &configfs_aops; - inode->i_mapping->backing_dev_info = &configfs_backing_dev_info; + inode->i_mapping->backing_dev_info = &noop_backing_dev_info; inode->i_op = &configfs_inode_operations; if (sd->s_iattr) { @@ -283,13 +277,3 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name) } mutex_unlock(&dir->d_inode->i_mutex); } 
- -int __init configfs_inode_init(void) -{ - return bdi_init(&configfs_backing_dev_info); -} - -void configfs_inode_exit(void) -{ - bdi_destroy(&configfs_backing_dev_info); -} diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index f6c285833390..da94e41bdbf6 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -145,19 +145,13 @@ static int __init configfs_init(void) if (!config_kobj) goto out2; - err = configfs_inode_init(); - if (err) - goto out3; - err = register_filesystem(&configfs_fs_type); if (err) - goto out4; + goto out3; return 0; -out4: - pr_err("Unable to register filesystem!\n"); - configfs_inode_exit(); out3: + pr_err("Unable to register filesystem!\n"); kobject_put(config_kobj); out2: kmem_cache_destroy(configfs_dir_cachep); @@ -172,7 +166,6 @@ static void __exit configfs_exit(void) kobject_put(config_kobj); kmem_cache_destroy(configfs_dir_cachep); configfs_dir_cachep = NULL; - configfs_inode_exit(); } MODULE_AUTHOR("Oracle"); diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index d9eb84bda559..1895d60f4122 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -520,7 +520,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags goto out; } - rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs", BDI_CAP_MAP_COPY); + rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs"); if (rc) goto out1; diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 95965503afcb..fcc2e565f540 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -836,7 +836,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) goto free_sbi; } - ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY); + ret = bdi_setup_and_register(&sbi->bdi, "exofs"); if (ret) { EXOFS_DBGMSG("Failed to bdi_setup_and_register\n"); dput(sb->s_root); diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index e31e589369a4..a699a3fc62c0 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -560,7 +560,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) server = NCP_SBP(sb); memset(server, 0, sizeof(*server)); - error = bdi_setup_and_register(&server->bdi, "ncpfs", BDI_CAP_MAP_COPY); + error = bdi_setup_and_register(&server->bdi, "ncpfs"); if (error) goto out_fput; diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index bbafbde3471a..f6ab41b39612 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -34,7 +34,14 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, unsigned long flags); static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma); +static unsigned ramfs_mmap_capabilities(struct file *file) +{ + return NOMMU_MAP_DIRECT | NOMMU_MAP_COPY | NOMMU_MAP_READ | + NOMMU_MAP_WRITE | NOMMU_MAP_EXEC; +} + const struct file_operations ramfs_file_operations = { + .mmap_capabilities = ramfs_mmap_capabilities, .mmap = ramfs_nommu_mmap, .get_unmapped_area = ramfs_nommu_get_unmapped_area, .read = new_sync_read, diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index d365b1c4eb3c..ad4d712002f4 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -50,14 +50,6 @@ static const struct address_space_operations ramfs_aops = { .set_page_dirty = __set_page_dirty_no_writeback, }; -static struct backing_dev_info ramfs_backing_dev_info = { - .name = "ramfs", - .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | - BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY | - BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP, -}; - struct inode 
*ramfs_get_inode(struct super_block *sb, const struct inode *dir, umode_t mode, dev_t dev) { @@ -67,7 +59,7 @@ struct inode *ramfs_get_inode(struct super_block *sb, inode->i_ino = get_next_ino(); inode_init_owner(inode, dir, mode); inode->i_mapping->a_ops = &ramfs_aops; - inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info; + inode->i_mapping->backing_dev_info = &noop_backing_dev_info; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); mapping_set_unevictable(inode->i_mapping); inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; @@ -267,19 +259,9 @@ static struct file_system_type ramfs_fs_type = { int __init init_ramfs_fs(void) { static unsigned long once; - int err; if (test_and_set_bit(0, &once)) return 0; - - err = bdi_init(&ramfs_backing_dev_info); - if (err) - return err; - - err = register_filesystem(&ramfs_fs_type); - if (err) - bdi_destroy(&ramfs_backing_dev_info); - - return err; + return register_filesystem(&ramfs_fs_type); } fs_initcall(init_ramfs_fs); diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c index ea06c7554860..7da9e2153953 100644 --- a/fs/romfs/mmap-nommu.c +++ b/fs/romfs/mmap-nommu.c @@ -70,6 +70,15 @@ static int romfs_mmap(struct file *file, struct vm_area_struct *vma) return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS; } +static unsigned romfs_mmap_capabilities(struct file *file) +{ + struct mtd_info *mtd = file_inode(file)->i_sb->s_mtd; + + if (!mtd) + return NOMMU_MAP_COPY; + return mtd_mmap_capabilities(mtd); +} + const struct file_operations romfs_ro_fops = { .llseek = generic_file_llseek, .read = new_sync_read, @@ -77,4 +86,5 @@ const struct file_operations romfs_ro_fops = { .splice_read = generic_file_splice_read, .mmap = romfs_mmap, .get_unmapped_area = romfs_get_unmapped_area, + .mmap_capabilities = romfs_mmap_capabilities, }; diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 106bf20629ce..ed93dc6ae245 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2017,7 +2017,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) * Read-ahead will be disabled because @c->bdi.ra_pages is 0. */ c->bdi.name = "ubifs", - c->bdi.capabilities = BDI_CAP_MAP_COPY; + c->bdi.capabilities = 0; err = bdi_init(&c->bdi); if (err) goto out_close; diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index e936cea856dc..478f95d92d73 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -114,7 +114,7 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, const char *fmt, ...); int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); void bdi_unregister(struct backing_dev_info *bdi); -int __must_check bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int); +int __must_check bdi_setup_and_register(struct backing_dev_info *, char *); void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, enum wb_reason reason); void bdi_start_background_writeback(struct backing_dev_info *bdi); @@ -228,42 +228,17 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); * BDI_CAP_NO_ACCT_DIRTY: Dirty pages shouldn't contribute to accounting * BDI_CAP_NO_WRITEBACK: Don't write pages back * BDI_CAP_NO_ACCT_WB: Don't automatically account writeback pages - * - * These flags let !MMU mmap() govern direct device mapping vs immediate - * copying more easily for MAP_PRIVATE, especially for ROM filesystems. 
- * - * BDI_CAP_MAP_COPY: Copy can be mapped (MAP_PRIVATE) - * BDI_CAP_MAP_DIRECT: Can be mapped directly (MAP_SHARED) - * BDI_CAP_READ_MAP: Can be mapped for reading - * BDI_CAP_WRITE_MAP: Can be mapped for writing - * BDI_CAP_EXEC_MAP: Can be mapped for execution - * * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold. */ #define BDI_CAP_NO_ACCT_DIRTY 0x00000001 #define BDI_CAP_NO_WRITEBACK 0x00000002 -#define BDI_CAP_MAP_COPY 0x00000004 -#define BDI_CAP_MAP_DIRECT 0x00000008 -#define BDI_CAP_READ_MAP 0x00000010 -#define BDI_CAP_WRITE_MAP 0x00000020 -#define BDI_CAP_EXEC_MAP 0x00000040 -#define BDI_CAP_NO_ACCT_WB 0x00000080 -#define BDI_CAP_STABLE_WRITES 0x00000200 -#define BDI_CAP_STRICTLIMIT 0x00000400 - -#define BDI_CAP_VMFLAGS \ - (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP) +#define BDI_CAP_NO_ACCT_WB 0x00000004 +#define BDI_CAP_STABLE_WRITES 0x00000008 +#define BDI_CAP_STRICTLIMIT 0x00000010 #define BDI_CAP_NO_ACCT_AND_WRITEBACK \ (BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB) -#if defined(VM_MAYREAD) && \ - (BDI_CAP_READ_MAP != VM_MAYREAD || \ - BDI_CAP_WRITE_MAP != VM_MAYWRITE || \ - BDI_CAP_EXEC_MAP != VM_MAYEXEC) -#error please change backing_dev_info::capabilities flags -#endif - extern struct backing_dev_info default_backing_dev_info; extern struct backing_dev_info noop_backing_dev_info; diff --git a/include/linux/cdev.h b/include/linux/cdev.h index fb4591977b03..f8763615a5f2 100644 --- a/include/linux/cdev.h +++ b/include/linux/cdev.h @@ -30,6 +30,4 @@ void cdev_del(struct cdev *); void cd_forget(struct inode *); -extern struct backing_dev_info directly_mappable_cdev_bdi; - #endif diff --git a/include/linux/fs.h b/include/linux/fs.h index 42efe13077b6..1dada399aa23 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1502,6 +1502,26 @@ struct block_device_operations; #define HAVE_COMPAT_IOCTL 1 #define HAVE_UNLOCKED_IOCTL 1 +/* + * These flags let !MMU mmap() govern direct device mapping vs immediate + * copying more easily for MAP_PRIVATE, especially for ROM filesystems. 
+ * + * NOMMU_MAP_COPY: Copy can be mapped (MAP_PRIVATE) + * NOMMU_MAP_DIRECT: Can be mapped directly (MAP_SHARED) + * NOMMU_MAP_READ: Can be mapped for reading + * NOMMU_MAP_WRITE: Can be mapped for writing + * NOMMU_MAP_EXEC: Can be mapped for execution + */ +#define NOMMU_MAP_COPY 0x00000001 +#define NOMMU_MAP_DIRECT 0x00000008 +#define NOMMU_MAP_READ VM_MAYREAD +#define NOMMU_MAP_WRITE VM_MAYWRITE +#define NOMMU_MAP_EXEC VM_MAYEXEC + +#define NOMMU_VMFLAGS \ + (NOMMU_MAP_READ | NOMMU_MAP_WRITE | NOMMU_MAP_EXEC) + + struct iov_iter; struct file_operations { @@ -1536,6 +1556,9 @@ struct file_operations { long (*fallocate)(struct file *file, int mode, loff_t offset, loff_t len); void (*show_fdinfo)(struct seq_file *m, struct file *f); +#ifndef CONFIG_MMU + unsigned (*mmap_capabilities)(struct file *); +#endif }; struct inode_operations { diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index 031ff3a9a0bd..3301c4c289d6 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -408,4 +408,6 @@ static inline int mtd_is_bitflip_or_eccerr(int err) { return mtd_is_bitflip(err) || mtd_is_eccerr(err); } +unsigned mtd_mmap_capabilities(struct mtd_info *mtd); + #endif /* __MTD_MTD_H__ */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 0ae0df55000b..16c68958aeda 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -17,8 +17,6 @@ static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); struct backing_dev_info default_backing_dev_info = { .name = "default", .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, - .state = 0, - .capabilities = BDI_CAP_MAP_COPY, }; EXPORT_SYMBOL_GPL(default_backing_dev_info); @@ -513,13 +511,12 @@ EXPORT_SYMBOL(bdi_destroy); * For use from filesystems to quickly init and register a bdi associated * with dirty writeback */ -int bdi_setup_and_register(struct backing_dev_info *bdi, char *name, - unsigned int cap) +int bdi_setup_and_register(struct backing_dev_info *bdi, char *name) { int err; bdi->name = name; - bdi->capabilities = cap; + bdi->capabilities = 0; err = bdi_init(bdi); if (err) return err; diff --git a/mm/nommu.c b/mm/nommu.c index b51eadf6d952..13af96f35a4b 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -946,9 +946,6 @@ static int validate_mmap_request(struct file *file, return -EOVERFLOW; if (file) { - /* validate file mapping requests */ - struct address_space *mapping; - /* files must support mmap */ if (!file->f_op->mmap) return -ENODEV; @@ -957,28 +954,22 @@ static int validate_mmap_request(struct file *file, * - we support chardevs that provide their own "memory" * - we support files/blockdevs that are memory backed */ - mapping = file->f_mapping; - if (!mapping) - mapping = file_inode(file)->i_mapping; - - capabilities = 0; - if (mapping && mapping->backing_dev_info) - capabilities = mapping->backing_dev_info->capabilities; - - if (!capabilities) { + if (file->f_op->mmap_capabilities) { + capabilities = file->f_op->mmap_capabilities(file); + } else { /* no explicit capabilities set, so assume some * defaults */ switch (file_inode(file)->i_mode & S_IFMT) { case S_IFREG: case S_IFBLK: - capabilities = BDI_CAP_MAP_COPY; + capabilities = NOMMU_MAP_COPY; break; case S_IFCHR: capabilities = - BDI_CAP_MAP_DIRECT | - BDI_CAP_READ_MAP | - BDI_CAP_WRITE_MAP; + NOMMU_MAP_DIRECT | + NOMMU_MAP_READ | + NOMMU_MAP_WRITE; break; default: @@ -989,9 +980,9 @@ static int validate_mmap_request(struct file *file, /* eliminate any capabilities that we can't support on this * device */ if (!file->f_op->get_unmapped_area) - capabilities &= 
~BDI_CAP_MAP_DIRECT; + capabilities &= ~NOMMU_MAP_DIRECT; if (!file->f_op->read) - capabilities &= ~BDI_CAP_MAP_COPY; + capabilities &= ~NOMMU_MAP_COPY; /* The file shall have been opened with read permission. */ if (!(file->f_mode & FMODE_READ)) @@ -1010,29 +1001,29 @@ static int validate_mmap_request(struct file *file, if (locks_verify_locked(file)) return -EAGAIN; - if (!(capabilities & BDI_CAP_MAP_DIRECT)) + if (!(capabilities & NOMMU_MAP_DIRECT)) return -ENODEV; /* we mustn't privatise shared mappings */ - capabilities &= ~BDI_CAP_MAP_COPY; + capabilities &= ~NOMMU_MAP_COPY; } else { /* we're going to read the file into private memory we * allocate */ - if (!(capabilities & BDI_CAP_MAP_COPY)) + if (!(capabilities & NOMMU_MAP_COPY)) return -ENODEV; /* we don't permit a private writable mapping to be * shared with the backing device */ if (prot & PROT_WRITE) - capabilities &= ~BDI_CAP_MAP_DIRECT; + capabilities &= ~NOMMU_MAP_DIRECT; } - if (capabilities & BDI_CAP_MAP_DIRECT) { - if (((prot & PROT_READ) && !(capabilities & BDI_CAP_READ_MAP)) || - ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) || - ((prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP)) + if (capabilities & NOMMU_MAP_DIRECT) { + if (((prot & PROT_READ) && !(capabilities & NOMMU_MAP_READ)) || + ((prot & PROT_WRITE) && !(capabilities & NOMMU_MAP_WRITE)) || + ((prot & PROT_EXEC) && !(capabilities & NOMMU_MAP_EXEC)) ) { - capabilities &= ~BDI_CAP_MAP_DIRECT; + capabilities &= ~NOMMU_MAP_DIRECT; if (flags & MAP_SHARED) { printk(KERN_WARNING "MAP_SHARED not completely supported on !MMU\n"); @@ -1049,21 +1040,21 @@ static int validate_mmap_request(struct file *file, } else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) { /* handle implication of PROT_EXEC by PROT_READ */ if (current->personality & READ_IMPLIES_EXEC) { - if (capabilities & BDI_CAP_EXEC_MAP) + if (capabilities & NOMMU_MAP_EXEC) prot |= PROT_EXEC; } } else if ((prot & PROT_READ) && (prot & PROT_EXEC) && - !(capabilities & BDI_CAP_EXEC_MAP) + !(capabilities & NOMMU_MAP_EXEC) ) { /* backing file is not executable, try to copy */ - capabilities &= ~BDI_CAP_MAP_DIRECT; + capabilities &= ~NOMMU_MAP_DIRECT; } } else { /* anonymous mappings are always memory backed and can be * privately mapped */ - capabilities = BDI_CAP_MAP_COPY; + capabilities = NOMMU_MAP_COPY; /* handle PROT_EXEC implication by PROT_READ */ if ((prot & PROT_READ) && @@ -1095,7 +1086,7 @@ static unsigned long determine_vm_flags(struct file *file, vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags); /* vm_flags |= mm->def_flags; */ - if (!(capabilities & BDI_CAP_MAP_DIRECT)) { + if (!(capabilities & NOMMU_MAP_DIRECT)) { /* attempt to share read-only copies of mapped file chunks */ vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; if (file && !(prot & PROT_WRITE)) @@ -1104,7 +1095,7 @@ static unsigned long determine_vm_flags(struct file *file, /* overlay a shareable mapping on the backing device or inode * if possible - used for chardevs, ramfs/tmpfs/shmfs and * romfs/cramfs */ - vm_flags |= VM_MAYSHARE | (capabilities & BDI_CAP_VMFLAGS); + vm_flags |= VM_MAYSHARE | (capabilities & NOMMU_VMFLAGS); if (flags & MAP_SHARED) vm_flags |= VM_SHARED; } @@ -1157,7 +1148,7 @@ static int do_mmap_private(struct vm_area_struct *vma, * shared mappings on devices or memory * - VM_MAYSHARE will be set if it may attempt to share */ - if (capabilities & BDI_CAP_MAP_DIRECT) { + if (capabilities & NOMMU_MAP_DIRECT) { ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); if (ret == 0) { /* shouldn't 
return success if we're not sharing */ @@ -1346,7 +1337,7 @@ unsigned long do_mmap_pgoff(struct file *file, if ((pregion->vm_pgoff != pgoff || rpglen != pglen) && !(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) { /* new mapping is not a subset of the region */ - if (!(capabilities & BDI_CAP_MAP_DIRECT)) + if (!(capabilities & NOMMU_MAP_DIRECT)) goto sharing_violation; continue; } @@ -1385,7 +1376,7 @@ unsigned long do_mmap_pgoff(struct file *file, * - this is the hook for quasi-memory character devices to * tell us the location of a shared mapping */ - if (capabilities & BDI_CAP_MAP_DIRECT) { + if (capabilities & NOMMU_MAP_DIRECT) { addr = file->f_op->get_unmapped_area(file, addr, len, pgoff, flags); if (IS_ERR_VALUE(addr)) { @@ -1397,10 +1388,10 @@ unsigned long do_mmap_pgoff(struct file *file, * the mapping so we'll have to attempt to copy * it */ ret = -ENODEV; - if (!(capabilities & BDI_CAP_MAP_COPY)) + if (!(capabilities & NOMMU_MAP_COPY)) goto error_just_free; - capabilities &= ~BDI_CAP_MAP_DIRECT; + capabilities &= ~NOMMU_MAP_DIRECT; } else { vma->vm_start = region->vm_start = addr; vma->vm_end = region->vm_end = addr + len; @@ -1411,7 +1402,7 @@ unsigned long do_mmap_pgoff(struct file *file, vma->vm_region = region; /* set up the mapping - * - the region is filled in if BDI_CAP_MAP_DIRECT is still set + * - the region is filled in if NOMMU_MAP_DIRECT is still set */ if (file && vma->vm_flags & VM_SHARED) ret = do_mmap_shared_file(vma); diff --git a/security/security.c b/security/security.c index 18b35c63fc0c..a0442b20f001 100644 --- a/security/security.c +++ b/security/security.c @@ -726,16 +726,15 @@ static inline unsigned long mmap_prot(struct file *file, unsigned long prot) return prot | PROT_EXEC; /* * ditto if it's not on noexec mount, except that on !MMU we need - * BDI_CAP_EXEC_MMAP (== VM_MAYEXEC) in this case + * NOMMU_MAP_EXEC (== VM_MAYEXEC) in this case */ if (!(file->f_path.mnt->mnt_flags & MNT_NOEXEC)) { #ifndef CONFIG_MMU - unsigned long caps = 0; - struct address_space *mapping = file->f_mapping; - if (mapping && mapping->backing_dev_info) - caps = mapping->backing_dev_info->capabilities; - if (!(caps & BDI_CAP_EXEC_MAP)) - return prot; + if (file->f_op->mmap_capabilities) { + unsigned caps = file->f_op->mmap_capabilities(file); + if (!(caps & NOMMU_MAP_EXEC)) + return prot; + } #endif return prot | PROT_EXEC; } -- cgit v1.2.3 From de1414a654e66b81b5348dbc5259ecf2fb61655e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Jan 2015 10:42:36 +0100 Subject: fs: export inode_to_bdi and use it in favor of mapping->backing_dev_info Now that we got rid of the bdi abuse on character devices we can always use sb->s_bdi to get at the backing_dev_info for a file, except for the block device special case. Export inode_to_bdi and replace uses of mapping->backing_dev_info with it to prepare for the removal of mapping->backing_dev_info. 
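A minimal model of the lookup rule this centralizes, in plain C with stand-in struct layouts (illustrative only, not the kernel implementation): inodes on the block-device pseudo filesystem resolve to the device queue's bdi, and every other inode now answers through its superblock.

/*
 * Minimal model of the inode_to_bdi() lookup rule -- stand-in structs,
 * not kernel code.
 */
#include <stdio.h>

struct bdi   { const char *name; };
struct sb    { struct bdi *s_bdi; int is_blkdev_sb; };
struct inode { struct sb *i_sb; struct bdi *blkdev_bdi; };

static struct bdi *inode_to_bdi(struct inode *inode)
{
    /* block device inodes keep their own queue-backed bdi ... */
    if (inode->i_sb->is_blkdev_sb)
        return inode->blkdev_bdi;
    /* ... everything else answers via the superblock */
    return inode->i_sb->s_bdi;
}

int main(void)
{
    struct bdi disk = { "block" }, fsbdi = { "ext4" };
    struct sb ext4_sb = { &fsbdi, 0 };
    struct sb bdev_sb = { NULL, 1 };
    struct inode file = { &ext4_sb, NULL };
    struct inode bdev_inode = { &bdev_sb, &disk };

    printf("%s\n", inode_to_bdi(&file)->name);       /* "ext4"  */
    printf("%s\n", inode_to_bdi(&bdev_inode)->name); /* "block" */
    return 0;
}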
Signed-off-by: Christoph Hellwig Reviewed-by: Tejun Heo Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- fs/btrfs/file.c | 2 +- fs/ceph/file.c | 2 +- fs/ext2/ialloc.c | 2 +- fs/ext4/super.c | 2 +- fs/fs-writeback.c | 3 ++- fs/fuse/file.c | 10 +++++----- fs/gfs2/aops.c | 2 +- fs/gfs2/super.c | 2 +- fs/nfs/filelayout/filelayout.c | 2 +- fs/nfs/write.c | 6 +++--- fs/ntfs/file.c | 3 ++- fs/ocfs2/file.c | 2 +- fs/xfs/xfs_file.c | 2 +- include/linux/backing-dev.h | 6 ++++-- include/trace/events/writeback.h | 6 +++--- mm/fadvise.c | 4 ++-- mm/filemap.c | 4 ++-- mm/filemap_xip.c | 3 ++- mm/page-writeback.c | 29 +++++++++++++---------------- mm/readahead.c | 4 ++-- mm/truncate.c | 2 +- mm/vmscan.c | 4 ++-- 22 files changed, 52 insertions(+), 50 deletions(-) (limited to 'include/linux') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e4090259569b..835c04a874fd 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1746,7 +1746,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, mutex_lock(&inode->i_mutex); - current->backing_dev_info = inode->i_mapping->backing_dev_info; + current->backing_dev_info = inode_to_bdi(inode); err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) { mutex_unlock(&inode->i_mutex); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index ce74b394b49d..905986dd4c3c 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -945,7 +945,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) mutex_lock(&inode->i_mutex); /* We can write back this queue in page reclaim */ - current->backing_dev_info = file->f_mapping->backing_dev_info; + current->backing_dev_info = inode_to_bdi(inode); err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 7d66fb0e4cca..6c14bb8322fa 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -170,7 +170,7 @@ static void ext2_preread_inode(struct inode *inode) struct ext2_group_desc * gdp; struct backing_dev_info *bdi; - bdi = inode->i_mapping->backing_dev_info; + bdi = inode_to_bdi(inode); if (bdi_read_congested(bdi)) return; if (bdi_write_congested(bdi)) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 74c5f53595fb..ad88e601a6cd 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -334,7 +334,7 @@ static void save_error_info(struct super_block *sb, const char *func, static int block_device_ejected(struct super_block *sb) { struct inode *bd_inode = sb->s_bdev->bd_inode; - struct backing_dev_info *bdi = bd_inode->i_mapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(bd_inode); return bdi->dev == NULL; } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index e8116a44cc29..a20b1145f4d5 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -66,7 +66,7 @@ int writeback_in_progress(struct backing_dev_info *bdi) } EXPORT_SYMBOL(writeback_in_progress); -static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) +struct backing_dev_info *inode_to_bdi(struct inode *inode) { struct super_block *sb = inode->i_sb; #ifdef CONFIG_BLOCK @@ -75,6 +75,7 @@ static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) #endif return sb->s_bdi; } +EXPORT_SYMBOL_GPL(inode_to_bdi); static inline struct inode *wb_inode(struct list_head *head) { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 760b2c552197..19d80b82d344 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1159,7 +1159,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) 
mutex_lock(&inode->i_mutex); /* We can write back this queue in page reclaim */ - current->backing_dev_info = mapping->backing_dev_info; + current->backing_dev_info = inode_to_bdi(inode); err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) @@ -1464,7 +1464,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) { struct inode *inode = req->inode; struct fuse_inode *fi = get_fuse_inode(inode); - struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(inode); int i; list_del(&req->writepages_entry); @@ -1658,7 +1658,7 @@ static int fuse_writepage_locked(struct page *page) req->end = fuse_writepage_end; req->inode = inode; - inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK); + inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK); inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); spin_lock(&fc->lock); @@ -1768,7 +1768,7 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req, if (old_req->num_pages == 1 && (old_req->state == FUSE_REQ_INIT || old_req->state == FUSE_REQ_PENDING)) { - struct backing_dev_info *bdi = page->mapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(page->mapping->host); copy_highpage(old_req->pages[0], page); spin_unlock(&fc->lock); @@ -1872,7 +1872,7 @@ static int fuse_writepages_fill(struct page *page, req->page_descs[req->num_pages].offset = 0; req->page_descs[req->num_pages].length = PAGE_SIZE; - inc_bdi_stat(page->mapping->backing_dev_info, BDI_WRITEBACK); + inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK); inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); err = 0; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 805b37fed638..4ad4f94edebe 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -289,7 +289,7 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - trace_wbc_writepage(wbc, mapping->backing_dev_info); + trace_wbc_writepage(wbc, inode_to_bdi(inode)); ret = __gfs2_jdata_writepage(page, wbc); if (unlikely(ret)) { diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 5b327f837de7..1666382b198d 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -743,7 +743,7 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl); - struct backing_dev_info *bdi = metamapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(metamapping->host); int ret = 0; if (wbc->sync_mode == WB_SYNC_ALL) diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 7afb52f6a25a..51aa889611cf 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -1081,7 +1081,7 @@ mds_commit: spin_unlock(cinfo->lock); if (!cinfo->dreq) { inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info, + inc_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host), BDI_RECLAIMABLE); __mark_inode_dirty(req->wb_context->dentry->d_inode, I_DIRTY_DATASYNC); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index af3af685a9e3..298abcc5281b 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -786,7 +786,7 @@ nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst, spin_unlock(cinfo->lock); if (!cinfo->dreq) { inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info, + 
inc_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host), BDI_RECLAIMABLE); __mark_inode_dirty(req->wb_context->dentry->d_inode, I_DIRTY_DATASYNC); @@ -853,7 +853,7 @@ static void nfs_clear_page_commit(struct page *page) { dec_zone_page_state(page, NR_UNSTABLE_NFS); - dec_bdi_stat(page_file_mapping(page)->backing_dev_info, BDI_RECLAIMABLE); + dec_bdi_stat(inode_to_bdi(page_file_mapping(page)->host), BDI_RECLAIMABLE); } /* Called holding inode (/cinfo) lock */ @@ -1564,7 +1564,7 @@ void nfs_retry_commit(struct list_head *page_list, nfs_mark_request_commit(req, lseg, cinfo); if (!cinfo->dreq) { dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - dec_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info, + dec_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host), BDI_RECLAIMABLE); } nfs_unlock_and_release_request(req); diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 643faa44f22b..1da9b2d184dc 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -19,6 +19,7 @@ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include #include #include #include @@ -2091,7 +2092,7 @@ static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb, count = iov_length(iov, nr_segs); pos = *ppos; /* We can write back this queue in page reclaim. */ - current->backing_dev_info = mapping->backing_dev_info; + current->backing_dev_info = inode_to_bdi(inode); written = 0; err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 3950693dd0f6..abe7d98d6178 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2363,7 +2363,7 @@ relock: goto out_dio; } } else { - current->backing_dev_info = file->f_mapping->backing_dev_info; + current->backing_dev_info = inode_to_bdi(inode); written = generic_perform_write(file, from, *ppos); if (likely(written >= 0)) iocb->ki_pos = *ppos + written; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 13e974e6a889..5684ac3e7d18 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -699,7 +699,7 @@ xfs_file_buffered_aio_write( iov_iter_truncate(from, count); /* We can write back this queue in page reclaim */ - current->backing_dev_info = mapping->backing_dev_info; + current->backing_dev_info = inode_to_bdi(inode); write_retry: trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0); diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 478f95d92d73..ed59dee03a71 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -106,6 +106,8 @@ struct backing_dev_info { #endif }; +struct backing_dev_info *inode_to_bdi(struct inode *inode); + int __must_check bdi_init(struct backing_dev_info *bdi); void bdi_destroy(struct backing_dev_info *bdi); @@ -303,12 +305,12 @@ static inline bool bdi_cap_account_writeback(struct backing_dev_info *bdi) static inline bool mapping_cap_writeback_dirty(struct address_space *mapping) { - return bdi_cap_writeback_dirty(mapping->backing_dev_info); + return bdi_cap_writeback_dirty(inode_to_bdi(mapping->host)); } static inline bool mapping_cap_account_dirty(struct address_space *mapping) { - return bdi_cap_account_dirty(mapping->backing_dev_info); + return bdi_cap_account_dirty(inode_to_bdi(mapping->host)); } static inline int bdi_sched_wait(void *word) diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index cee02d65ab3f..74f5207bd090 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -47,7 +47,7 @@ 
TRACE_EVENT(writeback_dirty_page, TP_fast_assign( strncpy(__entry->name, - mapping ? dev_name(mapping->backing_dev_info->dev) : "(unknown)", 32); + mapping ? dev_name(inode_to_bdi(mapping->host)->dev) : "(unknown)", 32); __entry->ino = mapping ? mapping->host->i_ino : 0; __entry->index = page->index; ), @@ -72,7 +72,7 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template, ), TP_fast_assign( - struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(inode); /* may be called for files on pseudo FSes w/ unregistered bdi */ strncpy(__entry->name, @@ -116,7 +116,7 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template, TP_fast_assign( strncpy(__entry->name, - dev_name(inode->i_mapping->backing_dev_info->dev), 32); + dev_name(inode_to_bdi(inode)->dev), 32); __entry->ino = inode->i_ino; __entry->sync_mode = wbc->sync_mode; ), diff --git a/mm/fadvise.c b/mm/fadvise.c index 2ad7adf4f0a4..fac23ecf8d72 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -73,7 +73,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) else endbyte--; /* inclusive */ - bdi = mapping->backing_dev_info; + bdi = inode_to_bdi(mapping->host); switch (advice) { case POSIX_FADV_NORMAL: @@ -113,7 +113,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) case POSIX_FADV_NOREUSE: break; case POSIX_FADV_DONTNEED: - if (!bdi_write_congested(mapping->backing_dev_info)) + if (!bdi_write_congested(bdi)) __filemap_fdatawrite_range(mapping, offset, endbyte, WB_SYNC_NONE); diff --git a/mm/filemap.c b/mm/filemap.c index 673e4581a2e5..5d7c23c26f81 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -211,7 +211,7 @@ void __delete_from_page_cache(struct page *page, void *shadow) */ if (PageDirty(page) && mapping_cap_account_dirty(mapping)) { dec_zone_page_state(page, NR_FILE_DIRTY); - dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE); } } @@ -2565,7 +2565,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) size_t count = iov_iter_count(from); /* We can write back this queue in page reclaim */ - current->backing_dev_info = mapping->backing_dev_info; + current->backing_dev_info = inode_to_bdi(inode); err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) goto out; diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 0d105aeff82f..26897fbfbe19 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -9,6 +9,7 @@ */ #include +#include #include #include #include @@ -410,7 +411,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len, count = len; /* We can write back this queue in page reclaim */ - current->backing_dev_info = mapping->backing_dev_info; + current->backing_dev_info = inode_to_bdi(inode); ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode)); if (ret) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 6f4335238e33..d4cbb4bd7d1c 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1351,7 +1351,7 @@ static void balance_dirty_pages(struct address_space *mapping, unsigned long task_ratelimit; unsigned long dirty_ratelimit; unsigned long pos_ratio; - struct backing_dev_info *bdi = mapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(mapping->host); bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; unsigned long start_time = jiffies; @@ -1574,7 +1574,7 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; */ void 
balance_dirty_pages_ratelimited(struct address_space *mapping) { - struct backing_dev_info *bdi = mapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(mapping->host); int ratelimit; int *p; @@ -1929,7 +1929,7 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - trace_wbc_writepage(wbc, mapping->backing_dev_info); + trace_wbc_writepage(wbc, inode_to_bdi(mapping->host)); ret = (*writepage)(page, wbc, data); if (unlikely(ret)) { if (ret == AOP_WRITEPAGE_ACTIVATE) { @@ -2094,10 +2094,12 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) trace_writeback_dirty_page(page, mapping); if (mapping_cap_account_dirty(mapping)) { + struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_zone_page_state(page, NR_DIRTIED); - __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); - __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED); + __inc_bdi_stat(bdi, BDI_RECLAIMABLE); + __inc_bdi_stat(bdi, BDI_DIRTIED); task_io_account_write(PAGE_CACHE_SIZE); current->nr_dirtied++; this_cpu_inc(bdp_ratelimits); @@ -2156,7 +2158,7 @@ void account_page_redirty(struct page *page) if (mapping && mapping_cap_account_dirty(mapping)) { current->nr_dirtied--; dec_zone_page_state(page, NR_DIRTIED); - dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED); + dec_bdi_stat(inode_to_bdi(mapping->host), BDI_DIRTIED); } } EXPORT_SYMBOL(account_page_redirty); @@ -2295,7 +2297,7 @@ int clear_page_dirty_for_io(struct page *page) */ if (TestClearPageDirty(page)) { dec_zone_page_state(page, NR_FILE_DIRTY); - dec_bdi_stat(mapping->backing_dev_info, + dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE); return 1; } @@ -2315,7 +2317,7 @@ int test_clear_page_writeback(struct page *page) memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags); if (mapping) { - struct backing_dev_info *bdi = mapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(mapping->host); unsigned long flags; spin_lock_irqsave(&mapping->tree_lock, flags); @@ -2352,7 +2354,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags); if (mapping) { - struct backing_dev_info *bdi = mapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(mapping->host); unsigned long flags; spin_lock_irqsave(&mapping->tree_lock, flags); @@ -2406,12 +2408,7 @@ EXPORT_SYMBOL(mapping_tagged); */ void wait_for_stable_page(struct page *page) { - struct address_space *mapping = page_mapping(page); - struct backing_dev_info *bdi = mapping->backing_dev_info; - - if (!bdi_cap_stable_pages_required(bdi)) - return; - - wait_on_page_writeback(page); + if (bdi_cap_stable_pages_required(inode_to_bdi(page->mapping->host))) + wait_on_page_writeback(page); } EXPORT_SYMBOL_GPL(wait_for_stable_page); diff --git a/mm/readahead.c b/mm/readahead.c index 17b9172ec37f..935675844b2e 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -27,7 +27,7 @@ void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) { - ra->ra_pages = mapping->backing_dev_info->ra_pages; + ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages; ra->prev_pos = -1; } EXPORT_SYMBOL_GPL(file_ra_state_init); @@ -541,7 +541,7 @@ page_cache_async_readahead(struct address_space *mapping, /* * Defer asynchronous read-ahead on IO congestion. 
*/ - if (bdi_read_congested(mapping->backing_dev_info)) + if (bdi_read_congested(inode_to_bdi(mapping->host))) return; /* do read-ahead */ diff --git a/mm/truncate.c b/mm/truncate.c index f1e4d6052369..ddec5a5966d7 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -112,7 +112,7 @@ void cancel_dirty_page(struct page *page, unsigned int account_size) struct address_space *mapping = page->mapping; if (mapping && mapping_cap_account_dirty(mapping)) { dec_zone_page_state(page, NR_FILE_DIRTY); - dec_bdi_stat(mapping->backing_dev_info, + dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE); if (account_size) task_io_account_cancelled_write(account_size); diff --git a/mm/vmscan.c b/mm/vmscan.c index ab2505c3ef54..e00a16393f21 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -497,7 +497,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, } if (mapping->a_ops->writepage == NULL) return PAGE_ACTIVATE; - if (!may_write_to_queue(mapping->backing_dev_info, sc)) + if (!may_write_to_queue(inode_to_bdi(mapping->host), sc)) return PAGE_KEEP; if (clear_page_dirty_for_io(page)) { @@ -876,7 +876,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ mapping = page_mapping(page); if (((dirty || writeback) && mapping && - bdi_write_congested(mapping->backing_dev_info)) || + bdi_write_congested(inode_to_bdi(mapping->host))) || (writeback && PageReclaim(page))) nr_congested++; -- cgit v1.2.3 From b83ae6d421435c6204150300f1c25bfbd39cd62b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Jan 2015 10:42:37 +0100 Subject: fs: remove mapping->backing_dev_info Now that we never use the backing_dev_info pointer in struct address_space we can simply remove it and save 4 to 8 bytes in every inode. Signed-off-by: Christoph Hellwig Acked-by: Ryusuke Konishi Reviewed-by: Tejun Heo Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- drivers/char/raw.c | 4 +--- fs/aio.c | 1 - fs/block_dev.c | 26 +------------------------- fs/btrfs/disk-io.c | 1 - fs/btrfs/inode.c | 6 ------ fs/ceph/inode.c | 2 -- fs/cifs/inode.c | 2 -- fs/configfs/inode.c | 1 - fs/ecryptfs/inode.c | 1 - fs/exofs/inode.c | 2 -- fs/fuse/inode.c | 1 - fs/gfs2/glock.c | 1 - fs/gfs2/ops_fstype.c | 1 - fs/hugetlbfs/inode.c | 1 - fs/inode.c | 13 ------------- fs/kernfs/inode.c | 1 - fs/ncpfs/inode.c | 1 - fs/nfs/inode.c | 1 - fs/nilfs2/gcinode.c | 1 - fs/nilfs2/mdt.c | 6 ++---- fs/nilfs2/page.c | 4 +--- fs/nilfs2/page.h | 3 +-- fs/nilfs2/super.c | 2 +- fs/ocfs2/dlmfs/dlmfs.c | 2 -- fs/ramfs/inode.c | 1 - fs/romfs/super.c | 3 --- fs/ubifs/dir.c | 2 -- fs/ubifs/super.c | 3 --- include/linux/fs.h | 3 +-- mm/backing-dev.c | 1 - mm/shmem.c | 1 - mm/swap_state.c | 1 - 32 files changed, 8 insertions(+), 91 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/raw.c b/drivers/char/raw.c index a24891b97547..6e29bf2db536 100644 --- a/drivers/char/raw.c +++ b/drivers/char/raw.c @@ -104,11 +104,9 @@ static int raw_release(struct inode *inode, struct file *filp) mutex_lock(&raw_mutex); bdev = raw_devices[minor].binding; - if (--raw_devices[minor].inuse == 0) { + if (--raw_devices[minor].inuse == 0) /* Here inode->i_mapping == bdev->bd_inode->i_mapping */ inode->i_mapping = &inode->i_data; - inode->i_mapping->backing_dev_info = &default_backing_dev_info; - } mutex_unlock(&raw_mutex); blkdev_put(bdev, filp->f_mode | FMODE_EXCL); diff --git a/fs/aio.c b/fs/aio.c index 6f13d3fab07f..3bf8b1d250c3 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -176,7 +176,6 @@ static struct file *aio_private_file(struct kioctx *ctx, 
loff_t nr_pages) inode->i_mapping->a_ops = &aio_ctx_aops; inode->i_mapping->private_data = ctx; - inode->i_mapping->backing_dev_info = &noop_backing_dev_info; inode->i_size = PAGE_SIZE * nr_pages; path.dentry = d_alloc_pseudo(aio_mnt->mnt_sb, &this); diff --git a/fs/block_dev.c b/fs/block_dev.c index 026ca7b8431c..a9f92794d7a0 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -60,19 +60,6 @@ static void bdev_write_inode(struct inode *inode) spin_unlock(&inode->i_lock); } -/* - * Move the inode from its current bdi to a new bdi. Make sure the inode - * is clean before moving so that it doesn't linger on the old bdi. - */ -static void bdev_inode_switch_bdi(struct inode *inode, - struct backing_dev_info *dst) -{ - spin_lock(&inode->i_lock); - WARN_ON_ONCE(inode->i_state & I_DIRTY); - inode->i_data.backing_dev_info = dst; - spin_unlock(&inode->i_lock); -} - /* Kill _all_ buffers and pagecache , dirty or not.. */ void kill_bdev(struct block_device *bdev) { @@ -589,7 +576,6 @@ struct block_device *bdget(dev_t dev) inode->i_bdev = bdev; inode->i_data.a_ops = &def_blk_aops; mapping_set_gfp_mask(&inode->i_data, GFP_USER); - inode->i_data.backing_dev_info = &default_backing_dev_info; spin_lock(&bdev_lock); list_add(&bdev->bd_list, &all_bdevs); spin_unlock(&bdev_lock); @@ -1150,8 +1136,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; if (!partno) { - struct backing_dev_info *bdi; - ret = -ENXIO; bdev->bd_part = disk_get_part(disk, partno); if (!bdev->bd_part) @@ -1177,11 +1161,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) } } - if (!ret) { + if (!ret) bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); - bdi = blk_get_backing_dev_info(bdev); - bdev_inode_switch_bdi(bdev->bd_inode, bdi); - } /* * If the device is invalidated, rescan partition @@ -1208,8 +1189,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) if (ret) goto out_clear; bdev->bd_contains = whole; - bdev_inode_switch_bdi(bdev->bd_inode, - whole->bd_inode->i_data.backing_dev_info); bdev->bd_part = disk_get_part(disk, partno); if (!(disk->flags & GENHD_FL_UP) || !bdev->bd_part || !bdev->bd_part->nr_sects) { @@ -1249,7 +1228,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = NULL; bdev->bd_part = NULL; bdev->bd_queue = NULL; - bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info); if (bdev != bdev->bd_contains) __blkdev_put(bdev->bd_contains, mode, 1); bdev->bd_contains = NULL; @@ -1474,8 +1452,6 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) * dirty data before. 
*/ bdev_write_inode(bdev->bd_inode); - bdev_inode_switch_bdi(bdev->bd_inode, - &default_backing_dev_info); } if (bdev->bd_contains == bdev) { if (disk->fops->release) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index afc4092989cd..1ec872e3a926 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2318,7 +2318,6 @@ int open_ctree(struct super_block *sb, */ fs_info->btree_inode->i_size = OFFSET_MAX; fs_info->btree_inode->i_mapping->a_ops = &btree_aops; - fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi; RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node); extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8bf326affb94..54bcf639d1cf 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3608,7 +3608,6 @@ cache_acl: switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_mapping->a_ops = &btrfs_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; @@ -3623,7 +3622,6 @@ cache_acl: case S_IFLNK: inode->i_op = &btrfs_symlink_inode_operations; inode->i_mapping->a_ops = &btrfs_symlink_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; break; default: inode->i_op = &btrfs_special_inode_operations; @@ -6088,7 +6086,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) @@ -9203,7 +9200,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); @@ -9247,7 +9243,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode->i_op = &btrfs_symlink_inode_operations; inode->i_mapping->a_ops = &btrfs_symlink_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; inode_set_bytes(inode, name_len); btrfs_i_size_write(inode, name_len); err = btrfs_update_inode(trans, root, inode); @@ -9459,7 +9454,6 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; ret = btrfs_init_inode_security(trans, inode, dir, NULL); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index f61a74115beb..6b5173605154 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -783,8 +783,6 @@ static int fill_inode(struct inode *inode, struct page *locked_page, } inode->i_mapping->a_ops = &ceph_aops; - inode->i_mapping->backing_dev_info = - &ceph_sb_to_client(inode->i_sb)->backing_dev_info; switch (inode->i_mode & S_IFMT) { case S_IFIFO: diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 0c3ce464cae4..2d4f37235ed0 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -937,8 +937,6 @@ retry_iget5_locked: inode->i_flags |= S_NOATIME | S_NOCMTIME; if (inode->i_state & I_NEW) { inode->i_ino = hash; - if (S_ISREG(inode->i_mode)) - inode->i_data.backing_dev_info 
= sb->s_bdi; #ifdef CONFIG_CIFS_FSCACHE /* initialize per-inode cache cookie pointer */ CIFS_I(inode)->fscache = NULL; diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 0ad6b4d6de00..65af86147154 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -131,7 +131,6 @@ struct inode *configfs_new_inode(umode_t mode, struct configfs_dirent *sd, if (inode) { inode->i_ino = get_next_ino(); inode->i_mapping->a_ops = &configfs_aops; - inode->i_mapping->backing_dev_info = &noop_backing_dev_info; inode->i_op = &configfs_inode_operations; if (sd->s_iattr) { diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 1686dc2da9fd..34b36a504059 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -67,7 +67,6 @@ static int ecryptfs_inode_set(struct inode *inode, void *opaque) inode->i_ino = lower_inode->i_ino; inode->i_version++; inode->i_mapping->a_ops = &ecryptfs_aops; - inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi; if (S_ISLNK(inode->i_mode)) inode->i_op = &ecryptfs_symlink_iops; diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index f1d3d4eb8c4f..6fc91df99ff8 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -1214,7 +1214,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data)); } - inode->i_mapping->backing_dev_info = sb->s_bdi; if (S_ISREG(inode->i_mode)) { inode->i_op = &exofs_file_inode_operations; inode->i_fop = &exofs_file_operations; @@ -1314,7 +1313,6 @@ struct inode *exofs_new_inode(struct inode *dir, umode_t mode) set_obj_2bcreated(oi); - inode->i_mapping->backing_dev_info = sb->s_bdi; inode_init_owner(inode, dir, mode); inode->i_ino = sbi->s_nextid++; inode->i_blkbits = EXOFS_BLKSHIFT; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index f38256e4476e..e8799c11424b 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -308,7 +308,6 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, if (!fc->writeback_cache || !S_ISREG(attr->mode)) inode->i_flags |= S_NOCMTIME; inode->i_generation = generation; - inode->i_data.backing_dev_info = &fc->bdi; fuse_init_inode(inode, attr); unlock_new_inode(inode); } else if ((inode->i_mode ^ attr->mode) & S_IFMT) { diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index a23524aa3eac..08ea717981f7 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -775,7 +775,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_NOFS); mapping->private_data = NULL; - mapping->backing_dev_info = s->s_bdi; mapping->writeback_index = 0; } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 8633ad328ee2..efc8e254787c 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -112,7 +112,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_NOFS); mapping->private_data = NULL; - mapping->backing_dev_info = sb->s_bdi; mapping->writeback_index = 0; spin_lock_init(&sdp->sd_log_lock); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index de7c95c7d840..c274aca8e8dc 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -492,7 +492,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, &hugetlbfs_i_mmap_rwsem_key); inode->i_mapping->a_ops = &hugetlbfs_aops; - inode->i_mapping->backing_dev_info = &noop_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_mapping->private_data = resv_map; info = HUGETLBFS_I(inode); 
diff --git a/fs/inode.c b/fs/inode.c index aa149e7262ac..e4e8caa7464c 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -170,20 +170,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) atomic_set(&mapping->i_mmap_writable, 0); mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); mapping->private_data = NULL; - mapping->backing_dev_info = &default_backing_dev_info; mapping->writeback_index = 0; - - /* - * If the block_device provides a backing_dev_info for client - * inodes then use that. Otherwise the inode share the bdev's - * backing_dev_info. - */ - if (sb->s_bdev) { - struct backing_dev_info *bdi; - - bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; - mapping->backing_dev_info = bdi; - } inode->i_private = NULL; inode->i_mapping = mapping; INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */ diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 06f06887b2d2..9000874a945b 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -286,7 +286,6 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode) kernfs_get(kn); inode->i_private = kn; inode->i_mapping->a_ops = &kernfs_aops; - inode->i_mapping->backing_dev_info = &noop_backing_dev_info; inode->i_op = &kernfs_iops; set_default_inode_attr(inode, kn->mode); diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index a699a3fc62c0..01a9e16e9782 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -267,7 +267,6 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info) if (inode) { atomic_set(&NCP_FINFO(inode)->opened, info->opened); - inode->i_mapping->backing_dev_info = sb->s_bdi; inode->i_ino = info->ino; ncp_set_attr(inode, info); if (S_ISREG(inode->i_mode)) { diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 4bffe637ea32..24aac72420f4 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -387,7 +387,6 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st if (S_ISREG(inode->i_mode)) { inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops; inode->i_data.a_ops = &nfs_file_aops; - inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info; } else if (S_ISDIR(inode->i_mode)) { inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops; inode->i_fop = &nfs_dir_operations; diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index 57ceaf33d177..748ca238915a 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -172,7 +172,6 @@ int nilfs_init_gcinode(struct inode *inode) inode->i_mode = S_IFREG; mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); inode->i_mapping->a_ops = &empty_aops; - inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi; ii->i_flags = 0; nilfs_bmap_init_gc(ii->i_bmap); diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index c4dcd1db57ee..892cf5ffdb8e 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -429,7 +429,6 @@ int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz) inode->i_mode = S_IFREG; mapping_set_gfp_mask(inode->i_mapping, gfp_mask); - inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi; inode->i_op = &def_mdt_iops; inode->i_fop = &def_mdt_fops; @@ -457,13 +456,12 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode, struct nilfs_shadow_map *shadow) { struct nilfs_mdt_info *mi = NILFS_MDT(inode); - struct backing_dev_info *bdi = inode->i_sb->s_bdi; INIT_LIST_HEAD(&shadow->frozen_buffers); address_space_init_once(&shadow->frozen_data); - nilfs_mapping_init(&shadow->frozen_data, inode, bdi); + nilfs_mapping_init(&shadow->frozen_data, inode); 
address_space_init_once(&shadow->frozen_btnodes); - nilfs_mapping_init(&shadow->frozen_btnodes, inode, bdi); + nilfs_mapping_init(&shadow->frozen_btnodes, inode); mi->mi_shadow = shadow; return 0; } diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index da276640f776..700ecbcca55d 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -461,14 +461,12 @@ unsigned nilfs_page_count_clean_buffers(struct page *page, return nc; } -void nilfs_mapping_init(struct address_space *mapping, struct inode *inode, - struct backing_dev_info *bdi) +void nilfs_mapping_init(struct address_space *mapping, struct inode *inode) { mapping->host = inode; mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_NOFS); mapping->private_data = NULL; - mapping->backing_dev_info = bdi; mapping->a_ops = &empty_aops; } diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h index ef30c5c2426f..a43b8287d012 100644 --- a/fs/nilfs2/page.h +++ b/fs/nilfs2/page.h @@ -57,8 +57,7 @@ int nilfs_copy_dirty_pages(struct address_space *, struct address_space *); void nilfs_copy_back_pages(struct address_space *, struct address_space *); void nilfs_clear_dirty_page(struct page *, bool); void nilfs_clear_dirty_pages(struct address_space *, bool); -void nilfs_mapping_init(struct address_space *mapping, struct inode *inode, - struct backing_dev_info *bdi); +void nilfs_mapping_init(struct address_space *mapping, struct inode *inode); unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned); unsigned long nilfs_find_uncommitted_extent(struct inode *inode, sector_t start_blk, diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 3d4bbac36bea..5bc2a1cf73c3 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -166,7 +166,7 @@ struct inode *nilfs_alloc_inode(struct super_block *sb) ii->i_state = 0; ii->i_cno = 0; ii->vfs_inode.i_version = 1; - nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode, sb->s_bdi); + nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode); return &ii->vfs_inode; } diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 6000d3029b26..061ba6a91bf2 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -398,7 +398,6 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb) if (inode) { inode->i_ino = get_next_ino(); inode_init_owner(inode, NULL, mode); - inode->i_mapping->backing_dev_info = &noop_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inc_nlink(inode); @@ -422,7 +421,6 @@ static struct inode *dlmfs_get_inode(struct inode *parent, inode->i_ino = get_next_ino(); inode_init_owner(inode, parent, mode); - inode->i_mapping->backing_dev_info = &noop_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; ip = DLMFS_I(inode); diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index ad4d712002f4..889d558b4e05 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -59,7 +59,6 @@ struct inode *ramfs_get_inode(struct super_block *sb, inode->i_ino = get_next_ino(); inode_init_owner(inode, dir, mode); inode->i_mapping->a_ops = &ramfs_aops; - inode->i_mapping->backing_dev_info = &noop_backing_dev_info; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); mapping_set_unevictable(inode->i_mapping); inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; diff --git a/fs/romfs/super.c b/fs/romfs/super.c index e98dd88197d5..268733cda397 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -355,9 +355,6 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos) case ROMFH_REG: i->i_fop 
= &romfs_ro_fops; i->i_data.a_ops = &romfs_aops; - if (i->i_sb->s_mtd) - i->i_data.backing_dev_info = - i->i_sb->s_mtd->backing_dev_info; if (nextfh & ROMFH_EXEC) mode |= S_IXUGO; break; diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index ea41649e4ca5..c49b1981ac95 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -108,8 +108,6 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, inode->i_mtime = inode->i_atime = inode->i_ctime = ubifs_current_time(inode); inode->i_mapping->nrpages = 0; - /* Disable readahead */ - inode->i_mapping->backing_dev_info = &c->bdi; switch (mode & S_IFMT) { case S_IFREG: diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index ed93dc6ae245..6197154f36ca 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -156,9 +156,6 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) if (err) goto out_invalid; - /* Disable read-ahead */ - inode->i_mapping->backing_dev_info = &c->bdi; - switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_mapping->a_ops = &ubifs_file_address_operations; diff --git a/include/linux/fs.h b/include/linux/fs.h index 1dada399aa23..65d02de342e1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -34,6 +34,7 @@ #include #include +struct backing_dev_info; struct export_operations; struct hd_geometry; struct iovec; @@ -394,7 +395,6 @@ int pagecache_write_end(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata); -struct backing_dev_info; struct address_space { struct inode *host; /* owner: inode, block_device */ struct radix_tree_root page_tree; /* radix tree of all pages */ @@ -409,7 +409,6 @@ struct address_space { pgoff_t writeback_index;/* writeback starts here */ const struct address_space_operations *a_ops; /* methods */ unsigned long flags; /* error bits/gfp mask */ - struct backing_dev_info *backing_dev_info; /* device readahead, etc */ spinlock_t private_lock; /* for use by the address_space */ struct list_head private_list; /* ditto */ void *private_data; /* ditto */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 16c68958aeda..52e0c7652448 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -24,7 +24,6 @@ struct backing_dev_info noop_backing_dev_info = { .name = "noop", .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; -EXPORT_SYMBOL_GPL(noop_backing_dev_info); static struct class *bdi_class; diff --git a/mm/shmem.c b/mm/shmem.c index 1b77eaf589fd..4c61d3d5bfb4 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1410,7 +1410,6 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode inode->i_ino = get_next_ino(); inode_init_owner(inode, dir, mode); inode->i_blocks = 0; - inode->i_mapping->backing_dev_info = &noop_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_generation = get_seconds(); info = SHMEM_I(inode); diff --git a/mm/swap_state.c b/mm/swap_state.c index 1c137b69ecde..405923f77334 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -37,7 +37,6 @@ struct address_space swapper_spaces[MAX_SWAPFILES] = { .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), .i_mmap_writable = ATOMIC_INIT(0), .a_ops = &swap_aops, - .backing_dev_info = &noop_backing_dev_info, } }; -- cgit v1.2.3 From df0ce26cb4ee8bc233d50213b97213532aff0a3c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Jan 2015 10:42:41 +0100 Subject: fs: remove default_backing_dev_info Now that default_backing_dev_info is not used for writeback purposes we can get rid of
it easily: - instead of using its name for tracing unregistered bdi we just use "unknown" - btrfs and ceph can just assign the default read ahead window themselves like several other filesystems already do. - we can assign noop_backing_dev_info as the default one in alloc_super. All filesystems already either assigned their own or noop_backing_dev_info. Signed-off-by: Christoph Hellwig Reviewed-by: Tejun Heo Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- fs/btrfs/disk-io.c | 2 +- fs/ceph/super.c | 2 +- fs/super.c | 8 ++------ include/linux/backing-dev.h | 1 - include/trace/events/writeback.h | 6 ++---- mm/backing-dev.c | 9 --------- 6 files changed, 6 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1ec872e3a926..1afb18226da8 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1719,7 +1719,7 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) if (err) return err; - bdi->ra_pages = default_backing_dev_info.ra_pages; + bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE; bdi->congested_fn = btrfs_congested_fn; bdi->congested_data = info; return 0; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index e350cc1611e4..5ae62587a71d 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -899,7 +899,7 @@ static int ceph_register_bdi(struct super_block *sb, >> PAGE_SHIFT; else fsc->backing_dev_info.ra_pages = - default_backing_dev_info.ra_pages; + VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE; err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld", atomic_long_inc_return(&bdi_seq)); diff --git a/fs/super.c b/fs/super.c index eae088f6aaae..3b4dadafdd60 100644 --- a/fs/super.c +++ b/fs/super.c @@ -185,8 +185,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) } init_waitqueue_head(&s->s_writers.wait); init_waitqueue_head(&s->s_writers.wait_unfrozen); + s->s_bdi = &noop_backing_dev_info; s->s_flags = flags; - s->s_bdi = &default_backing_dev_info; INIT_HLIST_NODE(&s->s_instances); INIT_HLIST_BL_HEAD(&s->s_anon); INIT_LIST_HEAD(&s->s_inodes); @@ -863,10 +863,7 @@ EXPORT_SYMBOL(free_anon_bdev); int set_anon_super(struct super_block *s, void *data) { - int error = get_anon_bdev(&s->s_dev); - if (!error) - s->s_bdi = &noop_backing_dev_info; - return error; + return get_anon_bdev(&s->s_dev); } EXPORT_SYMBOL(set_anon_super); @@ -1111,7 +1108,6 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data) sb = root->d_sb; BUG_ON(!sb); WARN_ON(!sb->s_bdi); - WARN_ON(sb->s_bdi == &default_backing_dev_info); sb->s_flags |= MS_BORN; error = security_sb_kern_mount(sb, flags, secdata); diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index ed59dee03a71..d94077fea1f8 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -241,7 +241,6 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); #define BDI_CAP_NO_ACCT_AND_WRITEBACK \ (BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB) -extern struct backing_dev_info default_backing_dev_info; extern struct backing_dev_info noop_backing_dev_info; int writeback_in_progress(struct backing_dev_info *bdi); diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 74f5207bd090..0e9310905413 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -156,10 +156,8 @@ DECLARE_EVENT_CLASS(writeback_work_class, __field(int, reason) ), TP_fast_assign( - struct device
*dev = bdi->dev; - if (!dev) - dev = default_backing_dev_info.dev; - strncpy(__entry->name, dev_name(dev), 32); + strncpy(__entry->name, + bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32); __entry->nr_pages = work->nr_pages; __entry->sb_dev = work->sb ? work->sb->s_dev : 0; __entry->sync_mode = work->sync_mode; diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 1725adb242e0..7690ec77c722 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -14,12 +14,6 @@ static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); -struct backing_dev_info default_backing_dev_info = { - .name = "default", - .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, -}; -EXPORT_SYMBOL_GPL(default_backing_dev_info); - struct backing_dev_info noop_backing_dev_info = { .name = "noop", .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, @@ -250,9 +244,6 @@ static int __init default_bdi_init(void) if (!bdi_wq) return -ENOMEM; - err = bdi_init(&default_backing_dev_info); - if (!err) - bdi_register(&default_backing_dev_info, NULL, "default"); err = bdi_init(&noop_backing_dev_info); return err; -- cgit v1.2.3 From 85cdf36e11557dc367c1361e4b7bb2c4619cae91 Mon Sep 17 00:00:00 2001 From: Rickard Strandqvist Date: Thu, 1 Jan 2015 18:04:52 +0100 Subject: power: ab8500_fg.c: Remove unused function Remove the function ab8500_fg_reinit() that is not used anywhere. This was partially found by using a static code analysis program called cppcheck. Signed-off-by: Rickard Strandqvist Acked-by: Arnd Bergmann Signed-off-by: Sebastian Reichel --- drivers/power/ab8500_fg.c | 14 -------------- include/linux/mfd/abx500/ab8500-bm.h | 1 - 2 files changed, 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/power/ab8500_fg.c b/drivers/power/ab8500_fg.c index 69b80bcaa9e7..c908658aa31a 100644 --- a/drivers/power/ab8500_fg.c +++ b/drivers/power/ab8500_fg.c @@ -2435,20 +2435,6 @@ static void ab8500_fg_reinit_work(struct work_struct *work) } } -/** - * ab8500_fg_reinit() - forces FG algorithm to reinitialize with current values - * - * This function can be used to force the FG algorithm to recalculate a new - * voltage based battery capacity. - */ -void ab8500_fg_reinit(void) -{ - struct ab8500_fg *di = ab8500_fg_get(); - /* User won't be notified if a null pointer returned. */ - if (di != NULL) - queue_delayed_work(di->fg_wq, &di->fg_reinit_work, 0); -} - /* Exposure to the sysfs interface */ struct ab8500_fg_sysfs_entry { diff --git a/include/linux/mfd/abx500/ab8500-bm.h b/include/linux/mfd/abx500/ab8500-bm.h index cc892a8d8d6e..12a5b396921e 100644 --- a/include/linux/mfd/abx500/ab8500-bm.h +++ b/include/linux/mfd/abx500/ab8500-bm.h @@ -461,7 +461,6 @@ struct ab8500_fg; #ifdef CONFIG_AB8500_BM extern struct abx500_bm_data ab8500_bm_data; -void ab8500_fg_reinit(void); void ab8500_charger_usb_state_changed(u8 bm_usb_state, u16 mA); struct ab8500_btemp *ab8500_btemp_get(void); int ab8500_btemp_get_batctrl_temp(struct ab8500_btemp *btemp); -- cgit v1.2.3 From d93ba7a5a97c9f315bacdcdb8de4e5f368e7b396 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Tue, 20 Jan 2015 20:06:30 -0500 Subject: block: Add discard flag to blkdev_issue_zeroout() function blkdev_issue_zeroout() will zero a given block range. This is done by way of explicit writing, thus provisioning or allocating the blocks on disk. There are use cases where the desired behavior is to zero the blocks but unprovision them if possible. The blocks must deterministically contain zeroes when they are subsequently read back.
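For illustration, a minimal sketch of how a caller might use the discard-aware variant this patch introduces (the flag itself is described in the next paragraph); example_zero_range() is a hypothetical wrapper, while the blkdev_issue_zeroout() signature with the trailing bool is taken from the hunks below:

#include <linux/blkdev.h>
#include <linux/gfp.h>

/*
 * Hypothetical caller: zero a block range, letting the kernel use
 * REQ_DISCARD when the device guarantees discard_zeroes_data and
 * falling back to WRITE SAME / regular WRITE otherwise.
 */
static int example_zero_range(struct block_device *bdev, sector_t sector,
			      sector_t nr_sects)
{
	return blkdev_issue_zeroout(bdev, sector, nr_sects, GFP_KERNEL, true);
}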
This patch adds a flag to blkdev_issue_zeroout() that provides this variant. If the discard flag is set and a block device guarantees discard_zeroes_data we will use REQ_DISCARD to clear the block range. If the device does not support discard_zeroes_data or if the discard request fails we will fall back to first REQ_WRITE_SAME and then a regular REQ_WRITE. Also update the callers of blkdev_issue_zeroout() to reflect the new flag and make sb_issue_zeroout() prefer the discard approach. Signed-off-by: Martin K. Petersen Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-lib.c | 30 ++++++++++++++++++++++++++---- block/ioctl.c | 2 +- drivers/block/drbd/drbd_receiver.c | 2 +- include/linux/blkdev.h | 4 ++-- 4 files changed, 30 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/block/blk-lib.c b/block/blk-lib.c index 8411be3c19d3..715e948f58a4 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -283,23 +283,45 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, * @sector: start sector * @nr_sects: number of sectors to write * @gfp_mask: memory allocation flags (for bio_alloc) + * @discard: whether to discard the block range * * Description: - * Generate and issue number of bios with zerofiled pages. + + * Zero-fill a block range. If the discard flag is set and the block + * device guarantees that subsequent READ operations to the block range + * in question will return zeroes, the blocks will be discarded. Should + * the discard request fail, if the discard flag is not set, or if + * discard_zeroes_data is not supported, this function will resort to + * zeroing the blocks manually, thus provisioning (allocating, + * anchoring) them. If the block device supports the WRITE SAME command + * blkdev_issue_zeroout() will use it to optimize the process of + * clearing the block range. Otherwise the zeroing will be performed + * using regular WRITE calls. */ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask) + sector_t nr_sects, gfp_t gfp_mask, bool discard) { + struct request_queue *q = bdev_get_queue(bdev); + unsigned char bdn[BDEVNAME_SIZE]; + + if (discard && blk_queue_discard(q) && q->limits.discard_zeroes_data) { + + if (!blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, 0)) + return 0; + + bdevname(bdev, bdn); + pr_warn("%s: DISCARD failed. Manually zeroing.\n", bdn); + } + if (bdev_write_same(bdev)) { - unsigned char bdn[BDEVNAME_SIZE]; if (!blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, ZERO_PAGE(0))) return 0; bdevname(bdev, bdn); - pr_err("%s: WRITE SAME failed. Manually zeroing.\n", bdn); + pr_warn("%s: WRITE SAME failed.
Manually zeroing.\n", bdn); } return __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask); diff --git a/block/ioctl.c b/block/ioctl.c index 6c7bf903742f..7d8befde2aca 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -198,7 +198,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, uint64_t start, if (start + len > (i_size_read(bdev->bd_inode) >> 9)) return -EINVAL; - return blkdev_issue_zeroout(bdev, start, len, GFP_KERNEL); + return blkdev_issue_zeroout(bdev, start, len, GFP_KERNEL, false); } static int put_ushort(unsigned long arg, unsigned short val) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index d169b4a79267..cee20354ac37 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1388,7 +1388,7 @@ int drbd_submit_peer_request(struct drbd_device *device, list_add_tail(&peer_req->w.list, &device->active_ee); spin_unlock_irq(&device->resource->req_lock); if (blkdev_issue_zeroout(device->ldev->backing_bdev, - sector, data_size >> 9, GFP_NOIO)) + sector, data_size >> 9, GFP_NOIO, false)) peer_req->flags |= EE_WAS_ERROR; drbd_endio_write_sec_final(peer_req); return 0; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e9086be6d9a0..4c4b732d7556 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1162,7 +1162,7 @@ extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector, extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct page *page); extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask); + sector_t nr_sects, gfp_t gfp_mask, bool discard); static inline int sb_issue_discard(struct super_block *sb, sector_t block, sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags) { @@ -1176,7 +1176,7 @@ static inline int sb_issue_zeroout(struct super_block *sb, sector_t block, return blkdev_issue_zeroout(sb->s_bdev, block << (sb->s_blocksize_bits - 9), nr_blocks << (sb->s_blocksize_bits - 9), - gfp_mask); + gfp_mask, true); } extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm); -- cgit v1.2.3 From 9c45101e88b2bf2ce36b8833fcfa784a9149aa74 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 19 Nov 2014 09:21:58 +0100 Subject: quota: Cleanup flags definitions Currently all quota flags were defined just in kernel-private headers. Export flags readable / writeable from userspace to userspace via include/uapi/linux/quota.h. 
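To illustrate what the export enables, a minimal userspace sketch; the flag values are mirrored locally from the new uapi header added by this patch so the example stays self-contained, and in real use dqi_flags would be filled in by quotactl(QCMD(Q_GETINFO, ...)):

#include <stdio.h>

/* Values mirrored from the new include/uapi/linux/quota.h in this patch. */
#define DQF_ROOT_SQUASH_B	0
#define DQF_SYS_FILE_B		16
#define DQF_ROOT_SQUASH		(1 << DQF_ROOT_SQUASH_B)
#define DQF_SYS_FILE		(1 << DQF_SYS_FILE_B)

int main(void)
{
	/* Placeholder value; normally read from struct if_dqinfo.dqi_flags. */
	unsigned int dqi_flags = DQF_ROOT_SQUASH;

	printf("root squash: %s\n", (dqi_flags & DQF_ROOT_SQUASH) ? "on" : "off");
	printf("system file: %s\n", (dqi_flags & DQF_SYS_FILE) ? "yes" : "no");
	return 0;
}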
Signed-off-by: Jan Kara --- fs/quota/dquot.c | 2 +- include/linux/dqblk_v1.h | 3 --- include/linux/quota.h | 14 ++++++++------ include/uapi/linux/quota.h | 14 +++++++++++++- 4 files changed, 22 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 8f0acef3d184..f8be368b9086 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1248,7 +1248,7 @@ static int ignore_hardlimit(struct dquot *dquot) return capable(CAP_SYS_RESOURCE) && (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD || - !(info->dqi_flags & V1_DQF_RSQUASH)); + !(info->dqi_flags & DQF_ROOT_SQUASH)); } /* needs dq_data_lock */ diff --git a/include/linux/dqblk_v1.h b/include/linux/dqblk_v1.h index 3713a7232dd8..c0d4d1e2a45c 100644 --- a/include/linux/dqblk_v1.h +++ b/include/linux/dqblk_v1.h @@ -5,9 +5,6 @@ #ifndef _LINUX_DQBLK_V1_H #define _LINUX_DQBLK_V1_H -/* Root squash turned on */ -#define V1_DQF_RSQUASH 1 - /* Numbers of blocks needed for updates */ #define V1_INIT_ALLOC 1 #define V1_INIT_REWRITE 1 diff --git a/include/linux/quota.h b/include/linux/quota.h index 50978b781a19..0c42113607ce 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -223,12 +223,14 @@ struct mem_dqinfo { struct super_block; -#define DQF_MASK 0xffff /* Mask for format specific flags */ -#define DQF_GETINFO_MASK 0x1ffff /* Mask for flags passed to userspace */ -#define DQF_SETINFO_MASK 0xffff /* Mask for flags modifiable from userspace */ -#define DQF_SYS_FILE_B 16 -#define DQF_SYS_FILE (1 << DQF_SYS_FILE_B) /* Quota file stored as system file */ -#define DQF_INFO_DIRTY_B 31 +/* Mask for flags passed to userspace */ +#define DQF_GETINFO_MASK (DQF_ROOT_SQUASH | DQF_SYS_FILE) +/* Mask for flags modifiable from userspace */ +#define DQF_SETINFO_MASK DQF_ROOT_SQUASH + +enum { + DQF_INFO_DIRTY_B = DQF_PRIVATE, +}; #define DQF_INFO_DIRTY (1 << DQF_INFO_DIRTY_B) /* Is info dirty? */ extern void mark_info_dirty(struct super_block *sb, int type); diff --git a/include/uapi/linux/quota.h b/include/uapi/linux/quota.h index 3b6cfbeb086d..1f49b8341c99 100644 --- a/include/uapi/linux/quota.h +++ b/include/uapi/linux/quota.h @@ -126,10 +126,22 @@ struct if_dqblk { #define IIF_FLAGS 4 #define IIF_ALL (IIF_BGRACE | IIF_IGRACE | IIF_FLAGS) +enum { + DQF_ROOT_SQUASH_B = 0, + DQF_SYS_FILE_B = 16, + /* Kernel internal flags invisible to userspace */ + DQF_PRIVATE +}; + +/* Root squash enabled (for v1 quota format) */ +#define DQF_ROOT_SQUASH (1 << DQF_ROOT_SQUASH_B) +/* Quota stored in a system file */ +#define DQF_SYS_FILE (1 << DQF_SYS_FILE_B) + struct if_dqinfo { __u64 dqi_bgrace; __u64 dqi_igrace; - __u32 dqi_flags; + __u32 dqi_flags; /* DFQ_* */ __u32 dqi_valid; }; -- cgit v1.2.3 From c1155c64e603378dccfc21ee0612cf60dd11725b Mon Sep 17 00:00:00 2001 From: Jonghwa Lee Date: Fri, 19 Dec 2014 17:55:13 +0900 Subject: power: charger-manager: Use alarmtimer for battery monitoring in suspend. To guarantee proper charging and managing of batteries even in suspend, charger-manager has used an rtc device via the rtc framework interface. However, it is better to use alarmtimer for cleaner and more appropriate operation. This patch makes the driver use alarmtimer for polling work in suspend and removes all deprecated code related to using the rtc interface.
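For illustration, a minimal sketch of the alarmtimer pattern the driver switches to; example_alarm, example_alarm_cb() and example_arm() are hypothetical names, while alarm_init()/alarm_start() with ALARM_BOOTTIME mirror the calls added in the hunks below:

#include <linux/alarmtimer.h>
#include <linux/ktime.h>
#include <linux/timekeeping.h>

static struct alarm example_alarm;

/* Fires after the programmed interval; one-shot, like cm_timer_func() below. */
static enum alarmtimer_restart example_alarm_cb(struct alarm *alarm, ktime_t now)
{
	return ALARMTIMER_NORESTART;
}

/* Arm a wakeup-capable timer measured against boot time (runs in suspend). */
static void example_arm(unsigned int wakeup_ms)
{
	alarm_init(&example_alarm, ALARM_BOOTTIME, example_alarm_cb);
	alarm_start(&example_alarm,
		    ktime_add(ktime_get_boottime(), ms_to_ktime(wakeup_ms)));
}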
Signed-off-by: Jonghwa Lee Signed-off-by: Sebastian Reichel --- drivers/power/Kconfig | 2 +- drivers/power/charger-manager.c | 289 ++++++++++------------------------ include/linux/power/charger-manager.h | 32 +--- 3 files changed, 84 insertions(+), 239 deletions(-) (limited to 'include/linux') diff --git a/drivers/power/Kconfig b/drivers/power/Kconfig index e2569a538501..a79f16afb588 100644 --- a/drivers/power/Kconfig +++ b/drivers/power/Kconfig @@ -315,7 +315,7 @@ config CHARGER_GPIO config CHARGER_MANAGER bool "Battery charger manager for multiple chargers" - depends on REGULATOR && RTC_CLASS + depends on REGULATOR select EXTCON help Say Y to enable charger-manager support, which allows multiple diff --git a/drivers/power/charger-manager.c b/drivers/power/charger-manager.c index 649052e1f2d9..14b0d85318eb 100644 --- a/drivers/power/charger-manager.c +++ b/drivers/power/charger-manager.c @@ -69,16 +69,10 @@ static LIST_HEAD(cm_list); static DEFINE_MUTEX(cm_list_mtx); /* About in-suspend (suspend-again) monitoring */ -static struct rtc_device *rtc_dev; -/* - * Backup RTC alarm - * Save the wakeup alarm before entering suspend-to-RAM - */ -static struct rtc_wkalrm rtc_wkalarm_save; -/* Backup RTC alarm time in terms of seconds since 01-01-1970 00:00:00 */ -static unsigned long rtc_wkalarm_save_time; +static struct alarm *cm_timer; + static bool cm_suspended; -static bool cm_rtc_set; +static bool cm_timer_set; static unsigned long cm_suspend_duration_ms; /* About normal (not suspended) monitoring */ @@ -87,9 +81,6 @@ static unsigned long next_polling; /* Next appointed polling time */ static struct workqueue_struct *cm_wq; /* init at driver add */ static struct delayed_work cm_monitor_work; /* init at driver add */ -/* Global charger-manager description */ -static struct charger_global_desc *g_desc; /* init with setup_charger_manager */ - /** * is_batt_present - See if the battery presents in place. * @cm: the Charger Manager representing the battery. @@ -1047,10 +1038,13 @@ static bool cm_setup_timer(void) { struct charger_manager *cm; unsigned int wakeup_ms = UINT_MAX; - bool ret = false; + int timer_req = 0; - mutex_lock(&cm_list_mtx); + if (time_after(next_polling, jiffies)) + CM_MIN_VALID(wakeup_ms, + jiffies_to_msecs(next_polling - jiffies)); + mutex_lock(&cm_list_mtx); list_for_each_entry(cm, &cm_list, entry) { unsigned int fbchk_ms = 0; @@ -1070,162 +1064,38 @@ static bool cm_setup_timer(void) /* Skip if polling is not required for this CM */ if (!is_polling_required(cm) && !cm->emergency_stop) continue; + timer_req++; if (cm->desc->polling_interval_ms == 0) continue; CM_MIN_VALID(wakeup_ms, cm->desc->polling_interval_ms); } - mutex_unlock(&cm_list_mtx); - if (wakeup_ms < UINT_MAX && wakeup_ms > 0) { - pr_info("Charger Manager wakeup timer: %u ms\n", wakeup_ms); - if (rtc_dev) { - struct rtc_wkalrm tmp; - unsigned long time, now; - unsigned long add = DIV_ROUND_UP(wakeup_ms, 1000); - - /* - * Set alarm with the polling interval (wakeup_ms) - * except when rtc_wkalarm_save comes first. - * However, the alarm time should be NOW + - * CM_RTC_SMALL or later. - */ - tmp.enabled = 1; - rtc_read_time(rtc_dev, &tmp.time); - rtc_tm_to_time(&tmp.time, &now); - if (add < CM_RTC_SMALL) - add = CM_RTC_SMALL; - time = now + add; + if (timer_req && cm_timer) { + ktime_t now, add; - ret = true; + /* + * Set alarm with the polling interval (wakeup_ms) + * The alarm time should be NOW + CM_RTC_SMALL or later. 
+ */ + if (wakeup_ms == UINT_MAX || + wakeup_ms < CM_RTC_SMALL * MSEC_PER_SEC) + wakeup_ms = 2 * CM_RTC_SMALL * MSEC_PER_SEC; - if (rtc_wkalarm_save.enabled && - rtc_wkalarm_save_time && - rtc_wkalarm_save_time < time) { - if (rtc_wkalarm_save_time < now + CM_RTC_SMALL) - time = now + CM_RTC_SMALL; - else - time = rtc_wkalarm_save_time; + pr_info("Charger Manager wakeup timer: %u ms\n", wakeup_ms); - /* The timer is not appointed by CM */ - ret = false; - } + now = ktime_get_boottime(); + add = ktime_set(wakeup_ms / MSEC_PER_SEC, + (wakeup_ms % MSEC_PER_SEC) * NSEC_PER_MSEC); + alarm_start(cm_timer, ktime_add(now, add)); - pr_info("Waking up after %lu secs\n", time - now); + cm_suspend_duration_ms = wakeup_ms; - rtc_time_to_tm(time, &tmp.time); - rtc_set_alarm(rtc_dev, &tmp); - cm_suspend_duration_ms += wakeup_ms; - return ret; - } + return true; } - - if (rtc_dev) - rtc_set_alarm(rtc_dev, &rtc_wkalarm_save); return false; } -static void _cm_fbchk_in_suspend(struct charger_manager *cm) -{ - unsigned long jiffy_now = jiffies; - - if (!cm->fullbatt_vchk_jiffies_at) - return; - - if (g_desc && g_desc->assume_timer_stops_in_suspend) - jiffy_now += msecs_to_jiffies(cm_suspend_duration_ms); - - /* Execute now if it's going to be executed not too long after */ - jiffy_now += CM_JIFFIES_SMALL; - - if (time_after_eq(jiffy_now, cm->fullbatt_vchk_jiffies_at)) - fullbatt_vchk(&cm->fullbatt_vchk_work.work); -} - -/** - * cm_suspend_again - Determine whether suspend again or not - * - * Returns true if the system should be suspended again - * Returns false if the system should be woken up - */ -bool cm_suspend_again(void) -{ - struct charger_manager *cm; - bool ret = false; - - if (!g_desc || !g_desc->rtc_only_wakeup || !g_desc->rtc_only_wakeup() || - !cm_rtc_set) - return false; - - if (cm_monitor()) - goto out; - - ret = true; - mutex_lock(&cm_list_mtx); - list_for_each_entry(cm, &cm_list, entry) { - _cm_fbchk_in_suspend(cm); - - if (cm->status_save_ext_pwr_inserted != is_ext_pwr_online(cm) || - cm->status_save_batt != is_batt_present(cm)) { - ret = false; - break; - } - } - mutex_unlock(&cm_list_mtx); - - cm_rtc_set = cm_setup_timer(); -out: - /* It's about the time when the non-CM appointed timer goes off */ - if (rtc_wkalarm_save.enabled) { - unsigned long now; - struct rtc_time tmp; - - rtc_read_time(rtc_dev, &tmp); - rtc_tm_to_time(&tmp, &now); - - if (rtc_wkalarm_save_time && - now + CM_RTC_SMALL >= rtc_wkalarm_save_time) - return false; - } - return ret; -} -EXPORT_SYMBOL_GPL(cm_suspend_again); - -/** - * setup_charger_manager - initialize charger_global_desc data - * @gd: pointer to instance of charger_global_desc - */ -int setup_charger_manager(struct charger_global_desc *gd) -{ - if (!gd) - return -EINVAL; - - if (rtc_dev) - rtc_class_close(rtc_dev); - rtc_dev = NULL; - g_desc = NULL; - - if (!gd->rtc_only_wakeup) { - pr_err("The callback rtc_only_wakeup is not given\n"); - return -EINVAL; - } - - if (gd->rtc_name) { - rtc_dev = rtc_class_open(gd->rtc_name); - if (IS_ERR_OR_NULL(rtc_dev)) { - rtc_dev = NULL; - /* Retry at probe. RTC may be not registered yet */ - } - } else { - pr_warn("No wakeup timer is given for charger manager. 
" - "In-suspend monitoring won't work.\n"); - } - - g_desc = gd; - return 0; -} -EXPORT_SYMBOL_GPL(setup_charger_manager); - /** * charger_extcon_work - enable/diable charger according to the state * of charger cable @@ -1719,6 +1589,12 @@ static inline struct charger_desc *cm_get_drv_data(struct platform_device *pdev) return dev_get_platdata(&pdev->dev); } +static enum alarmtimer_restart cm_timer_func(struct alarm *alarm, ktime_t now) +{ + cm_timer_set = false; + return ALARMTIMER_NORESTART; +} + static int charger_manager_probe(struct platform_device *pdev) { struct charger_desc *desc = cm_get_drv_data(pdev); @@ -1728,16 +1604,6 @@ static int charger_manager_probe(struct platform_device *pdev) union power_supply_propval val; struct power_supply *fuel_gauge; - if (g_desc && !rtc_dev && g_desc->rtc_name) { - rtc_dev = rtc_class_open(g_desc->rtc_name); - if (IS_ERR_OR_NULL(rtc_dev)) { - rtc_dev = NULL; - dev_err(&pdev->dev, "Cannot get RTC %s\n", - g_desc->rtc_name); - return -ENODEV; - } - } - if (IS_ERR(desc)) { dev_err(&pdev->dev, "No platform data (desc) found\n"); return -ENODEV; @@ -1752,6 +1618,12 @@ static int charger_manager_probe(struct platform_device *pdev) cm->dev = &pdev->dev; cm->desc = desc; + /* Initialize alarm timer */ + if (alarmtimer_get_rtcdev()) { + cm_timer = devm_kzalloc(cm->dev, sizeof(*cm_timer), GFP_KERNEL); + alarm_init(cm_timer, ALARM_BOOTTIME, cm_timer_func); + } + /* * The following two do not need to be errors. * Users may intentionally ignore those two features. @@ -1993,38 +1865,41 @@ static int cm_suspend_noirq(struct device *dev) return ret; } +static bool cm_need_to_awake(void) +{ + struct charger_manager *cm; + + if (cm_timer) + return false; + + mutex_lock(&cm_list_mtx); + list_for_each_entry(cm, &cm_list, entry) { + if (is_charging(cm)) { + mutex_unlock(&cm_list_mtx); + return true; + } + } + mutex_unlock(&cm_list_mtx); + + return false; +} + static int cm_suspend_prepare(struct device *dev) { struct charger_manager *cm = dev_get_drvdata(dev); - if (!cm_suspended) { - if (rtc_dev) { - struct rtc_time tmp; - unsigned long now; - - rtc_read_alarm(rtc_dev, &rtc_wkalarm_save); - rtc_read_time(rtc_dev, &tmp); + if (cm_need_to_awake()) + return -EBUSY; - if (rtc_wkalarm_save.enabled) { - rtc_tm_to_time(&rtc_wkalarm_save.time, - &rtc_wkalarm_save_time); - rtc_tm_to_time(&tmp, &now); - if (now > rtc_wkalarm_save_time) - rtc_wkalarm_save_time = 0; - } else { - rtc_wkalarm_save_time = 0; - } - } + if (!cm_suspended) cm_suspended = true; - } - cancel_delayed_work(&cm->fullbatt_vchk_work); - cm->status_save_ext_pwr_inserted = is_ext_pwr_online(cm); - cm->status_save_batt = is_batt_present(cm); + cm_timer_set = cm_setup_timer(); - if (!cm_rtc_set) { - cm_suspend_duration_ms = 0; - cm_rtc_set = cm_setup_timer(); + if (cm_timer_set) { + cancel_work_sync(&setup_polling); + cancel_delayed_work_sync(&cm_monitor_work); + cancel_delayed_work(&cm->fullbatt_vchk_work); } return 0; @@ -2034,18 +1909,21 @@ static void cm_suspend_complete(struct device *dev) { struct charger_manager *cm = dev_get_drvdata(dev); - if (cm_suspended) { - if (rtc_dev) { - struct rtc_wkalrm tmp; - - rtc_read_alarm(rtc_dev, &tmp); - rtc_wkalarm_save.pending = tmp.pending; - rtc_set_alarm(rtc_dev, &rtc_wkalarm_save); - } + if (cm_suspended) cm_suspended = false; - cm_rtc_set = false; + + if (cm_timer_set) { + ktime_t remain; + + alarm_cancel(cm_timer); + cm_timer_set = false; + remain = alarm_expires_remaining(cm_timer); + cm_suspend_duration_ms -= ktime_to_ms(remain); + 
schedule_work(&setup_polling); } + _cm_monitor(cm); + /* Re-enqueue delayed work (fullbatt_vchk_work) */ if (cm->fullbatt_vchk_jiffies_at) { unsigned long delay = 0; @@ -2060,21 +1938,18 @@ static void cm_suspend_complete(struct device *dev) } /* - * Account for cm_suspend_duration_ms if - * assume_timer_stops_in_suspend is active + * Account for cm_suspend_duration_ms with assuming that + * timer stops in suspend. */ - if (g_desc && g_desc->assume_timer_stops_in_suspend) { - if (delay > cm_suspend_duration_ms) - delay -= cm_suspend_duration_ms; - else - delay = 0; - } + if (delay > cm_suspend_duration_ms) + delay -= cm_suspend_duration_ms; + else + delay = 0; queue_delayed_work(cm_wq, &cm->fullbatt_vchk_work, msecs_to_jiffies(delay)); } device_set_wakeup_capable(cm->dev, false); - uevent_notify(cm, NULL); } static const struct dev_pm_ops charger_manager_pm = { diff --git a/include/linux/power/charger-manager.h b/include/linux/power/charger-manager.h index e97fc656a058..416ebeb6ee1e 100644 --- a/include/linux/power/charger-manager.h +++ b/include/linux/power/charger-manager.h @@ -17,6 +17,7 @@ #include #include +#include enum data_source { CM_BATTERY_PRESENT, @@ -44,29 +45,6 @@ enum cm_event_types { CM_EVENT_OTHERS, }; -/** - * struct charger_global_desc - * @rtc_name: the name of RTC used to wake up the system from suspend. - * @rtc_only_wakeup: - * If the system is woken up by waekup-sources other than the RTC or - * callbacks, Charger Manager should recognize with - * rtc_only_wakeup() returning false. - * If the RTC given to CM is the only wakeup reason, - * rtc_only_wakeup should return true. - * @assume_timer_stops_in_suspend: - * Assume that the jiffy timer stops in suspend-to-RAM. - * When enabled, CM does not rely on jiffies value in - * suspend_again and assumes that jiffies value does not - * change during suspend. - */ -struct charger_global_desc { - char *rtc_name; - - bool (*rtc_only_wakeup)(void); - - bool assume_timer_stops_in_suspend; -}; - /** * struct charger_cable * @extcon_name: the name of extcon device. @@ -266,22 +244,14 @@ struct charger_manager { char psy_name_buf[PSY_NAME_MAX + 1]; struct power_supply charger_psy; - bool status_save_ext_pwr_inserted; - bool status_save_batt; - u64 charging_start_time; u64 charging_end_time; }; #ifdef CONFIG_CHARGER_MANAGER -extern int setup_charger_manager(struct charger_global_desc *gd); -extern bool cm_suspend_again(void); extern void cm_notify_event(struct power_supply *psy, enum cm_event_types type, char *msg); #else -static inline int setup_charger_manager(struct charger_global_desc *gd) -{ return 0; } -static inline bool cm_suspend_again(void) { return false; } static inline void cm_notify_event(struct power_supply *psy, enum cm_event_types type, char *msg) { } #endif -- cgit v1.2.3 From 8116bf4cb62d337c953cfa5369ef4cf83e73140c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 21 Jan 2015 20:44:01 -0500 Subject: locks: update comments that refer to inode->i_flock Signed-off-by: Jeff Layton --- fs/locks.c | 2 +- include/linux/fs.h | 19 ++++++++++--------- 2 files changed, 11 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/fs/locks.c b/fs/locks.c index 2fc36b3772a0..4d0d41163a50 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2212,7 +2212,7 @@ again: */ /* * we need that spin_lock here - it prevents reordering between - * update of inode->i_flock and check for it done in close(). + * update of i_flctx->flc_posix and check for it done in close(). * rcu_read_lock() wouldn't do. 
*/ spin_lock(&current->files->file_lock);
diff --git a/include/linux/fs.h b/include/linux/fs.h index f87cb2f03103..ddd2fa7cefd3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -925,12 +925,11 @@ int locks_in_grace(struct net *); * FIXME: should we create a separate "struct lock_request" to help distinguish * these two uses? * - * The i_flock list is ordered by: + * The various i_flctx lists are ordered by: * - * 1) lock type -- FL_LEASEs first, then FL_FLOCK, and finally FL_POSIX - * 2) lock owner - * 3) lock range start - * 4) lock range end + * 1) lock owner + * 2) lock range start + * 3) lock range end * * Obviously, the last two criteria only matter for POSIX locks. */ @@ -1992,8 +1991,9 @@ static inline int break_lease(struct inode *inode, unsigned int mode) { /* * Since this check is lockless, we must ensure that any refcounts - * taken are done before checking inode->i_flock. Otherwise, we could - * end up racing with tasks trying to set a new lease on this file. + * taken are done before checking i_flctx->flc_lease. Otherwise, we + * could end up racing with tasks trying to set a new lease on this + * file. */ smp_mb(); if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) @@ -2005,8 +2005,9 @@ static inline int break_deleg(struct inode *inode, unsigned int mode) { /* * Since this check is lockless, we must ensure that any refcounts - * taken are done before checking inode->i_flock. Otherwise, we could - * end up racing with tasks trying to set a new lease on this file. + * taken are done before checking i_flctx->flc_lease. Otherwise, we + * could end up racing with tasks trying to set a new lease on this + * file. */ smp_mb(); if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) -- cgit v1.2.3
From 30b8b0066cafef274fc92462578ee346211ce7cb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Jan 2015 21:22:39 +0000 Subject: init: Get rid of x86isms The UP local APIC support can be set up from an early initcall. No need for horrible hackery in the init code.
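For clarity, the UP boot path that results from the hunks below, sketched under the assumption that init/main.c of this era invokes smp_init() from kernel_init_freeable():

/*
 * start_kernel()
 *   ...
 *   kernel_init_freeable()
 *     smp_init()                   (!SMP inline stub in <linux/smp.h>)
 *       up_late_init()             (provided by the architecture)
 *         APIC_init_uniprocessor() (the x86 implementation)
 */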
Signed-off-by: Thomas Gleixner Cc: Jiang Liu Cc: Joerg Roedel Cc: Tony Luck Cc: Borislav Petkov Link: http://lkml.kernel.org/r/20150115211703.827943883@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig | 4 ++++ arch/x86/kernel/apic/apic.c | 7 +++++++ include/linux/smp.h | 7 +++++++ init/main.c | 13 ------------- 4 files changed, 18 insertions(+), 13 deletions(-) (limited to 'include/linux')
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ba397bde7948..ffcc3ca5862a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -855,6 +855,10 @@ config SCHED_MC source "kernel/Kconfig.preempt" +config UP_LATE_INIT + def_bool y + depends on X86_UP_APIC + config X86_UP_APIC bool "Local APIC support on uniprocessors" depends on X86_32 && !SMP && !X86_32_NON_STANDARD && !PCI_MSI
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index c681e9ba9e47..19f1bc714ee6 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2267,6 +2267,13 @@ int __init APIC_init_uniprocessor(void) return 0; } +#ifdef CONFIG_UP_LATE_INIT +void __init up_late_init(void) +{ + APIC_init_uniprocessor(); +} +#endif + /* * Power management */
diff --git a/include/linux/smp.h b/include/linux/smp.h index 93dff5fff524..be91db2a7017 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -151,6 +151,13 @@ smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, static inline void kick_all_cpus_sync(void) { } static inline void wake_up_all_idle_cpus(void) { } +#ifdef CONFIG_UP_LATE_INIT +extern void __init up_late_init(void); +static inline void smp_init(void) { up_late_init(); } +#else +static inline void smp_init(void) { } +#endif + #endif /* !SMP */ /*
diff --git a/init/main.c b/init/main.c index 61b993767db5..179ada15d08a 100644 --- a/init/main.c +++ b/init/main.c @@ -87,10 +87,6 @@ #include #include -#ifdef CONFIG_X86_LOCAL_APIC -#include -#endif - static int kernel_init(void *); extern void init_IRQ(void); @@ -351,15 +347,6 @@ __setup("rdinit=", rdinit_setup); #ifndef CONFIG_SMP static const unsigned int setup_max_cpus = NR_CPUS; -#ifdef CONFIG_X86_LOCAL_APIC -static void __init smp_init(void) -{ - APIC_init_uniprocessor(); -} -#else -#define smp_init() do { } while (0) -#endif - static inline void setup_nr_cpu_ids(void) { } static inline void smp_prepare_cpus(unsigned int maxcpus) { } #endif -- cgit v1.2.3
From 0b271258544b3f89f9a14fa3b53e20ec9ce75392 Mon Sep 17 00:00:00 2001 From: Beomho Seo Date: Tue, 9 Dec 2014 21:03:51 +0900 Subject: mfd: rt5033: Add Richtek RT5033 driver core. This patch adds a new core driver for the Richtek RT5033. The RT5033 is a multifunction device which includes a battery charger, fuel gauge, flash LED current source, LDO and synchronous buck converter. It is interfaced to the host controller over I2C.
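As a usage sketch (not part of this patch): a child platform driver spawned by the MFD core below can reach the shared state through its parent device, since mfd_add_devices() parents each cell to the I2C device whose drvdata is set in rt5033_i2c_probe(). rt5033_child_probe is an illustrative name:

#include <linux/errno.h>
#include <linux/mfd/rt5033.h>
#include <linux/platform_device.h>

static int rt5033_child_probe(struct platform_device *pdev)
{
	/* The parent is the rt5033 I2C device; its drvdata holds the
	 * shared rt5033_dev allocated by the core probe below. */
	struct rt5033_dev *rt5033 = dev_get_drvdata(pdev->dev.parent);

	if (!rt5033 || !rt5033->regmap)
		return -ENODEV;

	/* rt5033->regmap can now be used for register access. */
	return 0;
}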
Signed-off-by: Beomho Seo Acked-by: Chanwoo Choi Signed-off-by: Lee Jones --- drivers/mfd/Kconfig | 12 ++ drivers/mfd/Makefile | 1 + drivers/mfd/rt5033.c | 142 ++++++++++++++++++++ include/linux/mfd/rt5033-private.h | 260 +++++++++++++++++++++++++++++++++++++ include/linux/mfd/rt5033.h | 62 +++++++++ 5 files changed, 477 insertions(+) create mode 100644 drivers/mfd/rt5033.c create mode 100644 include/linux/mfd/rt5033-private.h create mode 100644 include/linux/mfd/rt5033.h (limited to 'include/linux')
diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 2e6b7311fabc..9cd2af660cc5 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -623,6 +623,18 @@ config MFD_RTSX_PCI types of memory cards, such as Memory Stick, Memory Stick Pro, Secure Digital and MultiMediaCard. +config MFD_RT5033 + tristate "Richtek RT5033 Power Management IC" + depends on I2C=y + select MFD_CORE + select REGMAP_I2C + help + This driver provides for the Richtek RT5033 Power Management IC, + which includes the I2C driver and the Core APIs. This driver provides + common support for accessing the device. The device supports multiple + sub-devices like charger, fuel gauge, flash LED, current source, + LDO and Buck. + config MFD_RTSX_USB tristate "Realtek USB card reader" depends on USB
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index 53467e211381..4059c247c59a 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -176,6 +176,7 @@ obj-$(CONFIG_MFD_IPAQ_MICRO) += ipaq-micro.o obj-$(CONFIG_MFD_MENF21BMC) += menf21bmc.o obj-$(CONFIG_MFD_HI6421_PMIC) += hi6421-pmic-core.o obj-$(CONFIG_MFD_DLN2) += dln2.o +obj-$(CONFIG_MFD_RT5033) += rt5033.o intel-soc-pmic-objs := intel_soc_pmic_core.o intel_soc_pmic_crc.o obj-$(CONFIG_INTEL_SOC_PMIC) += intel-soc-pmic.o
diff --git a/drivers/mfd/rt5033.c b/drivers/mfd/rt5033.c new file mode 100644 index 000000000000..db395a6c52bc --- /dev/null +++ b/drivers/mfd/rt5033.c @@ -0,0 +1,142 @@
+/*
+ * MFD core driver for the Richtek RT5033.
+ *
+ * RT5033 comprises multiple sub-devices: switching charger, fuel gauge,
+ * flash LED, current source, LDO and BUCK regulators.
+ *
+ * Copyright (C) 2014 Samsung Electronics, Co., Ltd.
+ * Author: Beomho Seo
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */ + +#include +#include +#include +#include +#include +#include +#include + +static const struct regmap_irq rt5033_irqs[] = { + { .mask = RT5033_PMIC_IRQ_BUCKOCP, }, + { .mask = RT5033_PMIC_IRQ_BUCKLV, }, + { .mask = RT5033_PMIC_IRQ_SAFELDOLV, }, + { .mask = RT5033_PMIC_IRQ_LDOLV, }, + { .mask = RT5033_PMIC_IRQ_OT, }, + { .mask = RT5033_PMIC_IRQ_VDDA_UV, }, +}; + +static const struct regmap_irq_chip rt5033_irq_chip = { + .name = "rt5033", + .status_base = RT5033_REG_PMIC_IRQ_STAT, + .mask_base = RT5033_REG_PMIC_IRQ_CTRL, + .mask_invert = true, + .num_regs = 1, + .irqs = rt5033_irqs, + .num_irqs = ARRAY_SIZE(rt5033_irqs), +}; + +static const struct mfd_cell rt5033_devs[] = { + { .name = "rt5033-regulator", }, + { + .name = "rt5033-charger", + .of_compatible = "richtek,rt5033-charger", + }, { + .name = "rt5033-battery", + .of_compatible = "richtek,rt5033-battery", + }, +}; + +static const struct regmap_config rt5033_regmap_config = { + .reg_bits = 8, + .val_bits = 8, + .max_register = RT5033_REG_END, +}; + +static int rt5033_i2c_probe(struct i2c_client *i2c, + const struct i2c_device_id *id) +{ + struct rt5033_dev *rt5033; + unsigned int dev_id; + int ret; + + rt5033 = devm_kzalloc(&i2c->dev, sizeof(*rt5033), GFP_KERNEL); + if (!rt5033) + return -ENOMEM; + + i2c_set_clientdata(i2c, rt5033); + rt5033->dev = &i2c->dev; + rt5033->irq = i2c->irq; + rt5033->wakeup = true; + + rt5033->regmap = devm_regmap_init_i2c(i2c, &rt5033_regmap_config); + if (IS_ERR(rt5033->regmap)) { + dev_err(&i2c->dev, "Failed to allocate register map.\n"); + return PTR_ERR(rt5033->regmap); + } + + ret = regmap_read(rt5033->regmap, RT5033_REG_DEVICE_ID, &dev_id); + if (ret) { + dev_err(&i2c->dev, "Device not found\n"); + return -ENODEV; + } + dev_info(&i2c->dev, "Device found Device ID: %04x\n", dev_id); + + ret = regmap_add_irq_chip(rt5033->regmap, rt5033->irq, + IRQF_TRIGGER_FALLING | IRQF_ONESHOT, + 0, &rt5033_irq_chip, &rt5033->irq_data); + if (ret) { + dev_err(&i2c->dev, "Failed to request IRQ %d: %d\n", + rt5033->irq, ret); + return ret; + } + + ret = mfd_add_devices(rt5033->dev, -1, rt5033_devs, + ARRAY_SIZE(rt5033_devs), NULL, 0, + regmap_irq_get_domain(rt5033->irq_data)); + if (ret < 0) { + dev_err(&i2c->dev, "Failed to add RT5033 child devices.\n"); + return ret; + } + + device_init_wakeup(rt5033->dev, rt5033->wakeup); + + return 0; +} + +static int rt5033_i2c_remove(struct i2c_client *i2c) +{ + mfd_remove_devices(&i2c->dev); + + return 0; +} + +static const struct i2c_device_id rt5033_i2c_id[] = { + { "rt5033", }, + { } +}; +MODULE_DEVICE_TABLE(i2c, rt5033_i2c_id); + +static const struct of_device_id rt5033_dt_match[] = { + { .compatible = "richtek,rt5033", }, + { } +}; + +static struct i2c_driver rt5033_driver = { + .driver = { + .name = "rt5033", + .of_match_table = of_match_ptr(rt5033_dt_match), + }, + .probe = rt5033_i2c_probe, + .remove = rt5033_i2c_remove, + .id_table = rt5033_i2c_id, +}; +module_i2c_driver(rt5033_driver); + +MODULE_ALIAS("i2c:rt5033"); +MODULE_DESCRIPTION("Richtek RT5033 multi-function core driver"); +MODULE_AUTHOR("Beomho Seo "); +MODULE_LICENSE("GPL"); diff --git a/include/linux/mfd/rt5033-private.h b/include/linux/mfd/rt5033-private.h new file mode 100644 index 000000000000..1b63fc2f42d1 --- /dev/null +++ b/include/linux/mfd/rt5033-private.h @@ -0,0 +1,260 @@ +/* + * MFD core driver for Richtek RT5033 + * + * Copyright (C) 2014 Samsung Electronics, Co., Ltd. 
+ * Author: Beomho Seo
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __RT5033_PRIVATE_H__
+#define __RT5033_PRIVATE_H__
+
+enum rt5033_reg {
+ RT5033_REG_CHG_STAT = 0x00,
+ RT5033_REG_CHG_CTRL1 = 0x01,
+ RT5033_REG_CHG_CTRL2 = 0x02,
+ RT5033_REG_DEVICE_ID = 0x03,
+ RT5033_REG_CHG_CTRL3 = 0x04,
+ RT5033_REG_CHG_CTRL4 = 0x05,
+ RT5033_REG_CHG_CTRL5 = 0x06,
+ RT5033_REG_RT_CTRL0 = 0x07,
+ RT5033_REG_CHG_RESET = 0x08,
+ /* Reserved 0x09~0x18 */
+ RT5033_REG_RT_CTRL1 = 0x19,
+ /* Reserved 0x1A~0x20 */
+ RT5033_REG_FLED_FUNCTION1 = 0x21,
+ RT5033_REG_FLED_FUNCTION2 = 0x22,
+ RT5033_REG_FLED_STROBE_CTRL1 = 0x23,
+ RT5033_REG_FLED_STROBE_CTRL2 = 0x24,
+ RT5033_REG_FLED_CTRL1 = 0x25,
+ RT5033_REG_FLED_CTRL2 = 0x26,
+ RT5033_REG_FLED_CTRL3 = 0x27,
+ RT5033_REG_FLED_CTRL4 = 0x28,
+ RT5033_REG_FLED_CTRL5 = 0x29,
+ /* Reserved 0x2A~0x40 */
+ RT5033_REG_CTRL = 0x41,
+ RT5033_REG_BUCK_CTRL = 0x42,
+ RT5033_REG_LDO_CTRL = 0x43,
+ /* Reserved 0x44~0x46 */
+ RT5033_REG_MANUAL_RESET_CTRL = 0x47,
+ /* Reserved 0x48~0x5F */
+ RT5033_REG_CHG_IRQ1 = 0x60,
+ RT5033_REG_CHG_IRQ2 = 0x61,
+ RT5033_REG_CHG_IRQ3 = 0x62,
+ RT5033_REG_CHG_IRQ1_CTRL = 0x63,
+ RT5033_REG_CHG_IRQ2_CTRL = 0x64,
+ RT5033_REG_CHG_IRQ3_CTRL = 0x65,
+ RT5033_REG_LED_IRQ_STAT = 0x66,
+ RT5033_REG_LED_IRQ_CTRL = 0x67,
+ RT5033_REG_PMIC_IRQ_STAT = 0x68,
+ RT5033_REG_PMIC_IRQ_CTRL = 0x69,
+ RT5033_REG_SHDN_CTRL = 0x6A,
+ RT5033_REG_OFF_EVENT = 0x6B,
+
+ RT5033_REG_END,
+};
+
+/* RT5033 Charger state register */
+#define RT5033_CHG_STAT_MASK 0x20
+#define RT5033_CHG_STAT_DISCHARGING 0x00
+#define RT5033_CHG_STAT_FULL 0x10
+#define RT5033_CHG_STAT_CHARGING 0x20
+#define RT5033_CHG_STAT_NOT_CHARGING 0x30
+#define RT5033_CHG_STAT_TYPE_MASK 0x60
+#define RT5033_CHG_STAT_TYPE_PRE 0x20
+#define RT5033_CHG_STAT_TYPE_FAST 0x60
+
+/* RT5033 CHGCTRL1 register */
+#define RT5033_CHGCTRL1_IAICR_MASK 0xe0
+#define RT5033_CHGCTRL1_MODE_MASK 0x01
+
+/* RT5033 CHGCTRL2 register */
+#define RT5033_CHGCTRL2_CV_MASK 0xfc
+
+/* RT5033 CHGCTRL3 register */
+#define RT5033_CHGCTRL3_CFO_EN_MASK 0x40
+#define RT5033_CHGCTRL3_TIMER_MASK 0x38
+#define RT5033_CHGCTRL3_TIMER_EN_MASK 0x01
+
+/* RT5033 CHGCTRL4 register */
+#define RT5033_CHGCTRL4_EOC_MASK 0x07
+#define RT5033_CHGCTRL4_IPREC_MASK 0x18
+
+/* RT5033 CHGCTRL5 register */
+#define RT5033_CHGCTRL5_VPREC_MASK 0x0f
+#define RT5033_CHGCTRL5_ICHG_MASK 0xf0
+#define RT5033_CHGCTRL5_ICHG_SHIFT 0x04
+#define RT5033_CHG_MAX_CURRENT 0x0d
+
+/* RT5033 RT CTRL1 register */
+#define RT5033_RT_CTRL1_UUG_MASK 0x02
+#define RT5033_RT_HZ_MASK 0x01
+
+/* RT5033 control register */
+#define RT5033_CTRL_FCCM_BUCK_MASK 0x00
+#define RT5033_CTRL_BUCKOMS_MASK 0x01
+#define RT5033_CTRL_LDOOMS_MASK 0x02
+#define RT5033_CTRL_SLDOOMS_MASK 0x03
+#define RT5033_CTRL_EN_BUCK_MASK 0x04
+#define RT5033_CTRL_EN_LDO_MASK 0x05
+#define RT5033_CTRL_EN_SAFE_LDO_MASK 0x06
+#define RT5033_CTRL_LDO_SLEEP_MASK 0x07
+
+/* RT5033 BUCK control register */
+#define RT5033_BUCK_CTRL_MASK 0x1f
+
+/* RT5033 LDO control register */
+#define RT5033_LDO_CTRL_MASK 0x1f
+
+/* RT5033 charger property - model, manufacturer */
+
+#define RT5033_CHARGER_MODEL "RT5033WSC Charger"
+#define RT5033_MANUFACTURER "Richtek Technology Corporation"
+
+/*
+ * RT5033 charger input current limits (AICR mode, as in CHGCTRL1 register).
+ * For example, the AICR 100 mode limits the input
current to 100 mA.
+ */
+#define RT5033_AICR_100_MODE 0x20
+#define RT5033_AICR_500_MODE 0x40
+#define RT5033_AICR_700_MODE 0x60
+#define RT5033_AICR_900_MODE 0x80
+#define RT5033_AICR_1500_MODE 0xc0
+#define RT5033_AICR_2000_MODE 0xe0
+#define RT5033_AICR_MODE_MASK 0xe0
+
+/* RT5033 uses an internal timer; the fast-charge timer period must be set */
+#define RT5033_FAST_CHARGE_TIMER4 0x00
+#define RT5033_FAST_CHARGE_TIMER6 0x01
+#define RT5033_FAST_CHARGE_TIMER8 0x02
+#define RT5033_FAST_CHARGE_TIMER9 0x03
+#define RT5033_FAST_CHARGE_TIMER12 0x04
+#define RT5033_FAST_CHARGE_TIMER14 0x05
+#define RT5033_FAST_CHARGE_TIMER16 0x06
+
+#define RT5033_INT_TIMER_ENABLE 0x01
+
+/* RT5033 charger termination enable mask */
+#define RT5033_TE_ENABLE_MASK 0x08
+
+/*
+ * RT5033 charger opa mode. The RT5033 has two opa modes: charger mode
+ * and boost mode (for OTG).
+ */
+
+#define RT5033_CHARGER_MODE 0x00
+#define RT5033_BOOST_MODE 0x01
+
+/* RT5033 charger termination enable */
+#define RT5033_TE_ENABLE 0x08
+
+/* RT5033 charger CFO enable */
+#define RT5033_CFO_ENABLE 0x40
+
+/* RT5033 charger constant charge voltage (as in CHGCTRL2 register), uV */
+#define RT5033_CHARGER_CONST_VOLTAGE_LIMIT_MIN 3650000U
+#define RT5033_CHARGER_CONST_VOLTAGE_STEP_NUM 25000U
+#define RT5033_CHARGER_CONST_VOLTAGE_LIMIT_MAX 4400000U
+
+/* RT5033 charger pre-charge current limits (as in CHGCTRL4 register), uA */
+#define RT5033_CHARGER_PRE_CURRENT_LIMIT_MIN 350000U
+#define RT5033_CHARGER_PRE_CURRENT_STEP_NUM 100000U
+#define RT5033_CHARGER_PRE_CURRENT_LIMIT_MAX 650000U
+
+/* RT5033 charger fast-charge current (as in CHGCTRL5 register), uA */
+#define RT5033_CHARGER_FAST_CURRENT_MIN 700000U
+#define RT5033_CHARGER_FAST_CURRENT_STEP_NUM 100000U
+#define RT5033_CHARGER_FAST_CURRENT_MAX 2000000U
+
+/*
+ * RT5033 charger constant-charge end-of-charge current
+ * (as in CHGCTRL4 register), uA
+ */
+#define RT5033_CHARGER_EOC_MIN 150000U
+#define RT5033_CHARGER_EOC_REF 300000U
+#define RT5033_CHARGER_EOC_STEP_NUM1 50000U
+#define RT5033_CHARGER_EOC_STEP_NUM2 100000U
+#define RT5033_CHARGER_EOC_MAX 600000U
+
+/*
+ * RT5033 charger pre-charge threshold voltage limits
+ * (as in CHGCTRL5 register), uV
+ */
+
+#define RT5033_CHARGER_PRE_THRESHOLD_LIMIT_MIN 2300000U
+#define RT5033_CHARGER_PRE_THRESHOLD_STEP_NUM 100000U
+#define RT5033_CHARGER_PRE_THRESHOLD_LIMIT_MAX 3800000U
+
+/*
+ * RT5033 charger UUG enable. If UUG is enabled, the MOS is automatically
+ * controlled by the H/W charger circuit.
+ */
+#define RT5033_CHARGER_UUG_ENABLE 0x02
+
+/* RT5033 charger High impedance mode */
+#define RT5033_CHARGER_HZ_DISABLE 0x00
+#define RT5033_CHARGER_HZ_ENABLE 0x01
+
+/* RT5033 regulator BUCK output voltage uV */
+#define RT5033_REGULATOR_BUCK_VOLTAGE_MIN 1000000U
+#define RT5033_REGULATOR_BUCK_VOLTAGE_MAX 3000000U
+#define RT5033_REGULATOR_BUCK_VOLTAGE_STEP 100000U
+#define RT5033_REGULATOR_BUCK_VOLTAGE_STEP_NUM 32
+
+/* RT5033 regulator LDO output voltage uV */
+#define RT5033_REGULATOR_LDO_VOLTAGE_MIN 1200000U
+#define RT5033_REGULATOR_LDO_VOLTAGE_MAX 3000000U
+#define RT5033_REGULATOR_LDO_VOLTAGE_STEP 100000U
+#define RT5033_REGULATOR_LDO_VOLTAGE_STEP_NUM 32
+
+/* RT5033 regulator SAFE LDO output voltage uV */
+#define RT5033_REGULATOR_SAFE_LDO_VOLTAGE 4900000U
+
+enum rt5033_fuel_reg {
+ RT5033_FUEL_REG_OCV_H = 0x00,
+ RT5033_FUEL_REG_OCV_L = 0x01,
+ RT5033_FUEL_REG_VBAT_H = 0x02,
+ RT5033_FUEL_REG_VBAT_L = 0x03,
+ RT5033_FUEL_REG_SOC_H = 0x04,
+ RT5033_FUEL_REG_SOC_L = 0x05,
+ RT5033_FUEL_REG_CTRL_H = 0x06,
+ RT5033_FUEL_REG_CTRL_L = 0x07,
+ RT5033_FUEL_REG_CRATE = 0x08,
+ RT5033_FUEL_REG_DEVICE_ID = 0x09,
+ RT5033_FUEL_REG_AVG_VOLT_H = 0x0A,
+ RT5033_FUEL_REG_AVG_VOLT_L = 0x0B,
+ RT5033_FUEL_REG_CONFIG_H = 0x0C,
+ RT5033_FUEL_REG_CONFIG_L = 0x0D,
+ /* Reserved 0x0E~0x0F */
+ RT5033_FUEL_REG_IRQ_CTRL = 0x10,
+ RT5033_FUEL_REG_IRQ_FLAG = 0x11,
+ RT5033_FUEL_VMIN = 0x12,
+ RT5033_FUEL_SMIN = 0x13,
+ /* Reserved 0x14~0x1F */
+ RT5033_FUEL_VGCOMP1 = 0x20,
+ RT5033_FUEL_VGCOMP2 = 0x21,
+ RT5033_FUEL_VGCOMP3 = 0x22,
+ RT5033_FUEL_VGCOMP4 = 0x23,
+ /* Reserved 0x24~0xFD */
+ RT5033_FUEL_MFA_H = 0xFE,
+ RT5033_FUEL_MFA_L = 0xFF,
+
+ RT5033_FUEL_REG_END,
+};
+
+/* RT5033 fuel gauge battery present property */
+#define RT5033_FUEL_BAT_PRESENT 0x02
+
+/* RT5033 PMIC interrupts */
+#define RT5033_PMIC_IRQ_BUCKOCP 2
+#define RT5033_PMIC_IRQ_BUCKLV 3
+#define RT5033_PMIC_IRQ_SAFELDOLV 4
+#define RT5033_PMIC_IRQ_LDOLV 5
+#define RT5033_PMIC_IRQ_OT 6
+#define RT5033_PMIC_IRQ_VDDA_UV 7
+
+#endif /* __RT5033_PRIVATE_H__ */
diff --git a/include/linux/mfd/rt5033.h b/include/linux/mfd/rt5033.h new file mode 100644 index 000000000000..010cff49a98e --- /dev/null +++ b/include/linux/mfd/rt5033.h @@ -0,0 +1,62 @@
+/*
+ * MFD core driver for the RT5033
+ *
+ * Copyright (C) 2014 Samsung Electronics
+ * Author: Beomho Seo
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */ + +#ifndef __RT5033_H__ +#define __RT5033_H__ + +#include +#include +#include +#include + +/* RT5033 regulator IDs */ +enum rt5033_regulators { + RT5033_BUCK = 0, + RT5033_LDO, + RT5033_SAFE_LDO, + + RT5033_REGULATOR_NUM, +}; + +struct rt5033_dev { + struct device *dev; + + struct regmap *regmap; + struct regmap_irq_chip_data *irq_data; + int irq; + bool wakeup; +}; + +struct rt5033_battery { + struct i2c_client *client; + struct rt5033_dev *rt5033; + struct regmap *regmap; + struct power_supply psy; +}; + +/* RT5033 charger platform data */ +struct rt5033_charger_data { + unsigned int pre_uamp; + unsigned int pre_uvolt; + unsigned int const_uvolt; + unsigned int eoc_uamp; + unsigned int fast_uamp; +}; + +struct rt5033_charger { + struct device *dev; + struct rt5033_dev *rt5033; + struct power_supply psy; + + struct rt5033_charger_data *chg; +}; + +#endif /* __RT5033_H__ */ -- cgit v1.2.3 From b8fce55c09d3327014191247957d875c9fe5b7ce Mon Sep 17 00:00:00 2001 From: Adam Thomson Date: Mon, 22 Dec 2014 16:51:06 +0000 Subject: mfd: Add support for DA9150 combined charger & fuel-gauge device DA9150 is a combined Charger and Fuel-Gauge IC, with additional GPIO and GPADC functionality. Signed-off-by: Adam Thomson Signed-off-by: Lee Jones --- drivers/mfd/Kconfig | 12 + drivers/mfd/Makefile | 2 +- drivers/mfd/da9150-core.c | 413 ++++++++++++ include/linux/mfd/da9150/core.h | 68 ++ include/linux/mfd/da9150/registers.h | 1155 ++++++++++++++++++++++++++++++++++ 5 files changed, 1649 insertions(+), 1 deletion(-) create mode 100644 drivers/mfd/da9150-core.c create mode 100644 include/linux/mfd/da9150/core.h create mode 100644 include/linux/mfd/da9150/registers.h (limited to 'include/linux') diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 9cd2af660cc5..c47b2da7986e 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -195,6 +195,18 @@ config MFD_DA9063 Additional drivers must be enabled in order to use the functionality of the device. +config MFD_DA9150 + tristate "Dialog Semiconductor DA9150 Charger Fuel-Gauge chip" + depends on I2C=y + select MFD_CORE + select REGMAP_I2C + select REGMAP_IRQ + help + This adds support for the DA9150 integrated charger and fuel-gauge + chip. This driver provides common support for accessing the device. + Additional drivers must be enabled in order to use the specific + features of the device. + config MFD_DLN2 tristate "Diolan DLN2 support" select MFD_CORE diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index 4059c247c59a..02c2f2c69b94 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -113,7 +113,7 @@ obj-$(CONFIG_MFD_DA9055) += da9055.o da9063-objs := da9063-core.o da9063-irq.o da9063-i2c.o obj-$(CONFIG_MFD_DA9063) += da9063.o - +obj-$(CONFIG_MFD_DA9150) += da9150-core.o obj-$(CONFIG_MFD_MAX14577) += max14577.o obj-$(CONFIG_MFD_MAX77686) += max77686.o obj-$(CONFIG_MFD_MAX77693) += max77693.o diff --git a/drivers/mfd/da9150-core.c b/drivers/mfd/da9150-core.c new file mode 100644 index 000000000000..4d757b97ef9a --- /dev/null +++ b/drivers/mfd/da9150-core.c @@ -0,0 +1,413 @@ +/* + * DA9150 Core MFD Driver + * + * Copyright (c) 2014 Dialog Semiconductor + * + * Author: Adam Thomson + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static bool da9150_volatile_reg(struct device *dev, unsigned int reg) +{ + switch (reg) { + case DA9150_PAGE_CON: + case DA9150_STATUS_A: + case DA9150_STATUS_B: + case DA9150_STATUS_C: + case DA9150_STATUS_D: + case DA9150_STATUS_E: + case DA9150_STATUS_F: + case DA9150_STATUS_G: + case DA9150_STATUS_H: + case DA9150_STATUS_I: + case DA9150_STATUS_J: + case DA9150_STATUS_K: + case DA9150_STATUS_L: + case DA9150_STATUS_N: + case DA9150_FAULT_LOG_A: + case DA9150_FAULT_LOG_B: + case DA9150_EVENT_E: + case DA9150_EVENT_F: + case DA9150_EVENT_G: + case DA9150_EVENT_H: + case DA9150_CONTROL_B: + case DA9150_CONTROL_C: + case DA9150_GPADC_MAN: + case DA9150_GPADC_RES_A: + case DA9150_GPADC_RES_B: + case DA9150_ADETVB_CFG_C: + case DA9150_ADETD_STAT: + case DA9150_ADET_CMPSTAT: + case DA9150_ADET_CTRL_A: + case DA9150_PPR_TCTR_B: + case DA9150_COREBTLD_STAT_A: + case DA9150_CORE_DATA_A: + case DA9150_CORE_DATA_B: + case DA9150_CORE_DATA_C: + case DA9150_CORE_DATA_D: + case DA9150_CORE2WIRE_STAT_A: + case DA9150_FW_CTRL_C: + case DA9150_FG_CTRL_B: + case DA9150_FW_CTRL_B: + case DA9150_GPADC_CMAN: + case DA9150_GPADC_CRES_A: + case DA9150_GPADC_CRES_B: + case DA9150_CC_ICHG_RES_A: + case DA9150_CC_ICHG_RES_B: + case DA9150_CC_IAVG_RES_A: + case DA9150_CC_IAVG_RES_B: + case DA9150_TAUX_CTRL_A: + case DA9150_TAUX_VALUE_H: + case DA9150_TAUX_VALUE_L: + case DA9150_TBAT_RES_A: + case DA9150_TBAT_RES_B: + return true; + default: + return false; + } +} + +static const struct regmap_range_cfg da9150_range_cfg[] = { + { + .range_min = DA9150_PAGE_CON, + .range_max = DA9150_TBAT_RES_B, + .selector_reg = DA9150_PAGE_CON, + .selector_mask = DA9150_I2C_PAGE_MASK, + .selector_shift = DA9150_I2C_PAGE_SHIFT, + .window_start = 0, + .window_len = 256, + }, +}; + +static struct regmap_config da9150_regmap_config = { + .reg_bits = 8, + .val_bits = 8, + .ranges = da9150_range_cfg, + .num_ranges = ARRAY_SIZE(da9150_range_cfg), + .max_register = DA9150_TBAT_RES_B, + + .cache_type = REGCACHE_RBTREE, + + .volatile_reg = da9150_volatile_reg, +}; + +u8 da9150_reg_read(struct da9150 *da9150, u16 reg) +{ + int val, ret; + + ret = regmap_read(da9150->regmap, reg, &val); + if (ret) + dev_err(da9150->dev, "Failed to read from reg 0x%x: %d\n", + reg, ret); + + return (u8) val; +} +EXPORT_SYMBOL_GPL(da9150_reg_read); + +void da9150_reg_write(struct da9150 *da9150, u16 reg, u8 val) +{ + int ret; + + ret = regmap_write(da9150->regmap, reg, val); + if (ret) + dev_err(da9150->dev, "Failed to write to reg 0x%x: %d\n", + reg, ret); +} +EXPORT_SYMBOL_GPL(da9150_reg_write); + +void da9150_set_bits(struct da9150 *da9150, u16 reg, u8 mask, u8 val) +{ + int ret; + + ret = regmap_update_bits(da9150->regmap, reg, mask, val); + if (ret) + dev_err(da9150->dev, "Failed to set bits in reg 0x%x: %d\n", + reg, ret); +} +EXPORT_SYMBOL_GPL(da9150_set_bits); + +void da9150_bulk_read(struct da9150 *da9150, u16 reg, int count, u8 *buf) +{ + int ret; + + ret = regmap_bulk_read(da9150->regmap, reg, buf, count); + if (ret) + dev_err(da9150->dev, "Failed to bulk read from reg 0x%x: %d\n", + reg, ret); +} +EXPORT_SYMBOL_GPL(da9150_bulk_read); + +void da9150_bulk_write(struct da9150 *da9150, u16 reg, int count, const u8 *buf) +{ + int ret; + + ret = regmap_raw_write(da9150->regmap, reg, buf, count); + if (ret) + dev_err(da9150->dev, "Failed to bulk write to reg 0x%x %d\n", + reg, ret); +} +EXPORT_SYMBOL_GPL(da9150_bulk_write); + +static 
struct regmap_irq da9150_irqs[] = { + [DA9150_IRQ_VBUS] = { + .reg_offset = 0, + .mask = DA9150_E_VBUS_MASK, + }, + [DA9150_IRQ_CHG] = { + .reg_offset = 0, + .mask = DA9150_E_CHG_MASK, + }, + [DA9150_IRQ_TCLASS] = { + .reg_offset = 0, + .mask = DA9150_E_TCLASS_MASK, + }, + [DA9150_IRQ_TJUNC] = { + .reg_offset = 0, + .mask = DA9150_E_TJUNC_MASK, + }, + [DA9150_IRQ_VFAULT] = { + .reg_offset = 0, + .mask = DA9150_E_VFAULT_MASK, + }, + [DA9150_IRQ_CONF] = { + .reg_offset = 1, + .mask = DA9150_E_CONF_MASK, + }, + [DA9150_IRQ_DAT] = { + .reg_offset = 1, + .mask = DA9150_E_DAT_MASK, + }, + [DA9150_IRQ_DTYPE] = { + .reg_offset = 1, + .mask = DA9150_E_DTYPE_MASK, + }, + [DA9150_IRQ_ID] = { + .reg_offset = 1, + .mask = DA9150_E_ID_MASK, + }, + [DA9150_IRQ_ADP] = { + .reg_offset = 1, + .mask = DA9150_E_ADP_MASK, + }, + [DA9150_IRQ_SESS_END] = { + .reg_offset = 1, + .mask = DA9150_E_SESS_END_MASK, + }, + [DA9150_IRQ_SESS_VLD] = { + .reg_offset = 1, + .mask = DA9150_E_SESS_VLD_MASK, + }, + [DA9150_IRQ_FG] = { + .reg_offset = 2, + .mask = DA9150_E_FG_MASK, + }, + [DA9150_IRQ_GP] = { + .reg_offset = 2, + .mask = DA9150_E_GP_MASK, + }, + [DA9150_IRQ_TBAT] = { + .reg_offset = 2, + .mask = DA9150_E_TBAT_MASK, + }, + [DA9150_IRQ_GPIOA] = { + .reg_offset = 2, + .mask = DA9150_E_GPIOA_MASK, + }, + [DA9150_IRQ_GPIOB] = { + .reg_offset = 2, + .mask = DA9150_E_GPIOB_MASK, + }, + [DA9150_IRQ_GPIOC] = { + .reg_offset = 2, + .mask = DA9150_E_GPIOC_MASK, + }, + [DA9150_IRQ_GPIOD] = { + .reg_offset = 2, + .mask = DA9150_E_GPIOD_MASK, + }, + [DA9150_IRQ_GPADC] = { + .reg_offset = 2, + .mask = DA9150_E_GPADC_MASK, + }, + [DA9150_IRQ_WKUP] = { + .reg_offset = 3, + .mask = DA9150_E_WKUP_MASK, + }, +}; + +static struct regmap_irq_chip da9150_regmap_irq_chip = { + .name = "da9150_irq", + .status_base = DA9150_EVENT_E, + .mask_base = DA9150_IRQ_MASK_E, + .ack_base = DA9150_EVENT_E, + .num_regs = DA9150_NUM_IRQ_REGS, + .irqs = da9150_irqs, + .num_irqs = ARRAY_SIZE(da9150_irqs), +}; + +static struct resource da9150_gpadc_resources[] = { + { + .name = "GPADC", + .start = DA9150_IRQ_GPADC, + .end = DA9150_IRQ_GPADC, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct resource da9150_charger_resources[] = { + { + .name = "CHG_STATUS", + .start = DA9150_IRQ_CHG, + .end = DA9150_IRQ_CHG, + .flags = IORESOURCE_IRQ, + }, + { + .name = "CHG_TJUNC", + .start = DA9150_IRQ_TJUNC, + .end = DA9150_IRQ_TJUNC, + .flags = IORESOURCE_IRQ, + }, + { + .name = "CHG_VFAULT", + .start = DA9150_IRQ_VFAULT, + .end = DA9150_IRQ_VFAULT, + .flags = IORESOURCE_IRQ, + }, + { + .name = "CHG_VBUS", + .start = DA9150_IRQ_VBUS, + .end = DA9150_IRQ_VBUS, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct mfd_cell da9150_devs[] = { + { + .name = "da9150-gpadc", + .of_compatible = "dlg,da9150-gpadc", + .resources = da9150_gpadc_resources, + .num_resources = ARRAY_SIZE(da9150_gpadc_resources), + }, + { + .name = "da9150-charger", + .of_compatible = "dlg,da9150-charger", + .resources = da9150_charger_resources, + .num_resources = ARRAY_SIZE(da9150_charger_resources), + }, +}; + +static int da9150_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + struct da9150 *da9150; + struct da9150_pdata *pdata = dev_get_platdata(&client->dev); + int ret; + + da9150 = devm_kzalloc(&client->dev, sizeof(*da9150), GFP_KERNEL); + if (!da9150) + return -ENOMEM; + + da9150->dev = &client->dev; + da9150->irq = client->irq; + i2c_set_clientdata(client, da9150); + + da9150->regmap = devm_regmap_init_i2c(client, &da9150_regmap_config); + if 
(IS_ERR(da9150->regmap)) { + ret = PTR_ERR(da9150->regmap); + dev_err(da9150->dev, "Failed to allocate register map: %d\n", + ret); + return ret; + } + + da9150->irq_base = pdata ? pdata->irq_base : -1; + + ret = regmap_add_irq_chip(da9150->regmap, da9150->irq, + IRQF_TRIGGER_LOW | IRQF_ONESHOT, + da9150->irq_base, &da9150_regmap_irq_chip, + &da9150->regmap_irq_data); + if (ret) + return ret; + + da9150->irq_base = regmap_irq_chip_get_base(da9150->regmap_irq_data); + enable_irq_wake(da9150->irq); + + ret = mfd_add_devices(da9150->dev, -1, da9150_devs, + ARRAY_SIZE(da9150_devs), NULL, + da9150->irq_base, NULL); + if (ret) { + dev_err(da9150->dev, "Failed to add child devices: %d\n", ret); + regmap_del_irq_chip(da9150->irq, da9150->regmap_irq_data); + return ret; + } + + return 0; +} + +static int da9150_remove(struct i2c_client *client) +{ + struct da9150 *da9150 = i2c_get_clientdata(client); + + regmap_del_irq_chip(da9150->irq, da9150->regmap_irq_data); + mfd_remove_devices(da9150->dev); + + return 0; +} + +static void da9150_shutdown(struct i2c_client *client) +{ + struct da9150 *da9150 = i2c_get_clientdata(client); + + /* Make sure we have a wakup source for the device */ + da9150_set_bits(da9150, DA9150_CONFIG_D, + DA9150_WKUP_PM_EN_MASK, + DA9150_WKUP_PM_EN_MASK); + + /* Set device to DISABLED mode */ + da9150_set_bits(da9150, DA9150_CONTROL_C, + DA9150_DISABLE_MASK, DA9150_DISABLE_MASK); +} + +static const struct i2c_device_id da9150_i2c_id[] = { + { "da9150", }, + { } +}; +MODULE_DEVICE_TABLE(i2c, da9150_i2c_id); + +static const struct of_device_id da9150_of_match[] = { + { .compatible = "dlg,da9150", }, + { } +}; +MODULE_DEVICE_TABLE(of, da9150_of_match); + +static struct i2c_driver da9150_driver = { + .driver = { + .name = "da9150", + .of_match_table = of_match_ptr(da9150_of_match), + }, + .probe = da9150_probe, + .remove = da9150_remove, + .shutdown = da9150_shutdown, + .id_table = da9150_i2c_id, +}; + +module_i2c_driver(da9150_driver); + +MODULE_DESCRIPTION("MFD Core Driver for DA9150"); +MODULE_AUTHOR("Adam Thomson "); +MODULE_LICENSE("GPL"); diff --git a/include/linux/mfd/da9150/core.h b/include/linux/mfd/da9150/core.h new file mode 100644 index 000000000000..76e668933a77 --- /dev/null +++ b/include/linux/mfd/da9150/core.h @@ -0,0 +1,68 @@ +/* + * DA9150 MFD Driver - Core Data + * + * Copyright (c) 2014 Dialog Semiconductor + * + * Author: Adam Thomson + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. 
+ */ + +#ifndef __DA9150_CORE_H +#define __DA9150_CORE_H + +#include +#include +#include + +/* I2C address paging */ +#define DA9150_REG_PAGE_SHIFT 8 +#define DA9150_REG_PAGE_MASK 0xFF + +/* IRQs */ +#define DA9150_NUM_IRQ_REGS 4 +#define DA9150_IRQ_VBUS 0 +#define DA9150_IRQ_CHG 1 +#define DA9150_IRQ_TCLASS 2 +#define DA9150_IRQ_TJUNC 3 +#define DA9150_IRQ_VFAULT 4 +#define DA9150_IRQ_CONF 5 +#define DA9150_IRQ_DAT 6 +#define DA9150_IRQ_DTYPE 7 +#define DA9150_IRQ_ID 8 +#define DA9150_IRQ_ADP 9 +#define DA9150_IRQ_SESS_END 10 +#define DA9150_IRQ_SESS_VLD 11 +#define DA9150_IRQ_FG 12 +#define DA9150_IRQ_GP 13 +#define DA9150_IRQ_TBAT 14 +#define DA9150_IRQ_GPIOA 15 +#define DA9150_IRQ_GPIOB 16 +#define DA9150_IRQ_GPIOC 17 +#define DA9150_IRQ_GPIOD 18 +#define DA9150_IRQ_GPADC 19 +#define DA9150_IRQ_WKUP 20 + +struct da9150_pdata { + int irq_base; +}; + +struct da9150 { + struct device *dev; + struct regmap *regmap; + struct regmap_irq_chip_data *regmap_irq_data; + int irq; + int irq_base; +}; + +/* Device I/O */ +u8 da9150_reg_read(struct da9150 *da9150, u16 reg); +void da9150_reg_write(struct da9150 *da9150, u16 reg, u8 val); +void da9150_set_bits(struct da9150 *da9150, u16 reg, u8 mask, u8 val); + +void da9150_bulk_read(struct da9150 *da9150, u16 reg, int count, u8 *buf); +void da9150_bulk_write(struct da9150 *da9150, u16 reg, int count, const u8 *buf); +#endif /* __DA9150_CORE_H */ diff --git a/include/linux/mfd/da9150/registers.h b/include/linux/mfd/da9150/registers.h new file mode 100644 index 000000000000..27ca6ee4d840 --- /dev/null +++ b/include/linux/mfd/da9150/registers.h @@ -0,0 +1,1155 @@ +/* + * DA9150 MFD Driver - Registers + * + * Copyright (c) 2014 Dialog Semiconductor + * + * Author: Adam Thomson + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. 
+ */ + +#ifndef __DA9150_REGISTERS_H +#define __DA9150_REGISTERS_H + +#include + +/* Registers */ +#define DA9150_PAGE_CON 0x000 +#define DA9150_STATUS_A 0x068 +#define DA9150_STATUS_B 0x069 +#define DA9150_STATUS_C 0x06A +#define DA9150_STATUS_D 0x06B +#define DA9150_STATUS_E 0x06C +#define DA9150_STATUS_F 0x06D +#define DA9150_STATUS_G 0x06E +#define DA9150_STATUS_H 0x06F +#define DA9150_STATUS_I 0x070 +#define DA9150_STATUS_J 0x071 +#define DA9150_STATUS_K 0x072 +#define DA9150_STATUS_L 0x073 +#define DA9150_STATUS_N 0x074 +#define DA9150_FAULT_LOG_A 0x076 +#define DA9150_FAULT_LOG_B 0x077 +#define DA9150_EVENT_E 0x078 +#define DA9150_EVENT_F 0x079 +#define DA9150_EVENT_G 0x07A +#define DA9150_EVENT_H 0x07B +#define DA9150_IRQ_MASK_E 0x07C +#define DA9150_IRQ_MASK_F 0x07D +#define DA9150_IRQ_MASK_G 0x07E +#define DA9150_IRQ_MASK_H 0x07F +#define DA9150_PAGE_CON_1 0x080 +#define DA9150_CONFIG_A 0x0E0 +#define DA9150_CONFIG_B 0x0E1 +#define DA9150_CONFIG_C 0x0E2 +#define DA9150_CONFIG_D 0x0E3 +#define DA9150_CONFIG_E 0x0E4 +#define DA9150_CONTROL_A 0x0E5 +#define DA9150_CONTROL_B 0x0E6 +#define DA9150_CONTROL_C 0x0E7 +#define DA9150_GPIO_A_B 0x0E8 +#define DA9150_GPIO_C_D 0x0E9 +#define DA9150_GPIO_MODE_CONT 0x0EA +#define DA9150_GPIO_CTRL_B 0x0EB +#define DA9150_GPIO_CTRL_A 0x0EC +#define DA9150_GPIO_CTRL_C 0x0ED +#define DA9150_GPIO_CFG_A 0x0EE +#define DA9150_GPIO_CFG_B 0x0EF +#define DA9150_GPIO_CFG_C 0x0F0 +#define DA9150_GPADC_MAN 0x0F2 +#define DA9150_GPADC_RES_A 0x0F4 +#define DA9150_GPADC_RES_B 0x0F5 +#define DA9150_PAGE_CON_2 0x100 +#define DA9150_OTP_CONT_SHARED 0x101 +#define DA9150_INTERFACE_SHARED 0x105 +#define DA9150_CONFIG_A_SHARED 0x106 +#define DA9150_CONFIG_D_SHARED 0x109 +#define DA9150_ADETVB_CFG_C 0x150 +#define DA9150_ADETD_STAT 0x151 +#define DA9150_ADET_CMPSTAT 0x152 +#define DA9150_ADET_CTRL_A 0x153 +#define DA9150_ADETVB_CFG_B 0x154 +#define DA9150_ADETVB_CFG_A 0x155 +#define DA9150_ADETAC_CFG_A 0x156 +#define DA9150_ADDETAC_CFG_B 0x157 +#define DA9150_ADETAC_CFG_C 0x158 +#define DA9150_ADETAC_CFG_D 0x159 +#define DA9150_ADETVB_CFG_D 0x15A +#define DA9150_ADETID_CFG_A 0x15B +#define DA9150_ADET_RID_PT_CHG_H 0x15C +#define DA9150_ADET_RID_PT_CHG_L 0x15D +#define DA9150_PPR_TCTR_B 0x160 +#define DA9150_PPR_BKCTRL_A 0x163 +#define DA9150_PPR_BKCFG_A 0x164 +#define DA9150_PPR_BKCFG_B 0x165 +#define DA9150_PPR_CHGCTRL_A 0x166 +#define DA9150_PPR_CHGCTRL_B 0x167 +#define DA9150_PPR_CHGCTRL_C 0x168 +#define DA9150_PPR_TCTR_A 0x169 +#define DA9150_PPR_CHGCTRL_D 0x16A +#define DA9150_PPR_CHGCTRL_E 0x16B +#define DA9150_PPR_CHGCTRL_F 0x16C +#define DA9150_PPR_CHGCTRL_G 0x16D +#define DA9150_PPR_CHGCTRL_H 0x16E +#define DA9150_PPR_CHGCTRL_I 0x16F +#define DA9150_PPR_CHGCTRL_J 0x170 +#define DA9150_PPR_CHGCTRL_K 0x171 +#define DA9150_PPR_CHGCTRL_L 0x172 +#define DA9150_PPR_CHGCTRL_M 0x173 +#define DA9150_PPR_THYST_A 0x174 +#define DA9150_PPR_THYST_B 0x175 +#define DA9150_PPR_THYST_C 0x176 +#define DA9150_PPR_THYST_D 0x177 +#define DA9150_PPR_THYST_E 0x178 +#define DA9150_PPR_THYST_F 0x179 +#define DA9150_PPR_THYST_G 0x17A +#define DA9150_PAGE_CON_3 0x180 +#define DA9150_PAGE_CON_4 0x200 +#define DA9150_PAGE_CON_5 0x280 +#define DA9150_PAGE_CON_6 0x300 +#define DA9150_COREBTLD_STAT_A 0x302 +#define DA9150_COREBTLD_CTRL_A 0x303 +#define DA9150_CORE_CONFIG_A 0x304 +#define DA9150_CORE_CONFIG_C 0x305 +#define DA9150_CORE_CONFIG_B 0x306 +#define DA9150_CORE_CFG_DATA_A 0x307 +#define DA9150_CORE_CFG_DATA_B 0x308 +#define DA9150_CORE_CMD_A 0x309 +#define DA9150_CORE_DATA_A 0x30A 
+#define DA9150_CORE_DATA_B 0x30B +#define DA9150_CORE_DATA_C 0x30C +#define DA9150_CORE_DATA_D 0x30D +#define DA9150_CORE2WIRE_STAT_A 0x310 +#define DA9150_CORE2WIRE_CTRL_A 0x311 +#define DA9150_FW_CTRL_A 0x312 +#define DA9150_FW_CTRL_C 0x313 +#define DA9150_FW_CTRL_D 0x314 +#define DA9150_FG_CTRL_A 0x315 +#define DA9150_FG_CTRL_B 0x316 +#define DA9150_FW_CTRL_E 0x317 +#define DA9150_FW_CTRL_B 0x318 +#define DA9150_GPADC_CMAN 0x320 +#define DA9150_GPADC_CRES_A 0x322 +#define DA9150_GPADC_CRES_B 0x323 +#define DA9150_CC_CFG_A 0x328 +#define DA9150_CC_CFG_B 0x329 +#define DA9150_CC_ICHG_RES_A 0x32A +#define DA9150_CC_ICHG_RES_B 0x32B +#define DA9150_CC_IAVG_RES_A 0x32C +#define DA9150_CC_IAVG_RES_B 0x32D +#define DA9150_TAUX_CTRL_A 0x330 +#define DA9150_TAUX_RELOAD_H 0x332 +#define DA9150_TAUX_RELOAD_L 0x333 +#define DA9150_TAUX_VALUE_H 0x334 +#define DA9150_TAUX_VALUE_L 0x335 +#define DA9150_AUX_DATA_0 0x338 +#define DA9150_AUX_DATA_1 0x339 +#define DA9150_AUX_DATA_2 0x33A +#define DA9150_AUX_DATA_3 0x33B +#define DA9150_BIF_CTRL 0x340 +#define DA9150_TBAT_CTRL_A 0x342 +#define DA9150_TBAT_CTRL_B 0x343 +#define DA9150_TBAT_RES_A 0x344 +#define DA9150_TBAT_RES_B 0x345 + +/* DA9150_PAGE_CON = 0x000 */ +#define DA9150_PAGE_SHIFT 0 +#define DA9150_PAGE_MASK (0x3f << 0) +#define DA9150_I2C_PAGE_SHIFT 1 +#define DA9150_I2C_PAGE_MASK (0x1f << 1) +#define DA9150_WRITE_MODE_SHIFT 6 +#define DA9150_WRITE_MODE_MASK BIT(6) +#define DA9150_REVERT_SHIFT 7 +#define DA9150_REVERT_MASK BIT(7) + +/* DA9150_STATUS_A = 0x068 */ +#define DA9150_WKUP_STAT_SHIFT 2 +#define DA9150_WKUP_STAT_MASK (0x0f << 2) +#define DA9150_SLEEP_STAT_SHIFT 6 +#define DA9150_SLEEP_STAT_MASK (0x03 << 6) + +/* DA9150_STATUS_B = 0x069 */ +#define DA9150_VFAULT_STAT_SHIFT 0 +#define DA9150_VFAULT_STAT_MASK BIT(0) +#define DA9150_TFAULT_STAT_SHIFT 1 +#define DA9150_TFAULT_STAT_MASK BIT(1) + +/* DA9150_STATUS_C = 0x06A */ +#define DA9150_VDD33_STAT_SHIFT 0 +#define DA9150_VDD33_STAT_MASK BIT(0) +#define DA9150_VDD33_SLEEP_SHIFT 1 +#define DA9150_VDD33_SLEEP_MASK BIT(1) +#define DA9150_LFOSC_STAT_SHIFT 7 +#define DA9150_LFOSC_STAT_MASK BIT(7) + +/* DA9150_STATUS_D = 0x06B */ +#define DA9150_GPIOA_STAT_SHIFT 0 +#define DA9150_GPIOA_STAT_MASK BIT(0) +#define DA9150_GPIOB_STAT_SHIFT 1 +#define DA9150_GPIOB_STAT_MASK BIT(1) +#define DA9150_GPIOC_STAT_SHIFT 2 +#define DA9150_GPIOC_STAT_MASK BIT(2) +#define DA9150_GPIOD_STAT_SHIFT 3 +#define DA9150_GPIOD_STAT_MASK BIT(3) + +/* DA9150_STATUS_E = 0x06C */ +#define DA9150_DTYPE_SHIFT 0 +#define DA9150_DTYPE_MASK (0x1f << 0) +#define DA9150_DTYPE_DT_NIL (0x00 << 0) +#define DA9150_DTYPE_DT_USB_OTG BIT(0) +#define DA9150_DTYPE_DT_USB_STD (0x02 << 0) +#define DA9150_DTYPE_DT_USB_CHG (0x03 << 0) +#define DA9150_DTYPE_DT_ACA_CHG (0x04 << 0) +#define DA9150_DTYPE_DT_ACA_OTG (0x05 << 0) +#define DA9150_DTYPE_DT_ACA_DOC (0x06 << 0) +#define DA9150_DTYPE_DT_DED_CHG (0x07 << 0) +#define DA9150_DTYPE_DT_CR5_CHG (0x08 << 0) +#define DA9150_DTYPE_DT_CR4_CHG (0x0c << 0) +#define DA9150_DTYPE_DT_PT_CHG (0x11 << 0) +#define DA9150_DTYPE_DT_NN_ACC (0x16 << 0) +#define DA9150_DTYPE_DT_NN_CHG (0x17 << 0) + +/* DA9150_STATUS_F = 0x06D */ +#define DA9150_SESS_VLD_SHIFT 0 +#define DA9150_SESS_VLD_MASK BIT(0) +#define DA9150_ID_ERR_SHIFT 1 +#define DA9150_ID_ERR_MASK BIT(1) +#define DA9150_PT_CHG_SHIFT 2 +#define DA9150_PT_CHG_MASK BIT(2) + +/* DA9150_STATUS_G = 0x06E */ +#define DA9150_RID_SHIFT 0 +#define DA9150_RID_MASK (0xff << 0) + +/* DA9150_STATUS_H = 0x06F */ +#define DA9150_VBUS_STAT_SHIFT 0 +#define 
DA9150_VBUS_STAT_MASK (0x07 << 0) +#define DA9150_VBUS_STAT_OFF (0x00 << 0) +#define DA9150_VBUS_STAT_WAIT BIT(0) +#define DA9150_VBUS_STAT_CHG (0x02 << 0) +#define DA9150_VBUS_TRED_SHIFT 3 +#define DA9150_VBUS_TRED_MASK BIT(3) +#define DA9150_VBUS_DROP_STAT_SHIFT 4 +#define DA9150_VBUS_DROP_STAT_MASK (0x0f << 4) + +/* DA9150_STATUS_I = 0x070 */ +#define DA9150_VBUS_ISET_STAT_SHIFT 0 +#define DA9150_VBUS_ISET_STAT_MASK (0x1f << 0) +#define DA9150_VBUS_OT_SHIFT 7 +#define DA9150_VBUS_OT_MASK BIT(7) + +/* DA9150_STATUS_J = 0x071 */ +#define DA9150_CHG_STAT_SHIFT 0 +#define DA9150_CHG_STAT_MASK (0x0f << 0) +#define DA9150_CHG_STAT_OFF (0x00 << 0) +#define DA9150_CHG_STAT_SUSP BIT(0) +#define DA9150_CHG_STAT_ACT (0x02 << 0) +#define DA9150_CHG_STAT_PRE (0x03 << 0) +#define DA9150_CHG_STAT_CC (0x04 << 0) +#define DA9150_CHG_STAT_CV (0x05 << 0) +#define DA9150_CHG_STAT_FULL (0x06 << 0) +#define DA9150_CHG_STAT_TEMP (0x07 << 0) +#define DA9150_CHG_STAT_TIME (0x08 << 0) +#define DA9150_CHG_STAT_BAT (0x09 << 0) +#define DA9150_CHG_TEMP_SHIFT 4 +#define DA9150_CHG_TEMP_MASK (0x07 << 4) +#define DA9150_CHG_TEMP_UNDER (0x06 << 4) +#define DA9150_CHG_TEMP_OVER (0x07 << 4) +#define DA9150_CHG_IEND_STAT_SHIFT 7 +#define DA9150_CHG_IEND_STAT_MASK BIT(7) + +/* DA9150_STATUS_K = 0x072 */ +#define DA9150_CHG_IAV_H_SHIFT 0 +#define DA9150_CHG_IAV_H_MASK (0xff << 0) + +/* DA9150_STATUS_L = 0x073 */ +#define DA9150_CHG_IAV_L_SHIFT 5 +#define DA9150_CHG_IAV_L_MASK (0x07 << 5) + +/* DA9150_STATUS_N = 0x074 */ +#define DA9150_CHG_TIME_SHIFT 1 +#define DA9150_CHG_TIME_MASK BIT(1) +#define DA9150_CHG_TRED_SHIFT 2 +#define DA9150_CHG_TRED_MASK BIT(2) +#define DA9150_CHG_TJUNC_CLASS_SHIFT 3 +#define DA9150_CHG_TJUNC_CLASS_MASK (0x07 << 3) +#define DA9150_CHG_TJUNC_CLASS_6 (0x06 << 3) +#define DA9150_EBS_STAT_SHIFT 6 +#define DA9150_EBS_STAT_MASK BIT(6) +#define DA9150_CHG_BAT_REMOVED_SHIFT 7 +#define DA9150_CHG_BAT_REMOVED_MASK BIT(7) + +/* DA9150_FAULT_LOG_A = 0x076 */ +#define DA9150_TEMP_FAULT_SHIFT 0 +#define DA9150_TEMP_FAULT_MASK BIT(0) +#define DA9150_VSYS_FAULT_SHIFT 1 +#define DA9150_VSYS_FAULT_MASK BIT(1) +#define DA9150_START_FAULT_SHIFT 2 +#define DA9150_START_FAULT_MASK BIT(2) +#define DA9150_EXT_FAULT_SHIFT 3 +#define DA9150_EXT_FAULT_MASK BIT(3) +#define DA9150_POR_FAULT_SHIFT 4 +#define DA9150_POR_FAULT_MASK BIT(4) + +/* DA9150_FAULT_LOG_B = 0x077 */ +#define DA9150_VBUS_FAULT_SHIFT 0 +#define DA9150_VBUS_FAULT_MASK BIT(0) +#define DA9150_OTG_FAULT_SHIFT 1 +#define DA9150_OTG_FAULT_MASK BIT(1) + +/* DA9150_EVENT_E = 0x078 */ +#define DA9150_E_VBUS_SHIFT 0 +#define DA9150_E_VBUS_MASK BIT(0) +#define DA9150_E_CHG_SHIFT 1 +#define DA9150_E_CHG_MASK BIT(1) +#define DA9150_E_TCLASS_SHIFT 2 +#define DA9150_E_TCLASS_MASK BIT(2) +#define DA9150_E_TJUNC_SHIFT 3 +#define DA9150_E_TJUNC_MASK BIT(3) +#define DA9150_E_VFAULT_SHIFT 4 +#define DA9150_E_VFAULT_MASK BIT(4) +#define DA9150_EVENTS_H_SHIFT 5 +#define DA9150_EVENTS_H_MASK BIT(5) +#define DA9150_EVENTS_G_SHIFT 6 +#define DA9150_EVENTS_G_MASK BIT(6) +#define DA9150_EVENTS_F_SHIFT 7 +#define DA9150_EVENTS_F_MASK BIT(7) + +/* DA9150_EVENT_F = 0x079 */ +#define DA9150_E_CONF_SHIFT 0 +#define DA9150_E_CONF_MASK BIT(0) +#define DA9150_E_DAT_SHIFT 1 +#define DA9150_E_DAT_MASK BIT(1) +#define DA9150_E_DTYPE_SHIFT 3 +#define DA9150_E_DTYPE_MASK BIT(3) +#define DA9150_E_ID_SHIFT 4 +#define DA9150_E_ID_MASK BIT(4) +#define DA9150_E_ADP_SHIFT 5 +#define DA9150_E_ADP_MASK BIT(5) +#define DA9150_E_SESS_END_SHIFT 6 +#define DA9150_E_SESS_END_MASK BIT(6) +#define 
DA9150_E_SESS_VLD_SHIFT 7 +#define DA9150_E_SESS_VLD_MASK BIT(7) + +/* DA9150_EVENT_G = 0x07A */ +#define DA9150_E_FG_SHIFT 0 +#define DA9150_E_FG_MASK BIT(0) +#define DA9150_E_GP_SHIFT 1 +#define DA9150_E_GP_MASK BIT(1) +#define DA9150_E_TBAT_SHIFT 2 +#define DA9150_E_TBAT_MASK BIT(2) +#define DA9150_E_GPIOA_SHIFT 3 +#define DA9150_E_GPIOA_MASK BIT(3) +#define DA9150_E_GPIOB_SHIFT 4 +#define DA9150_E_GPIOB_MASK BIT(4) +#define DA9150_E_GPIOC_SHIFT 5 +#define DA9150_E_GPIOC_MASK BIT(5) +#define DA9150_E_GPIOD_SHIFT 6 +#define DA9150_E_GPIOD_MASK BIT(6) +#define DA9150_E_GPADC_SHIFT 7 +#define DA9150_E_GPADC_MASK BIT(7) + +/* DA9150_EVENT_H = 0x07B */ +#define DA9150_E_WKUP_SHIFT 0 +#define DA9150_E_WKUP_MASK BIT(0) + +/* DA9150_IRQ_MASK_E = 0x07C */ +#define DA9150_M_VBUS_SHIFT 0 +#define DA9150_M_VBUS_MASK BIT(0) +#define DA9150_M_CHG_SHIFT 1 +#define DA9150_M_CHG_MASK BIT(1) +#define DA9150_M_TJUNC_SHIFT 3 +#define DA9150_M_TJUNC_MASK BIT(3) +#define DA9150_M_VFAULT_SHIFT 4 +#define DA9150_M_VFAULT_MASK BIT(4) + +/* DA9150_IRQ_MASK_F = 0x07D */ +#define DA9150_M_CONF_SHIFT 0 +#define DA9150_M_CONF_MASK BIT(0) +#define DA9150_M_DAT_SHIFT 1 +#define DA9150_M_DAT_MASK BIT(1) +#define DA9150_M_DTYPE_SHIFT 3 +#define DA9150_M_DTYPE_MASK BIT(3) +#define DA9150_M_ID_SHIFT 4 +#define DA9150_M_ID_MASK BIT(4) +#define DA9150_M_ADP_SHIFT 5 +#define DA9150_M_ADP_MASK BIT(5) +#define DA9150_M_SESS_END_SHIFT 6 +#define DA9150_M_SESS_END_MASK BIT(6) +#define DA9150_M_SESS_VLD_SHIFT 7 +#define DA9150_M_SESS_VLD_MASK BIT(7) + +/* DA9150_IRQ_MASK_G = 0x07E */ +#define DA9150_M_FG_SHIFT 0 +#define DA9150_M_FG_MASK BIT(0) +#define DA9150_M_GP_SHIFT 1 +#define DA9150_M_GP_MASK BIT(1) +#define DA9150_M_TBAT_SHIFT 2 +#define DA9150_M_TBAT_MASK BIT(2) +#define DA9150_M_GPIOA_SHIFT 3 +#define DA9150_M_GPIOA_MASK BIT(3) +#define DA9150_M_GPIOB_SHIFT 4 +#define DA9150_M_GPIOB_MASK BIT(4) +#define DA9150_M_GPIOC_SHIFT 5 +#define DA9150_M_GPIOC_MASK BIT(5) +#define DA9150_M_GPIOD_SHIFT 6 +#define DA9150_M_GPIOD_MASK BIT(6) +#define DA9150_M_GPADC_SHIFT 7 +#define DA9150_M_GPADC_MASK BIT(7) + +/* DA9150_IRQ_MASK_H = 0x07F */ +#define DA9150_M_WKUP_SHIFT 0 +#define DA9150_M_WKUP_MASK BIT(0) + +/* DA9150_PAGE_CON_1 = 0x080 */ +#define DA9150_PAGE_SHIFT 0 +#define DA9150_PAGE_MASK (0x3f << 0) +#define DA9150_WRITE_MODE_SHIFT 6 +#define DA9150_WRITE_MODE_MASK BIT(6) +#define DA9150_REVERT_SHIFT 7 +#define DA9150_REVERT_MASK BIT(7) + +/* DA9150_CONFIG_A = 0x0E0 */ +#define DA9150_RESET_DUR_SHIFT 0 +#define DA9150_RESET_DUR_MASK (0x03 << 0) +#define DA9150_RESET_EXT_SHIFT 2 +#define DA9150_RESET_EXT_MASK (0x03 << 2) +#define DA9150_START_MAX_SHIFT 4 +#define DA9150_START_MAX_MASK (0x03 << 4) +#define DA9150_PS_WAIT_EN_SHIFT 6 +#define DA9150_PS_WAIT_EN_MASK BIT(6) +#define DA9150_PS_DISABLE_DIRECT_SHIFT 7 +#define DA9150_PS_DISABLE_DIRECT_MASK BIT(7) + +/* DA9150_CONFIG_B = 0x0E1 */ +#define DA9150_VFAULT_ADJ_SHIFT 0 +#define DA9150_VFAULT_ADJ_MASK (0x0f << 0) +#define DA9150_VFAULT_HYST_SHIFT 4 +#define DA9150_VFAULT_HYST_MASK (0x07 << 4) +#define DA9150_VFAULT_EN_SHIFT 7 +#define DA9150_VFAULT_EN_MASK BIT(7) + +/* DA9150_CONFIG_C = 0x0E2 */ +#define DA9150_VSYS_MIN_SHIFT 3 +#define DA9150_VSYS_MIN_MASK (0x1f << 3) + +/* DA9150_CONFIG_D = 0x0E3 */ +#define DA9150_LFOSC_EXT_SHIFT 0 +#define DA9150_LFOSC_EXT_MASK BIT(0) +#define DA9150_VDD33_DWN_SHIFT 1 +#define DA9150_VDD33_DWN_MASK BIT(1) +#define DA9150_WKUP_PM_EN_SHIFT 2 +#define DA9150_WKUP_PM_EN_MASK BIT(2) +#define DA9150_WKUP_CE_SEL_SHIFT 3 +#define 
DA9150_WKUP_CE_SEL_MASK (0x03 << 3) +#define DA9150_WKUP_CLK32K_EN_SHIFT 5 +#define DA9150_WKUP_CLK32K_EN_MASK BIT(5) +#define DA9150_DISABLE_DEL_SHIFT 7 +#define DA9150_DISABLE_DEL_MASK BIT(7) + +/* DA9150_CONFIG_E = 0x0E4 */ +#define DA9150_PM_SPKSUP_DIS_SHIFT 0 +#define DA9150_PM_SPKSUP_DIS_MASK BIT(0) +#define DA9150_PM_MERGE_SHIFT 1 +#define DA9150_PM_MERGE_MASK BIT(1) +#define DA9150_PM_SR_OFF_SHIFT 2 +#define DA9150_PM_SR_OFF_MASK BIT(2) +#define DA9150_PM_TIMEOUT_EN_SHIFT 3 +#define DA9150_PM_TIMEOUT_EN_MASK BIT(3) +#define DA9150_PM_DLY_SEL_SHIFT 4 +#define DA9150_PM_DLY_SEL_MASK (0x07 << 4) +#define DA9150_PM_OUT_DLY_SEL_SHIFT 7 +#define DA9150_PM_OUT_DLY_SEL_MASK BIT(7) + +/* DA9150_CONTROL_A = 0x0E5 */ +#define DA9150_VDD33_SL_SHIFT 0 +#define DA9150_VDD33_SL_MASK BIT(0) +#define DA9150_VDD33_LPM_SHIFT 1 +#define DA9150_VDD33_LPM_MASK (0x03 << 1) +#define DA9150_VDD33_EN_SHIFT 3 +#define DA9150_VDD33_EN_MASK BIT(3) +#define DA9150_GPI_LPM_SHIFT 6 +#define DA9150_GPI_LPM_MASK BIT(6) +#define DA9150_PM_IF_LPM_SHIFT 7 +#define DA9150_PM_IF_LPM_MASK BIT(7) + +/* DA9150_CONTROL_B = 0x0E6 */ +#define DA9150_LPM_SHIFT 0 +#define DA9150_LPM_MASK BIT(0) +#define DA9150_RESET_SHIFT 1 +#define DA9150_RESET_MASK BIT(1) +#define DA9150_RESET_USRCONF_EN_SHIFT 2 +#define DA9150_RESET_USRCONF_EN_MASK BIT(2) + +/* DA9150_CONTROL_C = 0x0E7 */ +#define DA9150_DISABLE_SHIFT 0 +#define DA9150_DISABLE_MASK BIT(0) + +/* DA9150_GPIO_A_B = 0x0E8 */ +#define DA9150_GPIOA_PIN_SHIFT 0 +#define DA9150_GPIOA_PIN_MASK (0x07 << 0) +#define DA9150_GPIOA_PIN_GPI (0x00 << 0) +#define DA9150_GPIOA_PIN_GPO_OD BIT(0) +#define DA9150_GPIOA_TYPE_SHIFT 3 +#define DA9150_GPIOA_TYPE_MASK BIT(3) +#define DA9150_GPIOB_PIN_SHIFT 4 +#define DA9150_GPIOB_PIN_MASK (0x07 << 4) +#define DA9150_GPIOB_PIN_GPI (0x00 << 4) +#define DA9150_GPIOB_PIN_GPO_OD BIT(4) +#define DA9150_GPIOB_TYPE_SHIFT 7 +#define DA9150_GPIOB_TYPE_MASK BIT(7) + +/* DA9150_GPIO_C_D = 0x0E9 */ +#define DA9150_GPIOC_PIN_SHIFT 0 +#define DA9150_GPIOC_PIN_MASK (0x07 << 0) +#define DA9150_GPIOC_PIN_GPI (0x00 << 0) +#define DA9150_GPIOC_PIN_GPO_OD BIT(0) +#define DA9150_GPIOC_TYPE_SHIFT 3 +#define DA9150_GPIOC_TYPE_MASK BIT(3) +#define DA9150_GPIOD_PIN_SHIFT 4 +#define DA9150_GPIOD_PIN_MASK (0x07 << 4) +#define DA9150_GPIOD_PIN_GPI (0x00 << 4) +#define DA9150_GPIOD_PIN_GPO_OD BIT(4) +#define DA9150_GPIOD_TYPE_SHIFT 7 +#define DA9150_GPIOD_TYPE_MASK BIT(7) + +/* DA9150_GPIO_MODE_CONT = 0x0EA */ +#define DA9150_GPIOA_MODE_SHIFT 0 +#define DA9150_GPIOA_MODE_MASK BIT(0) +#define DA9150_GPIOB_MODE_SHIFT 1 +#define DA9150_GPIOB_MODE_MASK BIT(1) +#define DA9150_GPIOC_MODE_SHIFT 2 +#define DA9150_GPIOC_MODE_MASK BIT(2) +#define DA9150_GPIOD_MODE_SHIFT 3 +#define DA9150_GPIOD_MODE_MASK BIT(3) +#define DA9150_GPIOA_CONT_SHIFT 4 +#define DA9150_GPIOA_CONT_MASK BIT(4) +#define DA9150_GPIOB_CONT_SHIFT 5 +#define DA9150_GPIOB_CONT_MASK BIT(5) +#define DA9150_GPIOC_CONT_SHIFT 6 +#define DA9150_GPIOC_CONT_MASK BIT(6) +#define DA9150_GPIOD_CONT_SHIFT 7 +#define DA9150_GPIOD_CONT_MASK BIT(7) + +/* DA9150_GPIO_CTRL_B = 0x0EB */ +#define DA9150_WAKE_PIN_SHIFT 0 +#define DA9150_WAKE_PIN_MASK (0x03 << 0) +#define DA9150_WAKE_MODE_SHIFT 2 +#define DA9150_WAKE_MODE_MASK BIT(2) +#define DA9150_WAKE_CONT_SHIFT 3 +#define DA9150_WAKE_CONT_MASK BIT(3) +#define DA9150_WAKE_DLY_SHIFT 4 +#define DA9150_WAKE_DLY_MASK BIT(4) + +/* DA9150_GPIO_CTRL_A = 0x0EC */ +#define DA9150_GPIOA_ANAEN_SHIFT 0 +#define DA9150_GPIOA_ANAEN_MASK BIT(0) +#define DA9150_GPIOB_ANAEN_SHIFT 1 +#define 
DA9150_GPIOB_ANAEN_MASK BIT(1) +#define DA9150_GPIOC_ANAEN_SHIFT 2 +#define DA9150_GPIOC_ANAEN_MASK BIT(2) +#define DA9150_GPIOD_ANAEN_SHIFT 3 +#define DA9150_GPIOD_ANAEN_MASK BIT(3) +#define DA9150_GPIO_ANAEN 0x01 +#define DA9150_GPIO_ANAEN_MASK 0x0F +#define DA9150_CHGLED_PIN_SHIFT 5 +#define DA9150_CHGLED_PIN_MASK (0x07 << 5) + +/* DA9150_GPIO_CTRL_C = 0x0ED */ +#define DA9150_CHGBL_DUR_SHIFT 0 +#define DA9150_CHGBL_DUR_MASK (0x03 << 0) +#define DA9150_CHGBL_DBL_SHIFT 2 +#define DA9150_CHGBL_DBL_MASK BIT(2) +#define DA9150_CHGBL_FRQ_SHIFT 3 +#define DA9150_CHGBL_FRQ_MASK (0x03 << 3) +#define DA9150_CHGBL_FLKR_SHIFT 5 +#define DA9150_CHGBL_FLKR_MASK BIT(5) + +/* DA9150_GPIO_CFG_A = 0x0EE */ +#define DA9150_CE_LPM_DEB_SHIFT 0 +#define DA9150_CE_LPM_DEB_MASK (0x07 << 0) + +/* DA9150_GPIO_CFG_B = 0x0EF */ +#define DA9150_GPIOA_PUPD_SHIFT 0 +#define DA9150_GPIOA_PUPD_MASK BIT(0) +#define DA9150_GPIOB_PUPD_SHIFT 1 +#define DA9150_GPIOB_PUPD_MASK BIT(1) +#define DA9150_GPIOC_PUPD_SHIFT 2 +#define DA9150_GPIOC_PUPD_MASK BIT(2) +#define DA9150_GPIOD_PUPD_SHIFT 3 +#define DA9150_GPIOD_PUPD_MASK BIT(3) +#define DA9150_GPIO_PUPD_MASK (0xF << 0) +#define DA9150_GPI_DEB_SHIFT 4 +#define DA9150_GPI_DEB_MASK (0x07 << 4) +#define DA9150_LPM_EN_SHIFT 7 +#define DA9150_LPM_EN_MASK BIT(7) + +/* DA9150_GPIO_CFG_C = 0x0F0 */ +#define DA9150_GPI_V_SHIFT 0 +#define DA9150_GPI_V_MASK BIT(0) +#define DA9150_VDDIO_INT_SHIFT 1 +#define DA9150_VDDIO_INT_MASK BIT(1) +#define DA9150_FAULT_PIN_SHIFT 3 +#define DA9150_FAULT_PIN_MASK (0x07 << 3) +#define DA9150_FAULT_TYPE_SHIFT 6 +#define DA9150_FAULT_TYPE_MASK BIT(6) +#define DA9150_NIRQ_PUPD_SHIFT 7 +#define DA9150_NIRQ_PUPD_MASK BIT(7) + +/* DA9150_GPADC_MAN = 0x0F2 */ +#define DA9150_GPADC_EN_SHIFT 0 +#define DA9150_GPADC_EN_MASK BIT(0) +#define DA9150_GPADC_MUX_SHIFT 1 +#define DA9150_GPADC_MUX_MASK (0x1f << 1) + +/* DA9150_GPADC_RES_A = 0x0F4 */ +#define DA9150_GPADC_RES_H_SHIFT 0 +#define DA9150_GPADC_RES_H_MASK (0xff << 0) + +/* DA9150_GPADC_RES_B = 0x0F5 */ +#define DA9150_GPADC_RUN_SHIFT 0 +#define DA9150_GPADC_RUN_MASK BIT(0) +#define DA9150_GPADC_RES_L_SHIFT 6 +#define DA9150_GPADC_RES_L_MASK (0x03 << 6) +#define DA9150_GPADC_RES_L_BITS 2 + +/* DA9150_PAGE_CON_2 = 0x100 */ +#define DA9150_PAGE_SHIFT 0 +#define DA9150_PAGE_MASK (0x3f << 0) +#define DA9150_WRITE_MODE_SHIFT 6 +#define DA9150_WRITE_MODE_MASK BIT(6) +#define DA9150_REVERT_SHIFT 7 +#define DA9150_REVERT_MASK BIT(7) + +/* DA9150_OTP_CONT_SHARED = 0x101 */ +#define DA9150_PC_DONE_SHIFT 3 +#define DA9150_PC_DONE_MASK BIT(3) + +/* DA9150_INTERFACE_SHARED = 0x105 */ +#define DA9150_IF_BASE_ADDR_SHIFT 4 +#define DA9150_IF_BASE_ADDR_MASK (0x0f << 4) + +/* DA9150_CONFIG_A_SHARED = 0x106 */ +#define DA9150_NIRQ_VDD_SHIFT 1 +#define DA9150_NIRQ_VDD_MASK BIT(1) +#define DA9150_NIRQ_PIN_SHIFT 2 +#define DA9150_NIRQ_PIN_MASK BIT(2) +#define DA9150_NIRQ_TYPE_SHIFT 3 +#define DA9150_NIRQ_TYPE_MASK BIT(3) +#define DA9150_PM_IF_V_SHIFT 4 +#define DA9150_PM_IF_V_MASK BIT(4) +#define DA9150_PM_IF_FMP_SHIFT 5 +#define DA9150_PM_IF_FMP_MASK BIT(5) +#define DA9150_PM_IF_HSM_SHIFT 6 +#define DA9150_PM_IF_HSM_MASK BIT(6) + +/* DA9150_CONFIG_D_SHARED = 0x109 */ +#define DA9150_NIRQ_MODE_SHIFT 1 +#define DA9150_NIRQ_MODE_MASK BIT(1) + +/* DA9150_ADETVB_CFG_C = 0x150 */ +#define DA9150_TADP_RISE_SHIFT 0 +#define DA9150_TADP_RISE_MASK (0xff << 0) + +/* DA9150_ADETD_STAT = 0x151 */ +#define DA9150_DCD_STAT_SHIFT 0 +#define DA9150_DCD_STAT_MASK BIT(0) +#define DA9150_PCD_STAT_SHIFT 1 +#define DA9150_PCD_STAT_MASK (0x03 << 1) 
+#define DA9150_SCD_STAT_SHIFT 3 +#define DA9150_SCD_STAT_MASK (0x03 << 3) +#define DA9150_DP_STAT_SHIFT 5 +#define DA9150_DP_STAT_MASK BIT(5) +#define DA9150_DM_STAT_SHIFT 6 +#define DA9150_DM_STAT_MASK BIT(6) + +/* DA9150_ADET_CMPSTAT = 0x152 */ +#define DA9150_DP_COMP_SHIFT 1 +#define DA9150_DP_COMP_MASK BIT(1) +#define DA9150_DM_COMP_SHIFT 2 +#define DA9150_DM_COMP_MASK BIT(2) +#define DA9150_ADP_SNS_COMP_SHIFT 3 +#define DA9150_ADP_SNS_COMP_MASK BIT(3) +#define DA9150_ADP_PRB_COMP_SHIFT 4 +#define DA9150_ADP_PRB_COMP_MASK BIT(4) +#define DA9150_ID_COMP_SHIFT 5 +#define DA9150_ID_COMP_MASK BIT(5) + +/* DA9150_ADET_CTRL_A = 0x153 */ +#define DA9150_AID_DAT_SHIFT 0 +#define DA9150_AID_DAT_MASK BIT(0) +#define DA9150_AID_ID_SHIFT 1 +#define DA9150_AID_ID_MASK BIT(1) +#define DA9150_AID_TRIG_SHIFT 2 +#define DA9150_AID_TRIG_MASK BIT(2) + +/* DA9150_ADETVB_CFG_B = 0x154 */ +#define DA9150_VB_MODE_SHIFT 0 +#define DA9150_VB_MODE_MASK (0x03 << 0) +#define DA9150_VB_MODE_VB_SESS BIT(0) + +#define DA9150_TADP_PRB_SHIFT 2 +#define DA9150_TADP_PRB_MASK BIT(2) +#define DA9150_DAT_RPD_EXT_SHIFT 5 +#define DA9150_DAT_RPD_EXT_MASK BIT(5) +#define DA9150_CONF_RPD_SHIFT 6 +#define DA9150_CONF_RPD_MASK BIT(6) +#define DA9150_CONF_SRP_SHIFT 7 +#define DA9150_CONF_SRP_MASK BIT(7) + +/* DA9150_ADETVB_CFG_A = 0x155 */ +#define DA9150_AID_MODE_SHIFT 0 +#define DA9150_AID_MODE_MASK (0x03 << 0) +#define DA9150_AID_EXT_POL_SHIFT 2 +#define DA9150_AID_EXT_POL_MASK BIT(2) + +/* DA9150_ADETAC_CFG_A = 0x156 */ +#define DA9150_ISET_CDP_SHIFT 0 +#define DA9150_ISET_CDP_MASK (0x1f << 0) +#define DA9150_CONF_DBP_SHIFT 5 +#define DA9150_CONF_DBP_MASK BIT(5) + +/* DA9150_ADDETAC_CFG_B = 0x157 */ +#define DA9150_ISET_DCHG_SHIFT 0 +#define DA9150_ISET_DCHG_MASK (0x1f << 0) +#define DA9150_CONF_GPIOA_SHIFT 5 +#define DA9150_CONF_GPIOA_MASK BIT(5) +#define DA9150_CONF_GPIOB_SHIFT 6 +#define DA9150_CONF_GPIOB_MASK BIT(6) +#define DA9150_AID_VB_SHIFT 7 +#define DA9150_AID_VB_MASK BIT(7) + +/* DA9150_ADETAC_CFG_C = 0x158 */ +#define DA9150_ISET_DEF_SHIFT 0 +#define DA9150_ISET_DEF_MASK (0x1f << 0) +#define DA9150_CONF_MODE_SHIFT 5 +#define DA9150_CONF_MODE_MASK (0x03 << 5) +#define DA9150_AID_CR_DIS_SHIFT 7 +#define DA9150_AID_CR_DIS_MASK BIT(7) + +/* DA9150_ADETAC_CFG_D = 0x159 */ +#define DA9150_ISET_UNIT_SHIFT 0 +#define DA9150_ISET_UNIT_MASK (0x1f << 0) +#define DA9150_AID_UNCLAMP_SHIFT 5 +#define DA9150_AID_UNCLAMP_MASK BIT(5) + +/* DA9150_ADETVB_CFG_D = 0x15A */ +#define DA9150_ID_MODE_SHIFT 0 +#define DA9150_ID_MODE_MASK (0x03 << 0) +#define DA9150_DAT_MODE_SHIFT 2 +#define DA9150_DAT_MODE_MASK (0x0f << 2) +#define DA9150_DAT_SWP_SHIFT 6 +#define DA9150_DAT_SWP_MASK BIT(6) +#define DA9150_DAT_CLAMP_EXT_SHIFT 7 +#define DA9150_DAT_CLAMP_EXT_MASK BIT(7) + +/* DA9150_ADETID_CFG_A = 0x15B */ +#define DA9150_TID_POLL_SHIFT 0 +#define DA9150_TID_POLL_MASK (0x07 << 0) +#define DA9150_RID_CONV_SHIFT 3 +#define DA9150_RID_CONV_MASK BIT(3) + +/* DA9150_ADET_RID_PT_CHG_H = 0x15C */ +#define DA9150_RID_PT_CHG_H_SHIFT 0 +#define DA9150_RID_PT_CHG_H_MASK (0xff << 0) + +/* DA9150_ADET_RID_PT_CHG_L = 0x15D */ +#define DA9150_RID_PT_CHG_L_SHIFT 6 +#define DA9150_RID_PT_CHG_L_MASK (0x03 << 6) + +/* DA9150_PPR_TCTR_B = 0x160 */ +#define DA9150_CHG_TCTR_VAL_SHIFT 0 +#define DA9150_CHG_TCTR_VAL_MASK (0xff << 0) + +/* DA9150_PPR_BKCTRL_A = 0x163 */ +#define DA9150_VBUS_MODE_SHIFT 0 +#define DA9150_VBUS_MODE_MASK (0x03 << 0) +#define DA9150_VBUS_MODE_CHG BIT(0) +#define DA9150_VBUS_MODE_OTG (0x02 << 0) +#define DA9150_VBUS_LPM_SHIFT 2 +#define 
DA9150_VBUS_LPM_MASK (0x03 << 2) +#define DA9150_VBUS_SUSP_SHIFT 4 +#define DA9150_VBUS_SUSP_MASK BIT(4) +#define DA9150_VBUS_PWM_SHIFT 5 +#define DA9150_VBUS_PWM_MASK BIT(5) +#define DA9150_VBUS_ISO_SHIFT 6 +#define DA9150_VBUS_ISO_MASK BIT(6) +#define DA9150_VBUS_LDO_SHIFT 7 +#define DA9150_VBUS_LDO_MASK BIT(7) + +/* DA9150_PPR_BKCFG_A = 0x164 */ +#define DA9150_VBUS_ISET_SHIFT 0 +#define DA9150_VBUS_ISET_MASK (0x1f << 0) +#define DA9150_VBUS_IMAX_SHIFT 5 +#define DA9150_VBUS_IMAX_MASK BIT(5) +#define DA9150_VBUS_IOTG_SHIFT 6 +#define DA9150_VBUS_IOTG_MASK (0x03 << 6) + +/* DA9150_PPR_BKCFG_B = 0x165 */ +#define DA9150_VBUS_DROP_SHIFT 0 +#define DA9150_VBUS_DROP_MASK (0x0f << 0) +#define DA9150_VBUS_FAULT_DIS_SHIFT 6 +#define DA9150_VBUS_FAULT_DIS_MASK BIT(6) +#define DA9150_OTG_FAULT_DIS_SHIFT 7 +#define DA9150_OTG_FAULT_DIS_MASK BIT(7) + +/* DA9150_PPR_CHGCTRL_A = 0x166 */ +#define DA9150_CHG_EN_SHIFT 0 +#define DA9150_CHG_EN_MASK BIT(0) + +/* DA9150_PPR_CHGCTRL_B = 0x167 */ +#define DA9150_CHG_VBAT_SHIFT 0 +#define DA9150_CHG_VBAT_MASK (0x1f << 0) +#define DA9150_CHG_VDROP_SHIFT 6 +#define DA9150_CHG_VDROP_MASK (0x03 << 6) + +/* DA9150_PPR_CHGCTRL_C = 0x168 */ +#define DA9150_CHG_VFAULT_SHIFT 0 +#define DA9150_CHG_VFAULT_MASK (0x0f << 0) +#define DA9150_CHG_IPRE_SHIFT 4 +#define DA9150_CHG_IPRE_MASK (0x03 << 4) + +/* DA9150_PPR_TCTR_A = 0x169 */ +#define DA9150_CHG_TCTR_SHIFT 0 +#define DA9150_CHG_TCTR_MASK (0x07 << 0) +#define DA9150_CHG_TCTR_MODE_SHIFT 4 +#define DA9150_CHG_TCTR_MODE_MASK BIT(4) + +/* DA9150_PPR_CHGCTRL_D = 0x16A */ +#define DA9150_CHG_IBAT_SHIFT 0 +#define DA9150_CHG_IBAT_MASK (0xff << 0) + +/* DA9150_PPR_CHGCTRL_E = 0x16B */ +#define DA9150_CHG_IEND_SHIFT 0 +#define DA9150_CHG_IEND_MASK (0xff << 0) + +/* DA9150_PPR_CHGCTRL_F = 0x16C */ +#define DA9150_CHG_VCOLD_SHIFT 0 +#define DA9150_CHG_VCOLD_MASK (0x1f << 0) +#define DA9150_TBAT_TQA_EN_SHIFT 6 +#define DA9150_TBAT_TQA_EN_MASK BIT(6) +#define DA9150_TBAT_TDP_EN_SHIFT 7 +#define DA9150_TBAT_TDP_EN_MASK BIT(7) + +/* DA9150_PPR_CHGCTRL_G = 0x16D */ +#define DA9150_CHG_VWARM_SHIFT 0 +#define DA9150_CHG_VWARM_MASK (0x1f << 0) + +/* DA9150_PPR_CHGCTRL_H = 0x16E */ +#define DA9150_CHG_VHOT_SHIFT 0 +#define DA9150_CHG_VHOT_MASK (0x1f << 0) + +/* DA9150_PPR_CHGCTRL_I = 0x16F */ +#define DA9150_CHG_ICOLD_SHIFT 0 +#define DA9150_CHG_ICOLD_MASK (0xff << 0) + +/* DA9150_PPR_CHGCTRL_J = 0x170 */ +#define DA9150_CHG_IWARM_SHIFT 0 +#define DA9150_CHG_IWARM_MASK (0xff << 0) + +/* DA9150_PPR_CHGCTRL_K = 0x171 */ +#define DA9150_CHG_IHOT_SHIFT 0 +#define DA9150_CHG_IHOT_MASK (0xff << 0) + +/* DA9150_PPR_CHGCTRL_L = 0x172 */ +#define DA9150_CHG_IBAT_TRED_SHIFT 0 +#define DA9150_CHG_IBAT_TRED_MASK (0xff << 0) + +/* DA9150_PPR_CHGCTRL_M = 0x173 */ +#define DA9150_CHG_VFLOAT_SHIFT 0 +#define DA9150_CHG_VFLOAT_MASK (0x0f << 0) +#define DA9150_CHG_LPM_SHIFT 5 +#define DA9150_CHG_LPM_MASK BIT(5) +#define DA9150_CHG_NBLO_SHIFT 6 +#define DA9150_CHG_NBLO_MASK BIT(6) +#define DA9150_EBS_EN_SHIFT 7 +#define DA9150_EBS_EN_MASK BIT(7) + +/* DA9150_PPR_THYST_A = 0x174 */ +#define DA9150_TBAT_T1_SHIFT 0 +#define DA9150_TBAT_T1_MASK (0xff << 0) + +/* DA9150_PPR_THYST_B = 0x175 */ +#define DA9150_TBAT_T2_SHIFT 0 +#define DA9150_TBAT_T2_MASK (0xff << 0) + +/* DA9150_PPR_THYST_C = 0x176 */ +#define DA9150_TBAT_T3_SHIFT 0 +#define DA9150_TBAT_T3_MASK (0xff << 0) + +/* DA9150_PPR_THYST_D = 0x177 */ +#define DA9150_TBAT_T4_SHIFT 0 +#define DA9150_TBAT_T4_MASK (0xff << 0) + +/* DA9150_PPR_THYST_E = 0x178 */ +#define DA9150_TBAT_T5_SHIFT 0 +#define 
DA9150_TBAT_T5_MASK (0xff << 0) + +/* DA9150_PPR_THYST_F = 0x179 */ +#define DA9150_TBAT_H1_SHIFT 0 +#define DA9150_TBAT_H1_MASK (0xff << 0) + +/* DA9150_PPR_THYST_G = 0x17A */ +#define DA9150_TBAT_H5_SHIFT 0 +#define DA9150_TBAT_H5_MASK (0xff << 0) + +/* DA9150_PAGE_CON_3 = 0x180 */ +#define DA9150_PAGE_SHIFT 0 +#define DA9150_PAGE_MASK (0x3f << 0) +#define DA9150_WRITE_MODE_SHIFT 6 +#define DA9150_WRITE_MODE_MASK BIT(6) +#define DA9150_REVERT_SHIFT 7 +#define DA9150_REVERT_MASK BIT(7) + +/* DA9150_PAGE_CON_4 = 0x200 */ +#define DA9150_PAGE_SHIFT 0 +#define DA9150_PAGE_MASK (0x3f << 0) +#define DA9150_WRITE_MODE_SHIFT 6 +#define DA9150_WRITE_MODE_MASK BIT(6) +#define DA9150_REVERT_SHIFT 7 +#define DA9150_REVERT_MASK BIT(7) + +/* DA9150_PAGE_CON_5 = 0x280 */ +#define DA9150_PAGE_SHIFT 0 +#define DA9150_PAGE_MASK (0x3f << 0) +#define DA9150_WRITE_MODE_SHIFT 6 +#define DA9150_WRITE_MODE_MASK BIT(6) +#define DA9150_REVERT_SHIFT 7 +#define DA9150_REVERT_MASK BIT(7) + +/* DA9150_PAGE_CON_6 = 0x300 */ +#define DA9150_PAGE_SHIFT 0 +#define DA9150_PAGE_MASK (0x3f << 0) +#define DA9150_WRITE_MODE_SHIFT 6 +#define DA9150_WRITE_MODE_MASK BIT(6) +#define DA9150_REVERT_SHIFT 7 +#define DA9150_REVERT_MASK BIT(7) + +/* DA9150_COREBTLD_STAT_A = 0x302 */ +#define DA9150_BOOTLD_STAT_SHIFT 0 +#define DA9150_BOOTLD_STAT_MASK (0x03 << 0) +#define DA9150_CORE_LOCKUP_SHIFT 2 +#define DA9150_CORE_LOCKUP_MASK BIT(2) + +/* DA9150_COREBTLD_CTRL_A = 0x303 */ +#define DA9150_CORE_RESET_SHIFT 0 +#define DA9150_CORE_RESET_MASK BIT(0) +#define DA9150_CORE_STOP_SHIFT 1 +#define DA9150_CORE_STOP_MASK BIT(1) + +/* DA9150_CORE_CONFIG_A = 0x304 */ +#define DA9150_CORE_MEMMUX_SHIFT 0 +#define DA9150_CORE_MEMMUX_MASK (0x03 << 0) +#define DA9150_WDT_AUTO_START_SHIFT 2 +#define DA9150_WDT_AUTO_START_MASK BIT(2) +#define DA9150_WDT_AUTO_LOCK_SHIFT 3 +#define DA9150_WDT_AUTO_LOCK_MASK BIT(3) +#define DA9150_WDT_HLT_NO_CLK_SHIFT 4 +#define DA9150_WDT_HLT_NO_CLK_MASK BIT(4) + +/* DA9150_CORE_CONFIG_C = 0x305 */ +#define DA9150_CORE_SW_SIZE_SHIFT 0 +#define DA9150_CORE_SW_SIZE_MASK (0xff << 0) + +/* DA9150_CORE_CONFIG_B = 0x306 */ +#define DA9150_BOOTLD_EN_SHIFT 0 +#define DA9150_BOOTLD_EN_MASK BIT(0) +#define DA9150_CORE_EN_SHIFT 2 +#define DA9150_CORE_EN_MASK BIT(2) +#define DA9150_CORE_SW_SRC_SHIFT 3 +#define DA9150_CORE_SW_SRC_MASK (0x07 << 3) +#define DA9150_DEEP_SLEEP_EN_SHIFT 7 +#define DA9150_DEEP_SLEEP_EN_MASK BIT(7) + +/* DA9150_CORE_CFG_DATA_A = 0x307 */ +#define DA9150_CORE_CFG_DT_A_SHIFT 0 +#define DA9150_CORE_CFG_DT_A_MASK (0xff << 0) + +/* DA9150_CORE_CFG_DATA_B = 0x308 */ +#define DA9150_CORE_CFG_DT_B_SHIFT 0 +#define DA9150_CORE_CFG_DT_B_MASK (0xff << 0) + +/* DA9150_CORE_CMD_A = 0x309 */ +#define DA9150_CORE_CMD_SHIFT 0 +#define DA9150_CORE_CMD_MASK (0xff << 0) + +/* DA9150_CORE_DATA_A = 0x30A */ +#define DA9150_CORE_DATA_0_SHIFT 0 +#define DA9150_CORE_DATA_0_MASK (0xff << 0) + +/* DA9150_CORE_DATA_B = 0x30B */ +#define DA9150_CORE_DATA_1_SHIFT 0 +#define DA9150_CORE_DATA_1_MASK (0xff << 0) + +/* DA9150_CORE_DATA_C = 0x30C */ +#define DA9150_CORE_DATA_2_SHIFT 0 +#define DA9150_CORE_DATA_2_MASK (0xff << 0) + +/* DA9150_CORE_DATA_D = 0x30D */ +#define DA9150_CORE_DATA_3_SHIFT 0 +#define DA9150_CORE_DATA_3_MASK (0xff << 0) + +/* DA9150_CORE2WIRE_STAT_A = 0x310 */ +#define DA9150_FW_FWDL_ERR_SHIFT 7 +#define DA9150_FW_FWDL_ERR_MASK BIT(7) + +/* DA9150_CORE2WIRE_CTRL_A = 0x311 */ +#define DA9150_FW_FWDL_EN_SHIFT 0 +#define DA9150_FW_FWDL_EN_MASK BIT(0) +#define DA9150_FG_QIF_EN_SHIFT 1 +#define DA9150_FG_QIF_EN_MASK 
BIT(1) +#define DA9150_CORE_BASE_ADDR_SHIFT 4 +#define DA9150_CORE_BASE_ADDR_MASK (0x0f << 4) + +/* DA9150_FW_CTRL_A = 0x312 */ +#define DA9150_FW_SEAL_SHIFT 0 +#define DA9150_FW_SEAL_MASK (0xff << 0) + +/* DA9150_FW_CTRL_C = 0x313 */ +#define DA9150_FW_FWDL_CRC_SHIFT 0 +#define DA9150_FW_FWDL_CRC_MASK (0xff << 0) + +/* DA9150_FW_CTRL_D = 0x314 */ +#define DA9150_FW_FWDL_BASE_SHIFT 0 +#define DA9150_FW_FWDL_BASE_MASK (0x0f << 0) + +/* DA9150_FG_CTRL_A = 0x315 */ +#define DA9150_FG_QIF_CODE_SHIFT 0 +#define DA9150_FG_QIF_CODE_MASK (0xff << 0) + +/* DA9150_FG_CTRL_B = 0x316 */ +#define DA9150_FG_QIF_VALUE_SHIFT 0 +#define DA9150_FG_QIF_VALUE_MASK (0xff << 0) + +/* DA9150_FW_CTRL_E = 0x317 */ +#define DA9150_FW_FWDL_SEG_SHIFT 0 +#define DA9150_FW_FWDL_SEG_MASK (0xff << 0) + +/* DA9150_FW_CTRL_B = 0x318 */ +#define DA9150_FW_FWDL_VALUE_SHIFT 0 +#define DA9150_FW_FWDL_VALUE_MASK (0xff << 0) + +/* DA9150_GPADC_CMAN = 0x320 */ +#define DA9150_GPADC_CEN_SHIFT 0 +#define DA9150_GPADC_CEN_MASK BIT(0) +#define DA9150_GPADC_CMUX_SHIFT 1 +#define DA9150_GPADC_CMUX_MASK (0x1f << 1) + +/* DA9150_GPADC_CRES_A = 0x322 */ +#define DA9150_GPADC_CRES_H_SHIFT 0 +#define DA9150_GPADC_CRES_H_MASK (0xff << 0) + +/* DA9150_GPADC_CRES_B = 0x323 */ +#define DA9150_GPADC_CRUN_SHIFT 0 +#define DA9150_GPADC_CRUN_MASK BIT(0) +#define DA9150_GPADC_CRES_L_SHIFT 6 +#define DA9150_GPADC_CRES_L_MASK (0x03 << 6) + +/* DA9150_CC_CFG_A = 0x328 */ +#define DA9150_CC_EN_SHIFT 0 +#define DA9150_CC_EN_MASK BIT(0) +#define DA9150_CC_TIMEBASE_SHIFT 1 +#define DA9150_CC_TIMEBASE_MASK (0x03 << 1) +#define DA9150_CC_CFG_SHIFT 5 +#define DA9150_CC_CFG_MASK (0x03 << 5) +#define DA9150_CC_ENDLESS_MODE_SHIFT 7 +#define DA9150_CC_ENDLESS_MODE_MASK BIT(7) + +/* DA9150_CC_CFG_B = 0x329 */ +#define DA9150_CC_OPT_SHIFT 0 +#define DA9150_CC_OPT_MASK (0x03 << 0) +#define DA9150_CC_PREAMP_SHIFT 2 +#define DA9150_CC_PREAMP_MASK (0x03 << 2) + +/* DA9150_CC_ICHG_RES_A = 0x32A */ +#define DA9150_CC_ICHG_RES_H_SHIFT 0 +#define DA9150_CC_ICHG_RES_H_MASK (0xff << 0) + +/* DA9150_CC_ICHG_RES_B = 0x32B */ +#define DA9150_CC_ICHG_RES_L_SHIFT 3 +#define DA9150_CC_ICHG_RES_L_MASK (0x1f << 3) + +/* DA9150_CC_IAVG_RES_A = 0x32C */ +#define DA9150_CC_IAVG_RES_H_SHIFT 0 +#define DA9150_CC_IAVG_RES_H_MASK (0xff << 0) + +/* DA9150_CC_IAVG_RES_B = 0x32D */ +#define DA9150_CC_IAVG_RES_L_SHIFT 0 +#define DA9150_CC_IAVG_RES_L_MASK (0xff << 0) + +/* DA9150_TAUX_CTRL_A = 0x330 */ +#define DA9150_TAUX_EN_SHIFT 0 +#define DA9150_TAUX_EN_MASK BIT(0) +#define DA9150_TAUX_MOD_SHIFT 1 +#define DA9150_TAUX_MOD_MASK BIT(1) +#define DA9150_TAUX_UPDATE_SHIFT 2 +#define DA9150_TAUX_UPDATE_MASK BIT(2) + +/* DA9150_TAUX_RELOAD_H = 0x332 */ +#define DA9150_TAUX_RLD_H_SHIFT 0 +#define DA9150_TAUX_RLD_H_MASK (0xff << 0) + +/* DA9150_TAUX_RELOAD_L = 0x333 */ +#define DA9150_TAUX_RLD_L_SHIFT 3 +#define DA9150_TAUX_RLD_L_MASK (0x1f << 3) + +/* DA9150_TAUX_VALUE_H = 0x334 */ +#define DA9150_TAUX_VAL_H_SHIFT 0 +#define DA9150_TAUX_VAL_H_MASK (0xff << 0) + +/* DA9150_TAUX_VALUE_L = 0x335 */ +#define DA9150_TAUX_VAL_L_SHIFT 3 +#define DA9150_TAUX_VAL_L_MASK (0x1f << 3) + +/* DA9150_AUX_DATA_0 = 0x338 */ +#define DA9150_AUX_DAT_0_SHIFT 0 +#define DA9150_AUX_DAT_0_MASK (0xff << 0) + +/* DA9150_AUX_DATA_1 = 0x339 */ +#define DA9150_AUX_DAT_1_SHIFT 0 +#define DA9150_AUX_DAT_1_MASK (0xff << 0) + +/* DA9150_AUX_DATA_2 = 0x33A */ +#define DA9150_AUX_DAT_2_SHIFT 0 +#define DA9150_AUX_DAT_2_MASK (0xff << 0) + +/* DA9150_AUX_DATA_3 = 0x33B */ +#define DA9150_AUX_DAT_3_SHIFT 0 +#define 
DA9150_AUX_DAT_3_MASK (0xff << 0) + +/* DA9150_BIF_CTRL = 0x340 */ +#define DA9150_BIF_ISRC_EN_SHIFT 0 +#define DA9150_BIF_ISRC_EN_MASK BIT(0) + +/* DA9150_TBAT_CTRL_A = 0x342 */ +#define DA9150_TBAT_EN_SHIFT 0 +#define DA9150_TBAT_EN_MASK BIT(0) +#define DA9150_TBAT_SW1_SHIFT 1 +#define DA9150_TBAT_SW1_MASK BIT(1) +#define DA9150_TBAT_SW2_SHIFT 2 +#define DA9150_TBAT_SW2_MASK BIT(2) + +/* DA9150_TBAT_CTRL_B = 0x343 */ +#define DA9150_TBAT_SW_FRC_SHIFT 0 +#define DA9150_TBAT_SW_FRC_MASK BIT(0) +#define DA9150_TBAT_STAT_SW1_SHIFT 1 +#define DA9150_TBAT_STAT_SW1_MASK BIT(1) +#define DA9150_TBAT_STAT_SW2_SHIFT 2 +#define DA9150_TBAT_STAT_SW2_MASK BIT(2) +#define DA9150_TBAT_HIGH_CURR_SHIFT 3 +#define DA9150_TBAT_HIGH_CURR_MASK BIT(3) + +/* DA9150_TBAT_RES_A = 0x344 */ +#define DA9150_TBAT_RES_H_SHIFT 0 +#define DA9150_TBAT_RES_H_MASK (0xff << 0) + +/* DA9150_TBAT_RES_B = 0x345 */ +#define DA9150_TBAT_RES_DIS_SHIFT 0 +#define DA9150_TBAT_RES_DIS_MASK BIT(0) +#define DA9150_TBAT_RES_L_SHIFT 6 +#define DA9150_TBAT_RES_L_MASK (0x03 << 6) + +#endif /* __DA9150_REGISTERS_H */ -- cgit v1.2.3 From 774e0b41d485e18e655247441b8ca8b8dbe217c4 Mon Sep 17 00:00:00 2001 From: Todd E Brandt Date: Wed, 7 Jan 2015 13:25:52 -0800 Subject: mfd: axp20x: Add support for fuel gauge cell driver Add register definitions and a platform data structure for fuel gauge cell devices. Signed-off-by: Todd Brandt Acked-by: Jacob Pan Signed-off-by: Lee Jones --- include/linux/mfd/axp20x.h | 43 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h index 81589d176ae8..dfabd6db7ddf 100644 --- a/include/linux/mfd/axp20x.h +++ b/include/linux/mfd/axp20x.h @@ -124,10 +124,27 @@ enum { #define AXP288_PMIC_ADC_H 0x56 #define AXP288_PMIC_ADC_L 0x57 #define AXP288_ADC_TS_PIN_CTRL 0x84 - #define AXP288_PMIC_ADC_EN 0x84 -#define AXP288_FG_TUNE5 0xed +/* Fuel Gauge */ +#define AXP288_FG_RDC1_REG 0xba +#define AXP288_FG_RDC0_REG 0xbb +#define AXP288_FG_OCVH_REG 0xbc +#define AXP288_FG_OCVL_REG 0xbd +#define AXP288_FG_OCV_CURVE_REG 0xc0 +#define AXP288_FG_DES_CAP1_REG 0xe0 +#define AXP288_FG_DES_CAP0_REG 0xe1 +#define AXP288_FG_CC_MTR1_REG 0xe2 +#define AXP288_FG_CC_MTR0_REG 0xe3 +#define AXP288_FG_OCV_CAP_REG 0xe4 +#define AXP288_FG_CC_CAP_REG 0xe5 +#define AXP288_FG_LOW_CAP_REG 0xe6 +#define AXP288_FG_TUNE0 0xe8 +#define AXP288_FG_TUNE1 0xe9 +#define AXP288_FG_TUNE2 0xea +#define AXP288_FG_TUNE3 0xeb +#define AXP288_FG_TUNE4 0xec +#define AXP288_FG_TUNE5 0xed /* Regulators IDs */ enum { @@ -236,4 +253,26 @@ struct axp20x_dev { const struct regmap_irq_chip *regmap_irq_chip; }; +#define BATTID_LEN 64 +#define OCV_CURVE_SIZE 32 +#define MAX_THERM_CURVE_SIZE 25 +#define PD_DEF_MIN_TEMP 0 +#define PD_DEF_MAX_TEMP 55 + +struct axp20x_fg_pdata { + char battid[BATTID_LEN + 1]; + int design_cap; + int min_volt; + int max_volt; + int max_temp; + int min_temp; + int cap1; + int cap0; + int rdc1; + int rdc0; + int ocv_curve[OCV_CURVE_SIZE]; + int tcsz; + int thermistor_curve[MAX_THERM_CURVE_SIZE][2]; +}; + #endif /* __LINUX_MFD_AXP20X_H */ -- cgit v1.2.3 From 2b50635ea376c89082ef1f3204bcd7791a6e37d7 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 29 Dec 2014 10:09:16 +0100 Subject: mfd: max77686/802: Remove support for board files The driver is used only on Exynos based boards with DTS support.
After removal of board file support from the max77686 and max77802 regulator drivers, the MFD driver can be converted to a DTS-only version. This simplifies the code a little: 1. No dead (unused) entries in the platform_data structure. 2. More code removed. 3. The regulator driver does not depend on memory allocated by the MFD driver. 4. It also makes extending the regulator driver easier. Add a dependency on CONFIG_OF to the max77686 MFD driver, because without DTS the regulator drivers (max77686 and max77802) won't bind. Signed-off-by: Krzysztof Kozlowski Reviewed-by: Javier Martinez Canillas Tested-by: Javier Martinez Canillas Signed-off-by: Lee Jones --- drivers/mfd/Kconfig | 1 + drivers/mfd/max77686.c | 23 ----------------------- include/linux/mfd/max77686-private.h | 1 - include/linux/mfd/max77686.h | 28 ---------------------------- 4 files changed, 1 insertion(+), 52 deletions(-) (limited to 'include/linux') diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index c47b2da7986e..be5b684eefcf 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -429,6 +429,7 @@ config MFD_MAX14577 config MFD_MAX77686 bool "Maxim Semiconductor MAX77686/802 PMIC Support" depends on I2C=y + depends on OF select MFD_CORE select REGMAP_I2C select REGMAP_IRQ diff --git a/drivers/mfd/max77686.c b/drivers/mfd/max77686.c index 2b2f2ccda523..760d08d7923d 100644 --- a/drivers/mfd/max77686.c +++ b/drivers/mfd/max77686.c @@ -205,24 +205,10 @@ static const struct of_device_id max77686_pmic_dt_match[] = { { }, }; -static struct max77686_platform_data *max77686_i2c_parse_dt_pdata(struct device - *dev) -{ - struct max77686_platform_data *pd; - - pd = devm_kzalloc(dev, sizeof(*pd), GFP_KERNEL); - if (!pd) - return NULL; - - dev->platform_data = pd; - return pd; -} - static int max77686_i2c_probe(struct i2c_client *i2c, const struct i2c_device_id *id) { struct max77686_dev *max77686 = NULL; - struct max77686_platform_data *pdata = dev_get_platdata(&i2c->dev); const struct of_device_id *match; unsigned int data; int ret = 0; @@ -233,14 +219,6 @@ static int max77686_i2c_probe(struct i2c_client *i2c, const struct mfd_cell *cells; int n_devs; - if (IS_ENABLED(CONFIG_OF) && i2c->dev.of_node && !pdata) - pdata = max77686_i2c_parse_dt_pdata(&i2c->dev); - - if (!pdata) { - dev_err(&i2c->dev, "No platform data found.\n"); - return -EINVAL; - } - max77686 = devm_kzalloc(&i2c->dev, sizeof(struct max77686_dev), GFP_KERNEL); if (!max77686) @@ -259,7 +237,6 @@ static int max77686_i2c_probe(struct i2c_client *i2c, max77686->dev = &i2c->dev; max77686->i2c = i2c; - max77686->wakeup = pdata->wakeup; max77686->irq = i2c->irq; if (max77686->type == TYPE_MAX77686) { diff --git a/include/linux/mfd/max77686-private.h b/include/linux/mfd/max77686-private.h index 960b92ad450d..f5043490d67c 100644 --- a/include/linux/mfd/max77686-private.h +++ b/include/linux/mfd/max77686-private.h @@ -447,7 +447,6 @@ struct max77686_dev { struct regmap_irq_chip_data *rtc_irq_data; int irq; - bool wakeup; struct mutex irqlock; int irq_masks_cur[MAX77686_IRQ_GROUP_NR]; int irq_masks_cache[MAX77686_IRQ_GROUP_NR]; diff --git a/include/linux/mfd/max77686.h b/include/linux/mfd/max77686.h index 553f7d09258a..bb995ab9a575 100644 --- a/include/linux/mfd/max77686.h +++ b/include/linux/mfd/max77686.h @@ -119,12 +119,6 @@ enum max77802_regulators { MAX77802_REG_MAX, }; -struct max77686_regulator_data { - int id; - struct regulator_init_data *initdata; - struct device_node *of_node; -}; - enum max77686_opmode { MAX77686_OPMODE_NORMAL, MAX77686_OPMODE_LP, @@ -136,26 +130,4 @@
struct max77686_opmode_data { int mode; }; -struct max77686_platform_data { - int ono; - int wakeup; - - /* ---- PMIC ---- */ - struct max77686_regulator_data *regulators; - int num_regulators; - - struct max77686_opmode_data *opmode_data; - - /* - * GPIO-DVS feature is not enabled with the current version of - * MAX77686 driver. Buck2/3/4_voltages[0] is used as the default - * voltage at probe. DVS/SELB gpios are set as OUTPUT-LOW. - */ - int buck234_gpio_dvs[3]; /* GPIO of [0]DVS1, [1]DVS2, [2]DVS3 */ - int buck234_gpio_selb[3]; /* [0]SELB2, [1]SELB3, [2]SELB4 */ - unsigned int buck2_voltage[8]; /* buckx_voltage in uV */ - unsigned int buck3_voltage[8]; - unsigned int buck4_voltage[8]; -}; - #endif /* __LINUX_MFD_MAX77686_H */ -- cgit v1.2.3 From 58e214382bdd1eb48c5a3519182bddcb26edabad Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Wed, 26 Nov 2014 13:51:00 -0800 Subject: mfd: qcom-rpm: Driver for the Qualcomm RPM Driver for the Resource Power Manager (RPM) found in Qualcomm 8660, 8960 and 8064 based devices. The driver exposes resources that child drivers can operate on, in order to implement regulator, clock and bus frequency drivers. Signed-off-by: Bjorn Andersson Signed-off-by: Lee Jones --- drivers/mfd/Kconfig | 14 ++ drivers/mfd/Makefile | 1 + drivers/mfd/qcom_rpm.c | 581 +++++++++++++++++++++++++++++++++++++++ include/linux/mfd/qcom_rpm.h | 13 + 4 files changed, 609 insertions(+) create mode 100644 drivers/mfd/qcom_rpm.c create mode 100644 include/linux/mfd/qcom_rpm.h (limited to 'include/linux') diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index be5b684eefcf..38356e39adba 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -602,6 +602,20 @@ config MFD_PM8921_CORE Say M here if you want to include support for PM8921 chip as a module. This will build a module called "pm8921-core". +config MFD_QCOM_RPM + tristate "Qualcomm Resource Power Manager (RPM)" + depends on ARCH_QCOM && OF + help + If you say yes to this option, support will be included for the + Resource Power Manager system found in the Qualcomm 8660, 8960 and + 8064 based devices. + + This is required to access many regulators, clocks and bus + frequencies controlled by the RPM on these devices. + + Say M here if you want to include support for the Qualcomm RPM as a + module. This will build a module called "qcom_rpm". + config MFD_SPMI_PMIC tristate "Qualcomm SPMI PMICs" depends on ARCH_QCOM || COMPILE_TEST diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index 02c2f2c69b94..19f3d744e3bd 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -153,6 +153,7 @@ obj-$(CONFIG_MFD_SI476X_CORE) += si476x-core.o obj-$(CONFIG_MFD_CS5535) += cs5535-mfd.o obj-$(CONFIG_MFD_OMAP_USB_HOST) += omap-usb-host.o omap-usb-tll.o obj-$(CONFIG_MFD_PM8921_CORE) += pm8921-core.o ssbi.o +obj-$(CONFIG_MFD_QCOM_RPM) += qcom_rpm.o obj-$(CONFIG_MFD_SPMI_PMIC) += qcom-spmi-pmic.o obj-$(CONFIG_TPS65911_COMPARATOR) += tps65911-comparator.o obj-$(CONFIG_MFD_TPS65090) += tps65090.o diff --git a/drivers/mfd/qcom_rpm.c b/drivers/mfd/qcom_rpm.c new file mode 100644 index 000000000000..f696328c2933 --- /dev/null +++ b/drivers/mfd/qcom_rpm.c @@ -0,0 +1,581 @@ +/* + * Copyright (c) 2014, Sony Mobile Communications AB. + * Copyright (c) 2013, The Linux Foundation. All rights reserved.
+ * Author: Bjorn Andersson + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/module.h> +#include <linux/platform_device.h> +#include <linux/of_platform.h> +#include <linux/io.h> +#include <linux/interrupt.h> +#include <linux/mfd/qcom_rpm.h> +#include <linux/mfd/syscon.h> +#include <linux/regmap.h> + +#include <dt-bindings/mfd/qcom-rpm.h> + +struct qcom_rpm_resource { + unsigned target_id; + unsigned status_id; + unsigned select_id; + unsigned size; +}; + +struct qcom_rpm_data { + u32 version; + const struct qcom_rpm_resource *resource_table; + unsigned n_resources; +}; + +struct qcom_rpm { + struct device *dev; + struct regmap *ipc_regmap; + unsigned ipc_offset; + unsigned ipc_bit; + + struct completion ack; + struct mutex lock; + + void __iomem *status_regs; + void __iomem *ctrl_regs; + void __iomem *req_regs; + + u32 ack_status; + + const struct qcom_rpm_data *data; +}; + +#define RPM_STATUS_REG(rpm, i) ((rpm)->status_regs + (i) * 4) +#define RPM_CTRL_REG(rpm, i) ((rpm)->ctrl_regs + (i) * 4) +#define RPM_REQ_REG(rpm, i) ((rpm)->req_regs + (i) * 4) + +#define RPM_REQUEST_TIMEOUT (5 * HZ) + +#define RPM_REQUEST_CONTEXT 3 +#define RPM_REQ_SELECT 11 +#define RPM_ACK_CONTEXT 15 +#define RPM_ACK_SELECTOR 23 +#define RPM_SELECT_SIZE 7 + +#define RPM_NOTIFICATION BIT(30) +#define RPM_REJECTED BIT(31) + +#define RPM_SIGNAL BIT(2) + +static const struct qcom_rpm_resource apq8064_rpm_resource_table[] = { + [QCOM_RPM_CXO_CLK] = { 25, 9, 5, 1 }, + [QCOM_RPM_PXO_CLK] = { 26, 10, 6, 1 }, + [QCOM_RPM_APPS_FABRIC_CLK] = { 27, 11, 8, 1 }, + [QCOM_RPM_SYS_FABRIC_CLK] = { 28, 12, 9, 1 }, + [QCOM_RPM_MM_FABRIC_CLK] = { 29, 13, 10, 1 }, + [QCOM_RPM_DAYTONA_FABRIC_CLK] = { 30, 14, 11, 1 }, + [QCOM_RPM_SFPB_CLK] = { 31, 15, 12, 1 }, + [QCOM_RPM_CFPB_CLK] = { 32, 16, 13, 1 }, + [QCOM_RPM_MMFPB_CLK] = { 33, 17, 14, 1 }, + [QCOM_RPM_EBI1_CLK] = { 34, 18, 16, 1 }, + [QCOM_RPM_APPS_FABRIC_HALT] = { 35, 19, 18, 1 }, + [QCOM_RPM_APPS_FABRIC_MODE] = { 37, 20, 19, 1 }, + [QCOM_RPM_APPS_FABRIC_IOCTL] = { 40, 21, 20, 1 }, + [QCOM_RPM_APPS_FABRIC_ARB] = { 41, 22, 21, 12 }, + [QCOM_RPM_SYS_FABRIC_HALT] = { 53, 23, 22, 1 }, + [QCOM_RPM_SYS_FABRIC_MODE] = { 55, 24, 23, 1 }, + [QCOM_RPM_SYS_FABRIC_IOCTL] = { 58, 25, 24, 1 }, + [QCOM_RPM_SYS_FABRIC_ARB] = { 59, 26, 25, 30 }, + [QCOM_RPM_MM_FABRIC_HALT] = { 89, 27, 26, 1 }, + [QCOM_RPM_MM_FABRIC_MODE] = { 91, 28, 27, 1 }, + [QCOM_RPM_MM_FABRIC_IOCTL] = { 94, 29, 28, 1 }, + [QCOM_RPM_MM_FABRIC_ARB] = { 95, 30, 29, 21 }, + [QCOM_RPM_PM8921_SMPS1] = { 116, 31, 30, 2 }, + [QCOM_RPM_PM8921_SMPS2] = { 118, 33, 31, 2 }, + [QCOM_RPM_PM8921_SMPS3] = { 120, 35, 32, 2 }, + [QCOM_RPM_PM8921_SMPS4] = { 122, 37, 33, 2 }, + [QCOM_RPM_PM8921_SMPS5] = { 124, 39, 34, 2 }, + [QCOM_RPM_PM8921_SMPS6] = { 126, 41, 35, 2 }, + [QCOM_RPM_PM8921_SMPS7] = { 128, 43, 36, 2 }, + [QCOM_RPM_PM8921_SMPS8] = { 130, 45, 37, 2 }, + [QCOM_RPM_PM8921_LDO1] = { 132, 47, 38, 2 }, + [QCOM_RPM_PM8921_LDO2] = { 134, 49, 39, 2 }, + [QCOM_RPM_PM8921_LDO3] = { 136, 51, 40, 2 }, + [QCOM_RPM_PM8921_LDO4] = { 138, 53, 41, 2 }, + [QCOM_RPM_PM8921_LDO5] = { 140, 55, 42, 2 }, + [QCOM_RPM_PM8921_LDO6] = { 142, 57, 43, 2 }, + [QCOM_RPM_PM8921_LDO7] = { 144, 59, 44, 2 }, + [QCOM_RPM_PM8921_LDO8] = { 146, 61, 45, 2 }, + [QCOM_RPM_PM8921_LDO9] = { 148, 63, 46, 2 }, +
[QCOM_RPM_PM8921_LDO10] = { 150, 65, 47, 2 }, + [QCOM_RPM_PM8921_LDO11] = { 152, 67, 48, 2 }, + [QCOM_RPM_PM8921_LDO12] = { 154, 69, 49, 2 }, + [QCOM_RPM_PM8921_LDO13] = { 156, 71, 50, 2 }, + [QCOM_RPM_PM8921_LDO14] = { 158, 73, 51, 2 }, + [QCOM_RPM_PM8921_LDO15] = { 160, 75, 52, 2 }, + [QCOM_RPM_PM8921_LDO16] = { 162, 77, 53, 2 }, + [QCOM_RPM_PM8921_LDO17] = { 164, 79, 54, 2 }, + [QCOM_RPM_PM8921_LDO18] = { 166, 81, 55, 2 }, + [QCOM_RPM_PM8921_LDO19] = { 168, 83, 56, 2 }, + [QCOM_RPM_PM8921_LDO20] = { 170, 85, 57, 2 }, + [QCOM_RPM_PM8921_LDO21] = { 172, 87, 58, 2 }, + [QCOM_RPM_PM8921_LDO22] = { 174, 89, 59, 2 }, + [QCOM_RPM_PM8921_LDO23] = { 176, 91, 60, 2 }, + [QCOM_RPM_PM8921_LDO24] = { 178, 93, 61, 2 }, + [QCOM_RPM_PM8921_LDO25] = { 180, 95, 62, 2 }, + [QCOM_RPM_PM8921_LDO26] = { 182, 97, 63, 2 }, + [QCOM_RPM_PM8921_LDO27] = { 184, 99, 64, 2 }, + [QCOM_RPM_PM8921_LDO28] = { 186, 101, 65, 2 }, + [QCOM_RPM_PM8921_LDO29] = { 188, 103, 66, 2 }, + [QCOM_RPM_PM8921_CLK1] = { 190, 105, 67, 2 }, + [QCOM_RPM_PM8921_CLK2] = { 192, 107, 68, 2 }, + [QCOM_RPM_PM8921_LVS1] = { 194, 109, 69, 1 }, + [QCOM_RPM_PM8921_LVS2] = { 195, 110, 70, 1 }, + [QCOM_RPM_PM8921_LVS3] = { 196, 111, 71, 1 }, + [QCOM_RPM_PM8921_LVS4] = { 197, 112, 72, 1 }, + [QCOM_RPM_PM8921_LVS5] = { 198, 113, 73, 1 }, + [QCOM_RPM_PM8921_LVS6] = { 199, 114, 74, 1 }, + [QCOM_RPM_PM8921_LVS7] = { 200, 115, 75, 1 }, + [QCOM_RPM_PM8821_SMPS1] = { 201, 116, 76, 2 }, + [QCOM_RPM_PM8821_SMPS2] = { 203, 118, 77, 2 }, + [QCOM_RPM_PM8821_LDO1] = { 205, 120, 78, 2 }, + [QCOM_RPM_PM8921_NCP] = { 207, 122, 80, 2 }, + [QCOM_RPM_CXO_BUFFERS] = { 209, 124, 81, 1 }, + [QCOM_RPM_USB_OTG_SWITCH] = { 210, 125, 82, 1 }, + [QCOM_RPM_HDMI_SWITCH] = { 211, 126, 83, 1 }, + [QCOM_RPM_DDR_DMM] = { 212, 127, 84, 2 }, + [QCOM_RPM_VDDMIN_GPIO] = { 215, 131, 89, 1 }, +}; + +static const struct qcom_rpm_data apq8064_template = { + .version = 3, + .resource_table = apq8064_rpm_resource_table, + .n_resources = ARRAY_SIZE(apq8064_rpm_resource_table), +}; + +static const struct qcom_rpm_resource msm8660_rpm_resource_table[] = { + [QCOM_RPM_CXO_CLK] = { 32, 12, 5, 1 }, + [QCOM_RPM_PXO_CLK] = { 33, 13, 6, 1 }, + [QCOM_RPM_PLL_4] = { 34, 14, 7, 1 }, + [QCOM_RPM_APPS_FABRIC_CLK] = { 35, 15, 8, 1 }, + [QCOM_RPM_SYS_FABRIC_CLK] = { 36, 16, 9, 1 }, + [QCOM_RPM_MM_FABRIC_CLK] = { 37, 17, 10, 1 }, + [QCOM_RPM_DAYTONA_FABRIC_CLK] = { 38, 18, 11, 1 }, + [QCOM_RPM_SFPB_CLK] = { 39, 19, 12, 1 }, + [QCOM_RPM_CFPB_CLK] = { 40, 20, 13, 1 }, + [QCOM_RPM_MMFPB_CLK] = { 41, 21, 14, 1 }, + [QCOM_RPM_SMI_CLK] = { 42, 22, 15, 1 }, + [QCOM_RPM_EBI1_CLK] = { 43, 23, 16, 1 }, + [QCOM_RPM_APPS_L2_CACHE_CTL] = { 44, 24, 17, 1 }, + [QCOM_RPM_APPS_FABRIC_HALT] = { 45, 25, 18, 2 }, + [QCOM_RPM_APPS_FABRIC_MODE] = { 47, 26, 19, 3 }, + [QCOM_RPM_APPS_FABRIC_ARB] = { 51, 28, 21, 6 }, + [QCOM_RPM_SYS_FABRIC_HALT] = { 63, 29, 22, 2 }, + [QCOM_RPM_SYS_FABRIC_MODE] = { 65, 30, 23, 3 }, + [QCOM_RPM_SYS_FABRIC_ARB] = { 69, 32, 25, 22 }, + [QCOM_RPM_MM_FABRIC_HALT] = { 105, 33, 26, 2 }, + [QCOM_RPM_MM_FABRIC_MODE] = { 107, 34, 27, 3 }, + [QCOM_RPM_MM_FABRIC_ARB] = { 111, 36, 29, 23 }, + [QCOM_RPM_PM8901_SMPS0] = { 134, 37, 30, 2 }, + [QCOM_RPM_PM8901_SMPS1] = { 136, 39, 31, 2 }, + [QCOM_RPM_PM8901_SMPS2] = { 138, 41, 32, 2 }, + [QCOM_RPM_PM8901_SMPS3] = { 140, 43, 33, 2 }, + [QCOM_RPM_PM8901_SMPS4] = { 142, 45, 34, 2 }, + [QCOM_RPM_PM8901_LDO0] = { 144, 47, 35, 2 }, + [QCOM_RPM_PM8901_LDO1] = { 146, 49, 36, 2 }, + [QCOM_RPM_PM8901_LDO2] = { 148, 51, 37, 2 }, + [QCOM_RPM_PM8901_LDO3] = { 150, 53, 38, 2 }, + 
[QCOM_RPM_PM8901_LDO4] = { 152, 55, 39, 2 }, + [QCOM_RPM_PM8901_LDO5] = { 154, 57, 40, 2 }, + [QCOM_RPM_PM8901_LDO6] = { 156, 59, 41, 2 }, + [QCOM_RPM_PM8901_LVS0] = { 158, 61, 42, 1 }, + [QCOM_RPM_PM8901_LVS1] = { 159, 62, 43, 1 }, + [QCOM_RPM_PM8901_LVS2] = { 160, 63, 44, 1 }, + [QCOM_RPM_PM8901_LVS3] = { 161, 64, 45, 1 }, + [QCOM_RPM_PM8901_MVS] = { 162, 65, 46, 1 }, + [QCOM_RPM_PM8058_SMPS0] = { 163, 66, 47, 2 }, + [QCOM_RPM_PM8058_SMPS1] = { 165, 68, 48, 2 }, + [QCOM_RPM_PM8058_SMPS2] = { 167, 70, 49, 2 }, + [QCOM_RPM_PM8058_SMPS3] = { 169, 72, 50, 2 }, + [QCOM_RPM_PM8058_SMPS4] = { 171, 74, 51, 2 }, + [QCOM_RPM_PM8058_LDO0] = { 173, 76, 52, 2 }, + [QCOM_RPM_PM8058_LDO1] = { 175, 78, 53, 2 }, + [QCOM_RPM_PM8058_LDO2] = { 177, 80, 54, 2 }, + [QCOM_RPM_PM8058_LDO3] = { 179, 82, 55, 2 }, + [QCOM_RPM_PM8058_LDO4] = { 181, 84, 56, 2 }, + [QCOM_RPM_PM8058_LDO5] = { 183, 86, 57, 2 }, + [QCOM_RPM_PM8058_LDO6] = { 185, 88, 58, 2 }, + [QCOM_RPM_PM8058_LDO7] = { 187, 90, 59, 2 }, + [QCOM_RPM_PM8058_LDO8] = { 189, 92, 60, 2 }, + [QCOM_RPM_PM8058_LDO9] = { 191, 94, 61, 2 }, + [QCOM_RPM_PM8058_LDO10] = { 193, 96, 62, 2 }, + [QCOM_RPM_PM8058_LDO11] = { 195, 98, 63, 2 }, + [QCOM_RPM_PM8058_LDO12] = { 197, 100, 64, 2 }, + [QCOM_RPM_PM8058_LDO13] = { 199, 102, 65, 2 }, + [QCOM_RPM_PM8058_LDO14] = { 201, 104, 66, 2 }, + [QCOM_RPM_PM8058_LDO15] = { 203, 106, 67, 2 }, + [QCOM_RPM_PM8058_LDO16] = { 205, 108, 68, 2 }, + [QCOM_RPM_PM8058_LDO17] = { 207, 110, 69, 2 }, + [QCOM_RPM_PM8058_LDO18] = { 209, 112, 70, 2 }, + [QCOM_RPM_PM8058_LDO19] = { 211, 114, 71, 2 }, + [QCOM_RPM_PM8058_LDO20] = { 213, 116, 72, 2 }, + [QCOM_RPM_PM8058_LDO21] = { 215, 118, 73, 2 }, + [QCOM_RPM_PM8058_LDO22] = { 217, 120, 74, 2 }, + [QCOM_RPM_PM8058_LDO23] = { 219, 122, 75, 2 }, + [QCOM_RPM_PM8058_LDO24] = { 221, 124, 76, 2 }, + [QCOM_RPM_PM8058_LDO25] = { 223, 126, 77, 2 }, + [QCOM_RPM_PM8058_LVS0] = { 225, 128, 78, 1 }, + [QCOM_RPM_PM8058_LVS1] = { 226, 129, 79, 1 }, + [QCOM_RPM_PM8058_NCP] = { 227, 130, 80, 2 }, + [QCOM_RPM_CXO_BUFFERS] = { 229, 132, 81, 1 }, +}; + +static const struct qcom_rpm_data msm8660_template = { + .version = 2, + .resource_table = msm8660_rpm_resource_table, + .n_resources = ARRAY_SIZE(msm8660_rpm_resource_table), +}; + +static const struct qcom_rpm_resource msm8960_rpm_resource_table[] = { + [QCOM_RPM_CXO_CLK] = { 25, 9, 5, 1 }, + [QCOM_RPM_PXO_CLK] = { 26, 10, 6, 1 }, + [QCOM_RPM_APPS_FABRIC_CLK] = { 27, 11, 8, 1 }, + [QCOM_RPM_SYS_FABRIC_CLK] = { 28, 12, 9, 1 }, + [QCOM_RPM_MM_FABRIC_CLK] = { 29, 13, 10, 1 }, + [QCOM_RPM_DAYTONA_FABRIC_CLK] = { 30, 14, 11, 1 }, + [QCOM_RPM_SFPB_CLK] = { 31, 15, 12, 1 }, + [QCOM_RPM_CFPB_CLK] = { 32, 16, 13, 1 }, + [QCOM_RPM_MMFPB_CLK] = { 33, 17, 14, 1 }, + [QCOM_RPM_EBI1_CLK] = { 34, 18, 16, 1 }, + [QCOM_RPM_APPS_FABRIC_HALT] = { 35, 19, 18, 1 }, + [QCOM_RPM_APPS_FABRIC_MODE] = { 37, 20, 19, 1 }, + [QCOM_RPM_APPS_FABRIC_IOCTL] = { 40, 21, 20, 1 }, + [QCOM_RPM_APPS_FABRIC_ARB] = { 41, 22, 21, 12 }, + [QCOM_RPM_SYS_FABRIC_HALT] = { 53, 23, 22, 1 }, + [QCOM_RPM_SYS_FABRIC_MODE] = { 55, 24, 23, 1 }, + [QCOM_RPM_SYS_FABRIC_IOCTL] = { 58, 25, 24, 1 }, + [QCOM_RPM_SYS_FABRIC_ARB] = { 59, 26, 25, 29 }, + [QCOM_RPM_MM_FABRIC_HALT] = { 88, 27, 26, 1 }, + [QCOM_RPM_MM_FABRIC_MODE] = { 90, 28, 27, 1 }, + [QCOM_RPM_MM_FABRIC_IOCTL] = { 93, 29, 28, 1 }, + [QCOM_RPM_MM_FABRIC_ARB] = { 94, 30, 29, 23 }, + [QCOM_RPM_PM8921_SMPS1] = { 117, 31, 30, 2 }, + [QCOM_RPM_PM8921_SMPS2] = { 119, 33, 31, 2 }, + [QCOM_RPM_PM8921_SMPS3] = { 121, 35, 32, 2 }, + [QCOM_RPM_PM8921_SMPS4] = { 123, 37, 
33, 2 }, + [QCOM_RPM_PM8921_SMPS5] = { 125, 39, 34, 2 }, + [QCOM_RPM_PM8921_SMPS6] = { 127, 41, 35, 2 }, + [QCOM_RPM_PM8921_SMPS7] = { 129, 43, 36, 2 }, + [QCOM_RPM_PM8921_SMPS8] = { 131, 45, 37, 2 }, + [QCOM_RPM_PM8921_LDO1] = { 133, 47, 38, 2 }, + [QCOM_RPM_PM8921_LDO2] = { 135, 49, 39, 2 }, + [QCOM_RPM_PM8921_LDO3] = { 137, 51, 40, 2 }, + [QCOM_RPM_PM8921_LDO4] = { 139, 53, 41, 2 }, + [QCOM_RPM_PM8921_LDO5] = { 141, 55, 42, 2 }, + [QCOM_RPM_PM8921_LDO6] = { 143, 57, 43, 2 }, + [QCOM_RPM_PM8921_LDO7] = { 145, 59, 44, 2 }, + [QCOM_RPM_PM8921_LDO8] = { 147, 61, 45, 2 }, + [QCOM_RPM_PM8921_LDO9] = { 149, 63, 46, 2 }, + [QCOM_RPM_PM8921_LDO10] = { 151, 65, 47, 2 }, + [QCOM_RPM_PM8921_LDO11] = { 153, 67, 48, 2 }, + [QCOM_RPM_PM8921_LDO12] = { 155, 69, 49, 2 }, + [QCOM_RPM_PM8921_LDO13] = { 157, 71, 50, 2 }, + [QCOM_RPM_PM8921_LDO14] = { 159, 73, 51, 2 }, + [QCOM_RPM_PM8921_LDO15] = { 161, 75, 52, 2 }, + [QCOM_RPM_PM8921_LDO16] = { 163, 77, 53, 2 }, + [QCOM_RPM_PM8921_LDO17] = { 165, 79, 54, 2 }, + [QCOM_RPM_PM8921_LDO18] = { 167, 81, 55, 2 }, + [QCOM_RPM_PM8921_LDO19] = { 169, 83, 56, 2 }, + [QCOM_RPM_PM8921_LDO20] = { 171, 85, 57, 2 }, + [QCOM_RPM_PM8921_LDO21] = { 173, 87, 58, 2 }, + [QCOM_RPM_PM8921_LDO22] = { 175, 89, 59, 2 }, + [QCOM_RPM_PM8921_LDO23] = { 177, 91, 60, 2 }, + [QCOM_RPM_PM8921_LDO24] = { 179, 93, 61, 2 }, + [QCOM_RPM_PM8921_LDO25] = { 181, 95, 62, 2 }, + [QCOM_RPM_PM8921_LDO26] = { 183, 97, 63, 2 }, + [QCOM_RPM_PM8921_LDO27] = { 185, 99, 64, 2 }, + [QCOM_RPM_PM8921_LDO28] = { 187, 101, 65, 2 }, + [QCOM_RPM_PM8921_LDO29] = { 189, 103, 66, 2 }, + [QCOM_RPM_PM8921_CLK1] = { 191, 105, 67, 2 }, + [QCOM_RPM_PM8921_CLK2] = { 193, 107, 68, 2 }, + [QCOM_RPM_PM8921_LVS1] = { 195, 109, 69, 1 }, + [QCOM_RPM_PM8921_LVS2] = { 196, 110, 70, 1 }, + [QCOM_RPM_PM8921_LVS3] = { 197, 111, 71, 1 }, + [QCOM_RPM_PM8921_LVS4] = { 198, 112, 72, 1 }, + [QCOM_RPM_PM8921_LVS5] = { 199, 113, 73, 1 }, + [QCOM_RPM_PM8921_LVS6] = { 200, 114, 74, 1 }, + [QCOM_RPM_PM8921_LVS7] = { 201, 115, 75, 1 }, + [QCOM_RPM_PM8921_NCP] = { 202, 116, 80, 2 }, + [QCOM_RPM_CXO_BUFFERS] = { 204, 118, 81, 1 }, + [QCOM_RPM_USB_OTG_SWITCH] = { 205, 119, 82, 1 }, + [QCOM_RPM_HDMI_SWITCH] = { 206, 120, 83, 1 }, + [QCOM_RPM_DDR_DMM] = { 207, 121, 84, 2 }, +}; + +static const struct qcom_rpm_data msm8960_template = { + .version = 3, + .resource_table = msm8960_rpm_resource_table, + .n_resources = ARRAY_SIZE(msm8960_rpm_resource_table), +}; + +static const struct of_device_id qcom_rpm_of_match[] = { + { .compatible = "qcom,rpm-apq8064", .data = &apq8064_template }, + { .compatible = "qcom,rpm-msm8660", .data = &msm8660_template }, + { .compatible = "qcom,rpm-msm8960", .data = &msm8960_template }, + { } +}; +MODULE_DEVICE_TABLE(of, qcom_rpm_of_match); + +int qcom_rpm_write(struct qcom_rpm *rpm, + int state, + int resource, + u32 *buf, size_t count) +{ + const struct qcom_rpm_resource *res; + const struct qcom_rpm_data *data = rpm->data; + u32 sel_mask[RPM_SELECT_SIZE] = { 0 }; + int left; + int ret = 0; + int i; + + if (WARN_ON(resource < 0 || resource >= data->n_resources)) + return -EINVAL; + + res = &data->resource_table[resource]; + if (WARN_ON(res->size != count)) + return -EINVAL; + + mutex_lock(&rpm->lock); + + for (i = 0; i < res->size; i++) + writel_relaxed(buf[i], RPM_REQ_REG(rpm, res->target_id + i)); + + bitmap_set((unsigned long *)sel_mask, res->select_id, 1); + for (i = 0; i < ARRAY_SIZE(sel_mask); i++) { + writel_relaxed(sel_mask[i], + RPM_CTRL_REG(rpm, RPM_REQ_SELECT + i)); + } + + writel_relaxed(BIT(state), 
RPM_CTRL_REG(rpm, RPM_REQUEST_CONTEXT)); + + reinit_completion(&rpm->ack); + regmap_write(rpm->ipc_regmap, rpm->ipc_offset, BIT(rpm->ipc_bit)); + + left = wait_for_completion_timeout(&rpm->ack, RPM_REQUEST_TIMEOUT); + if (!left) + ret = -ETIMEDOUT; + else if (rpm->ack_status & RPM_REJECTED) + ret = -EIO; + + mutex_unlock(&rpm->lock); + + return ret; +} +EXPORT_SYMBOL(qcom_rpm_write); + +static irqreturn_t qcom_rpm_ack_interrupt(int irq, void *dev) +{ + struct qcom_rpm *rpm = dev; + u32 ack; + int i; + + ack = readl_relaxed(RPM_CTRL_REG(rpm, RPM_ACK_CONTEXT)); + for (i = 0; i < RPM_SELECT_SIZE; i++) + writel_relaxed(0, RPM_CTRL_REG(rpm, RPM_ACK_SELECTOR + i)); + writel(0, RPM_CTRL_REG(rpm, RPM_ACK_CONTEXT)); + + if (ack & RPM_NOTIFICATION) { + dev_warn(rpm->dev, "ignoring notification!\n"); + } else { + rpm->ack_status = ack; + complete(&rpm->ack); + } + + return IRQ_HANDLED; +} + +static irqreturn_t qcom_rpm_err_interrupt(int irq, void *dev) +{ + struct qcom_rpm *rpm = dev; + + regmap_write(rpm->ipc_regmap, rpm->ipc_offset, BIT(rpm->ipc_bit)); + dev_err(rpm->dev, "RPM triggered fatal error\n"); + + return IRQ_HANDLED; +} + +static irqreturn_t qcom_rpm_wakeup_interrupt(int irq, void *dev) +{ + return IRQ_HANDLED; +} + +static int qcom_rpm_probe(struct platform_device *pdev) +{ + const struct of_device_id *match; + struct device_node *syscon_np; + struct resource *res; + struct qcom_rpm *rpm; + u32 fw_version[3]; + int irq_wakeup; + int irq_ack; + int irq_err; + int ret; + + rpm = devm_kzalloc(&pdev->dev, sizeof(*rpm), GFP_KERNEL); + if (!rpm) + return -ENOMEM; + + rpm->dev = &pdev->dev; + mutex_init(&rpm->lock); + init_completion(&rpm->ack); + + irq_ack = platform_get_irq_byname(pdev, "ack"); + if (irq_ack < 0) { + dev_err(&pdev->dev, "required ack interrupt missing\n"); + return irq_ack; + } + + irq_err = platform_get_irq_byname(pdev, "err"); + if (irq_err < 0) { + dev_err(&pdev->dev, "required err interrupt missing\n"); + return irq_err; + } + + irq_wakeup = platform_get_irq_byname(pdev, "wakeup"); + if (irq_wakeup < 0) { + dev_err(&pdev->dev, "required wakeup interrupt missing\n"); + return irq_wakeup; + } + + match = of_match_device(qcom_rpm_of_match, &pdev->dev); + rpm->data = match->data; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + rpm->status_regs = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(rpm->status_regs)) + return PTR_ERR(rpm->status_regs); + rpm->ctrl_regs = rpm->status_regs + 0x400; + rpm->req_regs = rpm->status_regs + 0x600; + + syscon_np = of_parse_phandle(pdev->dev.of_node, "qcom,ipc", 0); + if (!syscon_np) { + dev_err(&pdev->dev, "no qcom,ipc node\n"); + return -ENODEV; + } + + rpm->ipc_regmap = syscon_node_to_regmap(syscon_np); + if (IS_ERR(rpm->ipc_regmap)) + return PTR_ERR(rpm->ipc_regmap); + + ret = of_property_read_u32_index(pdev->dev.of_node, "qcom,ipc", 1, + &rpm->ipc_offset); + if (ret < 0) { + dev_err(&pdev->dev, "no offset in qcom,ipc\n"); + return -EINVAL; + } + + ret = of_property_read_u32_index(pdev->dev.of_node, "qcom,ipc", 2, + &rpm->ipc_bit); + if (ret < 0) { + dev_err(&pdev->dev, "no bit in qcom,ipc\n"); + return -EINVAL; + } + + dev_set_drvdata(&pdev->dev, rpm); + + fw_version[0] = readl(RPM_STATUS_REG(rpm, 0)); + fw_version[1] = readl(RPM_STATUS_REG(rpm, 1)); + fw_version[2] = readl(RPM_STATUS_REG(rpm, 2)); + if (fw_version[0] != rpm->data->version) { + dev_err(&pdev->dev, + "RPM version %u.%u.%u incompatible with driver version %u", + fw_version[0], + fw_version[1], + fw_version[2], + rpm->data->version); + return -EFAULT; + } 
+ + dev_info(&pdev->dev, "RPM firmware %u.%u.%u\n", fw_version[0], + fw_version[1], + fw_version[2]); + + ret = devm_request_irq(&pdev->dev, + irq_ack, + qcom_rpm_ack_interrupt, + IRQF_TRIGGER_RISING | IRQF_NO_SUSPEND, + "qcom_rpm_ack", + rpm); + if (ret) { + dev_err(&pdev->dev, "failed to request ack interrupt\n"); + return ret; + } + + ret = irq_set_irq_wake(irq_ack, 1); + if (ret) + dev_warn(&pdev->dev, "failed to mark ack irq as wakeup\n"); + + ret = devm_request_irq(&pdev->dev, + irq_err, + qcom_rpm_err_interrupt, + IRQF_TRIGGER_RISING, + "qcom_rpm_err", + rpm); + if (ret) { + dev_err(&pdev->dev, "failed to request err interrupt\n"); + return ret; + } + + ret = devm_request_irq(&pdev->dev, + irq_wakeup, + qcom_rpm_wakeup_interrupt, + IRQF_TRIGGER_RISING, + "qcom_rpm_wakeup", + rpm); + if (ret) { + dev_err(&pdev->dev, "failed to request wakeup interrupt\n"); + return ret; + } + + ret = irq_set_irq_wake(irq_wakeup, 1); + if (ret) + dev_warn(&pdev->dev, "failed to mark wakeup irq as wakeup\n"); + + return of_platform_populate(pdev->dev.of_node, NULL, NULL, &pdev->dev); +} + +static int qcom_rpm_remove(struct platform_device *pdev) +{ + of_platform_depopulate(&pdev->dev); + return 0; +} + +static struct platform_driver qcom_rpm_driver = { + .probe = qcom_rpm_probe, + .remove = qcom_rpm_remove, + .driver = { + .name = "qcom_rpm", + .of_match_table = qcom_rpm_of_match, + }, +}; + +static int __init qcom_rpm_init(void) +{ + return platform_driver_register(&qcom_rpm_driver); +} +arch_initcall(qcom_rpm_init); + +static void __exit qcom_rpm_exit(void) +{ + platform_driver_unregister(&qcom_rpm_driver); +} +module_exit(qcom_rpm_exit) + +MODULE_DESCRIPTION("Qualcomm Resource Power Manager driver"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Bjorn Andersson "); diff --git a/include/linux/mfd/qcom_rpm.h b/include/linux/mfd/qcom_rpm.h new file mode 100644 index 000000000000..742ebf1b76ca --- /dev/null +++ b/include/linux/mfd/qcom_rpm.h @@ -0,0 +1,13 @@ +#ifndef __QCOM_RPM_H__ +#define __QCOM_RPM_H__ + +#include <linux/types.h> + +struct qcom_rpm; + +#define QCOM_RPM_ACTIVE_STATE 0 +#define QCOM_RPM_SLEEP_STATE 1 + +int qcom_rpm_write(struct qcom_rpm *rpm, int state, int resource, u32 *buf, size_t count); + +#endif -- cgit v1.2.3 From 71e03de46d73b87aab5f80fa55449a9e40c55d17 Mon Sep 17 00:00:00 2001 From: Steve Twiss Date: Tue, 20 Jan 2015 13:54:25 +0000 Subject: mfd: da9063: Add device tree support Add device tree support for DA9063 regulators, Real-Time Clock and Watchdog.
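As a rough sketch of what the new .of_compatible cell entries enable (a hypothetical fragment, not code from this patch): the mfd core matches each string against the child nodes of the PMIC's node and assigns the matching node to that cell's platform device, so a sub-driver can parse its own properties:

	/* Hypothetical probe fragment for the watchdog cell; mfd-core has
	 * already set pdev->dev.of_node from the "dlg,da9063-watchdog"
	 * child node. "timeout-sec" is the standard watchdog binding
	 * property, used here purely for illustration. */
	static int da9063_wdt_probe_sketch(struct platform_device *pdev)
	{
		struct device_node *np = pdev->dev.of_node;
		u32 timeout_sec;

		if (np && !of_property_read_u32(np, "timeout-sec",
						&timeout_sec))
			dev_info(&pdev->dev, "DT timeout: %u s\n",
				 timeout_sec);

		return 0;
	}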
Signed-off-by: Steve Twiss Signed-off-by: Lee Jones --- drivers/mfd/da9063-core.c | 2 ++ drivers/mfd/da9063-i2c.c | 9 +++++++++ include/linux/mfd/da9063/core.h | 1 + 3 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/drivers/mfd/da9063-core.c b/drivers/mfd/da9063-core.c index f38bc98a3c57..facd3610ac77 100644 --- a/drivers/mfd/da9063-core.c +++ b/drivers/mfd/da9063-core.c @@ -86,6 +86,7 @@ static const struct mfd_cell da9063_devs[] = { }, { .name = DA9063_DRVNAME_WATCHDOG, + .of_compatible = "dlg,da9063-watchdog", }, { .name = DA9063_DRVNAME_HWMON, @@ -101,6 +102,7 @@ static const struct mfd_cell da9063_devs[] = { .name = DA9063_DRVNAME_RTC, .num_resources = ARRAY_SIZE(da9063_rtc_resources), .resources = da9063_rtc_resources, + .of_compatible = "dlg,da9063-rtc", }, { .name = DA9063_DRVNAME_VIBRATION, diff --git a/drivers/mfd/da9063-i2c.c b/drivers/mfd/da9063-i2c.c index 21fd8d9a217b..6f3a7c0001f9 100644 --- a/drivers/mfd/da9063-i2c.c +++ b/drivers/mfd/da9063-i2c.c @@ -25,6 +25,9 @@ #include <linux/mfd/da9063/pdata.h> #include <linux/mfd/da9063/registers.h> +#include <linux/of.h> +#include <linux/regulator/of_regulator.h> + static const struct regmap_range da9063_ad_readable_ranges[] = { { .range_min = DA9063_REG_PAGE_CON, @@ -203,6 +206,11 @@ static struct regmap_config da9063_regmap_config = { .cache_type = REGCACHE_RBTREE, }; +static const struct of_device_id da9063_dt_ids[] = { + { .compatible = "dlg,da9063", }, + { } +}; +MODULE_DEVICE_TABLE(of, da9063_dt_ids); static int da9063_i2c_probe(struct i2c_client *i2c, const struct i2c_device_id *id) { @@ -257,6 +265,7 @@ static struct i2c_driver da9063_i2c_driver = { .driver = { .name = "da9063", .owner = THIS_MODULE, + .of_match_table = of_match_ptr(da9063_dt_ids), }, .probe = da9063_i2c_probe, .remove = da9063_i2c_remove, diff --git a/include/linux/mfd/da9063/core.h b/include/linux/mfd/da9063/core.h index b92a3262f8f6..79f4d822ba13 100644 --- a/include/linux/mfd/da9063/core.h +++ b/include/linux/mfd/da9063/core.h @@ -36,6 +36,7 @@ enum da9063_models { enum da9063_variant_codes { PMIC_DA9063_AD = 0x3, PMIC_DA9063_BB = 0x5, + PMIC_DA9063_CA = 0x6, }; /* Interrupts */ -- cgit v1.2.3 From 1f94a94f67e1083e19fb7b436dd7ca7a4ba03f2b Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Fri, 9 Jan 2015 20:34:39 -0600 Subject: PCI: Add generic config accessors Many PCI controllers' configuration space accesses are memory-mapped and vary only in address calculation and access checks. There are two main access methods: a decoded address space such as ECAM, or a single address and data register similar to x86. This implementation can support both cases, and can also be used in cases that need additional pre- or post-access handling. Add a new pci_ops member, map_bus, which can do access checks and any necessary setup. It returns the address to use for the configuration space access. The supported access types are 32-bit-only accesses, or correctly sized byte, word, and dword accesses.
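For illustration, a minimal sketch of a host controller driver built on these helpers; the foo_* names, the struct and the ECAM offset layout are invented for the example, everything else is the API added by this patch:

	#include <linux/pci.h>

	struct foo_pcie {
		void __iomem *cfg_base;		/* hypothetical driver state */
	};

	/* ECAM-style address calculation: bus[27:20], devfn[19:12],
	 * register offset[11:0]. */
	static void __iomem *foo_pcie_map_bus(struct pci_bus *bus,
					      unsigned int devfn, int where)
	{
		struct foo_pcie *pcie = bus->sysdata;

		return pcie->cfg_base + (bus->number << 20) +
		       (devfn << 12) + where;
	}

	static struct pci_ops foo_pcie_ops = {
		.map_bus	= foo_pcie_map_bus,
		.read		= pci_generic_config_read,
		.write		= pci_generic_config_write,
	};

A controller limited to aligned 32-bit configuration accesses would instead point .read and .write at pci_generic_config_read32() and pci_generic_config_write32().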
Tested-by: Thierry Reding Signed-off-by: Rob Herring Signed-off-by: Bjorn Helgaas Reviewed-by: Thierry Reding --- drivers/pci/access.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/pci.h | 11 +++++++ 2 files changed, 98 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/access.c b/drivers/pci/access.c index 49dd766852ba..d9b64a175990 100644 --- a/drivers/pci/access.c +++ b/drivers/pci/access.c @@ -67,6 +67,93 @@ EXPORT_SYMBOL(pci_bus_write_config_byte); EXPORT_SYMBOL(pci_bus_write_config_word); EXPORT_SYMBOL(pci_bus_write_config_dword); +int pci_generic_config_read(struct pci_bus *bus, unsigned int devfn, + int where, int size, u32 *val) +{ + void __iomem *addr; + + addr = bus->ops->map_bus(bus, devfn, where); + if (!addr) { + *val = ~0; + return PCIBIOS_DEVICE_NOT_FOUND; + } + + if (size == 1) + *val = readb(addr); + else if (size == 2) + *val = readw(addr); + else + *val = readl(addr); + + return PCIBIOS_SUCCESSFUL; +} +EXPORT_SYMBOL_GPL(pci_generic_config_read); + +int pci_generic_config_write(struct pci_bus *bus, unsigned int devfn, + int where, int size, u32 val) +{ + void __iomem *addr; + + addr = bus->ops->map_bus(bus, devfn, where); + if (!addr) + return PCIBIOS_DEVICE_NOT_FOUND; + + if (size == 1) + writeb(val, addr); + else if (size == 2) + writew(val, addr); + else + writel(val, addr); + + return PCIBIOS_SUCCESSFUL; +} +EXPORT_SYMBOL_GPL(pci_generic_config_write); + +int pci_generic_config_read32(struct pci_bus *bus, unsigned int devfn, + int where, int size, u32 *val) +{ + void __iomem *addr; + + addr = bus->ops->map_bus(bus, devfn, where & ~0x3); + if (!addr) { + *val = ~0; + return PCIBIOS_DEVICE_NOT_FOUND; + } + + *val = readl(addr); + + if (size <= 2) + *val = (*val >> (8 * (where & 3))) & ((1 << (size * 8)) - 1); + + return PCIBIOS_SUCCESSFUL; +} +EXPORT_SYMBOL_GPL(pci_generic_config_read32); + +int pci_generic_config_write32(struct pci_bus *bus, unsigned int devfn, + int where, int size, u32 val) +{ + void __iomem *addr; + u32 mask, tmp; + + addr = bus->ops->map_bus(bus, devfn, where & ~0x3); + if (!addr) + return PCIBIOS_DEVICE_NOT_FOUND; + + if (size == 4) { + writel(val, addr); + return PCIBIOS_SUCCESSFUL; + } else { + mask = ~(((1 << (size * 8)) - 1) << ((where & 0x3) * 8)); + } + + tmp = readl(addr) & mask; + tmp |= val << ((where & 0x3) * 8); + writel(tmp, addr); + + return PCIBIOS_SUCCESSFUL; +} +EXPORT_SYMBOL_GPL(pci_generic_config_write32); + /** * pci_bus_set_ops - Set raw operations of pci bus * @bus: pci bus struct diff --git a/include/linux/pci.h b/include/linux/pci.h index 360a966a97a5..e7fd51900182 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -560,6 +560,7 @@ static inline int pcibios_err_to_errno(int err) /* Low-level architecture-dependent routines */ struct pci_ops { + void __iomem *(*map_bus)(struct pci_bus *bus, unsigned int devfn, int where); int (*read)(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *val); int (*write)(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 val); }; @@ -857,6 +858,16 @@ int pci_bus_write_config_word(struct pci_bus *bus, unsigned int devfn, int where, u16 val); int pci_bus_write_config_dword(struct pci_bus *bus, unsigned int devfn, int where, u32 val); + +int pci_generic_config_read(struct pci_bus *bus, unsigned int devfn, + int where, int size, u32 *val); +int pci_generic_config_write(struct pci_bus *bus, unsigned int devfn, + int where, int size, u32 val); +int pci_generic_config_read32(struct pci_bus *bus, unsigned int devfn, + 
int where, int size, u32 *val); +int pci_generic_config_write32(struct pci_bus *bus, unsigned int devfn, + int where, int size, u32 val); + struct pci_ops *pci_bus_set_ops(struct pci_bus *bus, struct pci_ops *ops); static inline int pci_read_config_byte(const struct pci_dev *dev, int where, u8 *val) -- cgit v1.2.3 From 1862ee22ce2e28087299aebb6556a5cdc122d0ef Mon Sep 17 00:00:00 2001 From: Pawel Moll Date: Fri, 23 Jan 2015 14:45:55 +1030 Subject: virtio-mmio: Update the device to OASIS spec version This patch adds support for the second version of the virtio-mmio device, which follows the OASIS "Virtual I/O Device (VIRTIO) Version 1.0" specification. Main changes: 1. The control register symbolic names use the new device/driver nomenclature rather than the old guest/host one. 2. The driver detects the device version (version 1 is the pre-OASIS spec, version 2 is compatible with the first revision of the OASIS spec) and drives the device accordingly. 3. The new version uses direct addressing (a 64 bit address split into two low/high registers) instead of the guest page size based one, and addresses each part of the queue (descriptors, available, used) separately. 4. The device activity is now explicitly triggered by writing to the "queue ready" register. 5. Whole 64 bit features are properly handled now (both ways). Signed-off-by: Pawel Moll Acked-by: Michael S. Tsirkin Signed-off-by: Rusty Russell --- drivers/virtio/virtio_mmio.c | 131 ++++++++++++++++++++++++++----------------- include/linux/virtio_mmio.h | 44 ++++++++++++--- 2 files changed, 118 insertions(+), 57 deletions(-) (limited to 'include/linux') diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c index 00d115b22bd8..cad569890908 100644 --- a/drivers/virtio/virtio_mmio.c +++ b/drivers/virtio/virtio_mmio.c @@ -1,7 +1,7 @@ /* * Virtio memory mapped device driver * - * Copyright 2011, ARM Ltd. + * Copyright 2011-2014, ARM Ltd. * * This module allows virtio devices to be used over a virtual, memory mapped * platform device. @@ -50,36 +50,6 @@ * * * - * Registers layout (all 32-bit wide): - * - * offset d. name description - * ------ -- ---------------- ----------------- - * - * 0x000 R MagicValue Magic value "virt" - * 0x004 R Version Device version (current max. 1) - * 0x008 R DeviceID Virtio device ID - * 0x00c R VendorID Virtio vendor ID - * - * 0x010 R HostFeatures Features supported by the host - * 0x014 W HostFeaturesSel Set of host features to access via HostFeatures - * - * 0x020 W GuestFeatures Features activated by the guest - * 0x024 W GuestFeaturesSel Set of activated features to set via GuestFeatures - * 0x028 W GuestPageSize Size of guest's memory page in bytes - * - * 0x030 W QueueSel Queue selector - * 0x034 R QueueNumMax Maximum size of the currently selected queue - * 0x038 W QueueNum Queue size for the currently selected queue - * 0x03c W QueueAlign Used Ring alignment for the current queue - * 0x040 RW QueuePFN PFN for the currently selected queue - * - * 0x050 W QueueNotify Queue notifier - * 0x060 R InterruptStatus Interrupt status register - * 0x064 W InterruptACK Interrupt acknowledge register - * 0x070 RW Status Device status register - * - * 0x100+ RW Device-specific configuration space - * * Based on Virtio PCI driver by Anthony Liguori, copyright IBM Corp. 2007 * * This work is licensed under the terms of the GNU GPL, version 2 or later.
@@ -145,11 +115,16 @@ struct virtio_mmio_vq_info { static u64 vm_get_features(struct virtio_device *vdev) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + u64 features; + + writel(1, vm_dev->base + VIRTIO_MMIO_DEVICE_FEATURES_SEL); + features = readl(vm_dev->base + VIRTIO_MMIO_DEVICE_FEATURES); + features <<= 32; - /* TODO: Features > 32 bits */ - writel(0, vm_dev->base + VIRTIO_MMIO_HOST_FEATURES_SEL); + writel(0, vm_dev->base + VIRTIO_MMIO_DEVICE_FEATURES_SEL); + features |= readl(vm_dev->base + VIRTIO_MMIO_DEVICE_FEATURES); - return readl(vm_dev->base + VIRTIO_MMIO_HOST_FEATURES); + return features; } static int vm_finalize_features(struct virtio_device *vdev) @@ -159,11 +134,20 @@ static int vm_finalize_features(struct virtio_device *vdev) /* Give virtio_ring a chance to accept features. */ vring_transport_features(vdev); - /* Make sure we don't have any features > 32 bits! */ - BUG_ON((u32)vdev->features != vdev->features); + /* Make sure there is are no mixed devices */ + if (vm_dev->version == 2 && + !__virtio_test_bit(vdev, VIRTIO_F_VERSION_1)) { + dev_err(&vdev->dev, "New virtio-mmio devices (version 2) must provide VIRTIO_F_VERSION_1 feature!\n"); + return -EINVAL; + } + + writel(1, vm_dev->base + VIRTIO_MMIO_DRIVER_FEATURES_SEL); + writel((u32)(vdev->features >> 32), + vm_dev->base + VIRTIO_MMIO_DRIVER_FEATURES); - writel(0, vm_dev->base + VIRTIO_MMIO_GUEST_FEATURES_SEL); - writel(vdev->features, vm_dev->base + VIRTIO_MMIO_GUEST_FEATURES); + writel(0, vm_dev->base + VIRTIO_MMIO_DRIVER_FEATURES_SEL); + writel((u32)vdev->features, + vm_dev->base + VIRTIO_MMIO_DRIVER_FEATURES); return 0; } @@ -275,7 +259,12 @@ static void vm_del_vq(struct virtqueue *vq) /* Select and deactivate the queue */ writel(index, vm_dev->base + VIRTIO_MMIO_QUEUE_SEL); - writel(0, vm_dev->base + VIRTIO_MMIO_QUEUE_PFN); + if (vm_dev->version == 1) { + writel(0, vm_dev->base + VIRTIO_MMIO_QUEUE_PFN); + } else { + writel(0, vm_dev->base + VIRTIO_MMIO_QUEUE_READY); + WARN_ON(readl(vm_dev->base + VIRTIO_MMIO_QUEUE_READY)); + } size = PAGE_ALIGN(vring_size(info->num, VIRTIO_MMIO_VRING_ALIGN)); free_pages_exact(info->queue, size); @@ -312,7 +301,8 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index, writel(index, vm_dev->base + VIRTIO_MMIO_QUEUE_SEL); /* Queue shouldn't already be set up. */ - if (readl(vm_dev->base + VIRTIO_MMIO_QUEUE_PFN)) { + if (readl(vm_dev->base + (vm_dev->version == 1 ? 
+ VIRTIO_MMIO_QUEUE_PFN : VIRTIO_MMIO_QUEUE_READY))) { err = -ENOENT; goto error_available; } @@ -356,13 +346,6 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index, info->num /= 2; } - /* Activate the queue */ - writel(info->num, vm_dev->base + VIRTIO_MMIO_QUEUE_NUM); - writel(VIRTIO_MMIO_VRING_ALIGN, - vm_dev->base + VIRTIO_MMIO_QUEUE_ALIGN); - writel(virt_to_phys(info->queue) >> PAGE_SHIFT, - vm_dev->base + VIRTIO_MMIO_QUEUE_PFN); - /* Create the vring */ vq = vring_new_virtqueue(index, info->num, VIRTIO_MMIO_VRING_ALIGN, vdev, true, info->queue, vm_notify, callback, name); @@ -371,6 +354,33 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index, goto error_new_virtqueue; } + /* Activate the queue */ + writel(info->num, vm_dev->base + VIRTIO_MMIO_QUEUE_NUM); + if (vm_dev->version == 1) { + writel(PAGE_SIZE, vm_dev->base + VIRTIO_MMIO_QUEUE_ALIGN); + writel(virt_to_phys(info->queue) >> PAGE_SHIFT, + vm_dev->base + VIRTIO_MMIO_QUEUE_PFN); + } else { + u64 addr; + + addr = virt_to_phys(info->queue); + writel((u32)addr, vm_dev->base + VIRTIO_MMIO_QUEUE_DESC_LOW); + writel((u32)(addr >> 32), + vm_dev->base + VIRTIO_MMIO_QUEUE_DESC_HIGH); + + addr = virt_to_phys(virtqueue_get_avail(vq)); + writel((u32)addr, vm_dev->base + VIRTIO_MMIO_QUEUE_AVAIL_LOW); + writel((u32)(addr >> 32), + vm_dev->base + VIRTIO_MMIO_QUEUE_AVAIL_HIGH); + + addr = virt_to_phys(virtqueue_get_used(vq)); + writel((u32)addr, vm_dev->base + VIRTIO_MMIO_QUEUE_USED_LOW); + writel((u32)(addr >> 32), + vm_dev->base + VIRTIO_MMIO_QUEUE_USED_HIGH); + + writel(1, vm_dev->base + VIRTIO_MMIO_QUEUE_READY); + } + vq->priv = info; info->vq = vq; @@ -381,7 +391,12 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index, return vq; error_new_virtqueue: - writel(0, vm_dev->base + VIRTIO_MMIO_QUEUE_PFN); + if (vm_dev->version == 1) { + writel(0, vm_dev->base + VIRTIO_MMIO_QUEUE_PFN); + } else { + writel(0, vm_dev->base + VIRTIO_MMIO_QUEUE_READY); + WARN_ON(readl(vm_dev->base + VIRTIO_MMIO_QUEUE_READY)); + } free_pages_exact(info->queue, size); error_alloc_pages: kfree(info); @@ -476,16 +491,32 @@ static int virtio_mmio_probe(struct platform_device *pdev) /* Check device version */ vm_dev->version = readl(vm_dev->base + VIRTIO_MMIO_VERSION); - if (vm_dev->version != 1) { + if (vm_dev->version < 1 || vm_dev->version > 2) { dev_err(&pdev->dev, "Version %ld not supported!\n", vm_dev->version); return -ENXIO; } vm_dev->vdev.id.device = readl(vm_dev->base + VIRTIO_MMIO_DEVICE_ID); + if (vm_dev->vdev.id.device == 0) { + /* + * virtio-mmio device with an ID 0 is a (dummy) placeholder + * with no function. End probing now with no error reported. 
+ */ + return -ENODEV; + } vm_dev->vdev.id.vendor = readl(vm_dev->base + VIRTIO_MMIO_VENDOR_ID); - writel(PAGE_SIZE, vm_dev->base + VIRTIO_MMIO_GUEST_PAGE_SIZE); + /* Reject legacy-only IDs for version 2 devices */ + if (vm_dev->version == 2 && + virtio_device_is_legacy_only(vm_dev->vdev.id)) { + dev_err(&pdev->dev, "Version 2 not supported for devices %u!\n", + vm_dev->vdev.id.device); + return -ENODEV; + } + + if (vm_dev->version == 1) + writel(PAGE_SIZE, vm_dev->base + VIRTIO_MMIO_GUEST_PAGE_SIZE); platform_set_drvdata(pdev, vm_dev); diff --git a/include/linux/virtio_mmio.h b/include/linux/virtio_mmio.h index 5c7b6f0daef8..c4b09689ab64 100644 --- a/include/linux/virtio_mmio.h +++ b/include/linux/virtio_mmio.h @@ -51,23 +51,29 @@ /* Virtio vendor ID - Read Only */ #define VIRTIO_MMIO_VENDOR_ID 0x00c -/* Bitmask of the features supported by the host +/* Bitmask of the features supported by the device (host) * (32 bits per set) - Read Only */ -#define VIRTIO_MMIO_HOST_FEATURES 0x010 +#define VIRTIO_MMIO_DEVICE_FEATURES 0x010 -/* Host features set selector - Write Only */ -#define VIRTIO_MMIO_HOST_FEATURES_SEL 0x014 +/* Device (host) features set selector - Write Only */ +#define VIRTIO_MMIO_DEVICE_FEATURES_SEL 0x014 -/* Bitmask of features activated by the guest +/* Bitmask of features activated by the driver (guest) * (32 bits per set) - Write Only */ -#define VIRTIO_MMIO_GUEST_FEATURES 0x020 +#define VIRTIO_MMIO_DRIVER_FEATURES 0x020 /* Activated features set selector - Write Only */ -#define VIRTIO_MMIO_GUEST_FEATURES_SEL 0x024 +#define VIRTIO_MMIO_DRIVER_FEATURES_SEL 0x024 + + +#ifndef VIRTIO_MMIO_NO_LEGACY /* LEGACY DEVICES ONLY! */ /* Guest's memory page size in bytes - Write Only */ #define VIRTIO_MMIO_GUEST_PAGE_SIZE 0x028 +#endif + + /* Queue selector - Write Only */ #define VIRTIO_MMIO_QUEUE_SEL 0x030 @@ -77,12 +83,21 @@ /* Queue size for the currently selected queue - Write Only */ #define VIRTIO_MMIO_QUEUE_NUM 0x038 + +#ifndef VIRTIO_MMIO_NO_LEGACY /* LEGACY DEVICES ONLY! 
*/ + /* Used Ring alignment for the currently selected queue - Write Only */ #define VIRTIO_MMIO_QUEUE_ALIGN 0x03c /* Guest's PFN for the currently selected queue - Read Write */ #define VIRTIO_MMIO_QUEUE_PFN 0x040 +#endif + + +/* Ready bit for the currently selected queue - Read Write */ +#define VIRTIO_MMIO_QUEUE_READY 0x044 + /* Queue notifier - Write Only */ #define VIRTIO_MMIO_QUEUE_NOTIFY 0x050 @@ -95,6 +110,21 @@ /* Device status register - Read Write */ #define VIRTIO_MMIO_STATUS 0x070 +/* Selected queue's Descriptor Table address, 64 bits in two halves */ +#define VIRTIO_MMIO_QUEUE_DESC_LOW 0x080 +#define VIRTIO_MMIO_QUEUE_DESC_HIGH 0x084 + +/* Selected queue's Available Ring address, 64 bits in two halves */ +#define VIRTIO_MMIO_QUEUE_AVAIL_LOW 0x090 +#define VIRTIO_MMIO_QUEUE_AVAIL_HIGH 0x094 + +/* Selected queue's Used Ring address, 64 bits in two halves */ +#define VIRTIO_MMIO_QUEUE_USED_LOW 0x0a0 +#define VIRTIO_MMIO_QUEUE_USED_HIGH 0x0a4 + +/* Configuration atomicity value */ +#define VIRTIO_MMIO_CONFIG_GENERATION 0x0fc + /* The config space is defined by each driver as * the per-driver configuration space - Read Write */ #define VIRTIO_MMIO_CONFIG 0x100 -- cgit v1.2.3 From 55422d0bd292f5ad143cc32cb8bb8505257274c4 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Thu, 22 Jan 2015 00:00:23 -0500 Subject: audit: replace getname()/putname() hacks with reference counters In order to ensure that filenames are not released before the audit subsystem is done with the strings there are a number of hacks built into the fs and audit subsystems around getname() and putname(). To say these hacks are "ugly" would be kind. This patch removes the filename hackery in favor of a more conventional reference count based approach. The diffstat below tells most of the story; lots of audit/fs specific code is replaced with a traditional reference count based approach that is easily understood, even by those not familiar with the audit and/or fs subsystems. CC: viro@zeniv.linux.org.uk CC: linux-fsdevel@vger.kernel.org Signed-off-by: Paul Moore Signed-off-by: Al Viro --- fs/namei.c | 29 +++++++------- include/linux/audit.h | 3 -- include/linux/fs.h | 9 +---- kernel/audit.h | 17 +------- kernel/auditsc.c | 105 ++++++-------------------------------------------- 5 files changed, 29 insertions(+), 134 deletions(-) (limited to 'include/linux') diff --git a/fs/namei.c b/fs/namei.c index a3fde77d4abf..96ca11dea4a2 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -118,15 +118,6 @@ * POSIX.1 2.4: an empty pathname is invalid (ENOENT). * PATH_MAX includes the nul terminator --RR. 
*/ -void final_putname(struct filename *name) -{ - if (name->separate) { - __putname(name->name); - kfree(name); - } else { - __putname(name); - } -} #define EMBEDDED_NAME_MAX (PATH_MAX - sizeof(struct filename)) @@ -145,6 +136,7 @@ getname_flags(const char __user *filename, int flags, int *empty) result = __getname(); if (unlikely(!result)) return ERR_PTR(-ENOMEM); + result->refcnt = 1; /* * First, try to embed the struct filename inside the names_cache @@ -179,6 +171,7 @@ recopy: } result->name = kname; result->separate = true; + result->refcnt = 1; max = PATH_MAX; goto recopy; } @@ -202,7 +195,7 @@ recopy: return result; error: - final_putname(result); + putname(result); return err; } @@ -243,19 +236,25 @@ getname_kernel(const char * filename) memcpy((char *)result->name, filename, len); result->uptr = NULL; result->aname = NULL; + result->refcnt = 1; audit_getname(result); return result; } -#ifdef CONFIG_AUDITSYSCALL void putname(struct filename *name) { - if (unlikely(!audit_dummy_context())) - return audit_putname(name); - final_putname(name); + BUG_ON(name->refcnt <= 0); + + if (--name->refcnt > 0) + return; + + if (name->separate) { + __putname(name->name); + kfree(name); + } else + __putname(name); } -#endif static int check_acl(struct inode *inode, int mask) { diff --git a/include/linux/audit.h b/include/linux/audit.h index af84234e1f6e..87c2d347d255 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -128,7 +128,6 @@ extern void __audit_syscall_entry(int major, unsigned long a0, unsigned long a1, extern void __audit_syscall_exit(int ret_success, long ret_value); extern struct filename *__audit_reusename(const __user char *uptr); extern void __audit_getname(struct filename *name); -extern void audit_putname(struct filename *name); #define AUDIT_INODE_PARENT 1 /* dentry represents the parent */ #define AUDIT_INODE_HIDDEN 2 /* audit record should be hidden */ @@ -353,8 +352,6 @@ static inline struct filename *audit_reusename(const __user char *name) } static inline void audit_getname(struct filename *name) { } -static inline void audit_putname(struct filename *name) -{ } static inline void __audit_inode(struct filename *name, const struct dentry *dentry, unsigned int flags) diff --git a/include/linux/fs.h b/include/linux/fs.h index f90c0282c114..b49842fe203f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2080,6 +2080,7 @@ struct filename { const char *name; /* pointer to actual string */ const __user char *uptr; /* original userland pointer */ struct audit_names *aname; + int refcnt; bool separate; /* should "name" be freed? 
*/ }; @@ -2101,6 +2102,7 @@ extern int filp_close(struct file *, fl_owner_t id); extern struct filename *getname_flags(const char __user *, int, int *); extern struct filename *getname(const char __user *); extern struct filename *getname_kernel(const char *); +extern void putname(struct filename *name); enum { FILE_CREATED = 1, @@ -2121,15 +2123,8 @@ extern void __init vfs_caches_init(unsigned long); extern struct kmem_cache *names_cachep; -extern void final_putname(struct filename *name); - #define __getname() kmem_cache_alloc(names_cachep, GFP_KERNEL) #define __putname(name) kmem_cache_free(names_cachep, (void *)(name)) -#ifndef CONFIG_AUDITSYSCALL -#define putname(name) final_putname(name) -#else -extern void putname(struct filename *name); -#endif #ifdef CONFIG_BLOCK extern int register_blkdev(unsigned int, const char *); diff --git a/kernel/audit.h b/kernel/audit.h index 3cdffad5a1d9..1caa0d345d90 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -24,12 +24,6 @@ #include #include -/* 0 = no checking - 1 = put_count checking - 2 = verbose put_count checking -*/ -#define AUDIT_DEBUG 0 - /* AUDIT_NAMES is the number of slots we reserve in the audit_context * for saving names from getname(). If we get more names we will allocate * a name dynamically and also add those to the list anchored by names_list. */ @@ -74,9 +68,8 @@ struct audit_cap_data { }; }; -/* When fs/namei.c:getname() is called, we store the pointer in name and - * we don't let putname() free it (instead we free all of the saved - * pointers at syscall exit time). +/* When fs/namei.c:getname() is called, we store the pointer in name and bump + * the refcnt in the associated filename struct. * * Further, in fs/namei.c:path_lookup() we store the inode and device. */ @@ -86,7 +79,6 @@ struct audit_names { struct filename *name; int name_len; /* number of chars to log */ bool hidden; /* don't log this record */ - bool name_put; /* call __putname()? 
*/ unsigned long ino; dev_t dev; @@ -208,11 +200,6 @@ struct audit_context { }; int fds[2]; struct audit_proctitle proctitle; - -#if AUDIT_DEBUG - int put_count; - int ino_count; -#endif }; extern u32 audit_ever_enabled; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 4f521964ccaa..0c38604a630c 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -866,33 +866,10 @@ static inline void audit_free_names(struct audit_context *context) { struct audit_names *n, *next; -#if AUDIT_DEBUG == 2 - if (context->put_count + context->ino_count != context->name_count) { - int i = 0; - - pr_err("%s:%d(:%d): major=%d in_syscall=%d" - " name_count=%d put_count=%d ino_count=%d" - " [NOT freeing]\n", __FILE__, __LINE__, - context->serial, context->major, context->in_syscall, - context->name_count, context->put_count, - context->ino_count); - list_for_each_entry(n, &context->names_list, list) { - pr_err("names[%d] = %p = %s\n", i++, n->name, - n->name->name ?: "(null)"); - } - dump_stack(); - return; - } -#endif -#if AUDIT_DEBUG - context->put_count = 0; - context->ino_count = 0; -#endif - list_for_each_entry_safe(n, next, &context->names_list, list) { list_del(&n->list); - if (n->name && n->name_put) - final_putname(n->name); + if (n->name) + putname(n->name); if (n->should_free) kfree(n); } @@ -1711,9 +1688,6 @@ static struct audit_names *audit_alloc_name(struct audit_context *context, list_add_tail(&aname->list, &context->names_list); context->name_count++; -#if AUDIT_DEBUG - context->ino_count++; -#endif return aname; } @@ -1734,8 +1708,10 @@ __audit_reusename(const __user char *uptr) list_for_each_entry(n, &context->names_list, list) { if (!n->name) continue; - if (n->name->uptr == uptr) + if (n->name->uptr == uptr) { + n->name->refcnt++; return n->name; + } } return NULL; } @@ -1752,19 +1728,8 @@ void __audit_getname(struct filename *name) struct audit_context *context = current->audit_context; struct audit_names *n; - if (!context->in_syscall) { -#if AUDIT_DEBUG == 2 - pr_err("%s:%d(:%d): ignoring getname(%p)\n", - __FILE__, __LINE__, context->serial, name); - dump_stack(); -#endif + if (!context->in_syscall) return; - } - -#if AUDIT_DEBUG - /* The filename _must_ have a populated ->name */ - BUG_ON(!name->name); -#endif n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN); if (!n) @@ -1772,56 +1737,13 @@ void __audit_getname(struct filename *name) n->name = name; n->name_len = AUDIT_NAME_FULL; - n->name_put = true; name->aname = n; + name->refcnt++; if (!context->pwd.dentry) get_fs_pwd(current->fs, &context->pwd); } -/* audit_putname - intercept a putname request - * @name: name to intercept and delay for putname - * - * If we have stored the name from getname in the audit context, - * then we delay the putname until syscall exit. - * Called from include/linux/fs.h:putname(). 
- */ -void audit_putname(struct filename *name) -{ - struct audit_context *context = current->audit_context; - - BUG_ON(!context); - if (!name->aname || !context->in_syscall) { -#if AUDIT_DEBUG == 2 - pr_err("%s:%d(:%d): final_putname(%p)\n", - __FILE__, __LINE__, context->serial, name); - if (context->name_count) { - struct audit_names *n; - int i = 0; - - list_for_each_entry(n, &context->names_list, list) - pr_err("name[%d] = %p = %s\n", i++, n->name, - n->name->name ?: "(null)"); - } -#endif - final_putname(name); - } -#if AUDIT_DEBUG - else { - ++context->put_count; - if (context->put_count > context->name_count) { - pr_err("%s:%d(:%d): major=%d in_syscall=%d putname(%p)" - " name_count=%d put_count=%d\n", - __FILE__, __LINE__, - context->serial, context->major, - context->in_syscall, name->name, - context->name_count, context->put_count); - dump_stack(); - } - } -#endif -} - /** * __audit_inode - store the inode and device from a lookup * @name: name being audited @@ -1842,11 +1764,6 @@ void __audit_inode(struct filename *name, const struct dentry *dentry, if (!name) goto out_alloc; -#if AUDIT_DEBUG - /* The struct filename _must_ have a populated ->name */ - BUG_ON(!name->name); -#endif - /* * If we have a pointer to an audit_names entry already, then we can * just use it directly if the type is correct. @@ -1893,9 +1810,10 @@ out_alloc: n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN); if (!n) return; - if (name) - /* no need to set ->name_put as the original will cleanup */ + if (name) { n->name = name; + name->refcnt++; + } out: if (parent) { @@ -2000,8 +1918,7 @@ void __audit_inode_child(const struct inode *parent, if (found_parent) { found_child->name = found_parent->name; found_child->name_len = AUDIT_NAME_FULL; - /* don't call __putname() */ - found_child->name_put = false; + found_child->name->refcnt++; } } -- cgit v1.2.3 From 41fbf3b39d5eca01527338b4d0ee15ee1ae1023c Mon Sep 17 00:00:00 2001 From: Chunyan Zhang Date: Wed, 17 Dec 2014 13:11:35 +0800 Subject: ktime.h: Introduce ktime_ms_delta This patch adds a reusable time difference function which returns the difference in millisecond, as often used in some driver code, e.g. mtd/test, media/rc, etc. Signed-off-by: Chunyan Zhang Acked-by: Arnd Bergmann Cc: zhang.lyra@gmail.com Cc: davem@davemloft.net Cc: john.stultz@linaro.org Cc: dborkman@redhat.com Link: http://lkml.kernel.org/r/1418793095-18780-1-git-send-email-zhang.chunyan@linaro.org Signed-off-by: Thomas Gleixner --- include/linux/ktime.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ktime.h b/include/linux/ktime.h index c9d645ad98ff..891ea92a68b0 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -186,6 +186,11 @@ static inline s64 ktime_us_delta(const ktime_t later, const ktime_t earlier) return ktime_to_us(ktime_sub(later, earlier)); } +static inline s64 ktime_ms_delta(const ktime_t later, const ktime_t earlier) +{ + return ktime_to_ms(ktime_sub(later, earlier)); +} + static inline ktime_t ktime_add_us(const ktime_t kt, const u64 usec) { return ktime_add_ns(kt, usec * NSEC_PER_USEC); -- cgit v1.2.3 From 536e1715226c94037df12f7c6280cbe0f6009f92 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 5 Nov 2014 11:43:26 +0100 Subject: gpu: host1x: Call ->remove() only when a device is bound When a driver's ->probe() function fails, the host1x bus must not call its ->remove() function because the driver will already have cleaned up in the error handling path in ->probe(). 
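As an editorial illustration of why the new bound flag is needed, consider a minimal, hypothetical host1x client driver; foo_probe(), foo_remove(), foo_hw_init(), foo_hw_fini() and struct foo_data are invented for this sketch and are not part of the patch:

    static int foo_probe(struct host1x_device *dev)
    {
            struct foo_data *foo = kzalloc(sizeof(*foo), GFP_KERNEL);
            int err;

            if (!foo)
                    return -ENOMEM;

            err = foo_hw_init(foo);
            if (err < 0) {
                    /* ->probe() cleans up after itself on failure... */
                    kfree(foo);
                    return err;
            }

            dev_set_drvdata(&dev->dev, foo);
            return 0;
    }

    static int foo_remove(struct host1x_device *dev)
    {
            struct foo_data *foo = dev_get_drvdata(&dev->dev);

            /* ...so if the bus called ->remove() after a failed ->probe(),
             * foo here would be NULL or already freed, and this would
             * crash or double-free. The device->bound flag added below
             * guards against exactly that. */
            foo_hw_fini(foo);
            kfree(foo);
            return 0;
    }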
Signed-off-by: Thierry Reding --- drivers/gpu/host1x/bus.c | 9 +++++++-- include/linux/host1x.h | 2 ++ 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/host1x/bus.c b/drivers/gpu/host1x/bus.c index aaf54859adb0..e4182e68e29c 100644 --- a/drivers/gpu/host1x/bus.c +++ b/drivers/gpu/host1x/bus.c @@ -116,7 +116,10 @@ static void host1x_subdev_register(struct host1x_device *device, if (list_empty(&device->subdevs)) { err = device->driver->probe(device); if (err < 0) - dev_err(&device->dev, "probe failed: %d\n", err); + dev_err(&device->dev, "probe failed for %ps: %d\n", + device->driver, err); + else + device->bound = true; } } @@ -130,10 +133,12 @@ static void __host1x_subdev_unregister(struct host1x_device *device, * If all subdevices have been activated, we're about to remove the * first active subdevice, so unload the driver first. */ - if (list_empty(&device->subdevs)) { + if (list_empty(&device->subdevs) && device->bound) { err = device->driver->remove(device); if (err < 0) dev_err(&device->dev, "remove failed: %d\n", err); + + device->bound = false; } /* diff --git a/include/linux/host1x.h b/include/linux/host1x.h index bb9840fd1e18..7890b553d12e 100644 --- a/include/linux/host1x.h +++ b/include/linux/host1x.h @@ -272,6 +272,8 @@ struct host1x_device { struct mutex clients_lock; struct list_head clients; + + bool bound; }; static inline struct host1x_device *to_host1x_device(struct device *dev) -- cgit v1.2.3 From 9bc7491906b4113b4c5ae442157c7dfc4e10cd14 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jan 2015 21:24:10 +0100 Subject: hrtimer: Prevent stale expiry time in hrtimer_interrupt() hrtimer_interrupt() has the following subtle issue: hrtimer_interrupt() lock(cpu_base); expires_next = KTIME_MAX; expire_timers(CLOCK_MONOTONIC); expires = get_next_timer(CLOCK_MONOTONIC); if (expires < expires_next) expires_next = expires; expire_timers(CLOCK_REALTIME); unlock(cpu_base); wakeup() hrtimer_start(CLOCK_MONOTONIC, newtimer); lock(cpu_base); expires = get_next_timer(CLOCK_REALTIME); if (expires < expires_next) expires_next = expires; So because we already evaluated the next expiring timer of CLOCK_MONOTONIC we ignore that the expiry time of newtimer might be earlier than the overall next expiry time in hrtimer_interrupt(). To solve this, remove the caching of the next expiry value from hrtimer_interrupt() and reevaluate all active clock bases for the next expiry value. To avoid further code duplication, create a shared evaluation function and use it for hrtimer_get_next_event(), hrtimer_force_reprogram() and hrtimer_interrupt(). There is another subtlety in this mechanism: While hrtimer_interrupt() is running, we want to avoid touching the hardware device because we will reprogram it anyway at the end of hrtimer_interrupt(). This works nicely for hrtimers which get rearmed via the HRTIMER_RESTART mechanism, because we drop out when the callback on that CPU is running. But that fails if a new timer gets enqueued like in the example above. This has another implication: While hrtimer_interrupt() is running we refuse remote enqueueing of timers - see hrtimer_interrupt() and hrtimer_check_target(). hrtimer_interrupt() tries to prevent this by setting cpu_base->expires to KTIME_MAX, but that fails if a new timer gets queued. Prevent both the hardware access and the remote enqueue explicitly. We can loosen the restriction on the remote enqueue now due to reevaluation of the next expiry value, but that needs a separate patch.
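In outline, the reworked interrupt path becomes the following; this is a simplified editor's sketch in the same pseudo-code style as the trace above, not the exact kernel code:

    hrtimer_interrupt()
            lock(cpu_base);
            cpu_base->in_hrtirq = 1;  /* hrtimer_reprogram() backs off */
            expire_timers(all clock bases);
            /* no cached value: reevaluate every base, so a timer that
             * was enqueued by a wakeup during expiry is picked up */
            cpu_base->expires_next = __hrtimer_get_next_event(cpu_base);
            cpu_base->in_hrtirq = 0;
            unlock(cpu_base);
            reprogram clock event device from expires_next;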
Folded in a fix from Vignesh Radhakrishnan. Reported-and-tested-by: Stanislav Fomichev Based-on-patch-by: Stanislav Fomichev Signed-off-by: Thomas Gleixner Cc: vigneshr@codeaurora.org Cc: john.stultz@linaro.org Cc: viresh.kumar@linaro.org Cc: fweisbec@gmail.com Cc: cl@linux.com Cc: stuart.w.hayes@gmail.com Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1501202049190.5526@nanos Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 2 + kernel/time/hrtimer.c | 108 ++++++++++++++++++++++-------------------------- 2 files changed, 52 insertions(+), 58 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index a036d058a249..05f6df1fdf5b 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -170,6 +170,7 @@ enum hrtimer_base_type { * @clock_was_set: Indicates that clock was set from irq context. * @expires_next: absolute time of the next event which was scheduled * via clock_set_next_event() + * @in_hrtirq: hrtimer_interrupt() is currently executing * @hres_active: State of high resolution mode * @hang_detected: The last hrtimer interrupt detected a hang * @nr_events: Total number of hrtimer interrupt events @@ -185,6 +186,7 @@ struct hrtimer_cpu_base { unsigned int clock_was_set; #ifdef CONFIG_HIGH_RES_TIMERS ktime_t expires_next; + int in_hrtirq; int hres_active; int hang_detected; unsigned long nr_events; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 37e50aadd471..b663653a5d5b 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -440,6 +440,37 @@ static inline void debug_deactivate(struct hrtimer *timer) trace_hrtimer_cancel(timer); } +#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) +ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) +{ + struct hrtimer_clock_base *base = cpu_base->clock_base; + ktime_t expires, expires_next = { .tv64 = KTIME_MAX }; + int i; + + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { + struct timerqueue_node *next; + struct hrtimer *timer; + + next = timerqueue_getnext(&base->active); + if (!next) + continue; + + timer = container_of(next, struct hrtimer, node); + expires = ktime_sub(hrtimer_get_expires(timer), base->offset); + if (expires.tv64 < expires_next.tv64) + expires_next = expires; + } + /* + * clock_was_set() might have changed base->offset of any of + * the clock bases so the result might be negative. Fix it up + * to prevent a false positive in clockevents_program_event(). + */ + if (expires_next.tv64 < 0) + expires_next.tv64 = 0; + return expires_next; +} +#endif + /* High resolution timer related functions */ #ifdef CONFIG_HIGH_RES_TIMERS @@ -488,32 +519,7 @@ static inline int hrtimer_hres_active(void) static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) { - int i; - struct hrtimer_clock_base *base = cpu_base->clock_base; - ktime_t expires, expires_next; - - expires_next.tv64 = KTIME_MAX; - - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { - struct hrtimer *timer; - struct timerqueue_node *next; - - next = timerqueue_getnext(&base->active); - if (!next) - continue; - timer = container_of(next, struct hrtimer, node); - - expires = ktime_sub(hrtimer_get_expires(timer), base->offset); - /* - * clock_was_set() has changed base->offset so the - * result might be negative. 
Fix it up to prevent a - * false positive in clockevents_program_event() - */ - if (expires.tv64 < 0) - expires.tv64 = 0; - if (expires.tv64 < expires_next.tv64) - expires_next = expires; - } + ktime_t expires_next = __hrtimer_get_next_event(cpu_base); if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64) return; @@ -586,6 +592,15 @@ static int hrtimer_reprogram(struct hrtimer *timer, if (expires.tv64 >= cpu_base->expires_next.tv64) return 0; + /* + * When the target cpu of the timer is currently executing + * hrtimer_interrupt(), then we do not touch the clock event + * device. hrtimer_interrupt() will reevaluate all clock bases + * before reprogramming the device. + */ + if (cpu_base->in_hrtirq) + return 0; + /* * If a hang was detected in the last timer interrupt then we * do not schedule a timer which is earlier than the expiry @@ -1104,29 +1119,14 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining); ktime_t hrtimer_get_next_event(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); - struct hrtimer_clock_base *base = cpu_base->clock_base; - ktime_t delta, mindelta = { .tv64 = KTIME_MAX }; + ktime_t mindelta = { .tv64 = KTIME_MAX }; unsigned long flags; - int i; raw_spin_lock_irqsave(&cpu_base->lock, flags); - if (!hrtimer_hres_active()) { - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { - struct hrtimer *timer; - struct timerqueue_node *next; - - next = timerqueue_getnext(&base->active); - if (!next) - continue; - - timer = container_of(next, struct hrtimer, node); - delta.tv64 = hrtimer_get_expires_tv64(timer); - delta = ktime_sub(delta, base->get_time()); - if (delta.tv64 < mindelta.tv64) - mindelta.tv64 = delta.tv64; - } - } + if (!hrtimer_hres_active()) + mindelta = ktime_sub(__hrtimer_get_next_event(cpu_base), + ktime_get()); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); @@ -1253,7 +1253,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) raw_spin_lock(&cpu_base->lock); entry_time = now = hrtimer_update_base(cpu_base); retry: - expires_next.tv64 = KTIME_MAX; + cpu_base->in_hrtirq = 1; /* * We set expires_next to KTIME_MAX here with cpu_base->lock * held to prevent that a timer is enqueued in our queue via @@ -1291,28 +1291,20 @@ retry: * are right-of a not yet expired timer, because that * timer will have to trigger a wakeup anyway. */ - - if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) { - ktime_t expires; - - expires = ktime_sub(hrtimer_get_expires(timer), - base->offset); - if (expires.tv64 < 0) - expires.tv64 = KTIME_MAX; - if (expires.tv64 < expires_next.tv64) - expires_next = expires; + if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) break; - } __run_hrtimer(timer, &basenow); } } - + /* Reevaluate the clock bases for the next expiry */ + expires_next = __hrtimer_get_next_event(cpu_base); /* * Store the new expiry value so the migration code can verify * against it. */ cpu_base->expires_next = expires_next; + cpu_base->in_hrtirq = 0; raw_spin_unlock(&cpu_base->lock); /* Reprogramming necessary ? */ -- cgit v1.2.3 From 31928aa5863e71535ee942f506ca9ac8ce1c4315 Mon Sep 17 00:00:00 2001 From: Dominik Dingel Date: Thu, 4 Dec 2014 15:47:07 +0100 Subject: KVM: remove unneeded return value of vcpu_postcreate The return value of kvm_arch_vcpu_postcreate is not checked in its caller. This is okay, because only x86 provides vcpu_postcreate right now and it could only fail if vcpu_load failed. But that is not possible during KVM_CREATE_VCPU (kvm_arch_vcpu_load is void, too), so just get rid of the unchecked return value. 
Signed-off-by: Dominik Dingel Acked-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/arm/kvm/arm.c | 3 +-- arch/mips/kvm/mips.c | 3 +-- arch/powerpc/kvm/powerpc.c | 3 +-- arch/s390/kvm/kvm-s390.c | 3 +-- arch/x86/kvm/x86.c | 10 +++------- include/linux/kvm_host.h | 2 +- 6 files changed, 8 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 2d6d91001062..1a10e0ce9266 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -241,9 +241,8 @@ out: return ERR_PTR(err); } -int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) +void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { - return 0; } void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index e3b21e51ff7e..7082481cd108 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -832,9 +832,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, return -ENOIOCTLCMD; } -int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) +void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { - return 0; } int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index c45eaab752b0..27c0face86f4 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -623,9 +623,8 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) return vcpu; } -int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) +void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { - return 0; } void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 3e09801e3104..ec004f80ee45 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -615,9 +615,8 @@ static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu) kvm_s390_clear_local_irqs(vcpu); } -int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) +void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { - return 0; } static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 49ecda7ca958..274fbcbcc180 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7056,15 +7056,13 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) return r; } -int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) +void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { - int r; struct msr_data msr; struct kvm *kvm = vcpu->kvm; - r = vcpu_load(vcpu); - if (r) - return r; + if (vcpu_load(vcpu)) + return; msr.data = 0x0; msr.index = MSR_IA32_TSC; msr.host_initiated = true; @@ -7073,8 +7071,6 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) schedule_delayed_work(&kvm->arch.kvmclock_sync_work, KVMCLOCK_SYNC_PERIOD); - - return r; } void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 26f106022c88..a82432c710c9 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -652,7 +652,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu); struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id); int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu); -int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu); +void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu); int kvm_arch_hardware_enable(void); -- cgit v1.2.3 From 3c5199143bc4b35f472c5c2534026d74821e2044 Mon Sep 17 00:00:00 2001 From: 
Jeff Layton Date: Thu, 22 Jan 2015 08:19:32 -0500 Subject: sunrpc/lockd: fix references to the BKL The BKL is completely out of the picture in the lockd and sunrpc code these days. Update the antiquated comments that refer to it. Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/lockd/svclock.c | 4 ++-- include/linux/sunrpc/svc.h | 2 +- net/sunrpc/svc.c | 4 ++-- net/sunrpc/svc_xprt.c | 3 +-- 4 files changed, 6 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 56598742dde4..5581e020644b 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -57,8 +57,8 @@ static DEFINE_SPINLOCK(nlm_blocked_lock); static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie) { /* - * We can get away with a static buffer because we're only - * called with BKL held. + * We can get away with a static buffer because this is only called + * from lockd, which is single-threaded. */ static char buf[2*NLM_MAXCOOKIELEN+1]; unsigned int i, len = sizeof(buf); diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 6f22cfeef5e3..fae6fb947fc8 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -110,7 +110,7 @@ struct svc_serv { * We use sv_nrthreads as a reference count. svc_destroy() drops * this refcount, so we need to bump it up around operations that * change the number of threads. Horrible, but there it is. - * Should be called with the BKL held. + * Should be called with the "service mutex" held. */ static inline void svc_get(struct svc_serv *serv) { diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 91eaef1844c8..78974e4d9ad2 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -768,8 +768,8 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) EXPORT_SYMBOL_GPL(svc_set_num_threads); /* - * Called from a server thread as it's exiting. Caller must hold the BKL or - * the "service mutex", whichever is appropriate for the service. + * Called from a server thread as it's exiting. Caller must hold the "service + * mutex" for the service. */ void svc_exit_thread(struct svc_rqst *rqstp) diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index c69358b3cf7f..163ac45c3639 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -42,7 +42,7 @@ static LIST_HEAD(svc_xprt_class_list); * svc_pool->sp_lock protects most of the fields of that pool. * svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt. * when both need to be taken (rare), svc_serv->sv_lock is first. - * BKL protects svc_serv->sv_nrthread. + * The "service mutex" protects svc_serv->sv_nrthread. * svc_sock->sk_lock protects the svc_sock->sk_deferred list * and the ->sk_info_authunix cache. * @@ -67,7 +67,6 @@ static LIST_HEAD(svc_xprt_class_list); * that no other thread will be using the transport or will * try to set XPT_DEAD. */ - int svc_reg_xprt_class(struct svc_xprt_class *xcl) { struct svc_xprt_class *cl; -- cgit v1.2.3 From c5ed1df781cb544d4e4d189bb5b6ec7336d8888c Mon Sep 17 00:00:00 2001 From: Rafał Miłecki Date: Mon, 19 Jan 2015 08:30:30 +0100 Subject: bcma: use standard bus scanning during early register MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Starting with kernel 3.19-rc1 early registration of bcma on MIPS is done a bit later, with memory allocator available. This allows us to simplify code by using standard bus scanning method. 
Signed-off-by: Rafał Miłecki Signed-off-by: Kalle Valo --- drivers/bcma/bcma_private.h | 7 +---- drivers/bcma/host_soc.c | 2 +- drivers/bcma/main.c | 33 +++++++---------------- drivers/bcma/scan.c | 62 +++---------------------------------------- include/linux/bcma/bcma_soc.h | 2 -- 5 files changed, 15 insertions(+), 91 deletions(-) (limited to 'include/linux') diff --git a/drivers/bcma/bcma_private.h b/drivers/bcma/bcma_private.h index 59422b5fa46c..3f314c98d089 100644 --- a/drivers/bcma/bcma_private.h +++ b/drivers/bcma/bcma_private.h @@ -28,9 +28,7 @@ void bcma_prepare_core(struct bcma_bus *bus, struct bcma_device *core); void bcma_init_bus(struct bcma_bus *bus); int bcma_bus_register(struct bcma_bus *bus); void bcma_bus_unregister(struct bcma_bus *bus); -int __init bcma_bus_early_register(struct bcma_bus *bus, - struct bcma_device *core_cc, - struct bcma_device *core_mips); +int __init bcma_bus_early_register(struct bcma_bus *bus); #ifdef CONFIG_PM int bcma_bus_suspend(struct bcma_bus *bus); int bcma_bus_resume(struct bcma_bus *bus); @@ -39,9 +37,6 @@ int bcma_bus_resume(struct bcma_bus *bus); /* scan.c */ void bcma_detect_chip(struct bcma_bus *bus); int bcma_bus_scan(struct bcma_bus *bus); -int __init bcma_bus_scan_early(struct bcma_bus *bus, - struct bcma_device_id *match, - struct bcma_device *core); /* sprom.c */ int bcma_sprom_get(struct bcma_bus *bus); diff --git a/drivers/bcma/host_soc.c b/drivers/bcma/host_soc.c index 335cbcfd945b..2dce34789329 100644 --- a/drivers/bcma/host_soc.c +++ b/drivers/bcma/host_soc.c @@ -193,7 +193,7 @@ int __init bcma_host_soc_init(struct bcma_soc *soc) int err; /* Scan bus and initialize it */ - err = bcma_bus_early_register(bus, &soc->core_cc, &soc->core_mips); + err = bcma_bus_early_register(bus); if (err) iounmap(bus->mmio); diff --git a/drivers/bcma/main.c b/drivers/bcma/main.c index c166d444fef6..c3c5e0a2d5be 100644 --- a/drivers/bcma/main.c +++ b/drivers/bcma/main.c @@ -489,35 +489,20 @@ void bcma_bus_unregister(struct bcma_bus *bus) kfree(cores[0]); } -int __init bcma_bus_early_register(struct bcma_bus *bus, - struct bcma_device *core_cc, - struct bcma_device *core_mips) +/* + * This is a special version of bus registration function designed for SoCs. + * It scans bus and performs basic initialization of main cores only. + * Please note it requires memory allocation, however it won't try to sleep. + */ +int __init bcma_bus_early_register(struct bcma_bus *bus) { int err; struct bcma_device *core; - struct bcma_device_id match; - - match.manuf = BCMA_MANUF_BCM; - match.id = bcma_cc_core_id(bus); - match.class = BCMA_CL_SIM; - match.rev = BCMA_ANY_REV; - - /* Scan for chip common core */ - err = bcma_bus_scan_early(bus, &match, core_cc); - if (err) { - bcma_err(bus, "Failed to scan for common core: %d\n", err); - return -1; - } - - match.manuf = BCMA_MANUF_MIPS; - match.id = BCMA_CORE_MIPS_74K; - match.class = BCMA_CL_SIM; - match.rev = BCMA_ANY_REV; - /* Scan for mips core */ - err = bcma_bus_scan_early(bus, &match, core_mips); + /* Scan for devices (cores) */ + err = bcma_bus_scan(bus); if (err) { - bcma_err(bus, "Failed to scan for mips core: %d\n", err); + bcma_err(bus, "Failed to scan bus: %d\n", err); return -1; } diff --git a/drivers/bcma/scan.c b/drivers/bcma/scan.c index 5328ee5b4df0..df806b9c5490 100644 --- a/drivers/bcma/scan.c +++ b/drivers/bcma/scan.c @@ -461,6 +461,10 @@ int bcma_bus_scan(struct bcma_bus *bus) int err, core_num = 0; + /* Skip if bus was already scanned (e.g. 
during early register) */ + if (bus->nr_cores) + return 0; + erombase = bcma_scan_read32(bus, 0, BCMA_CC_EROM); if (bus->hosttype == BCMA_HOSTTYPE_SOC) { eromptr = ioremap_nocache(erombase, BCMA_CORE_SIZE); @@ -519,61 +523,3 @@ out: return err; } - -int __init bcma_bus_scan_early(struct bcma_bus *bus, - struct bcma_device_id *match, - struct bcma_device *core) -{ - u32 erombase; - u32 __iomem *eromptr, *eromend; - - int err = -ENODEV; - int core_num = 0; - - erombase = bcma_scan_read32(bus, 0, BCMA_CC_EROM); - if (bus->hosttype == BCMA_HOSTTYPE_SOC) { - eromptr = ioremap_nocache(erombase, BCMA_CORE_SIZE); - if (!eromptr) - return -ENOMEM; - } else { - eromptr = bus->mmio; - } - - eromend = eromptr + BCMA_CORE_SIZE / sizeof(u32); - - bcma_scan_switch_core(bus, erombase); - - while (eromptr < eromend) { - memset(core, 0, sizeof(*core)); - INIT_LIST_HEAD(&core->list); - core->bus = bus; - - err = bcma_get_next_core(bus, &eromptr, match, core_num, core); - if (err == -ENODEV) { - core_num++; - continue; - } else if (err == -ENXIO) - continue; - else if (err == -ESPIPE) - break; - else if (err < 0) - goto out; - - core->core_index = core_num++; - bus->nr_cores++; - bcma_info(bus, "Core %d found: %s (manuf 0x%03X, id 0x%03X, rev 0x%02X, class 0x%X)\n", - core->core_index, bcma_device_name(&core->id), - core->id.manuf, core->id.id, core->id.rev, - core->id.class); - - list_add_tail(&core->list, &bus->cores); - err = 0; - break; - } - -out: - if (bus->hosttype == BCMA_HOSTTYPE_SOC) - iounmap(eromptr); - - return err; -} diff --git a/include/linux/bcma/bcma_soc.h b/include/linux/bcma/bcma_soc.h index f24d245f8394..1b5fc0c3b1b5 100644 --- a/include/linux/bcma/bcma_soc.h +++ b/include/linux/bcma/bcma_soc.h @@ -5,8 +5,6 @@ struct bcma_soc { struct bcma_bus bus; - struct bcma_device core_cc; - struct bcma_device core_mips; }; int __init bcma_host_soc_register(struct bcma_soc *soc); -- cgit v1.2.3 From ee1b6f7aff94019c09e73837054979063f722046 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 15 Jan 2015 17:32:25 -0800 Subject: block: support different tag allocation policy The libata tag allocation is using a round-robin policy. Next patch will make libata use block generic tag allocation, so let's add a policy to tag allocation. Currently two policies: FIFO (default) and round-robin. 
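The round-robin case boils down to a wrapping bitmap search. As a simplified sketch of what blk_queue_start_tag() does for BLK_TAG_ALLOC_RR in the diff below (the committed code folds the wrap-around into a single size computation, and max_depth here stands for the caller-computed usable depth; locking and the reserved-tag offset are omitted):

    do {
            int start = bqt->next_tag;

            /* search from the last allocated tag towards the end... */
            tag = find_next_zero_bit(bqt->tag_map, max_depth, start);
            if (tag >= max_depth) {
                    /* ...and wrap around to scan the front of the map */
                    tag = find_next_zero_bit(bqt->tag_map, start, 0);
                    if (tag >= start)
                            return 1;       /* map exhausted */
            }
    } while (test_and_set_bit_lock(tag, bqt->tag_map));

    bqt->next_tag = (tag + 1) % bqt->max_depth;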
Cc: Jens Axboe Cc: Tejun Heo Cc: Christoph Hellwig Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-tag.c | 33 +++++++++++++++++++++++++-------- drivers/block/osdblk.c | 2 +- drivers/scsi/scsi_scan.c | 3 ++- include/linux/blkdev.h | 8 ++++++-- include/scsi/scsi_host.h | 3 +++ include/scsi/scsi_tcq.h | 3 ++- 6 files changed, 39 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/block/blk-tag.c b/block/blk-tag.c index a185b86741e5..f0344e6939d5 100644 --- a/block/blk-tag.c +++ b/block/blk-tag.c @@ -119,7 +119,7 @@ fail: } static struct blk_queue_tag *__blk_queue_init_tags(struct request_queue *q, - int depth) + int depth, int alloc_policy) { struct blk_queue_tag *tags; @@ -131,6 +131,8 @@ static struct blk_queue_tag *__blk_queue_init_tags(struct request_queue *q, goto fail; atomic_set(&tags->refcnt, 1); + tags->alloc_policy = alloc_policy; + tags->next_tag = 0; return tags; fail: kfree(tags); @@ -140,10 +142,11 @@ fail: /** * blk_init_tags - initialize the tag info for an external tag map * @depth: the maximum queue depth supported + * @alloc_policy: tag allocation policy **/ -struct blk_queue_tag *blk_init_tags(int depth) +struct blk_queue_tag *blk_init_tags(int depth, int alloc_policy) { - return __blk_queue_init_tags(NULL, depth); + return __blk_queue_init_tags(NULL, depth, alloc_policy); } EXPORT_SYMBOL(blk_init_tags); @@ -152,19 +155,20 @@ EXPORT_SYMBOL(blk_init_tags); * @q: the request queue for the device * @depth: the maximum queue depth supported * @tags: the tag to use + * @alloc_policy: tag allocation policy * * Queue lock must be held here if the function is called to resize an * existing map. **/ int blk_queue_init_tags(struct request_queue *q, int depth, - struct blk_queue_tag *tags) + struct blk_queue_tag *tags, int alloc_policy) { int rc; BUG_ON(tags && q->queue_tags && tags != q->queue_tags); if (!tags && !q->queue_tags) { - tags = __blk_queue_init_tags(q, depth); + tags = __blk_queue_init_tags(q, depth, alloc_policy); if (!tags) return -ENOMEM; @@ -344,9 +348,21 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq) } do { - tag = find_first_zero_bit(bqt->tag_map, max_depth); - if (tag >= max_depth) - return 1; + if (bqt->alloc_policy == BLK_TAG_ALLOC_FIFO) { + tag = find_first_zero_bit(bqt->tag_map, max_depth); + if (tag >= max_depth) + return 1; + } else { + int start = bqt->next_tag; + int size = min_t(int, bqt->max_depth, max_depth + start); + tag = find_next_zero_bit(bqt->tag_map, size, start); + if (tag >= size && start + size > bqt->max_depth) { + size = start + size - bqt->max_depth; + tag = find_first_zero_bit(bqt->tag_map, size); + } + if (tag >= size) + return 1; + } } while (test_and_set_bit_lock(tag, bqt->tag_map)); /* @@ -354,6 +370,7 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq) * See blk_queue_end_tag for details. 
*/ + bqt->next_tag = (tag + 1) % bqt->max_depth; rq->cmd_flags |= REQ_QUEUED; rq->tag = tag; bqt->tag_index[tag] = rq; diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c index 79aa179305b5..e22942596207 100644 --- a/drivers/block/osdblk.c +++ b/drivers/block/osdblk.c @@ -423,7 +423,7 @@ static int osdblk_init_disk(struct osdblk_device *osdev) } /* switch queue to TCQ mode; allocate tag map */ - rc = blk_queue_init_tags(q, OSDBLK_MAX_REQ, NULL); + rc = blk_queue_init_tags(q, OSDBLK_MAX_REQ, NULL, BLK_TAG_ALLOC_FIFO); if (rc) { blk_cleanup_queue(q); put_disk(disk); diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c index 983aed10ff2f..921a8c897eb2 100644 --- a/drivers/scsi/scsi_scan.c +++ b/drivers/scsi/scsi_scan.c @@ -290,7 +290,8 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget, if (!shost_use_blk_mq(sdev->host) && (shost->bqt || shost->hostt->use_blk_tags)) { blk_queue_init_tags(sdev->request_queue, - sdev->host->cmd_per_lun, shost->bqt); + sdev->host->cmd_per_lun, shost->bqt, + shost->hostt->tag_alloc_policy); } scsi_change_queue_depth(sdev, sdev->host->cmd_per_lun); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4c4b732d7556..6f388fd1c11c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -272,7 +272,11 @@ struct blk_queue_tag { int max_depth; /* what we will send to device */ int real_max_depth; /* what the array can hold */ atomic_t refcnt; /* map can be shared */ + int alloc_policy; /* tag allocation policy */ + int next_tag; /* next tag */ }; +#define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */ +#define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */ #define BLK_SCSI_MAX_CMDS (256) #define BLK_SCSI_CMD_PER_LONG (BLK_SCSI_MAX_CMDS / (sizeof(long) * 8)) @@ -1139,11 +1143,11 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk) extern int blk_queue_start_tag(struct request_queue *, struct request *); extern struct request *blk_queue_find_tag(struct request_queue *, int); extern void blk_queue_end_tag(struct request_queue *, struct request *); -extern int blk_queue_init_tags(struct request_queue *, int, struct blk_queue_tag *); +extern int blk_queue_init_tags(struct request_queue *, int, struct blk_queue_tag *, int); extern void blk_queue_free_tags(struct request_queue *); extern int blk_queue_resize_tags(struct request_queue *, int); extern void blk_queue_invalidate_tags(struct request_queue *); -extern struct blk_queue_tag *blk_init_tags(int); +extern struct blk_queue_tag *blk_init_tags(int, int); extern void blk_free_tags(struct blk_queue_tag *); static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt, diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index 019e66858ce6..e113c757d555 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h @@ -402,6 +402,9 @@ struct scsi_host_template { */ unsigned char present; + /* If use block layer to manage tags, this is tag allocation policy */ + int tag_alloc_policy; + /* * Let the block layer assigns tags to all commands. 
*/ diff --git a/include/scsi/scsi_tcq.h b/include/scsi/scsi_tcq.h index 9708b28bd2aa..b27977e8aaed 100644 --- a/include/scsi/scsi_tcq.h +++ b/include/scsi/scsi_tcq.h @@ -66,7 +66,8 @@ static inline int scsi_init_shared_tag_map(struct Scsi_Host *shost, int depth) * devices on the shared host (for libata) */ if (!shost->bqt) { - shost->bqt = blk_init_tags(depth); + shost->bqt = blk_init_tags(depth, + shost->hostt->tag_alloc_policy); if (!shost->bqt) return -ENOMEM; } -- cgit v1.2.3 From 24391c0dc57c3756a219defaa781e68637d6ab7d Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 23 Jan 2015 14:18:00 -0700 Subject: blk-mq: add tag allocation policy This is the blk-mq part to support tag allocation policy. The default allocation policy isn't changed (though it's not a strict FIFO). The new policy is round-robin for libata. But it's a try-best implementation. If multiple tasks are competing, the tags returned will be mixed (which is unavoidable even with !mq, as requests from different tasks can be mixed in queue) Cc: Jens Axboe Cc: Tejun Heo Cc: Christoph Hellwig Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 39 ++++++++++++++++++++++++--------------- block/blk-mq-tag.h | 4 +++- block/blk-mq.c | 3 ++- drivers/scsi/scsi_lib.c | 2 ++ include/linux/blk-mq.h | 8 ++++++++ 5 files changed, 39 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index d4daee385a23..e3387a74a9a2 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -140,7 +140,8 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, return atomic_read(&hctx->nr_active) < depth; } -static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag) +static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag, + bool nowrap) { int tag, org_last_tag = last_tag; @@ -152,7 +153,7 @@ static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag) * offset to 0 in a failure case, so start from 0 to * exhaust the map. */ - if (org_last_tag && last_tag) { + if (org_last_tag && last_tag && !nowrap) { last_tag = org_last_tag = 0; continue; } @@ -170,6 +171,8 @@ static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag) return tag; } +#define BT_ALLOC_RR(tags) (tags->alloc_policy == BLK_TAG_ALLOC_RR) + /* * Straight forward bitmap tag implementation, where each bit is a tag * (cleared == free, and set == busy). The small twist is using per-cpu @@ -182,7 +185,7 @@ static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag) * until the map is exhausted. */ static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt, - unsigned int *tag_cache) + unsigned int *tag_cache, struct blk_mq_tags *tags) { unsigned int last_tag, org_last_tag; int index, i, tag; @@ -194,7 +197,8 @@ static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt, index = TAG_TO_INDEX(bt, last_tag); for (i = 0; i < bt->map_nr; i++) { - tag = __bt_get_word(&bt->map[index], TAG_TO_BIT(bt, last_tag)); + tag = __bt_get_word(&bt->map[index], TAG_TO_BIT(bt, last_tag), + BT_ALLOC_RR(tags)); if (tag != -1) { tag += (index << bt->bits_per_word); goto done; @@ -221,7 +225,7 @@ static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt, * up using the specific cached tag. 
*/ done: - if (tag == org_last_tag) { + if (tag == org_last_tag || unlikely(BT_ALLOC_RR(tags))) { last_tag = tag + 1; if (last_tag >= bt->depth - 1) last_tag = 0; @@ -250,13 +254,13 @@ static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt, static int bt_get(struct blk_mq_alloc_data *data, struct blk_mq_bitmap_tags *bt, struct blk_mq_hw_ctx *hctx, - unsigned int *last_tag) + unsigned int *last_tag, struct blk_mq_tags *tags) { struct bt_wait_state *bs; DEFINE_WAIT(wait); int tag; - tag = __bt_get(hctx, bt, last_tag); + tag = __bt_get(hctx, bt, last_tag, tags); if (tag != -1) return tag; @@ -267,7 +271,7 @@ static int bt_get(struct blk_mq_alloc_data *data, do { prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE); - tag = __bt_get(hctx, bt, last_tag); + tag = __bt_get(hctx, bt, last_tag, tags); if (tag != -1) break; @@ -282,7 +286,7 @@ static int bt_get(struct blk_mq_alloc_data *data, * Retry tag allocation after running the hardware queue, * as running the queue may also have found completions. */ - tag = __bt_get(hctx, bt, last_tag); + tag = __bt_get(hctx, bt, last_tag, tags); if (tag != -1) break; @@ -313,7 +317,7 @@ static unsigned int __blk_mq_get_tag(struct blk_mq_alloc_data *data) int tag; tag = bt_get(data, &data->hctx->tags->bitmap_tags, data->hctx, - &data->ctx->last_tag); + &data->ctx->last_tag, data->hctx->tags); if (tag >= 0) return tag + data->hctx->tags->nr_reserved_tags; @@ -329,7 +333,8 @@ static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_alloc_data *data) return BLK_MQ_TAG_FAIL; } - tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL, &zero); + tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL, &zero, + data->hctx->tags); if (tag < 0) return BLK_MQ_TAG_FAIL; @@ -401,7 +406,8 @@ void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, BUG_ON(real_tag >= tags->nr_tags); bt_clear_tag(&tags->bitmap_tags, real_tag); - *last_tag = real_tag; + if (likely(tags->alloc_policy == BLK_TAG_ALLOC_FIFO)) + *last_tag = real_tag; } else { BUG_ON(tag >= tags->nr_reserved_tags); bt_clear_tag(&tags->breserved_tags, tag); @@ -538,10 +544,12 @@ static void bt_free(struct blk_mq_bitmap_tags *bt) } static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, - int node) + int node, int alloc_policy) { unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; + tags->alloc_policy = alloc_policy; + if (bt_alloc(&tags->bitmap_tags, depth, node, false)) goto enomem; if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node, true)) @@ -555,7 +563,8 @@ enomem: } struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, - unsigned int reserved_tags, int node) + unsigned int reserved_tags, + int node, int alloc_policy) { struct blk_mq_tags *tags; @@ -571,7 +580,7 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, tags->nr_tags = total_tags; tags->nr_reserved_tags = reserved_tags; - return blk_mq_init_bitmap_tags(tags, node); + return blk_mq_init_bitmap_tags(tags, node, alloc_policy); } void blk_mq_free_tags(struct blk_mq_tags *tags) diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index a6fa0fc9d41a..90767b370308 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -42,10 +42,12 @@ struct blk_mq_tags { struct request **rqs; struct list_head page_list; + + int alloc_policy; }; -extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node); +extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node, int 
alloc_policy); extern void blk_mq_free_tags(struct blk_mq_tags *tags); extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); diff --git a/block/blk-mq.c b/block/blk-mq.c index a7d4a988516f..eb8e694fda06 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1374,7 +1374,8 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, size_t rq_size, left; tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags, - set->numa_node); + set->numa_node, + BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags)); if (!tags) return NULL; diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 9ea95dd3e260..49ab11508286 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -2188,6 +2188,8 @@ int scsi_mq_setup_tags(struct Scsi_Host *shost) shost->tag_set.cmd_size = cmd_size; shost->tag_set.numa_node = NUMA_NO_NODE; shost->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; + shost->tag_set.flags |= + BLK_ALLOC_POLICY_TO_MQ_FLAG(shost->hostt->tag_alloc_policy); shost->tag_set.driver_data = shost; return blk_mq_alloc_tag_set(&shost->tag_set); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 5b6500c77ed2..86b08b1a5eba 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -147,6 +147,8 @@ enum { BLK_MQ_F_SG_MERGE = 1 << 2, BLK_MQ_F_SYSFS_UP = 1 << 3, BLK_MQ_F_DEFER_ISSUE = 1 << 4, + BLK_MQ_F_ALLOC_POLICY_START_BIT = 8, + BLK_MQ_F_ALLOC_POLICY_BITS = 1, BLK_MQ_S_STOPPED = 0, BLK_MQ_S_TAG_ACTIVE = 1, @@ -155,6 +157,12 @@ enum { BLK_MQ_CPU_WORK_BATCH = 8, }; +#define BLK_MQ_FLAG_TO_ALLOC_POLICY(flags) \ + ((flags >> BLK_MQ_F_ALLOC_POLICY_START_BIT) & \ + ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) +#define BLK_ALLOC_POLICY_TO_MQ_FLAG(policy) \ + ((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \ + << BLK_MQ_F_ALLOC_POLICY_START_BIT) struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); void blk_mq_finish_init(struct request_queue *q); -- cgit v1.2.3 From 12cb5ce101abfaf74421f8cc9f196e708209eb79 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 15 Jan 2015 17:32:27 -0800 Subject: libata: use blk tagging libata uses its own tag management, which is duplicated effort and poorly implemented. If we switch to blk-mq, tagging is built-in. It's time to switch to generic tagging. The SAS driver has its own tag management, and it looks like we can't directly map the host controller tag to the SATA tag, so I just bypassed the SAS case. I changed the code/variable names for libata's tag management to make it self-contained; only SAS will use it. Later, if libsas implements its own tag management, the tag management code in libata can be deleted easily.
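For reference, the policy value this series plumbs through round-trips via the two macros in the blk-mq.h hunk above. A small compilable user-space model (macro bodies copied from the hunk; the BLK_TAG_ALLOC_* values are assumed to match the block layer's enum):

#include <stdio.h>

/* Values assumed to mirror the kernel's layout shown above. */
#define BLK_MQ_F_ALLOC_POLICY_START_BIT 8
#define BLK_MQ_F_ALLOC_POLICY_BITS      1
#define BLK_TAG_ALLOC_FIFO              0
#define BLK_TAG_ALLOC_RR                1

#define BLK_MQ_FLAG_TO_ALLOC_POLICY(flags) \
	(((flags) >> BLK_MQ_F_ALLOC_POLICY_START_BIT) & \
	 ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1))
#define BLK_ALLOC_POLICY_TO_MQ_FLAG(policy) \
	(((policy) & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \
	 << BLK_MQ_F_ALLOC_POLICY_START_BIT)

int main(void)
{
	unsigned int flags = BLK_ALLOC_POLICY_TO_MQ_FLAG(BLK_TAG_ALLOC_RR);

	/* Prints "policy=1": round-robin survives the round trip. */
	printf("policy=%d\n", BLK_MQ_FLAG_TO_ALLOC_POLICY(flags));
	return 0;
}

Packing the policy into high flag bits keeps the existing flag values untouched and leaves room to widen BLK_MQ_F_ALLOC_POLICY_BITS if more policies appear later.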
Cc: Jens Axboe Cc: Christoph Hellwig Signed-off-by: Shaohua Li Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- drivers/ata/libata-core.c | 54 +++++++++++++++++++++++++++++++++++------------ drivers/ata/libata-scsi.c | 4 +++- drivers/ata/libata.h | 2 +- include/linux/libata.h | 5 +++-- 4 files changed, 48 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 5c84fb5c3372..695d33df3df5 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -1525,6 +1525,15 @@ static void ata_qc_complete_internal(struct ata_queued_cmd *qc) complete(waiting); } +static bool ata_valid_internal_tag(struct ata_port *ap, struct ata_device *dev, + unsigned int tag) +{ + if (!ap->scsi_host) + return !test_and_set_bit(tag, &ap->sas_tag_allocated); + return !dev->sdev || + !blk_queue_find_tag(dev->sdev->request_queue, tag); +} + /** * ata_exec_internal_sg - execute libata internal command * @dev: Device to which the command is sent @@ -1585,8 +1594,7 @@ unsigned ata_exec_internal_sg(struct ata_device *dev, else tag = 0; - if (test_and_set_bit(tag, &ap->qc_allocated)) - BUG(); + BUG_ON(!ata_valid_internal_tag(ap, dev, tag)); qc = __ata_qc_from_tag(ap, tag); qc->tag = tag; @@ -4737,27 +4745,23 @@ void swap_buf_le16(u16 *buf, unsigned int buf_words) * None. */ -static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap) +static struct ata_queued_cmd *sas_ata_qc_new(struct ata_port *ap) { struct ata_queued_cmd *qc = NULL; unsigned int max_queue = ap->host->n_tags; unsigned int i, tag; - /* no command while frozen */ - if (unlikely(ap->pflags & ATA_PFLAG_FROZEN)) - return NULL; - - for (i = 0, tag = ap->last_tag + 1; i < max_queue; i++, tag++) { + for (i = 0, tag = ap->sas_last_tag + 1; i < max_queue; i++, tag++) { tag = tag < max_queue ? tag : 0; /* the last tag is reserved for internal command. */ if (tag == ATA_TAG_INTERNAL) continue; - if (!test_and_set_bit(tag, &ap->qc_allocated)) { + if (!test_and_set_bit(tag, &ap->sas_tag_allocated)) { qc = __ata_qc_from_tag(ap, tag); qc->tag = tag; - ap->last_tag = tag; + ap->sas_last_tag = tag; break; } } @@ -4765,6 +4769,24 @@ static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap) return qc; } +static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap, int blktag) +{ + struct ata_queued_cmd *qc; + + /* no command while frozen */ + if (unlikely(ap->pflags & ATA_PFLAG_FROZEN)) + return NULL; + + /* SATA will directly use block tag. libsas need its own tag management */ + if (ap->scsi_host) { + qc = __ata_qc_from_tag(ap, blktag); + qc->tag = blktag; + return qc; + } + + return sas_ata_qc_new(ap); +} + /** * ata_qc_new_init - Request an available ATA command, and initialize it * @dev: Device from whom we request an available command structure @@ -4773,12 +4795,12 @@ static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap) * None. 
*/ -struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev) +struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev, int blktag) { struct ata_port *ap = dev->link->ap; struct ata_queued_cmd *qc; - qc = ata_qc_new(ap); + qc = ata_qc_new(ap, blktag); if (qc) { qc->scsicmd = NULL; qc->ap = ap; @@ -4800,6 +4822,12 @@ struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev) * LOCKING: * spin_lock_irqsave(host lock) */ +static void sas_ata_qc_free(unsigned int tag, struct ata_port *ap) +{ + if (!ap->scsi_host) + clear_bit(tag, &ap->sas_tag_allocated); +} + void ata_qc_free(struct ata_queued_cmd *qc) { struct ata_port *ap; @@ -4812,7 +4840,7 @@ void ata_qc_free(struct ata_queued_cmd *qc) tag = qc->tag; if (likely(ata_tag_valid(tag))) { qc->tag = ATA_TAG_POISON; - clear_bit(tag, &ap->qc_allocated); + sas_ata_qc_free(tag, ap); } } diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index e364e86e84d7..94339c2aed1b 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -756,7 +756,7 @@ static struct ata_queued_cmd *ata_scsi_qc_new(struct ata_device *dev, { struct ata_queued_cmd *qc; - qc = ata_qc_new_init(dev); + qc = ata_qc_new_init(dev, cmd->request->tag); if (qc) { qc->scsicmd = cmd; qc->scsidone = cmd->scsi_done; @@ -3666,6 +3666,8 @@ int ata_scsi_add_hosts(struct ata_host *host, struct scsi_host_template *sht) */ shost->max_host_blocked = 1; + scsi_init_shared_tag_map(shost, host->n_tags); + rc = scsi_add_host_with_dma(ap->scsi_host, &ap->tdev, ap->host->dev); if (rc) diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h index 5f4e0cca56ec..40405135bbb6 100644 --- a/drivers/ata/libata.h +++ b/drivers/ata/libata.h @@ -63,7 +63,7 @@ extern struct ata_link *ata_dev_phys_link(struct ata_device *dev); extern void ata_force_cbl(struct ata_port *ap); extern u64 ata_tf_to_lba(const struct ata_taskfile *tf); extern u64 ata_tf_to_lba48(const struct ata_taskfile *tf); -extern struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev); +extern struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev, int tag); extern int ata_build_rw_tf(struct ata_taskfile *tf, struct ata_device *dev, u64 block, u32 n_block, unsigned int tf_flags, unsigned int tag); diff --git a/include/linux/libata.h b/include/linux/libata.h index 2d182413b1db..f23454762717 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -821,10 +821,10 @@ struct ata_port { unsigned int cbl; /* cable type; ATA_CBL_xxx */ struct ata_queued_cmd qcmd[ATA_MAX_QUEUE]; - unsigned long qc_allocated; + unsigned long sas_tag_allocated; /* for sas tag allocation only */ unsigned int qc_active; int nr_active_links; /* #links with active qcs */ - unsigned int last_tag; /* track next tag hw expects */ + unsigned int sas_last_tag; /* track next tag hw expects */ struct ata_link link; /* host default link */ struct ata_link *slave_link; /* see ata_slave_link_init() */ @@ -1344,6 +1344,7 @@ extern struct device_attribute *ata_common_sdev_attrs[]; .ioctl = ata_scsi_ioctl, \ .queuecommand = ata_scsi_queuecmd, \ .can_queue = ATA_DEF_QUEUE, \ + .tag_alloc_policy = BLK_TAG_ALLOC_RR, \ .this_id = ATA_SHT_THIS_ID, \ .cmd_per_lun = ATA_SHT_CMD_PER_LUN, \ .emulated = ATA_SHT_EMULATED, \ -- cgit v1.2.3 From a9aaf2915ee265735c28b764551d084e61a694e0 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 13 Jan 2015 11:34:00 +0530 Subject: cpufreq: stats: get rid of per-cpu cpufreq_stats_table All CPUs sharing a cpufreq policy share stats too. 
For this reason, add a stats pointer to struct cpufreq_policy and drop per-CPU variable cpufreq_stats_table used for accessing cpufreq stats so as to reduce code complexity. Reviewed-by: Prarit Bhargava Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq_stats.c | 62 +++++++++++++++++++---------------------- include/linux/cpufreq.h | 3 ++ 2 files changed, 32 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index 6c234f548601..3792b2e2f4a8 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -31,8 +31,6 @@ struct cpufreq_stats { #endif }; -static DEFINE_PER_CPU(struct cpufreq_stats *, cpufreq_stats_table); - static int cpufreq_stats_update(struct cpufreq_stats *stat) { unsigned long long cur_time = get_jiffies_64(); @@ -48,20 +46,15 @@ static int cpufreq_stats_update(struct cpufreq_stats *stat) static ssize_t show_total_trans(struct cpufreq_policy *policy, char *buf) { - struct cpufreq_stats *stat = per_cpu(cpufreq_stats_table, policy->cpu); - if (!stat) - return 0; - return sprintf(buf, "%d\n", - per_cpu(cpufreq_stats_table, stat->cpu)->total_trans); + return sprintf(buf, "%d\n", policy->stats->total_trans); } static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf) { + struct cpufreq_stats *stat = policy->stats; ssize_t len = 0; int i; - struct cpufreq_stats *stat = per_cpu(cpufreq_stats_table, policy->cpu); - if (!stat) - return 0; + cpufreq_stats_update(stat); for (i = 0; i < stat->state_num; i++) { len += sprintf(buf + len, "%u %llu\n", stat->freq_table[i], @@ -74,12 +67,10 @@ static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf) #ifdef CONFIG_CPU_FREQ_STAT_DETAILS static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf) { + struct cpufreq_stats *stat = policy->stats; ssize_t len = 0; int i, j; - struct cpufreq_stats *stat = per_cpu(cpufreq_stats_table, policy->cpu); - if (!stat) - return 0; cpufreq_stats_update(stat); len += snprintf(buf + len, PAGE_SIZE - len, " From : To\n"); len += snprintf(buf + len, PAGE_SIZE - len, " : "); @@ -145,8 +136,9 @@ static int freq_table_get_index(struct cpufreq_stats *stat, unsigned int freq) static void __cpufreq_stats_free_table(struct cpufreq_policy *policy) { - struct cpufreq_stats *stat = per_cpu(cpufreq_stats_table, policy->cpu); + struct cpufreq_stats *stat = policy->stats; + /* Already freed */ if (!stat) return; @@ -155,7 +147,7 @@ static void __cpufreq_stats_free_table(struct cpufreq_policy *policy) sysfs_remove_group(&policy->kobj, &stats_attr_group); kfree(stat->time_in_state); kfree(stat); - per_cpu(cpufreq_stats_table, policy->cpu) = NULL; + policy->stats = NULL; } static void cpufreq_stats_free_table(unsigned int cpu) @@ -184,7 +176,7 @@ static int __cpufreq_stats_create_table(struct cpufreq_policy *policy) return 0; /* stats already initialized */ - if (per_cpu(cpufreq_stats_table, cpu)) + if (policy->stats) return -EEXIST; stat = kzalloc(sizeof(*stat), GFP_KERNEL); @@ -196,7 +188,7 @@ static int __cpufreq_stats_create_table(struct cpufreq_policy *policy) goto error_out; stat->cpu = cpu; - per_cpu(cpufreq_stats_table, cpu) = stat; + policy->stats = stat; cpufreq_for_each_valid_entry(pos, table) count++; @@ -231,7 +223,7 @@ error_alloc: sysfs_remove_group(&policy->kobj, &stats_attr_group); error_out: kfree(stat); - per_cpu(cpufreq_stats_table, cpu) = NULL; + policy->stats = NULL; return ret; } @@ -254,15 +246,7 @@ static 
void cpufreq_stats_create_table(unsigned int cpu) static void cpufreq_stats_update_policy_cpu(struct cpufreq_policy *policy) { - struct cpufreq_stats *stat = per_cpu(cpufreq_stats_table, - policy->last_cpu); - - pr_debug("Updating stats_table for new_cpu %u from last_cpu %u\n", - policy->cpu, policy->last_cpu); - per_cpu(cpufreq_stats_table, policy->cpu) = per_cpu(cpufreq_stats_table, - policy->last_cpu); - per_cpu(cpufreq_stats_table, policy->last_cpu) = NULL; - stat->cpu = policy->cpu; + policy->stats->cpu = policy->cpu; } static int cpufreq_stat_notifier_policy(struct notifier_block *nb, @@ -288,27 +272,36 @@ static int cpufreq_stat_notifier_trans(struct notifier_block *nb, unsigned long val, void *data) { struct cpufreq_freqs *freq = data; + struct cpufreq_policy *policy = cpufreq_cpu_get(freq->cpu); struct cpufreq_stats *stat; int old_index, new_index; - if (val != CPUFREQ_POSTCHANGE) + if (!policy) { + pr_err("%s: No policy found\n", __func__); return 0; + } - stat = per_cpu(cpufreq_stats_table, freq->cpu); - if (!stat) - return 0; + if (val != CPUFREQ_POSTCHANGE) + goto put_policy; + + if (!policy->stats) { + pr_debug("%s: No stats found\n", __func__); + goto put_policy; + } + + stat = policy->stats; old_index = stat->last_index; new_index = freq_table_get_index(stat, freq->new); /* We can't do stat->time_in_state[-1]= .. */ if (old_index == -1 || new_index == -1) - return 0; + goto put_policy; cpufreq_stats_update(stat); if (old_index == new_index) - return 0; + goto put_policy; spin_lock(&cpufreq_stats_lock); stat->last_index = new_index; @@ -317,6 +310,9 @@ static int cpufreq_stat_notifier_trans(struct notifier_block *nb, #endif stat->total_trans++; spin_unlock(&cpufreq_stats_lock); + +put_policy: + cpufreq_cpu_put(policy); return 0; } diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 4d078cebafd2..60b7b496565d 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -113,6 +113,9 @@ struct cpufreq_policy { wait_queue_head_t transition_wait; struct task_struct *transition_task; /* Task which is doing the transition */ + /* cpufreq-stats */ + struct cpufreq_stats *stats; + /* For cpufreq driver's internal use */ void *driver_data; }; -- cgit v1.2.3 From 7c418ff099110d987846c8c670479a3b90ed1dcb Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 6 Jan 2015 21:09:08 +0530 Subject: cpufreq: Remove (now) unused 'last_cpu' from struct cpufreq_policy 'last_cpu' was used only from cpufreq-stats and isn't used anymore. Get rid of it. Reviewed-by: Prarit Bhargava Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/cpufreq.c | 3 --- include/linux/cpufreq.h | 2 -- 2 files changed, 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 70b568a253e9..60ef37d569c5 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1086,10 +1086,7 @@ static int update_policy_cpu(struct cpufreq_policy *policy, unsigned int cpu, } down_write(&policy->rwsem); - - policy->last_cpu = policy->cpu; policy->cpu = cpu; - up_write(&policy->rwsem); blocking_notifier_call_chain(&cpufreq_policy_notifier_list, diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 60b7b496565d..7e1a389b4e92 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -66,8 +66,6 @@ struct cpufreq_policy { unsigned int shared_type; /* ACPI: ANY or ALL affected CPUs should set cpufreq */ unsigned int cpu; /* cpu nr of CPU managing this policy */ - unsigned int last_cpu; /* cpu nr of previous CPU that managed - * this policy */ struct clk *clk; struct cpufreq_cpuinfo cpuinfo;/* see above */ -- cgit v1.2.3 From d9f354460db8b58a8395936d323b4ca6e8428b9d Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 6 Jan 2015 21:09:10 +0530 Subject: cpufreq: remove CPUFREQ_UPDATE_POLICY_CPU notifications CPUFREQ_UPDATE_POLICY_CPU notifications were used only by cpufreq-stats, which doesn't use them anymore. Remove them. This also decrements the values of the notification macros defined after CPUFREQ_UPDATE_POLICY_CPU by 1 to remove gaps. Hopefully all users are using the macros instead of raw numbers, so they won't break now that the macro values have changed. Reviewed-by: Prarit Bhargava Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 3 --- include/linux/cpufreq.h | 5 ++--- 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 60ef37d569c5..ca69f42b8e1e 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1089,9 +1089,6 @@ static int update_policy_cpu(struct cpufreq_policy *policy, unsigned int cpu, policy->cpu = cpu; up_write(&policy->rwsem); - blocking_notifier_call_chain(&cpufreq_policy_notifier_list, - CPUFREQ_UPDATE_POLICY_CPU, policy); - return 0; } diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 7e1a389b4e92..2ee4888c1f47 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -368,9 +368,8 @@ static inline void cpufreq_resume(void) {} #define CPUFREQ_INCOMPATIBLE (1) #define CPUFREQ_NOTIFY (2) #define CPUFREQ_START (3) -#define CPUFREQ_UPDATE_POLICY_CPU (4) -#define CPUFREQ_CREATE_POLICY (5) -#define CPUFREQ_REMOVE_POLICY (6) +#define CPUFREQ_CREATE_POLICY (4) +#define CPUFREQ_REMOVE_POLICY (5) #ifdef CONFIG_CPU_FREQ int cpufreq_register_notifier(struct notifier_block *nb, unsigned int list); -- cgit v1.2.3 From 382548a62ade2c003c77a1055b6eb2a47ce30084 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 20 Jan 2015 11:33:09 +0100 Subject: PM / Domains: Remove pm_genpd_dev_need_restore() API There are currently no users of this API, so let's remove it. Additionally, should such a feature be needed in the future, a better option would likely be to use pm_runtime_set_active|suspended() in some form. Signed-off-by: Ulf Hansson Acked-by: Geert Uytterhoeven Acked-by: Kevin Hilman Signed-off-by: Rafael J.
Wysocki --- drivers/base/power/domain.c | 20 -------------------- include/linux/pm_domain.h | 2 -- 2 files changed, 22 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 0d8780c04a5e..c5280f2b798b 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -1558,26 +1558,6 @@ int pm_genpd_remove_device(struct generic_pm_domain *genpd, return ret; } -/** - * pm_genpd_dev_need_restore - Set/unset the device's "need restore" flag. - * @dev: Device to set/unset the flag for. - * @val: The new value of the device's "need restore" flag. - */ -void pm_genpd_dev_need_restore(struct device *dev, bool val) -{ - struct pm_subsys_data *psd; - unsigned long flags; - - spin_lock_irqsave(&dev->power.lock, flags); - - psd = dev_to_psd(dev); - if (psd && psd->domain_data) - to_gpd_data(psd->domain_data)->need_restore = val ? 1 : 0; - - spin_unlock_irqrestore(&dev->power.lock, flags); -} -EXPORT_SYMBOL_GPL(pm_genpd_dev_need_restore); - /** * pm_genpd_add_subdomain - Add a subdomain to an I/O PM domain. * @genpd: Master PM domain to add the subdomain to. diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index a9edab2c787a..ed607760fc20 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -140,7 +140,6 @@ extern int __pm_genpd_name_add_device(const char *domain_name, extern int pm_genpd_remove_device(struct generic_pm_domain *genpd, struct device *dev); -extern void pm_genpd_dev_need_restore(struct device *dev, bool val); extern int pm_genpd_add_subdomain(struct generic_pm_domain *genpd, struct generic_pm_domain *new_subdomain); extern int pm_genpd_add_subdomain_names(const char *master_name, @@ -187,7 +186,6 @@ static inline int pm_genpd_remove_device(struct generic_pm_domain *genpd, { return -ENOSYS; } -static inline void pm_genpd_dev_need_restore(struct device *dev, bool val) {} static inline int pm_genpd_add_subdomain(struct generic_pm_domain *genpd, struct generic_pm_domain *new_sd) { -- cgit v1.2.3 From 566fcec60b7458784d4ed9bca974c5a56dacf214 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 23 Jan 2015 15:32:46 -0500 Subject: NFSv4: Fix an atomicity problem in CLOSE If we are to remove the serialisation of OPEN/CLOSE, then we need to ensure that the stateid sent as part of a CLOSE operation does not change after we test the state in nfs4_close_prepare. 
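Reduced to its essentials, the fix is a snapshot-then-compare pattern (paraphrased from the hunks that follow; locking and the remaining error plumbing are elided):

	/* nfs4_close_prepare(): snapshot the stateid this CLOSE will send. */
	nfs4_stateid_copy(&calldata->arg.stateid, &state->stateid);

	/* nfs4_close_done(), on BAD/OLD/EXPIRED stateid errors: if the
	 * state's stateid moved on while the CLOSE was in flight, resend
	 * with the fresh value rather than failing the close. */
	if (!nfs4_stateid_match(&calldata->arg.stateid, &state->stateid)) {
		rpc_restart_call_prepare(task);
		goto out_release;
	}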
Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 7 ++++++- fs/nfs/nfs4xdr.c | 4 ++-- include/linux/nfs_xdr.h | 2 +- 3 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index c347705b0161..4863dec10865 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2587,6 +2587,11 @@ static void nfs4_close_done(struct rpc_task *task, void *data) case -NFS4ERR_OLD_STATEID: case -NFS4ERR_BAD_STATEID: case -NFS4ERR_EXPIRED: + if (!nfs4_stateid_match(&calldata->arg.stateid, + &state->stateid)) { + rpc_restart_call_prepare(task); + goto out_release; + } if (calldata->arg.fmode == 0) break; default: @@ -2619,6 +2624,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags); is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags); is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags); + nfs4_stateid_copy(&calldata->arg.stateid, &state->stateid); /* Calculate the change in open mode */ calldata->arg.fmode = 0; if (state->n_rdwr == 0) { @@ -2757,7 +2763,6 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait) calldata->inode = state->inode; calldata->state = state; calldata->arg.fh = NFS_FH(state->inode); - calldata->arg.stateid = &state->open_stateid; /* Serialization for the sequence id */ calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid, gfp_mask); if (calldata->arg.seqid == NULL) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index cb4376b78ed9..7e7be5ab70bb 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1125,7 +1125,7 @@ static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg { encode_op_hdr(xdr, OP_CLOSE, decode_close_maxsz, hdr); encode_nfs4_seqid(xdr, arg->seqid); - encode_nfs4_stateid(xdr, arg->stateid); + encode_nfs4_stateid(xdr, &arg->stateid); } static void encode_commit(struct xdr_stream *xdr, const struct nfs_commitargs *args, struct compound_hdr *hdr) @@ -1530,7 +1530,7 @@ static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_co static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr) { encode_op_hdr(xdr, OP_OPEN_DOWNGRADE, decode_open_downgrade_maxsz, hdr); - encode_nfs4_stateid(xdr, arg->stateid); + encode_nfs4_stateid(xdr, &arg->stateid); encode_nfs4_seqid(xdr, arg->seqid); encode_share_access(xdr, arg->fmode); } diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 467c84efb596..7e38d641236e 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -389,7 +389,7 @@ struct nfs_open_confirmres { struct nfs_closeargs { struct nfs4_sequence_args seq_args; struct nfs_fh * fh; - nfs4_stateid * stateid; + nfs4_stateid stateid; struct nfs_seqid * seqid; fmode_t fmode; const u32 * bitmask; -- cgit v1.2.3 From 8b618628b2bf83512fc8df5e8672619d65adfdfb Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Wed, 3 Dec 2014 14:43:06 -0500 Subject: ktime: Optimize ktime_divns for constant divisors At least on ARM, do_div() is optimized to turn constant divisors into an inline multiplication by the reciprocal value at compile time. However this optimization is missed entirely whenever ktime_divns() is used and the slow out-of-line division code is used all the time. Let ktime_divns() use do_div() inline whenever the divisor is constant and small enough. This will make things like ktime_to_us() and ktime_to_ms() much faster. 
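A rough user-space model of the dispatch the new inline helper performs (ktime_divns_model() and slow_divns() are stand-in names invented for illustration; on 32-bit ARM the kernel's do_div() strength-reduces the constant divide into a reciprocal multiply):

#include <stdint.h>
#include <stdio.h>

/* Stand-in for the out-of-line slow path (__ktime_divns in the patch). */
static uint64_t slow_divns(uint64_t ns, int64_t div)
{
	return ns / div;	/* full 64/64 division on 32-bit targets */
}

/* Mirrors the new inline ktime_divns(): constant divisors that fit in
 * 32 bits take the fast path the compiler can optimize at build time. */
#define ktime_divns_model(ns, div)				\
	(__builtin_constant_p(div) && !((int64_t)(div) >> 32) ?	\
		(uint64_t)(ns) / (uint32_t)(div) :		\
		slow_divns((ns), (div)))

int main(void)
{
	uint64_t kt = 1234567890123ULL;	/* nanoseconds */

	/* div is a literal, so the fast arm is chosen at compile time,
	 * exactly as in ktime_to_ms(). Prints "1234567 ms". */
	printf("%llu ms\n",
	       (unsigned long long)ktime_divns_model(kt, 1000000));
	return 0;
}

The divisors at the ktime_to_us()/ktime_to_ms() call sites are compile-time constants, so the out-of-line 64-by-64 division disappears entirely from those paths.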
Cc: Arnd Bergmann Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Nicolas Pitre Acked-by: Arnd Bergmann Signed-off-by: Nicolas Pitre Signed-off-by: John Stultz --- include/linux/ktime.h | 12 +++++++++++- kernel/time/hrtimer.c | 4 ++-- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ktime.h b/include/linux/ktime.h index c9d645ad98ff..411dd8bfe539 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -166,7 +166,17 @@ static inline bool ktime_before(const ktime_t cmp1, const ktime_t cmp2) } #if BITS_PER_LONG < 64 -extern u64 ktime_divns(const ktime_t kt, s64 div); +extern u64 __ktime_divns(const ktime_t kt, s64 div); +static inline u64 ktime_divns(const ktime_t kt, s64 div) +{ + if (__builtin_constant_p(div) && !(div >> 32)) { + u64 ns = kt.tv64; + do_div(ns, div); + return ns; + } else { + return __ktime_divns(kt, div); + } +} #else /* BITS_PER_LONG < 64 */ # define ktime_divns(kt, div) (u64)((kt).tv64 / (div)) #endif diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 37e50aadd471..890535c41c2d 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -266,7 +266,7 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) /* * Divide a ktime value by a nanosecond value */ -u64 ktime_divns(const ktime_t kt, s64 div) +u64 __ktime_divns(const ktime_t kt, s64 div) { u64 dclc; int sft = 0; @@ -282,7 +282,7 @@ u64 ktime_divns(const ktime_t kt, s64 div) return dclc; } -EXPORT_SYMBOL_GPL(ktime_divns); +EXPORT_SYMBOL_GPL(__ktime_divns); #endif /* BITS_PER_LONG >= 64 */ /* -- cgit v1.2.3 From d08c0cdd26d48751c15aa2b4479a410594fee9ac Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 8 Dec 2014 12:00:09 -0800 Subject: time: Expose getboottime64 for in-kernel uses Adds a timespec64 based getboottime64() implementation that can be used as we convert internal users of getboottime away from using timespecs. Cc: pang.xunlei Cc: Arnd Bergmann Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: John Stultz --- include/linux/timekeeping.h | 16 ++++++++++++++-- kernel/time/timekeeping.c | 12 ++++++------ 2 files changed, 20 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 9b63d13ba82b..91480137aa39 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -33,6 +33,7 @@ extern time64_t ktime_get_real_seconds(void); extern int __getnstimeofday64(struct timespec64 *tv); extern void getnstimeofday64(struct timespec64 *tv); +extern void getboottime64(struct timespec64 *ts); #if BITS_PER_LONG == 64 /** @@ -72,6 +73,11 @@ static inline struct timespec get_monotonic_coarse(void) { return get_monotonic_coarse64(); } + +static inline void getboottime(struct timespec *ts) +{ + return getboottime64(ts); +} #else /** * Deprecated. Use do_settimeofday64(). 
@@ -129,9 +135,15 @@ static inline struct timespec get_monotonic_coarse(void) { return timespec64_to_timespec(get_monotonic_coarse64()); } -#endif -extern void getboottime(struct timespec *ts); +static inline void getboottime(struct timespec *ts) +{ + struct timespec64 ts64; + + getboottime64(&ts64); + *ts = timespec64_to_timespec(ts64); +} +#endif #define do_posix_clock_monotonic_gettime(ts) ktime_get_ts(ts) #define ktime_get_real_ts64(ts) getnstimeofday64(ts) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6a931852082f..b124af259800 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1659,24 +1659,24 @@ out: } /** - * getboottime - Return the real time of system boot. - * @ts: pointer to the timespec to be set + * getboottime64 - Return the real time of system boot. + * @ts: pointer to the timespec64 to be set * - * Returns the wall-time of boot in a timespec. + * Returns the wall-time of boot in a timespec64. * * This is based on the wall_to_monotonic offset and the total suspend * time. Calls to settimeofday will affect the value returned (which * basically means that however wrong your real time clock is at boot time, * you get the right time here). */ -void getboottime(struct timespec *ts) +void getboottime64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot); - *ts = ktime_to_timespec(t); + *ts = ktime_to_timespec64(t); } -EXPORT_SYMBOL_GPL(getboottime); +EXPORT_SYMBOL_GPL(getboottime64); unsigned long get_seconds(void) { -- cgit v1.2.3 From 2e0c78ee5ba4d777ecf22c8f40cc968b4308ca88 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 18 Dec 2014 18:04:34 -0800 Subject: time: Expose get_monotonic_boottime64 for in-kernel use As part of the 2038 conversion process, add a get_monotonic_boottime64 accessor so we can deprecate get_monotonic_boottime. Cc: pang.xunlei Cc: Arnd Bergmann Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: John Stultz --- include/linux/timekeeping.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 91480137aa39..3eaae4754275 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -229,6 +229,11 @@ static inline void get_monotonic_boottime(struct timespec *ts) *ts = ktime_to_timespec(ktime_get_boottime()); } +static inline void get_monotonic_boottime64(struct timespec64 *ts) +{ + *ts = ktime_to_timespec64(ktime_get_boottime()); +} + static inline void timekeeping_clocktai(struct timespec *ts) { *ts = ktime_to_timespec(ktime_get_clocktai()); -- cgit v1.2.3 From 9a4a445e30f0b601ca2d9433274047cbf48ebf9e Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Thu, 22 Jan 2015 02:31:55 +0000 Subject: rtc: Convert rtc_set_ntp_time() to use timespec64 rtc_set_ntp_time() uses timespec, which is y2038-unsafe, so modify it to use the y2038-safe timespec64 and replace rtc_time_to_tm() with rtc_time64_to_tm(). Also adjust all of its call sites (only NTP uses it) accordingly.
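The round-to-nearest-second rule that systohc.c keeps across the conversion can be sketched standalone (timespec64_model and rtc_target_sec are illustrative names; NSEC_PER_SEC is assumed to match the kernel's value):

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000L

struct timespec64_model { int64_t tv_sec; long tv_nsec; };

/* Round to the nearest second, as rtc_set_ntp_time() does before
 * converting with rtc_time64_to_tm(). */
static int64_t rtc_target_sec(struct timespec64_model now)
{
	return (now.tv_nsec < (NSEC_PER_SEC >> 1)) ? now.tv_sec
						   : now.tv_sec + 1;
}

int main(void)
{
	struct timespec64_model t = { .tv_sec = 100, .tv_nsec = 600000000 };

	/* 0.6s past the second rounds up: prints 101. */
	printf("%lld\n", (long long)rtc_target_sec(t));
	return 0;
}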
Cc: pang.xunlei Cc: Arnd Bergmann Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Xunlei Pang Signed-off-by: John Stultz --- drivers/rtc/systohc.c | 6 +++--- include/linux/rtc.h | 2 +- kernel/time/ntp.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/rtc/systohc.c b/drivers/rtc/systohc.c index bf3e242ccc5c..eb71872d0361 100644 --- a/drivers/rtc/systohc.c +++ b/drivers/rtc/systohc.c @@ -20,16 +20,16 @@ * * If temporary failure is indicated the caller should try again 'soon' */ -int rtc_set_ntp_time(struct timespec now) +int rtc_set_ntp_time(struct timespec64 now) { struct rtc_device *rtc; struct rtc_time tm; int err = -ENODEV; if (now.tv_nsec < (NSEC_PER_SEC >> 1)) - rtc_time_to_tm(now.tv_sec, &tm); + rtc_time64_to_tm(now.tv_sec, &tm); else - rtc_time_to_tm(now.tv_sec + 1, &tm); + rtc_time64_to_tm(now.tv_sec + 1, &tm); rtc = rtc_class_open(CONFIG_RTC_HCTOSYS_DEVICE); if (rtc) { diff --git a/include/linux/rtc.h b/include/linux/rtc.h index 6d6be09a2fe5..dcad7ee0d746 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -161,7 +161,7 @@ extern void devm_rtc_device_unregister(struct device *dev, extern int rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm); extern int rtc_set_time(struct rtc_device *rtc, struct rtc_time *tm); extern int rtc_set_mmss(struct rtc_device *rtc, unsigned long secs); -extern int rtc_set_ntp_time(struct timespec now); +extern int rtc_set_ntp_time(struct timespec64 now); int __rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm); extern int rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alrm); diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 87a346fd6d61..183dfe2191c6 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -488,13 +488,13 @@ static void sync_cmos_clock(struct work_struct *work) getnstimeofday64(&now); if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) { - struct timespec adjust = timespec64_to_timespec(now); + struct timespec64 adjust = now; fail = -ENODEV; if (persistent_clock_is_local) adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); #ifdef CONFIG_GENERIC_CMOS_UPDATE - fail = update_persistent_clock(adjust); + fail = update_persistent_clock(timespec64_to_timespec(adjust)); #endif #ifdef CONFIG_RTC_SYSTOHC if (fail == -ENODEV) -- cgit v1.2.3 From 425c1d4e5b6d4bd700eb94ad8318bdb05431fdc7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 24 Jan 2015 14:57:53 -0500 Subject: NFSv4: Fix lock on-wire reordering issues This patch ensures that the server cannot reorder our LOCK/LOCKU requests if they are sent in parallel on the wire. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 29 ++++++++++++++++++++++++----- fs/nfs/nfs4xdr.c | 6 +++--- include/linux/nfs_xdr.h | 6 +++--- 3 files changed, 30 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index f12ded041a42..41e7c2fc046e 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5393,7 +5393,6 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, p->arg.fl = &p->fl; p->arg.seqid = seqid; p->res.seqid = seqid; - p->arg.stateid = &lsp->ls_stateid; p->lsp = lsp; atomic_inc(&lsp->ls_count); /* Ensure we don't close file until we're done freeing locks! 
*/ @@ -5428,6 +5427,9 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) case -NFS4ERR_OLD_STATEID: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: + if (!nfs4_stateid_match(&calldata->arg.stateid, + &calldata->lsp->ls_stateid)) + rpc_restart_call_prepare(task); break; default: if (nfs4_async_handle_error(task, calldata->server, @@ -5443,6 +5445,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) goto out_wait; + nfs4_stateid_copy(&calldata->arg.stateid, &calldata->lsp->ls_stateid); if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) { /* Note: exit _without_ running nfs4_locku_done */ goto out_no_action; @@ -5584,7 +5587,6 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid, gfp_mask); if (IS_ERR(p->arg.lock_seqid)) goto out_free_seqid; - p->arg.lock_stateid = &lsp->ls_stateid; p->arg.lock_owner.clientid = server->nfs_client->cl_clientid; p->arg.lock_owner.id = lsp->ls_seqid.owner_id; p->arg.lock_owner.s_dev = server->s_dev; @@ -5615,11 +5617,15 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) { goto out_release_lock_seqid; } - data->arg.open_stateid = &state->open_stateid; + nfs4_stateid_copy(&data->arg.open_stateid, + &state->open_stateid); data->arg.new_lock_owner = 1; data->res.open_seqid = data->arg.open_seqid; - } else + } else { data->arg.new_lock_owner = 0; + nfs4_stateid_copy(&data->arg.lock_stateid, + &data->lsp->ls_stateid); + } if (!nfs4_valid_open_stateid(state)) { data->rpc_status = -EBADF; task->tk_action = NULL; @@ -5651,7 +5657,8 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) return; data->rpc_status = task->tk_status; - if (task->tk_status == 0) { + switch (task->tk_status) { + case 0: renew_lease(NFS_SERVER(data->ctx->dentry->d_inode), data->timestamp); if (data->arg.new_lock_owner != 0) { @@ -5660,6 +5667,18 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) set_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags); } else if (!nfs4_update_lock_stateid(lsp, &data->res.stateid)) rpc_restart_call_prepare(task); + break; + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_OLD_STATEID: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_EXPIRED: + if (data->arg.new_lock_owner != 0) { + if (!nfs4_stateid_match(&data->arg.open_stateid, + &lsp->ls_state->open_stateid)) + rpc_restart_call_prepare(task); + } else if (!nfs4_stateid_match(&data->arg.lock_stateid, + &lsp->ls_stateid)) + rpc_restart_call_prepare(task); } dprintk("%s: done, ret = %d!\n", __func__, data->rpc_status); } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index d05fada4929c..e3018e7a316c 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1304,12 +1304,12 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args *p = cpu_to_be32(args->new_lock_owner); if (args->new_lock_owner){ encode_nfs4_seqid(xdr, args->open_seqid); - encode_nfs4_stateid(xdr, args->open_stateid); + encode_nfs4_stateid(xdr, &args->open_stateid); encode_nfs4_seqid(xdr, args->lock_seqid); encode_lockowner(xdr, &args->lock_owner); } else { - encode_nfs4_stateid(xdr, args->lock_stateid); + encode_nfs4_stateid(xdr, &args->lock_stateid); encode_nfs4_seqid(xdr, args->lock_seqid); } } @@ -1333,7 +1333,7 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar encode_op_hdr(xdr, OP_LOCKU, decode_locku_maxsz, 
hdr); encode_uint32(xdr, nfs4_lock_type(args->fl, 0)); encode_nfs4_seqid(xdr, args->seqid); - encode_nfs4_stateid(xdr, args->stateid); + encode_nfs4_stateid(xdr, &args->stateid); p = reserve_space(xdr, 16); p = xdr_encode_hyper(p, args->fl->fl_start); xdr_encode_hyper(p, nfs4_lock_length(args->fl)); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 7e38d641236e..b6a6953c0f09 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -416,9 +416,9 @@ struct nfs_lock_args { struct nfs_fh * fh; struct file_lock * fl; struct nfs_seqid * lock_seqid; - nfs4_stateid * lock_stateid; + nfs4_stateid lock_stateid; struct nfs_seqid * open_seqid; - nfs4_stateid * open_stateid; + nfs4_stateid open_stateid; struct nfs_lowner lock_owner; unsigned char block : 1; unsigned char reclaim : 1; @@ -437,7 +437,7 @@ struct nfs_locku_args { struct nfs_fh * fh; struct file_lock * fl; struct nfs_seqid * seqid; - nfs4_stateid * stateid; + nfs4_stateid stateid; }; struct nfs_locku_res { -- cgit v1.2.3 From c69899a17ca4836230720e65493942d9582a0424 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 24 Jan 2015 16:03:52 -0500 Subject: NFSv4: Update of VFS byte range lock must be atomic with the stateid update Ensure that we test the lock stateid remained unchanged while we were updating the VFS tracking of the byte range lock. Have the process replay the lock to the server if we detect that was not the case. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 37 +++++++++++++++---------------------- include/linux/nfs_xdr.h | 1 + 2 files changed, 16 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 41e7c2fc046e..9f6baf98942c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5420,9 +5420,10 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) switch (task->tk_status) { case 0: renew_lease(calldata->server, calldata->timestamp); - nfs4_update_lock_stateid(calldata->lsp, - &calldata->res.stateid); - break; + do_vfs_lock(calldata->fl.fl_file, &calldata->fl); + if (nfs4_update_lock_stateid(calldata->lsp, + &calldata->res.stateid)) + break; case -NFS4ERR_BAD_STATEID: case -NFS4ERR_OLD_STATEID: case -NFS4ERR_STALE_STATEID: @@ -5661,6 +5662,13 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) case 0: renew_lease(NFS_SERVER(data->ctx->dentry->d_inode), data->timestamp); + if (data->arg.new_lock) { + data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS); + if (do_vfs_lock(data->fl.fl_file, &data->fl) < 0) { + rpc_restart_call_prepare(task); + break; + } + } if (data->arg.new_lock_owner != 0) { nfs_confirm_seqid(&lsp->ls_seqid, 0); nfs4_stateid_copy(&lsp->ls_stateid, &data->res.stateid); @@ -5760,7 +5768,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f if (recovery_type == NFS_LOCK_RECLAIM) data->arg.reclaim = NFS_LOCK_RECLAIM; nfs4_set_sequence_privileged(&data->arg.seq_args); - } + } else + data->arg.new_lock = 1; task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); @@ -5884,10 +5893,8 @@ static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *reques static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) { - struct nfs4_state_owner *sp = state->owner; struct nfs_inode *nfsi = NFS_I(state->inode); unsigned char fl_flags = request->fl_flags; - unsigned int seq; int status = -ENOLCK; if ((fl_flags & FL_POSIX) && @@ -5907,25 +5914,11 @@ static int _nfs4_proc_setlk(struct nfs4_state 
*state, int cmd, struct file_lock /* ...but avoid races with delegation recall... */ request->fl_flags = fl_flags & ~FL_SLEEP; status = do_vfs_lock(request->fl_file, request); - goto out_unlock; - } - seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); - up_read(&nfsi->rwsem); - status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW); - if (status != 0) + up_read(&nfsi->rwsem); goto out; - down_read(&nfsi->rwsem); - if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) { - status = -NFS4ERR_DELAY; - goto out_unlock; } - /* Note: we always want to sleep here! */ - request->fl_flags = fl_flags | FL_SLEEP; - if (do_vfs_lock(request->fl_file, request) < 0) - printk(KERN_WARNING "NFS: %s: VFS is out of sync with lock " - "manager!\n", __func__); -out_unlock: up_read(&nfsi->rwsem); + status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW); out: request->fl_flags = fl_flags; return status; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index b6a6953c0f09..e5c3b620a609 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -422,6 +422,7 @@ struct nfs_lock_args { struct nfs_lowner lock_owner; unsigned char block : 1; unsigned char reclaim : 1; + unsigned char new_lock : 1; unsigned char new_lock_owner : 1; }; -- cgit v1.2.3 From 4e88f3de89fbb7b5a5a0aca20376b276d26732ac Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 21 Jan 2015 17:13:00 +0100 Subject: clk: Introduce clk_has_parent() This new function is similar to clk_set_parent(), except that it doesn't actually change the parent. It merely checks that the given parent clock can be a parent for the given clock. A situation where this is useful is to check that a particular setup is valid before switching to it. One specific use-case for this is atomic modesetting in the DRM framework where setting a mode is divided into a check phase where a given configuration is validated before applying changes to the hardware. Cc: Russell King Signed-off-by: Thierry Reding Reviewed-by: Stephen Boyd Signed-off-by: Michael Turquette --- drivers/clk/clk.c | 30 ++++++++++++++++++++++++++++++ include/linux/clk.h | 17 +++++++++++++++++ 2 files changed, 47 insertions(+) (limited to 'include/linux') diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index f4963b7d4e17..5272ad71929f 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -1651,6 +1651,36 @@ void __clk_reparent(struct clk *clk, struct clk *new_parent) __clk_recalc_rates(clk, POST_RATE_CHANGE); } +/** + * clk_has_parent - check if a clock is a possible parent for another + * @clk: clock source + * @parent: parent clock source + * + * This function can be used in drivers that need to check that a clock can be + * the parent of another without actually changing the parent. + * + * Returns true if @parent is a possible parent for @clk, false otherwise. + */ +bool clk_has_parent(struct clk *clk, struct clk *parent) +{ + unsigned int i; + + /* NULL clocks should be nops, so return success if either is NULL. */ + if (!clk || !parent) + return true; + + /* Optimize for the case where the parent is already the parent. 
*/ + if (clk->parent == parent) + return true; + + for (i = 0; i < clk->num_parents; i++) + if (strcmp(clk->parent_names[i], parent->name) == 0) + return true; + + return false; +} +EXPORT_SYMBOL_GPL(clk_has_parent); + /** * clk_set_parent - switch the parent of a mux clk * @clk: the mux clk whose input we are switching diff --git a/include/linux/clk.h b/include/linux/clk.h index c7f258a81761..ba7e9eda4347 100644 --- a/include/linux/clk.h +++ b/include/linux/clk.h @@ -301,6 +301,18 @@ long clk_round_rate(struct clk *clk, unsigned long rate); */ int clk_set_rate(struct clk *clk, unsigned long rate); +/** + * clk_has_parent - check if a clock is a possible parent for another + * @clk: clock source + * @parent: parent clock source + * + * This function can be used in drivers that need to check that a clock can be + * the parent of another without actually changing the parent. + * + * Returns true if @parent is a possible parent for @clk, false otherwise. + */ +bool clk_has_parent(struct clk *clk, struct clk *parent); + /** * clk_set_parent - set the parent clock source for this clock * @clk: clock source @@ -374,6 +386,11 @@ static inline long clk_round_rate(struct clk *clk, unsigned long rate) { return 0; } +static inline bool clk_has_parent(struct clk *clk, struct clk *parent) +{ + return true; +} + static inline int clk_set_parent(struct clk *clk, struct clk *parent) { return 0; -- cgit v1.2.3 From c99e76c55f68eaa0c307ba25803c4e59c2fca1ca Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Mon, 12 Jan 2015 16:05:52 +0100 Subject: USB: host: Introduce flag to enable use of 64-bit dma_mask for ehci-platform The ehci-octeon driver used a 64-bit dma_mask. With the removal of ehci-octeon and the switch to ehci-platform, the ehci dma_mask is now limited to 32 bits (coerced in ehci_platform_probe). Provide a flag in the ehci platform data to allow use of a 64-bit dma_mask.
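A hedged example of what opting in looks like for a board file (hypothetical platform; the Octeon hunk below does exactly this, minus the hand-rolled dma_mask it used to carry):

#include <linux/usb/ehci_pdriver.h>

/* Platform data for a controller whose DMA engine addresses 64 bits;
 * ehci_platform_probe() will then coerce a 64-bit DMA mask instead of
 * the default 32-bit one. */
static struct usb_ehci_pdata example_ehci_pdata = {
	.big_endian_mmio = 1,	/* as on Octeon */
	.dma_mask_64	 = 1,
};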
Cc: David Daney Cc: Alex Smith Signed-off-by: Andreas Herrmann Tested-by: Aaro Koskinen Acked-by: Alan Stern Signed-off-by: Greg Kroah-Hartman --- arch/mips/cavium-octeon/octeon-platform.c | 4 +--- drivers/usb/host/ehci-platform.c | 3 ++- include/linux/usb/ehci_pdriver.h | 1 + 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/arch/mips/cavium-octeon/octeon-platform.c b/arch/mips/cavium-octeon/octeon-platform.c index eea60b6c927b..12410a2788d8 100644 --- a/arch/mips/cavium-octeon/octeon-platform.c +++ b/arch/mips/cavium-octeon/octeon-platform.c @@ -310,6 +310,7 @@ static struct usb_ehci_pdata octeon_ehci_pdata = { #ifdef __BIG_ENDIAN .big_endian_mmio = 1, #endif + .dma_mask_64 = 1, .power_on = octeon_ehci_power_on, .power_off = octeon_ehci_power_off, }; @@ -331,8 +332,6 @@ static void __init octeon_ehci_hw_start(struct device *dev) octeon2_usb_clocks_stop(); } -static u64 octeon_ehci_dma_mask = DMA_BIT_MASK(64); - static int __init octeon_ehci_device_init(void) { struct platform_device *pd; @@ -347,7 +346,6 @@ static int __init octeon_ehci_device_init(void) if (!pd) return 0; - pd->dev.dma_mask = &octeon_ehci_dma_mask; pd->dev.platform_data = &octeon_ehci_pdata; octeon_ehci_hw_start(&pd->dev); diff --git a/drivers/usb/host/ehci-platform.c b/drivers/usb/host/ehci-platform.c index 28aae64a8bee..63f2622926c4 100644 --- a/drivers/usb/host/ehci-platform.c +++ b/drivers/usb/host/ehci-platform.c @@ -155,7 +155,8 @@ static int ehci_platform_probe(struct platform_device *dev) if (!pdata) pdata = &ehci_platform_defaults; - err = dma_coerce_mask_and_coherent(&dev->dev, DMA_BIT_MASK(32)); + err = dma_coerce_mask_and_coherent(&dev->dev, + pdata->dma_mask_64 ? DMA_BIT_MASK(64) : DMA_BIT_MASK(32)); if (err) return err; diff --git a/include/linux/usb/ehci_pdriver.h b/include/linux/usb/ehci_pdriver.h index 6287b398abd9..db0431b39a63 100644 --- a/include/linux/usb/ehci_pdriver.h +++ b/include/linux/usb/ehci_pdriver.h @@ -48,6 +48,7 @@ struct usb_ehci_pdata { unsigned big_endian_mmio:1; unsigned no_io_watchdog:1; unsigned reset_on_resume:1; + unsigned dma_mask_64:1; /* Turn on all power and clocks */ int (*power_on)(struct platform_device *pdev); -- cgit v1.2.3 From 9636c37843c9355c1ab6adcd8491186cbdc3b950 Mon Sep 17 00:00:00 2001 From: Chris Rorvick Date: Wed, 14 Jan 2015 21:52:28 -0600 Subject: usb: Fix typo in `struct usb_host_interface' comment The descriptor member `bNumEndpoints' is plural. Signed-off-by: Chris Rorvick Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index f89c24a03bd9..4add5661080a 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -82,7 +82,7 @@ struct usb_host_interface { int extralen; unsigned char *extra; /* Extra descriptors */ - /* array of desc.bNumEndpoint endpoints associated with this + /* array of desc.bNumEndpoints endpoints associated with this * interface setting. these will be in no particular order. */ struct usb_host_endpoint *endpoint; -- cgit v1.2.3 From 524134d422316a59d5464ccbc12036bbe90c5563 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Wed, 21 Jan 2015 14:02:43 -0500 Subject: USB: don't cancel queued resets when unbinding drivers The USB stack provides a mechanism for drivers to request an asynchronous device reset (usb_queue_reset_device()). 
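To recap the call-site shape this commit is concerned with, here is a hypothetical driver error path, sketched, where performing the reset synchronously would deadlock on locks the driver already holds:

#include <linux/usb.h>

/* Hypothetical error handler: we may hold locks that a synchronous
 * usb_reset_device() would need, so queue the reset and return. */
static void example_fault_handler(struct usb_interface *intf)
{
	dev_warn(&intf->dev, "device wedged, queueing reset\n");
	usb_queue_reset_device(intf);	/* asynchronous, returns at once */
}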
The mechanism uses a work item (reset_ws) embedded in the usb_interface structure used by the driver, and the reset is carried out by a work queue routine. The asynchronous reset can race with driver unbinding. When this happens, we try to cancel the queued reset before unbinding the driver, on the theory that the driver won't care about any resets once it is unbound. However, thanks to the fact that lockdep now tracks work queue accesses, this can provoke a lockdep warning in situations where the device reset causes another interface's driver to be unbound; see http://marc.info/?l=linux-usb&m=141893165203776&w=2 for an example. The reason is that the work routine for reset_ws in one interface calls cancel_queued_work() for the reset_ws in another interface. Lockdep thinks this might lead to a work routine trying to cancel itself. The simplest solution is not to cancel queued resets when unbinding drivers. This means we now need to acquire a reference to the usb_interface when queuing a reset_ws work item and to drop the reference when the work routine finishes. We also need to make sure that the usb_interface structure doesn't outlive its parent usb_device; this means acquiring and dropping a reference when the interface is created and destroyed. In addition, cancelling a queued reset can fail (if the device is in the middle of an earlier reset), and this can cause usb_reset_device() to try to rebind an interface that has been deallocated (see http://marc.info/?l=linux-usb&m=142175717016628&w=2 for details). Acquiring the extra references prevents this failure. Signed-off-by: Alan Stern Reported-by: Russell King - ARM Linux Reported-by: Olivier Sobrie Tested-by: Olivier Sobrie Cc: stable # 3.19 Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/driver.c | 17 ----------------- drivers/usb/core/hub.c | 25 +++++++++---------------- drivers/usb/core/message.c | 23 +++-------------------- include/linux/usb.h | 5 ----- 4 files changed, 12 insertions(+), 58 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c index 874dec31a111..c76ec9758ce3 100644 --- a/drivers/usb/core/driver.c +++ b/drivers/usb/core/driver.c @@ -275,21 +275,6 @@ static int usb_unbind_device(struct device *dev) return 0; } -/* - * Cancel any pending scheduled resets - * - * [see usb_queue_reset_device()] - * - * Called after unconfiguring / when releasing interfaces. See - * comments in __usb_queue_reset_device() regarding - * udev->reset_running. - */ -static void usb_cancel_queued_reset(struct usb_interface *iface) -{ - if (iface->reset_running == 0) - cancel_work_sync(&iface->reset_ws); -} - /* called from driver core with dev locked */ static int usb_probe_interface(struct device *dev) { @@ -380,7 +365,6 @@ static int usb_probe_interface(struct device *dev) usb_set_intfdata(intf, NULL); intf->needs_remote_wakeup = 0; intf->condition = USB_INTERFACE_UNBOUND; - usb_cancel_queued_reset(intf); /* If the LPM disable succeeded, balance the ref counts. 
*/ if (!lpm_disable_error) @@ -425,7 +409,6 @@ static int usb_unbind_interface(struct device *dev) usb_disable_interface(udev, intf, false); driver->disconnect(intf); - usb_cancel_queued_reset(intf); /* Free streams */ for (i = 0, j = 0; i < intf->cur_altsetting->desc.bNumEndpoints; i++) { diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index aeb50bb6ba9c..b4bfa3ac4b12 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -5589,26 +5589,19 @@ EXPORT_SYMBOL_GPL(usb_reset_device); * possible; depending on how the driver attached to each interface * handles ->pre_reset(), the second reset might happen or not. * - * - If a driver is unbound and it had a pending reset, the reset will - * be cancelled. + * - If the reset is delayed so long that the interface is unbound from + * its driver, the reset will be skipped. * - * - This function can be called during .probe() or .disconnect() - * times. On return from .disconnect(), any pending resets will be - * cancelled. - * - * There is no no need to lock/unlock the @reset_ws as schedule_work() - * does its own. - * - * NOTE: We don't do any reference count tracking because it is not - * needed. The lifecycle of the work_struct is tied to the - * usb_interface. Before destroying the interface we cancel the - * work_struct, so the fact that work_struct is queued and or - * running means the interface (and thus, the device) exist and - * are referenced. + * - This function can be called during .probe(). It can also be called + * during .disconnect(), but doing so is pointless because the reset + * will not occur. If you really want to reset the device during + * .disconnect(), call usb_reset_device() directly -- but watch out + * for nested unbinding issues! */ void usb_queue_reset_device(struct usb_interface *iface) { - schedule_work(&iface->reset_ws); + if (schedule_work(&iface->reset_ws)) + usb_get_intf(iface); } EXPORT_SYMBOL_GPL(usb_queue_reset_device); diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c index f7b7713cfb2a..f368d2053da5 100644 --- a/drivers/usb/core/message.c +++ b/drivers/usb/core/message.c @@ -1551,6 +1551,7 @@ static void usb_release_interface(struct device *dev) altsetting_to_usb_interface_cache(intf->altsetting); kref_put(&intfc->ref, usb_release_interface_cache); + usb_put_dev(interface_to_usbdev(intf)); kfree(intf); } @@ -1626,24 +1627,6 @@ static struct usb_interface_assoc_descriptor *find_iad(struct usb_device *dev, /* * Internal function to queue a device reset - * - * This is initialized into the workstruct in 'struct - * usb_device->reset_ws' that is launched by - * message.c:usb_set_configuration() when initializing each 'struct - * usb_interface'. - * - * It is safe to get the USB device without reference counts because - * the life cycle of @iface is bound to the life cycle of @udev. Then, - * this function will be ran only if @iface is alive (and before - * freeing it any scheduled instances of it will have been cancelled). - * - * We need to set a flag (usb_dev->reset_running) because when we call - * the reset, the interfaces might be unbound. The current interface - * cannot try to remove the queued work as it would cause a deadlock - * (you cannot remove your work from within your executing - * workqueue). This flag lets it know, so that - * usb_cancel_queued_reset() doesn't try to do it. 
- * * See usb_queue_reset_device() for more details */ static void __usb_queue_reset_device(struct work_struct *ws) @@ -1655,11 +1638,10 @@ static void __usb_queue_reset_device(struct work_struct *ws) rc = usb_lock_device_for_reset(udev, iface); if (rc >= 0) { - iface->reset_running = 1; usb_reset_device(udev); - iface->reset_running = 0; usb_unlock_device(udev); } + usb_put_intf(iface); /* Undo _get_ in usb_queue_reset_device() */ } @@ -1854,6 +1836,7 @@ free_interfaces: dev_set_name(&intf->dev, "%d-%s:%d.%d", dev->bus->busnum, dev->devpath, configuration, alt->desc.bInterfaceNumber); + usb_get_dev(dev); } kfree(new_interfaces); diff --git a/include/linux/usb.h b/include/linux/usb.h index 4add5661080a..7ee1b5c3b4cb 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -127,10 +127,6 @@ enum usb_interface_condition { * to the sysfs representation for that device. * @pm_usage_cnt: PM usage counter for this interface * @reset_ws: Used for scheduling resets from atomic context. - * @reset_running: set to 1 if the interface is currently running a - * queued reset so that usb_cancel_queued_reset() doesn't try to - * remove from the workqueue when running inside the worker - * thread. See __usb_queue_reset_device(). * @resetting_device: USB core reset the device, so use alt setting 0 as * current; needs bandwidth alloc after reset. * @@ -181,7 +177,6 @@ struct usb_interface { unsigned needs_remote_wakeup:1; /* driver requires remote wakeup */ unsigned needs_altsetting0:1; /* switch to altsetting 0 is pending */ unsigned needs_binding:1; /* needs delayed unbind/rebind */ - unsigned reset_running:1; unsigned resetting_device:1; /* true: bandwidth alloc after reset */ struct device dev; /* interface specific device info */ -- cgit v1.2.3 From 79af73079d753b2d04e46f7445716d3b5f914dbd Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Wed, 21 Jan 2015 10:54:10 -0500 Subject: Add security hooks to binder and implement the hooks for SELinux. Add security hooks to the binder and implement the hooks for SELinux. The security hooks enable security modules such as SELinux to implement controls over binder IPC. The security hooks include support for controlling what process can become the binder context manager (binder_set_context_mgr), controlling the ability of a process to invoke a binder transaction/IPC to another process (binder_transaction), controlling the ability of a process to transfer a binder reference to another process (binder_transfer_binder), and controlling the ability of a process to transfer an open file to another process (binder_transfer_file). These hooks have been included in the Android kernel trees since Android 4.3. (Updated to reflect upstream relocation and changes to the binder driver, changes to the LSM audit data structures, coding style cleanups, and to add inline documentation for the hooks). 
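For a security-module author, wiring the new hooks up is just a matter of filling in the new security_operations members; a minimal sketch (the CAP_SYS_ADMIN policy here is an invented placeholder, not what SELinux does):

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/security.h>

/* Hypothetical policy: only a task with CAP_SYS_ADMIN may become the
 * binder context manager. Hooks left unset fall back to the
 * default-allow capability stubs via security_fixup_ops(). */
static int example_binder_set_context_mgr(struct task_struct *mgr)
{
	return has_capability(mgr, CAP_SYS_ADMIN) ? 0 : -EPERM;
}

static struct security_operations example_ops = {
	.name			 = "example",
	.binder_set_context_mgr = example_binder_set_context_mgr,
};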
Signed-off-by: Stephen Smalley Acked-by: Nick Kralevich Acked-by: Jeffrey Vander Stoep Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder.c | 26 +++++++++++++ include/linux/security.h | 58 +++++++++++++++++++++++++++++ security/capability.c | 27 ++++++++++++++ security/security.c | 23 ++++++++++++ security/selinux/hooks.c | 73 +++++++++++++++++++++++++++++++++++++ security/selinux/include/classmap.h | 2 + 6 files changed, 209 insertions(+) (limited to 'include/linux') diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 8c43521d3f11..33b09b6568a4 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -37,6 +37,7 @@ #include #include #include +#include #ifdef CONFIG_ANDROID_BINDER_IPC_32BIT #define BINDER_IPC_32BIT 1 @@ -1400,6 +1401,11 @@ static void binder_transaction(struct binder_proc *proc, return_error = BR_DEAD_REPLY; goto err_dead_binder; } + if (security_binder_transaction(proc->tsk, + target_proc->tsk) < 0) { + return_error = BR_FAILED_REPLY; + goto err_invalid_target_handle; + } if (!(tr->flags & TF_ONE_WAY) && thread->transaction_stack) { struct binder_transaction *tmp; @@ -1551,6 +1557,11 @@ static void binder_transaction(struct binder_proc *proc, return_error = BR_FAILED_REPLY; goto err_binder_get_ref_for_node_failed; } + if (security_binder_transfer_binder(proc->tsk, + target_proc->tsk)) { + return_error = BR_FAILED_REPLY; + goto err_binder_get_ref_for_node_failed; + } ref = binder_get_ref_for_node(target_proc, node); if (ref == NULL) { return_error = BR_FAILED_REPLY; @@ -1581,6 +1592,11 @@ static void binder_transaction(struct binder_proc *proc, return_error = BR_FAILED_REPLY; goto err_binder_get_ref_failed; } + if (security_binder_transfer_binder(proc->tsk, + target_proc->tsk)) { + return_error = BR_FAILED_REPLY; + goto err_binder_get_ref_failed; + } if (ref->node->proc == target_proc) { if (fp->type == BINDER_TYPE_HANDLE) fp->type = BINDER_TYPE_BINDER; @@ -1638,6 +1654,13 @@ static void binder_transaction(struct binder_proc *proc, return_error = BR_FAILED_REPLY; goto err_fget_failed; } + if (security_binder_transfer_file(proc->tsk, + target_proc->tsk, + file) < 0) { + fput(file); + return_error = BR_FAILED_REPLY; + goto err_get_unused_fd_failed; + } target_fd = task_get_unused_fd_flags(target_proc, O_CLOEXEC); if (target_fd < 0) { fput(file); @@ -2675,6 +2698,9 @@ static int binder_ioctl_set_ctx_mgr(struct file *filp) ret = -EBUSY; goto out; } + ret = security_binder_set_context_mgr(proc->tsk); + if (ret < 0) + goto out; if (uid_valid(binder_context_mgr_uid)) { if (!uid_eq(binder_context_mgr_uid, curr_euid)) { pr_err("BINDER_SET_CONTEXT_MGR bad uid %d != %d\n", diff --git a/include/linux/security.h b/include/linux/security.h index ba96471c11ba..a1b7dbd127ff 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1281,6 +1281,25 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @alter contains the flag indicating whether changes are to be made. * Return 0 if permission is granted. * + * @binder_set_context_mgr + * Check whether @mgr is allowed to be the binder context manager. + * @mgr contains the task_struct for the task being registered. + * Return 0 if permission is granted. + * @binder_transaction + * Check whether @from is allowed to invoke a binder transaction call + * to @to. + * @from contains the task_struct for the sending task. + * @to contains the task_struct for the receiving task. 
+ * @binder_transfer_binder + * Check whether @from is allowed to transfer a binder reference to @to. + * @from contains the task_struct for the sending task. + * @to contains the task_struct for the receiving task. + * @binder_transfer_file + * Check whether @from is allowed to transfer @file to @to. + * @from contains the task_struct for the sending task. + * @file contains the struct file being transferred. + * @to contains the task_struct for the receiving task. + * * @ptrace_access_check: * Check permission before allowing the current process to trace the * @child process. @@ -1441,6 +1460,14 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) struct security_operations { char name[SECURITY_NAME_MAX + 1]; + int (*binder_set_context_mgr) (struct task_struct *mgr); + int (*binder_transaction) (struct task_struct *from, + struct task_struct *to); + int (*binder_transfer_binder) (struct task_struct *from, + struct task_struct *to); + int (*binder_transfer_file) (struct task_struct *from, + struct task_struct *to, struct file *file); + int (*ptrace_access_check) (struct task_struct *child, unsigned int mode); int (*ptrace_traceme) (struct task_struct *parent); int (*capget) (struct task_struct *target, @@ -1739,6 +1766,13 @@ extern void __init security_fixup_ops(struct security_operations *ops); /* Security operations */ +int security_binder_set_context_mgr(struct task_struct *mgr); +int security_binder_transaction(struct task_struct *from, + struct task_struct *to); +int security_binder_transfer_binder(struct task_struct *from, + struct task_struct *to); +int security_binder_transfer_file(struct task_struct *from, + struct task_struct *to, struct file *file); int security_ptrace_access_check(struct task_struct *child, unsigned int mode); int security_ptrace_traceme(struct task_struct *parent); int security_capget(struct task_struct *target, @@ -1927,6 +1961,30 @@ static inline int security_init(void) return 0; } +static inline int security_binder_set_context_mgr(struct task_struct *mgr) +{ + return 0; +} + +static inline int security_binder_transaction(struct task_struct *from, + struct task_struct *to) +{ + return 0; +} + +static inline int security_binder_transfer_binder(struct task_struct *from, + struct task_struct *to) +{ + return 0; +} + +static inline int security_binder_transfer_file(struct task_struct *from, + struct task_struct *to, + struct file *file) +{ + return 0; +} + static inline int security_ptrace_access_check(struct task_struct *child, unsigned int mode) { diff --git a/security/capability.c b/security/capability.c index d68c57a62bcf..070dd46f62f4 100644 --- a/security/capability.c +++ b/security/capability.c @@ -12,6 +12,29 @@ #include +static int cap_binder_set_context_mgr(struct task_struct *mgr) +{ + return 0; +} + +static int cap_binder_transaction(struct task_struct *from, + struct task_struct *to) +{ + return 0; +} + +static int cap_binder_transfer_binder(struct task_struct *from, + struct task_struct *to) +{ + return 0; +} + +static int cap_binder_transfer_file(struct task_struct *from, + struct task_struct *to, struct file *file) +{ + return 0; +} + static int cap_syslog(int type) { return 0; @@ -930,6 +953,10 @@ static void cap_audit_rule_free(void *lsmrule) void __init security_fixup_ops(struct security_operations *ops) { + set_to_cap_if_null(ops, binder_set_context_mgr); + set_to_cap_if_null(ops, binder_transaction); + set_to_cap_if_null(ops, binder_transfer_binder); + set_to_cap_if_null(ops, binder_transfer_file); 
set_to_cap_if_null(ops, ptrace_access_check); set_to_cap_if_null(ops, ptrace_traceme); set_to_cap_if_null(ops, capget); diff --git a/security/security.c b/security/security.c index 18b35c63fc0c..b196de34b19f 100644 --- a/security/security.c +++ b/security/security.c @@ -135,6 +135,29 @@ int __init register_security(struct security_operations *ops) /* Security operations */ +int security_binder_set_context_mgr(struct task_struct *mgr) +{ + return security_ops->binder_set_context_mgr(mgr); +} + +int security_binder_transaction(struct task_struct *from, + struct task_struct *to) +{ + return security_ops->binder_transaction(from, to); +} + +int security_binder_transfer_binder(struct task_struct *from, + struct task_struct *to) +{ + return security_ops->binder_transfer_binder(from, to); +} + +int security_binder_transfer_file(struct task_struct *from, + struct task_struct *to, struct file *file) +{ + return security_ops->binder_transfer_file(from, to, file); +} + int security_ptrace_access_check(struct task_struct *child, unsigned int mode) { #ifdef CONFIG_SECURITY_YAMA_STACKED diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 6da7532893a1..9d984bfb978b 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1933,6 +1933,74 @@ static inline u32 open_file_to_av(struct file *file) /* Hook functions begin here. */ +static int selinux_binder_set_context_mgr(struct task_struct *mgr) +{ + u32 mysid = current_sid(); + u32 mgrsid = task_sid(mgr); + + return avc_has_perm(mysid, mgrsid, SECCLASS_BINDER, + BINDER__SET_CONTEXT_MGR, NULL); +} + +static int selinux_binder_transaction(struct task_struct *from, + struct task_struct *to) +{ + u32 mysid = current_sid(); + u32 fromsid = task_sid(from); + u32 tosid = task_sid(to); + int rc; + + if (mysid != fromsid) { + rc = avc_has_perm(mysid, fromsid, SECCLASS_BINDER, + BINDER__IMPERSONATE, NULL); + if (rc) + return rc; + } + + return avc_has_perm(fromsid, tosid, SECCLASS_BINDER, BINDER__CALL, + NULL); +} + +static int selinux_binder_transfer_binder(struct task_struct *from, + struct task_struct *to) +{ + u32 fromsid = task_sid(from); + u32 tosid = task_sid(to); + + return avc_has_perm(fromsid, tosid, SECCLASS_BINDER, BINDER__TRANSFER, + NULL); +} + +static int selinux_binder_transfer_file(struct task_struct *from, + struct task_struct *to, + struct file *file) +{ + u32 sid = task_sid(to); + struct file_security_struct *fsec = file->f_security; + struct inode *inode = file->f_path.dentry->d_inode; + struct inode_security_struct *isec = inode->i_security; + struct common_audit_data ad; + int rc; + + ad.type = LSM_AUDIT_DATA_PATH; + ad.u.path = file->f_path; + + if (sid != fsec->sid) { + rc = avc_has_perm(sid, fsec->sid, + SECCLASS_FD, + FD__USE, + &ad); + if (rc) + return rc; + } + + if (unlikely(IS_PRIVATE(inode))) + return 0; + + return avc_has_perm(sid, isec->sid, isec->sclass, file_to_av(file), + &ad); +} + static int selinux_ptrace_access_check(struct task_struct *child, unsigned int mode) { @@ -5810,6 +5878,11 @@ static int selinux_key_getsecurity(struct key *key, char **_buffer) static struct security_operations selinux_ops = { .name = "selinux", + .binder_set_context_mgr = selinux_binder_set_context_mgr, + .binder_transaction = selinux_binder_transaction, + .binder_transfer_binder = selinux_binder_transfer_binder, + .binder_transfer_file = selinux_binder_transfer_file, + .ptrace_access_check = selinux_ptrace_access_check, .ptrace_traceme = selinux_ptrace_traceme, .capget = selinux_capget, diff --git 
a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index be491a74c1ed..eccd61b3de8a 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -151,5 +151,7 @@ struct security_class_mapping secclass_map[] = { { "kernel_service", { "use_as_override", "create_files_as", NULL } }, { "tun_socket", { COMMON_SOCK_PERMS, "attach_queue", NULL } }, + { "binder", { "impersonate", "call", "set_context_mgr", "transfer", + NULL } }, { NULL } }; -- cgit v1.2.3 From d61031ee8df6214d58371a1cc36a0591e242fba0 Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Fri, 9 Jan 2015 23:54:34 -0800 Subject: Drivers: hv: vmbus: Support a vmbus API for efficiently sending page arrays Currently, the API for sending a multi-page buffer over VMBUS is limited to a maximum pfn array of MAX_MULTIPAGE_BUFFER_COUNT. This limitation is not imposed by the host and unnecessarily limits the maximum payload that can be sent. Implement an API that does not have this restriction. Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman --- drivers/hv/channel.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/hyperv.h | 31 +++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+) (limited to 'include/linux') diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index c76ffbe59f65..18c4f23dacf1 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -683,6 +683,50 @@ int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel, } EXPORT_SYMBOL_GPL(vmbus_sendpacket_pagebuffer); +/* + * vmbus_sendpacket_mpb_desc - Send a multi-page buffer packet + * using a GPADL Direct packet type. + * The buffer includes the vmbus descriptor. + */ +int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel, + struct vmbus_packet_mpb_array *desc, + u32 desc_size, + void *buffer, u32 bufferlen, u64 requestid) +{ + int ret; + u32 packetlen; + u32 packetlen_aligned; + struct kvec bufferlist[3]; + u64 aligned_data = 0; + bool signal = false; + + packetlen = desc_size + bufferlen; + packetlen_aligned = ALIGN(packetlen, sizeof(u64)); + + /* Setup the descriptor */ + desc->type = VM_PKT_DATA_USING_GPA_DIRECT; + desc->flags = VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED; + desc->dataoffset8 = desc_size >> 3; /* in 8-bytes granularity */ + desc->length8 = (u16)(packetlen_aligned >> 3); + desc->transactionid = requestid; + desc->rangecount = 1; + + bufferlist[0].iov_base = desc; + bufferlist[0].iov_len = desc_size; + bufferlist[1].iov_base = buffer; + bufferlist[1].iov_len = bufferlen; + bufferlist[2].iov_base = &aligned_data; + bufferlist[2].iov_len = (packetlen_aligned - packetlen); + + ret = hv_ringbuffer_write(&channel->outbound, bufferlist, 3, &signal); + + if (ret == 0 && signal) + vmbus_setevent(channel); + + return ret; +} +EXPORT_SYMBOL_GPL(vmbus_sendpacket_mpb_desc); + /* * vmbus_sendpacket_multipagebuffer - Send a multi-page buffer packet * using a GPADL Direct packet type. diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 476c685ca6f9..259023a34bec 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -57,6 +57,18 @@ struct hv_multipage_buffer { u64 pfn_array[MAX_MULTIPAGE_BUFFER_COUNT]; }; +/* + * Multiple-page buffer array; the pfn array is variable size: + * The number of entries in the PFN array is determined by + * "len" and "offset".
+ */ +struct hv_mpb_array { + /* Length and offset determine the # of pfns in the array */ + u32 len; + u32 offset; + u64 pfn_array[]; +}; + /* 0x18 includes the proprietary packet header */ #define MAX_PAGE_BUFFER_PACKET (0x18 + \ (sizeof(struct hv_page_buffer) * \ @@ -814,6 +826,18 @@ struct vmbus_channel_packet_multipage_buffer { struct hv_multipage_buffer range; } __packed; +/* The format must be the same as struct vmdata_gpa_direct */ +struct vmbus_packet_mpb_array { + u16 type; + u16 dataoffset8; + u16 length8; + u16 flags; + u64 transactionid; + u32 reserved; + u32 rangecount; /* Always 1 in this case */ + struct hv_mpb_array range; +} __packed; + extern int vmbus_open(struct vmbus_channel *channel, u32 send_ringbuffersize, @@ -845,6 +869,13 @@ extern int vmbus_sendpacket_multipagebuffer(struct vmbus_channel *channel, u32 bufferlen, u64 requestid); +extern int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel, + struct vmbus_packet_mpb_array *mpb, + u32 desc_size, + void *buffer, + u32 bufferlen, + u64 requestid); + extern int vmbus_establish_gpadl(struct vmbus_channel *channel, void *kbuffer, u32 size, -- cgit v1.2.3 From 67fae053bfc6e84144150e4c6c62670abb215c33 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 20 Jan 2015 16:45:05 +0100 Subject: Drivers: hv: rename sc_lock to the more generic lock The sc_lock spinlock in struct vmbus_channel is used to protect more than the sc_list field: e.g., vmbus_open() uses it to implement test-and-set access to the state field. Rename it to the more generic 'lock' and add a description. Signed-off-by: Vitaly Kuznetsov Signed-off-by: K. Y. Srinivasan Acked-by: Jason Wang Signed-off-by: Greg Kroah-Hartman --- drivers/hv/channel.c | 6 +++--- drivers/hv/channel_mgmt.c | 10 +++++----- include/linux/hyperv.h | 7 ++++++- 3 files changed, 14 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index 18c4f23dacf1..2978f5ee8d2a 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -73,14 +73,14 @@ int vmbus_open(struct vmbus_channel *newchannel, u32 send_ringbuffer_size, unsigned long flags; int ret, t, err = 0; - spin_lock_irqsave(&newchannel->sc_lock, flags); + spin_lock_irqsave(&newchannel->lock, flags); if (newchannel->state == CHANNEL_OPEN_STATE) { newchannel->state = CHANNEL_OPENING_STATE; } else { - spin_unlock_irqrestore(&newchannel->sc_lock, flags); + spin_unlock_irqrestore(&newchannel->lock, flags); return -EINVAL; } - spin_unlock_irqrestore(&newchannel->sc_lock, flags); + spin_unlock_irqrestore(&newchannel->lock, flags); newchannel->onchannel_callback = onchannelcallback; newchannel->channel_callback_context = context; diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 704c0e00f8d2..1e0b996ed643 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -146,7 +146,7 @@ static struct vmbus_channel *alloc_channel(void) return NULL; spin_lock_init(&channel->inbound_lock); - spin_lock_init(&channel->sc_lock); + spin_lock_init(&channel->lock); INIT_LIST_HEAD(&channel->sc_list); INIT_LIST_HEAD(&channel->percpu_list); @@ -246,9 +246,9 @@ static void vmbus_process_rescind_offer(struct work_struct *work) spin_unlock_irqrestore(&vmbus_connection.channel_lock, flags); } else { primary_channel = channel->primary_channel; - spin_lock_irqsave(&primary_channel->sc_lock, flags); + spin_lock_irqsave(&primary_channel->lock, flags); list_del(&channel->sc_list); - spin_unlock_irqrestore(&primary_channel->sc_lock, flags); +
spin_unlock_irqrestore(&primary_channel->lock, flags); } free_channel(channel); } @@ -323,9 +323,9 @@ static void vmbus_process_offer(struct work_struct *work) * Process the sub-channel. */ newchannel->primary_channel = channel; - spin_lock_irqsave(&channel->sc_lock, flags); + spin_lock_irqsave(&channel->lock, flags); list_add_tail(&newchannel->sc_list, &channel->sc_list); - spin_unlock_irqrestore(&channel->sc_lock, flags); + spin_unlock_irqrestore(&channel->lock, flags); if (newchannel->target_cpu != get_cpu()) { put_cpu(); diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 259023a34bec..5a2ba674795e 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -734,7 +734,12 @@ struct vmbus_channel { */ void (*sc_creation_callback)(struct vmbus_channel *new_sc); - spinlock_t sc_lock; + /* + * The spinlock to protect the structure. It is used to protect + * test-and-set access to various attributes of the structure as well + * as all sc_list operations. + */ + spinlock_t lock; /* * All Sub-channels of a primary channel are linked here. */ -- cgit v1.2.3 From 77b3da6e3232d3b4d4b8addb4b05799fe98f3bf8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 25 Jan 2015 15:10:32 -0500 Subject: new primitive: debugfs_create_automount() Signed-off-by: Al Viro --- fs/debugfs/inode.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/debugfs.h | 5 +++++ 2 files changed, 53 insertions(+) (limited to 'include/linux') diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 1219dff8e18f..957c40ce09f6 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -175,6 +175,18 @@ static const struct super_operations debugfs_super_operations = { .show_options = debugfs_show_options, }; +static struct vfsmount *debugfs_automount(struct path *path) +{ + struct vfsmount *(*f)(void *); + f = (struct vfsmount *(*)(void *))path->dentry->d_fsdata; + return f(path->dentry->d_inode->i_private); +} + +static const struct dentry_operations debugfs_dops = { + .d_delete = always_delete_dentry, + .d_automount = debugfs_automount, +}; + static int debug_fill_super(struct super_block *sb, void *data, int silent) { static struct tree_descr debug_files[] = {{""}}; @@ -199,6 +211,7 @@ static int debug_fill_super(struct super_block *sb, void *data, int silent) goto fail; sb->s_op = &debugfs_super_operations; + sb->s_d_op = &debugfs_dops; debugfs_apply_options(sb); @@ -367,6 +380,41 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent) } EXPORT_SYMBOL_GPL(debugfs_create_dir); +/** + * debugfs_create_automount - create automount point in the debugfs filesystem + * @name: a pointer to a string containing the name of the file to create. + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is NULL, then the + * file will be created in the root of the debugfs filesystem. + * @f: function to be called when pathname resolution steps on that dentry. + * @data: opaque argument to pass to f(). + * + * @f should return what ->d_automount() would.
+ */ +struct dentry *debugfs_create_automount(const char *name, + struct dentry *parent, + struct vfsmount *(*f)(void *), + void *data) +{ + struct dentry *dentry = start_creating(name, parent); + struct inode *inode; + + if (IS_ERR(dentry)) + return NULL; + + inode = debugfs_get_inode(dentry->d_sb); + if (unlikely(!inode)) + return failed_creating(dentry); + + inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; + inode->i_flags |= S_AUTOMOUNT; + inode->i_private = data; + dentry->d_fsdata = (void *)f; + d_instantiate(dentry, inode); + return end_creating(dentry); +} +EXPORT_SYMBOL(debugfs_create_automount); + /** * debugfs_create_symlink- create a symbolic link in the debugfs filesystem * @name: a pointer to a string containing the name of the symbolic link to diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index da4c4983adbe..ea149a24a1f2 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -56,6 +56,11 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent); struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, const char *dest); +struct dentry *debugfs_create_automount(const char *name, + struct dentry *parent, + struct vfsmount *(*f)(void *), + void *data); + void debugfs_remove(struct dentry *dentry); void debugfs_remove_recursive(struct dentry *dentry); -- cgit v1.2.3 From 872bf2fb69d90e3619befee842fc26db39d8e475 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 25 Jan 2015 16:59:35 +0200 Subject: net/mlx4_core: Maintain a persistent memory for mlx4 device Maintain a persistent memory area that survives the reset flow/PCI error. This is preparation for an upcoming series that adds support for those flows. Signed-off-by: Yishai Hadas Signed-off-by: Or Gerlitz Signed-off-by: David S.
Miller --- drivers/infiniband/hw/mlx4/alias_GUID.c | 2 +- drivers/infiniband/hw/mlx4/mad.c | 3 +- drivers/infiniband/hw/mlx4/main.c | 17 ++-- drivers/infiniband/hw/mlx4/mr.c | 6 +- drivers/infiniband/hw/mlx4/sysfs.c | 6 +- drivers/net/ethernet/mellanox/mlx4/alloc.c | 15 +-- drivers/net/ethernet/mellanox/mlx4/catas.c | 13 +-- drivers/net/ethernet/mellanox/mlx4/cmd.c | 46 +++++---- drivers/net/ethernet/mellanox/mlx4/en_cq.c | 4 +- drivers/net/ethernet/mellanox/mlx4/en_ethtool.c | 2 +- drivers/net/ethernet/mellanox/mlx4/en_main.c | 4 +- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 2 +- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 4 +- drivers/net/ethernet/mellanox/mlx4/en_tx.c | 4 +- drivers/net/ethernet/mellanox/mlx4/eq.c | 42 ++++---- drivers/net/ethernet/mellanox/mlx4/icm.c | 11 ++- drivers/net/ethernet/mellanox/mlx4/main.c | 106 +++++++++++++-------- drivers/net/ethernet/mellanox/mlx4/mlx4.h | 9 +- drivers/net/ethernet/mellanox/mlx4/mr.c | 8 +- drivers/net/ethernet/mellanox/mlx4/pd.c | 6 +- drivers/net/ethernet/mellanox/mlx4/port.c | 17 ++-- drivers/net/ethernet/mellanox/mlx4/reset.c | 23 +++-- .../net/ethernet/mellanox/mlx4/resource_tracker.c | 36 ++++--- include/linux/mlx4/device.h | 11 ++- 24 files changed, 234 insertions(+), 163 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx4/alias_GUID.c b/drivers/infiniband/hw/mlx4/alias_GUID.c index 0eb141c41416..a31e031afd87 100644 --- a/drivers/infiniband/hw/mlx4/alias_GUID.c +++ b/drivers/infiniband/hw/mlx4/alias_GUID.c @@ -154,7 +154,7 @@ void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev, continue; slave_id = (block_num * NUM_ALIAS_GUID_IN_REC) + i ; - if (slave_id >= dev->dev->num_vfs + 1) + if (slave_id >= dev->dev->persist->num_vfs + 1) return; tmp_cur_ag = *(__be64 *)&p_data[i * GUID_REC_SIZE]; form_cache_ag = get_cached_alias_guid(dev, port_num, diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index 82a7dd87089b..c7619716c31d 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -1951,7 +1951,8 @@ static int mlx4_ib_alloc_demux_ctx(struct mlx4_ib_dev *dev, ctx->ib_dev = &dev->ib_dev; for (i = 0; - i < min(dev->dev->caps.sqp_demux, (u16)(dev->dev->num_vfs + 1)); + i < min(dev->dev->caps.sqp_demux, + (u16)(dev->dev->persist->num_vfs + 1)); i++) { struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev->dev, i); diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 57ecc5b204f3..b4fa6f658800 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -198,7 +198,7 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props->vendor_id = be32_to_cpup((__be32 *) (out_mad->data + 36)) & 0xffffff; - props->vendor_part_id = dev->dev->pdev->device; + props->vendor_part_id = dev->dev->persist->pdev->device; props->hw_ver = be32_to_cpup((__be32 *) (out_mad->data + 32)); memcpy(&props->sys_image_guid, out_mad->data + 4, 8); @@ -1375,7 +1375,7 @@ static ssize_t show_hca(struct device *device, struct device_attribute *attr, { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); - return sprintf(buf, "MT%d\n", dev->dev->pdev->device); + return sprintf(buf, "MT%d\n", dev->dev->persist->pdev->device); } static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, @@ -1937,7 +1937,8 @@ static void init_pkeys(struct mlx4_ib_dev *ibdev) int i; if (mlx4_is_master(ibdev->dev)) { - for (slave = 0; slave <= 
ibdev->dev->num_vfs; ++slave) { + for (slave = 0; slave <= ibdev->dev->persist->num_vfs; + ++slave) { for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) { for (i = 0; i < ibdev->dev->phys_caps.pkey_phys_table_len[port]; @@ -1994,7 +1995,7 @@ static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) { for (j = 0; j < eq_per_port; j++) { snprintf(name, sizeof(name), "mlx4-ib-%d-%d@%s", - i, j, dev->pdev->bus->name); + i, j, dev->persist->pdev->bus->name); /* Set IRQ for specific name (per ring) */ if (mlx4_assign_eq(dev, name, NULL, &ibdev->eq_table[eq])) { @@ -2058,7 +2059,8 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev = (struct mlx4_ib_dev *) ib_alloc_device(sizeof *ibdev); if (!ibdev) { - dev_err(&dev->pdev->dev, "Device struct alloc failed\n"); + dev_err(&dev->persist->pdev->dev, + "Device struct alloc failed\n"); return NULL; } @@ -2085,7 +2087,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->num_ports = num_ports; ibdev->ib_dev.phys_port_cnt = ibdev->num_ports; ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors; - ibdev->ib_dev.dma_device = &dev->pdev->dev; + ibdev->ib_dev.dma_device = &dev->persist->pdev->dev; if (dev->caps.userspace_caps) ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION; @@ -2236,7 +2238,8 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) sizeof(long), GFP_KERNEL); if (!ibdev->ib_uc_qpns_bitmap) { - dev_err(&dev->pdev->dev, "bit map alloc failed\n"); + dev_err(&dev->persist->pdev->dev, + "bit map alloc failed\n"); goto err_steer_qp_release; } diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index c36ccbd9a644..e0d271782d0a 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -401,7 +401,8 @@ struct ib_fast_reg_page_list *mlx4_ib_alloc_fast_reg_page_list(struct ib_device if (!mfrpl->ibfrpl.page_list) goto err_free; - mfrpl->mapped_page_list = dma_alloc_coherent(&dev->dev->pdev->dev, + mfrpl->mapped_page_list = dma_alloc_coherent(&dev->dev->persist-> + pdev->dev, size, &mfrpl->map, GFP_KERNEL); if (!mfrpl->mapped_page_list) @@ -423,7 +424,8 @@ void mlx4_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list) struct mlx4_ib_fast_reg_page_list *mfrpl = to_mfrpl(page_list); int size = page_list->max_page_list_len * sizeof (u64); - dma_free_coherent(&dev->dev->pdev->dev, size, mfrpl->mapped_page_list, + dma_free_coherent(&dev->dev->persist->pdev->dev, size, + mfrpl->mapped_page_list, mfrpl->map); kfree(mfrpl->ibfrpl.page_list); kfree(mfrpl); diff --git a/drivers/infiniband/hw/mlx4/sysfs.c b/drivers/infiniband/hw/mlx4/sysfs.c index cb4c66e723b5..d10c2b8a5dad 100644 --- a/drivers/infiniband/hw/mlx4/sysfs.c +++ b/drivers/infiniband/hw/mlx4/sysfs.c @@ -375,7 +375,7 @@ static void get_name(struct mlx4_ib_dev *dev, char *name, int i, int max) char base_name[9]; /* pci_name format is: bus:dev:func -> xxxx:yy:zz.n */ - strlcpy(name, pci_name(dev->dev->pdev), max); + strlcpy(name, pci_name(dev->dev->persist->pdev), max); strncpy(base_name, name, 8); /*till xxxx:yy:*/ base_name[8] = '\0'; /* with no ARI only 3 last bits are used so when the fn is higher than 8 @@ -792,7 +792,7 @@ static int register_pkey_tree(struct mlx4_ib_dev *device) if (!mlx4_is_master(device->dev)) return 0; - for (i = 0; i <= device->dev->num_vfs; ++i) + for (i = 0; i <= device->dev->persist->num_vfs; ++i) register_one_pkey_tree(device, i); return 0; @@ -807,7 +807,7 @@ static void unregister_pkey_tree(struct mlx4_ib_dev 
*device) if (!mlx4_is_master(device->dev)) return; - for (slave = device->dev->num_vfs; slave >= 0; --slave) { + for (slave = device->dev->persist->num_vfs; slave >= 0; --slave) { list_for_each_entry_safe(p, t, &device->pkeys.pkey_port_list[slave], entry) { diff --git a/drivers/net/ethernet/mellanox/mlx4/alloc.c b/drivers/net/ethernet/mellanox/mlx4/alloc.c index 963dd7e6d547..a716c26e0d99 100644 --- a/drivers/net/ethernet/mellanox/mlx4/alloc.c +++ b/drivers/net/ethernet/mellanox/mlx4/alloc.c @@ -592,7 +592,7 @@ int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct, buf->nbufs = 1; buf->npages = 1; buf->page_shift = get_order(size) + PAGE_SHIFT; - buf->direct.buf = dma_alloc_coherent(&dev->pdev->dev, + buf->direct.buf = dma_alloc_coherent(&dev->persist->pdev->dev, size, &t, gfp); if (!buf->direct.buf) return -ENOMEM; @@ -619,7 +619,8 @@ int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct, for (i = 0; i < buf->nbufs; ++i) { buf->page_list[i].buf = - dma_alloc_coherent(&dev->pdev->dev, PAGE_SIZE, + dma_alloc_coherent(&dev->persist->pdev->dev, + PAGE_SIZE, &t, gfp); if (!buf->page_list[i].buf) goto err_free; @@ -657,7 +658,8 @@ void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf) int i; if (buf->nbufs == 1) - dma_free_coherent(&dev->pdev->dev, size, buf->direct.buf, + dma_free_coherent(&dev->persist->pdev->dev, size, + buf->direct.buf, buf->direct.map); else { if (BITS_PER_LONG == 64 && buf->direct.buf) @@ -665,7 +667,8 @@ void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf) for (i = 0; i < buf->nbufs; ++i) if (buf->page_list[i].buf) - dma_free_coherent(&dev->pdev->dev, PAGE_SIZE, + dma_free_coherent(&dev->persist->pdev->dev, + PAGE_SIZE, buf->page_list[i].buf, buf->page_list[i].map); kfree(buf->page_list); @@ -738,7 +741,7 @@ int mlx4_db_alloc(struct mlx4_dev *dev, struct mlx4_db *db, int order, gfp_t gfp if (!mlx4_alloc_db_from_pgdir(pgdir, db, order)) goto out; - pgdir = mlx4_alloc_db_pgdir(&(dev->pdev->dev), gfp); + pgdir = mlx4_alloc_db_pgdir(&dev->persist->pdev->dev, gfp); if (!pgdir) { ret = -ENOMEM; goto out; @@ -775,7 +778,7 @@ void mlx4_db_free(struct mlx4_dev *dev, struct mlx4_db *db) set_bit(i, db->u.pgdir->bits[o]); if (bitmap_full(db->u.pgdir->order1, MLX4_DB_PER_PAGE / 2)) { - dma_free_coherent(&(dev->pdev->dev), PAGE_SIZE, + dma_free_coherent(&dev->persist->pdev->dev, PAGE_SIZE, db->u.pgdir->db_page, db->u.pgdir->db_dma); list_del(&db->u.pgdir->list); kfree(db->u.pgdir); diff --git a/drivers/net/ethernet/mellanox/mlx4/catas.c b/drivers/net/ethernet/mellanox/mlx4/catas.c index 9c656fe4983d..1a102c9bac99 100644 --- a/drivers/net/ethernet/mellanox/mlx4/catas.c +++ b/drivers/net/ethernet/mellanox/mlx4/catas.c @@ -70,7 +70,7 @@ static void poll_catas(unsigned long dev_ptr) if (readl(priv->catas_err.map)) { /* If the device is off-line, we cannot try to recover it */ - if (pci_channel_offline(dev->pdev)) + if (pci_channel_offline(dev->persist->pdev)) mod_timer(&priv->catas_err.timer, round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL)); else { @@ -94,6 +94,7 @@ static void catas_reset(struct work_struct *work) { struct mlx4_priv *priv, *tmppriv; struct mlx4_dev *dev; + struct mlx4_dev_persistent *persist; LIST_HEAD(tlist); int ret; @@ -103,20 +104,20 @@ static void catas_reset(struct work_struct *work) spin_unlock_irq(&catas_lock); list_for_each_entry_safe(priv, tmppriv, &tlist, catas_err.list) { - struct pci_dev *pdev = priv->dev.pdev; + struct pci_dev *pdev = priv->dev.persist->pdev; /* If the device is off-line, we 
cannot reset it */ if (pci_channel_offline(pdev)) continue; - ret = mlx4_restart_one(priv->dev.pdev); + ret = mlx4_restart_one(priv->dev.persist->pdev); /* 'priv' now is not valid */ if (ret) pr_err("mlx4 %s: Reset failed (%d)\n", pci_name(pdev), ret); else { - dev = pci_get_drvdata(pdev); - mlx4_dbg(dev, "Reset succeeded\n"); + persist = pci_get_drvdata(pdev); + mlx4_dbg(persist->dev, "Reset succeeded\n"); } } } @@ -134,7 +135,7 @@ void mlx4_start_catas_poll(struct mlx4_dev *dev) init_timer(&priv->catas_err.timer); priv->catas_err.map = NULL; - addr = pci_resource_start(dev->pdev, priv->fw.catas_bar) + + addr = pci_resource_start(dev->persist->pdev, priv->fw.catas_bar) + priv->fw.catas_offset; priv->catas_err.map = ioremap(addr, priv->fw.catas_size * 4); diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c index 5c93d1451c44..7cd90e6a4272 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -307,7 +307,7 @@ static int cmd_pending(struct mlx4_dev *dev) { u32 status; - if (pci_channel_offline(dev->pdev)) + if (pci_channel_offline(dev->persist->pdev)) return -EIO; status = readl(mlx4_priv(dev)->cmd.hcr + HCR_STATUS_OFFSET); @@ -328,7 +328,7 @@ static int mlx4_cmd_post(struct mlx4_dev *dev, u64 in_param, u64 out_param, mutex_lock(&cmd->hcr_mutex); - if (pci_channel_offline(dev->pdev)) { + if (pci_channel_offline(dev->persist->pdev)) { /* * Device is going through error recovery * and cannot accept commands. @@ -342,7 +342,7 @@ static int mlx4_cmd_post(struct mlx4_dev *dev, u64 in_param, u64 out_param, end += msecs_to_jiffies(GO_BIT_TIMEOUT_MSECS); while (cmd_pending(dev)) { - if (pci_channel_offline(dev->pdev)) { + if (pci_channel_offline(dev->persist->pdev)) { /* * Device is going through error recovery * and cannot accept commands. @@ -464,7 +464,7 @@ static int mlx4_cmd_poll(struct mlx4_dev *dev, u64 in_param, u64 *out_param, down(&priv->cmd.poll_sem); - if (pci_channel_offline(dev->pdev)) { + if (pci_channel_offline(dev->persist->pdev)) { /* * Device is going through error recovery * and cannot accept commands. @@ -487,7 +487,7 @@ static int mlx4_cmd_poll(struct mlx4_dev *dev, u64 in_param, u64 *out_param, end = msecs_to_jiffies(timeout) + jiffies; while (cmd_pending(dev) && time_before(jiffies, end)) { - if (pci_channel_offline(dev->pdev)) { + if (pci_channel_offline(dev->persist->pdev)) { /* * Device is going through error recovery * and cannot accept commands. 
@@ -612,7 +612,7 @@ int __mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param, int out_is_imm, u32 in_modifier, u8 op_modifier, u16 op, unsigned long timeout, int native) { - if (pci_channel_offline(dev->pdev)) + if (pci_channel_offline(dev->persist->pdev)) return -EIO; if (!mlx4_is_mfunc(dev) || (native && mlx4_is_master(dev))) { @@ -1997,11 +1997,12 @@ int mlx4_multi_func_init(struct mlx4_dev *dev) if (mlx4_is_master(dev)) priv->mfunc.comm = - ioremap(pci_resource_start(dev->pdev, priv->fw.comm_bar) + + ioremap(pci_resource_start(dev->persist->pdev, + priv->fw.comm_bar) + priv->fw.comm_base, MLX4_COMM_PAGESIZE); else priv->mfunc.comm = - ioremap(pci_resource_start(dev->pdev, 2) + + ioremap(pci_resource_start(dev->persist->pdev, 2) + MLX4_SLAVE_COMM_BASE, MLX4_COMM_PAGESIZE); if (!priv->mfunc.comm) { mlx4_err(dev, "Couldn't map communication vector\n"); @@ -2107,9 +2108,9 @@ err_comm_admin: err_comm: iounmap(priv->mfunc.comm); err_vhcr: - dma_free_coherent(&(dev->pdev->dev), PAGE_SIZE, - priv->mfunc.vhcr, - priv->mfunc.vhcr_dma); + dma_free_coherent(&dev->persist->pdev->dev, PAGE_SIZE, + priv->mfunc.vhcr, + priv->mfunc.vhcr_dma); priv->mfunc.vhcr = NULL; return -ENOMEM; } @@ -2130,8 +2131,8 @@ int mlx4_cmd_init(struct mlx4_dev *dev) } if (!mlx4_is_slave(dev) && !priv->cmd.hcr) { - priv->cmd.hcr = ioremap(pci_resource_start(dev->pdev, 0) + - MLX4_HCR_BASE, MLX4_HCR_SIZE); + priv->cmd.hcr = ioremap(pci_resource_start(dev->persist->pdev, + 0) + MLX4_HCR_BASE, MLX4_HCR_SIZE); if (!priv->cmd.hcr) { mlx4_err(dev, "Couldn't map command register\n"); goto err; @@ -2140,7 +2141,8 @@ int mlx4_cmd_init(struct mlx4_dev *dev) } if (mlx4_is_mfunc(dev) && !priv->mfunc.vhcr) { - priv->mfunc.vhcr = dma_alloc_coherent(&(dev->pdev->dev), PAGE_SIZE, + priv->mfunc.vhcr = dma_alloc_coherent(&dev->persist->pdev->dev, + PAGE_SIZE, &priv->mfunc.vhcr_dma, GFP_KERNEL); if (!priv->mfunc.vhcr) @@ -2150,7 +2152,8 @@ int mlx4_cmd_init(struct mlx4_dev *dev) } if (!priv->cmd.pool) { - priv->cmd.pool = pci_pool_create("mlx4_cmd", dev->pdev, + priv->cmd.pool = pci_pool_create("mlx4_cmd", + dev->persist->pdev, MLX4_MAILBOX_SIZE, MLX4_MAILBOX_SIZE, 0); if (!priv->cmd.pool) @@ -2202,7 +2205,7 @@ void mlx4_cmd_cleanup(struct mlx4_dev *dev, int cleanup_mask) } if (mlx4_is_mfunc(dev) && priv->mfunc.vhcr && (cleanup_mask & MLX4_CMD_CLEANUP_VHCR)) { - dma_free_coherent(&(dev->pdev->dev), PAGE_SIZE, + dma_free_coherent(&dev->persist->pdev->dev, PAGE_SIZE, priv->mfunc.vhcr, priv->mfunc.vhcr_dma); priv->mfunc.vhcr = NULL; } @@ -2306,8 +2309,9 @@ u32 mlx4_comm_get_version(void) static int mlx4_get_slave_indx(struct mlx4_dev *dev, int vf) { - if ((vf < 0) || (vf >= dev->num_vfs)) { - mlx4_err(dev, "Bad vf number:%d (number of activated vf: %d)\n", vf, dev->num_vfs); + if ((vf < 0) || (vf >= dev->persist->num_vfs)) { + mlx4_err(dev, "Bad vf number:%d (number of activated vf: %d)\n", + vf, dev->persist->num_vfs); return -EINVAL; } @@ -2316,7 +2320,7 @@ static int mlx4_get_slave_indx(struct mlx4_dev *dev, int vf) int mlx4_get_vf_indx(struct mlx4_dev *dev, int slave) { - if (slave < 1 || slave > dev->num_vfs) { + if (slave < 1 || slave > dev->persist->num_vfs) { mlx4_err(dev, "Bad slave number:%d (number of activated slaves: %lu)\n", slave, dev->num_slaves); @@ -2388,7 +2392,7 @@ struct mlx4_slaves_pport mlx4_phys_to_slaves_pport(struct mlx4_dev *dev, if (port <= 0 || port > dev->caps.num_ports) return slaves_pport; - for (i = 0; i < dev->num_vfs + 1; i++) { + for (i = 0; i < dev->persist->num_vfs + 1; i++) { struct 
mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, i); if (test_bit(port - 1, actv_ports.ports)) @@ -2408,7 +2412,7 @@ struct mlx4_slaves_pport mlx4_phys_to_slaves_pport_actv( bitmap_zero(slaves_pport.slaves, MLX4_MFUNC_MAX); - for (i = 0; i < dev->num_vfs + 1; i++) { + for (i = 0; i < dev->persist->num_vfs + 1; i++) { struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, i); if (bitmap_equal(crit_ports->ports, actv_ports.ports, diff --git a/drivers/net/ethernet/mellanox/mlx4/en_cq.c b/drivers/net/ethernet/mellanox/mlx4/en_cq.c index 82322b1c8411..22da4d0d0f05 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_cq.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_cq.c @@ -70,10 +70,10 @@ int mlx4_en_create_cq(struct mlx4_en_priv *priv, /* Allocate HW buffers on provided NUMA node. * dev->numa_node is used in mtt range allocation flow. */ - set_dev_node(&mdev->dev->pdev->dev, node); + set_dev_node(&mdev->dev->persist->pdev->dev, node); err = mlx4_alloc_hwq_res(mdev->dev, &cq->wqres, cq->buf_size, 2 * PAGE_SIZE); - set_dev_node(&mdev->dev->pdev->dev, mdev->dev->numa_node); + set_dev_node(&mdev->dev->persist->pdev->dev, mdev->dev->numa_node); if (err) goto err_cq; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c index 90e0f045a6bc..569eda9e83d6 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c @@ -92,7 +92,7 @@ mlx4_en_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo) (u16) (mdev->dev->caps.fw_ver >> 32), (u16) ((mdev->dev->caps.fw_ver >> 16) & 0xffff), (u16) (mdev->dev->caps.fw_ver & 0xffff)); - strlcpy(drvinfo->bus_info, pci_name(mdev->dev->pdev), + strlcpy(drvinfo->bus_info, pci_name(mdev->dev->persist->pdev), sizeof(drvinfo->bus_info)); drvinfo->n_stats = 0; drvinfo->regdump_len = 0; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_main.c b/drivers/net/ethernet/mellanox/mlx4/en_main.c index 9f16f754137b..c643d2bbb7b9 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_main.c @@ -241,8 +241,8 @@ static void *mlx4_en_add(struct mlx4_dev *dev) spin_lock_init(&mdev->uar_lock); mdev->dev = dev; - mdev->dma_device = &(dev->pdev->dev); - mdev->pdev = dev->pdev; + mdev->dma_device = &dev->persist->pdev->dev; + mdev->pdev = dev->persist->pdev; mdev->device_up = false; mdev->LSO_support = !!(dev->caps.flags & (1 << 15)); diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index d0d6dc1b8e46..43a3f9822f74 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2457,7 +2457,7 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port, netif_set_real_num_tx_queues(dev, prof->tx_ring_num); netif_set_real_num_rx_queues(dev, prof->rx_ring_num); - SET_NETDEV_DEV(dev, &mdev->dev->pdev->dev); + SET_NETDEV_DEV(dev, &mdev->dev->persist->pdev->dev); dev->dev_port = port - 1; /* diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index a0474eb94aa3..2ba5d368edce 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -387,10 +387,10 @@ int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv, ring->rx_info, tmp); /* Allocate HW buffers on provided NUMA node */ - set_dev_node(&mdev->dev->pdev->dev, node); + set_dev_node(&mdev->dev->persist->pdev->dev, node); err = 
mlx4_alloc_hwq_res(mdev->dev, &ring->wqres, ring->buf_size, 2 * PAGE_SIZE); - set_dev_node(&mdev->dev->pdev->dev, mdev->dev->numa_node); + set_dev_node(&mdev->dev->persist->pdev->dev, mdev->dev->numa_node); if (err) goto err_info; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index 359bb1286eb5..55f9f5c5344e 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -91,10 +91,10 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, ring->buf_size = ALIGN(size * ring->stride, MLX4_EN_PAGE_SIZE); /* Allocate HW buffers on provided NUMA node */ - set_dev_node(&mdev->dev->pdev->dev, node); + set_dev_node(&mdev->dev->persist->pdev->dev, node); err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres, ring->buf_size, 2 * PAGE_SIZE); - set_dev_node(&mdev->dev->pdev->dev, mdev->dev->numa_node); + set_dev_node(&mdev->dev->persist->pdev->dev, mdev->dev->numa_node); if (err) { en_err(priv, "Failed allocating hwq resources\n"); goto err_bounce; diff --git a/drivers/net/ethernet/mellanox/mlx4/eq.c b/drivers/net/ethernet/mellanox/mlx4/eq.c index 3d275fbaf0eb..7538c9ce98a9 100644 --- a/drivers/net/ethernet/mellanox/mlx4/eq.c +++ b/drivers/net/ethernet/mellanox/mlx4/eq.c @@ -237,7 +237,7 @@ int mlx4_gen_guid_change_eqe(struct mlx4_dev *dev, int slave, u8 port) struct mlx4_eqe eqe; /*don't send if we don't have the that slave */ - if (dev->num_vfs < slave) + if (dev->persist->num_vfs < slave) return 0; memset(&eqe, 0, sizeof eqe); @@ -255,7 +255,7 @@ int mlx4_gen_port_state_change_eqe(struct mlx4_dev *dev, int slave, u8 port, struct mlx4_eqe eqe; /*don't send if we don't have the that slave */ - if (dev->num_vfs < slave) + if (dev->persist->num_vfs < slave) return 0; memset(&eqe, 0, sizeof eqe); @@ -310,7 +310,7 @@ static void set_all_slave_state(struct mlx4_dev *dev, u8 port, int event) struct mlx4_slaves_pport slaves_pport = mlx4_phys_to_slaves_pport(dev, port); - for (i = 0; i < dev->num_vfs + 1; i++) + for (i = 0; i < dev->persist->num_vfs + 1; i++) if (test_bit(i, slaves_pport.slaves)) set_and_calc_slave_port_state(dev, i, port, event, &gen_event); @@ -560,7 +560,8 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) mlx4_priv(dev)->sense.do_sense_port[port] = 1; if (!mlx4_is_master(dev)) break; - for (i = 0; i < dev->num_vfs + 1; i++) { + for (i = 0; i < dev->persist->num_vfs + 1; + i++) { if (!test_bit(i, slaves_port.slaves)) continue; if (dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH) { @@ -596,7 +597,9 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) if (!mlx4_is_master(dev)) break; if (dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH) - for (i = 0; i < dev->num_vfs + 1; i++) { + for (i = 0; + i < dev->persist->num_vfs + 1; + i++) { if (!test_bit(i, slaves_port.slaves)) continue; if (i == mlx4_master_func_num(dev)) @@ -865,7 +868,7 @@ static void __iomem *mlx4_get_eq_uar(struct mlx4_dev *dev, struct mlx4_eq *eq) if (!priv->eq_table.uar_map[index]) { priv->eq_table.uar_map[index] = - ioremap(pci_resource_start(dev->pdev, 2) + + ioremap(pci_resource_start(dev->persist->pdev, 2) + ((eq->eqn / 4) << PAGE_SHIFT), PAGE_SIZE); if (!priv->eq_table.uar_map[index]) { @@ -928,8 +931,10 @@ static int mlx4_create_eq(struct mlx4_dev *dev, int nent, eq_context = mailbox->buf; for (i = 0; i < npages; ++i) { - eq->page_list[i].buf = dma_alloc_coherent(&dev->pdev->dev, - PAGE_SIZE, &t, GFP_KERNEL); + eq->page_list[i].buf = dma_alloc_coherent(&dev->persist-> + pdev->dev, + PAGE_SIZE, 
&t, + GFP_KERNEL); if (!eq->page_list[i].buf) goto err_out_free_pages; @@ -995,7 +1000,7 @@ err_out_free_eq: err_out_free_pages: for (i = 0; i < npages; ++i) if (eq->page_list[i].buf) - dma_free_coherent(&dev->pdev->dev, PAGE_SIZE, + dma_free_coherent(&dev->persist->pdev->dev, PAGE_SIZE, eq->page_list[i].buf, eq->page_list[i].map); @@ -1044,9 +1049,9 @@ static void mlx4_free_eq(struct mlx4_dev *dev, mlx4_mtt_cleanup(dev, &eq->mtt); for (i = 0; i < npages; ++i) - dma_free_coherent(&dev->pdev->dev, PAGE_SIZE, - eq->page_list[i].buf, - eq->page_list[i].map); + dma_free_coherent(&dev->persist->pdev->dev, PAGE_SIZE, + eq->page_list[i].buf, + eq->page_list[i].map); kfree(eq->page_list); mlx4_bitmap_free(&priv->eq_table.bitmap, eq->eqn, MLX4_USE_RR); @@ -1060,7 +1065,7 @@ static void mlx4_free_irqs(struct mlx4_dev *dev) int i, vec; if (eq_table->have_irq) - free_irq(dev->pdev->irq, dev); + free_irq(dev->persist->pdev->irq, dev); for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i) if (eq_table->eq[i].have_irq) { @@ -1089,7 +1094,8 @@ static int mlx4_map_clr_int(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); - priv->clr_base = ioremap(pci_resource_start(dev->pdev, priv->fw.clr_int_bar) + + priv->clr_base = ioremap(pci_resource_start(dev->persist->pdev, + priv->fw.clr_int_bar) + priv->fw.clr_int_base, MLX4_CLR_INT_SIZE); if (!priv->clr_base) { mlx4_err(dev, "Couldn't map interrupt clear register, aborting\n"); @@ -1212,13 +1218,13 @@ int mlx4_init_eq_table(struct mlx4_dev *dev) i * MLX4_IRQNAME_SIZE, MLX4_IRQNAME_SIZE, "mlx4-comp-%d@pci:%s", i, - pci_name(dev->pdev)); + pci_name(dev->persist->pdev)); } else { snprintf(priv->eq_table.irq_names + i * MLX4_IRQNAME_SIZE, MLX4_IRQNAME_SIZE, "mlx4-async@pci:%s", - pci_name(dev->pdev)); + pci_name(dev->persist->pdev)); } eq_name = priv->eq_table.irq_names + @@ -1235,8 +1241,8 @@ int mlx4_init_eq_table(struct mlx4_dev *dev) snprintf(priv->eq_table.irq_names, MLX4_IRQNAME_SIZE, DRV_NAME "@pci:%s", - pci_name(dev->pdev)); - err = request_irq(dev->pdev->irq, mlx4_interrupt, + pci_name(dev->persist->pdev)); + err = request_irq(dev->persist->pdev->irq, mlx4_interrupt, IRQF_SHARED, priv->eq_table.irq_names, dev); if (err) goto err_out_async; diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c b/drivers/net/ethernet/mellanox/mlx4/icm.c index 97c9b1db1d27..2a9dd460a95f 100644 --- a/drivers/net/ethernet/mellanox/mlx4/icm.c +++ b/drivers/net/ethernet/mellanox/mlx4/icm.c @@ -56,7 +56,7 @@ static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct mlx4_icm_chunk *chu int i; if (chunk->nsg > 0) - pci_unmap_sg(dev->pdev, chunk->mem, chunk->npages, + pci_unmap_sg(dev->persist->pdev, chunk->mem, chunk->npages, PCI_DMA_BIDIRECTIONAL); for (i = 0; i < chunk->npages; ++i) @@ -69,7 +69,8 @@ static void mlx4_free_icm_coherent(struct mlx4_dev *dev, struct mlx4_icm_chunk * int i; for (i = 0; i < chunk->npages; ++i) - dma_free_coherent(&dev->pdev->dev, chunk->mem[i].length, + dma_free_coherent(&dev->persist->pdev->dev, + chunk->mem[i].length, lowmem_page_address(sg_page(&chunk->mem[i])), sg_dma_address(&chunk->mem[i])); } @@ -173,7 +174,7 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, --cur_order; if (coherent) - ret = mlx4_alloc_icm_coherent(&dev->pdev->dev, + ret = mlx4_alloc_icm_coherent(&dev->persist->pdev->dev, &chunk->mem[chunk->npages], cur_order, gfp_mask); else @@ -193,7 +194,7 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, if (coherent) ++chunk->nsg; else if (chunk->npages == MLX4_ICM_CHUNK_LEN) { - 
chunk->nsg = pci_map_sg(dev->pdev, chunk->mem, + chunk->nsg = pci_map_sg(dev->persist->pdev, chunk->mem, chunk->npages, PCI_DMA_BIDIRECTIONAL); @@ -208,7 +209,7 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, } if (!coherent && chunk) { - chunk->nsg = pci_map_sg(dev->pdev, chunk->mem, + chunk->nsg = pci_map_sg(dev->persist->pdev, chunk->mem, chunk->npages, PCI_DMA_BIDIRECTIONAL); diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 03e9eb0dc761..abcee61f8a47 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -318,10 +318,11 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) return -ENODEV; } - if (dev_cap->uar_size > pci_resource_len(dev->pdev, 2)) { + if (dev_cap->uar_size > pci_resource_len(dev->persist->pdev, 2)) { mlx4_err(dev, "HCA reported UAR size of 0x%x bigger than PCI resource 2 size of 0x%llx, aborting\n", dev_cap->uar_size, - (unsigned long long) pci_resource_len(dev->pdev, 2)); + (unsigned long long) + pci_resource_len(dev->persist->pdev, 2)); return -ENODEV; } @@ -541,8 +542,10 @@ static int mlx4_get_pcie_dev_link_caps(struct mlx4_dev *dev, *speed = PCI_SPEED_UNKNOWN; *width = PCIE_LNK_WIDTH_UNKNOWN; - err1 = pcie_capability_read_dword(dev->pdev, PCI_EXP_LNKCAP, &lnkcap1); - err2 = pcie_capability_read_dword(dev->pdev, PCI_EXP_LNKCAP2, &lnkcap2); + err1 = pcie_capability_read_dword(dev->persist->pdev, PCI_EXP_LNKCAP, + &lnkcap1); + err2 = pcie_capability_read_dword(dev->persist->pdev, PCI_EXP_LNKCAP2, + &lnkcap2); if (!err2 && lnkcap2) { /* PCIe r3.0-compliant */ if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_8_0GB) *speed = PCIE_SPEED_8_0GT; @@ -587,7 +590,7 @@ static void mlx4_check_pcie_caps(struct mlx4_dev *dev) return; } - err = pcie_get_minimum_link(dev->pdev, &speed, &width); + err = pcie_get_minimum_link(dev->persist->pdev, &speed, &width); if (err || speed == PCI_SPEED_UNKNOWN || width == PCIE_LNK_WIDTH_UNKNOWN) { mlx4_warn(dev, @@ -837,10 +840,12 @@ static int mlx4_slave_cap(struct mlx4_dev *dev) if (dev->caps.uar_page_size * (dev->caps.num_uars - dev->caps.reserved_uars) > - pci_resource_len(dev->pdev, 2)) { + pci_resource_len(dev->persist->pdev, + 2)) { mlx4_err(dev, "HCA reported UAR region size of 0x%x bigger than PCI resource 2 size of 0x%llx, aborting\n", dev->caps.uar_page_size * dev->caps.num_uars, - (unsigned long long) pci_resource_len(dev->pdev, 2)); + (unsigned long long) + pci_resource_len(dev->persist->pdev, 2)); goto err_mem; } @@ -1492,9 +1497,9 @@ static int map_bf_area(struct mlx4_dev *dev) if (!dev->caps.bf_reg_size) return -ENXIO; - bf_start = pci_resource_start(dev->pdev, 2) + + bf_start = pci_resource_start(dev->persist->pdev, 2) + (dev->caps.num_uars << PAGE_SHIFT); - bf_len = pci_resource_len(dev->pdev, 2) - + bf_len = pci_resource_len(dev->persist->pdev, 2) - (dev->caps.num_uars << PAGE_SHIFT); priv->bf_mapping = io_mapping_create_wc(bf_start, bf_len); if (!priv->bf_mapping) @@ -1536,7 +1541,8 @@ static int map_internal_clock(struct mlx4_dev *dev) struct mlx4_priv *priv = mlx4_priv(dev); priv->clock_mapping = - ioremap(pci_resource_start(dev->pdev, priv->fw.clock_bar) + + ioremap(pci_resource_start(dev->persist->pdev, + priv->fw.clock_bar) + priv->fw.clock_offset, MLX4_CLOCK_SIZE); if (!priv->clock_mapping) @@ -1705,7 +1711,8 @@ static void choose_steering_mode(struct mlx4_dev *dev, if (mlx4_log_num_mgm_entry_size <= 0 && dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_FS_EN && (!mlx4_is_mfunc(dev) || - 
(dev_cap->fs_max_num_qp_per_entry >= (dev->num_vfs + 1))) && + (dev_cap->fs_max_num_qp_per_entry >= + (dev->persist->num_vfs + 1))) && choose_log_fs_mgm_entry_size(dev_cap->fs_max_num_qp_per_entry) >= MLX4_MIN_MGM_LOG_ENTRY_SIZE) { dev->oper_log_mgm_entry_size = @@ -2288,7 +2295,8 @@ static void mlx4_enable_msi_x(struct mlx4_dev *dev) for (i = 0; i < nreq; ++i) entries[i].entry = i; - nreq = pci_enable_msix_range(dev->pdev, entries, 2, nreq); + nreq = pci_enable_msix_range(dev->persist->pdev, entries, 2, + nreq); if (nreq < 0) { kfree(entries); @@ -2316,7 +2324,7 @@ no_msi: dev->caps.comp_pool = 0; for (i = 0; i < 2; ++i) - priv->eq_table.eq[i].irq = dev->pdev->irq; + priv->eq_table.eq[i].irq = dev->persist->pdev->irq; } static int mlx4_init_port_info(struct mlx4_dev *dev, int port) @@ -2344,7 +2352,7 @@ static int mlx4_init_port_info(struct mlx4_dev *dev, int port) info->port_attr.show = show_port_type; sysfs_attr_init(&info->port_attr.attr); - err = device_create_file(&dev->pdev->dev, &info->port_attr); + err = device_create_file(&dev->persist->pdev->dev, &info->port_attr); if (err) { mlx4_err(dev, "Failed to create file for port %d\n", port); info->port = -1; @@ -2361,10 +2369,12 @@ static int mlx4_init_port_info(struct mlx4_dev *dev, int port) info->port_mtu_attr.show = show_port_ib_mtu; sysfs_attr_init(&info->port_mtu_attr.attr); - err = device_create_file(&dev->pdev->dev, &info->port_mtu_attr); + err = device_create_file(&dev->persist->pdev->dev, + &info->port_mtu_attr); if (err) { mlx4_err(dev, "Failed to create mtu file for port %d\n", port); - device_remove_file(&info->dev->pdev->dev, &info->port_attr); + device_remove_file(&info->dev->persist->pdev->dev, + &info->port_attr); info->port = -1; } @@ -2376,8 +2386,9 @@ static void mlx4_cleanup_port_info(struct mlx4_port_info *info) if (info->port < 0) return; - device_remove_file(&info->dev->pdev->dev, &info->port_attr); - device_remove_file(&info->dev->pdev->dev, &info->port_mtu_attr); + device_remove_file(&info->dev->persist->pdev->dev, &info->port_attr); + device_remove_file(&info->dev->persist->pdev->dev, + &info->port_mtu_attr); } static int mlx4_init_steering(struct mlx4_dev *dev) @@ -2444,10 +2455,11 @@ static int mlx4_get_ownership(struct mlx4_dev *dev) void __iomem *owner; u32 ret; - if (pci_channel_offline(dev->pdev)) + if (pci_channel_offline(dev->persist->pdev)) return -EIO; - owner = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_OWNER_BASE, + owner = ioremap(pci_resource_start(dev->persist->pdev, 0) + + MLX4_OWNER_BASE, MLX4_OWNER_SIZE); if (!owner) { mlx4_err(dev, "Failed to obtain ownership bit\n"); @@ -2463,10 +2475,11 @@ static void mlx4_free_ownership(struct mlx4_dev *dev) { void __iomem *owner; - if (pci_channel_offline(dev->pdev)) + if (pci_channel_offline(dev->persist->pdev)) return; - owner = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_OWNER_BASE, + owner = ioremap(pci_resource_start(dev->persist->pdev, 0) + + MLX4_OWNER_BASE, MLX4_OWNER_SIZE); if (!owner) { mlx4_err(dev, "Failed to obtain ownership bit\n"); @@ -2514,13 +2527,13 @@ static u64 mlx4_enable_sriov(struct mlx4_dev *dev, struct pci_dev *pdev, dev_flags |= MLX4_FLAG_SRIOV | MLX4_FLAG_MASTER; dev_flags &= ~MLX4_FLAG_SLAVE; - dev->num_vfs = total_vfs; + dev->persist->num_vfs = total_vfs; } return dev_flags; disable_sriov: atomic_dec(&pf_loading); - dev->num_vfs = 0; + dev->persist->num_vfs = 0; kfree(dev->dev_vfs); return dev_flags & ~MLX4_FLAG_MASTER; } @@ -2607,7 +2620,7 @@ static int mlx4_load_one(struct pci_dev *pdev, int pci_dev_data, 
existing_vfs = pci_num_vf(pdev); if (existing_vfs) dev->flags |= MLX4_FLAG_SRIOV; - dev->num_vfs = total_vfs; + dev->persist->num_vfs = total_vfs; } } @@ -2771,12 +2784,14 @@ slave_start: dev->caps.num_ports); goto err_close; } - memcpy(dev->nvfs, nvfs, sizeof(dev->nvfs)); + memcpy(dev->persist->nvfs, nvfs, sizeof(dev->persist->nvfs)); - for (i = 0; i < sizeof(dev->nvfs)/sizeof(dev->nvfs[0]); i++) { + for (i = 0; + i < sizeof(dev->persist->nvfs)/ + sizeof(dev->persist->nvfs[0]); i++) { unsigned j; - for (j = 0; j < dev->nvfs[i]; ++sum, ++j) { + for (j = 0; j < dev->persist->nvfs[i]; ++sum, ++j) { dev->dev_vfs[sum].min_port = i < 2 ? i + 1 : 1; dev->dev_vfs[sum].n_ports = i < 2 ? 1 : dev->caps.num_ports; @@ -2846,7 +2861,7 @@ slave_start: priv->removed = 0; - if (mlx4_is_master(dev) && dev->num_vfs) + if (mlx4_is_master(dev) && dev->persist->num_vfs) atomic_dec(&pf_loading); kfree(dev_cap); @@ -2908,7 +2923,7 @@ err_sriov: if (dev->flags & MLX4_FLAG_SRIOV && !existing_vfs) pci_disable_sriov(pdev); - if (mlx4_is_master(dev) && dev->num_vfs) + if (mlx4_is_master(dev) && dev->persist->num_vfs) atomic_dec(&pf_loading); kfree(priv->dev.dev_vfs); @@ -3076,20 +3091,28 @@ static int mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) return -ENOMEM; dev = &priv->dev; - dev->pdev = pdev; - pci_set_drvdata(pdev, dev); + dev->persist = kzalloc(sizeof(*dev->persist), GFP_KERNEL); + if (!dev->persist) { + kfree(priv); + return -ENOMEM; + } + dev->persist->pdev = pdev; + dev->persist->dev = dev; + pci_set_drvdata(pdev, dev->persist); priv->pci_dev_data = id->driver_data; ret = __mlx4_init_one(pdev, id->driver_data, priv); - if (ret) + if (ret) { + kfree(dev->persist); kfree(priv); - + } return ret; } static void mlx4_unload_one(struct pci_dev *pdev) { - struct mlx4_dev *dev = pci_get_drvdata(pdev); + struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev); + struct mlx4_dev *dev = persist->dev; struct mlx4_priv *priv = mlx4_priv(dev); int pci_dev_data; int p; @@ -3155,7 +3178,7 @@ static void mlx4_unload_one(struct pci_dev *pdev) mlx4_warn(dev, "Disabling SR-IOV\n"); pci_disable_sriov(pdev); dev->flags &= ~MLX4_FLAG_SRIOV; - dev->num_vfs = 0; + dev->persist->num_vfs = 0; } if (!mlx4_is_slave(dev)) @@ -3175,26 +3198,29 @@ static void mlx4_unload_one(struct pci_dev *pdev) static void mlx4_remove_one(struct pci_dev *pdev) { - struct mlx4_dev *dev = pci_get_drvdata(pdev); + struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev); + struct mlx4_dev *dev = persist->dev; struct mlx4_priv *priv = mlx4_priv(dev); mlx4_unload_one(pdev); pci_release_regions(pdev); pci_disable_device(pdev); + kfree(dev->persist); kfree(priv); pci_set_drvdata(pdev, NULL); } int mlx4_restart_one(struct pci_dev *pdev) { - struct mlx4_dev *dev = pci_get_drvdata(pdev); + struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev); + struct mlx4_dev *dev = persist->dev; struct mlx4_priv *priv = mlx4_priv(dev); int nvfs[MLX4_MAX_PORTS + 1] = {0, 0, 0}; int pci_dev_data, err, total_vfs; pci_dev_data = priv->pci_dev_data; - total_vfs = dev->num_vfs; - memcpy(nvfs, dev->nvfs, sizeof(dev->nvfs)); + total_vfs = dev->persist->num_vfs; + memcpy(nvfs, dev->persist->nvfs, sizeof(dev->persist->nvfs)); mlx4_unload_one(pdev); err = mlx4_load_one(pdev, pci_dev_data, total_vfs, nvfs, priv); diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index bdd4eea2247c..faa37ab75a9d 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ 
-221,16 +221,17 @@ extern int mlx4_debug_level; #define mlx4_dbg(mdev, format, ...) \ do { \ if (mlx4_debug_level) \ - dev_printk(KERN_DEBUG, &(mdev)->pdev->dev, format, \ + dev_printk(KERN_DEBUG, \ + &(mdev)->persist->pdev->dev, format, \ ##__VA_ARGS__); \ } while (0) #define mlx4_err(mdev, format, ...) \ - dev_err(&(mdev)->pdev->dev, format, ##__VA_ARGS__) + dev_err(&(mdev)->persist->pdev->dev, format, ##__VA_ARGS__) #define mlx4_info(mdev, format, ...) \ - dev_info(&(mdev)->pdev->dev, format, ##__VA_ARGS__) + dev_info(&(mdev)->persist->pdev->dev, format, ##__VA_ARGS__) #define mlx4_warn(mdev, format, ...) \ - dev_warn(&(mdev)->pdev->dev, format, ##__VA_ARGS__) + dev_warn(&(mdev)->persist->pdev->dev, format, ##__VA_ARGS__) extern int mlx4_log_num_mgm_entry_size; extern int log_mtts_per_seg; diff --git a/drivers/net/ethernet/mellanox/mlx4/mr.c b/drivers/net/ethernet/mellanox/mlx4/mr.c index 7094a9c70fd5..8dbdf1d29357 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mr.c +++ b/drivers/net/ethernet/mellanox/mlx4/mr.c @@ -708,13 +708,13 @@ static int mlx4_write_mtt_chunk(struct mlx4_dev *dev, struct mlx4_mtt *mtt, if (!mtts) return -ENOMEM; - dma_sync_single_for_cpu(&dev->pdev->dev, dma_handle, + dma_sync_single_for_cpu(&dev->persist->pdev->dev, dma_handle, npages * sizeof (u64), DMA_TO_DEVICE); for (i = 0; i < npages; ++i) mtts[i] = cpu_to_be64(page_list[i] | MLX4_MTT_FLAG_PRESENT); - dma_sync_single_for_device(&dev->pdev->dev, dma_handle, + dma_sync_single_for_device(&dev->persist->pdev->dev, dma_handle, npages * sizeof (u64), DMA_TO_DEVICE); return 0; @@ -1020,13 +1020,13 @@ int mlx4_map_phys_fmr(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u64 *page_list /* Make sure MPT status is visible before writing MTT entries */ wmb(); - dma_sync_single_for_cpu(&dev->pdev->dev, fmr->dma_handle, + dma_sync_single_for_cpu(&dev->persist->pdev->dev, fmr->dma_handle, npages * sizeof(u64), DMA_TO_DEVICE); for (i = 0; i < npages; ++i) fmr->mtts[i] = cpu_to_be64(page_list[i] | MLX4_MTT_FLAG_PRESENT); - dma_sync_single_for_device(&dev->pdev->dev, fmr->dma_handle, + dma_sync_single_for_device(&dev->persist->pdev->dev, fmr->dma_handle, npages * sizeof(u64), DMA_TO_DEVICE); fmr->mpt->key = cpu_to_be32(key); diff --git a/drivers/net/ethernet/mellanox/mlx4/pd.c b/drivers/net/ethernet/mellanox/mlx4/pd.c index 74216071201f..a42b4c0a9ed9 100644 --- a/drivers/net/ethernet/mellanox/mlx4/pd.c +++ b/drivers/net/ethernet/mellanox/mlx4/pd.c @@ -151,11 +151,13 @@ int mlx4_uar_alloc(struct mlx4_dev *dev, struct mlx4_uar *uar) return -ENOMEM; if (mlx4_is_slave(dev)) - offset = uar->index % ((int) pci_resource_len(dev->pdev, 2) / + offset = uar->index % ((int)pci_resource_len(dev->persist->pdev, + 2) / dev->caps.uar_page_size); else offset = uar->index; - uar->pfn = (pci_resource_start(dev->pdev, 2) >> PAGE_SHIFT) + offset; + uar->pfn = (pci_resource_start(dev->persist->pdev, 2) >> PAGE_SHIFT) + + offset; uar->map = NULL; return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx4/port.c b/drivers/net/ethernet/mellanox/mlx4/port.c index 30eb1ead0fe6..9f268f05290a 100644 --- a/drivers/net/ethernet/mellanox/mlx4/port.c +++ b/drivers/net/ethernet/mellanox/mlx4/port.c @@ -553,9 +553,9 @@ int mlx4_get_slave_num_gids(struct mlx4_dev *dev, int slave, int port) slaves_pport_actv = mlx4_phys_to_slaves_pport_actv( dev, &exclusive_ports); slave_gid -= bitmap_weight(slaves_pport_actv.slaves, - dev->num_vfs + 1); + dev->persist->num_vfs + 1); } - vfs = bitmap_weight(slaves_pport.slaves, dev->num_vfs + 1) - 1; + vfs = 
bitmap_weight(slaves_pport.slaves, dev->persist->num_vfs + 1) - 1; if (slave_gid <= ((MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS) % vfs)) return ((MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS) / vfs) + 1; return (MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS) / vfs; @@ -590,10 +590,10 @@ int mlx4_get_base_gid_ix(struct mlx4_dev *dev, int slave, int port) slaves_pport_actv = mlx4_phys_to_slaves_pport_actv( dev, &exclusive_ports); slave_gid -= bitmap_weight(slaves_pport_actv.slaves, - dev->num_vfs + 1); + dev->persist->num_vfs + 1); } gids = MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS; - vfs = bitmap_weight(slaves_pport.slaves, dev->num_vfs + 1) - 1; + vfs = bitmap_weight(slaves_pport.slaves, dev->persist->num_vfs + 1) - 1; if (slave_gid <= gids % vfs) return MLX4_ROCE_PF_GIDS + ((gids / vfs) + 1) * (slave_gid - 1); @@ -644,7 +644,7 @@ void mlx4_reset_roce_gids(struct mlx4_dev *dev, int slave) int num_eth_ports, err; int i; - if (slave < 0 || slave > dev->num_vfs) + if (slave < 0 || slave > dev->persist->num_vfs) return; actv_ports = mlx4_get_active_ports(dev, slave); @@ -1214,7 +1214,8 @@ int mlx4_get_slave_from_roce_gid(struct mlx4_dev *dev, int port, u8 *gid, return -EINVAL; slaves_pport = mlx4_phys_to_slaves_pport(dev, port); - num_vfs = bitmap_weight(slaves_pport.slaves, dev->num_vfs + 1) - 1; + num_vfs = bitmap_weight(slaves_pport.slaves, + dev->persist->num_vfs + 1) - 1; for (i = 0; i < MLX4_ROCE_MAX_GIDS; i++) { if (!memcmp(priv->port[port].gid_table.roce_gids[i].raw, gid, @@ -1258,7 +1259,7 @@ int mlx4_get_slave_from_roce_gid(struct mlx4_dev *dev, int port, u8 *gid, dev, &exclusive_ports); num_vfs_before += bitmap_weight( slaves_pport_actv.slaves, - dev->num_vfs + 1); + dev->persist->num_vfs + 1); } /* candidate_slave_gid isn't necessarily the correct slave, but @@ -1288,7 +1289,7 @@ int mlx4_get_slave_from_roce_gid(struct mlx4_dev *dev, int port, u8 *gid, dev, &exclusive_ports); slave_gid += bitmap_weight( slaves_pport_actv.slaves, - dev->num_vfs + 1); + dev->persist->num_vfs + 1); } } *slave_id = slave_gid; diff --git a/drivers/net/ethernet/mellanox/mlx4/reset.c b/drivers/net/ethernet/mellanox/mlx4/reset.c index ea1c6d092145..0076d88587ca 100644 --- a/drivers/net/ethernet/mellanox/mlx4/reset.c +++ b/drivers/net/ethernet/mellanox/mlx4/reset.c @@ -76,19 +76,21 @@ int mlx4_reset(struct mlx4_dev *dev) goto out; } - pcie_cap = pci_pcie_cap(dev->pdev); + pcie_cap = pci_pcie_cap(dev->persist->pdev); for (i = 0; i < 64; ++i) { if (i == 22 || i == 23) continue; - if (pci_read_config_dword(dev->pdev, i * 4, hca_header + i)) { + if (pci_read_config_dword(dev->persist->pdev, i * 4, + hca_header + i)) { err = -ENODEV; mlx4_err(dev, "Couldn't save HCA PCI header, aborting\n"); goto out; } } - reset = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_RESET_BASE, + reset = ioremap(pci_resource_start(dev->persist->pdev, 0) + + MLX4_RESET_BASE, MLX4_RESET_SIZE); if (!reset) { err = -ENOMEM; @@ -122,8 +124,8 @@ int mlx4_reset(struct mlx4_dev *dev) end = jiffies + MLX4_RESET_TIMEOUT_JIFFIES; do { - if (!pci_read_config_word(dev->pdev, PCI_VENDOR_ID, &vendor) && - vendor != 0xffff) + if (!pci_read_config_word(dev->persist->pdev, PCI_VENDOR_ID, + &vendor) && vendor != 0xffff) break; msleep(1); @@ -138,14 +140,16 @@ int mlx4_reset(struct mlx4_dev *dev) /* Now restore the PCI headers */ if (pcie_cap) { devctl = hca_header[(pcie_cap + PCI_EXP_DEVCTL) / 4]; - if (pcie_capability_write_word(dev->pdev, PCI_EXP_DEVCTL, + if (pcie_capability_write_word(dev->persist->pdev, + PCI_EXP_DEVCTL, devctl)) { err = -ENODEV; mlx4_err(dev, 
"Couldn't restore HCA PCI Express Device Control register, aborting\n"); goto out; } linkctl = hca_header[(pcie_cap + PCI_EXP_LNKCTL) / 4]; - if (pcie_capability_write_word(dev->pdev, PCI_EXP_LNKCTL, + if (pcie_capability_write_word(dev->persist->pdev, + PCI_EXP_LNKCTL, linkctl)) { err = -ENODEV; mlx4_err(dev, "Couldn't restore HCA PCI Express Link control register, aborting\n"); @@ -157,7 +161,8 @@ int mlx4_reset(struct mlx4_dev *dev) if (i * 4 == PCI_COMMAND) continue; - if (pci_write_config_dword(dev->pdev, i * 4, hca_header[i])) { + if (pci_write_config_dword(dev->persist->pdev, i * 4, + hca_header[i])) { err = -ENODEV; mlx4_err(dev, "Couldn't restore HCA reg %x, aborting\n", i); @@ -165,7 +170,7 @@ int mlx4_reset(struct mlx4_dev *dev) } } - if (pci_write_config_dword(dev->pdev, PCI_COMMAND, + if (pci_write_config_dword(dev->persist->pdev, PCI_COMMAND, hca_header[PCI_COMMAND / 4])) { err = -ENODEV; mlx4_err(dev, "Couldn't restore HCA COMMAND, aborting\n"); diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c index 4efbd1eca611..3e93879bccce 100644 --- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c +++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c @@ -309,12 +309,13 @@ static inline int mlx4_grant_resource(struct mlx4_dev *dev, int slave, int allocated, free, reserved, guaranteed, from_free; int from_rsvd; - if (slave > dev->num_vfs) + if (slave > dev->persist->num_vfs) return -EINVAL; spin_lock(&res_alloc->alloc_lock); allocated = (port > 0) ? - res_alloc->allocated[(port - 1) * (dev->num_vfs + 1) + slave] : + res_alloc->allocated[(port - 1) * + (dev->persist->num_vfs + 1) + slave] : res_alloc->allocated[slave]; free = (port > 0) ? res_alloc->res_port_free[port - 1] : res_alloc->res_free; @@ -352,7 +353,8 @@ static inline int mlx4_grant_resource(struct mlx4_dev *dev, int slave, if (!err) { /* grant the request */ if (port > 0) { - res_alloc->allocated[(port - 1) * (dev->num_vfs + 1) + slave] += count; + res_alloc->allocated[(port - 1) * + (dev->persist->num_vfs + 1) + slave] += count; res_alloc->res_port_free[port - 1] -= count; res_alloc->res_port_rsvd[port - 1] -= from_rsvd; } else { @@ -376,13 +378,14 @@ static inline void mlx4_release_resource(struct mlx4_dev *dev, int slave, &priv->mfunc.master.res_tracker.res_alloc[res_type]; int allocated, guaranteed, from_rsvd; - if (slave > dev->num_vfs) + if (slave > dev->persist->num_vfs) return; spin_lock(&res_alloc->alloc_lock); allocated = (port > 0) ? 
- res_alloc->allocated[(port - 1) * (dev->num_vfs + 1) + slave] : + res_alloc->allocated[(port - 1) * + (dev->persist->num_vfs + 1) + slave] : res_alloc->allocated[slave]; guaranteed = res_alloc->guaranteed[slave]; @@ -397,7 +400,8 @@ static inline void mlx4_release_resource(struct mlx4_dev *dev, int slave, } if (port > 0) { - res_alloc->allocated[(port - 1) * (dev->num_vfs + 1) + slave] -= count; + res_alloc->allocated[(port - 1) * + (dev->persist->num_vfs + 1) + slave] -= count; res_alloc->res_port_free[port - 1] += count; res_alloc->res_port_rsvd[port - 1] += from_rsvd; } else { @@ -415,7 +419,8 @@ static inline void initialize_res_quotas(struct mlx4_dev *dev, enum mlx4_resource res_type, int vf, int num_instances) { - res_alloc->guaranteed[vf] = num_instances / (2 * (dev->num_vfs + 1)); + res_alloc->guaranteed[vf] = num_instances / + (2 * (dev->persist->num_vfs + 1)); res_alloc->quota[vf] = (num_instances / 2) + res_alloc->guaranteed[vf]; if (vf == mlx4_master_func_num(dev)) { res_alloc->res_free = num_instances; @@ -486,21 +491,26 @@ int mlx4_init_resource_tracker(struct mlx4_dev *dev) for (i = 0; i < MLX4_NUM_OF_RESOURCE_TYPE; i++) { struct resource_allocator *res_alloc = &priv->mfunc.master.res_tracker.res_alloc[i]; - res_alloc->quota = kmalloc((dev->num_vfs + 1) * sizeof(int), GFP_KERNEL); - res_alloc->guaranteed = kmalloc((dev->num_vfs + 1) * sizeof(int), GFP_KERNEL); + res_alloc->quota = kmalloc((dev->persist->num_vfs + 1) * + sizeof(int), GFP_KERNEL); + res_alloc->guaranteed = kmalloc((dev->persist->num_vfs + 1) * + sizeof(int), GFP_KERNEL); if (i == RES_MAC || i == RES_VLAN) res_alloc->allocated = kzalloc(MLX4_MAX_PORTS * - (dev->num_vfs + 1) * sizeof(int), - GFP_KERNEL); + (dev->persist->num_vfs + + 1) * + sizeof(int), GFP_KERNEL); else - res_alloc->allocated = kzalloc((dev->num_vfs + 1) * sizeof(int), GFP_KERNEL); + res_alloc->allocated = kzalloc((dev->persist-> + num_vfs + 1) * + sizeof(int), GFP_KERNEL); if (!res_alloc->quota || !res_alloc->guaranteed || !res_alloc->allocated) goto no_mem_err; spin_lock_init(&res_alloc->alloc_lock); - for (t = 0; t < dev->num_vfs + 1; t++) { + for (t = 0; t < dev->persist->num_vfs + 1; t++) { struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, t); switch (i) { diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index f1e41b33462f..1069ce65e8b4 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -744,8 +744,15 @@ struct mlx4_vf_dev { u8 n_ports; }; -struct mlx4_dev { +struct mlx4_dev_persistent { struct pci_dev *pdev; + struct mlx4_dev *dev; + int nvfs[MLX4_MAX_PORTS + 1]; + int num_vfs; +}; + +struct mlx4_dev { + struct mlx4_dev_persistent *persist; unsigned long flags; unsigned long num_slaves; struct mlx4_caps caps; @@ -754,13 +761,11 @@ struct mlx4_dev { struct radix_tree_root qp_table_tree; u8 rev_id; char board_id[MLX4_BOARD_ID_LEN]; - int num_vfs; int numa_node; int oper_log_mgm_entry_size; u64 regid_promisc_array[MLX4_MAX_PORTS + 1]; u64 regid_allmulti_array[MLX4_MAX_PORTS + 1]; struct mlx4_vf_dev *dev_vfs; - int nvfs[MLX4_MAX_PORTS + 1]; }; struct mlx4_eqe { -- cgit v1.2.3 From dd0eefe3abbf47442db296bf68f27eb2860c1cdf Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 25 Jan 2015 16:59:36 +0200 Subject: net/mlx4_core: Set device configuration data to be persistent across reset When an HCA enters an internal error state, this is detected by the driver. The driver then should reset the HCA and restart the software stack. 
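The mechanism that makes such a restart possible can be sketched as a small standalone C program. Every name below is an illustrative toy, not driver code; only the idea, a persistent area that survives the reset while the rest of the device state is torn down and later re-seeded from it, mirrors this patch.

#include <stdio.h>
#include <string.h>

enum port_type { PORT_NONE, PORT_IB, PORT_ETH };
#define MAX_PORTS 2

struct dev_persistent {                 /* survives the reset */
    enum port_type curr_port_type[MAX_PORTS];
};

struct dev_state {                      /* rebuilt from scratch on reload */
    enum port_type port_type[MAX_PORTS];
    struct dev_persistent *persist;
};

static void unload(struct dev_state *d)
{
    int i;

    for (i = 0; i < MAX_PORTS; i++)     /* snapshot before teardown */
        d->persist->curr_port_type[i] = d->port_type[i];
    memset(d->port_type, 0, sizeof(d->port_type));
}

static void reload(struct dev_state *d)
{
    int i;

    for (i = 0; i < MAX_PORTS; i++)     /* replay after the HCA is back */
        d->port_type[i] = d->persist->curr_port_type[i];
}

int main(void)
{
    struct dev_persistent p = { { PORT_NONE, PORT_NONE } };
    struct dev_state d = { { PORT_ETH, PORT_IB }, &p };

    unload(&d);                         /* models reset: volatile state is lost */
    reload(&d);
    printf("port 0 restored to %d (2 == PORT_ETH)\n", d.port_type[0]);
    return 0;
}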
Keep ports information and some SRIOV configuration in a persistent area to have it valid across reset. Signed-off-by: Yishai Hadas Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/main.c | 45 +++++++++++++++++++++++++++++-- include/linux/mlx4/device.h | 2 ++ 2 files changed, 45 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index abcee61f8a47..2c5a555dff89 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -3109,18 +3109,34 @@ static int mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) return ret; } +static void mlx4_clean_dev(struct mlx4_dev *dev) +{ + struct mlx4_dev_persistent *persist = dev->persist; + struct mlx4_priv *priv = mlx4_priv(dev); + + memset(priv, 0, sizeof(*priv)); + priv->dev.persist = persist; +} + static void mlx4_unload_one(struct pci_dev *pdev) { struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev); struct mlx4_dev *dev = persist->dev; struct mlx4_priv *priv = mlx4_priv(dev); int pci_dev_data; - int p; + int p, i; int active_vfs = 0; if (priv->removed) return; + /* saving current ports type for further use */ + for (i = 0; i < dev->caps.num_ports; i++) { + dev->persist->curr_port_type[i] = dev->caps.port_type[i + 1]; + dev->persist->curr_port_poss_type[i] = dev->caps. + possible_type[i + 1]; + } + pci_dev_data = priv->pci_dev_data; /* Disabling SR-IOV is not allowed while there are active vf's */ @@ -3191,7 +3207,7 @@ static void mlx4_unload_one(struct pci_dev *pdev) kfree(dev->caps.qp1_proxy); kfree(dev->dev_vfs); - memset(priv, 0, sizeof(*priv)); + mlx4_clean_dev(dev); priv->pci_dev_data = pci_dev_data; priv->removed = 1; } @@ -3210,6 +3226,25 @@ static void mlx4_remove_one(struct pci_dev *pdev) pci_set_drvdata(pdev, NULL); } +static int restore_current_port_types(struct mlx4_dev *dev, + enum mlx4_port_type *types, + enum mlx4_port_type *poss_types) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int err, i; + + mlx4_stop_sense(dev); + + mutex_lock(&priv->port_mutex); + for (i = 0; i < dev->caps.num_ports; i++) + dev->caps.possible_type[i + 1] = poss_types[i]; + err = mlx4_change_port_types(dev, types); + mlx4_start_sense(dev); + mutex_unlock(&priv->port_mutex); + + return err; +} + int mlx4_restart_one(struct pci_dev *pdev) { struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev); @@ -3230,6 +3265,12 @@ int mlx4_restart_one(struct pci_dev *pdev) return err; } + err = restore_current_port_types(dev, dev->persist->curr_port_type, + dev->persist->curr_port_poss_type); + if (err) + mlx4_err(dev, "could not restore original port types (%d)\n", + err); + return err; } diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 1069ce65e8b4..8c3837ac1a2d 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -749,6 +749,8 @@ struct mlx4_dev_persistent { struct mlx4_dev *dev; int nvfs[MLX4_MAX_PORTS + 1]; int num_vfs; + enum mlx4_port_type curr_port_type[MLX4_MAX_PORTS + 1]; + enum mlx4_port_type curr_port_poss_type[MLX4_MAX_PORTS + 1]; }; struct mlx4_dev { -- cgit v1.2.3 From ad9a0bf08ffbf32b8f292c3bb78ca0f24bb8f6b2 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 25 Jan 2015 16:59:37 +0200 Subject: net/mlx4_core: Refactor the catas flow to work per device Using a WQ per device instead of a single global WQ, this allows independent reset handling per device even when SRIOV is used. 
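A minimal kernel-module sketch of this per-device pattern follows; the toy_* names are hypothetical, while the workqueue calls (INIT_WORK, create_singlethread_workqueue, queue_work, destroy_workqueue) are the same ones the patch itself uses. Because each device owns its queue, one device's reset work can never sit behind another's.

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct toy_dev {
	struct workqueue_struct *wq;	/* per device, not global */
	struct work_struct reset_work;
};

static void toy_reset(struct work_struct *work)
{
	struct toy_dev *d = container_of(work, struct toy_dev, reset_work);

	pr_info("toy: resetting device %p independently\n", d);
}

static struct toy_dev *toy_dev_add(void)
{
	struct toy_dev *d = kzalloc(sizeof(*d), GFP_KERNEL);

	if (!d)
		return NULL;
	INIT_WORK(&d->reset_work, toy_reset);
	d->wq = create_singlethread_workqueue("toy_health");
	if (!d->wq) {
		kfree(d);
		return NULL;
	}
	return d;
}

static void toy_dev_remove(struct toy_dev *d)
{
	destroy_workqueue(d->wq);	/* flushes any pending reset work */
	kfree(d);
}

static struct toy_dev *gdev;

static int __init toy_init(void)
{
	gdev = toy_dev_add();
	if (!gdev)
		return -ENOMEM;
	queue_work(gdev->wq, &gdev->reset_work);
	return 0;
}

static void __exit toy_exit(void)
{
	toy_dev_remove(gdev);
}

module_init(toy_init);
module_exit(toy_exit);
MODULE_LICENSE("GPL");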
This comes as a pre-patch for supporting chip reset for both native and SRIOV. Signed-off-by: Yishai Hadas Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/catas.c | 75 ++++++++++++++---------------- drivers/net/ethernet/mellanox/mlx4/main.c | 12 ++++- drivers/net/ethernet/mellanox/mlx4/mlx4.h | 3 +- include/linux/mlx4/device.h | 2 + 4 files changed, 48 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/catas.c b/drivers/net/ethernet/mellanox/mlx4/catas.c index 1a102c9bac99..5bb9aa6e281d 100644 --- a/drivers/net/ethernet/mellanox/mlx4/catas.c +++ b/drivers/net/ethernet/mellanox/mlx4/catas.c @@ -40,10 +40,7 @@ enum { MLX4_CATAS_POLL_INTERVAL = 5 * HZ, }; -static DEFINE_SPINLOCK(catas_lock); -static LIST_HEAD(catas_list); -static struct work_struct catas_work; static int internal_err_reset = 1; module_param(internal_err_reset, int, 0644); @@ -77,13 +74,9 @@ static void poll_catas(unsigned long dev_ptr) dump_err_buf(dev); mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0); - if (internal_err_reset) { - spin_lock(&catas_lock); - list_add(&priv->catas_err.list, &catas_list); - spin_unlock(&catas_lock); - - queue_work(mlx4_wq, &catas_work); - } + if (internal_err_reset) + queue_work(dev->persist->catas_wq, + &dev->persist->catas_work); } } else mod_timer(&priv->catas_err.timer, @@ -92,34 +85,23 @@ static void poll_catas(unsigned long dev_ptr) static void catas_reset(struct work_struct *work) { - struct mlx4_priv *priv, *tmppriv; - struct mlx4_dev *dev; - struct mlx4_dev_persistent *persist; - - LIST_HEAD(tlist); + struct mlx4_dev_persistent *persist = + container_of(work, struct mlx4_dev_persistent, + catas_work); + struct pci_dev *pdev = persist->pdev; int ret; - spin_lock_irq(&catas_lock); - list_splice_init(&catas_list, &tlist); - spin_unlock_irq(&catas_lock); - - list_for_each_entry_safe(priv, tmppriv, &tlist, catas_err.list) { - struct pci_dev *pdev = priv->dev.persist->pdev; - - /* If the device is off-line, we cannot reset it */ - if (pci_channel_offline(pdev)) - continue; + /* If the device is off-line, we cannot reset it */ + if (pci_channel_offline(pdev)) + return; - ret = mlx4_restart_one(priv->dev.persist->pdev); - /* 'priv' now is not valid */ - if (ret) - pr_err("mlx4 %s: Reset failed (%d)\n", - pci_name(pdev), ret); - else { - persist = pci_get_drvdata(pdev); - mlx4_dbg(persist->dev, "Reset succeeded\n"); - } - } + ret = mlx4_restart_one(pdev); + /* 'priv' now is not valid */ + if (ret) + pr_err("mlx4 %s: Reset failed (%d)\n", + pci_name(pdev), ret); + else + mlx4_dbg(persist->dev, "Reset succeeded\n"); } void mlx4_start_catas_poll(struct mlx4_dev *dev) @@ -158,15 +140,26 @@ void mlx4_stop_catas_poll(struct mlx4_dev *dev) del_timer_sync(&priv->catas_err.timer); - if (priv->catas_err.map) + if (priv->catas_err.map) { iounmap(priv->catas_err.map); + priv->catas_err.map = NULL; + } +} - spin_lock_irq(&catas_lock); - list_del(&priv->catas_err.list); - spin_unlock_irq(&catas_lock); +int mlx4_catas_init(struct mlx4_dev *dev) +{ + INIT_WORK(&dev->persist->catas_work, catas_reset); + dev->persist->catas_wq = create_singlethread_workqueue("mlx4_health"); + if (!dev->persist->catas_wq) + return -ENOMEM; + + return 0; } -void __init mlx4_catas_init(void) +void mlx4_catas_end(struct mlx4_dev *dev) { - INIT_WORK(&catas_work, catas_reset); + if (dev->persist->catas_wq) { + destroy_workqueue(dev->persist->catas_wq); + dev->persist->catas_wq = NULL; + } } diff --git 
a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 2c5a555dff89..a61694cc1476 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -3064,11 +3064,19 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data, } } - err = mlx4_load_one(pdev, pci_dev_data, total_vfs, nvfs, priv); + err = mlx4_catas_init(&priv->dev); if (err) goto err_release_regions; + + err = mlx4_load_one(pdev, pci_dev_data, total_vfs, nvfs, priv); + if (err) + goto err_catas; + return 0; +err_catas: + mlx4_catas_end(&priv->dev); + err_release_regions: pci_release_regions(pdev); @@ -3219,6 +3227,7 @@ static void mlx4_remove_one(struct pci_dev *pdev) struct mlx4_priv *priv = mlx4_priv(dev); mlx4_unload_one(pdev); + mlx4_catas_end(dev); pci_release_regions(pdev); pci_disable_device(pdev); kfree(dev->persist); @@ -3403,7 +3412,6 @@ static int __init mlx4_init(void) if (mlx4_verify_params()) return -EINVAL; - mlx4_catas_init(); mlx4_wq = create_singlethread_workqueue("mlx4"); if (!mlx4_wq) diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index faa37ab75a9d..d41af84f965b 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -995,7 +995,8 @@ void __mlx4_xrcd_free(struct mlx4_dev *dev, u32 xrcdn); void mlx4_start_catas_poll(struct mlx4_dev *dev); void mlx4_stop_catas_poll(struct mlx4_dev *dev); -void mlx4_catas_init(void); +int mlx4_catas_init(struct mlx4_dev *dev); +void mlx4_catas_end(struct mlx4_dev *dev); int mlx4_restart_one(struct pci_dev *pdev); int mlx4_register_device(struct mlx4_dev *dev); void mlx4_unregister_device(struct mlx4_dev *dev); diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 8c3837ac1a2d..da425d2f3708 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -751,6 +751,8 @@ struct mlx4_dev_persistent { int num_vfs; enum mlx4_port_type curr_port_type[MLX4_MAX_PORTS + 1]; enum mlx4_port_type curr_port_poss_type[MLX4_MAX_PORTS + 1]; + struct work_struct catas_work; + struct workqueue_struct *catas_wq; }; struct mlx4_dev { -- cgit v1.2.3 From f6bc11e42646e661e699a5593cbd1e9dba7191d0 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 25 Jan 2015 16:59:38 +0200 Subject: net/mlx4_core: Enhance the catas flow to support device reset This includes: - resetting the chip when a fatal error is detected (the current code does not do this). - exposing the ability to enter error state from outside the catas code by calling its functionality. (E.g. FW Command timeout, AER error). - managing a persistent device state. This is needed to sync between reset flow cases. Signed-off-by: Yishai Hadas Signed-off-by: Or Gerlitz Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx4/catas.c | 122 ++++++++++++++++++++++------- drivers/net/ethernet/mellanox/mlx4/main.c | 6 ++ drivers/net/ethernet/mellanox/mlx4/mlx4.h | 2 +- include/linux/mlx4/device.h | 7 ++ 4 files changed, 108 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/catas.c b/drivers/net/ethernet/mellanox/mlx4/catas.c index 5bb9aa6e281d..588d6b5e1211 100644 --- a/drivers/net/ethernet/mellanox/mlx4/catas.c +++ b/drivers/net/ethernet/mellanox/mlx4/catas.c @@ -48,6 +48,83 @@ MODULE_PARM_DESC(internal_err_reset, "Reset device on internal errors if non-zero" " (default 1, in SRIOV mode default is 0)"); +static int read_vendor_id(struct mlx4_dev *dev) +{ + u16 vendor_id = 0; + int ret; + + ret = pci_read_config_word(dev->persist->pdev, 0, &vendor_id); + if (ret) { + mlx4_err(dev, "Failed to read vendor ID, ret=%d\n", ret); + return ret; + } + + if (vendor_id == 0xffff) { + mlx4_err(dev, "PCI can't be accessed to read vendor id\n"); + return -EINVAL; + } + + return 0; +} + +static int mlx4_reset_master(struct mlx4_dev *dev) +{ + int err = 0; + + if (!pci_channel_offline(dev->persist->pdev)) { + err = read_vendor_id(dev); + /* If PCI can't be accessed to read vendor ID we assume that its + * link was disabled and chip was already reset. + */ + if (err) + return 0; + + err = mlx4_reset(dev); + if (err) + mlx4_err(dev, "Fail to reset HCA\n"); + } + + return err; +} + +void mlx4_enter_error_state(struct mlx4_dev_persistent *persist) +{ + int err; + struct mlx4_dev *dev; + + if (!internal_err_reset) + return; + + mutex_lock(&persist->device_state_mutex); + if (persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) + goto out; + + dev = persist->dev; + mlx4_err(dev, "device is going to be reset\n"); + err = mlx4_reset_master(dev); + BUG_ON(err != 0); + + dev->persist->state |= MLX4_DEVICE_STATE_INTERNAL_ERROR; + mlx4_err(dev, "device was reset successfully\n"); + mutex_unlock(&persist->device_state_mutex); + + /* At that step HW was already reset, now notify clients */ + mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0); + return; + +out: + mutex_unlock(&persist->device_state_mutex); +} + +static void mlx4_handle_error_state(struct mlx4_dev_persistent *persist) +{ + int err = 0; + + mlx4_enter_error_state(persist); + err = mlx4_restart_one(persist->pdev); + mlx4_info(persist->dev, "mlx4_restart_one was ended, ret=%d\n", err); +} + static void dump_err_buf(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); @@ -66,21 +143,22 @@ static void poll_catas(unsigned long dev_ptr) struct mlx4_priv *priv = mlx4_priv(dev); if (readl(priv->catas_err.map)) { - /* If the device is off-line, we cannot try to recover it */ - if (pci_channel_offline(dev->persist->pdev)) - mod_timer(&priv->catas_err.timer, - round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL)); - else { - dump_err_buf(dev); - mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0); - - if (internal_err_reset) - queue_work(dev->persist->catas_wq, - &dev->persist->catas_work); - } - } else - mod_timer(&priv->catas_err.timer, - round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL)); + dump_err_buf(dev); + goto internal_err; + } + + if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { + mlx4_warn(dev, "Internal error mark was detected on device\n"); + goto internal_err; + } + + mod_timer(&priv->catas_err.timer, + round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL)); + return; + +internal_err: + if (internal_err_reset) + 
queue_work(dev->persist->catas_wq, &dev->persist->catas_work); } static void catas_reset(struct work_struct *work) @@ -88,20 +166,8 @@ static void catas_reset(struct work_struct *work) struct mlx4_dev_persistent *persist = container_of(work, struct mlx4_dev_persistent, catas_work); - struct pci_dev *pdev = persist->pdev; - int ret; - - /* If the device is off-line, we cannot reset it */ - if (pci_channel_offline(pdev)) - return; - ret = mlx4_restart_one(pdev); - /* 'priv' now is not valid */ - if (ret) - pr_err("mlx4 %s: Reset failed (%d)\n", - pci_name(pdev), ret); - else - mlx4_dbg(persist->dev, "Reset succeeded\n"); + mlx4_handle_error_state(persist); } void mlx4_start_catas_poll(struct mlx4_dev *dev) diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index a61694cc1476..dc2d910fcc88 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -2624,6 +2624,11 @@ static int mlx4_load_one(struct pci_dev *pdev, int pci_dev_data, } } + /* on load remove any previous indication of internal error, * device is up. */ + dev->persist->state = MLX4_DEVICE_STATE_UP; + slave_start: err = mlx4_cmd_init(dev); if (err) { @@ -3108,6 +3113,7 @@ static int mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) dev->persist->dev = dev; pci_set_drvdata(pdev, dev->persist); priv->pci_dev_data = id->driver_data; + mutex_init(&dev->persist->device_state_mutex); ret = __mlx4_init_one(pdev, id->driver_data, priv); if (ret) { diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index d41af84f965b..aa1ecbc5a606 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -1178,7 +1178,7 @@ void mlx4_qp_event(struct mlx4_dev *dev, u32 qpn, int event_type); void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type); -void mlx4_handle_catas_err(struct mlx4_dev *dev); +void mlx4_enter_error_state(struct mlx4_dev_persistent *persist); int mlx4_SENSE_PORT(struct mlx4_dev *dev, int port, enum mlx4_port_type *type); diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index da425d2f3708..7d5d317cb7a6 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -411,6 +411,11 @@ enum { MLX4_EQ_PORT_INFO_MSTR_SM_SL_CHANGE_MASK = 1 << 4, }; +enum { + MLX4_DEVICE_STATE_UP = 1 << 0, + MLX4_DEVICE_STATE_INTERNAL_ERROR = 1 << 1, +}; + #define MSTR_SM_CHANGE_MASK (MLX4_EQ_PORT_INFO_MSTR_SM_SL_CHANGE_MASK | \ MLX4_EQ_PORT_INFO_MSTR_SM_LID_CHANGE_MASK) @@ -753,6 +758,8 @@ struct mlx4_dev_persistent { enum mlx4_port_type curr_port_poss_type[MLX4_MAX_PORTS + 1]; struct work_struct catas_work; struct workqueue_struct *catas_wq; + struct mutex device_state_mutex; /* protect HW state */ + u8 state; }; struct mlx4_dev { -- cgit v1.2.3 From f5aef5aa35063f2b45c3605871cd525d0cb7fb7a Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 25 Jan 2015 16:59:39 +0200 Subject: net/mlx4_core: Activate reset flow upon fatal command cases We activate the reset flow upon fatal command errors, i.e. when the device enters an erroneous state and must be reset. The following cases are assumed to be fatal: a FW command times out, FW returns an error on a closing command, or PCI is offline when a command is posted or pending. In those cases we place the device into an error state: the chip is reset, and pending commands are awakened and completed immediately. Subsequent commands will return immediately.
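The per-command policy spelled out in the next paragraph can be modeled with a short standalone C program; the command names below are an illustrative subset of the list the patch installs in mlx4_internal_err_ret_value(), not the full set.

#include <errno.h>
#include <stdio.h>

enum cmd { CMD_CLOSE_HCA, CMD_HW2SW_CQ, CMD_FREE_RES, CMD_QUERY_PORT };

/* Once the device is in the error state, teardown commands report
 * success (the chip reset already destroyed the HW objects, so the
 * caller may free its kernel resources); everything else gets -EIO.
 */
static int internal_err_ret_value(enum cmd op)
{
    switch (op) {
    case CMD_CLOSE_HCA:
    case CMD_HW2SW_CQ:
    case CMD_FREE_RES:
        return 0;
    default:
        return -EIO;
    }
}

int main(void)
{
    printf("CLOSE_HCA  -> %d\n", internal_err_ret_value(CMD_CLOSE_HCA));
    printf("QUERY_PORT -> %d (-EIO)\n", internal_err_ret_value(CMD_QUERY_PORT));
    return 0;
}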
The return code in the above cases will depend on the command. Commands which free and close resources will return success (because the chip was reset, so callers may safely free their kernel resources). Other commands will return -EIO. Since the device state was marked as in error, the catas poller will detect this and restart the device's software stack (as is done when a FW internal error is directly detected). The device state is protected by a persistent mutex that lives in its mlx4_dev; as a result, the hcr_mutex is no longer needed and is removed. Signed-off-by: Yishai Hadas Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/catas.c | 11 +- drivers/net/ethernet/mellanox/mlx4/cmd.c | 163 +++++++++++++++++++++++++---- drivers/net/ethernet/mellanox/mlx4/mcg.c | 3 + drivers/net/ethernet/mellanox/mlx4/mlx4.h | 2 +- include/linux/mlx4/cmd.h | 1 + 5 files changed, 153 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/catas.c b/drivers/net/ethernet/mellanox/mlx4/catas.c index 588d6b5e1211..63f14ffcc906 100644 --- a/drivers/net/ethernet/mellanox/mlx4/catas.c +++ b/drivers/net/ethernet/mellanox/mlx4/catas.c @@ -42,8 +42,8 @@ enum { -static int internal_err_reset = 1; -module_param(internal_err_reset, int, 0644); +int mlx4_internal_err_reset = 1; +module_param_named(internal_err_reset, mlx4_internal_err_reset, int, 0644); MODULE_PARM_DESC(internal_err_reset, "Reset device on internal errors if non-zero" " (default 1, in SRIOV mode default is 0)"); @@ -92,7 +92,7 @@ void mlx4_enter_error_state(struct mlx4_dev_persistent *persist) int err; struct mlx4_dev *dev; - if (!internal_err_reset) + if (!mlx4_internal_err_reset) return; mutex_lock(&persist->device_state_mutex); @@ -110,6 +110,7 @@ void mlx4_enter_error_state(struct mlx4_dev_persistent *persist) /* At that step HW was already reset, now notify clients */ mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0); + mlx4_cmd_wake_completions(dev); return; out: @@ -157,7 +158,7 @@ static void poll_catas(unsigned long dev_ptr) return; internal_err: - if (internal_err_reset) + if (mlx4_internal_err_reset) queue_work(dev->persist->catas_wq, &dev->persist->catas_work); } @@ -177,7 +178,7 @@ void mlx4_start_catas_poll(struct mlx4_dev *dev) /*If we are in SRIOV the default of the module param must be 0*/ if (mlx4_is_mfunc(dev)) - internal_err_reset = 0; + mlx4_internal_err_reset = 0; INIT_LIST_HEAD(&priv->catas_err.list); init_timer(&priv->catas_err.timer); diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c index 7cd90e6a4272..3895b2b5fc92 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -182,6 +182,72 @@ static u8 mlx4_errno_to_status(int errno) } } +static int mlx4_internal_err_ret_value(struct mlx4_dev *dev, u16 op, + u8 op_modifier) +{ + switch (op) { + case MLX4_CMD_UNMAP_ICM: + case MLX4_CMD_UNMAP_ICM_AUX: + case MLX4_CMD_UNMAP_FA: + case MLX4_CMD_2RST_QP: + case MLX4_CMD_HW2SW_EQ: + case MLX4_CMD_HW2SW_CQ: + case MLX4_CMD_HW2SW_SRQ: + case MLX4_CMD_HW2SW_MPT: + case MLX4_CMD_CLOSE_HCA: + case MLX4_QP_FLOW_STEERING_DETACH: + case MLX4_CMD_FREE_RES: + case MLX4_CMD_CLOSE_PORT: + return CMD_STAT_OK; + + case MLX4_CMD_QP_ATTACH: + /* On Detach case return success */ + if (op_modifier == 0) + return CMD_STAT_OK; + return mlx4_status_to_errno(CMD_STAT_INTERNAL_ERR); + + default: + return mlx4_status_to_errno(CMD_STAT_INTERNAL_ERR); + } +} + +static
int mlx4_closing_cmd_fatal_error(u16 op, u8 fw_status) +{ + /* Any error during the closing commands below is considered fatal */ + if (op == MLX4_CMD_CLOSE_HCA || + op == MLX4_CMD_HW2SW_EQ || + op == MLX4_CMD_HW2SW_CQ || + op == MLX4_CMD_2RST_QP || + op == MLX4_CMD_HW2SW_SRQ || + op == MLX4_CMD_SYNC_TPT || + op == MLX4_CMD_UNMAP_ICM || + op == MLX4_CMD_UNMAP_ICM_AUX || + op == MLX4_CMD_UNMAP_FA) + return 1; + /* Error on MLX4_CMD_HW2SW_MPT is fatal except when fw status equals + * CMD_STAT_REG_BOUND. + * This status indicates that memory region has memory windows bound to it + * which may result from invalid user space usage and is not fatal. + */ + if (op == MLX4_CMD_HW2SW_MPT && fw_status != CMD_STAT_REG_BOUND) + return 1; + return 0; +} + +static int mlx4_cmd_reset_flow(struct mlx4_dev *dev, u16 op, u8 op_modifier, + int err) +{ + /* Only if reset flow is really active return code is based on + * command, otherwise current error code is returned. + */ + if (mlx4_internal_err_reset) { + mlx4_enter_error_state(dev->persist); + err = mlx4_internal_err_ret_value(dev, op, op_modifier); + } + + return err; +} + static int comm_pending(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); @@ -258,7 +324,7 @@ static int mlx4_comm_cmd_wait(struct mlx4_dev *dev, u8 op, cmd->free_head = context->next; spin_unlock(&cmd->context_lock); - init_completion(&context->done); + reinit_completion(&context->done); mlx4_comm_cmd_post(dev, op, param); @@ -323,17 +389,21 @@ static int mlx4_cmd_post(struct mlx4_dev *dev, u64 in_param, u64 out_param, { struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd; u32 __iomem *hcr = cmd->hcr; - int ret = -EAGAIN; + int ret = -EIO; unsigned long end; - mutex_lock(&cmd->hcr_mutex); - - if (pci_channel_offline(dev->persist->pdev)) { + mutex_lock(&dev->persist->device_state_mutex); + /* To avoid writing to unknown addresses after the device state was + * changed to internal error and the chip was reset, + * check the INTERNAL_ERROR flag which is updated under + * device_state_mutex lock. + */ + if (pci_channel_offline(dev->persist->pdev) || + (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR)) { /* * Device is going through error recovery * and cannot accept commands. */ - ret = -EIO; goto out; } @@ -347,7 +417,6 @@ static int mlx4_cmd_post(struct mlx4_dev *dev, u64 in_param, u64 out_param, * Device is going through error recovery * and cannot accept commands. */ - ret = -EIO; goto out; } @@ -391,7 +460,11 @@ static int mlx4_cmd_post(struct mlx4_dev *dev, u64 in_param, u64 out_param, ret = 0; out: - mutex_unlock(&cmd->hcr_mutex); + if (ret) + mlx4_warn(dev, "Could not post command 0x%x: ret=%d, in_param=0x%llx, in_mod=0x%x, op_mod=0x%x\n", + op, ret, in_param, in_modifier, op_modifier); + mutex_unlock(&dev->persist->device_state_mutex); + return ret; } @@ -464,12 +537,12 @@ static int mlx4_cmd_poll(struct mlx4_dev *dev, u64 in_param, u64 *out_param, down(&priv->cmd.poll_sem); - if (pci_channel_offline(dev->persist->pdev)) { + if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { /* * Device is going through error recovery * and cannot accept commands. */ - err = -EIO; + err = mlx4_internal_err_ret_value(dev, op, op_modifier); goto out; } @@ -483,7 +556,7 @@ static int mlx4_cmd_poll(struct mlx4_dev *dev, u64 in_param, u64 *out_param, err = mlx4_cmd_post(dev, in_param, out_param ? 
*out_param : 0, in_modifier, op_modifier, op, CMD_POLL_TOKEN, 0); if (err) - goto out; + goto out_reset; end = msecs_to_jiffies(timeout) + jiffies; while (cmd_pending(dev) && time_before(jiffies, end)) { @@ -493,6 +566,11 @@ static int mlx4_cmd_poll(struct mlx4_dev *dev, u64 in_param, u64 *out_param, * and cannot accept commands. */ err = -EIO; + goto out_reset; + } + + if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { + err = mlx4_internal_err_ret_value(dev, op, op_modifier); goto out; } @@ -502,8 +580,8 @@ static int mlx4_cmd_poll(struct mlx4_dev *dev, u64 in_param, u64 *out_param, if (cmd_pending(dev)) { mlx4_warn(dev, "command 0x%x timed out (go bit not cleared)\n", op); - err = -ETIMEDOUT; - goto out; + err = -EIO; + goto out_reset; } if (out_is_imm) @@ -515,10 +593,17 @@ static int mlx4_cmd_poll(struct mlx4_dev *dev, u64 in_param, u64 *out_param, stat = be32_to_cpu((__force __be32) __raw_readl(hcr + HCR_STATUS_OFFSET)) >> 24; err = mlx4_status_to_errno(stat); - if (err) + if (err) { mlx4_err(dev, "command 0x%x failed: fw status = 0x%x\n", op, stat); + if (mlx4_closing_cmd_fatal_error(op, stat)) + goto out_reset; + goto out; + } +out_reset: + if (err) + err = mlx4_cmd_reset_flow(dev, op, op_modifier, err); out: up(&priv->cmd.poll_sem); return err; @@ -565,17 +650,19 @@ static int mlx4_cmd_wait(struct mlx4_dev *dev, u64 in_param, u64 *out_param, goto out; } - init_completion(&context->done); + reinit_completion(&context->done); - mlx4_cmd_post(dev, in_param, out_param ? *out_param : 0, - in_modifier, op_modifier, op, context->token, 1); + err = mlx4_cmd_post(dev, in_param, out_param ? *out_param : 0, + in_modifier, op_modifier, op, context->token, 1); + if (err) + goto out_reset; if (!wait_for_completion_timeout(&context->done, msecs_to_jiffies(timeout))) { mlx4_warn(dev, "command 0x%x timed out (go bit not cleared)\n", op); - err = -EBUSY; - goto out; + err = -EIO; + goto out_reset; } err = context->result; @@ -592,12 +679,20 @@ static int mlx4_cmd_wait(struct mlx4_dev *dev, u64 in_param, u64 *out_param, else mlx4_err(dev, "command 0x%x failed: fw status = 0x%x\n", op, context->fw_status); + if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) + err = mlx4_internal_err_ret_value(dev, op, op_modifier); + else if (mlx4_closing_cmd_fatal_error(op, context->fw_status)) + goto out_reset; + goto out; } if (out_is_imm) *out_param = context->out_param; +out_reset: + if (err) + err = mlx4_cmd_reset_flow(dev, op, op_modifier, err); out: spin_lock(&cmd->context_lock); context->next = cmd->free_head; @@ -613,9 +708,12 @@ int __mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param, u16 op, unsigned long timeout, int native) { if (pci_channel_offline(dev->persist->pdev)) - return -EIO; + return mlx4_cmd_reset_flow(dev, op, op_modifier, -EIO); if (!mlx4_is_mfunc(dev) || (native && mlx4_is_master(dev))) { + if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) + return mlx4_internal_err_ret_value(dev, op, + op_modifier); if (mlx4_priv(dev)->cmd.use_events) return mlx4_cmd_wait(dev, in_param, out_param, out_is_imm, in_modifier, @@ -2121,7 +2219,6 @@ int mlx4_cmd_init(struct mlx4_dev *dev) int flags = 0; if (!priv->cmd.initialized) { - mutex_init(&priv->cmd.hcr_mutex); mutex_init(&priv->cmd.slave_cmd_mutex); sema_init(&priv->cmd.poll_sem, 1); priv->cmd.use_events = 0; @@ -2232,6 +2329,11 @@ int mlx4_cmd_use_events(struct mlx4_dev *dev) for (i = 0; i < priv->cmd.max_cmds; ++i) { priv->cmd.context[i].token = i; priv->cmd.context[i].next = i + 1; + /* To support fatal error 
flow, initialize all + * cmd contexts to allow simulating completions + * with complete() at any time. + */ + init_completion(&priv->cmd.context[i].done); } priv->cmd.context[priv->cmd.max_cmds - 1].next = -1; @@ -2329,6 +2431,25 @@ int mlx4_get_vf_indx(struct mlx4_dev *dev, int slave) return slave - 1; } +void mlx4_cmd_wake_completions(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_cmd_context *context; + int i; + + spin_lock(&priv->cmd.context_lock); + if (priv->cmd.context) { + for (i = 0; i < priv->cmd.max_cmds; ++i) { + context = &priv->cmd.context[i]; + context->fw_status = CMD_STAT_INTERNAL_ERR; + context->result = + mlx4_status_to_errno(CMD_STAT_INTERNAL_ERR); + complete(&context->done); + } + } + spin_unlock(&priv->cmd.context_lock); +} + struct mlx4_active_ports mlx4_get_active_ports(struct mlx4_dev *dev, int slave) { struct mlx4_active_ports actv_ports; diff --git a/drivers/net/ethernet/mellanox/mlx4/mcg.c b/drivers/net/ethernet/mellanox/mlx4/mcg.c index a3867e7ef885..d22d9283d2cd 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mcg.c +++ b/drivers/net/ethernet/mellanox/mlx4/mcg.c @@ -1318,6 +1318,9 @@ out: mutex_unlock(&priv->mcg_table.mutex); mlx4_free_cmd_mailbox(dev, mailbox); + if (err && dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) + /* In case device is under an error, return success as a closing command */ + err = 0; return err; } diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index aa1ecbc5a606..5c772ea4473b 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -235,6 +235,7 @@ do { \ extern int mlx4_log_num_mgm_entry_size; extern int log_mtts_per_seg; +extern int mlx4_internal_err_reset; #define MLX4_MAX_NUM_SLAVES (MLX4_MAX_NUM_PF + MLX4_MAX_NUM_VF) #define ALL_SLAVES 0xff @@ -607,7 +608,6 @@ struct mlx4_mgm { struct mlx4_cmd { struct pci_pool *pool; void __iomem *hcr; - struct mutex hcr_mutex; struct mutex slave_cmd_mutex; struct semaphore poll_sem; struct semaphore event_sem; diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h index 64d25941b329..e7543844cc7a 100644 --- a/include/linux/mlx4/cmd.h +++ b/include/linux/mlx4/cmd.h @@ -279,6 +279,7 @@ int mlx4_get_vf_config(struct mlx4_dev *dev, int port, int vf, struct ifla_vf_in int mlx4_set_vf_link_state(struct mlx4_dev *dev, int port, int vf, int link_state); int mlx4_config_dev_retrieval(struct mlx4_dev *dev, struct mlx4_config_dev_params *params); +void mlx4_cmd_wake_completions(struct mlx4_dev *dev); /* * mlx4_get_slave_default_vlan - * return true if VST ( default vlan) -- cgit v1.2.3 From c69453e294c9f16da977b68e658a8028b854c209 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 25 Jan 2015 16:59:40 +0200 Subject: net/mlx4_core: Manage interface state for Reset flow cases We need to manage the interface state to sync between the reset flow and other related cases, such as remove_one. This has to be done to prevent certain races. For example, if the software stack is already down as a result of an unload call, remove_one should skip the unload phase. This patch implements the remove_one case; handling AER and other cases comes next. The interface can be up or down; upon remove_one, the state gains an extra bit indicating that the device is being cleaned up, forcing other tasks to finish before the final cleanup. Signed-off-by: Yishai Hadas Signed-off-by: Or Gerlitz Signed-off-by: David S.
Miller --- drivers/net/ethernet/mellanox/mlx4/catas.c | 13 +++++++++++-- drivers/net/ethernet/mellanox/mlx4/intf.c | 2 ++ drivers/net/ethernet/mellanox/mlx4/main.c | 13 ++++++++++++- include/linux/mlx4/device.h | 7 +++++++ 4 files changed, 32 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/catas.c b/drivers/net/ethernet/mellanox/mlx4/catas.c index 63f14ffcc906..3fcf3cfaedfc 100644 --- a/drivers/net/ethernet/mellanox/mlx4/catas.c +++ b/drivers/net/ethernet/mellanox/mlx4/catas.c @@ -122,8 +122,14 @@ static void mlx4_handle_error_state(struct mlx4_dev_persistent *persist) int err = 0; mlx4_enter_error_state(persist); - err = mlx4_restart_one(persist->pdev); - mlx4_info(persist->dev, "mlx4_restart_one was ended, ret=%d\n", err); + mutex_lock(&persist->interface_state_mutex); + if (persist->interface_state & MLX4_INTERFACE_STATE_UP && + !(persist->interface_state & MLX4_INTERFACE_STATE_DELETION)) { + err = mlx4_restart_one(persist->pdev); + mlx4_info(persist->dev, "mlx4_restart_one was ended, ret=%d\n", + err); + } + mutex_unlock(&persist->interface_state_mutex); } static void dump_err_buf(struct mlx4_dev *dev) @@ -211,6 +217,9 @@ void mlx4_stop_catas_poll(struct mlx4_dev *dev) iounmap(priv->catas_err.map); priv->catas_err.map = NULL; } + + if (dev->persist->interface_state & MLX4_INTERFACE_STATE_DELETION) + flush_workqueue(dev->persist->catas_wq); } int mlx4_catas_init(struct mlx4_dev *dev) diff --git a/drivers/net/ethernet/mellanox/mlx4/intf.c b/drivers/net/ethernet/mellanox/mlx4/intf.c index 116895ac8b35..fba0b96a6f28 100644 --- a/drivers/net/ethernet/mellanox/mlx4/intf.c +++ b/drivers/net/ethernet/mellanox/mlx4/intf.c @@ -138,6 +138,7 @@ int mlx4_register_device(struct mlx4_dev *dev) mutex_lock(&intf_mutex); + dev->persist->interface_state |= MLX4_INTERFACE_STATE_UP; list_add_tail(&priv->dev_list, &dev_list); list_for_each_entry(intf, &intf_list, list) mlx4_add_device(intf, priv); @@ -162,6 +163,7 @@ void mlx4_unregister_device(struct mlx4_dev *dev) mlx4_remove_device(intf, priv); list_del(&priv->dev_list); + dev->persist->interface_state &= ~MLX4_INTERFACE_STATE_UP; mutex_unlock(&intf_mutex); } diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index dc2d910fcc88..d59cae5da3f0 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -3114,6 +3114,7 @@ static int mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) pci_set_drvdata(pdev, dev->persist); priv->pci_dev_data = id->driver_data; mutex_init(&dev->persist->device_state_mutex); + mutex_init(&dev->persist->interface_state_mutex); ret = __mlx4_init_one(pdev, id->driver_data, priv); if (ret) { @@ -3232,7 +3233,17 @@ static void mlx4_remove_one(struct pci_dev *pdev) struct mlx4_dev *dev = persist->dev; struct mlx4_priv *priv = mlx4_priv(dev); - mlx4_unload_one(pdev); + mutex_lock(&persist->interface_state_mutex); + persist->interface_state |= MLX4_INTERFACE_STATE_DELETION; + mutex_unlock(&persist->interface_state_mutex); + + /* device marked to be under deletion running now without the lock + * letting other tasks to be terminated + */ + if (persist->interface_state & MLX4_INTERFACE_STATE_UP) + mlx4_unload_one(pdev); + else + mlx4_info(dev, "%s: interface is down\n", __func__); mlx4_catas_end(dev); pci_release_regions(pdev); pci_disable_device(pdev); diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 7d5d317cb7a6..33f9ca71925c 100644 --- 
a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -416,6 +416,11 @@ enum { MLX4_DEVICE_STATE_INTERNAL_ERROR = 1 << 1, }; +enum { + MLX4_INTERFACE_STATE_UP = 1 << 0, + MLX4_INTERFACE_STATE_DELETION = 1 << 1, +}; + #define MSTR_SM_CHANGE_MASK (MLX4_EQ_PORT_INFO_MSTR_SM_SL_CHANGE_MASK | \ MLX4_EQ_PORT_INFO_MSTR_SM_LID_CHANGE_MASK) @@ -760,6 +765,8 @@ struct mlx4_dev_persistent { struct workqueue_struct *catas_wq; struct mutex device_state_mutex; /* protect HW state */ u8 state; + struct mutex interface_state_mutex; /* protect SW state */ + u8 interface_state; }; struct mlx4_dev { -- cgit v1.2.3 From 55ad359225b2232b9b8f04a0dfa169bd3a7d86d2 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 25 Jan 2015 16:59:42 +0200 Subject: net/mlx4_core: Enable device recovery flow with SRIOV In SRIOV, both the PF and the VF may attempt device recovery whenever they assume that the device is not functioning. When the PF driver resets the device, the VF should detect this and attempt to reinitialize itself. The VF must be able to reset itself under all circumstances, even if the PF is not responsive. The VF shall reset itself in the following cases: 1. Commands are not processed within a reasonable time over the communication channel. This is done taking the device state into account and returning the command-appropriate return code, as in native mode; it is implemented in the next patch. 2. The VF driver receives an internal error event reported by the PF on the communication channel. This occurs when the PF driver resets the device or when the VF is out of sync with the PF. Add a 'VF reset' capability, which allows the VF to reinitialize itself even when the PF is not responsive. As the PF and VF may run their reset flows simultaneously, there are several cases that are handled: - Prevent freeing VF resources upon FLR when the PF is in its unloading stage. - Prevent the PF from getting VF commands before it has finished initializing its resources. - Upon VF startup, check that the comm channel is online before sending commands to the PF, to avoid timing out. Signed-off-by: Yishai Hadas Signed-off-by: Or Gerlitz Signed-off-by: David S.
Miller --- drivers/net/ethernet/mellanox/mlx4/catas.c | 120 +++++++++++++++++++++---- drivers/net/ethernet/mellanox/mlx4/cmd.c | 77 ++++++++++++---- drivers/net/ethernet/mellanox/mlx4/eq.c | 10 ++- drivers/net/ethernet/mellanox/mlx4/intf.c | 6 +- drivers/net/ethernet/mellanox/mlx4/main.c | 135 +++++++++++++++++++++++------ drivers/net/ethernet/mellanox/mlx4/mlx4.h | 7 +- include/linux/mlx4/cmd.h | 2 + include/linux/mlx4/device.h | 5 ++ 8 files changed, 292 insertions(+), 70 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/catas.c b/drivers/net/ethernet/mellanox/mlx4/catas.c index 3fcf3cfaedfc..715de8affcc9 100644 --- a/drivers/net/ethernet/mellanox/mlx4/catas.c +++ b/drivers/net/ethernet/mellanox/mlx4/catas.c @@ -45,8 +45,7 @@ enum { int mlx4_internal_err_reset = 1; module_param_named(internal_err_reset, mlx4_internal_err_reset, int, 0644); MODULE_PARM_DESC(internal_err_reset, - "Reset device on internal errors if non-zero" - " (default 1, in SRIOV mode default is 0)"); + "Reset device on internal errors if non-zero (default 1)"); static int read_vendor_id(struct mlx4_dev *dev) { @@ -71,6 +70,9 @@ static int mlx4_reset_master(struct mlx4_dev *dev) { int err = 0; + if (mlx4_is_master(dev)) + mlx4_report_internal_err_comm_event(dev); + if (!pci_channel_offline(dev->persist->pdev)) { err = read_vendor_id(dev); /* If PCI can't be accessed to read vendor ID we assume that its @@ -87,6 +89,81 @@ static int mlx4_reset_master(struct mlx4_dev *dev) return err; } +static int mlx4_reset_slave(struct mlx4_dev *dev) +{ +#define COM_CHAN_RST_REQ_OFFSET 0x10 +#define COM_CHAN_RST_ACK_OFFSET 0x08 + + u32 comm_flags; + u32 rst_req; + u32 rst_ack; + unsigned long end; + struct mlx4_priv *priv = mlx4_priv(dev); + + if (pci_channel_offline(dev->persist->pdev)) + return 0; + + comm_flags = swab32(readl((__iomem char *)priv->mfunc.comm + + MLX4_COMM_CHAN_FLAGS)); + if (comm_flags == 0xffffffff) { + mlx4_err(dev, "VF reset is not needed\n"); + return 0; + } + + if (!(dev->caps.vf_caps & MLX4_VF_CAP_FLAG_RESET)) { + mlx4_err(dev, "VF reset is not supported\n"); + return -EOPNOTSUPP; + } + + rst_req = (comm_flags & (u32)(1 << COM_CHAN_RST_REQ_OFFSET)) >> + COM_CHAN_RST_REQ_OFFSET; + rst_ack = (comm_flags & (u32)(1 << COM_CHAN_RST_ACK_OFFSET)) >> + COM_CHAN_RST_ACK_OFFSET; + if (rst_req != rst_ack) { + mlx4_err(dev, "Communication channel isn't sync, fail to send reset\n"); + return -EIO; + } + + rst_req ^= 1; + mlx4_warn(dev, "VF is sending reset request to Firmware\n"); + comm_flags = rst_req << COM_CHAN_RST_REQ_OFFSET; + __raw_writel((__force u32)cpu_to_be32(comm_flags), + (__iomem char *)priv->mfunc.comm + MLX4_COMM_CHAN_FLAGS); + /* Make sure that our comm channel write doesn't + * get mixed in with writes from another CPU. + */ + mmiowb(); + + end = msecs_to_jiffies(MLX4_COMM_TIME) + jiffies; + while (time_before(jiffies, end)) { + comm_flags = swab32(readl((__iomem char *)priv->mfunc.comm + + MLX4_COMM_CHAN_FLAGS)); + rst_ack = (comm_flags & (u32)(1 << COM_CHAN_RST_ACK_OFFSET)) >> + COM_CHAN_RST_ACK_OFFSET; + + /* Reading rst_req again since the communication channel can + * be reset at any time by the PF and all its bits will be + * set to zero. 
+ */ + rst_req = (comm_flags & (u32)(1 << COM_CHAN_RST_REQ_OFFSET)) >> + COM_CHAN_RST_REQ_OFFSET; + + if (rst_ack == rst_req) { + mlx4_warn(dev, "VF Reset succeed\n"); + return 0; + } + cond_resched(); + } + mlx4_err(dev, "Fail to send reset over the communication channel\n"); + return -ETIMEDOUT; +} + +static int mlx4_comm_internal_err(u32 slave_read) +{ + return (u32)COMM_CHAN_EVENT_INTERNAL_ERR == + (slave_read & (u32)COMM_CHAN_EVENT_INTERNAL_ERR) ? 1 : 0; +} + void mlx4_enter_error_state(struct mlx4_dev_persistent *persist) { int err; @@ -101,7 +178,10 @@ void mlx4_enter_error_state(struct mlx4_dev_persistent *persist) dev = persist->dev; mlx4_err(dev, "device is going to be reset\n"); - err = mlx4_reset_master(dev); + if (mlx4_is_slave(dev)) + err = mlx4_reset_slave(dev); + else + err = mlx4_reset_master(dev); BUG_ON(err != 0); dev->persist->state |= MLX4_DEVICE_STATE_INTERNAL_ERROR; @@ -148,8 +228,15 @@ static void poll_catas(unsigned long dev_ptr) { struct mlx4_dev *dev = (struct mlx4_dev *) dev_ptr; struct mlx4_priv *priv = mlx4_priv(dev); - - if (readl(priv->catas_err.map)) { + u32 slave_read; + + if (mlx4_is_slave(dev)) { + slave_read = swab32(readl(&priv->mfunc.comm->slave_read)); + if (mlx4_comm_internal_err(slave_read)) { + mlx4_warn(dev, "Internal error detected on the communication channel\n"); + goto internal_err; + } + } else if (readl(priv->catas_err.map)) { dump_err_buf(dev); goto internal_err; } @@ -182,22 +269,21 @@ void mlx4_start_catas_poll(struct mlx4_dev *dev) struct mlx4_priv *priv = mlx4_priv(dev); phys_addr_t addr; - /*If we are in SRIOV the default of the module param must be 0*/ - if (mlx4_is_mfunc(dev)) - mlx4_internal_err_reset = 0; - INIT_LIST_HEAD(&priv->catas_err.list); init_timer(&priv->catas_err.timer); priv->catas_err.map = NULL; - addr = pci_resource_start(dev->persist->pdev, priv->fw.catas_bar) + - priv->fw.catas_offset; - - priv->catas_err.map = ioremap(addr, priv->fw.catas_size * 4); - if (!priv->catas_err.map) { - mlx4_warn(dev, "Failed to map internal error buffer at 0x%llx\n", - (unsigned long long) addr); - return; + if (!mlx4_is_slave(dev)) { + addr = pci_resource_start(dev->persist->pdev, + priv->fw.catas_bar) + + priv->fw.catas_offset; + + priv->catas_err.map = ioremap(addr, priv->fw.catas_size * 4); + if (!priv->catas_err.map) { + mlx4_warn(dev, "Failed to map internal error buffer at 0x%llx\n", + (unsigned long long)addr); + return; + } } priv->catas_err.timer.data = (unsigned long) dev; diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c index 3895b2b5fc92..7652eed4bbc8 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -42,6 +42,7 @@ #include #include #include +#include #include @@ -729,7 +730,7 @@ int __mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param, EXPORT_SYMBOL_GPL(__mlx4_cmd); -static int mlx4_ARM_COMM_CHANNEL(struct mlx4_dev *dev) +int mlx4_ARM_COMM_CHANNEL(struct mlx4_dev *dev) { return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_ARM_COMM_CHANNEL, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); @@ -1945,8 +1946,11 @@ static void mlx4_master_do_cmd(struct mlx4_dev *dev, int slave, u8 cmd, break; case MLX4_COMM_CMD_VHCR_POST: if ((slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR_EN) && - (slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR_POST)) + (slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR_POST)) { + mlx4_warn(dev, "slave:%d is out of sync, cmd=0x%x, last command=0x%x, reset is needed\n", + slave, cmd, slave_state[slave].last_cmd); 
goto reset_slave; + } mutex_lock(&priv->cmd.slave_cmd_mutex); if (mlx4_master_process_vhcr(dev, slave, NULL)) { @@ -1980,7 +1984,18 @@ static void mlx4_master_do_cmd(struct mlx4_dev *dev, int slave, u8 cmd, reset_slave: /* cleanup any slave resources */ - mlx4_delete_all_resources_for_slave(dev, slave); + if (dev->persist->interface_state & MLX4_INTERFACE_STATE_UP) + mlx4_delete_all_resources_for_slave(dev, slave); + + if (cmd != MLX4_COMM_CMD_RESET) { + mlx4_warn(dev, "Turn on internal error to force reset, slave=%d, cmd=0x%x\n", + slave, cmd); + /* Turn on internal error letting slave reset itself immeditaly, + * otherwise it might take till timeout on command is passed + */ + reply |= ((u32)COMM_CHAN_EVENT_INTERNAL_ERR); + } + spin_lock_irqsave(&priv->mfunc.master.slave_state_lock, flags); if (!slave_state[slave].is_slave_going_down) slave_state[slave].last_cmd = MLX4_COMM_CMD_RESET; @@ -2056,17 +2071,28 @@ void mlx4_master_comm_channel(struct work_struct *work) static int sync_toggles(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); - int wr_toggle; - int rd_toggle; + u32 wr_toggle; + u32 rd_toggle; unsigned long end; - wr_toggle = swab32(readl(&priv->mfunc.comm->slave_write)) >> 31; - end = jiffies + msecs_to_jiffies(5000); + wr_toggle = swab32(readl(&priv->mfunc.comm->slave_write)); + if (wr_toggle == 0xffffffff) + end = jiffies + msecs_to_jiffies(30000); + else + end = jiffies + msecs_to_jiffies(5000); while (time_before(jiffies, end)) { - rd_toggle = swab32(readl(&priv->mfunc.comm->slave_read)) >> 31; - if (rd_toggle == wr_toggle) { - priv->cmd.comm_toggle = rd_toggle; + rd_toggle = swab32(readl(&priv->mfunc.comm->slave_read)); + if (wr_toggle == 0xffffffff || rd_toggle == 0xffffffff) { + /* PCI might be offline */ + msleep(100); + wr_toggle = swab32(readl(&priv->mfunc.comm-> + slave_write)); + continue; + } + + if (rd_toggle >> 31 == wr_toggle >> 31) { + priv->cmd.comm_toggle = rd_toggle >> 31; return 0; } @@ -2172,13 +2198,6 @@ int mlx4_multi_func_init(struct mlx4_dev *dev) if (mlx4_init_resource_tracker(dev)) goto err_thread; - err = mlx4_ARM_COMM_CHANNEL(dev); - if (err) { - mlx4_err(dev, " Failed to arm comm channel eq: %x\n", - err); - goto err_resource; - } - } else { err = sync_toggles(dev); if (err) { @@ -2188,8 +2207,6 @@ int mlx4_multi_func_init(struct mlx4_dev *dev) } return 0; -err_resource: - mlx4_free_resource_tracker(dev, RES_TR_FREE_ALL); err_thread: flush_workqueue(priv->mfunc.master.comm_wq); destroy_workqueue(priv->mfunc.master.comm_wq); @@ -2266,6 +2283,27 @@ err: return -ENOMEM; } +void mlx4_report_internal_err_comm_event(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int slave; + u32 slave_read; + + /* Report an internal error event to all + * communication channels. + */ + for (slave = 0; slave < dev->num_slaves; slave++) { + slave_read = swab32(readl(&priv->mfunc.comm[slave].slave_read)); + slave_read |= (u32)COMM_CHAN_EVENT_INTERNAL_ERR; + __raw_writel((__force u32)cpu_to_be32(slave_read), + &priv->mfunc.comm[slave].slave_read); + /* Make sure that our comm channel write doesn't + * get mixed in with writes from another CPU. 
+ */ + mmiowb(); + } +} + void mlx4_multi_func_cleanup(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); @@ -2281,6 +2319,7 @@ void mlx4_multi_func_cleanup(struct mlx4_dev *dev) kfree(priv->mfunc.master.slave_state); kfree(priv->mfunc.master.vf_admin); kfree(priv->mfunc.master.vf_oper); + dev->num_slaves = 0; } iounmap(priv->mfunc.comm); diff --git a/drivers/net/ethernet/mellanox/mlx4/eq.c b/drivers/net/ethernet/mellanox/mlx4/eq.c index 7538c9ce98a9..2f2e6067426d 100644 --- a/drivers/net/ethernet/mellanox/mlx4/eq.c +++ b/drivers/net/ethernet/mellanox/mlx4/eq.c @@ -429,8 +429,14 @@ void mlx4_master_handle_slave_flr(struct work_struct *work) if (MLX4_COMM_CMD_FLR == slave_state[i].last_cmd) { mlx4_dbg(dev, "mlx4_handle_slave_flr: clean slave: %d\n", i); - - mlx4_delete_all_resources_for_slave(dev, i); + /* In case of 'Reset flow' FLR can be generated for + * a slave before mlx4_load_one is done. + * make sure interface is up before trying to delete + * slave resources which weren't allocated yet. + */ + if (dev->persist->interface_state & + MLX4_INTERFACE_STATE_UP) + mlx4_delete_all_resources_for_slave(dev, i); /*return the slave to running mode*/ spin_lock_irqsave(&priv->mfunc.master.slave_state_lock, flags); slave_state[i].last_cmd = MLX4_COMM_CMD_RESET; diff --git a/drivers/net/ethernet/mellanox/mlx4/intf.c b/drivers/net/ethernet/mellanox/mlx4/intf.c index fba0b96a6f28..68d2bad325d5 100644 --- a/drivers/net/ethernet/mellanox/mlx4/intf.c +++ b/drivers/net/ethernet/mellanox/mlx4/intf.c @@ -144,8 +144,7 @@ int mlx4_register_device(struct mlx4_dev *dev) mlx4_add_device(intf, priv); mutex_unlock(&intf_mutex); - if (!mlx4_is_slave(dev)) - mlx4_start_catas_poll(dev); + mlx4_start_catas_poll(dev); return 0; } @@ -155,8 +154,7 @@ void mlx4_unregister_device(struct mlx4_dev *dev) struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_interface *intf; - if (!mlx4_is_slave(dev)) - mlx4_stop_catas_poll(dev); + mlx4_stop_catas_poll(dev); mutex_lock(&intf_mutex); list_for_each_entry(intf, &intf_list, list) diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 6bb0fca137cd..1baf1f1e2866 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -108,6 +108,8 @@ MODULE_PARM_DESC(enable_64b_cqe_eqe, MLX4_FUNC_CAP_EQE_CQE_STRIDE | \ MLX4_FUNC_CAP_DMFS_A0_STATIC) +#define RESET_PERSIST_MASK_FLAGS (MLX4_FLAG_SRIOV) + static char mlx4_version[] = DRV_NAME ": Mellanox ConnectX core driver v" DRV_VERSION " (" DRV_RELDATE ")\n"; @@ -1579,6 +1581,50 @@ static void mlx4_close_fw(struct mlx4_dev *dev) } } +static int mlx4_comm_check_offline(struct mlx4_dev *dev) +{ +#define COMM_CHAN_OFFLINE_OFFSET 0x09 + + u32 comm_flags; + u32 offline_bit; + unsigned long end; + struct mlx4_priv *priv = mlx4_priv(dev); + + end = msecs_to_jiffies(MLX4_COMM_OFFLINE_TIME_OUT) + jiffies; + while (time_before(jiffies, end)) { + comm_flags = swab32(readl((__iomem char *)priv->mfunc.comm + + MLX4_COMM_CHAN_FLAGS)); + offline_bit = (comm_flags & + (u32)(1 << COMM_CHAN_OFFLINE_OFFSET)); + if (!offline_bit) + return 0; + /* There are cases as part of AER/Reset flow that PF needs + * around 100 msec to load. We therefore sleep for 100 msec + * to allow other tasks to make use of that CPU during this + * time interval. 
+ */ + msleep(100); + } + mlx4_err(dev, "Communication channel is offline.\n"); + return -EIO; +} + +static void mlx4_reset_vf_support(struct mlx4_dev *dev) +{ +#define COMM_CHAN_RST_OFFSET 0x1e + + struct mlx4_priv *priv = mlx4_priv(dev); + u32 comm_rst; + u32 comm_caps; + + comm_caps = swab32(readl((__iomem char *)priv->mfunc.comm + + MLX4_COMM_CHAN_CAPS)); + comm_rst = (comm_caps & (u32)(1 << COMM_CHAN_RST_OFFSET)); + + if (comm_rst) + dev->caps.vf_caps |= MLX4_VF_CAP_FLAG_RESET; +} + static int mlx4_init_slave(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); @@ -1594,6 +1640,12 @@ static int mlx4_init_slave(struct mlx4_dev *dev) mutex_lock(&priv->cmd.slave_cmd_mutex); priv->cmd.max_cmds = 1; + if (mlx4_comm_check_offline(dev)) { + mlx4_err(dev, "PF is not responsive, skipping initialization\n"); + goto err_offline; + } + + mlx4_reset_vf_support(dev); mlx4_warn(dev, "Sending reset\n"); ret_from_reset = mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, MLX4_COMM_TIME); @@ -1637,6 +1689,7 @@ static int mlx4_init_slave(struct mlx4_dev *dev) err: mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, 0); +err_offline: mutex_unlock(&priv->cmd.slave_cmd_mutex); return -EIO; } @@ -2494,11 +2547,19 @@ static void mlx4_free_ownership(struct mlx4_dev *dev) !!((flags) & MLX4_FLAG_MASTER)) static u64 mlx4_enable_sriov(struct mlx4_dev *dev, struct pci_dev *pdev, - u8 total_vfs, int existing_vfs) + u8 total_vfs, int existing_vfs, int reset_flow) { u64 dev_flags = dev->flags; int err = 0; + if (reset_flow) { + dev->dev_vfs = kcalloc(total_vfs, sizeof(*dev->dev_vfs), + GFP_KERNEL); + if (!dev->dev_vfs) + goto free_mem; + return dev_flags; + } + atomic_inc(&pf_loading); if (dev->flags & MLX4_FLAG_SRIOV) { if (existing_vfs != total_vfs) { @@ -2533,6 +2594,7 @@ static u64 mlx4_enable_sriov(struct mlx4_dev *dev, struct pci_dev *pdev, disable_sriov: atomic_dec(&pf_loading); +free_mem: dev->persist->num_vfs = 0; kfree(dev->dev_vfs); return dev_flags & ~MLX4_FLAG_MASTER; @@ -2557,7 +2619,8 @@ static int mlx4_check_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap } static int mlx4_load_one(struct pci_dev *pdev, int pci_dev_data, - int total_vfs, int *nvfs, struct mlx4_priv *priv) + int total_vfs, int *nvfs, struct mlx4_priv *priv, + int reset_flow) { struct mlx4_dev *dev; unsigned sum = 0; @@ -2679,8 +2742,10 @@ slave_start: goto err_fw; if (!(dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_SYS_EQS)) { - u64 dev_flags = mlx4_enable_sriov(dev, pdev, total_vfs, - existing_vfs); + u64 dev_flags = mlx4_enable_sriov(dev, pdev, + total_vfs, + existing_vfs, + reset_flow); mlx4_cmd_cleanup(dev, MLX4_CMD_CLEANUP_ALL); dev->flags = dev_flags; @@ -2722,7 +2787,7 @@ slave_start: if (dev->flags & MLX4_FLAG_SRIOV) { if (!existing_vfs) pci_disable_sriov(pdev); - if (mlx4_is_master(dev)) + if (mlx4_is_master(dev) && !reset_flow) atomic_dec(&pf_loading); dev->flags &= ~MLX4_FLAG_SRIOV; } @@ -2736,7 +2801,8 @@ slave_start: } if (mlx4_is_master(dev) && (dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_SYS_EQS)) { - u64 dev_flags = mlx4_enable_sriov(dev, pdev, total_vfs, existing_vfs); + u64 dev_flags = mlx4_enable_sriov(dev, pdev, total_vfs, + existing_vfs, reset_flow); if ((dev->flags ^ dev_flags) & (MLX4_FLAG_MASTER | MLX4_FLAG_SLAVE)) { mlx4_cmd_cleanup(dev, MLX4_CMD_CLEANUP_VHCR); @@ -2848,6 +2914,17 @@ slave_start: goto err_steer; mlx4_init_quotas(dev); + /* When PF resources are ready arm its comm channel to enable + * getting commands + */ + if (mlx4_is_master(dev)) { + err = mlx4_ARM_COMM_CHANNEL(dev); + if (err) { + mlx4_err(dev, " 
Failed to arm comm channel eq: %x\n", + err); + goto err_steer; + } + } for (port = 1; port <= dev->caps.num_ports; port++) { err = mlx4_init_port_info(dev, port); @@ -2866,7 +2943,7 @@ slave_start: priv->removed = 0; - if (mlx4_is_master(dev) && dev->persist->num_vfs) + if (mlx4_is_master(dev) && dev->persist->num_vfs && !reset_flow) atomic_dec(&pf_loading); kfree(dev_cap); @@ -2925,10 +3002,12 @@ err_cmd: mlx4_cmd_cleanup(dev, MLX4_CMD_CLEANUP_ALL); err_sriov: - if (dev->flags & MLX4_FLAG_SRIOV && !existing_vfs) + if (dev->flags & MLX4_FLAG_SRIOV && !existing_vfs) { pci_disable_sriov(pdev); + dev->flags &= ~MLX4_FLAG_SRIOV; + } - if (mlx4_is_master(dev) && dev->persist->num_vfs) + if (mlx4_is_master(dev) && dev->persist->num_vfs && !reset_flow) atomic_dec(&pf_loading); kfree(priv->dev.dev_vfs); @@ -3073,7 +3152,7 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data, if (err) goto err_release_regions; - err = mlx4_load_one(pdev, pci_dev_data, total_vfs, nvfs, priv); + err = mlx4_load_one(pdev, pci_dev_data, total_vfs, nvfs, priv, 0); if (err) goto err_catas; @@ -3131,9 +3210,11 @@ static void mlx4_clean_dev(struct mlx4_dev *dev) { struct mlx4_dev_persistent *persist = dev->persist; struct mlx4_priv *priv = mlx4_priv(dev); + unsigned long flags = (dev->flags & RESET_PERSIST_MASK_FLAGS); memset(priv, 0, sizeof(*priv)); priv->dev.persist = persist; + priv->dev.flags = flags; } static void mlx4_unload_one(struct pci_dev *pdev) @@ -3143,7 +3224,6 @@ static void mlx4_unload_one(struct pci_dev *pdev) struct mlx4_priv *priv = mlx4_priv(dev); int pci_dev_data; int p, i; - int active_vfs = 0; if (priv->removed) return; @@ -3157,14 +3237,6 @@ static void mlx4_unload_one(struct pci_dev *pdev) pci_dev_data = priv->pci_dev_data; - /* Disabling SR-IOV is not allowed while there are active vf's */ - if (mlx4_is_master(dev)) { - active_vfs = mlx4_how_many_lives_vf(dev); - if (active_vfs) { - pr_warn("Removing PF when there are active VF's !!\n"); - pr_warn("Will not disable SR-IOV.\n"); - } - } mlx4_stop_sense(dev); mlx4_unregister_device(dev); @@ -3208,12 +3280,6 @@ static void mlx4_unload_one(struct pci_dev *pdev) if (dev->flags & MLX4_FLAG_MSI_X) pci_disable_msix(pdev); - if (dev->flags & MLX4_FLAG_SRIOV && !active_vfs) { - mlx4_warn(dev, "Disabling SR-IOV\n"); - pci_disable_sriov(pdev); - dev->flags &= ~MLX4_FLAG_SRIOV; - dev->persist->num_vfs = 0; - } if (!mlx4_is_slave(dev)) mlx4_free_ownership(dev); @@ -3235,11 +3301,21 @@ static void mlx4_remove_one(struct pci_dev *pdev) struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev); struct mlx4_dev *dev = persist->dev; struct mlx4_priv *priv = mlx4_priv(dev); + int active_vfs = 0; mutex_lock(&persist->interface_state_mutex); persist->interface_state |= MLX4_INTERFACE_STATE_DELETION; mutex_unlock(&persist->interface_state_mutex); + /* Disabling SR-IOV is not allowed while there are active vf's */ + if (mlx4_is_master(dev) && dev->flags & MLX4_FLAG_SRIOV) { + active_vfs = mlx4_how_many_lives_vf(dev); + if (active_vfs) { + pr_warn("Removing PF when there are active VF's !!\n"); + pr_warn("Will not disable SR-IOV.\n"); + } + } + /* device marked to be under deletion running now without the lock * letting other tasks to be terminated */ @@ -3248,6 +3324,11 @@ static void mlx4_remove_one(struct pci_dev *pdev) else mlx4_info(dev, "%s: interface is down\n", __func__); mlx4_catas_end(dev); + if (dev->flags & MLX4_FLAG_SRIOV && !active_vfs) { + mlx4_warn(dev, "Disabling SR-IOV\n"); + pci_disable_sriov(pdev); + } + pci_release_regions(pdev); 
pci_disable_device(pdev); kfree(dev->persist); @@ -3287,7 +3368,7 @@ int mlx4_restart_one(struct pci_dev *pdev) memcpy(nvfs, dev->persist->nvfs, sizeof(dev->persist->nvfs)); mlx4_unload_one(pdev); - err = mlx4_load_one(pdev, pci_dev_data, total_vfs, nvfs, priv); + err = mlx4_load_one(pdev, pci_dev_data, total_vfs, nvfs, priv, 1); if (err) { mlx4_err(dev, "%s: ERROR: mlx4_load_one failed, pci_name=%s, err=%d\n", __func__, pci_name(pdev), err); @@ -3397,7 +3478,7 @@ static pci_ers_result_t mlx4_pci_slot_reset(struct pci_dev *pdev) mutex_lock(&persist->interface_state_mutex); if (!(persist->interface_state & MLX4_INTERFACE_STATE_UP)) { ret = mlx4_load_one(pdev, priv->pci_dev_data, total_vfs, nvfs, - priv); + priv, 1); if (ret) { mlx4_err(dev, "%s: mlx4_load_one failed, ret=%d\n", __func__, ret); diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index 5c772ea4473b..2a15b8248e77 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -85,7 +85,9 @@ enum { MLX4_CLR_INT_SIZE = 0x00008, MLX4_SLAVE_COMM_BASE = 0x0, MLX4_COMM_PAGESIZE = 0x1000, - MLX4_CLOCK_SIZE = 0x00008 + MLX4_CLOCK_SIZE = 0x00008, + MLX4_COMM_CHAN_CAPS = 0x8, + MLX4_COMM_CHAN_FLAGS = 0xc }; enum { @@ -120,6 +122,8 @@ enum mlx4_mpt_state { }; #define MLX4_COMM_TIME 10000 +#define MLX4_COMM_OFFLINE_TIME_OUT 30000 + enum { MLX4_COMM_CMD_RESET, MLX4_COMM_CMD_VHCR0, @@ -1162,6 +1166,7 @@ enum { int mlx4_cmd_init(struct mlx4_dev *dev); void mlx4_cmd_cleanup(struct mlx4_dev *dev, int cleanup_mask); int mlx4_multi_func_init(struct mlx4_dev *dev); +int mlx4_ARM_COMM_CHANNEL(struct mlx4_dev *dev); void mlx4_multi_func_cleanup(struct mlx4_dev *dev); void mlx4_cmd_event(struct mlx4_dev *dev, u16 token, u8 status, u64 out_param); int mlx4_cmd_use_events(struct mlx4_dev *dev); diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h index e7543844cc7a..c989442ffc6a 100644 --- a/include/linux/mlx4/cmd.h +++ b/include/linux/mlx4/cmd.h @@ -280,6 +280,7 @@ int mlx4_set_vf_link_state(struct mlx4_dev *dev, int port, int vf, int link_stat int mlx4_config_dev_retrieval(struct mlx4_dev *dev, struct mlx4_config_dev_params *params); void mlx4_cmd_wake_completions(struct mlx4_dev *dev); +void mlx4_report_internal_err_comm_event(struct mlx4_dev *dev); /* * mlx4_get_slave_default_vlan - * return true if VST ( default vlan) @@ -289,5 +290,6 @@ bool mlx4_get_slave_default_vlan(struct mlx4_dev *dev, int port, int slave, u16 *vlan, u8 *qos); #define MLX4_COMM_GET_IF_REV(cmd_chan_ver) (u8)((cmd_chan_ver) >> 8) +#define COMM_CHAN_EVENT_INTERNAL_ERR (1 << 17) #endif /* MLX4_CMD_H */ diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 33f9ca71925c..5ef54e145e4d 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -208,6 +208,10 @@ enum { MLX4_QUERY_FUNC_FLAGS_A0_RES_QP = 1LL << 1 }; +enum { + MLX4_VF_CAP_FLAG_RESET = 1 << 0 +}; + /* bit enums for an 8-bit flags field indicating special use * QPs which require special handling in qp_reserve_range. 
* Currently, this only includes QPs used by the ETH interface, @@ -545,6 +549,7 @@ struct mlx4_caps { u8 alloc_res_qp_mask; u32 dmfs_high_rate_qpn_base; u32 dmfs_high_rate_qpn_range; + u32 vf_caps; }; struct mlx4_buf_list { -- cgit v1.2.3 From 03af03ad7c6723e0f87568e4ffe66ceb9608bfe7 Mon Sep 17 00:00:00 2001 From: Karol Wrona Date: Wed, 7 Jan 2015 19:36:11 +0100 Subject: iio: Add new operating mode for non triggered sw buffers There was a need for a non-triggered software buffer type. It can be used when the triggered model does not fit and INDIO_BUFFER_HARDWARE would cause confusion, because the data stream is not obtained directly from a hardware backend. Suggested-by: Jonathan Cameron Signed-off-by: Karol Wrona Reviewed-by: Lars-Peter Clausen Signed-off-by: Jonathan Cameron --- drivers/iio/industrialio-buffer.c | 2 ++ include/linux/iio/iio.h | 6 ++++-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c index 403b72878b0b..71333140d42c 100644 --- a/drivers/iio/industrialio-buffer.c +++ b/drivers/iio/industrialio-buffer.c @@ -639,6 +639,8 @@ static int __iio_update_buffers(struct iio_dev *indio_dev, indio_dev->currentmode = INDIO_BUFFER_TRIGGERED; } else if (indio_dev->modes & INDIO_BUFFER_HARDWARE) { indio_dev->currentmode = INDIO_BUFFER_HARDWARE; + } else if (indio_dev->modes & INDIO_BUFFER_SOFTWARE) { + indio_dev->currentmode = INDIO_BUFFER_SOFTWARE; } else { /* Should never be reached */ ret = -EINVAL; goto error_run_postdisable; diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index 878d861b0610..590202024857 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -286,10 +286,11 @@ static inline s64 iio_get_time_ns(void) /* Device operating modes */ #define INDIO_DIRECT_MODE 0x01 #define INDIO_BUFFER_TRIGGERED 0x02 +#define INDIO_BUFFER_SOFTWARE 0x04 #define INDIO_BUFFER_HARDWARE 0x08 #define INDIO_ALL_BUFFER_MODES \ - (INDIO_BUFFER_TRIGGERED | INDIO_BUFFER_HARDWARE) + (INDIO_BUFFER_TRIGGERED | INDIO_BUFFER_HARDWARE | INDIO_BUFFER_SOFTWARE) #define INDIO_MAX_RAW_ELEMENTS 4 @@ -593,7 +594,8 @@ void devm_iio_trigger_free(struct device *dev, struct iio_trigger *iio_trig); static inline bool iio_buffer_enabled(struct iio_dev *indio_dev) { return indio_dev->currentmode - & (INDIO_BUFFER_TRIGGERED | INDIO_BUFFER_HARDWARE); + & (INDIO_BUFFER_TRIGGERED | INDIO_BUFFER_HARDWARE | + INDIO_BUFFER_SOFTWARE); } /** -- cgit v1.2.3 From c2943f14534bdc4230f4da6dcd4ea03c5d8c8162 Mon Sep 17 00:00:00 2001 From: Harout Hedeshian Date: Tue, 20 Jan 2015 10:06:05 -0700 Subject: net: ipv6: Add sysctl entry to disable MTU updates from RA The kernel forcefully applies MTU values received in router advertisements, provided the new MTU is less than the current one. This behavior is undesirable when user space is managing the MTU. Instead, a sysctl flag 'accept_ra_mtu' is introduced so that user space can control whether or not RA-provided MTU updates should be applied. The default behavior is unchanged; user space must explicitly set this flag to 0 for RA MTUs to be ignored. Signed-off-by: Harout Hedeshian Signed-off-by: David S.
Miller --- Documentation/networking/ip-sysctl.txt | 7 +++++++ include/linux/ipv6.h | 1 + include/uapi/linux/ipv6.h | 1 + net/ipv6/addrconf.c | 10 ++++++++++ net/ipv6/ndisc.c | 2 +- 5 files changed, 20 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 85b022179104..a5e4c813f17f 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1287,6 +1287,13 @@ accept_ra_rtr_pref - BOOLEAN Functional default: enabled if accept_ra is enabled. disabled if accept_ra is disabled. +accept_ra_mtu - BOOLEAN + Apply the MTU value specified in RA option 5 (RFC4861). If + disabled, the MTU specified in the RA will be ignored. + + Functional default: enabled if accept_ra is enabled. + disabled if accept_ra is disabled. + accept_redirects - BOOLEAN Accept Redirects. diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index c694e7baa621..2805062c013f 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -52,6 +52,7 @@ struct ipv6_devconf { __s32 force_tllao; __s32 ndisc_notify; __s32 suppress_frag_ndisc; + __s32 accept_ra_mtu; void *sysctl; }; diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 73cb02dc3065..437a6a4b125a 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -169,6 +169,7 @@ enum { DEVCONF_SUPPRESS_FRAG_NDISC, DEVCONF_ACCEPT_RA_FROM_LOCAL, DEVCONF_USE_OPTIMISTIC, + DEVCONF_ACCEPT_RA_MTU, DEVCONF_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index d6b4f5d08014..7dcc065e2160 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -201,6 +201,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .disable_ipv6 = 0, .accept_dad = 1, .suppress_frag_ndisc = 1, + .accept_ra_mtu = 1, }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -238,6 +239,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .disable_ipv6 = 0, .accept_dad = 1, .suppress_frag_ndisc = 1, + .accept_ra_mtu = 1, }; /* Check if a valid qdisc is available */ @@ -4380,6 +4382,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_NDISC_NOTIFY] = cnf->ndisc_notify; array[DEVCONF_SUPPRESS_FRAG_NDISC] = cnf->suppress_frag_ndisc; array[DEVCONF_ACCEPT_RA_FROM_LOCAL] = cnf->accept_ra_from_local; + array[DEVCONF_ACCEPT_RA_MTU] = cnf->accept_ra_mtu; } static inline size_t inet6_ifla6_size(void) @@ -5258,6 +5261,13 @@ static struct addrconf_sysctl_table .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "accept_ra_mtu", + .data = &ipv6_devconf.accept_ra_mtu, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { /* sentinel */ } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 682866777d53..8a9d7c19e247 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1348,7 +1348,7 @@ skip_routeinfo: } } - if (ndopts.nd_opts_mtu) { + if (ndopts.nd_opts_mtu && in6_dev->cnf.accept_ra_mtu) { __be32 n; u32 mtu; -- cgit v1.2.3 From d6cb125b9983e1ea9444f794b2d3ed5e3ad737b7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 24 Dec 2014 22:47:00 -0500 Subject: kill d_validate() no users left Signed-off-by: Al Viro --- fs/dcache.c | 31 ------------------------------- include/linux/dcache.h | 3 --- 2 files changed, 34 deletions(-) (limited to 'include/linux') diff --git a/fs/dcache.c b/fs/dcache.c index e368d4f412f9..40432e59d72e 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2187,37 +2187,6 @@ struct dentry 
*d_hash_and_lookup(struct dentry *dir, struct qstr *name) } EXPORT_SYMBOL(d_hash_and_lookup); -/** - * d_validate - verify dentry provided from insecure source (deprecated) - * @dentry: The dentry alleged to be valid child of @dparent - * @dparent: The parent dentry (known to be valid) - * - * An insecure source has sent us a dentry, here we verify it and dget() it. - * This is used by ncpfs in its readdir implementation. - * Zero is returned in the dentry is invalid. - * - * This function is slow for big directories, and deprecated, do not use it. - */ -int d_validate(struct dentry *dentry, struct dentry *dparent) -{ - struct dentry *child; - - spin_lock(&dparent->d_lock); - list_for_each_entry(child, &dparent->d_subdirs, d_child) { - if (dentry == child) { - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - __dget_dlock(dentry); - spin_unlock(&dentry->d_lock); - spin_unlock(&dparent->d_lock); - return 1; - } - } - spin_unlock(&dparent->d_lock); - - return 0; -} -EXPORT_SYMBOL(d_validate); - /* * When a file is deleted, we have two options: * - turn this dentry into a negative dentry diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 5a813988e6d4..92c08cf7670e 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -319,9 +319,6 @@ static inline unsigned d_count(const struct dentry *dentry) return dentry->d_lockref.count; } -/* validate "insecure" dentry pointer */ -extern int d_validate(struct dentry *, struct dentry *); - /* * helper function for dentry_operations.d_dname() members */ -- cgit v1.2.3 From 9e251d02041432487d89cb340e72490c4bbc198a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 9 Jan 2015 20:40:02 -0500 Subject: kill pin_put() Signed-off-by: Al Viro --- fs/fs_pin.c | 11 ----------- include/linux/fs_pin.h | 1 - kernel/acct.c | 14 ++++++++++---- 3 files changed, 10 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/fs/fs_pin.c b/fs/fs_pin.c index 9368236ca100..f173313760b8 100644 --- a/fs/fs_pin.c +++ b/fs/fs_pin.c @@ -4,19 +4,8 @@ #include "internal.h" #include "mount.h" -static void pin_free_rcu(struct rcu_head *head) -{ - kfree(container_of(head, struct fs_pin, rcu)); -} - static DEFINE_SPINLOCK(pin_lock); -void pin_put(struct fs_pin *p) -{ - if (atomic_long_dec_and_test(&p->count)) - call_rcu(&p->rcu, pin_free_rcu); -} - void pin_remove(struct fs_pin *pin) { spin_lock(&pin_lock); diff --git a/include/linux/fs_pin.h b/include/linux/fs_pin.h index f66525e72ccf..68a54b7741aa 100644 --- a/include/linux/fs_pin.h +++ b/include/linux/fs_pin.h @@ -12,6 +12,5 @@ struct fs_pin { void (*kill)(struct fs_pin *); }; -void pin_put(struct fs_pin *); void pin_remove(struct fs_pin *); void pin_insert(struct fs_pin *, struct vfsmount *); diff --git a/kernel/acct.c b/kernel/acct.c index 33738ef972f3..7bb9e659a7da 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -124,6 +124,12 @@ out: return acct->active; } +static void acct_put(struct bsd_acct_struct *p) +{ + if (atomic_long_dec_and_test(&p->pin.count)) + kfree_rcu(p, pin.rcu); +} + static struct bsd_acct_struct *acct_get(struct pid_namespace *ns) { struct bsd_acct_struct *res; @@ -144,7 +150,7 @@ again: mutex_lock(&res->lock); if (!res->ns) { mutex_unlock(&res->lock); - pin_put(&res->pin); + acct_put(res); goto again; } return res; @@ -175,7 +181,7 @@ static void acct_kill(struct bsd_acct_struct *acct, acct->ns = NULL; atomic_long_dec(&acct->pin.count); mutex_unlock(&acct->lock); - pin_put(&acct->pin); + acct_put(acct); } } @@ -186,7 +192,7 @@ static void acct_pin_kill(struct 
fs_pin *pin) mutex_lock(&acct->lock); if (!acct->ns) { mutex_unlock(&acct->lock); - pin_put(pin); + acct_put(acct); acct = NULL; } acct_kill(acct, NULL); @@ -576,7 +582,7 @@ static void slow_acct_process(struct pid_namespace *ns) if (acct) { do_acct_process(acct); mutex_unlock(&acct->lock); - pin_put(&acct->pin); + acct_put(acct); } } } -- cgit v1.2.3 From 360f54796ed65939093ae373b92ebd5ef3341776 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 9 Jan 2015 15:19:03 -0800 Subject: dcache: let the dentry count go down to zero without taking d_lock We can be more aggressive about this, if we are clever and careful. This is subtle. Signed-off-by: Linus Torvalds Signed-off-by: Al Viro --- fs/dcache.c | 118 ++++++++++++++++++++++++++++++++++++++++++++++-- include/linux/lockref.h | 3 +- lib/lockref.c | 36 +++++++++++---- 3 files changed, 144 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/dcache.c b/fs/dcache.c index 40432e59d72e..a14d00e9839e 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -508,7 +508,7 @@ static void __dentry_kill(struct dentry *dentry) * dentry_iput drops the locks, at which point nobody (except * transient RCU lookups) can reach this dentry. */ - BUG_ON((int)dentry->d_lockref.count > 0); + BUG_ON(dentry->d_lockref.count > 0); this_cpu_dec(nr_dentry); if (dentry->d_op && dentry->d_op->d_release) dentry->d_op->d_release(dentry); @@ -561,7 +561,7 @@ static inline struct dentry *lock_parent(struct dentry *dentry) struct dentry *parent = dentry->d_parent; if (IS_ROOT(dentry)) return NULL; - if (unlikely((int)dentry->d_lockref.count < 0)) + if (unlikely(dentry->d_lockref.count < 0)) return NULL; if (likely(spin_trylock(&parent->d_lock))) return parent; @@ -590,6 +590,110 @@ again: return parent; } +/* + * Try to do a lockless dput(), and return whether that was successful. + * + * If unsuccessful, we return false, having already taken the dentry lock. + * + * The caller needs to hold the RCU read lock, so that the dentry is + * guaranteed to stay around even if the refcount goes down to zero! + */ +static inline bool fast_dput(struct dentry *dentry) +{ + int ret; + unsigned int d_flags; + + /* + * If we have a d_op->d_delete() operation, we sould not + * let the dentry count go to zero, so use "put__or_lock". + */ + if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) + return lockref_put_or_lock(&dentry->d_lockref); + + /* + * .. otherwise, we can try to just decrement the + * lockref optimistically. + */ + ret = lockref_put_return(&dentry->d_lockref); + + /* + * If the lockref_put_return() failed due to the lock being held + * by somebody else, the fast path has failed. We will need to + * get the lock, and then check the count again. + */ + if (unlikely(ret < 0)) { + spin_lock(&dentry->d_lock); + if (dentry->d_lockref.count > 1) { + dentry->d_lockref.count--; + spin_unlock(&dentry->d_lock); + return 1; + } + return 0; + } + + /* + * If we weren't the last ref, we're done. + */ + if (ret) + return 1; + + /* + * Careful, careful. The reference count went down + * to zero, but we don't hold the dentry lock, so + * somebody else could get it again, and do another + * dput(), and we need to not race with that. + * + * However, there is a very special and common case + * where we don't care, because there is nothing to + * do: the dentry is still hashed, it does not have + * a 'delete' op, and it's referenced and already on + * the LRU list. + * + * NOTE! Since we aren't locked, these values are + * not "stable". 
However, it is sufficient that at + * some point after we dropped the reference the + * dentry was hashed and the flags had the proper + * value. Other dentry users may have re-gotten + * a reference to the dentry and change that, but + * our work is done - we can leave the dentry + * around with a zero refcount. + */ + smp_rmb(); + d_flags = ACCESS_ONCE(dentry->d_flags); + d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST; + + /* Nothing to do? Dropping the reference was all we needed? */ + if (d_flags == (DCACHE_REFERENCED | DCACHE_LRU_LIST) && !d_unhashed(dentry)) + return 1; + + /* + * Not the fast normal case? Get the lock. We've already decremented + * the refcount, but we'll need to re-check the situation after + * getting the lock. + */ + spin_lock(&dentry->d_lock); + + /* + * Did somebody else grab a reference to it in the meantime, and + * we're no longer the last user after all? Alternatively, somebody + * else could have killed it and marked it dead. Either way, we + * don't need to do anything else. + */ + if (dentry->d_lockref.count) { + spin_unlock(&dentry->d_lock); + return 1; + } + + /* + * Re-get the reference we optimistically dropped. We hold the + * lock, and we just tested that it was zero, so we can just + * set it to 1. + */ + dentry->d_lockref.count = 1; + return 0; +} + + /* * This is dput * @@ -622,8 +726,14 @@ void dput(struct dentry *dentry) return; repeat: - if (lockref_put_or_lock(&dentry->d_lockref)) + rcu_read_lock(); + if (likely(fast_dput(dentry))) { + rcu_read_unlock(); return; + } + + /* Slow case: now with the dentry lock held */ + rcu_read_unlock(); /* Unreachable? Get rid of it */ if (unlikely(d_unhashed(dentry))) @@ -810,7 +920,7 @@ static void shrink_dentry_list(struct list_head *list) * We found an inuse dentry which was not removed from * the LRU because of laziness during lookup. Do not free it. 
*/ - if ((int)dentry->d_lockref.count > 0) { + if (dentry->d_lockref.count > 0) { spin_unlock(&dentry->d_lock); if (parent) spin_unlock(&parent->d_lock); diff --git a/include/linux/lockref.h b/include/linux/lockref.h index 4bfde0e99ed5..b10b122dd099 100644 --- a/include/linux/lockref.h +++ b/include/linux/lockref.h @@ -28,12 +28,13 @@ struct lockref { #endif struct { spinlock_t lock; - unsigned int count; + int count; }; }; }; extern void lockref_get(struct lockref *); +extern int lockref_put_return(struct lockref *); extern int lockref_get_not_zero(struct lockref *); extern int lockref_get_or_lock(struct lockref *); extern int lockref_put_or_lock(struct lockref *); diff --git a/lib/lockref.c b/lib/lockref.c index d2233de9a86e..ecb9a665ec19 100644 --- a/lib/lockref.c +++ b/lib/lockref.c @@ -60,7 +60,7 @@ void lockref_get(struct lockref *lockref) EXPORT_SYMBOL(lockref_get); /** - * lockref_get_not_zero - Increments count unless the count is 0 + * lockref_get_not_zero - Increments count unless the count is 0 or dead * @lockref: pointer to lockref structure * Return: 1 if count updated successfully or 0 if count was zero */ @@ -70,7 +70,7 @@ int lockref_get_not_zero(struct lockref *lockref) CMPXCHG_LOOP( new.count++; - if (!old.count) + if (old.count <= 0) return 0; , return 1; @@ -78,7 +78,7 @@ int lockref_get_not_zero(struct lockref *lockref) spin_lock(&lockref->lock); retval = 0; - if (lockref->count) { + if (lockref->count > 0) { lockref->count++; retval = 1; } @@ -88,7 +88,7 @@ int lockref_get_not_zero(struct lockref *lockref) EXPORT_SYMBOL(lockref_get_not_zero); /** - * lockref_get_or_lock - Increments count unless the count is 0 + * lockref_get_or_lock - Increments count unless the count is 0 or dead * @lockref: pointer to lockref structure * Return: 1 if count updated successfully or 0 if count was zero * and we got the lock instead. @@ -97,14 +97,14 @@ int lockref_get_or_lock(struct lockref *lockref) { CMPXCHG_LOOP( new.count++; - if (!old.count) + if (old.count <= 0) break; , return 1; ); spin_lock(&lockref->lock); - if (!lockref->count) + if (lockref->count <= 0) return 0; lockref->count++; spin_unlock(&lockref->lock); @@ -112,6 +112,26 @@ int lockref_get_or_lock(struct lockref *lockref) } EXPORT_SYMBOL(lockref_get_or_lock); +/** + * lockref_put_return - Decrement reference count if possible + * @lockref: pointer to lockref structure + * + * Decrement the reference count and return the new value. + * If the lockref was dead or locked, return an error. 
+ */ +int lockref_put_return(struct lockref *lockref) +{ + CMPXCHG_LOOP( + new.count--; + if (old.count <= 0) + return -1; + , + return new.count; + ); + return -1; +} +EXPORT_SYMBOL(lockref_put_return); + /** * lockref_put_or_lock - decrements count unless count <= 1 before decrement * @lockref: pointer to lockref structure @@ -158,7 +178,7 @@ int lockref_get_not_dead(struct lockref *lockref) CMPXCHG_LOOP( new.count++; - if ((int)old.count < 0) + if (old.count < 0) return 0; , return 1; @@ -166,7 +186,7 @@ int lockref_get_not_dead(struct lockref *lockref) spin_lock(&lockref->lock); retval = 0; - if ((int) lockref->count >= 0) { + if (lockref->count >= 0) { lockref->count++; retval = 1; } -- cgit v1.2.3 From 34cece2e8a1d2b66f00e153a19b80b4d4cec4eb8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 10 Jan 2015 12:47:38 -0500 Subject: take count and rcu_head out of fs_pin Signed-off-by: Al Viro --- include/linux/fs_pin.h | 10 ++-------- kernel/acct.c | 14 ++++++++------ 2 files changed, 10 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs_pin.h b/include/linux/fs_pin.h index 68a54b7741aa..a2e71912e758 100644 --- a/include/linux/fs_pin.h +++ b/include/linux/fs_pin.h @@ -1,14 +1,8 @@ #include struct fs_pin { - atomic_long_t count; - union { - struct { - struct hlist_node s_list; - struct hlist_node m_list; - }; - struct rcu_head rcu; - }; + struct hlist_node s_list; + struct hlist_node m_list; void (*kill)(struct fs_pin *); }; diff --git a/kernel/acct.c b/kernel/acct.c index a74f3d1a0c26..b8fbefb8678f 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -80,6 +80,8 @@ static void do_acct_process(struct bsd_acct_struct *acct); struct bsd_acct_struct { struct fs_pin pin; + atomic_long_t count; + struct rcu_head rcu; struct mutex lock; int active; unsigned long needcheck; @@ -126,8 +128,8 @@ out: static void acct_put(struct bsd_acct_struct *p) { - if (atomic_long_dec_and_test(&p->pin.count)) - kfree_rcu(p, pin.rcu); + if (atomic_long_dec_and_test(&p->count)) + kfree_rcu(p, rcu); } static struct bsd_acct_struct *acct_get(struct pid_namespace *ns) @@ -141,7 +143,7 @@ again: rcu_read_unlock(); return NULL; } - if (!atomic_long_inc_not_zero(&res->pin.count)) { + if (!atomic_long_inc_not_zero(&res->count)) { rcu_read_unlock(); cpu_relax(); goto again; @@ -179,7 +181,7 @@ static void acct_kill(struct bsd_acct_struct *acct, pin_remove(&acct->pin); ns->bacct = new; acct->ns = NULL; - atomic_long_dec(&acct->pin.count); + atomic_long_dec(&acct->count); mutex_unlock(&acct->lock); acct_put(acct); } @@ -189,7 +191,7 @@ static void acct_pin_kill(struct fs_pin *pin) { struct bsd_acct_struct *acct; acct = container_of(pin, struct bsd_acct_struct, pin); - if (!atomic_long_inc_not_zero(&pin->count)) { + if (!atomic_long_inc_not_zero(&acct->count)) { rcu_read_unlock(); cpu_relax(); return; @@ -250,7 +252,7 @@ static int acct_on(struct filename *pathname) mnt = file->f_path.mnt; file->f_path.mnt = internal; - atomic_long_set(&acct->pin.count, 1); + atomic_long_set(&acct->count, 1); acct->pin.kill = acct_pin_kill; acct->file = file; acct->needcheck = jiffies; -- cgit v1.2.3 From fdab684d7202774bfd8762d4a656a553b787c8ec Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 11 Jan 2015 10:57:27 -0500 Subject: allow attaching fs_pin to a group not associated with some superblock Signed-off-by: Al Viro --- fs/fs_pin.c | 20 +++++++++++++------- fs/internal.h | 2 +- fs/super.c | 4 ++-- include/linux/fs_pin.h | 1 + 4 files changed, 17 insertions(+), 10 deletions(-) (limited to 
'include/linux') diff --git a/fs/fs_pin.c b/fs/fs_pin.c index 5eb39a93a560..50ef7d2ef03c 100644 --- a/fs/fs_pin.c +++ b/fs/fs_pin.c @@ -14,14 +14,20 @@ void pin_remove(struct fs_pin *pin) spin_unlock(&pin_lock); } -void pin_insert(struct fs_pin *pin, struct vfsmount *m) +void pin_insert_group(struct fs_pin *pin, struct vfsmount *m, struct hlist_head *p) { spin_lock(&pin_lock); - hlist_add_head(&pin->s_list, &m->mnt_sb->s_pins); + if (p) + hlist_add_head(&pin->s_list, p); hlist_add_head(&pin->m_list, &real_mount(m)->mnt_pins); spin_unlock(&pin_lock); } +void pin_insert(struct fs_pin *pin, struct vfsmount *m) +{ + pin_insert_group(pin, m, &m->mnt_sb->s_pins); +} + void mnt_pin_kill(struct mount *m) { while (1) { @@ -38,18 +44,18 @@ void mnt_pin_kill(struct mount *m) } } -void sb_pin_kill(struct super_block *sb) +void group_pin_kill(struct hlist_head *p) { while (1) { - struct hlist_node *p; + struct hlist_node *q; struct fs_pin *pin; rcu_read_lock(); - p = ACCESS_ONCE(sb->s_pins.first); - if (!p) { + q = ACCESS_ONCE(p->first); + if (!q) { rcu_read_unlock(); break; } - pin = hlist_entry(p, struct fs_pin, s_list); + pin = hlist_entry(q, struct fs_pin, s_list); pin->kill(pin); } } diff --git a/fs/internal.h b/fs/internal.h index e9a61fe67575..a6efd2f09ba3 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -145,7 +145,7 @@ extern const struct file_operations pipefifo_fops; /* * fs_pin.c */ -extern void sb_pin_kill(struct super_block *sb); +extern void group_pin_kill(struct hlist_head *p); extern void mnt_pin_kill(struct mount *m); /* diff --git a/fs/super.c b/fs/super.c index eae088f6aaae..2d822459bc3d 100644 --- a/fs/super.c +++ b/fs/super.c @@ -706,9 +706,9 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY); if (remount_ro) { - if (sb->s_pins.first) { + if (!hlist_empty(&sb->s_pins)) { up_write(&sb->s_umount); - sb_pin_kill(sb); + group_pin_kill(&sb->s_pins); down_write(&sb->s_umount); if (!sb->s_root) return 0; diff --git a/include/linux/fs_pin.h b/include/linux/fs_pin.h index a2e71912e758..2be38d1464ae 100644 --- a/include/linux/fs_pin.h +++ b/include/linux/fs_pin.h @@ -7,4 +7,5 @@ struct fs_pin { }; void pin_remove(struct fs_pin *); +void pin_insert_group(struct fs_pin *, struct vfsmount *, struct hlist_head *); void pin_insert(struct fs_pin *, struct vfsmount *); -- cgit v1.2.3 From 59eda0e07f43c950d31756213b607af673e551f0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 10 Jan 2015 17:53:21 -0500 Subject: new fs_pin killing logics Signed-off-by: Al Viro --- fs/fs_pin.c | 54 +++++++++++++++++++++++++---- include/linux/fs_pin.h | 13 ++++++- include/linux/pid_namespace.h | 4 +-- kernel/acct.c | 81 ++++++++++++++++++------------------------- 4 files changed, 96 insertions(+), 56 deletions(-) (limited to 'include/linux') diff --git a/fs/fs_pin.c b/fs/fs_pin.c index 50ef7d2ef03c..0c77bdc238b2 100644 --- a/fs/fs_pin.c +++ b/fs/fs_pin.c @@ -1,4 +1,5 @@ #include +#include #include #include #include "internal.h" @@ -12,6 +13,10 @@ void pin_remove(struct fs_pin *pin) hlist_del(&pin->m_list); hlist_del(&pin->s_list); spin_unlock(&pin_lock); + spin_lock_irq(&pin->wait.lock); + pin->done = 1; + wake_up_locked(&pin->wait); + spin_unlock_irq(&pin->wait.lock); } void pin_insert_group(struct fs_pin *pin, struct vfsmount *m, struct hlist_head *p) @@ -28,19 +33,58 @@ void pin_insert(struct fs_pin *pin, struct vfsmount *m) pin_insert_group(pin, m, &m->mnt_sb->s_pins); } +void pin_kill(struct fs_pin *p) +{ + wait_queue_t 
wait; + + if (!p) { + rcu_read_unlock(); + return; + } + init_wait(&wait); + spin_lock_irq(&p->wait.lock); + if (likely(!p->done)) { + p->done = -1; + spin_unlock_irq(&p->wait.lock); + rcu_read_unlock(); + p->kill(p); + return; + } + if (p->done > 0) { + spin_unlock_irq(&p->wait.lock); + rcu_read_unlock(); + return; + } + __add_wait_queue(&p->wait, &wait); + while (1) { + set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&p->wait.lock); + rcu_read_unlock(); + schedule(); + rcu_read_lock(); + if (likely(list_empty(&wait.task_list))) + break; + /* OK, we know p couldn't have been freed yet */ + spin_lock_irq(&p->wait.lock); + if (p->done > 0) { + spin_unlock_irq(&p->wait.lock); + break; + } + } + rcu_read_unlock(); +} + void mnt_pin_kill(struct mount *m) { while (1) { struct hlist_node *p; - struct fs_pin *pin; rcu_read_lock(); p = ACCESS_ONCE(m->mnt_pins.first); if (!p) { rcu_read_unlock(); break; } - pin = hlist_entry(p, struct fs_pin, m_list); - pin->kill(pin); + pin_kill(hlist_entry(p, struct fs_pin, m_list)); } } @@ -48,14 +92,12 @@ void group_pin_kill(struct hlist_head *p) { while (1) { struct hlist_node *q; - struct fs_pin *pin; rcu_read_lock(); q = ACCESS_ONCE(p->first); if (!q) { rcu_read_unlock(); break; } - pin = hlist_entry(q, struct fs_pin, s_list); - pin->kill(pin); + pin_kill(hlist_entry(q, struct fs_pin, s_list)); } } diff --git a/include/linux/fs_pin.h b/include/linux/fs_pin.h index 2be38d1464ae..9dc4e0384bfb 100644 --- a/include/linux/fs_pin.h +++ b/include/linux/fs_pin.h @@ -1,11 +1,22 @@ -#include +#include struct fs_pin { + wait_queue_head_t wait; + int done; struct hlist_node s_list; struct hlist_node m_list; void (*kill)(struct fs_pin *); }; +struct vfsmount; + +static inline void init_fs_pin(struct fs_pin *p, void (*kill)(struct fs_pin *)) +{ + init_waitqueue_head(&p->wait); + p->kill = kill; +} + void pin_remove(struct fs_pin *); void pin_insert_group(struct fs_pin *, struct vfsmount *, struct hlist_head *); void pin_insert(struct fs_pin *, struct vfsmount *); +void pin_kill(struct fs_pin *); diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index b9cf6c51b181..918b117a7cd3 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -19,7 +19,7 @@ struct pidmap { #define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) #define PIDMAP_ENTRIES ((PID_MAX_LIMIT+BITS_PER_PAGE-1)/BITS_PER_PAGE) -struct bsd_acct_struct; +struct fs_pin; struct pid_namespace { struct kref kref; @@ -37,7 +37,7 @@ struct pid_namespace { struct dentry *proc_thread_self; #endif #ifdef CONFIG_BSD_PROCESS_ACCT - struct bsd_acct_struct *bacct; + struct fs_pin *bacct; #endif struct user_namespace *user_ns; struct work_struct proc_work; diff --git a/kernel/acct.c b/kernel/acct.c index cf6588ab517b..e6c10d1a4058 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -76,7 +76,6 @@ int acct_parm[3] = {4, 2, 30}; /* * External references and all of the globals. */ -static void do_acct_process(struct bsd_acct_struct *acct); struct bsd_acct_struct { struct fs_pin pin; @@ -91,6 +90,8 @@ struct bsd_acct_struct { struct completion done; }; +static void do_acct_process(struct bsd_acct_struct *acct); + /* * Check the amount of free space and suspend/resume accordingly. */ @@ -132,13 +133,18 @@ static void acct_put(struct bsd_acct_struct *p) kfree_rcu(p, rcu); } +static inline struct bsd_acct_struct *to_acct(struct fs_pin *p) +{ + return p ? 
container_of(p, struct bsd_acct_struct, pin) : NULL; +} + static struct bsd_acct_struct *acct_get(struct pid_namespace *ns) { struct bsd_acct_struct *res; again: smp_rmb(); rcu_read_lock(); - res = ACCESS_ONCE(ns->bacct); + res = to_acct(ACCESS_ONCE(ns->bacct)); if (!res) { rcu_read_unlock(); return NULL; @@ -150,7 +156,7 @@ again: } rcu_read_unlock(); mutex_lock(&res->lock); - if (!res->ns) { + if (res != to_acct(ACCESS_ONCE(ns->bacct))) { mutex_unlock(&res->lock); acct_put(res); goto again; @@ -158,6 +164,19 @@ again: return res; } +static void acct_pin_kill(struct fs_pin *pin) +{ + struct bsd_acct_struct *acct = to_acct(pin); + mutex_lock(&acct->lock); + do_acct_process(acct); + schedule_work(&acct->work); + wait_for_completion(&acct->done); + cmpxchg(&acct->ns->bacct, pin, NULL); + mutex_unlock(&acct->lock); + pin_remove(pin); + acct_put(acct); +} + static void close_work(struct work_struct *work) { struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work); @@ -168,49 +187,13 @@ static void close_work(struct work_struct *work) complete(&acct->done); } -static void acct_kill(struct bsd_acct_struct *acct) -{ - if (acct) { - struct pid_namespace *ns = acct->ns; - do_acct_process(acct); - INIT_WORK(&acct->work, close_work); - init_completion(&acct->done); - schedule_work(&acct->work); - wait_for_completion(&acct->done); - pin_remove(&acct->pin); - cmpxchg(&ns->bacct, acct, NULL); - acct->ns = NULL; - atomic_long_dec(&acct->count); - mutex_unlock(&acct->lock); - acct_put(acct); - } -} - -static void acct_pin_kill(struct fs_pin *pin) -{ - struct bsd_acct_struct *acct; - acct = container_of(pin, struct bsd_acct_struct, pin); - if (!atomic_long_inc_not_zero(&acct->count)) { - rcu_read_unlock(); - cpu_relax(); - return; - } - rcu_read_unlock(); - mutex_lock(&acct->lock); - if (!acct->ns) { - mutex_unlock(&acct->lock); - acct_put(acct); - acct = NULL; - } - acct_kill(acct); -} - static int acct_on(struct filename *pathname) { struct file *file; struct vfsmount *mnt, *internal; struct pid_namespace *ns = task_active_pid_ns(current); - struct bsd_acct_struct *acct, *old; + struct bsd_acct_struct *acct; + struct fs_pin *old; int err; acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); @@ -252,18 +235,20 @@ static int acct_on(struct filename *pathname) file->f_path.mnt = internal; atomic_long_set(&acct->count, 1); - acct->pin.kill = acct_pin_kill; + init_fs_pin(&acct->pin, acct_pin_kill); acct->file = file; acct->needcheck = jiffies; acct->ns = ns; mutex_init(&acct->lock); + INIT_WORK(&acct->work, close_work); + init_completion(&acct->done); mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */ pin_insert(&acct->pin, mnt); - old = acct_get(ns); - ns->bacct = acct; - acct_kill(old); + rcu_read_lock(); + old = xchg(&ns->bacct, &acct->pin); mutex_unlock(&acct->lock); + pin_kill(old); mnt_drop_write(mnt); mntput(mnt); return 0; @@ -299,7 +284,8 @@ SYSCALL_DEFINE1(acct, const char __user *, name) mutex_unlock(&acct_on_mutex); putname(tmp); } else { - acct_kill(acct_get(task_active_pid_ns(current))); + rcu_read_lock(); + pin_kill(task_active_pid_ns(current)->bacct); } return error; @@ -307,7 +293,8 @@ SYSCALL_DEFINE1(acct, const char __user *, name) void acct_exit_ns(struct pid_namespace *ns) { - acct_kill(acct_get(ns)); + rcu_read_lock(); + pin_kill(ns->bacct); } /* -- cgit v1.2.3 From 607954b084d4ad5e6a2e0f795de7803d9c6ae37f Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 21 Jan 2015 11:12:13 +0000 Subject: rhashtable: fix rht_for_each_entry_safe() 
endless loop "next" is not updated, causing an endless loop for buckets with more than one element. Fixes: 88d6ed15acff ("rhashtable: Convert bucket iterators to take table and index") Signed-off-by: Patrick McHardy Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index a2562ed53ea3..e0337844358e 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -260,7 +260,9 @@ void rhashtable_destroy(struct rhashtable *ht); next = !rht_is_a_nulls(pos) ? \ rht_dereference_bucket(pos->next, tbl, hash) : NULL; \ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ - pos = next) + pos = next, \ + next = !rht_is_a_nulls(pos) ? \ + rht_dereference_bucket(pos->next, tbl, hash) : NULL) /** * rht_for_each_rcu_continue - continue iterating over rcu hash chain -- cgit v1.2.3 From d5fd120e7860c2b3d4c936a2ebadb6b244bec4c8 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 26 Jan 2015 20:59:31 +0100 Subject: i2c: Only include slave support if selected Make the slave support depend on CONFIG_I2C_SLAVE. Otherwise it gets included unconditionally, even when it is not needed. I2C bus drivers which implement slave support must select I2C_SLAVE. Signed-off-by: Jean Delvare Signed-off-by: Wolfram Sang --- drivers/i2c/busses/Kconfig | 1 + drivers/i2c/i2c-core.c | 2 ++ include/linux/i2c.h | 6 ++++++ 3 files changed, 9 insertions(+) (limited to 'include/linux') diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index 31e8308ba899..ab838d9e28b6 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -881,6 +881,7 @@ config I2C_XLR config I2C_RCAR tristate "Renesas R-Car I2C Controller" depends on ARCH_SHMOBILE || COMPILE_TEST + select I2C_SLAVE help If you say yes to this option, support will be included for the R-Car I2C controller. diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c index 39d25a8cb1ad..e9eae57a2b50 100644 --- a/drivers/i2c/i2c-core.c +++ b/drivers/i2c/i2c-core.c @@ -2972,6 +2972,7 @@ trace: } EXPORT_SYMBOL(i2c_smbus_xfer); +#if IS_ENABLED(CONFIG_I2C_SLAVE) int i2c_slave_register(struct i2c_client *client, i2c_slave_cb_t slave_cb) { int ret; @@ -3019,6 +3020,7 @@ int i2c_slave_unregister(struct i2c_client *client) return ret; } EXPORT_SYMBOL_GPL(i2c_slave_unregister); +#endif MODULE_AUTHOR("Simon G. 
Vogl "); MODULE_DESCRIPTION("I2C-Bus main module"); diff --git a/include/linux/i2c.h b/include/linux/i2c.h index e3a1721c8354..7c7695940ddd 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -228,7 +228,9 @@ struct i2c_client { struct device dev; /* the device structure */ int irq; /* irq issued by device */ struct list_head detected; +#if IS_ENABLED(CONFIG_I2C_SLAVE) i2c_slave_cb_t slave_cb; /* callback for slave mode */ +#endif }; #define to_i2c_client(d) container_of(d, struct i2c_client, dev) @@ -253,6 +255,7 @@ static inline void i2c_set_clientdata(struct i2c_client *dev, void *data) /* I2C slave support */ +#if IS_ENABLED(CONFIG_I2C_SLAVE) enum i2c_slave_event { I2C_SLAVE_REQ_READ_START, I2C_SLAVE_REQ_READ_END, @@ -269,6 +272,7 @@ static inline int i2c_slave_event(struct i2c_client *client, { return client->slave_cb(client, event, val); } +#endif /** * struct i2c_board_info - template for device creation @@ -404,8 +408,10 @@ struct i2c_algorithm { /* To determine what the adapter supports */ u32 (*functionality) (struct i2c_adapter *); +#if IS_ENABLED(CONFIG_I2C_SLAVE) int (*reg_slave)(struct i2c_client *client); int (*unreg_slave)(struct i2c_client *client); +#endif }; /** -- cgit v1.2.3 From 9879de7373fcfb466ec198293b6ccc1ad7a42dd8 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 26 Jan 2015 12:58:32 -0800 Subject: mm: page_alloc: embed OOM killing naturally into allocation slowpath The OOM killing invocation does a lot of duplicative checks against the task's allocation context. Rework it to take advantage of the existing checks in the allocator slowpath. The OOM killer is invoked when the allocator is unable to reclaim any pages but the allocation has to keep looping. Instead of having a check for __GFP_NORETRY hidden in oom_gfp_allowed(), just move the OOM invocation to the true branch of should_alloc_retry(). The __GFP_FS check from oom_gfp_allowed() can then be moved into the OOM avoidance branch in __alloc_pages_may_oom(), along with the PF_DUMPCORE test. __alloc_pages_may_oom() can then signal to the caller whether the OOM killer was invoked, instead of requiring it to duplicate the order and high_zoneidx checks to guess this when deciding whether to continue. 
Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 5 ---- mm/page_alloc.c | 82 +++++++++++++++++++++++------------------------------ 2 files changed, 35 insertions(+), 52 deletions(-) (limited to 'include/linux') diff --git a/include/linux/oom.h b/include/linux/oom.h index 853698c721f7..76200984d1e2 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -85,11 +85,6 @@ static inline void oom_killer_enable(void) oom_killer_disabled = false; } -static inline bool oom_gfp_allowed(gfp_t gfp_mask) -{ - return (gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY); -} - extern struct task_struct *find_lock_task_mm(struct task_struct *p); static inline bool task_will_free_mem(struct task_struct *task) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7633c503a116..8e20f9c2fa5a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2332,12 +2332,21 @@ static inline struct page * __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, struct zone *preferred_zone, - int classzone_idx, int migratetype) + int classzone_idx, int migratetype, unsigned long *did_some_progress) { struct page *page; - /* Acquire the per-zone oom lock for each zone */ + *did_some_progress = 0; + + if (oom_killer_disabled) + return NULL; + + /* + * Acquire the per-zone oom lock for each zone. If that + * fails, somebody else is making progress for us. + */ if (!oom_zonelist_trylock(zonelist, gfp_mask)) { + *did_some_progress = 1; schedule_timeout_uninterruptible(1); return NULL; } @@ -2363,12 +2372,18 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, goto out; if (!(gfp_mask & __GFP_NOFAIL)) { + /* Coredumps can quickly deplete all memory reserves */ + if (current->flags & PF_DUMPCORE) + goto out; /* The OOM killer will not help higher order allocs */ if (order > PAGE_ALLOC_COSTLY_ORDER) goto out; /* The OOM killer does not needlessly kill tasks for lowmem */ if (high_zoneidx < ZONE_NORMAL) goto out; + /* The OOM killer does not compensate for light reclaim */ + if (!(gfp_mask & __GFP_FS)) + goto out; /* * GFP_THISNODE contains __GFP_NORETRY and we never hit this. * Sanity check for bare calls of __GFP_THISNODE, not real OOM. @@ -2381,7 +2396,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, } /* Exhausted what can be done so it's blamo time */ out_of_memory(zonelist, gfp_mask, order, nodemask, false); - + *did_some_progress = 1; out: oom_zonelist_unlock(zonelist, gfp_mask); return page; @@ -2658,7 +2673,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, (gfp_mask & GFP_THISNODE) == GFP_THISNODE) goto nopage; -restart: +retry: if (!(gfp_mask & __GFP_NO_KSWAPD)) wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone, nodemask); @@ -2681,7 +2696,6 @@ restart: classzone_idx = zonelist_zone_idx(preferred_zoneref); } -rebalance: /* This is the last chance, in general, before the goto nopage. 
*/ page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, @@ -2788,54 +2802,28 @@ rebalance: if (page) goto got_pg; - /* - * If we failed to make any progress reclaiming, then we are - * running out of options and have to consider going OOM - */ - if (!did_some_progress) { - if (oom_gfp_allowed(gfp_mask)) { - if (oom_killer_disabled) - goto nopage; - /* Coredumps can quickly deplete all memory reserves */ - if ((current->flags & PF_DUMPCORE) && - !(gfp_mask & __GFP_NOFAIL)) - goto nopage; - page = __alloc_pages_may_oom(gfp_mask, order, - zonelist, high_zoneidx, - nodemask, preferred_zone, - classzone_idx, migratetype); - if (page) - goto got_pg; - - if (!(gfp_mask & __GFP_NOFAIL)) { - /* - * The oom killer is not called for high-order - * allocations that may fail, so if no progress - * is being made, there are no other options and - * retrying is unlikely to help. - */ - if (order > PAGE_ALLOC_COSTLY_ORDER) - goto nopage; - /* - * The oom killer is not called for lowmem - * allocations to prevent needlessly killing - * innocent tasks. - */ - if (high_zoneidx < ZONE_NORMAL) - goto nopage; - } - - goto restart; - } - } - /* Check if we should retry the allocation */ pages_reclaimed += did_some_progress; if (should_alloc_retry(gfp_mask, order, did_some_progress, pages_reclaimed)) { + /* + * If we fail to make progress by freeing individual + * pages, but the allocation wants us to keep going, + * start OOM killing tasks. + */ + if (!did_some_progress) { + page = __alloc_pages_may_oom(gfp_mask, order, zonelist, + high_zoneidx, nodemask, + preferred_zone, classzone_idx, + migratetype,&did_some_progress); + if (page) + goto got_pg; + if (!did_some_progress) + goto nopage; + } /* Wait for some write requests to complete then retry */ wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); - goto rebalance; + goto retry; } else { /* * High-order allocations do not necessarily loop after -- cgit v1.2.3 From 07261edb971492c6b41b44d7b1b51f76807d30ad Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Mon, 26 Jan 2015 12:58:43 -0800 Subject: printk: add dummy routine for when CONFIG_PRINTK=n There are missing dummy routines for log_buf_addr_get() and log_buf_len_get() for when CONFIG_PRINTK is not set causing build failures. This patch adds these dummy routines at the appropriate location. 
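A hedged illustration of why the stubs matter (the helper below is hypothetical; log_buf_addr_get(), log_buf_len_get() and pr_info() are the real interfaces): with the dummy routines in place, code like this compiles unchanged whether or not CONFIG_PRINTK is set.

#include <linux/printk.h>

/* Hypothetical caller, for illustration only. */
static void report_log_buf(void)
{
        char *addr = log_buf_addr_get();        /* stub returns NULL when CONFIG_PRINTK=n */
        u32 len = log_buf_len_get();            /* stub returns 0 when CONFIG_PRINTK=n */

        if (addr)
                pr_info("log_buf: %u bytes at %p\n", len, addr);
}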
Signed-off-by: Pranith Kumar Cc: Michael Ellerman Reviewed-by: Petr Mladek Acked-by: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index c8f170324e64..4d5bf5726578 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -10,9 +10,6 @@ extern const char linux_banner[]; extern const char linux_proc_banner[]; -extern char *log_buf_addr_get(void); -extern u32 log_buf_len_get(void); - static inline int printk_get_level(const char *buffer) { if (buffer[0] == KERN_SOH_ASCII && buffer[1]) { @@ -163,6 +160,8 @@ extern int kptr_restrict; extern void wake_up_klogd(void); +char *log_buf_addr_get(void); +u32 log_buf_len_get(void); void log_buf_kexec_setup(void); void __init setup_log_buf(int early); void dump_stack_set_arch_desc(const char *fmt, ...); @@ -198,6 +197,16 @@ static inline void wake_up_klogd(void) { } +static inline char *log_buf_addr_get(void) +{ + return NULL; +} + +static inline u32 log_buf_len_get(void) +{ + return 0; +} + static inline void log_buf_kexec_setup(void) { } -- cgit v1.2.3 From 0b35fa7daefe9da6fdef91d95e07eebb714a8fcc Mon Sep 17 00:00:00 2001 From: Christophe Ricard Date: Sun, 25 Jan 2015 23:33:30 +0100 Subject: NFC: st21nfcb: Remove useless include include/linux/platform_data/st21nfcb.h is phy-generic. There is no need to include linux/i2c.h. Signed-off-by: Christophe Ricard Signed-off-by: Samuel Ortiz --- include/linux/platform_data/st21nfcb.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/st21nfcb.h b/include/linux/platform_data/st21nfcb.h index c3b432f5b63e..05c54c958cc3 100644 --- a/include/linux/platform_data/st21nfcb.h +++ b/include/linux/platform_data/st21nfcb.h @@ -19,8 +19,6 @@ #ifndef _ST21NFCB_NCI_H_ #define _ST21NFCB_NCI_H_ -#include - #define ST21NFCB_NCI_DRIVER_NAME "st21nfcb_nci" struct st21nfcb_nfc_platform_data { -- cgit v1.2.3 From 6da7c85c75eed769423b428eb654eaaf89d273c1 Mon Sep 17 00:00:00 2001 From: Christophe Ricard Date: Sun, 25 Jan 2015 23:33:31 +0100 Subject: NFC: st21nfcb: Fix copy/paste error in comment include/linux/platform_data/st21nfcb.h is based on include/linux/platform_data/st21nfca.h. The endif comment is inaccurate for st21nfcb. Signed-off-by: Christophe Ricard Signed-off-by: Samuel Ortiz --- include/linux/platform_data/st21nfcb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/st21nfcb.h b/include/linux/platform_data/st21nfcb.h index 05c54c958cc3..b023373d9874 100644 --- a/include/linux/platform_data/st21nfcb.h +++ b/include/linux/platform_data/st21nfcb.h @@ -26,4 +26,4 @@ struct st21nfcb_nfc_platform_data { unsigned int irq_polarity; }; -#endif /* _ST21NFCA_HCI_H_ */ +#endif /* _ST21NFCB_NCI_H_ */ -- cgit v1.2.3 From 7aea8389a77abf9fde254aca2434a605c7704f58 Mon Sep 17 00:00:00 2001 From: Jacek Anaszewski Date: Fri, 9 Jan 2015 07:22:51 -0800 Subject: leds: Add LED Flash class extension to the LED subsystem Some LED devices support two operation modes - torch and flash. This patch provides support for flash LED devices in the LED subsystem by introducing new sysfs attributes and a kernel internal interface.
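As a rough usage sketch of that kernel internal interface (the sequence below is hypothetical; the led_set_flash_* helpers and the brightness fields come from this patch, and the timeout value assumes microsecond units):

#include <linux/led-class-flash.h>

/* Hypothetical strobe sequence, for illustration only. */
static int my_fire_flash(struct led_classdev_flash *fled_cdev)
{
        int ret;

        ret = led_set_flash_timeout(fled_cdev, 100000);        /* 100 ms */
        if (ret < 0)
                return ret;

        ret = led_set_flash_brightness(fled_cdev, fled_cdev->brightness.max);
        if (ret < 0)
                return ret;

        return led_set_flash_strobe(fled_cdev, true);
}

The sysfs attributes introduced next expose largely the same operations to user space.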
The attributes being introduced are: flash_brightness, flash_strobe, flash_timeout, max_flash_timeout, max_flash_brightness, flash_fault, flash_sync_strobe and available_sync_leds. All the flash related features are placed in a separate module. The modifications aim to be compatible with V4L2 framework requirements related to flash device management. The design assumes that the V4L2 sub-device can take over control of the LED class device and communicate with it through the kernel internal interface. When the V4L2 Flash sub-device file is opened, the LED class device sysfs interface is made unavailable. Signed-off-by: Jacek Anaszewski Acked-by: Kyungmin Park Cc: Richard Purdie Acked-by: Pavel Machek Signed-off-by: Bryan Wu --- drivers/leds/Kconfig | 10 + drivers/leds/Makefile | 1 + drivers/leds/led-class-flash.c | 486 ++++++++++++++++++++++++++++++++++++++++ drivers/leds/led-class.c | 4 + include/linux/led-class-flash.h | 207 +++++++++++++++++ include/linux/leds.h | 3 + 6 files changed, 711 insertions(+) create mode 100644 drivers/leds/led-class-flash.c create mode 100644 include/linux/led-class-flash.h (limited to 'include/linux') diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig index a6c3d2f153f3..25b320d64e26 100644 --- a/drivers/leds/Kconfig +++ b/drivers/leds/Kconfig @@ -22,6 +22,16 @@ config LEDS_CLASS This option enables the led sysfs class in /sys/class/leds. You'll need this to do anything useful with LEDs. If unsure, say N. +config LEDS_CLASS_FLASH + tristate "LED Flash Class Support" + depends on LEDS_CLASS + help + This option enables the flash led sysfs class in /sys/class/leds. + It wraps LED Class and adds flash LED specific sysfs attributes + and kernel internal API to it. You'll need this to provide support + for the flash related features of a LED device. It can be built + as a module. + comment "LED drivers" config LEDS_88PM860X diff --git a/drivers/leds/Makefile b/drivers/leds/Makefile index 1c65a191d907..cbba921b6f1c 100644 --- a/drivers/leds/Makefile +++ b/drivers/leds/Makefile @@ -2,6 +2,7 @@ # LED Core obj-$(CONFIG_NEW_LEDS) += led-core.o obj-$(CONFIG_LEDS_CLASS) += led-class.o +obj-$(CONFIG_LEDS_CLASS_FLASH) += led-class-flash.o obj-$(CONFIG_LEDS_TRIGGERS) += led-triggers.o # LED Platform Drivers diff --git a/drivers/leds/led-class-flash.c b/drivers/leds/led-class-flash.c new file mode 100644 index 000000000000..4a19fd44f93f --- /dev/null +++ b/drivers/leds/led-class-flash.c @@ -0,0 +1,486 @@ +/* + * LED Flash class interface + * + * Copyright (C) 2015 Samsung Electronics Co., Ltd. + * Author: Jacek Anaszewski + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include "leds.h" + +#define has_flash_op(fled_cdev, op) \ + (fled_cdev && fled_cdev->ops->op) + +#define call_flash_op(fled_cdev, op, args...) \ + ((has_flash_op(fled_cdev, op)) ?
\ + (fled_cdev->ops->op(fled_cdev, args)) : \ + -EINVAL) + +static const char * const led_flash_fault_names[] = { + "led-over-voltage", + "flash-timeout-exceeded", + "controller-over-temperature", + "controller-short-circuit", + "led-power-supply-over-current", + "indicator-led-fault", + "led-under-voltage", + "controller-under-voltage", + "led-over-temperature", +}; + +static ssize_t flash_brightness_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t size) +{ + struct led_classdev *led_cdev = dev_get_drvdata(dev); + struct led_classdev_flash *fled_cdev = lcdev_to_flcdev(led_cdev); + unsigned long state; + ssize_t ret; + + mutex_lock(&led_cdev->led_access); + + if (led_sysfs_is_disabled(led_cdev)) { + ret = -EBUSY; + goto unlock; + } + + ret = kstrtoul(buf, 10, &state); + if (ret) + goto unlock; + + ret = led_set_flash_brightness(fled_cdev, state); + if (ret < 0) + goto unlock; + + ret = size; +unlock: + mutex_unlock(&led_cdev->led_access); + return ret; +} + +static ssize_t flash_brightness_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct led_classdev *led_cdev = dev_get_drvdata(dev); + struct led_classdev_flash *fled_cdev = lcdev_to_flcdev(led_cdev); + + /* no lock needed for this */ + led_update_flash_brightness(fled_cdev); + + return sprintf(buf, "%u\n", fled_cdev->brightness.val); +} +static DEVICE_ATTR_RW(flash_brightness); + +static ssize_t max_flash_brightness_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct led_classdev *led_cdev = dev_get_drvdata(dev); + struct led_classdev_flash *fled_cdev = lcdev_to_flcdev(led_cdev); + + return sprintf(buf, "%u\n", fled_cdev->brightness.max); +} +static DEVICE_ATTR_RO(max_flash_brightness); + +static ssize_t flash_strobe_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t size) +{ + struct led_classdev *led_cdev = dev_get_drvdata(dev); + struct led_classdev_flash *fled_cdev = lcdev_to_flcdev(led_cdev); + unsigned long state; + ssize_t ret = -EINVAL; + + mutex_lock(&led_cdev->led_access); + + if (led_sysfs_is_disabled(led_cdev)) { + ret = -EBUSY; + goto unlock; + } + + ret = kstrtoul(buf, 10, &state); + if (ret) + goto unlock; + + if (state < 0 || state > 1) { + ret = -EINVAL; + goto unlock; + } + + ret = led_set_flash_strobe(fled_cdev, state); + if (ret < 0) + goto unlock; + ret = size; +unlock: + mutex_unlock(&led_cdev->led_access); + return ret; +} + +static ssize_t flash_strobe_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct led_classdev *led_cdev = dev_get_drvdata(dev); + struct led_classdev_flash *fled_cdev = lcdev_to_flcdev(led_cdev); + bool state; + int ret; + + /* no lock needed for this */ + ret = led_get_flash_strobe(fled_cdev, &state); + if (ret < 0) + return ret; + + return sprintf(buf, "%u\n", state); +} +static DEVICE_ATTR_RW(flash_strobe); + +static ssize_t flash_timeout_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t size) +{ + struct led_classdev *led_cdev = dev_get_drvdata(dev); + struct led_classdev_flash *fled_cdev = lcdev_to_flcdev(led_cdev); + unsigned long flash_timeout; + ssize_t ret; + + mutex_lock(&led_cdev->led_access); + + if (led_sysfs_is_disabled(led_cdev)) { + ret = -EBUSY; + goto unlock; + } + + ret = kstrtoul(buf, 10, &flash_timeout); + if (ret) + goto unlock; + + ret = led_set_flash_timeout(fled_cdev, flash_timeout); + if (ret < 0) + goto unlock; + + ret = size; +unlock: + mutex_unlock(&led_cdev->led_access); + return 
ret; +} + +static ssize_t flash_timeout_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct led_classdev *led_cdev = dev_get_drvdata(dev); + struct led_classdev_flash *fled_cdev = lcdev_to_flcdev(led_cdev); + + return sprintf(buf, "%u\n", fled_cdev->timeout.val); +} +static DEVICE_ATTR_RW(flash_timeout); + +static ssize_t max_flash_timeout_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct led_classdev *led_cdev = dev_get_drvdata(dev); + struct led_classdev_flash *fled_cdev = lcdev_to_flcdev(led_cdev); + + return sprintf(buf, "%u\n", fled_cdev->timeout.max); +} +static DEVICE_ATTR_RO(max_flash_timeout); + +static ssize_t flash_fault_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct led_classdev *led_cdev = dev_get_drvdata(dev); + struct led_classdev_flash *fled_cdev = lcdev_to_flcdev(led_cdev); + u32 fault, mask = 0x1; + char *pbuf = buf; + int i, ret, buf_len; + + ret = led_get_flash_fault(fled_cdev, &fault); + if (ret < 0) + return -EINVAL; + + *buf = '\0'; + + for (i = 0; i < LED_NUM_FLASH_FAULTS; ++i) { + if (fault & mask) { + buf_len = sprintf(pbuf, "%s ", + led_flash_fault_names[i]); + pbuf += buf_len; + } + mask <<= 1; + } + + return sprintf(buf, "%s\n", buf); +} +static DEVICE_ATTR_RO(flash_fault); + +static ssize_t available_sync_leds_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct led_classdev *led_cdev = dev_get_drvdata(dev); + struct led_classdev_flash *fled_cdev = lcdev_to_flcdev(led_cdev); + char *pbuf = buf; + int i, buf_len; + + buf_len = sprintf(pbuf, "[0: none] "); + pbuf += buf_len; + + for (i = 0; i < fled_cdev->num_sync_leds; ++i) { + buf_len = sprintf(pbuf, "[%d: %s] ", i + 1, + fled_cdev->sync_leds[i]->led_cdev.name); + pbuf += buf_len; + } + + return sprintf(buf, "%s\n", buf); +} +static DEVICE_ATTR_RO(available_sync_leds); + +static ssize_t flash_sync_strobe_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t size) +{ + struct led_classdev *led_cdev = dev_get_drvdata(dev); + struct led_classdev_flash *fled_cdev = lcdev_to_flcdev(led_cdev); + unsigned long led_id; + ssize_t ret; + + mutex_lock(&led_cdev->led_access); + + if (led_sysfs_is_disabled(led_cdev)) { + ret = -EBUSY; + goto unlock; + } + + ret = kstrtoul(buf, 10, &led_id); + if (ret) + goto unlock; + + if (led_id > fled_cdev->num_sync_leds) { + ret = -ERANGE; + goto unlock; + } + + fled_cdev->sync_led_id = led_id; + + ret = size; +unlock: + mutex_unlock(&led_cdev->led_access); + return ret; +} + +static ssize_t flash_sync_strobe_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct led_classdev *led_cdev = dev_get_drvdata(dev); + struct led_classdev_flash *fled_cdev = lcdev_to_flcdev(led_cdev); + int sled_id = fled_cdev->sync_led_id; + char *sync_led_name = "none"; + + if (fled_cdev->sync_led_id > 0) + sync_led_name = (char *) + fled_cdev->sync_leds[sled_id - 1]->led_cdev.name; + + return sprintf(buf, "[%d: %s]\n", sled_id, sync_led_name); +} +static DEVICE_ATTR_RW(flash_sync_strobe); + +static struct attribute *led_flash_strobe_attrs[] = { + &dev_attr_flash_strobe.attr, + NULL, +}; + +static struct attribute *led_flash_timeout_attrs[] = { + &dev_attr_flash_timeout.attr, + &dev_attr_max_flash_timeout.attr, + NULL, +}; + +static struct attribute *led_flash_brightness_attrs[] = { + &dev_attr_flash_brightness.attr, + &dev_attr_max_flash_brightness.attr, + NULL, +}; + +static struct attribute *led_flash_fault_attrs[] = { + 
&dev_attr_flash_fault.attr, + NULL, +}; + +static struct attribute *led_flash_sync_strobe_attrs[] = { + &dev_attr_available_sync_leds.attr, + &dev_attr_flash_sync_strobe.attr, + NULL, +}; + +static const struct attribute_group led_flash_strobe_group = { + .attrs = led_flash_strobe_attrs, +}; + +static const struct attribute_group led_flash_timeout_group = { + .attrs = led_flash_timeout_attrs, +}; + +static const struct attribute_group led_flash_brightness_group = { + .attrs = led_flash_brightness_attrs, +}; + +static const struct attribute_group led_flash_fault_group = { + .attrs = led_flash_fault_attrs, +}; + +static const struct attribute_group led_flash_sync_strobe_group = { + .attrs = led_flash_sync_strobe_attrs, +}; + +static void led_flash_resume(struct led_classdev *led_cdev) +{ + struct led_classdev_flash *fled_cdev = lcdev_to_flcdev(led_cdev); + + call_flash_op(fled_cdev, flash_brightness_set, + fled_cdev->brightness.val); + call_flash_op(fled_cdev, timeout_set, fled_cdev->timeout.val); +} + +static void led_flash_init_sysfs_groups(struct led_classdev_flash *fled_cdev) +{ + struct led_classdev *led_cdev = &fled_cdev->led_cdev; + const struct led_flash_ops *ops = fled_cdev->ops; + const struct attribute_group **flash_groups = fled_cdev->sysfs_groups; + + int num_sysfs_groups = 0; + + flash_groups[num_sysfs_groups++] = &led_flash_strobe_group; + + if (ops->flash_brightness_set) + flash_groups[num_sysfs_groups++] = &led_flash_brightness_group; + + if (ops->timeout_set) + flash_groups[num_sysfs_groups++] = &led_flash_timeout_group; + + if (ops->fault_get) + flash_groups[num_sysfs_groups++] = &led_flash_fault_group; + + if (led_cdev->flags & LED_DEV_CAP_SYNC_STROBE) + flash_groups[num_sysfs_groups++] = &led_flash_sync_strobe_group; + + led_cdev->groups = flash_groups; +} + +int led_classdev_flash_register(struct device *parent, + struct led_classdev_flash *fled_cdev) +{ + struct led_classdev *led_cdev; + const struct led_flash_ops *ops; + int ret; + + if (!fled_cdev) + return -EINVAL; + + led_cdev = &fled_cdev->led_cdev; + + if (led_cdev->flags & LED_DEV_CAP_FLASH) { + if (!led_cdev->brightness_set_sync) + return -EINVAL; + + ops = fled_cdev->ops; + if (!ops || !ops->strobe_set) + return -EINVAL; + + led_cdev->flash_resume = led_flash_resume; + + /* Select the sysfs attributes to be created for the device */ + led_flash_init_sysfs_groups(fled_cdev); + } + + /* Register led class device */ + ret = led_classdev_register(parent, led_cdev); + if (ret < 0) + return ret; + + /* Setting a torch brightness needs to have immediate effect */ + led_cdev->flags &= ~SET_BRIGHTNESS_ASYNC; + led_cdev->flags |= SET_BRIGHTNESS_SYNC; + + return 0; +} +EXPORT_SYMBOL_GPL(led_classdev_flash_register); + +void led_classdev_flash_unregister(struct led_classdev_flash *fled_cdev) +{ + if (!fled_cdev) + return; + + led_classdev_unregister(&fled_cdev->led_cdev); +} +EXPORT_SYMBOL_GPL(led_classdev_flash_unregister); + +static void led_clamp_align(struct led_flash_setting *s) +{ + u32 v, offset; + + v = s->val + s->step / 2; + v = clamp(v, s->min, s->max); + offset = v - s->min; + offset = s->step * (offset / s->step); + s->val = s->min + offset; +} + +int led_set_flash_timeout(struct led_classdev_flash *fled_cdev, u32 timeout) +{ + struct led_classdev *led_cdev = &fled_cdev->led_cdev; + struct led_flash_setting *s = &fled_cdev->timeout; + + s->val = timeout; + led_clamp_align(s); + + if (!(led_cdev->flags & LED_SUSPENDED)) + return call_flash_op(fled_cdev, timeout_set, s->val); + + return 0; +} 
+EXPORT_SYMBOL_GPL(led_set_flash_timeout); + +int led_get_flash_fault(struct led_classdev_flash *fled_cdev, u32 *fault) +{ + return call_flash_op(fled_cdev, fault_get, fault); +} +EXPORT_SYMBOL_GPL(led_get_flash_fault); + +int led_set_flash_brightness(struct led_classdev_flash *fled_cdev, + u32 brightness) +{ + struct led_classdev *led_cdev = &fled_cdev->led_cdev; + struct led_flash_setting *s = &fled_cdev->brightness; + + s->val = brightness; + led_clamp_align(s); + + if (!(led_cdev->flags & LED_SUSPENDED)) + return call_flash_op(fled_cdev, flash_brightness_set, s->val); + + return 0; +} +EXPORT_SYMBOL_GPL(led_set_flash_brightness); + +int led_update_flash_brightness(struct led_classdev_flash *fled_cdev) +{ + struct led_flash_setting *s = &fled_cdev->brightness; + u32 brightness; + + if (has_flash_op(fled_cdev, flash_brightness_get)) { + int ret = call_flash_op(fled_cdev, flash_brightness_get, + &brightness); + if (ret < 0) + return ret; + + s->val = brightness; + } + + return 0; +} +EXPORT_SYMBOL_GPL(led_update_flash_brightness); + +MODULE_AUTHOR("Jacek Anaszewski "); +MODULE_DESCRIPTION("LED Flash class interface"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c index 291ca45ce8ba..795ec994c663 100644 --- a/drivers/leds/led-class.c +++ b/drivers/leds/led-class.c @@ -179,6 +179,10 @@ EXPORT_SYMBOL_GPL(led_classdev_suspend); void led_classdev_resume(struct led_classdev *led_cdev) { led_cdev->brightness_set(led_cdev, led_cdev->brightness); + + if (led_cdev->flash_resume) + led_cdev->flash_resume(led_cdev); + led_cdev->flags &= ~LED_SUSPENDED; } EXPORT_SYMBOL_GPL(led_classdev_resume); diff --git a/include/linux/led-class-flash.h b/include/linux/led-class-flash.h new file mode 100644 index 000000000000..5ba2facd7a51 --- /dev/null +++ b/include/linux/led-class-flash.h @@ -0,0 +1,207 @@ +/* + * LED Flash class interface + * + * Copyright (C) 2015 Samsung Electronics Co., Ltd. + * Author: Jacek Anaszewski + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#ifndef __LINUX_FLASH_LEDS_H_INCLUDED +#define __LINUX_FLASH_LEDS_H_INCLUDED + +#include +#include + +struct device_node; +struct led_classdev_flash; + +/* + * Supported led fault bits - must be kept in synch + * with V4L2_FLASH_FAULT bits. 
+ */ +#define LED_FAULT_OVER_VOLTAGE (1 << 0) +#define LED_FAULT_TIMEOUT (1 << 1) +#define LED_FAULT_OVER_TEMPERATURE (1 << 2) +#define LED_FAULT_SHORT_CIRCUIT (1 << 3) +#define LED_FAULT_OVER_CURRENT (1 << 4) +#define LED_FAULT_INDICATOR (1 << 5) +#define LED_FAULT_UNDER_VOLTAGE (1 << 6) +#define LED_FAULT_INPUT_VOLTAGE (1 << 7) +#define LED_FAULT_LED_OVER_TEMPERATURE (1 << 8) +#define LED_NUM_FLASH_FAULTS 9 + +#define LED_FLASH_MAX_SYSFS_GROUPS 7 + +struct led_flash_ops { + /* set flash brightness */ + int (*flash_brightness_set)(struct led_classdev_flash *fled_cdev, + u32 brightness); + /* get flash brightness */ + int (*flash_brightness_get)(struct led_classdev_flash *fled_cdev, + u32 *brightness); + /* set flash strobe state */ + int (*strobe_set)(struct led_classdev_flash *fled_cdev, bool state); + /* get flash strobe state */ + int (*strobe_get)(struct led_classdev_flash *fled_cdev, bool *state); + /* set flash timeout */ + int (*timeout_set)(struct led_classdev_flash *fled_cdev, u32 timeout); + /* get the flash LED fault */ + int (*fault_get)(struct led_classdev_flash *fled_cdev, u32 *fault); +}; + +/* + * Current value of a flash setting along + * with its constraints. + */ +struct led_flash_setting { + /* minimum allowed value */ + u32 min; + /* maximum allowed value */ + u32 max; + /* step value */ + u32 step; + /* current value */ + u32 val; +}; + +struct led_classdev_flash { + /* led class device */ + struct led_classdev led_cdev; + + /* flash led specific ops */ + const struct led_flash_ops *ops; + + /* flash brightness value in microamperes along with its constraints */ + struct led_flash_setting brightness; + + /* flash timeout value in microseconds along with its constraints */ + struct led_flash_setting timeout; + + /* LED Flash class sysfs groups */ + const struct attribute_group *sysfs_groups[LED_FLASH_MAX_SYSFS_GROUPS]; + + /* LEDs available for flash strobe synchronization */ + struct led_classdev_flash **sync_leds; + + /* Number of LEDs available for flash strobe synchronization */ + int num_sync_leds; + + /* + * The identifier of the sub-led to synchronize the flash strobe with. + * Identifiers start from 1, which reflects the first element from the + * sync_leds array. 0 means that the flash strobe should not be + * synchronized. + */ + u32 sync_led_id; +}; + +static inline struct led_classdev_flash *lcdev_to_flcdev( + struct led_classdev *lcdev) +{ + return container_of(lcdev, struct led_classdev_flash, led_cdev); +} + +/** + * led_classdev_flash_register - register a new object of led_classdev class + * with support for flash LEDs + * @parent: the flash LED to register + * @fled_cdev: the led_classdev_flash structure for this device + * + * Returns: 0 on success or negative error value on failure + */ +extern int led_classdev_flash_register(struct device *parent, + struct led_classdev_flash *fled_cdev); + +/** + * led_classdev_flash_unregister - unregisters an object of led_classdev class + * with support for flash LEDs + * @fled_cdev: the flash LED to unregister + * + * Unregister an object previously registered via led_classdev_flash_register + */ +extern void led_classdev_flash_unregister(struct led_classdev_flash *fled_cdev); + +/** + * led_set_flash_strobe - setup flash strobe + * @fled_cdev: the flash LED to set strobe on + * @state: 1 - strobe flash, 0 - stop flash strobe + * + * Strobe the flash LED.
+ * + * Returns: 0 on success or negative error value on failure + */ +static inline int led_set_flash_strobe(struct led_classdev_flash *fled_cdev, + bool state) +{ + return fled_cdev->ops->strobe_set(fled_cdev, state); +} + +/** + * led_get_flash_strobe - get flash strobe status + * @fled_cdev: the flash LED to query + * @state: 1 - flash is strobing, 0 - flash is off + * + * Check whether the flash is strobing at the moment. + * + * Returns: 0 on success or negative error value on failure + */ +static inline int led_get_flash_strobe(struct led_classdev_flash *fled_cdev, + bool *state) +{ + if (fled_cdev->ops->strobe_get) + return fled_cdev->ops->strobe_get(fled_cdev, state); + + return -EINVAL; +} + +/** + * led_set_flash_brightness - set flash LED brightness + * @fled_cdev: the flash LED to set + * @brightness: the brightness to set it to + * + * Set a flash LED's brightness. + * + * Returns: 0 on success or negative error value on failure + */ +extern int led_set_flash_brightness(struct led_classdev_flash *fled_cdev, + u32 brightness); + +/** + * led_update_flash_brightness - update flash LED brightness + * @fled_cdev: the flash LED to query + * + * Get a flash LED's current brightness and update led_flash->brightness + * member with the obtained value. + * + * Returns: 0 on success or negative error value on failure + */ +extern int led_update_flash_brightness(struct led_classdev_flash *fled_cdev); + +/** + * led_set_flash_timeout - set flash LED timeout + * @fled_cdev: the flash LED to set + * @timeout: the flash timeout to set it to + * + * Set the flash strobe duration. + * + * Returns: 0 on success or negative error value on failure + */ +extern int led_set_flash_timeout(struct led_classdev_flash *fled_cdev, + u32 timeout); + +/** + * led_get_flash_fault - get the flash LED fault + * @fled_cdev: the flash LED to query + * @fault: bitmask containing flash faults + * + * Get the flash LED fault. + * + * Returns: 0 on success or negative error value on failure + */ +extern int led_get_flash_fault(struct led_classdev_flash *fled_cdev, + u32 *fault); + +#endif /* __LINUX_FLASH_LEDS_H_INCLUDED */ diff --git a/include/linux/leds.h b/include/linux/leds.h index cfceef32c9b3..f70f84f35674 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -46,6 +46,8 @@ struct led_classdev { #define LED_SYSFS_DISABLE (1 << 20) #define SET_BRIGHTNESS_ASYNC (1 << 21) #define SET_BRIGHTNESS_SYNC (1 << 22) +#define LED_DEV_CAP_FLASH (1 << 23) +#define LED_DEV_CAP_SYNC_STROBE (1 << 24) /* Set LED brightness level */ /* Must not sleep, use a workqueue if needed */ @@ -81,6 +83,7 @@ struct led_classdev { unsigned long blink_delay_on, blink_delay_off; struct timer_list blink_timer; int blink_brightness; + void (*flash_resume)(struct led_classdev *led_cdev); struct work_struct set_brightness_work; int delayed_set_value; -- cgit v1.2.3 From a2a15d54ab2d5c816deea306ae8e9101c40fe300 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sat, 3 Jan 2015 09:51:33 -0800 Subject: device: Fix dev_dbg_once macro There is a copy/paste typo in the dev_dbg_once macro. It uses dev_info instead of dev_dbg, so use the correct function instead. 
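For illustration only, a hypothetical driver snippet (not from the patch) shows the practical effect: before this change the call below was emitted at KERN_INFO once on every build; with the fix it follows dev_dbg() semantics and stays silent unless DEBUG or dynamic debug is enabled.

    #include <linux/device.h>

    /* Hypothetical usage sketch. */
    static void example_apply_quirk(struct device *dev)
    {
            /* Printed once at info level with the typo; a normal debug
             * message (off by default) with the fix.
             */
            dev_dbg_once(dev, "applying one-time quirk\n");
    }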
Signed-off-by: Joe Perches Noticed-by: Marc Finet Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index fb506738f7b7..7b2532361398 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1156,7 +1156,7 @@ do { \ #define dev_info_once(dev, fmt, ...) \ dev_level_once(dev_info, dev, fmt, ##__VA_ARGS__) #define dev_dbg_once(dev, fmt, ...) \ - dev_level_once(dev_info, dev, fmt, ##__VA_ARGS__) + dev_level_once(dev_dbg, dev, fmt, ##__VA_ARGS__) #define dev_level_ratelimited(dev_level, dev, fmt, ...) \ do { \ -- cgit v1.2.3 From d1f1052c520452494db548b1a701cd9020b5903e Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 25 Dec 2014 15:07:04 -0800 Subject: device: Change dev_ logging functions to return void No caller or macro uses the return value so make all the functions return void. Compiled x86 allyesconfig and defconfig w/o CONFIG_PRINTK Signed-off-by: Joe Perches Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 29 +++++++++--------------- include/linux/device.h | 60 ++++++++++++++++++++++++-------------------------- 2 files changed, 40 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/core.c b/drivers/base/core.c index 97e2baf6e5d8..07304a3b9ee2 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -2080,54 +2080,47 @@ int dev_printk_emit(int level, const struct device *dev, const char *fmt, ...) } EXPORT_SYMBOL(dev_printk_emit); -static int __dev_printk(const char *level, const struct device *dev, +static void __dev_printk(const char *level, const struct device *dev, struct va_format *vaf) { - if (!dev) - return printk("%s(NULL device *): %pV", level, vaf); - - return dev_printk_emit(level[1] - '0', dev, - "%s %s: %pV", - dev_driver_string(dev), dev_name(dev), vaf); + if (dev) + dev_printk_emit(level[1] - '0', dev, "%s %s: %pV", + dev_driver_string(dev), dev_name(dev), vaf); + else + printk("%s(NULL device *): %pV", level, vaf); } -int dev_printk(const char *level, const struct device *dev, - const char *fmt, ...) +void dev_printk(const char *level, const struct device *dev, + const char *fmt, ...) { struct va_format vaf; va_list args; - int r; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - r = __dev_printk(level, dev, &vaf); + __dev_printk(level, dev, &vaf); va_end(args); - - return r; } EXPORT_SYMBOL(dev_printk); #define define_dev_printk_level(func, kern_level) \ -int func(const struct device *dev, const char *fmt, ...) \ +void func(const struct device *dev, const char *fmt, ...) 
\ { \ struct va_format vaf; \ va_list args; \ - int r; \ \ va_start(args, fmt); \ \ vaf.fmt = fmt; \ vaf.va = &args; \ \ - r = __dev_printk(kern_level, dev, &vaf); \ + __dev_printk(kern_level, dev, &vaf); \ \ va_end(args); \ - \ - return r; \ } \ EXPORT_SYMBOL(func); diff --git a/include/linux/device.h b/include/linux/device.h index 7b2532361398..0eb8ee2dc6d1 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1038,22 +1038,22 @@ extern __printf(3, 4) int dev_printk_emit(int level, const struct device *dev, const char *fmt, ...); extern __printf(3, 4) -int dev_printk(const char *level, const struct device *dev, - const char *fmt, ...); +void dev_printk(const char *level, const struct device *dev, + const char *fmt, ...); extern __printf(2, 3) -int dev_emerg(const struct device *dev, const char *fmt, ...); +void dev_emerg(const struct device *dev, const char *fmt, ...); extern __printf(2, 3) -int dev_alert(const struct device *dev, const char *fmt, ...); +void dev_alert(const struct device *dev, const char *fmt, ...); extern __printf(2, 3) -int dev_crit(const struct device *dev, const char *fmt, ...); +void dev_crit(const struct device *dev, const char *fmt, ...); extern __printf(2, 3) -int dev_err(const struct device *dev, const char *fmt, ...); +void dev_err(const struct device *dev, const char *fmt, ...); extern __printf(2, 3) -int dev_warn(const struct device *dev, const char *fmt, ...); +void dev_warn(const struct device *dev, const char *fmt, ...); extern __printf(2, 3) -int dev_notice(const struct device *dev, const char *fmt, ...); +void dev_notice(const struct device *dev, const char *fmt, ...); extern __printf(2, 3) -int _dev_info(const struct device *dev, const char *fmt, ...); +void _dev_info(const struct device *dev, const char *fmt, ...); #else @@ -1065,35 +1065,35 @@ static inline __printf(3, 4) int dev_printk_emit(int level, const struct device *dev, const char *fmt, ...) { return 0; } -static inline int __dev_printk(const char *level, const struct device *dev, - struct va_format *vaf) -{ return 0; } +static inline void __dev_printk(const char *level, const struct device *dev, + struct va_format *vaf) +{} static inline __printf(3, 4) -int dev_printk(const char *level, const struct device *dev, - const char *fmt, ...) -{ return 0; } +void dev_printk(const char *level, const struct device *dev, + const char *fmt, ...) +{} static inline __printf(2, 3) -int dev_emerg(const struct device *dev, const char *fmt, ...) -{ return 0; } +void dev_emerg(const struct device *dev, const char *fmt, ...) +{} static inline __printf(2, 3) -int dev_crit(const struct device *dev, const char *fmt, ...) -{ return 0; } +void dev_crit(const struct device *dev, const char *fmt, ...) +{} static inline __printf(2, 3) -int dev_alert(const struct device *dev, const char *fmt, ...) -{ return 0; } +void dev_alert(const struct device *dev, const char *fmt, ...) +{} static inline __printf(2, 3) -int dev_err(const struct device *dev, const char *fmt, ...) -{ return 0; } +void dev_err(const struct device *dev, const char *fmt, ...) +{} static inline __printf(2, 3) -int dev_warn(const struct device *dev, const char *fmt, ...) -{ return 0; } +void dev_warn(const struct device *dev, const char *fmt, ...) +{} static inline __printf(2, 3) -int dev_notice(const struct device *dev, const char *fmt, ...) -{ return 0; } +void dev_notice(const struct device *dev, const char *fmt, ...) +{} static inline __printf(2, 3) -int _dev_info(const struct device *dev, const char *fmt, ...) 
-{ return 0; } +void _dev_info(const struct device *dev, const char *fmt, ...) +{} #endif @@ -1119,7 +1119,6 @@ do { \ ({ \ if (0) \ dev_printk(KERN_DEBUG, dev, format, ##arg); \ - 0; \ }) #endif @@ -1215,7 +1214,6 @@ do { \ ({ \ if (0) \ dev_printk(KERN_DEBUG, dev, format, ##arg); \ - 0; \ }) #endif -- cgit v1.2.3 From 32357605ce7bcd36cb6215a21cad12cd8f51b74e Mon Sep 17 00:00:00 2001 From: Sharon Dvir Date: Thu, 22 Jan 2015 12:15:25 +0200 Subject: USB: Add missing word to comment in mod_devicetable.h The documentation of match_flags in struct usb_device_id said: 'Bit mask controlling of the other fields are used to match against new devices.' This was changed to: 'Bit mask controlling which of the other fields are used to match against new devices.' by adding the word 'which' and editing the next lines so they do not exceed 80 characters. Signed-off-by: Sharon Dvir Signed-off-by: Greg Kroah-Hartman --- include/linux/mod_devicetable.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 745def862580..470a240f66a1 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -53,9 +53,9 @@ struct ieee1394_device_id { /** * struct usb_device_id - identifies USB devices for probing and hotplugging - * @match_flags: Bit mask controlling of the other fields are used to match - * against new devices. Any field except for driver_info may be used, - * although some only make sense in conjunction with other fields. + * @match_flags: Bit mask controlling which of the other fields are used to + * match against new devices. Any field except for driver_info may be + * used, although some only make sense in conjunction with other fields. * This is usually set by a USB_DEVICE_*() macro, which sets all * other fields in this structure except for driver_info. * @idVendor: USB vendor ID for a device; numbers are assigned -- cgit v1.2.3 From aae88261abd58fffef7ee0e00160ce4ea105b0f3 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 26 Jan 2015 22:05:38 -0800 Subject: net: phy: document has_fixups field has_fixups was introduced to help keep track of fixups/quirks running on a PHY device, but we did not update the comment above struct phy_device accordingly. Fixes: b0ae009f3dc14 (net: phy: add "has_fixups" boolean property") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/phy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 9c189a1fa3a2..1b3690b597d5 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -327,6 +327,7 @@ struct phy_c45_device_ids { * c45_ids: 802.3-c45 Device Identifers if is_c45. * is_c45: Set to true if this phy uses clause 45 addressing. * is_internal: Set to true if this phy is internal to a MAC. + * has_fixups: Set to true if this phy has fixups/quirks. * state: state of the PHY for management purposes * dev_flags: Device-specific flags used by the PHY driver. * addr: Bus address of PHY -- cgit v1.2.3 From 8a477a6fb6a33651adda772360b85fd813569743 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 26 Jan 2015 22:05:39 -0800 Subject: net: phy: keep track of the PHY suspend state In order to avoid double calls to phydev->drv->suspend and resume, keep track of whether the PHY has already been suspended as a consequence of a successful call to phy_suspend().
We will use this in our MDIO bus suspend/resume hooks to avoid a double suspend call. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/phy_device.c | 22 ++++++++++++++++++---- include/linux/phy.h | 2 ++ 2 files changed, 20 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 3fc91e89f5a5..bdfe51fc3a65 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -699,6 +699,7 @@ int phy_suspend(struct phy_device *phydev) { struct phy_driver *phydrv = to_phy_driver(phydev->dev.driver); struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL }; + int ret = 0; /* If the device has WOL enabled, we cannot suspend the PHY */ phy_ethtool_get_wol(phydev, &wol); @@ -706,18 +707,31 @@ int phy_suspend(struct phy_device *phydev) return -EBUSY; if (phydrv->suspend) - return phydrv->suspend(phydev); - return 0; + ret = phydrv->suspend(phydev); + + if (ret) + return ret; + + phydev->suspended = true; + + return ret; } EXPORT_SYMBOL(phy_suspend); int phy_resume(struct phy_device *phydev) { struct phy_driver *phydrv = to_phy_driver(phydev->dev.driver); + int ret = 0; if (phydrv->resume) - return phydrv->resume(phydev); - return 0; + ret = phydrv->resume(phydev); + + if (ret) + return ret; + + phydev->suspended = false; + + return ret; } EXPORT_SYMBOL(phy_resume); diff --git a/include/linux/phy.h b/include/linux/phy.h index 1b3690b597d5..685809835b5c 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -328,6 +328,7 @@ struct phy_c45_device_ids { * is_c45: Set to true if this phy uses clause 45 addressing. * is_internal: Set to true if this phy is internal to a MAC. * has_fixups: Set to true if this phy has fixups/quirks. + * suspended: Set to true if this phy has been suspended successfully. * state: state of the PHY for management purposes * dev_flags: Device-specific flags used by the PHY driver. * addr: Bus address of PHY @@ -365,6 +366,7 @@ struct phy_device { bool is_c45; bool is_internal; bool has_fixups; + bool suspended; enum phy_state state; -- cgit v1.2.3 From f4c5cf88fbd50e4779042268947b2e2f90c20484 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Thu, 18 Dec 2014 15:29:14 +0100 Subject: gpu: host1x: Provide a proper struct bus_type Previously the struct bus_type exported by the host1x infrastructure was only a very basic skeleton. Turn that implementation into a more full-fledged bus to support proper probe ordering and power management. Note that the bus infrastructure needs to be available before any of the drivers can be registered. This is automatically ensured if all drivers are built as loadable modules (via symbol dependencies). If all drivers are built-in there are no such guarantees and the link order determines the initcall ordering. Adjust drivers/gpu/Makefile to make sure that the host1x bus infrastructure is initialized prior to any of its users (only drm/tegra currently).
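Under the new scheme a client driver embeds a struct device_driver and registers through the real bus, as the drm/tegra conversion in the diff below does. A minimal sketch with a hypothetical "foo" driver (all names and the compatible string are made up):

    #include <linux/host1x.h>
    #include <linux/mod_devicetable.h>
    #include <linux/module.h>

    static int foo_probe(struct host1x_device *device)
    {
            return 0;
    }

    static int foo_remove(struct host1x_device *device)
    {
            return 0;
    }

    static const struct of_device_id foo_subdevs[] = {
            { .compatible = "vendor,foo" }, /* made-up compatible */
            { /* sentinel */ }
    };

    static struct host1x_driver foo_driver = {
            .driver = {
                    /* the name now lives in the embedded device_driver */
                    .name = "foo",
            },
            .probe = foo_probe,
            .remove = foo_remove,
            .subdevs = foo_subdevs,
    };

    static int __init foo_init(void)
    {
            /* expands to host1x_driver_register_full(&foo_driver, THIS_MODULE) */
            return host1x_driver_register(&foo_driver);
    }
    module_init(foo_init);

    static void __exit foo_exit(void)
    {
            host1x_driver_unregister(&foo_driver);
    }
    module_exit(foo_exit);

Because the bus now implements match/probe/remove/shutdown and the generic PM callbacks, such a driver gets proper probe ordering and suspend/resume handling for free.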
v2: Fix building host1x and tegra-drm as modules Reported-by: Dave Airlie Reviewed-by: Sean Paul Reviewed-by: Mark Zhang Signed-off-by: Thierry Reding --- drivers/gpu/Makefile | 5 +- drivers/gpu/drm/tegra/drm.c | 4 +- drivers/gpu/host1x/bus.c | 111 ++++++++++++++++++++++++++++++-------------- drivers/gpu/host1x/bus.h | 4 +- drivers/gpu/host1x/dev.c | 6 +-- include/linux/host1x.h | 18 +++++-- 6 files changed, 103 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/Makefile b/drivers/gpu/Makefile index 70da9eb52a42..e9ed439a5b65 100644 --- a/drivers/gpu/Makefile +++ b/drivers/gpu/Makefile @@ -1,3 +1,6 @@ -obj-y += drm/ vga/ +# drm/tegra depends on host1x, so if both drivers are built-in care must be +# taken to initialize them in the correct order. Link order is the only way +# to ensure this currently. obj-$(CONFIG_TEGRA_HOST1X) += host1x/ +obj-y += drm/ vga/ obj-$(CONFIG_IMX_IPUV3_CORE) += ipu-v3/ diff --git a/drivers/gpu/drm/tegra/drm.c b/drivers/gpu/drm/tegra/drm.c index d4f827593dfa..a0c18ae79029 100644 --- a/drivers/gpu/drm/tegra/drm.c +++ b/drivers/gpu/drm/tegra/drm.c @@ -912,7 +912,9 @@ static const struct of_device_id host1x_drm_subdevs[] = { }; static struct host1x_driver host1x_drm_driver = { - .name = "drm", + .driver = { + .name = "drm", + }, .probe = host1x_drm_probe, .remove = host1x_drm_remove, .subdevs = host1x_drm_subdevs, diff --git a/drivers/gpu/host1x/bus.c b/drivers/gpu/host1x/bus.c index 0b52f0ea8871..4a99c6416e6a 100644 --- a/drivers/gpu/host1x/bus.c +++ b/drivers/gpu/host1x/bus.c @@ -72,13 +72,14 @@ static void host1x_subdev_del(struct host1x_subdev *subdev) /** * host1x_device_parse_dt() - scan device tree and add matching subdevices */ -static int host1x_device_parse_dt(struct host1x_device *device) +static int host1x_device_parse_dt(struct host1x_device *device, + struct host1x_driver *driver) { struct device_node *np; int err; for_each_child_of_node(device->dev.parent->of_node, np) { - if (of_match_node(device->driver->subdevs, np) && + if (of_match_node(driver->subdevs, np) && of_device_is_available(np)) { err = host1x_subdev_add(device, np); if (err < 0) @@ -109,17 +110,12 @@ static void host1x_subdev_register(struct host1x_device *device, mutex_unlock(&device->clients_lock); mutex_unlock(&device->subdevs_lock); - /* - * When all subdevices have been registered, the composite device is - * ready to be probed. - */ if (list_empty(&device->subdevs)) { - err = device->driver->probe(device); + err = device_add(&device->dev); if (err < 0) - dev_err(&device->dev, "probe failed for %ps: %d\n", - device->driver, err); + dev_err(&device->dev, "failed to add: %d\n", err); else - device->bound = true; + device->registered = true; } } @@ -127,18 +123,16 @@ static void __host1x_subdev_unregister(struct host1x_device *device, struct host1x_subdev *subdev) { struct host1x_client *client = subdev->client; - int err; /* * If all subdevices have been activated, we're about to remove the * first active subdevice, so unload the driver first. 
*/ - if (list_empty(&device->subdevs) && device->bound) { - err = device->driver->remove(device); - if (err < 0) - dev_err(&device->dev, "remove failed: %d\n", err); - - device->bound = false; + if (list_empty(&device->subdevs)) { + if (device->registered) { + device->registered = false; + device_del(&device->dev); + } } /* @@ -265,20 +259,60 @@ static int host1x_del_client(struct host1x *host1x, return -ENODEV; } -static struct bus_type host1x_bus_type = { - .name = "host1x", -}; +static int host1x_device_match(struct device *dev, struct device_driver *drv) +{ + return strcmp(dev_name(dev), drv->name) == 0; +} -int host1x_bus_init(void) +static int host1x_device_probe(struct device *dev) { - return bus_register(&host1x_bus_type); + struct host1x_driver *driver = to_host1x_driver(dev->driver); + struct host1x_device *device = to_host1x_device(dev); + + if (driver->probe) + return driver->probe(device); + + return 0; } -void host1x_bus_exit(void) +static int host1x_device_remove(struct device *dev) { - bus_unregister(&host1x_bus_type); + struct host1x_driver *driver = to_host1x_driver(dev->driver); + struct host1x_device *device = to_host1x_device(dev); + + if (driver->remove) + return driver->remove(device); + + return 0; } +static void host1x_device_shutdown(struct device *dev) +{ + struct host1x_driver *driver = to_host1x_driver(dev->driver); + struct host1x_device *device = to_host1x_device(dev); + + if (driver->shutdown) + driver->shutdown(device); +} + +static const struct dev_pm_ops host1x_device_pm_ops = { + .suspend = pm_generic_suspend, + .resume = pm_generic_resume, + .freeze = pm_generic_freeze, + .thaw = pm_generic_thaw, + .poweroff = pm_generic_poweroff, + .restore = pm_generic_restore, +}; + +struct bus_type host1x_bus_type = { + .name = "host1x", + .match = host1x_device_match, + .probe = host1x_device_probe, + .remove = host1x_device_remove, + .shutdown = host1x_device_shutdown, + .pm = &host1x_device_pm_ops, +}; + static void __host1x_device_del(struct host1x_device *device) { struct host1x_subdev *subdev, *sd; @@ -347,6 +381,8 @@ static int host1x_device_add(struct host1x *host1x, if (!device) return -ENOMEM; + device_initialize(&device->dev); + mutex_init(&device->subdevs_lock); INIT_LIST_HEAD(&device->subdevs); INIT_LIST_HEAD(&device->active); @@ -357,18 +393,14 @@ static int host1x_device_add(struct host1x *host1x, device->dev.coherent_dma_mask = host1x->dev->coherent_dma_mask; device->dev.dma_mask = &device->dev.coherent_dma_mask; + dev_set_name(&device->dev, "%s", driver->driver.name); device->dev.release = host1x_device_release; - dev_set_name(&device->dev, "%s", driver->name); device->dev.bus = &host1x_bus_type; device->dev.parent = host1x->dev; - err = device_register(&device->dev); - if (err < 0) - return err; - - err = host1x_device_parse_dt(device); + err = host1x_device_parse_dt(device, driver); if (err < 0) { - device_unregister(&device->dev); + kfree(device); return err; } @@ -399,7 +431,12 @@ static int host1x_device_add(struct host1x *host1x, static void host1x_device_del(struct host1x *host1x, struct host1x_device *device) { - device_unregister(&device->dev); + if (device->registered) { + device->registered = false; + device_del(&device->dev); + } + + put_device(&device->dev); } static void host1x_attach_driver(struct host1x *host1x, @@ -474,7 +511,8 @@ int host1x_unregister(struct host1x *host1x) return 0; } -int host1x_driver_register(struct host1x_driver *driver) +int host1x_driver_register_full(struct host1x_driver *driver, + struct module *owner) { 
struct host1x *host1x; @@ -491,9 +529,12 @@ int host1x_driver_register(struct host1x_driver *driver) mutex_unlock(&devices_lock); - return 0; + driver->driver.bus = &host1x_bus_type; + driver->driver.owner = owner; + + return driver_register(&driver->driver); } -EXPORT_SYMBOL(host1x_driver_register); +EXPORT_SYMBOL(host1x_driver_register_full); void host1x_driver_unregister(struct host1x_driver *driver) { diff --git a/drivers/gpu/host1x/bus.h b/drivers/gpu/host1x/bus.h index 4099e99212c8..88fb1c4aac68 100644 --- a/drivers/gpu/host1x/bus.h +++ b/drivers/gpu/host1x/bus.h @@ -18,10 +18,10 @@ #ifndef HOST1X_BUS_H #define HOST1X_BUS_H +struct bus_type; struct host1x; -int host1x_bus_init(void); -void host1x_bus_exit(void); +extern struct bus_type host1x_bus_type; int host1x_register(struct host1x *host1x); int host1x_unregister(struct host1x *host1x); diff --git a/drivers/gpu/host1x/dev.c b/drivers/gpu/host1x/dev.c index 2529908d304b..53d3d1d45b48 100644 --- a/drivers/gpu/host1x/dev.c +++ b/drivers/gpu/host1x/dev.c @@ -216,7 +216,7 @@ static int __init tegra_host1x_init(void) { int err; - err = host1x_bus_init(); + err = bus_register(&host1x_bus_type); if (err < 0) return err; @@ -233,7 +233,7 @@ static int __init tegra_host1x_init(void) unregister_host1x: platform_driver_unregister(&tegra_host1x_driver); unregister_bus: - host1x_bus_exit(); + bus_unregister(&host1x_bus_type); return err; } module_init(tegra_host1x_init); @@ -242,7 +242,7 @@ static void __exit tegra_host1x_exit(void) { platform_driver_unregister(&tegra_mipi_driver); platform_driver_unregister(&tegra_host1x_driver); - host1x_bus_exit(); + bus_unregister(&host1x_bus_type); } module_exit(tegra_host1x_exit); diff --git a/include/linux/host1x.h b/include/linux/host1x.h index 7890b553d12e..464f33814a94 100644 --- a/include/linux/host1x.h +++ b/include/linux/host1x.h @@ -250,17 +250,29 @@ void host1x_job_unpin(struct host1x_job *job); struct host1x_device; struct host1x_driver { + struct device_driver driver; + const struct of_device_id *subdevs; struct list_head list; - const char *name; int (*probe)(struct host1x_device *device); int (*remove)(struct host1x_device *device); + void (*shutdown)(struct host1x_device *device); }; -int host1x_driver_register(struct host1x_driver *driver); +static inline struct host1x_driver * +to_host1x_driver(struct device_driver *driver) +{ + return container_of(driver, struct host1x_driver, driver); +} + +int host1x_driver_register_full(struct host1x_driver *driver, + struct module *owner); void host1x_driver_unregister(struct host1x_driver *driver); +#define host1x_driver_register(driver) \ + host1x_driver_register_full(driver, THIS_MODULE) + struct host1x_device { struct host1x_driver *driver; struct list_head list; @@ -273,7 +285,7 @@ struct host1x_device { struct mutex clients_lock; struct list_head clients; - bool bound; + bool registered; }; static inline struct host1x_device *to_host1x_device(struct device *dev) -- cgit v1.2.3 From e9fb8b7e4f6af17e3aa4835281e06fdc920341f9 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 6 Jan 2015 16:48:36 +0000 Subject: compat: Declare compat_sys_sigpending and compat_sys_sigprocmask prototypes __ARCH_WANT_SYS_SIGPENDING or __ARCH_WANT_SYS_SIGPROCMASK may be defined for compat support, but the corresponding prototypes are missing from linux/compat.h.
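For context, these prototypes only become visible on architectures that opt in by defining the corresponding macros, typically in their asm/unistd.h; a minimal sketch (hypothetical architecture):

    /* arch/foo/include/asm/unistd.h -- hypothetical architecture */
    #define __ARCH_WANT_SYS_SIGPENDING
    #define __ARCH_WANT_SYS_SIGPROCMASK

    /*
     * Code that then includes <linux/compat.h> sees, e.g.:
     *
     *   asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set);
     *
     * so wiring the compat syscalls into the syscall table no longer
     * depends on implicit declarations.
     */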
Signed-off-by: Catalin Marinas Acked-by: Andrew Morton Cc: Arnd Bergmann --- include/linux/compat.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compat.h b/include/linux/compat.h index 7450ca2ac1fc..ab25814690bc 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -689,6 +689,15 @@ asmlinkage long compat_sys_sendfile64(int out_fd, int in_fd, asmlinkage long compat_sys_sigaltstack(const compat_stack_t __user *uss_ptr, compat_stack_t __user *uoss_ptr); +#ifdef __ARCH_WANT_SYS_SIGPENDING +asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set); +#endif + +#ifdef __ARCH_WANT_SYS_SIGPROCMASK +asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *nset, + compat_old_sigset_t __user *oset); +#endif + int compat_restore_altstack(const compat_stack_t __user *uss); int __compat_save_altstack(compat_stack_t __user *, unsigned long); #define compat_save_altstack_ex(uss, sp) do { \ -- cgit v1.2.3 From 54e45c169dbce43cf46d00eb1521b655b6e4f9e9 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 6 Jan 2015 16:52:38 +0000 Subject: syscalls: Declare sys_*stat64 prototypes if __ARCH_WANT_(COMPAT_)STAT64 Currently, the sys_stat64, sys_fstat64 and sys_lstat64 prototypes are only declared if BITS_PER_LONG == 32. Following commit 0753f70f07fb (fs: Build sys_stat64() and friends if __ARCH_WANT_COMPAT_STAT64), the implementation of these functions is allowed on 64-bit systems for compat support. The patch changes the condition on the prototype declaration from BITS_PER_LONG == 32 to defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64). In addition, it moves the sys_fstatat64 prototype under the same #if block. Signed-off-by: Catalin Marinas Acked-by: Andrew Morton Cc: Arnd Bergmann --- include/linux/syscalls.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 85893d744901..76d1e38aabe1 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -410,12 +410,16 @@ asmlinkage long sys_newlstat(const char __user *filename, struct stat __user *statbuf); asmlinkage long sys_newfstat(unsigned int fd, struct stat __user *statbuf); asmlinkage long sys_ustat(unsigned dev, struct ustat __user *ubuf); -#if BITS_PER_LONG == 32 +#if defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64) asmlinkage long sys_stat64(const char __user *filename, struct stat64 __user *statbuf); asmlinkage long sys_fstat64(unsigned long fd, struct stat64 __user *statbuf); asmlinkage long sys_lstat64(const char __user *filename, struct stat64 __user *statbuf); +asmlinkage long sys_fstatat64(int dfd, const char __user *filename, + struct stat64 __user *statbuf, int flag); +#endif +#if BITS_PER_LONG == 32 asmlinkage long sys_truncate64(const char __user *path, loff_t length); asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length); #endif @@ -771,8 +775,6 @@ asmlinkage long sys_openat(int dfd, const char __user *filename, int flags, umode_t mode); asmlinkage long sys_newfstatat(int dfd, const char __user *filename, struct stat __user *statbuf, int flag); -asmlinkage long sys_fstatat64(int dfd, const char __user *filename, - struct stat64 __user *statbuf, int flag); asmlinkage long sys_readlinkat(int dfd, const char __user *path, char __user *buf, int bufsiz); asmlinkage long sys_utimensat(int dfd, const char __user *filename, -- cgit v1.2.3 From cfcf1682c4ca8f601a4702255958e0b1c9aa12cc Mon
Sep 17 00:00:00 2001 From: Jouni Malinen Date: Sat, 24 Jan 2015 19:52:05 +0200 Subject: cfg80211: Add new GCMP, CCMP-256, BIP-GMAC, BIP-CMAC-256 ciphers This makes cfg80211 aware of the GCMP, GCMP-256, CCMP-256, BIP-GMAC-128, BIP-GMAC-256, and BIP-CMAC-256 cipher suites. These new cipher suites were defined in IEEE Std 802.11ac-2013. Signed-off-by: Jouni Malinen Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 18 ++++++++++++++++++ net/wireless/util.c | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 4f4eea8a6288..dbf417bf25bf 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1994,9 +1994,15 @@ enum ieee80211_key_len { WLAN_KEY_LEN_WEP40 = 5, WLAN_KEY_LEN_WEP104 = 13, WLAN_KEY_LEN_CCMP = 16, + WLAN_KEY_LEN_CCMP_256 = 32, WLAN_KEY_LEN_TKIP = 32, WLAN_KEY_LEN_AES_CMAC = 16, WLAN_KEY_LEN_SMS4 = 32, + WLAN_KEY_LEN_GCMP = 16, + WLAN_KEY_LEN_GCMP_256 = 32, + WLAN_KEY_LEN_BIP_CMAC_256 = 32, + WLAN_KEY_LEN_BIP_GMAC_128 = 16, + WLAN_KEY_LEN_BIP_GMAC_256 = 32, }; #define IEEE80211_WEP_IV_LEN 4 @@ -2004,9 +2010,16 @@ enum ieee80211_key_len { #define IEEE80211_CCMP_HDR_LEN 8 #define IEEE80211_CCMP_MIC_LEN 8 #define IEEE80211_CCMP_PN_LEN 6 +#define IEEE80211_CCMP_256_HDR_LEN 8 +#define IEEE80211_CCMP_256_MIC_LEN 16 +#define IEEE80211_CCMP_256_PN_LEN 6 #define IEEE80211_TKIP_IV_LEN 8 #define IEEE80211_TKIP_ICV_LEN 4 #define IEEE80211_CMAC_PN_LEN 6 +#define IEEE80211_GMAC_PN_LEN 6 +#define IEEE80211_GCMP_HDR_LEN 8 +#define IEEE80211_GCMP_MIC_LEN 16 +#define IEEE80211_GCMP_PN_LEN 6 /* Public action codes */ enum ieee80211_pub_actioncode { @@ -2230,6 +2243,11 @@ enum ieee80211_sa_query_action { #define WLAN_CIPHER_SUITE_WEP104 0x000FAC05 #define WLAN_CIPHER_SUITE_AES_CMAC 0x000FAC06 #define WLAN_CIPHER_SUITE_GCMP 0x000FAC08 +#define WLAN_CIPHER_SUITE_GCMP_256 0x000FAC09 +#define WLAN_CIPHER_SUITE_CCMP_256 0x000FAC0A +#define WLAN_CIPHER_SUITE_BIP_GMAC_128 0x000FAC0B +#define WLAN_CIPHER_SUITE_BIP_GMAC_256 0x000FAC0C +#define WLAN_CIPHER_SUITE_BIP_CMAC_256 0x000FAC0D #define WLAN_CIPHER_SUITE_SMS4 0x00147201 diff --git a/net/wireless/util.c b/net/wireless/util.c index 08f136ad2ea5..919fee807dd9 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -230,6 +230,9 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, switch (params->cipher) { case WLAN_CIPHER_SUITE_TKIP: case WLAN_CIPHER_SUITE_CCMP: + case WLAN_CIPHER_SUITE_CCMP_256: + case WLAN_CIPHER_SUITE_GCMP: + case WLAN_CIPHER_SUITE_GCMP_256: /* Disallow pairwise keys with non-zero index unless it's WEP * or a vendor specific cipher (because current deployments use * pairwise WEP keys with non-zero indices and for vendor @@ -240,6 +243,9 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, return -EINVAL; break; case WLAN_CIPHER_SUITE_AES_CMAC: + case WLAN_CIPHER_SUITE_BIP_CMAC_256: + case WLAN_CIPHER_SUITE_BIP_GMAC_128: + case WLAN_CIPHER_SUITE_BIP_GMAC_256: /* Disallow BIP (group-only) cipher as pairwise cipher */ if (pairwise) return -EINVAL; @@ -261,6 +267,18 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, if (params->key_len != WLAN_KEY_LEN_CCMP) return -EINVAL; break; + case WLAN_CIPHER_SUITE_CCMP_256: + if (params->key_len != WLAN_KEY_LEN_CCMP_256) + return -EINVAL; + break; + case WLAN_CIPHER_SUITE_GCMP: + if (params->key_len != WLAN_KEY_LEN_GCMP) + return -EINVAL; + break; + case 
WLAN_CIPHER_SUITE_GCMP_256: + if (params->key_len != WLAN_KEY_LEN_GCMP_256) + return -EINVAL; + break; case WLAN_CIPHER_SUITE_WEP104: if (params->key_len != WLAN_KEY_LEN_WEP104) return -EINVAL; @@ -269,6 +287,18 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, if (params->key_len != WLAN_KEY_LEN_AES_CMAC) return -EINVAL; break; + case WLAN_CIPHER_SUITE_BIP_CMAC_256: + if (params->key_len != WLAN_KEY_LEN_BIP_CMAC_256) + return -EINVAL; + break; + case WLAN_CIPHER_SUITE_BIP_GMAC_128: + if (params->key_len != WLAN_KEY_LEN_BIP_GMAC_128) + return -EINVAL; + break; + case WLAN_CIPHER_SUITE_BIP_GMAC_256: + if (params->key_len != WLAN_KEY_LEN_BIP_GMAC_256) + return -EINVAL; + break; default: /* * We don't know anything about this algorithm, @@ -288,7 +318,13 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, return -EINVAL; case WLAN_CIPHER_SUITE_TKIP: case WLAN_CIPHER_SUITE_CCMP: + case WLAN_CIPHER_SUITE_CCMP_256: + case WLAN_CIPHER_SUITE_GCMP: + case WLAN_CIPHER_SUITE_GCMP_256: case WLAN_CIPHER_SUITE_AES_CMAC: + case WLAN_CIPHER_SUITE_BIP_CMAC_256: + case WLAN_CIPHER_SUITE_BIP_GMAC_128: + case WLAN_CIPHER_SUITE_BIP_GMAC_256: if (params->seq_len != 6) return -EINVAL; break; -- cgit v1.2.3 From 56c52da2d554f081e8fce58ecbcf6a40c605b95b Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Sat, 24 Jan 2015 19:52:08 +0200 Subject: mac80211: Add BIP-CMAC-256 cipher This allows mac80211 to configure BIP-CMAC-256 in the driver and also to use a software implementation within mac80211 when the driver does not support this with hardware acceleration. Signed-off-by: Jouni Malinen Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 9 +++++ net/mac80211/aes_cmac.c | 34 +++++++++++++---- net/mac80211/aes_cmac.h | 5 ++- net/mac80211/cfg.c | 2 + net/mac80211/debugfs_key.c | 4 ++ net/mac80211/key.c | 14 ++++++- net/mac80211/main.c | 13 ++++--- net/mac80211/rx.c | 21 ++++++++--- net/mac80211/tx.c | 3 ++ net/mac80211/wpa.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++ net/mac80211/wpa.h | 4 ++ 11 files changed, 180 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index dbf417bf25bf..b9c7897dc566 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1017,6 +1017,15 @@ struct ieee80211_mmie { u8 mic[8]; } __packed; +/* Management MIC information element (IEEE 802.11w) for GMAC and CMAC-256 */ +struct ieee80211_mmie_16 { + u8 element_id; + u8 length; + __le16 key_id; + u8 sequence_number[6]; + u8 mic[16]; +} __packed; + struct ieee80211_vendor_ie { u8 element_id; u8 len; diff --git a/net/mac80211/aes_cmac.c b/net/mac80211/aes_cmac.c index 9b9009f99551..4192806be3d3 100644 --- a/net/mac80211/aes_cmac.c +++ b/net/mac80211/aes_cmac.c @@ -18,8 +18,8 @@ #include "key.h" #include "aes_cmac.h" -#define AES_CMAC_KEY_LEN 16 #define CMAC_TLEN 8 /* CMAC TLen = 64 bits (8 octets) */ +#define CMAC_TLEN_256 16 /* CMAC TLen = 128 bits (16 octets) */ #define AAD_LEN 20 @@ -35,9 +35,9 @@ static void gf_mulx(u8 *pad) pad[AES_BLOCK_SIZE - 1] ^= 0x87; } - -static void aes_128_cmac_vector(struct crypto_cipher *tfm, size_t num_elem, - const u8 *addr[], const size_t *len, u8 *mac) +static void aes_cmac_vector(struct crypto_cipher *tfm, size_t num_elem, + const u8 *addr[], const size_t *len, u8 *mac, + size_t mac_len) { u8 cbc[AES_BLOCK_SIZE], pad[AES_BLOCK_SIZE]; const u8 *pos, *end; @@ -88,7 +88,7 @@ static void aes_128_cmac_vector(struct crypto_cipher *tfm, size_t num_elem,
for (i = 0; i < AES_BLOCK_SIZE; i++) pad[i] ^= cbc[i]; crypto_cipher_encrypt_one(tfm, pad, pad); - memcpy(mac, pad, CMAC_TLEN); + memcpy(mac, pad, mac_len); } @@ -107,17 +107,35 @@ void ieee80211_aes_cmac(struct crypto_cipher *tfm, const u8 *aad, addr[2] = zero; len[2] = CMAC_TLEN; - aes_128_cmac_vector(tfm, 3, addr, len, mic); + aes_cmac_vector(tfm, 3, addr, len, mic, CMAC_TLEN); } +void ieee80211_aes_cmac_256(struct crypto_cipher *tfm, const u8 *aad, + const u8 *data, size_t data_len, u8 *mic) +{ + const u8 *addr[3]; + size_t len[3]; + u8 zero[CMAC_TLEN_256]; + + memset(zero, 0, CMAC_TLEN_256); + addr[0] = aad; + len[0] = AAD_LEN; + addr[1] = data; + len[1] = data_len - CMAC_TLEN_256; + addr[2] = zero; + len[2] = CMAC_TLEN_256; + + aes_cmac_vector(tfm, 3, addr, len, mic, CMAC_TLEN_256); +} -struct crypto_cipher *ieee80211_aes_cmac_key_setup(const u8 key[]) +struct crypto_cipher *ieee80211_aes_cmac_key_setup(const u8 key[], + size_t key_len) { struct crypto_cipher *tfm; tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC); if (!IS_ERR(tfm)) - crypto_cipher_setkey(tfm, key, AES_CMAC_KEY_LEN); + crypto_cipher_setkey(tfm, key, key_len); return tfm; } diff --git a/net/mac80211/aes_cmac.h b/net/mac80211/aes_cmac.h index 0ce6487af795..3702041f44fd 100644 --- a/net/mac80211/aes_cmac.h +++ b/net/mac80211/aes_cmac.h @@ -11,9 +11,12 @@ #include -struct crypto_cipher *ieee80211_aes_cmac_key_setup(const u8 key[]); +struct crypto_cipher *ieee80211_aes_cmac_key_setup(const u8 key[], + size_t key_len); void ieee80211_aes_cmac(struct crypto_cipher *tfm, const u8 *aad, const u8 *data, size_t data_len, u8 *mic); +void ieee80211_aes_cmac_256(struct crypto_cipher *tfm, const u8 *aad, + const u8 *data, size_t data_len, u8 *mic); void ieee80211_aes_cmac_key_free(struct crypto_cipher *tfm); #endif /* AES_CMAC_H */ diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index ef84441c119c..b7e528bbecce 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -164,6 +164,7 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: case WLAN_CIPHER_SUITE_AES_CMAC: + case WLAN_CIPHER_SUITE_BIP_CMAC_256: case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: break; @@ -362,6 +363,7 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev, params.seq_len = 6; break; case WLAN_CIPHER_SUITE_AES_CMAC: + case WLAN_CIPHER_SUITE_BIP_CMAC_256: pn64 = atomic64_read(&key->u.aes_cmac.tx_pn); seq[0] = pn64; seq[1] = pn64 >> 8; diff --git a/net/mac80211/debugfs_key.c b/net/mac80211/debugfs_key.c index 64de07b16092..d1b60eb014a8 100644 --- a/net/mac80211/debugfs_key.c +++ b/net/mac80211/debugfs_key.c @@ -101,6 +101,7 @@ static ssize_t key_tx_spec_read(struct file *file, char __user *userbuf, (u8)(pn >> 16), (u8)(pn >> 8), (u8)pn); break; case WLAN_CIPHER_SUITE_AES_CMAC: + case WLAN_CIPHER_SUITE_BIP_CMAC_256: pn = atomic64_read(&key->u.aes_cmac.tx_pn); len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n", (u8)(pn >> 40), (u8)(pn >> 32), (u8)(pn >> 24), @@ -153,6 +154,7 @@ static ssize_t key_rx_spec_read(struct file *file, char __user *userbuf, len = p - buf; break; case WLAN_CIPHER_SUITE_AES_CMAC: + case WLAN_CIPHER_SUITE_BIP_CMAC_256: rpn = key->u.aes_cmac.rx_pn; p += scnprintf(p, sizeof(buf)+buf-p, "%02x%02x%02x%02x%02x%02x\n", @@ -191,6 +193,7 @@ static ssize_t key_replays_read(struct file *file, char __user *userbuf, len = scnprintf(buf, sizeof(buf), "%u\n", key->u.ccmp.replays); break; case 
WLAN_CIPHER_SUITE_AES_CMAC: + case WLAN_CIPHER_SUITE_BIP_CMAC_256: len = scnprintf(buf, sizeof(buf), "%u\n", key->u.aes_cmac.replays); break; @@ -214,6 +217,7 @@ static ssize_t key_icverrors_read(struct file *file, char __user *userbuf, switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_AES_CMAC: + case WLAN_CIPHER_SUITE_BIP_CMAC_256: len = scnprintf(buf, sizeof(buf), "%u\n", key->u.aes_cmac.icverrors); break; diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 83c61085c3f0..7ceea9d9fcd2 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -165,6 +165,7 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key) case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: case WLAN_CIPHER_SUITE_AES_CMAC: + case WLAN_CIPHER_SUITE_BIP_CMAC_256: case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: /* all of these we can do in software - if driver can */ @@ -417,8 +418,12 @@ ieee80211_key_alloc(u32 cipher, int idx, size_t key_len, } break; case WLAN_CIPHER_SUITE_AES_CMAC: + case WLAN_CIPHER_SUITE_BIP_CMAC_256: key->conf.iv_len = 0; - key->conf.icv_len = sizeof(struct ieee80211_mmie); + if (cipher == WLAN_CIPHER_SUITE_AES_CMAC) + key->conf.icv_len = sizeof(struct ieee80211_mmie); + else + key->conf.icv_len = sizeof(struct ieee80211_mmie_16); if (seq) for (j = 0; j < IEEE80211_CMAC_PN_LEN; j++) key->u.aes_cmac.rx_pn[j] = @@ -428,7 +433,7 @@ ieee80211_key_alloc(u32 cipher, int idx, size_t key_len, * it does not need to be initialized for every packet. */ key->u.aes_cmac.tfm = - ieee80211_aes_cmac_key_setup(key_data); + ieee80211_aes_cmac_key_setup(key_data, key_len); if (IS_ERR(key->u.aes_cmac.tfm)) { err = PTR_ERR(key->u.aes_cmac.tfm); kfree(key); @@ -481,6 +486,7 @@ static void ieee80211_key_free_common(struct ieee80211_key *key) ieee80211_aes_key_free(key->u.ccmp.tfm); break; case WLAN_CIPHER_SUITE_AES_CMAC: + case WLAN_CIPHER_SUITE_BIP_CMAC_256: ieee80211_aes_cmac_key_free(key->u.aes_cmac.tfm); break; case WLAN_CIPHER_SUITE_GCMP: @@ -804,6 +810,7 @@ void ieee80211_get_key_tx_seq(struct ieee80211_key_conf *keyconf, seq->ccmp.pn[0] = pn64 >> 40; break; case WLAN_CIPHER_SUITE_AES_CMAC: + case WLAN_CIPHER_SUITE_BIP_CMAC_256: pn64 = atomic64_read(&key->u.aes_cmac.tx_pn); seq->ccmp.pn[5] = pn64; seq->ccmp.pn[4] = pn64 >> 8; @@ -854,6 +861,7 @@ void ieee80211_get_key_rx_seq(struct ieee80211_key_conf *keyconf, memcpy(seq->ccmp.pn, pn, IEEE80211_CCMP_PN_LEN); break; case WLAN_CIPHER_SUITE_AES_CMAC: + case WLAN_CIPHER_SUITE_BIP_CMAC_256: if (WARN_ON(tid != 0)) return; pn = key->u.aes_cmac.rx_pn; @@ -897,6 +905,7 @@ void ieee80211_set_key_tx_seq(struct ieee80211_key_conf *keyconf, atomic64_set(&key->u.ccmp.tx_pn, pn64); break; case WLAN_CIPHER_SUITE_AES_CMAC: + case WLAN_CIPHER_SUITE_BIP_CMAC_256: pn64 = (u64)seq->aes_cmac.pn[5] | ((u64)seq->aes_cmac.pn[4] << 8) | ((u64)seq->aes_cmac.pn[3] << 16) | @@ -948,6 +957,7 @@ void ieee80211_set_key_rx_seq(struct ieee80211_key_conf *keyconf, memcpy(pn, seq->ccmp.pn, IEEE80211_CCMP_PN_LEN); break; case WLAN_CIPHER_SUITE_AES_CMAC: + case WLAN_CIPHER_SUITE_BIP_CMAC_256: if (WARN_ON(tid != 0)) return; pn = key->u.aes_cmac.rx_pn; diff --git a/net/mac80211/main.c b/net/mac80211/main.c index a5ad2d5bb29b..053a17c5023a 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -671,7 +671,8 @@ static int ieee80211_init_cipher_suites(struct ieee80211_local *local) WLAN_CIPHER_SUITE_GCMP_256, /* keep last -- depends on hw flags! 
*/ - WLAN_CIPHER_SUITE_AES_CMAC + WLAN_CIPHER_SUITE_AES_CMAC, + WLAN_CIPHER_SUITE_BIP_CMAC_256, }; if (local->hw.flags & IEEE80211_HW_SW_CRYPTO_CONTROL || @@ -710,7 +711,7 @@ static int ieee80211_init_cipher_suites(struct ieee80211_local *local) local->hw.wiphy->n_cipher_suites = ARRAY_SIZE(cipher_suites); if (!have_mfp) - local->hw.wiphy->n_cipher_suites--; + local->hw.wiphy->n_cipher_suites -= 2; if (!have_wep) { local->hw.wiphy->cipher_suites += 2; @@ -736,9 +737,9 @@ static int ieee80211_init_cipher_suites(struct ieee80211_local *local) if (have_wep) n_suites += 2; - /* check if we have AES_CMAC */ + /* check if we have AES_CMAC, BIP-CMAC-256 */ if (have_mfp) - n_suites++; + n_suites += 2; suites = kmalloc(sizeof(u32) * n_suites, GFP_KERNEL); if (!suites) @@ -755,8 +756,10 @@ static int ieee80211_init_cipher_suites(struct ieee80211_local *local) suites[w++] = WLAN_CIPHER_SUITE_WEP104; } - if (have_mfp) + if (have_mfp) { suites[w++] = WLAN_CIPHER_SUITE_AES_CMAC; + suites[w++] = WLAN_CIPHER_SUITE_BIP_CMAC_256; + } for (r = 0; r < local->hw.n_cipher_schemes; r++) suites[w++] = cs[r].cipher; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index e8c6ba5ce70b..93ebc9525478 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -647,6 +647,7 @@ static int ieee80211_get_mmie_keyidx(struct sk_buff *skb) { struct ieee80211_mgmt *hdr = (struct ieee80211_mgmt *) skb->data; struct ieee80211_mmie *mmie; + struct ieee80211_mmie_16 *mmie16; if (skb->len < 24 + sizeof(*mmie) || !is_multicast_ether_addr(hdr->da)) return -1; @@ -656,11 +657,18 @@ static int ieee80211_get_mmie_keyidx(struct sk_buff *skb) mmie = (struct ieee80211_mmie *) (skb->data + skb->len - sizeof(*mmie)); - if (mmie->element_id != WLAN_EID_MMIE || - mmie->length != sizeof(*mmie) - 2) - return -1; - - return le16_to_cpu(mmie->key_id); + if (mmie->element_id == WLAN_EID_MMIE && + mmie->length == sizeof(*mmie) - 2) + return le16_to_cpu(mmie->key_id); + + mmie16 = (struct ieee80211_mmie_16 *) + (skb->data + skb->len - sizeof(*mmie16)); + if (skb->len >= 24 + sizeof(*mmie16) && + mmie16->element_id == WLAN_EID_MMIE && + mmie16->length == sizeof(*mmie16) - 2) + return le16_to_cpu(mmie16->key_id); + + return -1; } static int iwl80211_get_cs_keyid(const struct ieee80211_cipher_scheme *cs, @@ -1660,6 +1668,9 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) case WLAN_CIPHER_SUITE_AES_CMAC: result = ieee80211_crypto_aes_cmac_decrypt(rx); break; + case WLAN_CIPHER_SUITE_BIP_CMAC_256: + result = ieee80211_crypto_aes_cmac_256_decrypt(rx); + break; case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: result = ieee80211_crypto_gcmp_decrypt(rx); diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index be57e0afd019..909c27be1fdc 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -639,6 +639,7 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) ieee80211_is_mgmt(hdr->frame_control); break; case WLAN_CIPHER_SUITE_AES_CMAC: + case WLAN_CIPHER_SUITE_BIP_CMAC_256: if (!ieee80211_is_mgmt(hdr->frame_control)) tx->key = NULL; break; @@ -1021,6 +1022,8 @@ ieee80211_tx_h_encrypt(struct ieee80211_tx_data *tx) tx, IEEE80211_CCMP_256_MIC_LEN); case WLAN_CIPHER_SUITE_AES_CMAC: return ieee80211_crypto_aes_cmac_encrypt(tx); + case WLAN_CIPHER_SUITE_BIP_CMAC_256: + return ieee80211_crypto_aes_cmac_256_encrypt(tx); case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: return ieee80211_crypto_gcmp_encrypt(tx); diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index ae654de9782a..549af118de9f 100644 --- a/net/mac80211/wpa.c 
+++ b/net/mac80211/wpa.c @@ -955,6 +955,48 @@ ieee80211_crypto_aes_cmac_encrypt(struct ieee80211_tx_data *tx) return TX_CONTINUE; } +ieee80211_tx_result +ieee80211_crypto_aes_cmac_256_encrypt(struct ieee80211_tx_data *tx) +{ + struct sk_buff *skb; + struct ieee80211_tx_info *info; + struct ieee80211_key *key = tx->key; + struct ieee80211_mmie_16 *mmie; + u8 aad[20]; + u64 pn64; + + if (WARN_ON(skb_queue_len(&tx->skbs) != 1)) + return TX_DROP; + + skb = skb_peek(&tx->skbs); + + info = IEEE80211_SKB_CB(skb); + + if (info->control.hw_key) + return TX_CONTINUE; + + if (WARN_ON(skb_tailroom(skb) < sizeof(*mmie))) + return TX_DROP; + + mmie = (struct ieee80211_mmie_16 *)skb_put(skb, sizeof(*mmie)); + mmie->element_id = WLAN_EID_MMIE; + mmie->length = sizeof(*mmie) - 2; + mmie->key_id = cpu_to_le16(key->conf.keyidx); + + /* PN = PN + 1 */ + pn64 = atomic64_inc_return(&key->u.aes_cmac.tx_pn); + + bip_ipn_set64(mmie->sequence_number, pn64); + + bip_aad(skb, aad); + + /* MIC = AES-256-CMAC(IGTK, AAD || Management Frame Body || MMIE, 128) + */ + ieee80211_aes_cmac_256(key->u.aes_cmac.tfm, aad, + skb->data + 24, skb->len - 24, mmie->mic); + + return TX_CONTINUE; +} ieee80211_rx_result ieee80211_crypto_aes_cmac_decrypt(struct ieee80211_rx_data *rx) @@ -1006,6 +1048,56 @@ ieee80211_crypto_aes_cmac_decrypt(struct ieee80211_rx_data *rx) return RX_CONTINUE; } +ieee80211_rx_result +ieee80211_crypto_aes_cmac_256_decrypt(struct ieee80211_rx_data *rx) +{ + struct sk_buff *skb = rx->skb; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); + struct ieee80211_key *key = rx->key; + struct ieee80211_mmie_16 *mmie; + u8 aad[20], mic[16], ipn[6]; + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; + + if (!ieee80211_is_mgmt(hdr->frame_control)) + return RX_CONTINUE; + + /* management frames are already linear */ + + if (skb->len < 24 + sizeof(*mmie)) + return RX_DROP_UNUSABLE; + + mmie = (struct ieee80211_mmie_16 *) + (skb->data + skb->len - sizeof(*mmie)); + if (mmie->element_id != WLAN_EID_MMIE || + mmie->length != sizeof(*mmie) - 2) + return RX_DROP_UNUSABLE; /* Invalid MMIE */ + + bip_ipn_swap(ipn, mmie->sequence_number); + + if (memcmp(ipn, key->u.aes_cmac.rx_pn, 6) <= 0) { + key->u.aes_cmac.replays++; + return RX_DROP_UNUSABLE; + } + + if (!(status->flag & RX_FLAG_DECRYPTED)) { + /* hardware didn't decrypt/verify MIC */ + bip_aad(skb, aad); + ieee80211_aes_cmac_256(key->u.aes_cmac.tfm, aad, + skb->data + 24, skb->len - 24, mic); + if (memcmp(mic, mmie->mic, sizeof(mmie->mic)) != 0) { + key->u.aes_cmac.icverrors++; + return RX_DROP_UNUSABLE; + } + } + + memcpy(key->u.aes_cmac.rx_pn, ipn, 6); + + /* Remove MMIE */ + skb_trim(skb, skb->len - sizeof(*mmie)); + + return RX_CONTINUE; +} + ieee80211_tx_result ieee80211_crypto_hw_encrypt(struct ieee80211_tx_data *tx) { diff --git a/net/mac80211/wpa.h b/net/mac80211/wpa.h index 43e109f27a89..06b7f167a176 100644 --- a/net/mac80211/wpa.h +++ b/net/mac80211/wpa.h @@ -32,8 +32,12 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx, ieee80211_tx_result ieee80211_crypto_aes_cmac_encrypt(struct ieee80211_tx_data *tx); +ieee80211_tx_result +ieee80211_crypto_aes_cmac_256_encrypt(struct ieee80211_tx_data *tx); ieee80211_rx_result ieee80211_crypto_aes_cmac_decrypt(struct ieee80211_rx_data *rx); +ieee80211_rx_result +ieee80211_crypto_aes_cmac_256_decrypt(struct ieee80211_rx_data *rx); ieee80211_tx_result ieee80211_crypto_hw_encrypt(struct ieee80211_tx_data *tx); ieee80211_rx_result -- cgit v1.2.3 From 05c80d75f10ad7d3f95444b65788d6a0bbb4380d Mon 
Sep 17 00:00:00 2001 From: Hans Verkuil Date: Fri, 19 Dec 2014 09:14:20 -0300 Subject: [media] hdmi: add new HDMI 2.0 defines Add new Video InfoFrame colorspace information introduced in HDMI 2.0 and new Audio Coding Extension Types, also from HDMI 2.0. HDMI_CONTENT_TYPE_NONE was renamed to _GRAPHICS since that's what it is called in CEA-861-F. Signed-off-by: Hans Verkuil Reviewed-by: Thierry Reding Acked-by: Thierry Reding Signed-off-by: Mauro Carvalho Chehab --- include/linux/hdmi.h | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hdmi.h b/include/linux/hdmi.h index cbb5790a35cd..5afc0bff2bbe 100644 --- a/include/linux/hdmi.h +++ b/include/linux/hdmi.h @@ -52,12 +52,18 @@ enum hdmi_colorspace { HDMI_COLORSPACE_RGB, HDMI_COLORSPACE_YUV422, HDMI_COLORSPACE_YUV444, + HDMI_COLORSPACE_YUV420, + HDMI_COLORSPACE_RESERVED4, + HDMI_COLORSPACE_RESERVED5, + HDMI_COLORSPACE_RESERVED6, + HDMI_COLORSPACE_IDO_DEFINED, }; enum hdmi_scan_mode { HDMI_SCAN_MODE_NONE, HDMI_SCAN_MODE_OVERSCAN, HDMI_SCAN_MODE_UNDERSCAN, + HDMI_SCAN_MODE_RESERVED, }; enum hdmi_colorimetry { @@ -71,6 +77,7 @@ enum hdmi_picture_aspect { HDMI_PICTURE_ASPECT_NONE, HDMI_PICTURE_ASPECT_4_3, HDMI_PICTURE_ASPECT_16_9, + HDMI_PICTURE_ASPECT_RESERVED, }; enum hdmi_active_aspect { @@ -92,12 +99,18 @@ enum hdmi_extended_colorimetry { HDMI_EXTENDED_COLORIMETRY_S_YCC_601, HDMI_EXTENDED_COLORIMETRY_ADOBE_YCC_601, HDMI_EXTENDED_COLORIMETRY_ADOBE_RGB, + + /* The following EC values are only defined in CEA-861-F. */ + HDMI_EXTENDED_COLORIMETRY_BT2020_CONST_LUM, + HDMI_EXTENDED_COLORIMETRY_BT2020, + HDMI_EXTENDED_COLORIMETRY_RESERVED, }; enum hdmi_quantization_range { HDMI_QUANTIZATION_RANGE_DEFAULT, HDMI_QUANTIZATION_RANGE_LIMITED, HDMI_QUANTIZATION_RANGE_FULL, + HDMI_QUANTIZATION_RANGE_RESERVED, }; /* non-uniform picture scaling */ @@ -114,7 +127,7 @@ enum hdmi_ycc_quantization_range { }; enum hdmi_content_type { - HDMI_CONTENT_TYPE_NONE, + HDMI_CONTENT_TYPE_GRAPHICS, HDMI_CONTENT_TYPE_PHOTO, HDMI_CONTENT_TYPE_CINEMA, HDMI_CONTENT_TYPE_GAME, @@ -194,6 +207,7 @@ enum hdmi_audio_coding_type { HDMI_AUDIO_CODING_TYPE_MLP, HDMI_AUDIO_CODING_TYPE_DST, HDMI_AUDIO_CODING_TYPE_WMA_PRO, + HDMI_AUDIO_CODING_TYPE_CXT, }; enum hdmi_audio_sample_size { @@ -216,9 +230,23 @@ enum hdmi_audio_sample_frequency { enum hdmi_audio_coding_type_ext { HDMI_AUDIO_CODING_TYPE_EXT_STREAM, + + /* + * The next three CXT values are defined in CEA-861-E only. + * They do not exist in older versions, and in CEA-861-F they are + * defined as 'Not in use'. + */ HDMI_AUDIO_CODING_TYPE_EXT_HE_AAC, HDMI_AUDIO_CODING_TYPE_EXT_HE_AAC_V2, HDMI_AUDIO_CODING_TYPE_EXT_MPEG_SURROUND, + + /* The following CXT values are only defined in CEA-861-F. */ + HDMI_AUDIO_CODING_TYPE_EXT_MPEG4_HE_AAC, + HDMI_AUDIO_CODING_TYPE_EXT_MPEG4_HE_AAC_V2, + HDMI_AUDIO_CODING_TYPE_EXT_MPEG4_AAC_LC, + HDMI_AUDIO_CODING_TYPE_EXT_DRA, + HDMI_AUDIO_CODING_TYPE_EXT_MPEG4_HE_AAC_SURROUND, + HDMI_AUDIO_CODING_TYPE_EXT_MPEG4_AAC_LC_SURROUND = 10, }; struct hdmi_audio_infoframe { -- cgit v1.2.3 From 2c676f378edb16cb68f7815581c8119fc43a4b85 Mon Sep 17 00:00:00 2001 From: Martin Bugge Date: Fri, 19 Dec 2014 09:14:21 -0300 Subject: [media] hdmi: added unpack and logging functions for InfoFrames When receiving video it is very useful to be able to unpack the InfoFrames. Logging is useful as well, both for transmitters and receivers. 
Especially when implementing the VIDIOC_LOG_STATUS ioctl (supported by many V4L2 drivers) for a receiver it is important to be able to easily log what the InfoFrame contains. This greatly simplifies debugging. Signed-off-by: Martin Bugge Signed-off-by: Hans Verkuil Acked-by: Thierry Reding Signed-off-by: Mauro Carvalho Chehab --- drivers/video/hdmi.c | 822 ++++++++++++++++++++++++++++++++++++++++++++++++++- include/linux/hdmi.h | 4 + 2 files changed, 819 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/hdmi.c b/drivers/video/hdmi.c index 9e758a8f890d..a7c6ae4e10e5 100644 --- a/drivers/video/hdmi.c +++ b/drivers/video/hdmi.c @@ -27,10 +27,12 @@ #include #include #include +#include -static void hdmi_infoframe_checksum(void *buffer, size_t size) +#define hdmi_log(fmt, ...) dev_printk(level, dev, fmt, ##__VA_ARGS__) + +static u8 hdmi_infoframe_checksum(u8 *ptr, size_t size) { - u8 *ptr = buffer; u8 csum = 0; size_t i; @@ -38,7 +40,14 @@ static void hdmi_infoframe_checksum(void *buffer, size_t size) for (i = 0; i < size; i++) csum += ptr[i]; - ptr[3] = 256 - csum; + return 256 - csum; +} + +static void hdmi_infoframe_set_checksum(void *buffer, size_t size) +{ + u8 *ptr = buffer; + + ptr[3] = hdmi_infoframe_checksum(buffer, size); } /** @@ -136,7 +145,7 @@ ssize_t hdmi_avi_infoframe_pack(struct hdmi_avi_infoframe *frame, void *buffer, ptr[11] = frame->right_bar & 0xff; ptr[12] = (frame->right_bar >> 8) & 0xff; - hdmi_infoframe_checksum(buffer, length); + hdmi_infoframe_set_checksum(buffer, length); return length; } @@ -206,7 +215,7 @@ ssize_t hdmi_spd_infoframe_pack(struct hdmi_spd_infoframe *frame, void *buffer, ptr[24] = frame->sdi; - hdmi_infoframe_checksum(buffer, length); + hdmi_infoframe_set_checksum(buffer, length); return length; } @@ -281,7 +290,7 @@ ssize_t hdmi_audio_infoframe_pack(struct hdmi_audio_infoframe *frame, if (frame->downmix_inhibit) ptr[4] |= BIT(7); - hdmi_infoframe_checksum(buffer, length); + hdmi_infoframe_set_checksum(buffer, length); return length; } @@ -373,7 +382,7 @@ ssize_t hdmi_vendor_infoframe_pack(struct hdmi_vendor_infoframe *frame, ptr[9] = (frame->s3d_ext_data & 0xf) << 4; } - hdmi_infoframe_checksum(buffer, length); + hdmi_infoframe_set_checksum(buffer, length); return length; } @@ -434,3 +443,802 @@ hdmi_infoframe_pack(union hdmi_infoframe *frame, void *buffer, size_t size) return length; } EXPORT_SYMBOL(hdmi_infoframe_pack); + +static const char *hdmi_infoframe_type_get_name(enum hdmi_infoframe_type type) +{ + if (type < 0x80 || type > 0x9f) + return "Invalid"; + switch (type) { + case HDMI_INFOFRAME_TYPE_VENDOR: + return "Vendor"; + case HDMI_INFOFRAME_TYPE_AVI: + return "Auxiliary Video Information (AVI)"; + case HDMI_INFOFRAME_TYPE_SPD: + return "Source Product Description (SPD)"; + case HDMI_INFOFRAME_TYPE_AUDIO: + return "Audio"; + } + return "Reserved"; +} + +static void hdmi_infoframe_log_header(const char *level, + struct device *dev, + struct hdmi_any_infoframe *frame) +{ + hdmi_log("HDMI infoframe: %s, version %u, length %u\n", + hdmi_infoframe_type_get_name(frame->type), + frame->version, frame->length); +} + +static const char *hdmi_colorspace_get_name(enum hdmi_colorspace colorspace) +{ + switch (colorspace) { + case HDMI_COLORSPACE_RGB: + return "RGB"; + case HDMI_COLORSPACE_YUV422: + return "YCbCr 4:2:2"; + case HDMI_COLORSPACE_YUV444: + return "YCbCr 4:4:4"; + case HDMI_COLORSPACE_YUV420: + return "YCbCr 4:2:0"; + case HDMI_COLORSPACE_RESERVED4: + return "Reserved (4)"; + case HDMI_COLORSPACE_RESERVED5: 
+ return "Reserved (5)"; + case HDMI_COLORSPACE_RESERVED6: + return "Reserved (6)"; + case HDMI_COLORSPACE_IDO_DEFINED: + return "IDO Defined"; + } + return "Invalid"; +} + +static const char *hdmi_scan_mode_get_name(enum hdmi_scan_mode scan_mode) +{ + switch (scan_mode) { + case HDMI_SCAN_MODE_NONE: + return "No Data"; + case HDMI_SCAN_MODE_OVERSCAN: + return "Overscan"; + case HDMI_SCAN_MODE_UNDERSCAN: + return "Underscan"; + case HDMI_SCAN_MODE_RESERVED: + return "Reserved"; + } + return "Invalid"; +} + +static const char *hdmi_colorimetry_get_name(enum hdmi_colorimetry colorimetry) +{ + switch (colorimetry) { + case HDMI_COLORIMETRY_NONE: + return "No Data"; + case HDMI_COLORIMETRY_ITU_601: + return "ITU601"; + case HDMI_COLORIMETRY_ITU_709: + return "ITU709"; + case HDMI_COLORIMETRY_EXTENDED: + return "Extended"; + } + return "Invalid"; +} + +static const char * +hdmi_picture_aspect_get_name(enum hdmi_picture_aspect picture_aspect) +{ + switch (picture_aspect) { + case HDMI_PICTURE_ASPECT_NONE: + return "No Data"; + case HDMI_PICTURE_ASPECT_4_3: + return "4:3"; + case HDMI_PICTURE_ASPECT_16_9: + return "16:9"; + case HDMI_PICTURE_ASPECT_RESERVED: + return "Reserved"; + } + return "Invalid"; +} + +static const char * +hdmi_active_aspect_get_name(enum hdmi_active_aspect active_aspect) +{ + if (active_aspect < 0 || active_aspect > 0xf) + return "Invalid"; + + switch (active_aspect) { + case HDMI_ACTIVE_ASPECT_16_9_TOP: + return "16:9 Top"; + case HDMI_ACTIVE_ASPECT_14_9_TOP: + return "14:9 Top"; + case HDMI_ACTIVE_ASPECT_16_9_CENTER: + return "16:9 Center"; + case HDMI_ACTIVE_ASPECT_PICTURE: + return "Same as Picture"; + case HDMI_ACTIVE_ASPECT_4_3: + return "4:3"; + case HDMI_ACTIVE_ASPECT_16_9: + return "16:9"; + case HDMI_ACTIVE_ASPECT_14_9: + return "14:9"; + case HDMI_ACTIVE_ASPECT_4_3_SP_14_9: + return "4:3 SP 14:9"; + case HDMI_ACTIVE_ASPECT_16_9_SP_14_9: + return "16:9 SP 14:9"; + case HDMI_ACTIVE_ASPECT_16_9_SP_4_3: + return "16:9 SP 4:3"; + } + return "Reserved"; +} + +static const char * +hdmi_extended_colorimetry_get_name(enum hdmi_extended_colorimetry ext_col) +{ + switch (ext_col) { + case HDMI_EXTENDED_COLORIMETRY_XV_YCC_601: + return "xvYCC 601"; + case HDMI_EXTENDED_COLORIMETRY_XV_YCC_709: + return "xvYCC 709"; + case HDMI_EXTENDED_COLORIMETRY_S_YCC_601: + return "sYCC 601"; + case HDMI_EXTENDED_COLORIMETRY_ADOBE_YCC_601: + return "Adobe YCC 601"; + case HDMI_EXTENDED_COLORIMETRY_ADOBE_RGB: + return "Adobe RGB"; + case HDMI_EXTENDED_COLORIMETRY_BT2020_CONST_LUM: + return "BT.2020 Constant Luminance"; + case HDMI_EXTENDED_COLORIMETRY_BT2020: + return "BT.2020"; + case HDMI_EXTENDED_COLORIMETRY_RESERVED: + return "Reserved"; + } + return "Invalid"; +} + +static const char * +hdmi_quantization_range_get_name(enum hdmi_quantization_range qrange) +{ + switch (qrange) { + case HDMI_QUANTIZATION_RANGE_DEFAULT: + return "Default"; + case HDMI_QUANTIZATION_RANGE_LIMITED: + return "Limited"; + case HDMI_QUANTIZATION_RANGE_FULL: + return "Full"; + case HDMI_QUANTIZATION_RANGE_RESERVED: + return "Reserved"; + } + return "Invalid"; +} + +static const char *hdmi_nups_get_name(enum hdmi_nups nups) +{ + switch (nups) { + case HDMI_NUPS_UNKNOWN: + return "Unknown Non-uniform Scaling"; + case HDMI_NUPS_HORIZONTAL: + return "Horizontally Scaled"; + case HDMI_NUPS_VERTICAL: + return "Vertically Scaled"; + case HDMI_NUPS_BOTH: + return "Horizontally and Vertically Scaled"; + } + return "Invalid"; +} + +static const char * +hdmi_ycc_quantization_range_get_name(enum hdmi_ycc_quantization_range 
qrange) +{ + switch (qrange) { + case HDMI_YCC_QUANTIZATION_RANGE_LIMITED: + return "Limited"; + case HDMI_YCC_QUANTIZATION_RANGE_FULL: + return "Full"; + } + return "Invalid"; +} + +static const char * +hdmi_content_type_get_name(enum hdmi_content_type content_type) +{ + switch (content_type) { + case HDMI_CONTENT_TYPE_GRAPHICS: + return "Graphics"; + case HDMI_CONTENT_TYPE_PHOTO: + return "Photo"; + case HDMI_CONTENT_TYPE_CINEMA: + return "Cinema"; + case HDMI_CONTENT_TYPE_GAME: + return "Game"; + } + return "Invalid"; +} + +/** + * hdmi_avi_infoframe_log() - log info of HDMI AVI infoframe + * @level: logging level + * @dev: device + * @frame: HDMI AVI infoframe + */ +static void hdmi_avi_infoframe_log(const char *level, + struct device *dev, + struct hdmi_avi_infoframe *frame) +{ + hdmi_infoframe_log_header(level, dev, + (struct hdmi_any_infoframe *)frame); + + hdmi_log(" colorspace: %s\n", + hdmi_colorspace_get_name(frame->colorspace)); + hdmi_log(" scan mode: %s\n", + hdmi_scan_mode_get_name(frame->scan_mode)); + hdmi_log(" colorimetry: %s\n", + hdmi_colorimetry_get_name(frame->colorimetry)); + hdmi_log(" picture aspect: %s\n", + hdmi_picture_aspect_get_name(frame->picture_aspect)); + hdmi_log(" active aspect: %s\n", + hdmi_active_aspect_get_name(frame->active_aspect)); + hdmi_log(" itc: %s\n", frame->itc ? "IT Content" : "No Data"); + hdmi_log(" extended colorimetry: %s\n", + hdmi_extended_colorimetry_get_name(frame->extended_colorimetry)); + hdmi_log(" quantization range: %s\n", + hdmi_quantization_range_get_name(frame->quantization_range)); + hdmi_log(" nups: %s\n", hdmi_nups_get_name(frame->nups)); + hdmi_log(" video code: %u\n", frame->video_code); + hdmi_log(" ycc quantization range: %s\n", + hdmi_ycc_quantization_range_get_name(frame->ycc_quantization_range)); + hdmi_log(" hdmi content type: %s\n", + hdmi_content_type_get_name(frame->content_type)); + hdmi_log(" pixel repeat: %u\n", frame->pixel_repeat); + hdmi_log(" bar top %u, bottom %u, left %u, right %u\n", + frame->top_bar, frame->bottom_bar, + frame->left_bar, frame->right_bar); +} + +static const char *hdmi_spd_sdi_get_name(enum hdmi_spd_sdi sdi) +{ + if (sdi < 0 || sdi > 0xff) + return "Invalid"; + switch (sdi) { + case HDMI_SPD_SDI_UNKNOWN: + return "Unknown"; + case HDMI_SPD_SDI_DSTB: + return "Digital STB"; + case HDMI_SPD_SDI_DVDP: + return "DVD Player"; + case HDMI_SPD_SDI_DVHS: + return "D-VHS"; + case HDMI_SPD_SDI_HDDVR: + return "HDD Videorecorder"; + case HDMI_SPD_SDI_DVC: + return "DVC"; + case HDMI_SPD_SDI_DSC: + return "DSC"; + case HDMI_SPD_SDI_VCD: + return "Video CD"; + case HDMI_SPD_SDI_GAME: + return "Game"; + case HDMI_SPD_SDI_PC: + return "PC General"; + case HDMI_SPD_SDI_BD: + return "Blu-Ray Disc (BD)"; + case HDMI_SPD_SDI_SACD: + return "Super Audio CD"; + case HDMI_SPD_SDI_HDDVD: + return "HD DVD"; + case HDMI_SPD_SDI_PMP: + return "PMP"; + } + return "Reserved"; +} + +/** + * hdmi_spd_infoframe_log() - log info of HDMI SPD infoframe + * @level: logging level + * @dev: device + * @frame: HDMI SPD infoframe + */ +static void hdmi_spd_infoframe_log(const char *level, + struct device *dev, + struct hdmi_spd_infoframe *frame) +{ + u8 buf[17]; + + hdmi_infoframe_log_header(level, dev, + (struct hdmi_any_infoframe *)frame); + + memset(buf, 0, sizeof(buf)); + + strncpy(buf, frame->vendor, 8); + hdmi_log(" vendor: %s\n", buf); + strncpy(buf, frame->product, 16); + hdmi_log(" product: %s\n", buf); + hdmi_log(" source device information: %s (0x%x)\n", + hdmi_spd_sdi_get_name(frame->sdi), frame->sdi); +} + 
+static const char * +hdmi_audio_coding_type_get_name(enum hdmi_audio_coding_type coding_type) +{ + switch (coding_type) { + case HDMI_AUDIO_CODING_TYPE_STREAM: + return "Refer to Stream Header"; + case HDMI_AUDIO_CODING_TYPE_PCM: + return "PCM"; + case HDMI_AUDIO_CODING_TYPE_AC3: + return "AC-3"; + case HDMI_AUDIO_CODING_TYPE_MPEG1: + return "MPEG1"; + case HDMI_AUDIO_CODING_TYPE_MP3: + return "MP3"; + case HDMI_AUDIO_CODING_TYPE_MPEG2: + return "MPEG2"; + case HDMI_AUDIO_CODING_TYPE_AAC_LC: + return "AAC"; + case HDMI_AUDIO_CODING_TYPE_DTS: + return "DTS"; + case HDMI_AUDIO_CODING_TYPE_ATRAC: + return "ATRAC"; + case HDMI_AUDIO_CODING_TYPE_DSD: + return "One Bit Audio"; + case HDMI_AUDIO_CODING_TYPE_EAC3: + return "Dolby Digital +"; + case HDMI_AUDIO_CODING_TYPE_DTS_HD: + return "DTS-HD"; + case HDMI_AUDIO_CODING_TYPE_MLP: + return "MAT (MLP)"; + case HDMI_AUDIO_CODING_TYPE_DST: + return "DST"; + case HDMI_AUDIO_CODING_TYPE_WMA_PRO: + return "WMA PRO"; + case HDMI_AUDIO_CODING_TYPE_CXT: + return "Refer to CXT"; + } + return "Invalid"; +} + +static const char * +hdmi_audio_sample_size_get_name(enum hdmi_audio_sample_size sample_size) +{ + switch (sample_size) { + case HDMI_AUDIO_SAMPLE_SIZE_STREAM: + return "Refer to Stream Header"; + case HDMI_AUDIO_SAMPLE_SIZE_16: + return "16 bit"; + case HDMI_AUDIO_SAMPLE_SIZE_20: + return "20 bit"; + case HDMI_AUDIO_SAMPLE_SIZE_24: + return "24 bit"; + } + return "Invalid"; +} + +static const char * +hdmi_audio_sample_frequency_get_name(enum hdmi_audio_sample_frequency freq) +{ + switch (freq) { + case HDMI_AUDIO_SAMPLE_FREQUENCY_STREAM: + return "Refer to Stream Header"; + case HDMI_AUDIO_SAMPLE_FREQUENCY_32000: + return "32 kHz"; + case HDMI_AUDIO_SAMPLE_FREQUENCY_44100: + return "44.1 kHz (CD)"; + case HDMI_AUDIO_SAMPLE_FREQUENCY_48000: + return "48 kHz"; + case HDMI_AUDIO_SAMPLE_FREQUENCY_88200: + return "88.2 kHz"; + case HDMI_AUDIO_SAMPLE_FREQUENCY_96000: + return "96 kHz"; + case HDMI_AUDIO_SAMPLE_FREQUENCY_176400: + return "176.4 kHz"; + case HDMI_AUDIO_SAMPLE_FREQUENCY_192000: + return "192 kHz"; + } + return "Invalid"; +} + +static const char * +hdmi_audio_coding_type_ext_get_name(enum hdmi_audio_coding_type_ext ctx) +{ + if (ctx < 0 || ctx > 0x1f) + return "Invalid"; + + switch (ctx) { + case HDMI_AUDIO_CODING_TYPE_EXT_STREAM: + return "Refer to CT"; + case HDMI_AUDIO_CODING_TYPE_EXT_HE_AAC: + return "HE AAC"; + case HDMI_AUDIO_CODING_TYPE_EXT_HE_AAC_V2: + return "HE AAC v2"; + case HDMI_AUDIO_CODING_TYPE_EXT_MPEG_SURROUND: + return "MPEG SURROUND"; + case HDMI_AUDIO_CODING_TYPE_EXT_MPEG4_HE_AAC: + return "MPEG-4 HE AAC"; + case HDMI_AUDIO_CODING_TYPE_EXT_MPEG4_HE_AAC_V2: + return "MPEG-4 HE AAC v2"; + case HDMI_AUDIO_CODING_TYPE_EXT_MPEG4_AAC_LC: + return "MPEG-4 AAC LC"; + case HDMI_AUDIO_CODING_TYPE_EXT_DRA: + return "DRA"; + case HDMI_AUDIO_CODING_TYPE_EXT_MPEG4_HE_AAC_SURROUND: + return "MPEG-4 HE AAC + MPEG Surround"; + case HDMI_AUDIO_CODING_TYPE_EXT_MPEG4_AAC_LC_SURROUND: + return "MPEG-4 AAC LC + MPEG Surround"; + } + return "Reserved"; +} + +/** + * hdmi_audio_infoframe_log() - log info of HDMI AUDIO infoframe + * @level: logging level + * @dev: device + * @frame: HDMI AUDIO infoframe + */ +static void hdmi_audio_infoframe_log(const char *level, + struct device *dev, + struct hdmi_audio_infoframe *frame) +{ + hdmi_infoframe_log_header(level, dev, + (struct hdmi_any_infoframe *)frame); + + if (frame->channels) + hdmi_log(" channels: %u\n", frame->channels - 1); + else + hdmi_log(" channels: Refer to stream header\n"); + hdmi_log(" 
coding type: %s\n", + hdmi_audio_coding_type_get_name(frame->coding_type)); + hdmi_log(" sample size: %s\n", + hdmi_audio_sample_size_get_name(frame->sample_size)); + hdmi_log(" sample frequency: %s\n", + hdmi_audio_sample_frequency_get_name(frame->sample_frequency)); + hdmi_log(" coding type ext: %s\n", + hdmi_audio_coding_type_ext_get_name(frame->coding_type_ext)); + hdmi_log(" channel allocation: 0x%x\n", + frame->channel_allocation); + hdmi_log(" level shift value: %u dB\n", + frame->level_shift_value); + hdmi_log(" downmix inhibit: %s\n", + frame->downmix_inhibit ? "Yes" : "No"); +} + +static const char * +hdmi_3d_structure_get_name(enum hdmi_3d_structure s3d_struct) +{ + if (s3d_struct < 0 || s3d_struct > 0xf) + return "Invalid"; + + switch (s3d_struct) { + case HDMI_3D_STRUCTURE_FRAME_PACKING: + return "Frame Packing"; + case HDMI_3D_STRUCTURE_FIELD_ALTERNATIVE: + return "Field Alternative"; + case HDMI_3D_STRUCTURE_LINE_ALTERNATIVE: + return "Line Alternative"; + case HDMI_3D_STRUCTURE_SIDE_BY_SIDE_FULL: + return "Side-by-side (Full)"; + case HDMI_3D_STRUCTURE_L_DEPTH: + return "L + Depth"; + case HDMI_3D_STRUCTURE_L_DEPTH_GFX_GFX_DEPTH: + return "L + Depth + Graphics + Graphics-depth"; + case HDMI_3D_STRUCTURE_TOP_AND_BOTTOM: + return "Top-and-Bottom"; + case HDMI_3D_STRUCTURE_SIDE_BY_SIDE_HALF: + return "Side-by-side (Half)"; + default: + break; + } + return "Reserved"; +} + +/** + * hdmi_vendor_infoframe_log() - log info of HDMI VENDOR infoframe + * @level: logging level + * @dev: device + * @frame: HDMI VENDOR infoframe + */ +static void +hdmi_vendor_any_infoframe_log(const char *level, + struct device *dev, + union hdmi_vendor_any_infoframe *frame) +{ + struct hdmi_vendor_infoframe *hvf = &frame->hdmi; + + hdmi_infoframe_log_header(level, dev, + (struct hdmi_any_infoframe *)frame); + + if (frame->any.oui != HDMI_IEEE_OUI) { + hdmi_log(" not a HDMI vendor infoframe\n"); + return; + } + if (hvf->vic == 0 && hvf->s3d_struct == HDMI_3D_STRUCTURE_INVALID) { + hdmi_log(" empty frame\n"); + return; + } + + if (hvf->vic) + hdmi_log(" HDMI VIC: %u\n", hvf->vic); + if (hvf->s3d_struct != HDMI_3D_STRUCTURE_INVALID) { + hdmi_log(" 3D structure: %s\n", + hdmi_3d_structure_get_name(hvf->s3d_struct)); + if (hvf->s3d_struct >= HDMI_3D_STRUCTURE_SIDE_BY_SIDE_HALF) + hdmi_log(" 3D extension data: %d\n", + hvf->s3d_ext_data); + } +} + +/** + * hdmi_infoframe_log() - log info of HDMI infoframe + * @level: logging level + * @dev: device + * @frame: HDMI infoframe + */ +void hdmi_infoframe_log(const char *level, + struct device *dev, + union hdmi_infoframe *frame) +{ + switch (frame->any.type) { + case HDMI_INFOFRAME_TYPE_AVI: + hdmi_avi_infoframe_log(level, dev, &frame->avi); + break; + case HDMI_INFOFRAME_TYPE_SPD: + hdmi_spd_infoframe_log(level, dev, &frame->spd); + break; + case HDMI_INFOFRAME_TYPE_AUDIO: + hdmi_audio_infoframe_log(level, dev, &frame->audio); + break; + case HDMI_INFOFRAME_TYPE_VENDOR: + hdmi_vendor_any_infoframe_log(level, dev, &frame->vendor); + break; + } +} +EXPORT_SYMBOL(hdmi_infoframe_log); + +/** + * hdmi_avi_infoframe_unpack() - unpack binary buffer to a HDMI AVI infoframe + * @buffer: source buffer + * @frame: HDMI AVI infoframe + * + * Unpacks the information contained in binary @buffer into a structured + * @frame of the HDMI Auxiliary Video (AVI) information frame. + * Also verifies the checksum as required by section 5.3.5 of the HDMI 1.4 + * specification. + * + * Returns 0 on success or a negative error code on failure. 
+ */ +static int hdmi_avi_infoframe_unpack(struct hdmi_avi_infoframe *frame, + void *buffer) +{ + u8 *ptr = buffer; + int ret; + + if (ptr[0] != HDMI_INFOFRAME_TYPE_AVI || + ptr[1] != 2 || + ptr[2] != HDMI_AVI_INFOFRAME_SIZE) + return -EINVAL; + + if (hdmi_infoframe_checksum(buffer, HDMI_INFOFRAME_SIZE(AVI)) != 0) + return -EINVAL; + + ret = hdmi_avi_infoframe_init(frame); + if (ret) + return ret; + + ptr += HDMI_INFOFRAME_HEADER_SIZE; + + frame->colorspace = (ptr[0] >> 5) & 0x3; + if (ptr[0] & 0x10) + frame->active_aspect = ptr[1] & 0xf; + if (ptr[0] & 0x8) { + frame->top_bar = (ptr[5] << 8) + ptr[6]; + frame->bottom_bar = (ptr[7] << 8) + ptr[8]; + } + if (ptr[0] & 0x4) { + frame->left_bar = (ptr[9] << 8) + ptr[10]; + frame->right_bar = (ptr[11] << 8) + ptr[12]; + } + frame->scan_mode = ptr[0] & 0x3; + + frame->colorimetry = (ptr[1] >> 6) & 0x3; + frame->picture_aspect = (ptr[1] >> 4) & 0x3; + frame->active_aspect = ptr[1] & 0xf; + + frame->itc = ptr[2] & 0x80 ? true : false; + frame->extended_colorimetry = (ptr[2] >> 4) & 0x7; + frame->quantization_range = (ptr[2] >> 2) & 0x3; + frame->nups = ptr[2] & 0x3; + + frame->video_code = ptr[3] & 0x7f; + frame->ycc_quantization_range = (ptr[4] >> 6) & 0x3; + frame->content_type = (ptr[4] >> 4) & 0x3; + + frame->pixel_repeat = ptr[4] & 0xf; + + return 0; +} + +/** + * hdmi_spd_infoframe_unpack() - unpack binary buffer to a HDMI SPD infoframe + * @buffer: source buffer + * @frame: HDMI SPD infoframe + * + * Unpacks the information contained in binary @buffer into a structured + * @frame of the HDMI Source Product Description (SPD) information frame. + * Also verifies the checksum as required by section 5.3.5 of the HDMI 1.4 + * specification. + * + * Returns 0 on success or a negative error code on failure. + */ +static int hdmi_spd_infoframe_unpack(struct hdmi_spd_infoframe *frame, + void *buffer) +{ + u8 *ptr = buffer; + int ret; + + if (ptr[0] != HDMI_INFOFRAME_TYPE_SPD || + ptr[1] != 1 || + ptr[2] != HDMI_SPD_INFOFRAME_SIZE) { + return -EINVAL; + } + + if (hdmi_infoframe_checksum(buffer, HDMI_INFOFRAME_SIZE(SPD)) != 0) + return -EINVAL; + + ptr += HDMI_INFOFRAME_HEADER_SIZE; + + ret = hdmi_spd_infoframe_init(frame, ptr, ptr + 8); + if (ret) + return ret; + + frame->sdi = ptr[24]; + + return 0; +} + +/** + * hdmi_audio_infoframe_unpack() - unpack binary buffer to a HDMI AUDIO infoframe + * @buffer: source buffer + * @frame: HDMI Audio infoframe + * + * Unpacks the information contained in binary @buffer into a structured + * @frame of the HDMI Audio information frame. + * Also verifies the checksum as required by section 5.3.5 of the HDMI 1.4 + * specification. + * + * Returns 0 on success or a negative error code on failure. + */ +static int hdmi_audio_infoframe_unpack(struct hdmi_audio_infoframe *frame, + void *buffer) +{ + u8 *ptr = buffer; + int ret; + + if (ptr[0] != HDMI_INFOFRAME_TYPE_AUDIO || + ptr[1] != 1 || + ptr[2] != HDMI_AUDIO_INFOFRAME_SIZE) { + return -EINVAL; + } + + if (hdmi_infoframe_checksum(buffer, HDMI_INFOFRAME_SIZE(AUDIO)) != 0) + return -EINVAL; + + ret = hdmi_audio_infoframe_init(frame); + if (ret) + return ret; + + ptr += HDMI_INFOFRAME_HEADER_SIZE; + + frame->channels = ptr[0] & 0x7; + frame->coding_type = (ptr[0] >> 4) & 0xf; + frame->sample_size = ptr[1] & 0x3; + frame->sample_frequency = (ptr[1] >> 2) & 0x7; + frame->coding_type_ext = ptr[2] & 0x1f; + frame->channel_allocation = ptr[3]; + frame->level_shift_value = (ptr[4] >> 3) & 0xf; + frame->downmix_inhibit = ptr[4] & 0x80 ? 
true : false; + + return 0; +} + +/** + * hdmi_vendor_infoframe_unpack() - unpack binary buffer to a HDMI vendor infoframe + * @buffer: source buffer + * @frame: HDMI Vendor infoframe + * + * Unpacks the information contained in binary @buffer into a structured + * @frame of the HDMI Vendor information frame. + * Also verifies the checksum as required by section 5.3.5 of the HDMI 1.4 + * specification. + * + * Returns 0 on success or a negative error code on failure. + */ +static int +hdmi_vendor_any_infoframe_unpack(union hdmi_vendor_any_infoframe *frame, + void *buffer) +{ + u8 *ptr = buffer; + size_t length; + int ret; + u8 hdmi_video_format; + struct hdmi_vendor_infoframe *hvf = &frame->hdmi; + + if (ptr[0] != HDMI_INFOFRAME_TYPE_VENDOR || + ptr[1] != 1 || + (ptr[2] != 5 && ptr[2] != 6)) + return -EINVAL; + + length = ptr[2]; + + if (hdmi_infoframe_checksum(buffer, + HDMI_INFOFRAME_HEADER_SIZE + length) != 0) + return -EINVAL; + + ptr += HDMI_INFOFRAME_HEADER_SIZE; + + /* HDMI OUI */ + if ((ptr[0] != 0x03) || + (ptr[1] != 0x0c) || + (ptr[2] != 0x00)) + return -EINVAL; + + hdmi_video_format = ptr[3] >> 5; + + if (hdmi_video_format > 0x2) + return -EINVAL; + + ret = hdmi_vendor_infoframe_init(hvf); + if (ret) + return ret; + + hvf->length = length; + + if (hdmi_video_format == 0x1) { + hvf->vic = ptr[4]; + } else if (hdmi_video_format == 0x2) { + hvf->s3d_struct = ptr[4] >> 4; + if (hvf->s3d_struct >= HDMI_3D_STRUCTURE_SIDE_BY_SIDE_HALF) { + if (length == 6) + hvf->s3d_ext_data = ptr[5] >> 4; + else + return -EINVAL; + } + } + + return 0; +} + +/** + * hdmi_infoframe_unpack() - unpack binary buffer to a HDMI infoframe + * @buffer: source buffer + * @frame: HDMI infoframe + * + * Unpacks the information contained in binary buffer @buffer into a structured + * @frame of a HDMI infoframe. + * Also verifies the checksum as required by section 5.3.5 of the HDMI 1.4 + * specification. + * + * Returns 0 on success or a negative error code on failure. 
+ */ +int hdmi_infoframe_unpack(union hdmi_infoframe *frame, void *buffer) +{ + int ret; + u8 *ptr = buffer; + + switch (ptr[0]) { + case HDMI_INFOFRAME_TYPE_AVI: + ret = hdmi_avi_infoframe_unpack(&frame->avi, buffer); + break; + case HDMI_INFOFRAME_TYPE_SPD: + ret = hdmi_spd_infoframe_unpack(&frame->spd, buffer); + break; + case HDMI_INFOFRAME_TYPE_AUDIO: + ret = hdmi_audio_infoframe_unpack(&frame->audio, buffer); + break; + case HDMI_INFOFRAME_TYPE_VENDOR: + ret = hdmi_vendor_any_infoframe_unpack(&frame->vendor, buffer); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} +EXPORT_SYMBOL(hdmi_infoframe_unpack); diff --git a/include/linux/hdmi.h b/include/linux/hdmi.h index 5afc0bff2bbe..2ff34315a1bb 100644 --- a/include/linux/hdmi.h +++ b/include/linux/hdmi.h @@ -25,6 +25,7 @@ #define __LINUX_HDMI_H_ #include +#include enum hdmi_infoframe_type { HDMI_INFOFRAME_TYPE_VENDOR = 0x81, @@ -327,5 +328,8 @@ union hdmi_infoframe { ssize_t hdmi_infoframe_pack(union hdmi_infoframe *frame, void *buffer, size_t size); +int hdmi_infoframe_unpack(union hdmi_infoframe *frame, void *buffer); +void hdmi_infoframe_log(const char *level, struct device *dev, + union hdmi_infoframe *frame); #endif /* _DRM_HDMI_H */ -- cgit v1.2.3 From dc189053e1a5ae606c56e432dae1afc28261a819 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Fri, 19 Dec 2014 09:14:22 -0300 Subject: [media] hdmi: rename HDMI_AUDIO_CODING_TYPE_EXT_STREAM to _EXT_CT As per the suggestion of Thierry Reding rename HDMI_AUDIO_CODING_TYPE_EXT_STREAM to HDMI_AUDIO_CODING_TYPE_EXT_CT to be consistent with the CEA-861 spec. Signed-off-by: Hans Verkuil Acked-by: Thierry Reding Signed-off-by: Mauro Carvalho Chehab --- drivers/video/hdmi.c | 2 +- include/linux/hdmi.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/hdmi.c b/drivers/video/hdmi.c index a7c6ae4e10e5..162689227a23 100644 --- a/drivers/video/hdmi.c +++ b/drivers/video/hdmi.c @@ -842,7 +842,7 @@ hdmi_audio_coding_type_ext_get_name(enum hdmi_audio_coding_type_ext ctx) return "Invalid"; switch (ctx) { - case HDMI_AUDIO_CODING_TYPE_EXT_STREAM: + case HDMI_AUDIO_CODING_TYPE_EXT_CT: return "Refer to CT"; case HDMI_AUDIO_CODING_TYPE_EXT_HE_AAC: return "HE AAC"; diff --git a/include/linux/hdmi.h b/include/linux/hdmi.h index 2ff34315a1bb..e9744202fa29 100644 --- a/include/linux/hdmi.h +++ b/include/linux/hdmi.h @@ -230,7 +230,8 @@ enum hdmi_audio_sample_frequency { }; enum hdmi_audio_coding_type_ext { - HDMI_AUDIO_CODING_TYPE_EXT_STREAM, + /* Refer to Audio Coding Type (CT) field in Data Byte 1 */ + HDMI_AUDIO_CODING_TYPE_EXT_CT, /* * The next three CXT values are defined in CEA-861-E only. -- cgit v1.2.3 From f27b37f5993a080700ccecdce9960d1563eccd36 Mon Sep 17 00:00:00 2001 From: Bintian Wang Date: Tue, 27 Jan 2015 20:50:29 +0800 Subject: regmap: correct the description of structure element in reg_field Fix incorrect description of structure element "msb", which is described as "reg". Signed-off-by: Bintian Wang Signed-off-by: Mark Brown --- include/linux/regmap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index 4419b99d8d6e..116655d92269 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -468,7 +468,7 @@ bool regmap_reg_in_ranges(unsigned int reg, * * @reg: Offset of the register within the regmap bank * @lsb: lsb of the register field. - * @reg: msb of the register field. 
+ * @msb: msb of the register field. * @id_size: port size if it has some ports * @id_offset: address offset for each ports */ -- cgit v1.2.3 From 72c66644673a61ad85d293de7a61e54b9bdc9682 Mon Sep 17 00:00:00 2001 From: Irina Tirdea Date: Sun, 11 Jan 2015 21:10:07 +0200 Subject: iio: core: Introduce ENERGY channel type Human activity sensors report the energy burnt by the user. One of these devices is Freescale's MMA9553L (http://www.freescale.com/files/sensors/doc/ref_manual/MMA9553LSWRM.pdf) that computes the number of calories based on weight and step rate. Introduce a new channel type ENERGY to export these values. Signed-off-by: Irina Tirdea Signed-off-by: Jonathan Cameron --- Documentation/ABI/testing/sysfs-bus-iio | 10 ++++++++++ drivers/iio/industrialio-core.c | 1 + include/linux/iio/types.h | 1 + 3 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio index 831db8623e4b..33118862fb8b 100644 --- a/Documentation/ABI/testing/sysfs-bus-iio +++ b/Documentation/ABI/testing/sysfs-bus-iio @@ -282,6 +282,7 @@ What: /sys/bus/iio/devices/iio:deviceX/in_current_scale What: /sys/bus/iio/devices/iio:deviceX/in_accel_scale What: /sys/bus/iio/devices/iio:deviceX/in_accel_peak_scale What: /sys/bus/iio/devices/iio:deviceX/in_anglvel_scale +What: /sys/bus/iio/devices/iio:deviceX/in_energy_scale What: /sys/bus/iio/devices/iio:deviceX/in_magn_scale What: /sys/bus/iio/devices/iio:deviceX/in_magn_x_scale What: /sys/bus/iio/devices/iio:deviceX/in_magn_y_scale @@ -1049,6 +1050,15 @@ Description: For a list of available output power modes read in_accel_power_mode_available. +What: /sys/.../iio:deviceX/in_energy_input +What: /sys/.../iio:deviceX/in_energy_raw +KernelVersion: 3.20 +Contact: linux-iio@vger.kernel.org +Description: + This attribute is used to read the energy value reported by the + device (e.g.: human activity sensors report energy burnt by the + user). Units after application of scale are Joules. + What: /sys/bus/iio/devices/iio:deviceX/store_eeprom KernelVersion: 3.4.0 Contact: linux-iio@vger.kernel.org diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index 69feb912515a..8d2c9ba85fd7 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -72,6 +72,7 @@ static const char * const iio_chan_type_name_spec[] = { [IIO_HUMIDITYRELATIVE] = "humidityrelative", [IIO_ACTIVITY] = "activity", [IIO_STEPS] = "steps", + [IIO_ENERGY] = "energy", }; static const char * const iio_modifier_names[] = { diff --git a/include/linux/iio/types.h b/include/linux/iio/types.h index 904dcbbf0e6f..26b8a1c5e2af 100644 --- a/include/linux/iio/types.h +++ b/include/linux/iio/types.h @@ -32,6 +32,7 @@ enum iio_chan_type { IIO_HUMIDITYRELATIVE, IIO_ACTIVITY, IIO_STEPS, + IIO_ENERGY, }; enum iio_modifier { -- cgit v1.2.3 From cc3c9eecaed65b26ee0661e9e9491fd8d48e3907 Mon Sep 17 00:00:00 2001 From: Irina Tirdea Date: Sun, 11 Jan 2015 21:10:08 +0200 Subject: iio: core: Introduce DISTANCE channel type Some devices export an estimation of the distance the user has covered since the last reset. One of these devices is Freescale's MMA9553L (http://www.freescale.com/files/sensors/doc/ref_manual/MMA9553LSWRM.pdf) that computes the distance based on the stride length and step rate. Introduce a new channel type DISTANCE to export these values.
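As an illustration only (not part of the patch), a minimal userspace sketch of polling the new channel; the iio:device0 index is an assumption, and per the ABI text above in_distance_input is already the processed value in meters, while in_distance_raw would need to be multiplied by in_distance_scale:

#include <stdio.h>

/* Assumed path; the deviceX index depends on probe order. */
#define DIST_ATTR "/sys/bus/iio/devices/iio:device0/in_distance_input"

int main(void)
{
	FILE *f = fopen(DIST_ATTR, "r");
	double meters;

	if (!f) {
		perror("fopen " DIST_ATTR);
		return 1;
	}
	/* The _input attribute is already scaled, so this is meters. */
	if (fscanf(f, "%lf", &meters) == 1)
		printf("distance covered: %.2f m\n", meters);
	fclose(f);
	return 0;
}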
Signed-off-by: Irina Tirdea Signed-off-by: Jonathan Cameron --- Documentation/ABI/testing/sysfs-bus-iio | 10 ++++++++++ drivers/iio/industrialio-core.c | 1 + include/linux/iio/types.h | 1 + 3 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio index 33118862fb8b..c627a9a1cd56 100644 --- a/Documentation/ABI/testing/sysfs-bus-iio +++ b/Documentation/ABI/testing/sysfs-bus-iio @@ -283,6 +283,7 @@ What: /sys/bus/iio/devices/iio:deviceX/in_accel_scale What: /sys/bus/iio/devices/iio:deviceX/in_accel_peak_scale What: /sys/bus/iio/devices/iio:deviceX/in_anglvel_scale What: /sys/bus/iio/devices/iio:deviceX/in_energy_scale +What: /sys/bus/iio/devices/iio:deviceX/in_distance_scale What: /sys/bus/iio/devices/iio:deviceX/in_magn_scale What: /sys/bus/iio/devices/iio:deviceX/in_magn_x_scale What: /sys/bus/iio/devices/iio:deviceX/in_magn_y_scale @@ -1059,6 +1060,15 @@ Description: device (e.g.: human activity sensors report energy burnt by the user). Units after application of scale are Joules. +What: /sys/.../iio:deviceX/in_distance_input +What: /sys/.../iio:deviceX/in_distance_raw +KernelVersion: 3.20 +Contact: linux-iio@vger.kernel.org +Description: + This attribute is used to read the distance covered by the user + since the last reboot while activated. Units after application + of scale are meters. + What: /sys/bus/iio/devices/iio:deviceX/store_eeprom KernelVersion: 3.4.0 Contact: linux-iio@vger.kernel.org diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index 8d2c9ba85fd7..655755b49ccd 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -73,6 +73,7 @@ static const char * const iio_chan_type_name_spec[] = { [IIO_ACTIVITY] = "activity", [IIO_STEPS] = "steps", [IIO_ENERGY] = "energy", + [IIO_DISTANCE] = "distance", }; static const char * const iio_modifier_names[] = { diff --git a/include/linux/iio/types.h b/include/linux/iio/types.h index 26b8a1c5e2af..a7de445222f4 100644 --- a/include/linux/iio/types.h +++ b/include/linux/iio/types.h @@ -33,6 +33,7 @@ enum iio_chan_type { IIO_ACTIVITY, IIO_STEPS, IIO_ENERGY, + IIO_DISTANCE, }; enum iio_modifier { -- cgit v1.2.3 From 5a1a932981415661827f7edd9e99943a2a3b7b67 Mon Sep 17 00:00:00 2001 From: Irina Tirdea Date: Sun, 11 Jan 2015 21:10:09 +0200 Subject: iio: core: Introduce IIO_VELOCITY and IIO_MOD_ROOT_SUM_SQUARED_X_Y_Z Some devices export the current speed value of the user. One of these devices is Freescale's MMA9553L (http://www.freescale.com/files/sensors/doc/ref_manual/MMA9553LSWRM.pdf) that computes the speed of the user based on the number of steps and stride length. Introduce a new channel type VELOCITY and a modifier for the magnitude or norm of the velocity vector, IIO_MOD_ROOT_SUM_SQUARED_X_Y_Z.
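A sketch of how a driver could declare such a channel (illustrative only, not taken from any in-tree driver; the choice of info mask bits is an assumption):

#include <linux/bitops.h>
#include <linux/iio/iio.h>
#include <linux/iio/types.h>

/*
 * Maps to in_velocity_sqrt(x^2+y^2+z^2)_raw and
 * in_velocity_sqrt(x^2+y^2+z^2)_scale in sysfs.
 */
static const struct iio_chan_spec pedometer_speed_channel = {
	.type = IIO_VELOCITY,
	.modified = 1,
	.channel2 = IIO_MOD_ROOT_SUM_SQUARED_X_Y_Z,
	.info_mask_separate = BIT(IIO_CHAN_INFO_RAW) |
			      BIT(IIO_CHAN_INFO_SCALE),
};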
Signed-off-by: Irina Tirdea Signed-off-by: Jonathan Cameron --- Documentation/ABI/testing/sysfs-bus-iio | 10 ++++++++++ drivers/iio/industrialio-core.c | 2 ++ include/linux/iio/types.h | 2 ++ 3 files changed, 14 insertions(+) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio index c627a9a1cd56..80b5efb1cdbf 100644 --- a/Documentation/ABI/testing/sysfs-bus-iio +++ b/Documentation/ABI/testing/sysfs-bus-iio @@ -295,6 +295,7 @@ What: /sys/bus/iio/devices/iio:deviceX/in_rot_from_north_true_tilt_comp_scale What: /sys/bus/iio/devices/iio:deviceX/in_pressureY_scale What: /sys/bus/iio/devices/iio:deviceX/in_pressure_scale What: /sys/bus/iio/devices/iio:deviceX/in_humidityrelative_scale +What: /sys/bus/iio/devices/iio:deviceX/in_velocity_sqrt(x^2+y^2+z^2)_scale KernelVersion: 2.6.35 Contact: linux-iio@vger.kernel.org Description: @@ -1164,3 +1165,12 @@ Contact: linux-iio@vger.kernel.org Description: This attribute is used to read the number of steps taken by the user since the last reboot while activated. + +What: /sys/.../iio:deviceX/in_velocity_sqrt(x^2+y^2+z^2)_input +What: /sys/.../iio:deviceX/in_velocity_sqrt(x^2+y^2+z^2)_raw +KernelVersion: 3.19 +Contact: linux-iio@vger.kernel.org +Description: + This attribute is used to read the current speed value of the + user (which is the norm or magnitude of the velocity vector). + Units after application of scale are m/s. diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index 655755b49ccd..18a8ab911aab 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -74,6 +74,7 @@ static const char * const iio_chan_type_name_spec[] = { [IIO_STEPS] = "steps", [IIO_ENERGY] = "energy", [IIO_DISTANCE] = "distance", + [IIO_VELOCITY] = "velocity", }; static const char * const iio_modifier_names[] = { @@ -99,6 +100,7 @@ static const char * const iio_modifier_names[] = { [IIO_MOD_JOGGING] = "jogging", [IIO_MOD_WALKING] = "walking", [IIO_MOD_STILL] = "still", + [IIO_MOD_ROOT_SUM_SQUARED_X_Y_Z] = "sqrt(x^2+y^2+z^2)", }; /* relies on pairs of these shared then separate */ diff --git a/include/linux/iio/types.h b/include/linux/iio/types.h index a7de445222f4..c3601c2c0a9d 100644 --- a/include/linux/iio/types.h +++ b/include/linux/iio/types.h @@ -34,6 +34,7 @@ enum iio_chan_type { IIO_STEPS, IIO_ENERGY, IIO_DISTANCE, + IIO_VELOCITY, }; enum iio_modifier { @@ -68,6 +69,7 @@ enum iio_modifier { IIO_MOD_JOGGING, IIO_MOD_WALKING, IIO_MOD_STILL, + IIO_MOD_ROOT_SUM_SQUARED_X_Y_Z, }; enum iio_event_type { -- cgit v1.2.3 From d37f6836fa285882450e28d1cbc5a9b624911e7e Mon Sep 17 00:00:00 2001 From: Irina Tirdea Date: Sun, 11 Jan 2015 21:10:10 +0200 Subject: iio: core: Introduce IIO_CHAN_INFO_CALIBWEIGHT Some devices need the weight of the user to compute other parameters. One of these devices is Freescale's MMA9553L (http://www.freescale.com/files/sensors/doc/ref_manual/MMA9553LSWRM.pdf) that needs the weight of the user to compute the number of calories burnt.
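For illustration, writing this calibration value from userspace might look like the sketch below; the device index and the 75 kg value are made up for the example, and the ABI entry added by this patch defines the unit as kg:

#include <stdio.h>

/* Assumed path; which iio:deviceX exposes the pedometer varies. */
#define CALIB_ATTR "/sys/bus/iio/devices/iio:device0/in_energy_calibweight"

int main(void)
{
	FILE *f = fopen(CALIB_ATTR, "w");

	if (!f) {
		perror("fopen " CALIB_ATTR);
		return 1;
	}
	fprintf(f, "75\n");	/* user weight in kg */
	fclose(f);
	return 0;
}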
Signed-off-by: Irina Tirdea Signed-off-by: Jonathan Cameron --- Documentation/ABI/testing/sysfs-bus-iio | 7 +++++++ drivers/iio/industrialio-core.c | 1 + include/linux/iio/iio.h | 1 + 3 files changed, 9 insertions(+) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio index 80b5efb1cdbf..71dc8db4388b 100644 --- a/Documentation/ABI/testing/sysfs-bus-iio +++ b/Documentation/ABI/testing/sysfs-bus-iio @@ -351,6 +351,13 @@ Description: to compute the stride length, distance, speed and activity type. +What: /sys/bus/iio/devices/iio:deviceX/in_energy_calibweight +KernelVersion: 3.20 +Contact: linux-iio@vger.kernel.org +Description: + Weight of the user (in kg). It is needed by some pedometers + to compute the calories burnt by the user. + What: /sys/bus/iio/devices/iio:deviceX/in_accel_scale_available What: /sys/.../iio:deviceX/in_voltageX_scale_available What: /sys/.../iio:deviceX/in_voltage-voltage_scale_available diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index 18a8ab911aab..4ee6fdfa92fe 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -125,6 +125,7 @@ static const char * const iio_chan_info_postfix[] = { [IIO_CHAN_INFO_INT_TIME] = "integration_time", [IIO_CHAN_INFO_ENABLE] = "en", [IIO_CHAN_INFO_CALIBHEIGHT] = "calibheight", + [IIO_CHAN_INFO_CALIBWEIGHT] = "calibweight", }; /** diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index 590202024857..51f16437dacc 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -40,6 +40,7 @@ enum iio_chan_info_enum { IIO_CHAN_INFO_INT_TIME, IIO_CHAN_INFO_ENABLE, IIO_CHAN_INFO_CALIBHEIGHT, + IIO_CHAN_INFO_CALIBWEIGHT, }; enum iio_shared_by { -- cgit v1.2.3 From 27be84236d75c13a83c45d850390f40b58401d97 Mon Sep 17 00:00:00 2001 From: Irina Tirdea Date: Sun, 11 Jan 2015 21:10:11 +0200 Subject: iio: core: Introduce CHANGE event type A step detector will generate an interrupt each time N steps are detected. A device that has such pedometer functionality is Freescale's MMA9553L: http://www.freescale.com/files/sensors/doc/ref_manual/MMA9553LSWRM.pdf. Introduce the IIO_EV_TYPE_CHANGE event type for events that are generated when the channel passes a threshold on the absolute change in value. Signed-off-by: Irina Tirdea Signed-off-by: Jonathan Cameron --- Documentation/ABI/testing/sysfs-bus-iio | 20 ++++++++++++++++---- drivers/iio/industrialio-event.c | 1 + .../staging/iio/Documentation/iio_event_monitor.c | 2 ++ include/linux/iio/types.h | 1 + 4 files changed, 20 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio index 71dc8db4388b..c03a1401b9ca 100644 --- a/Documentation/ABI/testing/sysfs-bus-iio +++ b/Documentation/ABI/testing/sysfs-bus-iio @@ -891,12 +891,24 @@ Description: number or direction is not specified, applies to all channels of this type. -What: /sys/.../events/in_steps_instance_en -KernelVersion: 3.19 +What: /sys/.../events/in_steps_change_en +KernelVersion: 3.20 Contact: linux-iio@vger.kernel.org Description: - Enables or disables step detection. Each time the user takes a step an - event of this type will be generated. + Event generated when channel passes a threshold on the absolute + change in value. E.g. for steps: a step change event is + generated each time the user takes N steps, where N is set using + in_steps_change_value.
+ +What: /sys/.../events/in_steps_change_value +KernelVersion: 3.20 +Contact: linux-iio@vger.kernel.org +Description: + Specifies the value of change threshold that the + device is comparing against for the events enabled by + [Y][_name]_roc[_rising|falling|]_en. E.g. for steps: + if set to 3, a step change event will be generated every 3 + steps. What: /sys/bus/iio/devices/iio:deviceX/trigger/current_trigger KernelVersion: 2.6.35 diff --git a/drivers/iio/industrialio-event.c b/drivers/iio/industrialio-event.c index 78cf115dc0d4..b33ce55eb695 100644 --- a/drivers/iio/industrialio-event.c +++ b/drivers/iio/industrialio-event.c @@ -198,6 +198,7 @@ static const char * const iio_ev_type_text[] = { [IIO_EV_TYPE_THRESH_ADAPTIVE] = "thresh_adaptive", [IIO_EV_TYPE_MAG_ADAPTIVE] = "mag_adaptive", [IIO_EV_TYPE_INSTANCE] = "instance", + [IIO_EV_TYPE_CHANGE] = "change", }; static const char * const iio_ev_dir_text[] = { diff --git a/drivers/staging/iio/Documentation/iio_event_monitor.c b/drivers/staging/iio/Documentation/iio_event_monitor.c index def236abcb3e..2e78d58b9b77 100644 --- a/drivers/staging/iio/Documentation/iio_event_monitor.c +++ b/drivers/staging/iio/Documentation/iio_event_monitor.c @@ -60,6 +60,7 @@ static const char * const iio_ev_type_text[] = { [IIO_EV_TYPE_THRESH_ADAPTIVE] = "thresh_adaptive", [IIO_EV_TYPE_MAG_ADAPTIVE] = "mag_adaptive", [IIO_EV_TYPE_INSTANCE] = "instance", + [IIO_EV_TYPE_CHANGE] = "change", }; static const char * const iio_ev_dir_text[] = { @@ -179,6 +180,7 @@ static bool event_is_known(struct iio_event_data *event) case IIO_EV_TYPE_THRESH_ADAPTIVE: case IIO_EV_TYPE_MAG_ADAPTIVE: case IIO_EV_TYPE_INSTANCE: + case IIO_EV_TYPE_CHANGE: break; default: return false; diff --git a/include/linux/iio/types.h b/include/linux/iio/types.h index c3601c2c0a9d..3ba3d6678412 100644 --- a/include/linux/iio/types.h +++ b/include/linux/iio/types.h @@ -79,6 +79,7 @@ enum iio_event_type { IIO_EV_TYPE_THRESH_ADAPTIVE, IIO_EV_TYPE_MAG_ADAPTIVE, IIO_EV_TYPE_INSTANCE, + IIO_EV_TYPE_CHANGE, }; enum iio_event_info { -- cgit v1.2.3 From 17a2cbc27981b85a09a48425c2614ae0cb7be8cd Mon Sep 17 00:00:00 2001 From: Irina Tirdea Date: Sun, 11 Jan 2015 21:10:12 +0200 Subject: iio: core: Remove IIO_EV_TYPE_INSTANCE By introducing IIO_EV_TYPE_CHANGE, IIO_EV_TYPE_INSTANCE becomes redundant. The effect of IIO_EV_TYPE_INSTANCE can be obtained by using IIO_EV_TYPE_CHANGE with IIO_EV_INFO_VALUE set to 1. Remove all instances of IIO_EV_TYPE_INSTANCE and replace them with IIO_EV_TYPE_CHANGE where needed. 
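A sketch of the resulting driver-side declaration (modelled on the iio_simple_dummy hunk in the diff below; adding IIO_EV_INFO_VALUE to the mask is an assumption here, shown only to illustrate the "every N steps" semantics described above):

#include <linux/bitops.h>
#include <linux/iio/iio.h>
#include <linux/iio/types.h>

/*
 * A step "change" event: fires once every N steps, where N is what
 * userspace writes to in_steps_change_value (IIO_EV_INFO_VALUE).
 */
static const struct iio_event_spec step_change_event = {
	.type = IIO_EV_TYPE_CHANGE,
	.dir = IIO_EV_DIR_NONE,
	.mask_separate = BIT(IIO_EV_INFO_ENABLE) |
			 BIT(IIO_EV_INFO_VALUE),
};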
Signed-off-by: Irina Tirdea Signed-off-by: Jonathan Cameron --- drivers/iio/industrialio-event.c | 1 - drivers/staging/iio/Documentation/iio_event_monitor.c | 2 -- drivers/staging/iio/iio_simple_dummy.c | 2 +- drivers/staging/iio/iio_simple_dummy_events.c | 4 ++-- include/linux/iio/types.h | 1 - 5 files changed, 3 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/iio/industrialio-event.c b/drivers/iio/industrialio-event.c index b33ce55eb695..a4b397048f71 100644 --- a/drivers/iio/industrialio-event.c +++ b/drivers/iio/industrialio-event.c @@ -197,7 +197,6 @@ static const char * const iio_ev_type_text[] = { [IIO_EV_TYPE_ROC] = "roc", [IIO_EV_TYPE_THRESH_ADAPTIVE] = "thresh_adaptive", [IIO_EV_TYPE_MAG_ADAPTIVE] = "mag_adaptive", - [IIO_EV_TYPE_INSTANCE] = "instance", [IIO_EV_TYPE_CHANGE] = "change", }; diff --git a/drivers/staging/iio/Documentation/iio_event_monitor.c b/drivers/staging/iio/Documentation/iio_event_monitor.c index 2e78d58b9b77..72c96aa6e992 100644 --- a/drivers/staging/iio/Documentation/iio_event_monitor.c +++ b/drivers/staging/iio/Documentation/iio_event_monitor.c @@ -59,7 +59,6 @@ static const char * const iio_ev_type_text[] = { [IIO_EV_TYPE_ROC] = "roc", [IIO_EV_TYPE_THRESH_ADAPTIVE] = "thresh_adaptive", [IIO_EV_TYPE_MAG_ADAPTIVE] = "mag_adaptive", - [IIO_EV_TYPE_INSTANCE] = "instance", [IIO_EV_TYPE_CHANGE] = "change", }; @@ -179,7 +178,6 @@ static bool event_is_known(struct iio_event_data *event) case IIO_EV_TYPE_ROC: case IIO_EV_TYPE_THRESH_ADAPTIVE: case IIO_EV_TYPE_MAG_ADAPTIVE: - case IIO_EV_TYPE_INSTANCE: case IIO_EV_TYPE_CHANGE: break; default: diff --git a/drivers/staging/iio/iio_simple_dummy.c b/drivers/staging/iio/iio_simple_dummy.c index 0b8611ac1003..e4520213f627 100644 --- a/drivers/staging/iio/iio_simple_dummy.c +++ b/drivers/staging/iio/iio_simple_dummy.c @@ -73,7 +73,7 @@ static const struct iio_event_spec iio_dummy_event = { * simple step detect event - triggered when a step is detected */ static const struct iio_event_spec step_detect_event = { - .type = IIO_EV_TYPE_INSTANCE, + .type = IIO_EV_TYPE_CHANGE, .dir = IIO_EV_DIR_NONE, .mask_separate = BIT(IIO_EV_INFO_ENABLE), }; diff --git a/drivers/staging/iio/iio_simple_dummy_events.c b/drivers/staging/iio/iio_simple_dummy_events.c index ac15a44ba271..a5cd3bb219fe 100644 --- a/drivers/staging/iio/iio_simple_dummy_events.c +++ b/drivers/staging/iio/iio_simple_dummy_events.c @@ -86,7 +86,7 @@ int iio_simple_dummy_write_event_config(struct iio_dev *indio_dev, } case IIO_STEPS: switch (type) { - case IIO_EV_TYPE_INSTANCE: + case IIO_EV_TYPE_CHANGE: st->event_en = state; break; default: @@ -201,7 +201,7 @@ static irqreturn_t iio_simple_dummy_event_handler(int irq, void *private) iio_push_event(indio_dev, IIO_EVENT_CODE(IIO_STEPS, 0, IIO_NO_MOD, IIO_EV_DIR_NONE, - IIO_EV_TYPE_INSTANCE, 0, 0, 0), + IIO_EV_TYPE_CHANGE, 0, 0, 0), iio_get_time_ns()); break; default: diff --git a/include/linux/iio/types.h b/include/linux/iio/types.h index 3ba3d6678412..580ed5bdb3fa 100644 --- a/include/linux/iio/types.h +++ b/include/linux/iio/types.h @@ -78,7 +78,6 @@ enum iio_event_type { IIO_EV_TYPE_ROC, IIO_EV_TYPE_THRESH_ADAPTIVE, IIO_EV_TYPE_MAG_ADAPTIVE, - IIO_EV_TYPE_INSTANCE, IIO_EV_TYPE_CHANGE, }; -- cgit v1.2.3 From 15a02c1f6dd7c2bb150c61d00ffb33f584ff2288 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Mon, 19 Jan 2015 18:05:28 -0800 Subject: clk: Add __clk_mux_determine_rate_closest Some clock drivers want to find the closest rate on the input of a mux instead of a rate that's less than or 
equal to the desired rate. Add a generic mux function to support this. Signed-off-by: Stephen Boyd Tested-by: Kenneth Westfield Signed-off-by: Michael Turquette --- drivers/clk/clk.c | 47 +++++++++++++++++++++++++++++++++++--------- include/linux/clk-provider.h | 8 +++++++- 2 files changed, 45 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index 30fff56e1f15..9fc209abcb48 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -690,14 +690,20 @@ struct clk *__clk_lookup(const char *name) return NULL; } -/* - * Helper for finding best parent to provide a given frequency. This can be used - * directly as a determine_rate callback (e.g. for a mux), or from a more - * complex clock that may combine a mux with other operations. - */ -long __clk_mux_determine_rate(struct clk_hw *hw, unsigned long rate, - unsigned long *best_parent_rate, - struct clk_hw **best_parent_p) +static bool mux_is_better_rate(unsigned long rate, unsigned long now, + unsigned long best, unsigned long flags) +{ + if (flags & CLK_MUX_ROUND_CLOSEST) + return abs(now - rate) < abs(best - rate); + + return now <= rate && now > best; +} + +static long +clk_mux_determine_rate_flags(struct clk_hw *hw, unsigned long rate, + unsigned long *best_parent_rate, + struct clk_hw **best_parent_p, + unsigned long flags) { struct clk *clk = hw->clk, *parent, *best_parent = NULL; int i, num_parents; @@ -725,7 +731,7 @@ long __clk_mux_determine_rate(struct clk_hw *hw, unsigned long rate, parent_rate = __clk_round_rate(parent, rate); else parent_rate = __clk_get_rate(parent); - if (parent_rate <= rate && parent_rate > best) { + if (mux_is_better_rate(rate, parent_rate, best, flags)) { best_parent = parent; best = parent_rate; } @@ -738,8 +744,31 @@ out: return best; } + +/* + * Helper for finding best parent to provide a given frequency. This can be used + * directly as a determine_rate callback (e.g. for a mux), or from a more + * complex clock that may combine a mux with other operations. + */ +long __clk_mux_determine_rate(struct clk_hw *hw, unsigned long rate, + unsigned long *best_parent_rate, + struct clk_hw **best_parent_p) +{ + return clk_mux_determine_rate_flags(hw, rate, best_parent_rate, + best_parent_p, 0); +} EXPORT_SYMBOL_GPL(__clk_mux_determine_rate); +long __clk_mux_determine_rate_closest(struct clk_hw *hw, unsigned long rate, + unsigned long *best_parent_rate, + struct clk_hw **best_parent_p) +{ + return clk_mux_determine_rate_flags(hw, rate, best_parent_rate, + best_parent_p, + CLK_MUX_ROUND_CLOSEST); +} +EXPORT_SYMBOL_GPL(__clk_mux_determine_rate_closest); + /*** clk api ***/ void __clk_unprepare(struct clk *clk) diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index ebb7055a6d84..ba858e90d5de 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -384,6 +384,8 @@ void clk_unregister_divider(struct clk *clk); * register, and mask of mux bits are in higher 16-bit of this register. * While setting the mux bits, higher 16-bit should also be updated to * indicate changing mux bits. + * CLK_MUX_ROUND_CLOSEST - Use the parent rate that is closest to the desired + * frequency. 
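+ * For example, with a 100 MHz target and parents at 90 MHz and
+ * 104 MHz, the default helper picks 90 MHz (best rate not above the
+ * target), while CLK_MUX_ROUND_CLOSEST picks 104 MHz, the smaller
+ * absolute error (illustrative numbers, not part of the original
+ * kernel-doc).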
*/ struct clk_mux { struct clk_hw hw; @@ -398,7 +400,8 @@ struct clk_mux { #define CLK_MUX_INDEX_ONE BIT(0) #define CLK_MUX_INDEX_BIT BIT(1) #define CLK_MUX_HIWORD_MASK BIT(2) -#define CLK_MUX_READ_ONLY BIT(3) /* mux setting cannot be changed */ +#define CLK_MUX_READ_ONLY BIT(3) /* mux can't be changed */ +#define CLK_MUX_ROUND_CLOSEST BIT(4) extern const struct clk_ops clk_mux_ops; extern const struct clk_ops clk_mux_ro_ops; @@ -556,6 +559,9 @@ struct clk *__clk_lookup(const char *name); long __clk_mux_determine_rate(struct clk_hw *hw, unsigned long rate, unsigned long *best_parent_rate, struct clk_hw **best_parent_p); +long __clk_mux_determine_rate_closest(struct clk_hw *hw, unsigned long rate, + unsigned long *best_parent_rate, + struct clk_hw **best_parent_p); /* * FIXME clock api without lock protection -- cgit v1.2.3 From bca9690b942654f668ffb5124b2bbd0ba0f007bb Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Mon, 19 Jan 2015 18:05:29 -0800 Subject: clk: divider: Make generic for usage elsewhere Some devices don't use mmio to interact with dividers. Split out the logic from the register read/write parts so that we can reuse the division logic elsewhere. Signed-off-by: Stephen Boyd Tested-by: Kenneth Westfield Signed-off-by: Michael Turquette --- drivers/clk/clk-divider.c | 212 ++++++++++++++++++++++++++----------------- include/linux/clk-provider.h | 11 +++ 2 files changed, 139 insertions(+), 84 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/clk-divider.c b/drivers/clk/clk-divider.c index c2bb9f679ec6..db7f8bce7467 100644 --- a/drivers/clk/clk-divider.c +++ b/drivers/clk/clk-divider.c @@ -30,7 +30,7 @@ #define to_clk_divider(_hw) container_of(_hw, struct clk_divider, hw) -#define div_mask(d) ((1 << ((d)->width)) - 1) +#define div_mask(width) ((1 << (width)) - 1) static unsigned int _get_table_maxdiv(const struct clk_div_table *table) { @@ -54,15 +54,16 @@ static unsigned int _get_table_mindiv(const struct clk_div_table *table) return mindiv; } -static unsigned int _get_maxdiv(struct clk_divider *divider) +static unsigned int _get_maxdiv(const struct clk_div_table *table, u8 width, + unsigned long flags) { - if (divider->flags & CLK_DIVIDER_ONE_BASED) - return div_mask(divider); - if (divider->flags & CLK_DIVIDER_POWER_OF_TWO) - return 1 << div_mask(divider); - if (divider->table) - return _get_table_maxdiv(divider->table); - return div_mask(divider) + 1; + if (flags & CLK_DIVIDER_ONE_BASED) + return div_mask(width); + if (flags & CLK_DIVIDER_POWER_OF_TWO) + return 1 << div_mask(width); + if (table) + return _get_table_maxdiv(table); + return div_mask(width) + 1; } static unsigned int _get_table_div(const struct clk_div_table *table, @@ -76,14 +77,15 @@ static unsigned int _get_table_div(const struct clk_div_table *table, return 0; } -static unsigned int _get_div(struct clk_divider *divider, unsigned int val) +static unsigned int _get_div(const struct clk_div_table *table, + unsigned int val, unsigned long flags) { - if (divider->flags & CLK_DIVIDER_ONE_BASED) + if (flags & CLK_DIVIDER_ONE_BASED) return val; - if (divider->flags & CLK_DIVIDER_POWER_OF_TWO) + if (flags & CLK_DIVIDER_POWER_OF_TWO) return 1 << val; - if (divider->table) - return _get_table_div(divider->table, val); + if (table) + return _get_table_div(table, val); return val + 1; } @@ -98,29 +100,28 @@ static unsigned int _get_table_val(const struct clk_div_table *table, return 0; } -static unsigned int _get_val(struct clk_divider *divider, unsigned int div) +static unsigned int _get_val(const 
struct clk_div_table *table, + unsigned int div, unsigned long flags) { - if (divider->flags & CLK_DIVIDER_ONE_BASED) + if (flags & CLK_DIVIDER_ONE_BASED) return div; - if (divider->flags & CLK_DIVIDER_POWER_OF_TWO) + if (flags & CLK_DIVIDER_POWER_OF_TWO) return __ffs(div); - if (divider->table) - return _get_table_val(divider->table, div); + if (table) + return _get_table_val(table, div); return div - 1; } -static unsigned long clk_divider_recalc_rate(struct clk_hw *hw, - unsigned long parent_rate) +unsigned long divider_recalc_rate(struct clk_hw *hw, unsigned long parent_rate, + unsigned int val, + const struct clk_div_table *table, + unsigned long flags) { - struct clk_divider *divider = to_clk_divider(hw); - unsigned int div, val; + unsigned int div; - val = clk_readl(divider->reg) >> divider->shift; - val &= div_mask(divider); - - div = _get_div(divider, val); + div = _get_div(table, val, flags); if (!div) { - WARN(!(divider->flags & CLK_DIVIDER_ALLOW_ZERO), + WARN(!(flags & CLK_DIVIDER_ALLOW_ZERO), "%s: Zero divisor and CLK_DIVIDER_ALLOW_ZERO not set\n", __clk_get_name(hw->clk)); return parent_rate; @@ -128,6 +129,20 @@ static unsigned long clk_divider_recalc_rate(struct clk_hw *hw, return DIV_ROUND_UP(parent_rate, div); } +EXPORT_SYMBOL_GPL(divider_recalc_rate); + +static unsigned long clk_divider_recalc_rate(struct clk_hw *hw, + unsigned long parent_rate) +{ + struct clk_divider *divider = to_clk_divider(hw); + unsigned int val; + + val = clk_readl(divider->reg) >> divider->shift; + val &= div_mask(divider->width); + + return divider_recalc_rate(hw, parent_rate, val, divider->table, + divider->flags); +} /* * The reverse of DIV_ROUND_UP: The maximum number which @@ -146,12 +161,13 @@ static bool _is_valid_table_div(const struct clk_div_table *table, return false; } -static bool _is_valid_div(struct clk_divider *divider, unsigned int div) +static bool _is_valid_div(const struct clk_div_table *table, unsigned int div, + unsigned long flags) { - if (divider->flags & CLK_DIVIDER_POWER_OF_TWO) + if (flags & CLK_DIVIDER_POWER_OF_TWO) return is_power_of_2(div); - if (divider->table) - return _is_valid_table_div(divider->table, div); + if (table) + return _is_valid_table_div(table, div); return true; } @@ -191,71 +207,76 @@ static int _round_down_table(const struct clk_div_table *table, int div) return down; } -static int _div_round_up(struct clk_divider *divider, - unsigned long parent_rate, unsigned long rate) +static int _div_round_up(const struct clk_div_table *table, + unsigned long parent_rate, unsigned long rate, + unsigned long flags) { int div = DIV_ROUND_UP(parent_rate, rate); - if (divider->flags & CLK_DIVIDER_POWER_OF_TWO) + if (flags & CLK_DIVIDER_POWER_OF_TWO) div = __roundup_pow_of_two(div); - if (divider->table) - div = _round_up_table(divider->table, div); + if (table) + div = _round_up_table(table, div); return div; } -static int _div_round_closest(struct clk_divider *divider, - unsigned long parent_rate, unsigned long rate) +static int _div_round_closest(const struct clk_div_table *table, + unsigned long parent_rate, unsigned long rate, + unsigned long flags) { int up, down, div; up = down = div = DIV_ROUND_CLOSEST(parent_rate, rate); - if (divider->flags & CLK_DIVIDER_POWER_OF_TWO) { + if (flags & CLK_DIVIDER_POWER_OF_TWO) { up = __roundup_pow_of_two(div); down = __rounddown_pow_of_two(div); - } else if (divider->table) { - up = _round_up_table(divider->table, div); - down = _round_down_table(divider->table, div); + } else if (table) { + up = _round_up_table(table, 
div); + down = _round_down_table(table, div); } return (up - div) <= (div - down) ? up : down; } -static int _div_round(struct clk_divider *divider, unsigned long parent_rate, - unsigned long rate) +static int _div_round(const struct clk_div_table *table, + unsigned long parent_rate, unsigned long rate, + unsigned long flags) { - if (divider->flags & CLK_DIVIDER_ROUND_CLOSEST) - return _div_round_closest(divider, parent_rate, rate); + if (flags & CLK_DIVIDER_ROUND_CLOSEST) + return _div_round_closest(table, parent_rate, rate, flags); - return _div_round_up(divider, parent_rate, rate); + return _div_round_up(table, parent_rate, rate, flags); } -static bool _is_best_div(struct clk_divider *divider, - unsigned long rate, unsigned long now, unsigned long best) +static bool _is_best_div(unsigned long rate, unsigned long now, + unsigned long best, unsigned long flags) { - if (divider->flags & CLK_DIVIDER_ROUND_CLOSEST) + if (flags & CLK_DIVIDER_ROUND_CLOSEST) return abs(rate - now) < abs(rate - best); return now <= rate && now > best; } -static int _next_div(struct clk_divider *divider, int div) +static int _next_div(const struct clk_div_table *table, int div, + unsigned long flags) { div++; - if (divider->flags & CLK_DIVIDER_POWER_OF_TWO) + if (flags & CLK_DIVIDER_POWER_OF_TWO) return __roundup_pow_of_two(div); - if (divider->table) - return _round_up_table(divider->table, div); + if (table) + return _round_up_table(table, div); return div; } static int clk_divider_bestdiv(struct clk_hw *hw, unsigned long rate, - unsigned long *best_parent_rate) + unsigned long *best_parent_rate, + const struct clk_div_table *table, u8 width, + unsigned long flags) { - struct clk_divider *divider = to_clk_divider(hw); int i, bestdiv = 0; unsigned long parent_rate, best = 0, now, maxdiv; unsigned long parent_rate_saved = *best_parent_rate; @@ -263,19 +284,11 @@ static int clk_divider_bestdiv(struct clk_hw *hw, unsigned long rate, if (!rate) rate = 1; - /* if read only, just return current value */ - if (divider->flags & CLK_DIVIDER_READ_ONLY) { - bestdiv = readl(divider->reg) >> divider->shift; - bestdiv &= div_mask(divider); - bestdiv = _get_div(divider, bestdiv); - return bestdiv; - } - - maxdiv = _get_maxdiv(divider); + maxdiv = _get_maxdiv(table, width, flags); if (!(__clk_get_flags(hw->clk) & CLK_SET_RATE_PARENT)) { parent_rate = *best_parent_rate; - bestdiv = _div_round(divider, parent_rate, rate); + bestdiv = _div_round(table, parent_rate, rate, flags); bestdiv = bestdiv == 0 ? 1 : bestdiv; bestdiv = bestdiv > maxdiv ? 
maxdiv : bestdiv; return bestdiv; @@ -287,8 +300,8 @@ static int clk_divider_bestdiv(struct clk_hw *hw, unsigned long rate, */ maxdiv = min(ULONG_MAX / rate, maxdiv); - for (i = 1; i <= maxdiv; i = _next_div(divider, i)) { - if (!_is_valid_div(divider, i)) + for (i = 1; i <= maxdiv; i = _next_div(table, i, flags)) { + if (!_is_valid_div(table, i, flags)) continue; if (rate * i == parent_rate_saved) { /* @@ -302,7 +315,7 @@ static int clk_divider_bestdiv(struct clk_hw *hw, unsigned long rate, parent_rate = __clk_round_rate(__clk_get_parent(hw->clk), MULT_ROUND_UP(rate, i)); now = DIV_ROUND_UP(parent_rate, i); - if (_is_best_div(divider, rate, now, best)) { + if (_is_best_div(rate, now, best, flags)) { bestdiv = i; best = now; *best_parent_rate = parent_rate; @@ -310,48 +323,79 @@ static int clk_divider_bestdiv(struct clk_hw *hw, unsigned long rate, } if (!bestdiv) { - bestdiv = _get_maxdiv(divider); + bestdiv = _get_maxdiv(table, width, flags); *best_parent_rate = __clk_round_rate(__clk_get_parent(hw->clk), 1); } return bestdiv; } -static long clk_divider_round_rate(struct clk_hw *hw, unsigned long rate, - unsigned long *prate) +long divider_round_rate(struct clk_hw *hw, unsigned long rate, + unsigned long *prate, const struct clk_div_table *table, + u8 width, unsigned long flags) { int div; - div = clk_divider_bestdiv(hw, rate, prate); + + div = clk_divider_bestdiv(hw, rate, prate, table, width, flags); return DIV_ROUND_UP(*prate, div); } +EXPORT_SYMBOL_GPL(divider_round_rate); -static int clk_divider_set_rate(struct clk_hw *hw, unsigned long rate, - unsigned long parent_rate) +static long clk_divider_round_rate(struct clk_hw *hw, unsigned long rate, + unsigned long *prate) { struct clk_divider *divider = to_clk_divider(hw); + int bestdiv; + + /* if read only, just return current value */ + if (divider->flags & CLK_DIVIDER_READ_ONLY) { + bestdiv = readl(divider->reg) >> divider->shift; + bestdiv &= div_mask(divider->width); + bestdiv = _get_div(divider->table, bestdiv, divider->flags); + return bestdiv; + } + + return divider_round_rate(hw, rate, prate, divider->table, + divider->width, divider->flags); +} + +int divider_get_val(unsigned long rate, unsigned long parent_rate, + const struct clk_div_table *table, u8 width, + unsigned long flags) +{ unsigned int div, value; - unsigned long flags = 0; - u32 val; div = DIV_ROUND_UP(parent_rate, rate); - if (!_is_valid_div(divider, div)) + if (!_is_valid_div(table, div, flags)) return -EINVAL; - value = _get_val(divider, div); + value = _get_val(table, div, flags); + + return min_t(unsigned int, value, div_mask(width)); +} +EXPORT_SYMBOL_GPL(divider_get_val); + +static int clk_divider_set_rate(struct clk_hw *hw, unsigned long rate, + unsigned long parent_rate) +{ + struct clk_divider *divider = to_clk_divider(hw); + unsigned int value; + unsigned long flags = 0; + u32 val; - if (value > div_mask(divider)) - value = div_mask(divider); + value = divider_get_val(rate, parent_rate, divider->table, + divider->width, divider->flags); if (divider->lock) spin_lock_irqsave(divider->lock, flags); if (divider->flags & CLK_DIVIDER_HIWORD_MASK) { - val = div_mask(divider) << (divider->shift + 16); + val = div_mask(divider->width) << (divider->shift + 16); } else { val = clk_readl(divider->reg); - val &= ~(div_mask(divider) << divider->shift); + val &= ~(div_mask(divider->width) << divider->shift); } val |= value << divider->shift; clk_writel(val, divider->reg); diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 
ba858e90d5de..0ed5bf2209ad 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -353,6 +353,17 @@ struct clk_divider { #define CLK_DIVIDER_READ_ONLY BIT(5) extern const struct clk_ops clk_divider_ops; + +unsigned long divider_recalc_rate(struct clk_hw *hw, unsigned long parent_rate, + unsigned int val, const struct clk_div_table *table, + unsigned long flags); +long divider_round_rate(struct clk_hw *hw, unsigned long rate, + unsigned long *prate, const struct clk_div_table *table, + u8 width, unsigned long flags); +int divider_get_val(unsigned long rate, unsigned long parent_rate, + const struct clk_div_table *table, u8 width, + unsigned long flags); + struct clk *clk_register_divider(struct device *dev, const char *name, const char *parent_name, unsigned long flags, void __iomem *reg, u8 shift, u8 width, -- cgit v1.2.3 From e387088a03a07583f248a237cb00c5c955a394c9 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 22 Jan 2015 15:40:20 -0800 Subject: clk: ti: Drop use of clk-private.h These modules don't need to include clk-private.h. Replace the include with clk.h because these modules are clock consumers and also include clk-provider.h in clk/ti.h because struct clk_hw_omap has a struct clk_hw embedded in it. Cc: Tero Kristo Cc: Tony Lindgren Signed-off-by: Stephen Boyd Reviewed-by: Paul Walmsley Signed-off-by: Michael Turquette --- drivers/clk/ti/clk-44xx.c | 2 +- drivers/clk/ti/clk-54xx.c | 2 +- drivers/clk/ti/clk-7xx.c | 2 +- include/linux/clk/ti.h | 1 + 4 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/ti/clk-44xx.c b/drivers/clk/ti/clk-44xx.c index 02517a8206bd..4f4c87751db5 100644 --- a/drivers/clk/ti/clk-44xx.c +++ b/drivers/clk/ti/clk-44xx.c @@ -12,7 +12,7 @@ #include #include -#include <linux/clk-private.h> +#include <linux/clk.h> #include #include diff --git a/drivers/clk/ti/clk-54xx.c b/drivers/clk/ti/clk-54xx.c index 5e183993e3ec..14160b223548 100644 --- a/drivers/clk/ti/clk-54xx.c +++ b/drivers/clk/ti/clk-54xx.c @@ -12,7 +12,7 @@ #include #include -#include <linux/clk-private.h> +#include <linux/clk.h> #include #include #include diff --git a/drivers/clk/ti/clk-7xx.c b/drivers/clk/ti/clk-7xx.c index 62ac8f6e480c..ee32f4deebf4 100644 --- a/drivers/clk/ti/clk-7xx.c +++ b/drivers/clk/ti/clk-7xx.c @@ -12,7 +12,7 @@ #include #include -#include <linux/clk-private.h> +#include <linux/clk.h> #include #include diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h index 55ef529a0dbf..172d13fd8bea 100644 --- a/include/linux/clk/ti.h +++ b/include/linux/clk/ti.h @@ -15,6 +15,7 @@ #ifndef __LINUX_CLK_TI_H__ #define __LINUX_CLK_TI_H__ +#include <linux/clk-provider.h> #include /** -- cgit v1.2.3 From 2130fb97fecf9a51bb4a21da220cff3f72496a94 Mon Sep 17 00:00:00 2001 From: Christophe Ricard Date: Tue, 27 Jan 2015 01:18:19 +0100 Subject: NFC: st21nfca: Adding support for secure element st21nfca has one physical SWP line and can support up to two secure elements (UICC & eSE) thanks to an external switch managed with a GPIO. The platform integrator needs to specify, through two initialization properties, uicc-present and ese-present, whether the platform is supposed to have a UICC and/or an eSE. Of course, if the platform does not have an external switch, only one kind of secure element can be supported. These parameters are the platform integrator's responsibility. During initialization, the white_list will be set according to these parameters. The discovery_se function will assume a secure element is physically present based on the uicc-present and ese-present values and will add it to the secure element list.
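For the non-DT case, the two presence flags travel through struct st21nfca_nfc_platform_data (see the include/linux/platform_data/st21nfca.h hunk at the end of this patch). A minimal board-file sketch follows; the GPIO number and IRQ polarity are made-up example values, not taken from the patch:

	static struct st21nfca_nfc_platform_data st21nfca_pdata = {
		.gpio_ena	 = 42,			/* example GPIO (assumption) */
		.irq_polarity	 = IRQF_TRIGGER_RISING,	/* example value (assumption) */
		.is_uicc_present = true,		/* a UICC is wired on SWP */
		.is_ese_present	 = false,		/* no eSE on this board */
	};

For DT platforms, the i2c.c hunk below reads the same information from the "uicc-present" and "ese-present" boolean properties instead.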
On eSE activation, the ATR is retrieved in order to compute a command exchange timeout based on the first ATR(TB) value. se_io will allow transferring data over SWP. Two kinds of events may show up after data is sent: - ST21NFCA_EVT_TRANSMIT_DATA, when an APDU answer is received. - ST21NFCA_EVT_WTX_REQUEST, when the secure element needs more time than expected to compute a command. If the exchange timeout expires, a first recovery attempt consists of sending a proprietary software reset command. If that attempt still fails, a second recovery attempt consists of sending a proprietary hardware reset command. This mechanism is only relevant for eSE-like secure elements. This patch also changes the way a pipe is referenced. There can be different pipes connected to the same gate with different destination hosts (e.g. CONNECTIVITY). In order to keep the host information, every pipe is now referenced by a (gate, host) tuple. To limit the changes, the way a gate is addressed on the Terminal Host is kept unchanged. This works because the APDU reader gate is assumed to be present only on the eSE slot; the connectivity gate, on the other hand, cannot give a reliable value and will return the latest stored pipe value. Signed-off-by: Christophe Ricard Signed-off-by: Samuel Ortiz --- drivers/nfc/st21nfca/Makefile | 2 +- drivers/nfc/st21nfca/i2c.c | 17 +- drivers/nfc/st21nfca/st21nfca.c | 134 +++++++++-- drivers/nfc/st21nfca/st21nfca.h | 21 +- drivers/nfc/st21nfca/st21nfca_se.c | 390 +++++++++++++++++++++++++++++++++ drivers/nfc/st21nfca/st21nfca_se.h | 63 ++++++ include/linux/platform_data/st21nfca.h | 2 + 7 files changed, 608 insertions(+), 21 deletions(-) create mode 100644 drivers/nfc/st21nfca/st21nfca_se.c create mode 100644 drivers/nfc/st21nfca/st21nfca_se.h (limited to 'include/linux') diff --git a/drivers/nfc/st21nfca/Makefile b/drivers/nfc/st21nfca/Makefile index 7d688f97aa27..97edab4bbdf8 100644 --- a/drivers/nfc/st21nfca/Makefile +++ b/drivers/nfc/st21nfca/Makefile @@ -2,7 +2,7 @@ # Makefile for ST21NFCA HCI based NFC driver # -st21nfca_hci-objs = st21nfca.o st21nfca_dep.o +st21nfca_hci-objs = st21nfca.o st21nfca_dep.o st21nfca_se.o obj-$(CONFIG_NFC_ST21NFCA) += st21nfca_hci.o st21nfca_i2c-objs = i2c.o diff --git a/drivers/nfc/st21nfca/i2c.c b/drivers/nfc/st21nfca/i2c.c index 82b82dbd2997..a32143951616 100644 --- a/drivers/nfc/st21nfca/i2c.c +++ b/drivers/nfc/st21nfca/i2c.c @@ -74,6 +74,8 @@ struct st21nfca_i2c_phy { unsigned int gpio_ena; unsigned int irq_polarity; + struct st21nfca_se_status se_status; + struct sk_buff *pending_skb; int current_read_len; /* @@ -537,6 +539,11 @@ static int st21nfca_hci_i2c_of_request_resources(struct i2c_client *client) phy->irq_polarity = irq_get_trigger_type(client->irq); + phy->se_status.is_ese_present = + of_property_read_bool(pp, "ese-present"); + phy->se_status.is_uicc_present = + of_property_read_bool(pp, "uicc-present"); + return 0; } #else @@ -571,6 +578,9 @@ static int st21nfca_hci_i2c_request_resources(struct i2c_client *client) } } + phy->se_status.is_ese_present = pdata->is_ese_present; + phy->se_status.is_uicc_present = pdata->is_uicc_present; + return 0; } @@ -638,8 +648,11 @@ static int st21nfca_hci_i2c_probe(struct i2c_client *client, } return st21nfca_hci_probe(phy, &i2c_phy_ops, LLC_SHDLC_NAME, - ST21NFCA_FRAME_HEADROOM, ST21NFCA_FRAME_TAILROOM, - ST21NFCA_HCI_LLC_MAX_PAYLOAD, &phy->hdev); + ST21NFCA_FRAME_HEADROOM, + ST21NFCA_FRAME_TAILROOM, + ST21NFCA_HCI_LLC_MAX_PAYLOAD, + &phy->hdev, + &phy->se_status); } static int st21nfca_hci_i2c_remove(struct
i2c_client *client) diff --git a/drivers/nfc/st21nfca/st21nfca.c b/drivers/nfc/st21nfca/st21nfca.c index 7cb7ce47d2af..24d3d240d5f4 100644 --- a/drivers/nfc/st21nfca/st21nfca.c +++ b/drivers/nfc/st21nfca/st21nfca.c @@ -23,6 +23,7 @@ #include "st21nfca.h" #include "st21nfca_dep.h" +#include "st21nfca_se.h" #define DRIVER_DESC "HCI NFC driver for ST21NFCA" @@ -62,7 +63,6 @@ #define ST21NFCA_RF_CARD_F_DATARATE 0x08 #define ST21NFCA_RF_CARD_F_DATARATE_212_424 0x01 -#define ST21NFCA_DEVICE_MGNT_GATE 0x01 #define ST21NFCA_DEVICE_MGNT_PIPE 0x02 #define ST21NFCA_DM_GETINFO 0x13 @@ -78,6 +78,11 @@ #define ST21NFCA_NFC_MODE 0x03 /* NFC_MODE parameter*/ +#define ST21NFCA_EVT_HOT_PLUG 0x03 +#define ST21NFCA_EVT_HOT_PLUG_IS_INHIBITED(x) (x->data[0] & 0x80) + +#define ST21NFCA_SE_TO_PIPES 2000 + static DECLARE_BITMAP(dev_mask, ST21NFCA_NUM_DEVICES); static struct nfc_hci_gate st21nfca_gates[] = { @@ -92,6 +97,10 @@ static struct nfc_hci_gate st21nfca_gates[] = { {ST21NFCA_RF_READER_14443_3_A_GATE, NFC_HCI_INVALID_PIPE}, {ST21NFCA_RF_READER_ISO15693_GATE, NFC_HCI_INVALID_PIPE}, {ST21NFCA_RF_CARD_F_GATE, NFC_HCI_INVALID_PIPE}, + + /* Secure element pipes are created by secure element host */ + {ST21NFCA_CONNECTIVITY_GATE, NFC_HCI_DO_NOT_CREATE_PIPE}, + {ST21NFCA_APDU_READER_GATE, NFC_HCI_DO_NOT_CREATE_PIPE}, }; struct st21nfca_pipe_info { @@ -136,7 +145,8 @@ static int st21nfca_hci_load_session(struct nfc_hci_dev *hdev) * Pipe can be closed and need to be open. */ r = nfc_hci_connect_gate(hdev, NFC_HCI_HOST_CONTROLLER_ID, - ST21NFCA_DEVICE_MGNT_GATE, ST21NFCA_DEVICE_MGNT_PIPE); + ST21NFCA_DEVICE_MGNT_GATE, + ST21NFCA_DEVICE_MGNT_PIPE); if (r < 0) goto free_info; @@ -167,17 +177,28 @@ static int st21nfca_hci_load_session(struct nfc_hci_dev *hdev) * - destination gid (1byte) */ info = (struct st21nfca_pipe_info *) skb_pipe_info->data; + if (info->dst_gate_id == ST21NFCA_APDU_READER_GATE && + info->src_host_id != ST21NFCA_ESE_HOST_ID) { + pr_err("Unexpected apdu_reader pipe on host %x\n", + info->src_host_id); + continue; + } + for (j = 0; (j < ARRAY_SIZE(st21nfca_gates)) && - (st21nfca_gates[j].gate != info->dst_gate_id); - j++) + (st21nfca_gates[j].gate != info->dst_gate_id) ; j++) ; if (j < ARRAY_SIZE(st21nfca_gates) && st21nfca_gates[j].gate == info->dst_gate_id && ST21NFCA_DM_IS_PIPE_OPEN(info->pipe_state)) { st21nfca_gates[j].pipe = pipe_info[2]; + hdev->gate2pipe[st21nfca_gates[j].gate] = - st21nfca_gates[j].pipe; + st21nfca_gates[j].pipe; + hdev->pipes[st21nfca_gates[j].pipe].gate = + st21nfca_gates[j].gate; + hdev->pipes[st21nfca_gates[j].pipe].dest_host = + info->src_host_id; } } @@ -187,7 +208,7 @@ static int st21nfca_hci_load_session(struct nfc_hci_dev *hdev) */ if (skb_pipe_list->len + 3 < ARRAY_SIZE(st21nfca_gates)) { for (i = skb_pipe_list->len + 3; - i < ARRAY_SIZE(st21nfca_gates); i++) { + i < ARRAY_SIZE(st21nfca_gates) - 2; i++) { r = nfc_hci_connect_gate(hdev, NFC_HCI_HOST_CONTROLLER_ID, st21nfca_gates[i].gate, @@ -244,16 +265,33 @@ out: static int st21nfca_hci_ready(struct nfc_hci_dev *hdev) { + struct st21nfca_hci_info *info = nfc_hci_get_clientdata(hdev); struct sk_buff *skb; u8 param; + u8 white_list[2]; + int wl_size = 0; int r; - param = NFC_HCI_UICC_HOST_ID; - r = nfc_hci_set_param(hdev, NFC_HCI_ADMIN_GATE, - NFC_HCI_ADMIN_WHITELIST, ¶m, 1); - if (r < 0) - return r; + if (info->se_status->is_ese_present && + info->se_status->is_uicc_present) { + white_list[wl_size++] = NFC_HCI_UICC_HOST_ID; + white_list[wl_size++] = ST21NFCA_ESE_HOST_ID; + } else if (!info->se_status->is_ese_present 
&& + info->se_status->is_uicc_present) { + white_list[wl_size++] = NFC_HCI_UICC_HOST_ID; + } else if (info->se_status->is_ese_present && + !info->se_status->is_uicc_present) { + white_list[wl_size++] = ST21NFCA_ESE_HOST_ID; + } + + if (wl_size) { + r = nfc_hci_set_param(hdev, NFC_HCI_ADMIN_GATE, + NFC_HCI_ADMIN_WHITELIST, + (u8 *) &white_list, wl_size); + if (r < 0) + return r; + } /* Set NFC_MODE in device management gate to enable */ r = nfc_hci_get_param(hdev, ST21NFCA_DEVICE_MGNT_GATE, @@ -821,19 +859,79 @@ static int st21nfca_hci_check_presence(struct nfc_hci_dev *hdev, } } +static void st21nfca_hci_cmd_received(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd, + struct sk_buff *skb) +{ + struct st21nfca_hci_info *info = nfc_hci_get_clientdata(hdev); + u8 gate = hdev->pipes[pipe].gate; + + pr_debug("cmd: %x\n", cmd); + + switch (cmd) { + case NFC_HCI_ANY_OPEN_PIPE: + if (gate != ST21NFCA_APDU_READER_GATE && + hdev->pipes[pipe].dest_host != NFC_HCI_UICC_HOST_ID) + info->se_info.count_pipes++; + + if (info->se_info.count_pipes == info->se_info.expected_pipes) { + del_timer_sync(&info->se_info.se_active_timer); + info->se_info.se_active = false; + info->se_info.count_pipes = 0; + complete(&info->se_info.req_completion); + } + break; + } +} + +static int st21nfca_admin_event_received(struct nfc_hci_dev *hdev, u8 event, + struct sk_buff *skb) +{ + struct st21nfca_hci_info *info = nfc_hci_get_clientdata(hdev); + + pr_debug("admin event: %x\n", event); + + switch (event) { + case ST21NFCA_EVT_HOT_PLUG: + if (info->se_info.se_active) { + if (!ST21NFCA_EVT_HOT_PLUG_IS_INHIBITED(skb)) { + del_timer_sync(&info->se_info.se_active_timer); + info->se_info.se_active = false; + complete(&info->se_info.req_completion); + } else { + mod_timer(&info->se_info.se_active_timer, + jiffies + + msecs_to_jiffies(ST21NFCA_SE_TO_PIPES)); + } + } + break; + } + kfree_skb(skb); + return 0; +} + /* * Returns: * <= 0: driver handled the event, skb consumed * 1: driver does not handle the event, please do standard processing */ -static int st21nfca_hci_event_received(struct nfc_hci_dev *hdev, u8 gate, +static int st21nfca_hci_event_received(struct nfc_hci_dev *hdev, u8 pipe, u8 event, struct sk_buff *skb) { + u8 gate = hdev->pipes[pipe].gate; + u8 host = hdev->pipes[pipe].dest_host; + pr_debug("hci event: %d gate: %x\n", event, gate); switch (gate) { + case NFC_HCI_ADMIN_GATE: + return st21nfca_admin_event_received(hdev, event, skb); case ST21NFCA_RF_CARD_F_GATE: return st21nfca_dep_event_received(hdev, event, skb); + case ST21NFCA_CONNECTIVITY_GATE: + return st21nfca_connectivity_event_received(hdev, host, + event, skb); + case ST21NFCA_APDU_READER_GATE: + return st21nfca_apdu_reader_event_received(hdev, event, skb); default: return 1; } @@ -855,11 +953,17 @@ static struct nfc_hci_ops st21nfca_hci_ops = { .tm_send = st21nfca_hci_tm_send, .check_presence = st21nfca_hci_check_presence, .event_received = st21nfca_hci_event_received, + .cmd_received = st21nfca_hci_cmd_received, + .discover_se = st21nfca_hci_discover_se, + .enable_se = st21nfca_hci_enable_se, + .disable_se = st21nfca_hci_disable_se, + .se_io = st21nfca_hci_se_io, }; int st21nfca_hci_probe(void *phy_id, struct nfc_phy_ops *phy_ops, char *llc_name, int phy_headroom, int phy_tailroom, - int phy_payload, struct nfc_hci_dev **hdev) + int phy_payload, struct nfc_hci_dev **hdev, + struct st21nfca_se_status *se_status) { struct st21nfca_hci_info *info; int r = 0; @@ -919,6 +1023,8 @@ int st21nfca_hci_probe(void *phy_id, struct nfc_phy_ops *phy_ops, goto err_alloc_hdev; 
} + info->se_status = se_status; + nfc_hci_set_clientdata(info->hdev, info); r = nfc_hci_register_device(info->hdev); @@ -927,6 +1033,7 @@ int st21nfca_hci_probe(void *phy_id, struct nfc_phy_ops *phy_ops, *hdev = info->hdev; st21nfca_dep_init(info->hdev); + st21nfca_se_init(info->hdev); return 0; @@ -945,6 +1052,7 @@ void st21nfca_hci_remove(struct nfc_hci_dev *hdev) struct st21nfca_hci_info *info = nfc_hci_get_clientdata(hdev); st21nfca_dep_deinit(hdev); + st21nfca_se_deinit(hdev); nfc_hci_unregister_device(hdev); nfc_hci_free_device(hdev); kfree(info); diff --git a/drivers/nfc/st21nfca/st21nfca.h b/drivers/nfc/st21nfca/st21nfca.h index 7c2a85292230..15a78d330a9f 100644 --- a/drivers/nfc/st21nfca/st21nfca.h +++ b/drivers/nfc/st21nfca/st21nfca.h @@ -20,6 +20,7 @@ #include #include "st21nfca_dep.h" +#include "st21nfca_se.h" #define HCI_MODE 0 @@ -51,9 +52,15 @@ #define ST21NFCA_NUM_DEVICES 256 +struct st21nfca_se_status { + bool is_ese_present; + bool is_uicc_present; +}; + int st21nfca_hci_probe(void *phy_id, struct nfc_phy_ops *phy_ops, char *llc_name, int phy_headroom, int phy_tailroom, - int phy_payload, struct nfc_hci_dev **hdev); + int phy_payload, struct nfc_hci_dev **hdev, + struct st21nfca_se_status *se_status); void st21nfca_hci_remove(struct nfc_hci_dev *hdev); enum st21nfca_state { @@ -66,6 +73,7 @@ struct st21nfca_hci_info { void *phy_id; struct nfc_hci_dev *hdev; + struct st21nfca_se_status *se_status; enum st21nfca_state state; @@ -76,13 +84,16 @@ struct st21nfca_hci_info { void *async_cb_context; struct st21nfca_dep_info dep_info; + struct st21nfca_se_info se_info; }; /* Reader RF commands */ -#define ST21NFCA_WR_XCHG_DATA 0x10 - -#define ST21NFCA_RF_READER_F_GATE 0x14 +#define ST21NFCA_WR_XCHG_DATA 0x10 -#define ST21NFCA_RF_CARD_F_GATE 0x24 +#define ST21NFCA_DEVICE_MGNT_GATE 0x01 +#define ST21NFCA_RF_READER_F_GATE 0x14 +#define ST21NFCA_RF_CARD_F_GATE 0x24 +#define ST21NFCA_APDU_READER_GATE 0xf0 +#define ST21NFCA_CONNECTIVITY_GATE 0x41 #endif /* __LOCAL_ST21NFCA_H_ */ diff --git a/drivers/nfc/st21nfca/st21nfca_se.c b/drivers/nfc/st21nfca/st21nfca_se.c new file mode 100644 index 000000000000..9b93d3904ab5 --- /dev/null +++ b/drivers/nfc/st21nfca/st21nfca_se.c @@ -0,0 +1,390 @@ +/* + * Copyright (C) 2014 STMicroelectronics SAS. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ + +#include + +#include "st21nfca.h" +#include "st21nfca_se.h" + +#define ST21NFCA_EVT_UICC_ACTIVATE 0x10 +#define ST21NFCA_EVT_UICC_DEACTIVATE 0x13 +#define ST21NFCA_EVT_SE_HARD_RESET 0x20 +#define ST21NFCA_EVT_SE_SOFT_RESET 0x11 +#define ST21NFCA_EVT_SE_END_OF_APDU_TRANSFER 0x21 +#define ST21NFCA_EVT_SE_ACTIVATE 0x22 +#define ST21NFCA_EVT_SE_DEACTIVATE 0x23 + +#define ST21NFCA_EVT_TRANSMIT_DATA 0x10 +#define ST21NFCA_EVT_WTX_REQUEST 0x11 + +#define ST21NFCA_EVT_CONNECTIVITY 0x10 +#define ST21NFCA_EVT_TRANSACTION 0x12 + +#define ST21NFCA_ESE_HOST_ID 0xc0 + +#define ST21NFCA_SE_TO_HOT_PLUG 1000 +/* Connectivity pipe only */ +#define ST21NFCA_SE_COUNT_PIPE_UICC 0x01 +/* Connectivity + APDU Reader pipe */ +#define ST21NFCA_SE_COUNT_PIPE_EMBEDDED 0x02 + +#define ST21NFCA_SE_MODE_OFF 0x00 +#define ST21NFCA_SE_MODE_ON 0x01 + +#define ST21NFCA_PARAM_ATR 0x01 +#define ST21NFCA_ATR_DEFAULT_BWI 0x04 + +/* + * WT = 2^BWI/10[s], convert into msecs and add a secure + * room by increasing by 2 this timeout + */ +#define ST21NFCA_BWI_TO_TIMEOUT(x) ((1 << x) * 200) +#define ST21NFCA_ATR_GET_Y_FROM_TD(x) (x >> 4) + +/* If TA is present bit 0 is set */ +#define ST21NFCA_ATR_TA_PRESENT(x) (x & 0x01) +/* If TB is present bit 1 is set */ +#define ST21NFCA_ATR_TB_PRESENT(x) (x & 0x02) + +static u8 st21nfca_se_get_bwi(struct nfc_hci_dev *hdev) +{ + int i; + u8 td; + struct st21nfca_hci_info *info = nfc_hci_get_clientdata(hdev); + + /* Bits 8 to 5 of the first TB for T=1 encode BWI from zero to nine */ + for (i = 1; i < ST21NFCA_ESE_MAX_LENGTH; i++) { + td = ST21NFCA_ATR_GET_Y_FROM_TD(info->se_info.atr[i]); + if (ST21NFCA_ATR_TA_PRESENT(td)) + i++; + if (ST21NFCA_ATR_TB_PRESENT(td)) { + i++; + return info->se_info.atr[i] >> 4; + } + } + return ST21NFCA_ATR_DEFAULT_BWI; +} + +static void st21nfca_se_get_atr(struct nfc_hci_dev *hdev) +{ + int r; + struct sk_buff *skb; + struct st21nfca_hci_info *info = nfc_hci_get_clientdata(hdev); + + r = nfc_hci_get_param(hdev, ST21NFCA_APDU_READER_GATE, + ST21NFCA_PARAM_ATR, &skb); + if (r < 0) + return; + + if (skb->len <= ST21NFCA_ESE_MAX_LENGTH) { + memcpy(info->se_info.atr, skb->data, skb->len); + info->se_info.wt_timeout = + ST21NFCA_BWI_TO_TIMEOUT(st21nfca_se_get_bwi(hdev)); + } + kfree_skb(skb); +} + +static int st21nfca_hci_control_se(struct nfc_hci_dev *hdev, u32 se_idx, + u8 state) +{ + struct st21nfca_hci_info *info = nfc_hci_get_clientdata(hdev); + int r; + struct sk_buff *sk_host_list; + u8 se_event, host_id; + + switch (se_idx) { + case NFC_HCI_UICC_HOST_ID: + se_event = (state == ST21NFCA_SE_MODE_ON ? + ST21NFCA_EVT_UICC_ACTIVATE : + ST21NFCA_EVT_UICC_DEACTIVATE); + + info->se_info.count_pipes = 0; + info->se_info.expected_pipes = ST21NFCA_SE_COUNT_PIPE_UICC; + break; + case ST21NFCA_ESE_HOST_ID: + se_event = (state == ST21NFCA_SE_MODE_ON ? + ST21NFCA_EVT_SE_ACTIVATE : + ST21NFCA_EVT_SE_DEACTIVATE); + + info->se_info.count_pipes = 0; + info->se_info.expected_pipes = ST21NFCA_SE_COUNT_PIPE_EMBEDDED; + break; + default: + return -EINVAL; + } + + /* + * Wait for an EVT_HOT_PLUG in order to + * retrieve a relevant host list. 
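+ * The completion below is signalled either once the expected pipes
+ * have been re-created (see st21nfca_hci_cmd_received()), by the
+ * ST21NFCA_EVT_HOT_PLUG handler when pipe creation is not inhibited,
+ * or, failing that, by the se_active_timer expiring.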
+ */ + reinit_completion(&info->se_info.req_completion); + r = nfc_hci_send_event(hdev, ST21NFCA_DEVICE_MGNT_GATE, se_event, + NULL, 0); + if (r < 0) + return r; + + mod_timer(&info->se_info.se_active_timer, jiffies + + msecs_to_jiffies(ST21NFCA_SE_TO_HOT_PLUG)); + info->se_info.se_active = true; + + /* Ignore return value and check in any case the host_list */ + wait_for_completion_interruptible(&info->se_info.req_completion); + + r = nfc_hci_get_param(hdev, NFC_HCI_ADMIN_GATE, + NFC_HCI_ADMIN_HOST_LIST, + &sk_host_list); + if (r < 0) + return r; + + host_id = sk_host_list->data[sk_host_list->len - 1]; + kfree_skb(sk_host_list); + + if (state == ST21NFCA_SE_MODE_ON && host_id == se_idx) + return se_idx; + else if (state == ST21NFCA_SE_MODE_OFF && host_id != se_idx) + return se_idx; + + return -1; +} + +int st21nfca_hci_discover_se(struct nfc_hci_dev *hdev) +{ + struct st21nfca_hci_info *info = nfc_hci_get_clientdata(hdev); + int se_count = 0; + + if (info->se_status->is_uicc_present) { + nfc_add_se(hdev->ndev, NFC_HCI_UICC_HOST_ID, NFC_SE_UICC); + se_count++; + } + + if (info->se_status->is_ese_present) { + nfc_add_se(hdev->ndev, ST21NFCA_ESE_HOST_ID, NFC_SE_EMBEDDED); + se_count++; + } + + return !se_count; +} +EXPORT_SYMBOL(st21nfca_hci_discover_se); + +int st21nfca_hci_enable_se(struct nfc_hci_dev *hdev, u32 se_idx) +{ + int r; + + /* + * According to upper layer, se_idx == NFC_SE_UICC when + * info->se_status->is_uicc_enable is true should never happen. + * Same for eSE. + */ + r = st21nfca_hci_control_se(hdev, se_idx, ST21NFCA_SE_MODE_ON); + + if (r == ST21NFCA_ESE_HOST_ID) { + st21nfca_se_get_atr(hdev); + r = nfc_hci_send_event(hdev, ST21NFCA_APDU_READER_GATE, + ST21NFCA_EVT_SE_SOFT_RESET, NULL, 0); + if (r < 0) + return r; + } else if (r < 0) { + /* + * The activation tentative failed, the secure element + * is not connected. Remove from the list. + */ + nfc_remove_se(hdev->ndev, se_idx); + return r; + } + + return 0; +} +EXPORT_SYMBOL(st21nfca_hci_enable_se); + +int st21nfca_hci_disable_se(struct nfc_hci_dev *hdev, u32 se_idx) +{ + int r; + + /* + * According to upper layer, se_idx == NFC_SE_UICC when + * info->se_status->is_uicc_enable is true should never happen + * Same for eSE. + */ + r = st21nfca_hci_control_se(hdev, se_idx, ST21NFCA_SE_MODE_OFF); + if (r < 0) + return r; + + return 0; +} +EXPORT_SYMBOL(st21nfca_hci_disable_se); + +int st21nfca_hci_se_io(struct nfc_hci_dev *hdev, u32 se_idx, + u8 *apdu, size_t apdu_length, + se_io_cb_t cb, void *cb_context) +{ + struct st21nfca_hci_info *info = nfc_hci_get_clientdata(hdev); + + pr_debug("se_io %x\n", se_idx); + + switch (se_idx) { + case ST21NFCA_ESE_HOST_ID: + info->se_info.cb = cb; + info->se_info.cb_context = cb_context; + mod_timer(&info->se_info.bwi_timer, jiffies + + msecs_to_jiffies(info->se_info.wt_timeout)); + info->se_info.bwi_active = true; + return nfc_hci_send_event(hdev, ST21NFCA_APDU_READER_GATE, + ST21NFCA_EVT_TRANSMIT_DATA, + apdu, apdu_length); + default: + return -ENODEV; + } +} +EXPORT_SYMBOL(st21nfca_hci_se_io); + +static void st21nfca_se_wt_timeout(unsigned long data) +{ + /* + * No answer from the secure element + * within the defined timeout. + * Let's send a reset request as recovery procedure. + * According to the situation, we first try to send a software reset + * to the secure element. If the next command is still not + * answering in time, we send to the CLF a secure element hardware + * reset request. 
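+ * The xch_error flag below remembers whether the software reset has
+ * already been tried, so that a second expiry escalates to the
+ * hardware reset.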
+ */ + /* hardware reset managed through VCC_UICC_OUT power supply */ + u8 param = 0x01; + struct st21nfca_hci_info *info = (struct st21nfca_hci_info *) data; + + pr_debug("\n"); + + info->se_info.bwi_active = false; + + if (!info->se_info.xch_error) { + info->se_info.xch_error = true; + nfc_hci_send_event(info->hdev, ST21NFCA_APDU_READER_GATE, + ST21NFCA_EVT_SE_SOFT_RESET, NULL, 0); + } else { + info->se_info.xch_error = false; + nfc_hci_send_event(info->hdev, ST21NFCA_DEVICE_MGNT_GATE, + ST21NFCA_EVT_SE_HARD_RESET, ¶m, 1); + } + info->se_info.cb(info->se_info.cb_context, NULL, 0, -ETIME); +} + +static void st21nfca_se_activation_timeout(unsigned long data) +{ + struct st21nfca_hci_info *info = (struct st21nfca_hci_info *) data; + + pr_debug("\n"); + + info->se_info.se_active = false; + + complete(&info->se_info.req_completion); +} + +/* + * Returns: + * <= 0: driver handled the event, skb consumed + * 1: driver does not handle the event, please do standard processing + */ +int st21nfca_connectivity_event_received(struct nfc_hci_dev *hdev, u8 host, + u8 event, struct sk_buff *skb) +{ + int r = 0; + + pr_debug("connectivity gate event: %x\n", event); + + switch (event) { + case ST21NFCA_EVT_CONNECTIVITY: + break; + case ST21NFCA_EVT_TRANSACTION: + break; + default: + return 1; + } + kfree_skb(skb); + return r; +} +EXPORT_SYMBOL(st21nfca_connectivity_event_received); + +int st21nfca_apdu_reader_event_received(struct nfc_hci_dev *hdev, + u8 event, struct sk_buff *skb) +{ + int r = 0; + struct st21nfca_hci_info *info = nfc_hci_get_clientdata(hdev); + + pr_debug("apdu reader gate event: %x\n", event); + + switch (event) { + case ST21NFCA_EVT_TRANSMIT_DATA: + del_timer_sync(&info->se_info.bwi_timer); + info->se_info.bwi_active = false; + r = nfc_hci_send_event(hdev, ST21NFCA_DEVICE_MGNT_GATE, + ST21NFCA_EVT_SE_END_OF_APDU_TRANSFER, NULL, 0); + if (r < 0) + goto exit; + + info->se_info.cb(info->se_info.cb_context, + skb->data, skb->len, 0); + break; + case ST21NFCA_EVT_WTX_REQUEST: + mod_timer(&info->se_info.bwi_timer, jiffies + + msecs_to_jiffies(info->se_info.wt_timeout)); + break; + } + +exit: + kfree_skb(skb); + return r; +} +EXPORT_SYMBOL(st21nfca_apdu_reader_event_received); + +void st21nfca_se_init(struct nfc_hci_dev *hdev) +{ + struct st21nfca_hci_info *info = nfc_hci_get_clientdata(hdev); + + init_completion(&info->se_info.req_completion); + /* initialize timers */ + init_timer(&info->se_info.bwi_timer); + info->se_info.bwi_timer.data = (unsigned long)info; + info->se_info.bwi_timer.function = st21nfca_se_wt_timeout; + info->se_info.bwi_active = false; + + init_timer(&info->se_info.se_active_timer); + info->se_info.se_active_timer.data = (unsigned long)info; + info->se_info.se_active_timer.function = st21nfca_se_activation_timeout; + info->se_info.se_active = false; + + info->se_info.count_pipes = 0; + info->se_info.expected_pipes = 0; + + info->se_info.xch_error = false; + + info->se_info.wt_timeout = + ST21NFCA_BWI_TO_TIMEOUT(ST21NFCA_ATR_DEFAULT_BWI); +} +EXPORT_SYMBOL(st21nfca_se_init); + +void st21nfca_se_deinit(struct nfc_hci_dev *hdev) +{ + struct st21nfca_hci_info *info = nfc_hci_get_clientdata(hdev); + + if (info->se_info.bwi_active) + del_timer_sync(&info->se_info.bwi_timer); + if (info->se_info.se_active) + del_timer_sync(&info->se_info.se_active_timer); + + info->se_info.bwi_active = false; + info->se_info.se_active = false; +} +EXPORT_SYMBOL(st21nfca_se_deinit); diff --git a/drivers/nfc/st21nfca/st21nfca_se.h b/drivers/nfc/st21nfca/st21nfca_se.h new file mode 100644 index 
000000000000..b172cfcaeb90 --- /dev/null +++ b/drivers/nfc/st21nfca/st21nfca_se.h @@ -0,0 +1,63 @@ +/* + * Copyright (C) 2014 STMicroelectronics SAS. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#ifndef __ST21NFCA_SE_H +#define __ST21NFCA_SE_H + +#include +#include + +/* + * ref ISO7816-3 chap 8.1. the initial character TS is followed by a + * sequence of at most 32 characters. + */ +#define ST21NFCA_ESE_MAX_LENGTH 33 +#define ST21NFCA_ESE_HOST_ID 0xc0 + +struct st21nfca_se_info { + u8 atr[ST21NFCA_ESE_MAX_LENGTH]; + struct completion req_completion; + + struct timer_list bwi_timer; + int wt_timeout; /* in msecs */ + bool bwi_active; + + struct timer_list se_active_timer; + bool se_active; + int expected_pipes; + int count_pipes; + + bool xch_error; + + se_io_cb_t cb; + void *cb_context; +}; + +int st21nfca_connectivity_event_received(struct nfc_hci_dev *hdev, u8 host, + u8 event, struct sk_buff *skb); +int st21nfca_apdu_reader_event_received(struct nfc_hci_dev *hdev, + u8 event, struct sk_buff *skb); + +int st21nfca_hci_discover_se(struct nfc_hci_dev *hdev); +int st21nfca_hci_enable_se(struct nfc_hci_dev *hdev, u32 se_idx); +int st21nfca_hci_disable_se(struct nfc_hci_dev *hdev, u32 se_idx); +int st21nfca_hci_se_io(struct nfc_hci_dev *hdev, u32 se_idx, + u8 *apdu, size_t apdu_length, + se_io_cb_t cb, void *cb_context); + +void st21nfca_se_init(struct nfc_hci_dev *hdev); +void st21nfca_se_deinit(struct nfc_hci_dev *hdev); +#endif /* __ST21NFCA_SE_H */ diff --git a/include/linux/platform_data/st21nfca.h b/include/linux/platform_data/st21nfca.h index 5087fff96d86..cc2bdafb0c69 100644 --- a/include/linux/platform_data/st21nfca.h +++ b/include/linux/platform_data/st21nfca.h @@ -26,6 +26,8 @@ struct st21nfca_nfc_platform_data { unsigned int gpio_ena; unsigned int irq_polarity; + bool is_ese_present; + bool is_uicc_present; }; #endif /* _ST21NFCA_HCI_H_ */ -- cgit v1.2.3 From be6a6b43b597a37d96dbf74985f72045ccef0940 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Tue, 27 Jan 2015 15:57:59 +0200 Subject: net/mlx4_core: Add bad-cable event support If the firmware can detect a bad cable, allow it to generate an event, and print the problem in the log. Signed-off-by: Jack Morgenstein Signed-off-by: Amir Vadai Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx4/eq.c | 22 ++++++++++++++++++++++ drivers/net/ethernet/mellanox/mlx4/fw.c | 9 ++++++++- include/linux/mlx4/device.h | 14 +++++++++++++- 3 files changed, 43 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/eq.c b/drivers/net/ethernet/mellanox/mlx4/eq.c index 2f2e6067426d..4df006d8afa4 100644 --- a/drivers/net/ethernet/mellanox/mlx4/eq.c +++ b/drivers/net/ethernet/mellanox/mlx4/eq.c @@ -88,6 +88,8 @@ static u64 get_async_ev_mask(struct mlx4_dev *dev) u64 async_ev_mask = MLX4_ASYNC_EVENT_MASK; if (dev->caps.flags & MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV) async_ev_mask |= (1ull << MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT); + if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RECOVERABLE_ERROR_EVENT) + async_ev_mask |= (1ull << MLX4_EVENT_TYPE_RECOVERABLE_ERROR_EVENT); return async_ev_mask; } @@ -736,6 +738,26 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) (unsigned long) eqe); break; + case MLX4_EVENT_TYPE_RECOVERABLE_ERROR_EVENT: + switch (eqe->subtype) { + case MLX4_RECOVERABLE_ERROR_EVENT_SUBTYPE_BAD_CABLE: + mlx4_warn(dev, "Bad cable detected on port %u\n", + eqe->event.bad_cable.port); + break; + case MLX4_RECOVERABLE_ERROR_EVENT_SUBTYPE_UNSUPPORTED_CABLE: + mlx4_warn(dev, "Unsupported cable detected\n"); + break; + default: + mlx4_dbg(dev, + "Unhandled recoverable error event detected: %02x(%02x) on EQ %d at index %u. owner=%x, nent=0x%x, ownership=%s\n", + eqe->type, eqe->subtype, eq->eqn, + eq->cons_index, eqe->owner, eq->nent, + !!(eqe->owner & 0x80) ^ + !!(eq->cons_index & eq->nent) ? "HW" : "SW"); + break; + } + break; + case MLX4_EVENT_TYPE_EEC_CATAS_ERROR: case MLX4_EVENT_TYPE_ECC_DETECT: default: diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index 982861d1df44..2eadc2882e4f 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -145,7 +145,8 @@ static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags) [16] = "CONFIG DEV support", [17] = "Asymmetric EQs support", [18] = "More than 80 VFs support", - [19] = "Performance optimized for limited rule configuration flow steering support" + [19] = "Performance optimized for limited rule configuration flow steering support", + [20] = "Recoverable error events support" }; int i; @@ -859,6 +860,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) MLX4_GET(field32, outbox, QUERY_DEV_CAP_ETH_BACKPL_OFFSET); if (field32 & (1 << 0)) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_ETH_BACKPL_AN_REP; + if (field32 & (1 << 7)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_RECOVERABLE_ERROR_EVENT; MLX4_GET(field, outbox, QUERY_DEV_CAP_FW_REASSIGN_MAC); if (field & 1<<6) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_REASSIGN_MAC_EN; @@ -1562,6 +1565,7 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) #define INIT_HCA_VXLAN_OFFSET 0x0c #define INIT_HCA_CACHELINE_SZ_OFFSET 0x0e #define INIT_HCA_FLAGS_OFFSET 0x014 +#define INIT_HCA_RECOVERABLE_ERROR_EVENT_OFFSET 0x018 #define INIT_HCA_QPC_OFFSET 0x020 #define INIT_HCA_QPC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x10) #define INIT_HCA_LOG_QP_OFFSET (INIT_HCA_QPC_OFFSET + 0x17) @@ -1668,6 +1672,9 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_LARGE_CQE; } + if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RECOVERABLE_ERROR_EVENT) + *(inbox + INIT_HCA_RECOVERABLE_ERROR_EVENT_OFFSET / 4) |= cpu_to_be32(1 << 
31); + /* QPC/EEC/CQC/EQC/RDMARC attributes */ MLX4_PUT(inbox, param->qpc_base, INIT_HCA_QPC_BASE_OFFSET); diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 5ef54e145e4d..c95d659a39f2 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -200,7 +200,8 @@ enum { MLX4_DEV_CAP_FLAG2_CONFIG_DEV = 1LL << 16, MLX4_DEV_CAP_FLAG2_SYS_EQS = 1LL << 17, MLX4_DEV_CAP_FLAG2_80_VFS = 1LL << 18, - MLX4_DEV_CAP_FLAG2_FS_A0 = 1LL << 19 + MLX4_DEV_CAP_FLAG2_FS_A0 = 1LL << 19, + MLX4_DEV_CAP_FLAG2_RECOVERABLE_ERROR_EVENT = 1LL << 20 }; enum { @@ -280,6 +281,7 @@ enum mlx4_event { MLX4_EVENT_TYPE_FATAL_WARNING = 0x1b, MLX4_EVENT_TYPE_FLR_EVENT = 0x1c, MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT = 0x1d, + MLX4_EVENT_TYPE_RECOVERABLE_ERROR_EVENT = 0x3e, MLX4_EVENT_TYPE_NONE = 0xff, }; @@ -288,6 +290,11 @@ enum { MLX4_PORT_CHANGE_SUBTYPE_ACTIVE = 4 }; +enum { + MLX4_RECOVERABLE_ERROR_EVENT_SUBTYPE_BAD_CABLE = 1, + MLX4_RECOVERABLE_ERROR_EVENT_SUBTYPE_UNSUPPORTED_CABLE = 2, +}; + enum { MLX4_FATAL_WARNING_SUBTYPE_WARMING = 0, }; @@ -860,6 +867,11 @@ struct mlx4_eqe { } __packed tbl_change_info; } params; } __packed port_mgmt_change; + struct { + u8 reserved[3]; + u8 port; + u32 reserved1[5]; + } __packed bad_cable; } event; u8 slave_id; u8 reserved3[2]; -- cgit v1.2.3 From 5a03108689c6f3e448a920b42af04e6d28401f80 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Tue, 27 Jan 2015 15:58:02 +0200 Subject: net/mlx4_core: Adjust command timeouts to conform to the firmware spec The firmware spec states that the timeout for all commands should be 60 seconds. In the past, the spec indicated that there were several classes of timeout (short, medium, and long). The driver has these different timeout classes. We leave the class differentiation in the driver as-is (to protect against any future spec changes), but set the timeout for all classes to be 60 seconds. In addition, we fix a few commands which had hard-coded numeric timeouts specified. Signed-off-by: Jack Morgenstein Signed-off-by: Amir Vadai Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx4/fw.c | 19 ++++++++++--------- drivers/net/ethernet/mellanox/mlx4/mr.c | 4 ++-- include/linux/mlx4/cmd.h | 6 +++--- 3 files changed, 15 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index 2aa7c232d0b6..0c90d1072e47 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -1774,8 +1774,8 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) MLX4_PUT(inbox, parser_params, INIT_HCA_VXLAN_OFFSET); } - err = mlx4_cmd(dev, mailbox->dma, 0, 0, MLX4_CMD_INIT_HCA, 10000, - MLX4_CMD_NATIVE); + err = mlx4_cmd(dev, mailbox->dma, 0, 0, MLX4_CMD_INIT_HCA, + MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); if (err) mlx4_err(dev, "INIT_HCA returns %d\n", err); @@ -2029,7 +2029,7 @@ int mlx4_CLOSE_PORT_wrapper(struct mlx4_dev *dev, int slave, if (dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB) { if (priv->mfunc.master.init_port_ref[port] == 1) { err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT, - 1000, MLX4_CMD_NATIVE); + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); if (err) return err; } @@ -2040,7 +2040,7 @@ int mlx4_CLOSE_PORT_wrapper(struct mlx4_dev *dev, int slave, if (!priv->mfunc.master.qp0_state[port].qp0_active && priv->mfunc.master.qp0_state[port].port_active) { err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT, - 1000, MLX4_CMD_NATIVE); + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); if (err) return err; priv->mfunc.master.slave_state[slave].init_port_mask &= ~(1 << port); @@ -2055,15 +2055,15 @@ int mlx4_CLOSE_PORT_wrapper(struct mlx4_dev *dev, int slave, int mlx4_CLOSE_PORT(struct mlx4_dev *dev, int port) { - return mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT, 1000, - MLX4_CMD_WRAPPED); + return mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); } EXPORT_SYMBOL_GPL(mlx4_CLOSE_PORT); int mlx4_CLOSE_HCA(struct mlx4_dev *dev, int panic) { - return mlx4_cmd(dev, 0, 0, panic, MLX4_CMD_CLOSE_HCA, 1000, - MLX4_CMD_NATIVE); + return mlx4_cmd(dev, 0, 0, panic, MLX4_CMD_CLOSE_HCA, + MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); } struct mlx4_config_dev { @@ -2202,7 +2202,8 @@ int mlx4_SET_ICM_SIZE(struct mlx4_dev *dev, u64 icm_size, u64 *aux_pages) int mlx4_NOP(struct mlx4_dev *dev) { /* Input modifier of 0x1f means "finish as soon as possible." 
*/ - return mlx4_cmd(dev, 0, 0x1f, 0, MLX4_CMD_NOP, 100, MLX4_CMD_NATIVE); + return mlx4_cmd(dev, 0, 0x1f, 0, MLX4_CMD_NOP, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); } int mlx4_get_phys_port_id(struct mlx4_dev *dev) diff --git a/drivers/net/ethernet/mellanox/mlx4/mr.c b/drivers/net/ethernet/mellanox/mlx4/mr.c index 8dbdf1d29357..d21e884a0838 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mr.c +++ b/drivers/net/ethernet/mellanox/mlx4/mr.c @@ -1155,7 +1155,7 @@ EXPORT_SYMBOL_GPL(mlx4_fmr_free); int mlx4_SYNC_TPT(struct mlx4_dev *dev) { - return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_SYNC_TPT, 1000, - MLX4_CMD_NATIVE); + return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_SYNC_TPT, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); } EXPORT_SYMBOL_GPL(mlx4_SYNC_TPT); diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h index c989442ffc6a..ae95adc78509 100644 --- a/include/linux/mlx4/cmd.h +++ b/include/linux/mlx4/cmd.h @@ -165,9 +165,9 @@ enum { }; enum { - MLX4_CMD_TIME_CLASS_A = 10000, - MLX4_CMD_TIME_CLASS_B = 10000, - MLX4_CMD_TIME_CLASS_C = 10000, + MLX4_CMD_TIME_CLASS_A = 60000, + MLX4_CMD_TIME_CLASS_B = 60000, + MLX4_CMD_TIME_CLASS_C = 60000, }; enum { -- cgit v1.2.3 From 14bf61ffe6ac54afcd1e888a4407fe16054483db Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 9 Oct 2014 16:03:13 +0200 Subject: quota: Switch ->get_dqblk() and ->set_dqblk() to use bytes as space units Currently ->get_dqblk() and ->set_dqblk() use struct fs_disk_quota which tracks space limits and usage in 512-byte blocks. However VFS quotas track usage in bytes (as some filesystems require that) and we need to somehow pass this information. Up to now it wasn't a problem because we didn't do any unit conversion (thus VFS quota routines happily stuck the number of bytes into the d_bcount field of struct fs_disk_quota). Only if you tried to use Q_XGETQUOTA or Q_XSETQLIM for VFS quotas (or Q_GETQUOTA / Q_SETQUOTA for XFS quotas) did you get bogus results. Hardly anyone tried this, but reportedly some Samba users hit the problem in practice. So if we want the interfaces to be compatible, we need to fix this. We bite the bullet and define another quota structure used for passing information from/to ->get_dqblk()/->set_dqblk(). It's somewhat sad that we have to have more conversion routines in fs/quota/quota.c, and the extra copying of the quota structure slows down getting quota information by about 2%, but it seems cleaner than overloading e.g. the units of d_bcount to bytes.
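To make the unit mismatch concrete, here is a short sketch of the conversion this change implies between the byte-based in-kernel interface and the 512-byte basic blocks of the XFS-style fs_disk_quota ABI. The helper names and macro are illustrative only, not the ones the patch adds to fs/quota/quota.c:

	/* Illustrative helpers (names made up for this example): the new
	 * qc_dqblk interface carries space in bytes, while the XFS-style
	 * fs_disk_quota userspace ABI keeps 512-byte "basic blocks" (BBs). */
	#define EXAMPLE_BB_SHIFT 9			/* 1 BB = 512 bytes */

	static inline __u64 example_bbtob(__u64 blocks)	/* BBs -> bytes */
	{
		return blocks << EXAMPLE_BB_SHIFT;
	}

	static inline __u64 example_btobb(__u64 bytes)	/* bytes -> BBs, rounded up */
	{
		return (bytes + (1 << EXAMPLE_BB_SHIFT) - 1) >> EXAMPLE_BB_SHIFT;
	}

With helpers along these lines, the quotactl code can keep exposing basic blocks to userspace while every ->get_dqblk()/->set_dqblk() implementation works purely in bytes, which is what the gfs2 and dquot hunks below switch over to.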
CC: stable@vger.kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara --- fs/gfs2/quota.c | 49 +++++++------- fs/quota/dquot.c | 83 ++++++++++++------------ fs/quota/quota.c | 162 +++++++++++++++++++++++++++++++++++++++-------- fs/xfs/xfs_qm.h | 4 +- fs/xfs/xfs_qm_syscalls.c | 156 +++++++++++++++++++-------------------------- fs/xfs/xfs_quotaops.c | 8 +-- include/linux/quota.h | 47 +++++++++++++- include/linux/quotaops.h | 4 +- 8 files changed, 318 insertions(+), 195 deletions(-) (limited to 'include/linux') diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index c8b148bbdc8b..3e193cb36996 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -667,7 +667,7 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change) static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, s64 change, struct gfs2_quota_data *qd, - struct fs_disk_quota *fdq) + struct qc_dqblk *fdq) { struct inode *inode = &ip->i_inode; struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -697,16 +697,16 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, be64_add_cpu(&q.qu_value, change); qd->qd_qb.qb_value = q.qu_value; if (fdq) { - if (fdq->d_fieldmask & FS_DQ_BSOFT) { - q.qu_warn = cpu_to_be64(fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift); + if (fdq->d_fieldmask & QC_SPC_SOFT) { + q.qu_warn = cpu_to_be64(fdq->d_spc_softlimit >> sdp->sd_sb.sb_bsize_shift); qd->qd_qb.qb_warn = q.qu_warn; } - if (fdq->d_fieldmask & FS_DQ_BHARD) { - q.qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift); + if (fdq->d_fieldmask & QC_SPC_HARD) { + q.qu_limit = cpu_to_be64(fdq->d_spc_hardlimit >> sdp->sd_sb.sb_bsize_shift); qd->qd_qb.qb_limit = q.qu_limit; } - if (fdq->d_fieldmask & FS_DQ_BCOUNT) { - q.qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift); + if (fdq->d_fieldmask & QC_SPACE) { + q.qu_value = cpu_to_be64(fdq->d_space >> sdp->sd_sb.sb_bsize_shift); qd->qd_qb.qb_value = q.qu_value; } } @@ -1497,7 +1497,7 @@ static int gfs2_quota_get_xstate(struct super_block *sb, } static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid, - struct fs_disk_quota *fdq) + struct qc_dqblk *fdq) { struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_quota_lvb *qlvb; @@ -1505,7 +1505,7 @@ static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid, struct gfs2_holder q_gh; int error; - memset(fdq, 0, sizeof(struct fs_disk_quota)); + memset(fdq, 0, sizeof(*fdq)); if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) return -ESRCH; /* Crazy XFS error code */ @@ -1522,12 +1522,9 @@ static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid, goto out; qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr; - fdq->d_version = FS_DQUOT_VERSION; - fdq->d_flags = (qid.type == USRQUOTA) ? 
FS_USER_QUOTA : FS_GROUP_QUOTA; - fdq->d_id = from_kqid_munged(current_user_ns(), qid); - fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_fsb2bb_shift; - fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_fsb2bb_shift; - fdq->d_bcount = be64_to_cpu(qlvb->qb_value) << sdp->sd_fsb2bb_shift; + fdq->d_spc_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_sb.sb_bsize_shift; + fdq->d_spc_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_sb.sb_bsize_shift; + fdq->d_space = be64_to_cpu(qlvb->qb_value) << sdp->sd_sb.sb_bsize_shift; gfs2_glock_dq_uninit(&q_gh); out: @@ -1536,10 +1533,10 @@ out: } /* GFS2 only supports a subset of the XFS fields */ -#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD|FS_DQ_BCOUNT) +#define GFS2_FIELDMASK (QC_SPC_SOFT|QC_SPC_HARD|QC_SPACE) static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid, - struct fs_disk_quota *fdq) + struct qc_dqblk *fdq) { struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); @@ -1583,17 +1580,17 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid, goto out_i; /* If nothing has changed, this is a no-op */ - if ((fdq->d_fieldmask & FS_DQ_BSOFT) && - ((fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_warn))) - fdq->d_fieldmask ^= FS_DQ_BSOFT; + if ((fdq->d_fieldmask & QC_SPC_SOFT) && + ((fdq->d_spc_softlimit >> sdp->sd_sb.sb_bsize_shift) == be64_to_cpu(qd->qd_qb.qb_warn))) + fdq->d_fieldmask ^= QC_SPC_SOFT; - if ((fdq->d_fieldmask & FS_DQ_BHARD) && - ((fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_limit))) - fdq->d_fieldmask ^= FS_DQ_BHARD; + if ((fdq->d_fieldmask & QC_SPC_HARD) && + ((fdq->d_spc_hardlimit >> sdp->sd_sb.sb_bsize_shift) == be64_to_cpu(qd->qd_qb.qb_limit))) + fdq->d_fieldmask ^= QC_SPC_HARD; - if ((fdq->d_fieldmask & FS_DQ_BCOUNT) && - ((fdq->d_bcount >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_value))) - fdq->d_fieldmask ^= FS_DQ_BCOUNT; + if ((fdq->d_fieldmask & QC_SPACE) && + ((fdq->d_space >> sdp->sd_sb.sb_bsize_shift) == be64_to_cpu(qd->qd_qb.qb_value))) + fdq->d_fieldmask ^= QC_SPACE; if (fdq->d_fieldmask == 0) goto out_i; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 8f0acef3d184..69df5b239844 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2396,30 +2396,25 @@ static inline qsize_t stoqb(qsize_t space) } /* Generic routine for getting common part of quota structure */ -static void do_get_dqblk(struct dquot *dquot, struct fs_disk_quota *di) +static void do_get_dqblk(struct dquot *dquot, struct qc_dqblk *di) { struct mem_dqblk *dm = &dquot->dq_dqb; memset(di, 0, sizeof(*di)); - di->d_version = FS_DQUOT_VERSION; - di->d_flags = dquot->dq_id.type == USRQUOTA ? 
- FS_USER_QUOTA : FS_GROUP_QUOTA; - di->d_id = from_kqid_munged(current_user_ns(), dquot->dq_id); - spin_lock(&dq_data_lock); - di->d_blk_hardlimit = stoqb(dm->dqb_bhardlimit); - di->d_blk_softlimit = stoqb(dm->dqb_bsoftlimit); + di->d_spc_hardlimit = dm->dqb_bhardlimit; + di->d_spc_softlimit = dm->dqb_bsoftlimit; di->d_ino_hardlimit = dm->dqb_ihardlimit; di->d_ino_softlimit = dm->dqb_isoftlimit; - di->d_bcount = dm->dqb_curspace + dm->dqb_rsvspace; - di->d_icount = dm->dqb_curinodes; - di->d_btimer = dm->dqb_btime; - di->d_itimer = dm->dqb_itime; + di->d_space = dm->dqb_curspace + dm->dqb_rsvspace; + di->d_ino_count = dm->dqb_curinodes; + di->d_spc_timer = dm->dqb_btime; + di->d_ino_timer = dm->dqb_itime; spin_unlock(&dq_data_lock); } int dquot_get_dqblk(struct super_block *sb, struct kqid qid, - struct fs_disk_quota *di) + struct qc_dqblk *di) { struct dquot *dquot; @@ -2433,70 +2428,70 @@ int dquot_get_dqblk(struct super_block *sb, struct kqid qid, } EXPORT_SYMBOL(dquot_get_dqblk); -#define VFS_FS_DQ_MASK \ - (FS_DQ_BCOUNT | FS_DQ_BSOFT | FS_DQ_BHARD | \ - FS_DQ_ICOUNT | FS_DQ_ISOFT | FS_DQ_IHARD | \ - FS_DQ_BTIMER | FS_DQ_ITIMER) +#define VFS_QC_MASK \ + (QC_SPACE | QC_SPC_SOFT | QC_SPC_HARD | \ + QC_INO_COUNT | QC_INO_SOFT | QC_INO_HARD | \ + QC_SPC_TIMER | QC_INO_TIMER) /* Generic routine for setting common part of quota structure */ -static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di) +static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di) { struct mem_dqblk *dm = &dquot->dq_dqb; int check_blim = 0, check_ilim = 0; struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type]; - if (di->d_fieldmask & ~VFS_FS_DQ_MASK) + if (di->d_fieldmask & ~VFS_QC_MASK) return -EINVAL; - if (((di->d_fieldmask & FS_DQ_BSOFT) && - (di->d_blk_softlimit > dqi->dqi_maxblimit)) || - ((di->d_fieldmask & FS_DQ_BHARD) && - (di->d_blk_hardlimit > dqi->dqi_maxblimit)) || - ((di->d_fieldmask & FS_DQ_ISOFT) && + if (((di->d_fieldmask & QC_SPC_SOFT) && + stoqb(di->d_spc_softlimit) > dqi->dqi_maxblimit) || + ((di->d_fieldmask & QC_SPC_HARD) && + stoqb(di->d_spc_hardlimit) > dqi->dqi_maxblimit) || + ((di->d_fieldmask & QC_INO_SOFT) && (di->d_ino_softlimit > dqi->dqi_maxilimit)) || - ((di->d_fieldmask & FS_DQ_IHARD) && + ((di->d_fieldmask & QC_INO_HARD) && (di->d_ino_hardlimit > dqi->dqi_maxilimit))) return -ERANGE; spin_lock(&dq_data_lock); - if (di->d_fieldmask & FS_DQ_BCOUNT) { - dm->dqb_curspace = di->d_bcount - dm->dqb_rsvspace; + if (di->d_fieldmask & QC_SPACE) { + dm->dqb_curspace = di->d_space - dm->dqb_rsvspace; check_blim = 1; set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags); } - if (di->d_fieldmask & FS_DQ_BSOFT) - dm->dqb_bsoftlimit = qbtos(di->d_blk_softlimit); - if (di->d_fieldmask & FS_DQ_BHARD) - dm->dqb_bhardlimit = qbtos(di->d_blk_hardlimit); - if (di->d_fieldmask & (FS_DQ_BSOFT | FS_DQ_BHARD)) { + if (di->d_fieldmask & QC_SPC_SOFT) + dm->dqb_bsoftlimit = di->d_spc_softlimit; + if (di->d_fieldmask & QC_SPC_HARD) + dm->dqb_bhardlimit = di->d_spc_hardlimit; + if (di->d_fieldmask & (QC_SPC_SOFT | QC_SPC_HARD)) { check_blim = 1; set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags); } - if (di->d_fieldmask & FS_DQ_ICOUNT) { - dm->dqb_curinodes = di->d_icount; + if (di->d_fieldmask & QC_INO_COUNT) { + dm->dqb_curinodes = di->d_ino_count; check_ilim = 1; set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags); } - if (di->d_fieldmask & FS_DQ_ISOFT) + if (di->d_fieldmask & QC_INO_SOFT) dm->dqb_isoftlimit = di->d_ino_softlimit; - if (di->d_fieldmask & 
FS_DQ_IHARD) + if (di->d_fieldmask & QC_INO_HARD) dm->dqb_ihardlimit = di->d_ino_hardlimit; - if (di->d_fieldmask & (FS_DQ_ISOFT | FS_DQ_IHARD)) { + if (di->d_fieldmask & (QC_INO_SOFT | QC_INO_HARD)) { check_ilim = 1; set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags); } - if (di->d_fieldmask & FS_DQ_BTIMER) { - dm->dqb_btime = di->d_btimer; + if (di->d_fieldmask & QC_SPC_TIMER) { + dm->dqb_btime = di->d_spc_timer; check_blim = 1; set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags); } - if (di->d_fieldmask & FS_DQ_ITIMER) { - dm->dqb_itime = di->d_itimer; + if (di->d_fieldmask & QC_INO_TIMER) { + dm->dqb_itime = di->d_ino_timer; check_ilim = 1; set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags); } @@ -2506,7 +2501,7 @@ static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di) dm->dqb_curspace < dm->dqb_bsoftlimit) { dm->dqb_btime = 0; clear_bit(DQ_BLKS_B, &dquot->dq_flags); - } else if (!(di->d_fieldmask & FS_DQ_BTIMER)) + } else if (!(di->d_fieldmask & QC_SPC_TIMER)) /* Set grace only if user hasn't provided his own... */ dm->dqb_btime = get_seconds() + dqi->dqi_bgrace; } @@ -2515,7 +2510,7 @@ static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di) dm->dqb_curinodes < dm->dqb_isoftlimit) { dm->dqb_itime = 0; clear_bit(DQ_INODES_B, &dquot->dq_flags); - } else if (!(di->d_fieldmask & FS_DQ_ITIMER)) + } else if (!(di->d_fieldmask & QC_INO_TIMER)) /* Set grace only if user hasn't provided his own... */ dm->dqb_itime = get_seconds() + dqi->dqi_igrace; } @@ -2531,7 +2526,7 @@ static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di) } int dquot_set_dqblk(struct super_block *sb, struct kqid qid, - struct fs_disk_quota *di) + struct qc_dqblk *di) { struct dquot *dquot; int rc; diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 2aa4151f99d2..6f3856328eea 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -118,17 +118,27 @@ static int quota_setinfo(struct super_block *sb, int type, void __user *addr) return sb->s_qcop->set_info(sb, type, &info); } -static void copy_to_if_dqblk(struct if_dqblk *dst, struct fs_disk_quota *src) +static inline qsize_t qbtos(qsize_t blocks) +{ + return blocks << QIF_DQBLKSIZE_BITS; +} + +static inline qsize_t stoqb(qsize_t space) +{ + return (space + QIF_DQBLKSIZE - 1) >> QIF_DQBLKSIZE_BITS; +} + +static void copy_to_if_dqblk(struct if_dqblk *dst, struct qc_dqblk *src) { memset(dst, 0, sizeof(*dst)); - dst->dqb_bhardlimit = src->d_blk_hardlimit; - dst->dqb_bsoftlimit = src->d_blk_softlimit; - dst->dqb_curspace = src->d_bcount; + dst->dqb_bhardlimit = stoqb(src->d_spc_hardlimit); + dst->dqb_bsoftlimit = stoqb(src->d_spc_softlimit); + dst->dqb_curspace = src->d_space; dst->dqb_ihardlimit = src->d_ino_hardlimit; dst->dqb_isoftlimit = src->d_ino_softlimit; - dst->dqb_curinodes = src->d_icount; - dst->dqb_btime = src->d_btimer; - dst->dqb_itime = src->d_itimer; + dst->dqb_curinodes = src->d_ino_count; + dst->dqb_btime = src->d_spc_timer; + dst->dqb_itime = src->d_ino_timer; dst->dqb_valid = QIF_ALL; } @@ -136,7 +146,7 @@ static int quota_getquota(struct super_block *sb, int type, qid_t id, void __user *addr) { struct kqid qid; - struct fs_disk_quota fdq; + struct qc_dqblk fdq; struct if_dqblk idq; int ret; @@ -154,36 +164,36 @@ static int quota_getquota(struct super_block *sb, int type, qid_t id, return 0; } -static void copy_from_if_dqblk(struct fs_disk_quota *dst, struct if_dqblk *src) +static void copy_from_if_dqblk(struct qc_dqblk *dst, struct if_dqblk *src) { - dst->d_blk_hardlimit = src->dqb_bhardlimit; - 
dst->d_blk_softlimit = src->dqb_bsoftlimit; - dst->d_bcount = src->dqb_curspace; + dst->d_spc_hardlimit = qbtos(src->dqb_bhardlimit); + dst->d_spc_softlimit = qbtos(src->dqb_bsoftlimit); + dst->d_space = src->dqb_curspace; dst->d_ino_hardlimit = src->dqb_ihardlimit; dst->d_ino_softlimit = src->dqb_isoftlimit; - dst->d_icount = src->dqb_curinodes; - dst->d_btimer = src->dqb_btime; - dst->d_itimer = src->dqb_itime; + dst->d_ino_count = src->dqb_curinodes; + dst->d_spc_timer = src->dqb_btime; + dst->d_ino_timer = src->dqb_itime; dst->d_fieldmask = 0; if (src->dqb_valid & QIF_BLIMITS) - dst->d_fieldmask |= FS_DQ_BSOFT | FS_DQ_BHARD; + dst->d_fieldmask |= QC_SPC_SOFT | QC_SPC_HARD; if (src->dqb_valid & QIF_SPACE) - dst->d_fieldmask |= FS_DQ_BCOUNT; + dst->d_fieldmask |= QC_SPACE; if (src->dqb_valid & QIF_ILIMITS) - dst->d_fieldmask |= FS_DQ_ISOFT | FS_DQ_IHARD; + dst->d_fieldmask |= QC_INO_SOFT | QC_INO_HARD; if (src->dqb_valid & QIF_INODES) - dst->d_fieldmask |= FS_DQ_ICOUNT; + dst->d_fieldmask |= QC_INO_COUNT; if (src->dqb_valid & QIF_BTIME) - dst->d_fieldmask |= FS_DQ_BTIMER; + dst->d_fieldmask |= QC_SPC_TIMER; if (src->dqb_valid & QIF_ITIME) - dst->d_fieldmask |= FS_DQ_ITIMER; + dst->d_fieldmask |= QC_INO_TIMER; } static int quota_setquota(struct super_block *sb, int type, qid_t id, void __user *addr) { - struct fs_disk_quota fdq; + struct qc_dqblk fdq; struct if_dqblk idq; struct kqid qid; @@ -247,10 +257,78 @@ static int quota_getxstatev(struct super_block *sb, void __user *addr) return ret; } +/* + * XFS defines BBTOB and BTOBB macros inside fs/xfs/ and we cannot move them + * out of there as xfsprogs rely on definitions being in that header file. So + * just define same functions here for quota purposes. + */ +#define XFS_BB_SHIFT 9 + +static inline u64 quota_bbtob(u64 blocks) +{ + return blocks << XFS_BB_SHIFT; +} + +static inline u64 quota_btobb(u64 bytes) +{ + return (bytes + (1 << XFS_BB_SHIFT) - 1) >> XFS_BB_SHIFT; +} + +static void copy_from_xfs_dqblk(struct qc_dqblk *dst, struct fs_disk_quota *src) +{ + dst->d_spc_hardlimit = quota_bbtob(src->d_blk_hardlimit); + dst->d_spc_softlimit = quota_bbtob(src->d_blk_softlimit); + dst->d_ino_hardlimit = src->d_ino_hardlimit; + dst->d_ino_softlimit = src->d_ino_softlimit; + dst->d_space = quota_bbtob(src->d_bcount); + dst->d_ino_count = src->d_icount; + dst->d_ino_timer = src->d_itimer; + dst->d_spc_timer = src->d_btimer; + dst->d_ino_warns = src->d_iwarns; + dst->d_spc_warns = src->d_bwarns; + dst->d_rt_spc_hardlimit = quota_bbtob(src->d_rtb_hardlimit); + dst->d_rt_spc_softlimit = quota_bbtob(src->d_rtb_softlimit); + dst->d_rt_space = quota_bbtob(src->d_rtbcount); + dst->d_rt_spc_timer = src->d_rtbtimer; + dst->d_rt_spc_warns = src->d_rtbwarns; + dst->d_fieldmask = 0; + if (src->d_fieldmask & FS_DQ_ISOFT) + dst->d_fieldmask |= QC_INO_SOFT; + if (src->d_fieldmask & FS_DQ_IHARD) + dst->d_fieldmask |= QC_INO_HARD; + if (src->d_fieldmask & FS_DQ_BSOFT) + dst->d_fieldmask |= QC_SPC_SOFT; + if (src->d_fieldmask & FS_DQ_BHARD) + dst->d_fieldmask |= QC_SPC_HARD; + if (src->d_fieldmask & FS_DQ_RTBSOFT) + dst->d_fieldmask |= QC_RT_SPC_SOFT; + if (src->d_fieldmask & FS_DQ_RTBHARD) + dst->d_fieldmask |= QC_RT_SPC_HARD; + if (src->d_fieldmask & FS_DQ_BTIMER) + dst->d_fieldmask |= QC_SPC_TIMER; + if (src->d_fieldmask & FS_DQ_ITIMER) + dst->d_fieldmask |= QC_INO_TIMER; + if (src->d_fieldmask & FS_DQ_RTBTIMER) + dst->d_fieldmask |= QC_RT_SPC_TIMER; + if (src->d_fieldmask & FS_DQ_BWARNS) + dst->d_fieldmask |= QC_SPC_WARNS; + if (src->d_fieldmask & 
FS_DQ_IWARNS) + dst->d_fieldmask |= QC_INO_WARNS; + if (src->d_fieldmask & FS_DQ_RTBWARNS) + dst->d_fieldmask |= QC_RT_SPC_WARNS; + if (src->d_fieldmask & FS_DQ_BCOUNT) + dst->d_fieldmask |= QC_SPACE; + if (src->d_fieldmask & FS_DQ_ICOUNT) + dst->d_fieldmask |= QC_INO_COUNT; + if (src->d_fieldmask & FS_DQ_RTBCOUNT) + dst->d_fieldmask |= QC_RT_SPACE; +} + static int quota_setxquota(struct super_block *sb, int type, qid_t id, void __user *addr) { struct fs_disk_quota fdq; + struct qc_dqblk qdq; struct kqid qid; if (copy_from_user(&fdq, addr, sizeof(fdq))) @@ -260,13 +338,44 @@ static int quota_setxquota(struct super_block *sb, int type, qid_t id, qid = make_kqid(current_user_ns(), type, id); if (!qid_valid(qid)) return -EINVAL; - return sb->s_qcop->set_dqblk(sb, qid, &fdq); + copy_from_xfs_dqblk(&qdq, &fdq); + return sb->s_qcop->set_dqblk(sb, qid, &qdq); +} + +static void copy_to_xfs_dqblk(struct fs_disk_quota *dst, struct qc_dqblk *src, + int type, qid_t id) +{ + memset(dst, 0, sizeof(*dst)); + dst->d_version = FS_DQUOT_VERSION; + dst->d_id = id; + if (type == USRQUOTA) + dst->d_flags = FS_USER_QUOTA; + else if (type == PRJQUOTA) + dst->d_flags = FS_PROJ_QUOTA; + else + dst->d_flags = FS_GROUP_QUOTA; + dst->d_blk_hardlimit = quota_btobb(src->d_spc_hardlimit); + dst->d_blk_softlimit = quota_btobb(src->d_spc_softlimit); + dst->d_ino_hardlimit = src->d_ino_hardlimit; + dst->d_ino_softlimit = src->d_ino_softlimit; + dst->d_bcount = quota_btobb(src->d_space); + dst->d_icount = src->d_ino_count; + dst->d_itimer = src->d_ino_timer; + dst->d_btimer = src->d_spc_timer; + dst->d_iwarns = src->d_ino_warns; + dst->d_bwarns = src->d_spc_warns; + dst->d_rtb_hardlimit = quota_btobb(src->d_rt_spc_hardlimit); + dst->d_rtb_softlimit = quota_btobb(src->d_rt_spc_softlimit); + dst->d_rtbcount = quota_btobb(src->d_rt_space); + dst->d_rtbtimer = src->d_rt_spc_timer; + dst->d_rtbwarns = src->d_rt_spc_warns; } static int quota_getxquota(struct super_block *sb, int type, qid_t id, void __user *addr) { struct fs_disk_quota fdq; + struct qc_dqblk qdq; struct kqid qid; int ret; @@ -275,8 +384,11 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id, qid = make_kqid(current_user_ns(), type, id); if (!qid_valid(qid)) return -EINVAL; - ret = sb->s_qcop->get_dqblk(sb, qid, &fdq); - if (!ret && copy_to_user(addr, &fdq, sizeof(fdq))) + ret = sb->s_qcop->get_dqblk(sb, qid, &qdq); + if (ret) + return ret; + copy_to_xfs_dqblk(&fdq, &qdq, type, id); + if (copy_to_user(addr, &fdq, sizeof(fdq))) return -EFAULT; return ret; } diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 3a07a937e232..41f6c0b9d51c 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -166,9 +166,9 @@ extern void xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint); /* quota ops */ extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint); extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t, - uint, struct fs_disk_quota *); + uint, struct qc_dqblk *); extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint, - struct fs_disk_quota *); + struct qc_dqblk *); extern int xfs_qm_scall_getqstat(struct xfs_mount *, struct fs_quota_stat *); extern int xfs_qm_scall_getqstatv(struct xfs_mount *, diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 74fca68e43b6..cb6168ec92c9 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -39,7 +39,6 @@ STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint); STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, 
uint); STATIC uint xfs_qm_export_flags(uint); -STATIC uint xfs_qm_export_qtype_flags(uint); /* * Turn off quota accounting and/or enforcement for all udquots and/or @@ -573,8 +572,8 @@ xfs_qm_scall_getqstatv( return 0; } -#define XFS_DQ_MASK \ - (FS_DQ_LIMIT_MASK | FS_DQ_TIMER_MASK | FS_DQ_WARNS_MASK) +#define XFS_QC_MASK \ + (QC_LIMIT_MASK | QC_TIMER_MASK | QC_WARNS_MASK) /* * Adjust quota limits, and start/stop timers accordingly. @@ -584,7 +583,7 @@ xfs_qm_scall_setqlim( struct xfs_mount *mp, xfs_dqid_t id, uint type, - fs_disk_quota_t *newlim) + struct qc_dqblk *newlim) { struct xfs_quotainfo *q = mp->m_quotainfo; struct xfs_disk_dquot *ddq; @@ -593,9 +592,9 @@ xfs_qm_scall_setqlim( int error; xfs_qcnt_t hard, soft; - if (newlim->d_fieldmask & ~XFS_DQ_MASK) + if (newlim->d_fieldmask & ~XFS_QC_MASK) return -EINVAL; - if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0) + if ((newlim->d_fieldmask & XFS_QC_MASK) == 0) return 0; /* @@ -633,11 +632,11 @@ xfs_qm_scall_setqlim( /* * Make sure that hardlimits are >= soft limits before changing. */ - hard = (newlim->d_fieldmask & FS_DQ_BHARD) ? - (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_blk_hardlimit) : + hard = (newlim->d_fieldmask & QC_SPC_HARD) ? + (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_spc_hardlimit) : be64_to_cpu(ddq->d_blk_hardlimit); - soft = (newlim->d_fieldmask & FS_DQ_BSOFT) ? - (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_blk_softlimit) : + soft = (newlim->d_fieldmask & QC_SPC_SOFT) ? + (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_spc_softlimit) : be64_to_cpu(ddq->d_blk_softlimit); if (hard == 0 || hard >= soft) { ddq->d_blk_hardlimit = cpu_to_be64(hard); @@ -650,11 +649,11 @@ xfs_qm_scall_setqlim( } else { xfs_debug(mp, "blkhard %Ld < blksoft %Ld", hard, soft); } - hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ? - (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) : + hard = (newlim->d_fieldmask & QC_RT_SPC_HARD) ? + (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_rt_spc_hardlimit) : be64_to_cpu(ddq->d_rtb_hardlimit); - soft = (newlim->d_fieldmask & FS_DQ_RTBSOFT) ? - (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_softlimit) : + soft = (newlim->d_fieldmask & QC_RT_SPC_SOFT) ? + (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_rt_spc_softlimit) : be64_to_cpu(ddq->d_rtb_softlimit); if (hard == 0 || hard >= soft) { ddq->d_rtb_hardlimit = cpu_to_be64(hard); @@ -667,10 +666,10 @@ xfs_qm_scall_setqlim( xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld", hard, soft); } - hard = (newlim->d_fieldmask & FS_DQ_IHARD) ? + hard = (newlim->d_fieldmask & QC_INO_HARD) ? (xfs_qcnt_t) newlim->d_ino_hardlimit : be64_to_cpu(ddq->d_ino_hardlimit); - soft = (newlim->d_fieldmask & FS_DQ_ISOFT) ? + soft = (newlim->d_fieldmask & QC_INO_SOFT) ? 
(xfs_qcnt_t) newlim->d_ino_softlimit : be64_to_cpu(ddq->d_ino_softlimit); if (hard == 0 || hard >= soft) { @@ -687,12 +686,12 @@ xfs_qm_scall_setqlim( /* * Update warnings counter(s) if requested */ - if (newlim->d_fieldmask & FS_DQ_BWARNS) - ddq->d_bwarns = cpu_to_be16(newlim->d_bwarns); - if (newlim->d_fieldmask & FS_DQ_IWARNS) - ddq->d_iwarns = cpu_to_be16(newlim->d_iwarns); - if (newlim->d_fieldmask & FS_DQ_RTBWARNS) - ddq->d_rtbwarns = cpu_to_be16(newlim->d_rtbwarns); + if (newlim->d_fieldmask & QC_SPC_WARNS) + ddq->d_bwarns = cpu_to_be16(newlim->d_spc_warns); + if (newlim->d_fieldmask & QC_INO_WARNS) + ddq->d_iwarns = cpu_to_be16(newlim->d_ino_warns); + if (newlim->d_fieldmask & QC_RT_SPC_WARNS) + ddq->d_rtbwarns = cpu_to_be16(newlim->d_rt_spc_warns); if (id == 0) { /* @@ -702,24 +701,24 @@ xfs_qm_scall_setqlim( * soft and hard limit values (already done, above), and * for warnings. */ - if (newlim->d_fieldmask & FS_DQ_BTIMER) { - q->qi_btimelimit = newlim->d_btimer; - ddq->d_btimer = cpu_to_be32(newlim->d_btimer); + if (newlim->d_fieldmask & QC_SPC_TIMER) { + q->qi_btimelimit = newlim->d_spc_timer; + ddq->d_btimer = cpu_to_be32(newlim->d_spc_timer); } - if (newlim->d_fieldmask & FS_DQ_ITIMER) { - q->qi_itimelimit = newlim->d_itimer; - ddq->d_itimer = cpu_to_be32(newlim->d_itimer); + if (newlim->d_fieldmask & QC_INO_TIMER) { + q->qi_itimelimit = newlim->d_ino_timer; + ddq->d_itimer = cpu_to_be32(newlim->d_ino_timer); } - if (newlim->d_fieldmask & FS_DQ_RTBTIMER) { - q->qi_rtbtimelimit = newlim->d_rtbtimer; - ddq->d_rtbtimer = cpu_to_be32(newlim->d_rtbtimer); + if (newlim->d_fieldmask & QC_RT_SPC_TIMER) { + q->qi_rtbtimelimit = newlim->d_rt_spc_timer; + ddq->d_rtbtimer = cpu_to_be32(newlim->d_rt_spc_timer); } - if (newlim->d_fieldmask & FS_DQ_BWARNS) - q->qi_bwarnlimit = newlim->d_bwarns; - if (newlim->d_fieldmask & FS_DQ_IWARNS) - q->qi_iwarnlimit = newlim->d_iwarns; - if (newlim->d_fieldmask & FS_DQ_RTBWARNS) - q->qi_rtbwarnlimit = newlim->d_rtbwarns; + if (newlim->d_fieldmask & QC_SPC_WARNS) + q->qi_bwarnlimit = newlim->d_spc_warns; + if (newlim->d_fieldmask & QC_INO_WARNS) + q->qi_iwarnlimit = newlim->d_ino_warns; + if (newlim->d_fieldmask & QC_RT_SPC_WARNS) + q->qi_rtbwarnlimit = newlim->d_rt_spc_warns; } else { /* * If the user is now over quota, start the timelimit. 
@@ -824,7 +823,7 @@ xfs_qm_scall_getquota( struct xfs_mount *mp, xfs_dqid_t id, uint type, - struct fs_disk_quota *dst) + struct qc_dqblk *dst) { struct xfs_dquot *dqp; int error; @@ -848,28 +847,25 @@ xfs_qm_scall_getquota( } memset(dst, 0, sizeof(*dst)); - dst->d_version = FS_DQUOT_VERSION; - dst->d_flags = xfs_qm_export_qtype_flags(dqp->q_core.d_flags); - dst->d_id = be32_to_cpu(dqp->q_core.d_id); - dst->d_blk_hardlimit = - XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit)); - dst->d_blk_softlimit = - XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_blk_softlimit)); + dst->d_spc_hardlimit = + XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit)); + dst->d_spc_softlimit = + XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_softlimit)); dst->d_ino_hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit); dst->d_ino_softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit); - dst->d_bcount = XFS_FSB_TO_BB(mp, dqp->q_res_bcount); - dst->d_icount = dqp->q_res_icount; - dst->d_btimer = be32_to_cpu(dqp->q_core.d_btimer); - dst->d_itimer = be32_to_cpu(dqp->q_core.d_itimer); - dst->d_iwarns = be16_to_cpu(dqp->q_core.d_iwarns); - dst->d_bwarns = be16_to_cpu(dqp->q_core.d_bwarns); - dst->d_rtb_hardlimit = - XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_rtb_hardlimit)); - dst->d_rtb_softlimit = - XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_rtb_softlimit)); - dst->d_rtbcount = XFS_FSB_TO_BB(mp, dqp->q_res_rtbcount); - dst->d_rtbtimer = be32_to_cpu(dqp->q_core.d_rtbtimer); - dst->d_rtbwarns = be16_to_cpu(dqp->q_core.d_rtbwarns); + dst->d_space = XFS_FSB_TO_B(mp, dqp->q_res_bcount); + dst->d_ino_count = dqp->q_res_icount; + dst->d_spc_timer = be32_to_cpu(dqp->q_core.d_btimer); + dst->d_ino_timer = be32_to_cpu(dqp->q_core.d_itimer); + dst->d_ino_warns = be16_to_cpu(dqp->q_core.d_iwarns); + dst->d_spc_warns = be16_to_cpu(dqp->q_core.d_bwarns); + dst->d_rt_spc_hardlimit = + XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_rtb_hardlimit)); + dst->d_rt_spc_softlimit = + XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_rtb_softlimit)); + dst->d_rt_space = XFS_FSB_TO_B(mp, dqp->q_res_rtbcount); + dst->d_rt_spc_timer = be32_to_cpu(dqp->q_core.d_rtbtimer); + dst->d_rt_spc_warns = be16_to_cpu(dqp->q_core.d_rtbwarns); /* * Internally, we don't reset all the timers when quota enforcement @@ -882,23 +878,23 @@ xfs_qm_scall_getquota( dqp->q_core.d_flags == XFS_DQ_GROUP) || (!XFS_IS_PQUOTA_ENFORCED(mp) && dqp->q_core.d_flags == XFS_DQ_PROJ)) { - dst->d_btimer = 0; - dst->d_itimer = 0; - dst->d_rtbtimer = 0; + dst->d_spc_timer = 0; + dst->d_ino_timer = 0; + dst->d_rt_spc_timer = 0; } #ifdef DEBUG - if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == FS_USER_QUOTA) || - (XFS_IS_GQUOTA_ENFORCED(mp) && dst->d_flags == FS_GROUP_QUOTA) || - (XFS_IS_PQUOTA_ENFORCED(mp) && dst->d_flags == FS_PROJ_QUOTA)) && - dst->d_id != 0) { - if ((dst->d_bcount > dst->d_blk_softlimit) && - (dst->d_blk_softlimit > 0)) { - ASSERT(dst->d_btimer != 0); + if (((XFS_IS_UQUOTA_ENFORCED(mp) && type == XFS_DQ_USER) || + (XFS_IS_GQUOTA_ENFORCED(mp) && type == XFS_DQ_GROUP) || + (XFS_IS_PQUOTA_ENFORCED(mp) && type == XFS_DQ_PROJ)) && + id != 0) { + if ((dst->d_space > dst->d_spc_softlimit) && + (dst->d_spc_softlimit > 0)) { + ASSERT(dst->d_spc_timer != 0); } - if ((dst->d_icount > dst->d_ino_softlimit) && + if ((dst->d_ino_count > dst->d_ino_softlimit) && (dst->d_ino_softlimit > 0)) { - ASSERT(dst->d_itimer != 0); + ASSERT(dst->d_ino_timer != 0); } } #endif @@ -907,26 +903,6 @@ out_put: return error; } -STATIC uint -xfs_qm_export_qtype_flags( - uint flags) -{ - /* 
- * Can't be more than one, or none. - */ - ASSERT((flags & (FS_PROJ_QUOTA | FS_USER_QUOTA)) != - (FS_PROJ_QUOTA | FS_USER_QUOTA)); - ASSERT((flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)) != - (FS_PROJ_QUOTA | FS_GROUP_QUOTA)); - ASSERT((flags & (FS_USER_QUOTA | FS_GROUP_QUOTA)) != - (FS_USER_QUOTA | FS_GROUP_QUOTA)); - ASSERT((flags & (FS_PROJ_QUOTA|FS_USER_QUOTA|FS_GROUP_QUOTA)) != 0); - - return (flags & XFS_DQ_USER) ? - FS_USER_QUOTA : (flags & XFS_DQ_PROJ) ? - FS_PROJ_QUOTA : FS_GROUP_QUOTA; -} - STATIC uint xfs_qm_export_flags( uint flags) diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index 7542bbeca6a1..801a84c1cdc3 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -131,7 +131,7 @@ STATIC int xfs_fs_get_dqblk( struct super_block *sb, struct kqid qid, - struct fs_disk_quota *fdq) + struct qc_dqblk *qdq) { struct xfs_mount *mp = XFS_M(sb); @@ -141,14 +141,14 @@ xfs_fs_get_dqblk( return -ESRCH; return xfs_qm_scall_getquota(mp, from_kqid(&init_user_ns, qid), - xfs_quota_type(qid.type), fdq); + xfs_quota_type(qid.type), qdq); } STATIC int xfs_fs_set_dqblk( struct super_block *sb, struct kqid qid, - struct fs_disk_quota *fdq) + struct qc_dqblk *qdq) { struct xfs_mount *mp = XFS_M(sb); @@ -160,7 +160,7 @@ xfs_fs_set_dqblk( return -ESRCH; return xfs_qm_scall_setqlim(mp, from_kqid(&init_user_ns, qid), - xfs_quota_type(qid.type), fdq); + xfs_quota_type(qid.type), qdq); } const struct quotactl_ops xfs_quotactl_operations = { diff --git a/include/linux/quota.h b/include/linux/quota.h index 50978b781a19..097d7eb2441e 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -321,6 +321,49 @@ struct dquot_operations { struct path; +/* Structure for communicating via ->get_dqblk() & ->set_dqblk() */ +struct qc_dqblk { + int d_fieldmask; /* mask of fields to change in ->set_dqblk() */ + u64 d_spc_hardlimit; /* absolute limit on used space */ + u64 d_spc_softlimit; /* preferred limit on used space */ + u64 d_ino_hardlimit; /* maximum # allocated inodes */ + u64 d_ino_softlimit; /* preferred inode limit */ + u64 d_space; /* Space owned by the user */ + u64 d_ino_count; /* # inodes owned by the user */ + s64 d_ino_timer; /* zero if within inode limits */ + /* if not, we refuse service */ + s64 d_spc_timer; /* similar to above; for space */ + int d_ino_warns; /* # warnings issued wrt num inodes */ + int d_spc_warns; /* # warnings issued wrt used space */ + u64 d_rt_spc_hardlimit; /* absolute limit on realtime space */ + u64 d_rt_spc_softlimit; /* preferred limit on RT space */ + u64 d_rt_space; /* realtime space owned */ + s64 d_rt_spc_timer; /* similar to above; for RT space */ + int d_rt_spc_warns; /* # warnings issued wrt RT space */ +}; + +/* Field specifiers for ->set_dqblk() in struct qc_dqblk */ +#define QC_INO_SOFT (1<<0) +#define QC_INO_HARD (1<<1) +#define QC_SPC_SOFT (1<<2) +#define QC_SPC_HARD (1<<3) +#define QC_RT_SPC_SOFT (1<<4) +#define QC_RT_SPC_HARD (1<<5) +#define QC_LIMIT_MASK (QC_INO_SOFT | QC_INO_HARD | QC_SPC_SOFT | QC_SPC_HARD | \ + QC_RT_SPC_SOFT | QC_RT_SPC_HARD) +#define QC_SPC_TIMER (1<<6) +#define QC_INO_TIMER (1<<7) +#define QC_RT_SPC_TIMER (1<<8) +#define QC_TIMER_MASK (QC_SPC_TIMER | QC_INO_TIMER | QC_RT_SPC_TIMER) +#define QC_SPC_WARNS (1<<9) +#define QC_INO_WARNS (1<<10) +#define QC_RT_SPC_WARNS (1<<11) +#define QC_WARNS_MASK (QC_SPC_WARNS | QC_INO_WARNS | QC_RT_SPC_WARNS) +#define QC_SPACE (1<<12) +#define QC_INO_COUNT (1<<13) +#define QC_RT_SPACE (1<<14) +#define QC_ACCT_MASK (QC_SPACE | QC_INO_COUNT | QC_RT_SPACE) + /* Operations 
handling requests from userspace */ struct quotactl_ops { int (*quota_on)(struct super_block *, int, int, struct path *); @@ -329,8 +372,8 @@ struct quotactl_ops { int (*quota_sync)(struct super_block *, int); int (*get_info)(struct super_block *, int, struct if_dqinfo *); int (*set_info)(struct super_block *, int, struct if_dqinfo *); - int (*get_dqblk)(struct super_block *, struct kqid, struct fs_disk_quota *); - int (*set_dqblk)(struct super_block *, struct kqid, struct fs_disk_quota *); + int (*get_dqblk)(struct super_block *, struct kqid, struct qc_dqblk *); + int (*set_dqblk)(struct super_block *, struct kqid, struct qc_dqblk *); int (*get_xstate)(struct super_block *, struct fs_quota_stat *); int (*set_xstate)(struct super_block *, unsigned int, int); int (*get_xstatev)(struct super_block *, struct fs_quota_statv *); diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index f23538a6e411..29e3455f7d41 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -98,9 +98,9 @@ int dquot_quota_sync(struct super_block *sb, int type); int dquot_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii); int dquot_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii); int dquot_get_dqblk(struct super_block *sb, struct kqid id, - struct fs_disk_quota *di); + struct qc_dqblk *di); int dquot_set_dqblk(struct super_block *sb, struct kqid id, - struct fs_disk_quota *di); + struct qc_dqblk *di); int __dquot_transfer(struct inode *inode, struct dquot **transfer_to); int dquot_transfer(struct inode *inode, struct iattr *iattr); -- cgit v1.2.3 From 3aa8793f751d4cfcaca886e75ab30dfb00cf1d88 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Fri, 28 Nov 2014 14:38:36 +0100 Subject: mmc: core: Initial support for MMC power sequences System on chip designs may require a specific MMC power sequence. To successfully detect an (e)MMC/SD/SDIO card, that power sequence must be followed while initializing the card. To be able to handle these SoC-specific power sequences, let's add an MMC power sequence interface. It provides the following functions to help the mmc core deal with these power sequences. mmc_pwrseq_alloc() - Invoked from mmc_of_parse(), to initialize data. mmc_pwrseq_pre_power_on() - Invoked at the beginning of mmc_power_up(). mmc_pwrseq_post_power_on() - Invoked at the end of mmc_power_up(). mmc_pwrseq_power_off() - Invoked from mmc_power_off(). mmc_pwrseq_free() - Invoked from mmc_free_host(), to free data. Each MMC power sequence provider is responsible for implementing a set of callbacks. These callbacks mirror the functions above. This patch adds the skeleton; following patches will extend the core of the MMC power sequence and add support for a specific simple MMC power sequence. Do note, since mmc_pwrseq_alloc() is invoked from mmc_of_parse(), host drivers need to make use of this API to enable support for MMC power sequences. Moreover, the MMC power sequence support depends on CONFIG_OF.
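As a sketch of how a provider is expected to plug into this interface (the callback names below are illustrative only; the first real provider, the simple MMC power sequence, lands in a later patch):

  /* Hypothetical provider; assumes the struct definitions from
   * drivers/mmc/core/pwrseq.h introduced below. */
  static void example_pwrseq_pre_power_on(struct mmc_host *host)
  {
      /* e.g. assert the card's reset GPIO before VDD is applied */
  }

  static void example_pwrseq_post_power_on(struct mmc_host *host)
  {
      /* e.g. release reset once power and clock are stable */
  }

  static struct mmc_pwrseq_ops example_pwrseq_ops = {
      .pre_power_on  = example_pwrseq_pre_power_on,
      .post_power_on = example_pwrseq_post_power_on,
      /* .power_off and .free may be left NULL; the core checks for that */
  };

  static struct mmc_pwrseq example_pwrseq = {
      .ops = &example_pwrseq_ops,
  };

A provider would make host->pwrseq point at its struct mmc_pwrseq, which is what a later, extended mmc_pwrseq_alloc() is meant to do based on device tree data.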
Signed-off-by: Ulf Hansson Tested-by: Javier Martinez Canillas Reviewed-by: Javier Martinez Canillas --- drivers/mmc/core/Makefile | 2 +- drivers/mmc/core/core.c | 7 +++++++ drivers/mmc/core/host.c | 4 +++- drivers/mmc/core/pwrseq.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++ drivers/mmc/core/pwrseq.h | 40 +++++++++++++++++++++++++++++++++++++ include/linux/mmc/host.h | 2 ++ 6 files changed, 103 insertions(+), 2 deletions(-) create mode 100644 drivers/mmc/core/pwrseq.c create mode 100644 drivers/mmc/core/pwrseq.h (limited to 'include/linux') diff --git a/drivers/mmc/core/Makefile b/drivers/mmc/core/Makefile index 38ed210ce2f3..ccdd35f78256 100644 --- a/drivers/mmc/core/Makefile +++ b/drivers/mmc/core/Makefile @@ -8,5 +8,5 @@ mmc_core-y := core.o bus.o host.o \ sdio.o sdio_ops.o sdio_bus.o \ sdio_cis.o sdio_io.o sdio_irq.o \ quirks.o slot-gpio.o - +mmc_core-$(CONFIG_OF) += pwrseq.o mmc_core-$(CONFIG_DEBUG_FS) += debugfs.o diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index d5c176e87951..1be7055548cb 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -40,6 +40,7 @@ #include "bus.h" #include "host.h" #include "sdio_bus.h" +#include "pwrseq.h" #include "mmc_ops.h" #include "sd_ops.h" @@ -1615,6 +1616,8 @@ void mmc_power_up(struct mmc_host *host, u32 ocr) mmc_host_clk_hold(host); + mmc_pwrseq_pre_power_on(host); + host->ios.vdd = fls(ocr) - 1; host->ios.power_mode = MMC_POWER_UP; /* Set initial state and call mmc_set_ios */ @@ -1645,6 +1648,8 @@ void mmc_power_up(struct mmc_host *host, u32 ocr) */ mmc_delay(10); + mmc_pwrseq_post_power_on(host); + mmc_host_clk_release(host); } @@ -1655,6 +1660,8 @@ void mmc_power_off(struct mmc_host *host) mmc_host_clk_hold(host); + mmc_pwrseq_power_off(host); + host->ios.clock = 0; host->ios.vdd = 0; diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index 07636449b4de..8be0df758e68 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -30,6 +30,7 @@ #include "core.h" #include "host.h" #include "slot-gpio.h" +#include "pwrseq.h" #define cls_dev_to_mmc_host(d) container_of(d, struct mmc_host, class_dev) @@ -448,7 +449,7 @@ int mmc_of_parse(struct mmc_host *host) host->dsr_req = 0; } - return 0; + return mmc_pwrseq_alloc(host); } EXPORT_SYMBOL(mmc_of_parse); @@ -588,6 +589,7 @@ EXPORT_SYMBOL(mmc_remove_host); */ void mmc_free_host(struct mmc_host *host) { + mmc_pwrseq_free(host); put_device(&host->class_dev); } diff --git a/drivers/mmc/core/pwrseq.c b/drivers/mmc/core/pwrseq.c new file mode 100644 index 000000000000..bd087723929f --- /dev/null +++ b/drivers/mmc/core/pwrseq.c @@ -0,0 +1,50 @@ +/* + * Copyright (C) 2014 Linaro Ltd + * + * Author: Ulf Hansson + * + * License terms: GNU General Public License (GPL) version 2 + * + * MMC power sequence management + */ +#include + +#include "pwrseq.h" + + +int mmc_pwrseq_alloc(struct mmc_host *host) +{ + return 0; +} + +void mmc_pwrseq_pre_power_on(struct mmc_host *host) +{ + struct mmc_pwrseq *pwrseq = host->pwrseq; + + if (pwrseq && pwrseq->ops && pwrseq->ops->pre_power_on) + pwrseq->ops->pre_power_on(host); +} + +void mmc_pwrseq_post_power_on(struct mmc_host *host) +{ + struct mmc_pwrseq *pwrseq = host->pwrseq; + + if (pwrseq && pwrseq->ops && pwrseq->ops->post_power_on) + pwrseq->ops->post_power_on(host); +} + +void mmc_pwrseq_power_off(struct mmc_host *host) +{ + struct mmc_pwrseq *pwrseq = host->pwrseq; + + if (pwrseq && pwrseq->ops && pwrseq->ops->power_off) + pwrseq->ops->power_off(host); +} + +void mmc_pwrseq_free(struct mmc_host *host) 
+{ + struct mmc_pwrseq *pwrseq = host->pwrseq; + + if (pwrseq && pwrseq->ops && pwrseq->ops->free) + pwrseq->ops->free(host); +} diff --git a/drivers/mmc/core/pwrseq.h b/drivers/mmc/core/pwrseq.h new file mode 100644 index 000000000000..12aaf2b4745b --- /dev/null +++ b/drivers/mmc/core/pwrseq.h @@ -0,0 +1,40 @@ +/* + * Copyright (C) 2014 Linaro Ltd + * + * Author: Ulf Hansson + * + * License terms: GNU General Public License (GPL) version 2 + */ +#ifndef _MMC_CORE_PWRSEQ_H +#define _MMC_CORE_PWRSEQ_H + +struct mmc_pwrseq_ops { + void (*pre_power_on)(struct mmc_host *host); + void (*post_power_on)(struct mmc_host *host); + void (*power_off)(struct mmc_host *host); + void (*free)(struct mmc_host *host); +}; + +struct mmc_pwrseq { + struct mmc_pwrseq_ops *ops; +}; + +#ifdef CONFIG_OF + +int mmc_pwrseq_alloc(struct mmc_host *host); +void mmc_pwrseq_pre_power_on(struct mmc_host *host); +void mmc_pwrseq_post_power_on(struct mmc_host *host); +void mmc_pwrseq_power_off(struct mmc_host *host); +void mmc_pwrseq_free(struct mmc_host *host); + +#else + +static inline int mmc_pwrseq_alloc(struct mmc_host *host) { return 0; } +static inline void mmc_pwrseq_pre_power_on(struct mmc_host *host) {} +static inline void mmc_pwrseq_post_power_on(struct mmc_host *host) {} +static inline void mmc_pwrseq_power_off(struct mmc_host *host) {} +static inline void mmc_pwrseq_free(struct mmc_host *host) {} + +#endif + +#endif diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index b6bf718c3498..0c8cbe5d1550 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -195,6 +195,7 @@ struct mmc_context_info { }; struct regulator; +struct mmc_pwrseq; struct mmc_supply { struct regulator *vmmc; /* Card power supply */ @@ -206,6 +207,7 @@ struct mmc_host { struct device class_dev; int index; const struct mmc_host_ops *ops; + struct mmc_pwrseq *pwrseq; unsigned int f_min; unsigned int f_max; unsigned int f_init; -- cgit v1.2.3 From c3c87e770458aa004bd7ed3f29945ff436fd6511 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 23 Jan 2015 11:19:48 +0100 Subject: perf: Tighten (and fix) the grouping condition The fix from 9fc81d87420d ("perf: Fix events installation during moving group") was incomplete in that it failed to recognise that creating a group with events for different CPUs is semantically broken -- they cannot be co-scheduled. Furthermore, it leads to real breakage where, when we create an event for CPU Y and then migrate it to form a group on CPU X, the code gets confused where the counter is programmed -- triggered in practice as well by me via the perf fuzzer. Fix this by tightening the rules for creating groups. Only allow grouping of counters that can be co-scheduled in the same context. This means for the same task and/or the same cpu. 
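Seen from userspace, the tightened rule means the second perf_event_open() below is now rejected (a sketch only; per-CPU events usually require root or a relaxed perf_event_paranoid setting):

  #include <linux/perf_event.h>
  #include <sys/syscall.h>
  #include <unistd.h>
  #include <string.h>
  #include <stdio.h>

  static int perf_open(struct perf_event_attr *attr, pid_t pid, int cpu,
                       int group_fd)
  {
      return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, 0);
  }

  int main(void)
  {
      struct perf_event_attr attr;

      memset(&attr, 0, sizeof(attr));
      attr.size = sizeof(attr);
      attr.type = PERF_TYPE_HARDWARE;
      attr.config = PERF_COUNT_HW_CPU_CYCLES;

      int leader = perf_open(&attr, -1, 0, -1);  /* counts on CPU 0 */

      /* A CPU 1 counter under a CPU 0 leader can never be
       * co-scheduled, so it now fails instead of misprogramming: */
      int bad = perf_open(&attr, -1, 1, leader);

      printf("leader fd=%d, cross-cpu sibling fd=%d\n", leader, bad);
      return 0;
  }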
Fixes: 9fc81d87420d ("perf: Fix events installation during moving group") Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20150123125834.090683288@infradead.org Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 6 ------ kernel/events/core.c | 15 +++++++++++++-- 2 files changed, 13 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 4f7a61ca4b39..664de5a4ec46 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -450,11 +450,6 @@ struct perf_event { #endif /* CONFIG_PERF_EVENTS */ }; -enum perf_event_context_type { - task_context, - cpu_context, -}; - /** * struct perf_event_context - event context structure * @@ -462,7 +457,6 @@ enum perf_event_context_type { */ struct perf_event_context { struct pmu *pmu; - enum perf_event_context_type type; /* * Protect the states of the events in the list, * nr_active, and the list: diff --git a/kernel/events/core.c b/kernel/events/core.c index 882f835a0d85..19efcf13375a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6776,7 +6776,6 @@ skip_type: __perf_event_init_context(&cpuctx->ctx); lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); - cpuctx->ctx.type = cpu_context; cpuctx->ctx.pmu = pmu; __perf_cpu_hrtimer_init(cpuctx, cpu); @@ -7420,7 +7419,19 @@ SYSCALL_DEFINE5(perf_event_open, * task or CPU context: */ if (move_group) { - if (group_leader->ctx->type != ctx->type) + /* + * Make sure we're both on the same task, or both + * per-cpu events. + */ + if (group_leader->ctx->task != ctx->task) + goto err_context; + + /* + * Make sure we're both events for the same CPU; + * grouping events for different CPUs is broken, since + * you can never concurrently schedule them anyhow. + */ + if (group_leader->cpu != event->cpu) goto err_context; } else { if (group_leader->ctx != ctx) -- cgit v1.2.3 From 667a0a06c99d5291433b869ed35dabdd95ba1453 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Thu, 18 Dec 2014 14:48:15 +0000 Subject: mm: provide a find_special_page vma operation The optional find_special_page VMA operation is used to look up the pages backing a VMA. This is useful in cases where the normal mechanisms for finding the page don't work. This is only called if the PTE is special. One use case is a Xen PV guest mapping foreign pages into userspace. In a Xen PV guest, the PTEs contain MFNs, so get_user_pages() (for example) must do an MFN to PFN (M2P) lookup before it can get the page. For foreign pages (those owned by another guest) the M2P lookup returns the PFN as seen by the foreign guest (which would be completely the wrong page for the local guest). This cannot be fixed by improving the M2P lookup, since one MFN may be mapped onto two or more pages, so getting the right page is impossible given just the MFN.
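A hypothetical in-kernel user of the hook would look like this (sketch only; example_priv and its pages[] array are assumptions, not part of this patch):

  static struct page *example_find_special_page(struct vm_area_struct *vma,
                                                unsigned long addr)
  {
      /* Driver-private state assumed to record the struct page for
       * every slot that was mapped at mmap() time. */
      struct example_priv *priv = vma->vm_private_data;
      unsigned long idx = (addr - vma->vm_start) >> PAGE_SHIFT;

      return priv->pages[idx];
  }

  static const struct vm_operations_struct example_vm_ops = {
      .find_special_page = example_find_special_page,
  };

vm_normal_page() then returns whatever the driver hands back, instead of trying to derive the page from the (special) PTE.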
Signed-off-by: David Vrabel Acked-by: Andrew Morton --- include/linux/mm.h | 8 ++++++++ mm/memory.c | 2 ++ 2 files changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 80fc92a49649..9269af7349fe 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -290,6 +290,14 @@ struct vm_operations_struct { /* called by sys_remap_file_pages() to populate non-linear mapping */ int (*remap_pages)(struct vm_area_struct *vma, unsigned long addr, unsigned long size, pgoff_t pgoff); + + /* + * Called by vm_normal_page() for special PTEs to find the + * page for @addr. This is useful if the default behavior + * (using pte_page()) would not find the correct page. + */ + struct page *(*find_special_page)(struct vm_area_struct *vma, + unsigned long addr); }; struct mmu_gather; diff --git a/mm/memory.c b/mm/memory.c index 54f3a9b00956..dc2e01a315e2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -754,6 +754,8 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, if (HAVE_PTE_SPECIAL) { if (likely(!pte_special(pte))) goto check_pfn; + if (vma->vm_ops && vma->vm_ops->find_special_page) + return vma->vm_ops->find_special_page(vma, addr); if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) return NULL; if (!is_zero_pfn(pfn)) -- cgit v1.2.3 From d8ac3dd41aea245f65465449efc35dd3ac71e91d Mon Sep 17 00:00:00 2001 From: Jennifer Herbert Date: Mon, 5 Jan 2015 13:24:09 +0000 Subject: mm: add 'foreign' alias for the 'pinned' page flag The foreign page flag will be used by Xen guests to mark pages that have grant mappings of frames from other (foreign) guests. The foreign flag is an alias for the existing (Xen-specific) pinned flag. This is safe because pinned is only used on pages used for page tables, and these cannot also be foreign. Signed-off-by: Jennifer Herbert Acked-by: Andrew Morton Signed-off-by: David Vrabel --- include/linux/page-flags.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index e1f5fcd79792..5ed7bdaf22d5 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -121,8 +121,12 @@ enum pageflags { PG_fscache = PG_private_2, /* page backed by cache */ /* XEN */ + /* Pinned in Xen as a read-only pagetable page. */ PG_pinned = PG_owner_priv_1, + /* Pinned as part of domain save (see xen_mm_pin_all()). */ PG_savepinned = PG_dirty, + /* Has a grant mapping of another (foreign) domain's page. */ + PG_foreign = PG_owner_priv_1, /* SLOB */ PG_slob_free = PG_private, @@ -215,6 +219,7 @@ __PAGEFLAG(Slab, slab) PAGEFLAG(Checked, checked) /* Used by some filesystems */ PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned) /* Xen */ PAGEFLAG(SavePinned, savepinned); /* Xen */ +PAGEFLAG(Foreign, foreign); /* Xen */ PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved) PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked) __SETPAGEFLAG(SwapBacked, swapbacked) -- cgit v1.2.3 From fafdd2bf2638157670f28462b641150d16dbaeca Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Mon, 24 Nov 2014 22:30:09 +0100 Subject: UBI: Implement UBI_METAONLY UBI_METAONLY is a new open mode for UBI volumes; it indicates that only meta-data is being changed. Meta-data, in terms of UBI volumes, means data which is stored in the UBI volume table but not on the volume itself.
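A kernel client would use the new mode roughly as follows (a minimal sketch; ubi_num and vol_id stand in for real values, and error handling is trimmed):

  struct ubi_volume_desc *desc;

  desc = ubi_open_volume(ubi_num, vol_id, UBI_METAONLY);
  if (IS_ERR(desc))
      return PTR_ERR(desc);

  /* ... update only state kept in the volume table; the LEBs that
   * hold the volume's data are never touched ... */

  ubi_close_volume(desc);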
While it does not interfere with UBI_READONLY and UBI_READWRITE it is not allowed to use UBI_METAONLY together with UBI_EXCLUSIVE. Cc: Ezequiel Garcia Cc: Andrew Murray Signed-off-by: Richard Weinberger Tested-by: Guido Martínez Reviewed-by: Guido Martínez Tested-by: Christoph Fritz Tested-by: Andrew Murray --- drivers/mtd/ubi/cdev.c | 8 +++++--- drivers/mtd/ubi/kapi.c | 15 +++++++++++++-- drivers/mtd/ubi/ubi.h | 8 ++++++-- include/linux/mtd/ubi.h | 5 ++++- 4 files changed, 28 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/ubi/cdev.c b/drivers/mtd/ubi/cdev.c index 3410ea8109f8..f5c715c32dcd 100644 --- a/drivers/mtd/ubi/cdev.c +++ b/drivers/mtd/ubi/cdev.c @@ -61,13 +61,13 @@ static int get_exclusive(struct ubi_device *ubi, struct ubi_volume_desc *desc) struct ubi_volume *vol = desc->vol; spin_lock(&vol->ubi->volumes_lock); - users = vol->readers + vol->writers + vol->exclusive; + users = vol->readers + vol->writers + vol->exclusive + vol->metaonly; ubi_assert(users > 0); if (users > 1) { ubi_err(ubi, "%d users for volume %d", users, vol->vol_id); err = -EBUSY; } else { - vol->readers = vol->writers = 0; + vol->readers = vol->writers = vol->metaonly = 0; vol->exclusive = 1; err = desc->mode; desc->mode = UBI_EXCLUSIVE; @@ -87,13 +87,15 @@ static void revoke_exclusive(struct ubi_volume_desc *desc, int mode) struct ubi_volume *vol = desc->vol; spin_lock(&vol->ubi->volumes_lock); - ubi_assert(vol->readers == 0 && vol->writers == 0); + ubi_assert(vol->readers == 0 && vol->writers == 0 && vol->metaonly == 0); ubi_assert(vol->exclusive == 1 && desc->mode == UBI_EXCLUSIVE); vol->exclusive = 0; if (mode == UBI_READONLY) vol->readers = 1; else if (mode == UBI_READWRITE) vol->writers = 1; + else if (mode == UBI_METAONLY) + vol->metaonly = 1; else vol->exclusive = 1; spin_unlock(&vol->ubi->volumes_lock); diff --git a/drivers/mtd/ubi/kapi.c b/drivers/mtd/ubi/kapi.c index f3bab669f6bb..589c423fac2d 100644 --- a/drivers/mtd/ubi/kapi.c +++ b/drivers/mtd/ubi/kapi.c @@ -137,7 +137,7 @@ struct ubi_volume_desc *ubi_open_volume(int ubi_num, int vol_id, int mode) return ERR_PTR(-EINVAL); if (mode != UBI_READONLY && mode != UBI_READWRITE && - mode != UBI_EXCLUSIVE) + mode != UBI_EXCLUSIVE && mode != UBI_METAONLY) return ERR_PTR(-EINVAL); /* @@ -182,10 +182,17 @@ struct ubi_volume_desc *ubi_open_volume(int ubi_num, int vol_id, int mode) break; case UBI_EXCLUSIVE: - if (vol->exclusive || vol->writers || vol->readers) + if (vol->exclusive || vol->writers || vol->readers || + vol->metaonly) goto out_unlock; vol->exclusive = 1; break; + + case UBI_METAONLY: + if (vol->metaonly || vol->exclusive) + goto out_unlock; + vol->metaonly = 1; + break; } get_device(&vol->dev); vol->ref_count += 1; @@ -343,6 +350,10 @@ void ubi_close_volume(struct ubi_volume_desc *desc) break; case UBI_EXCLUSIVE: vol->exclusive = 0; + break; + case UBI_METAONLY: + vol->metaonly = 0; + break; } vol->ref_count -= 1; spin_unlock(&ubi->volumes_lock); diff --git a/drivers/mtd/ubi/ubi.h b/drivers/mtd/ubi/ubi.h index f80ffaba9058..20cabc060b61 100644 --- a/drivers/mtd/ubi/ubi.h +++ b/drivers/mtd/ubi/ubi.h @@ -261,6 +261,7 @@ struct ubi_fm_pool { * @readers: number of users holding this volume in read-only mode * @writers: number of users holding this volume in read-write mode * @exclusive: whether somebody holds this volume in exclusive mode + * @metaonly: whether somebody is altering only meta data of this volume * * @reserved_pebs: how many physical eraseblocks are reserved for this volume * @vol_type: volume 
type (%UBI_DYNAMIC_VOLUME or %UBI_STATIC_VOLUME) @@ -309,6 +310,7 @@ struct ubi_volume { int readers; int writers; int exclusive; + int metaonly; int reserved_pebs; int vol_type; @@ -339,7 +341,8 @@ struct ubi_volume { /** * struct ubi_volume_desc - UBI volume descriptor returned when it is opened. * @vol: reference to the corresponding volume description object - * @mode: open mode (%UBI_READONLY, %UBI_READWRITE, or %UBI_EXCLUSIVE) + * @mode: open mode (%UBI_READONLY, %UBI_READWRITE, %UBI_EXCLUSIVE + * or %UBI_METAONLY) */ struct ubi_volume_desc { struct ubi_volume *vol; @@ -390,7 +393,8 @@ struct ubi_debug_info { * @volumes_lock: protects @volumes, @rsvd_pebs, @avail_pebs, beb_rsvd_pebs, * @beb_rsvd_level, @bad_peb_count, @good_peb_count, @vol_count, * @vol->readers, @vol->writers, @vol->exclusive, - * @vol->ref_count, @vol->mapping and @vol->eba_tbl. + * @vol->metaonly, @vol->ref_count, @vol->mapping and + * @vol->eba_tbl. * @ref_count: count of references on the UBI device * @image_seq: image sequence number recorded on EC headers * diff --git a/include/linux/mtd/ubi.h b/include/linux/mtd/ubi.h index c3918a0684fe..8fa2753316e8 100644 --- a/include/linux/mtd/ubi.h +++ b/include/linux/mtd/ubi.h @@ -34,11 +34,14 @@ * UBI_READONLY: read-only mode * UBI_READWRITE: read-write mode * UBI_EXCLUSIVE: exclusive mode + * UBI_METAONLY: modify only the volume meta-data, + * i.e. the data stored in the volume table, but not in any of volume LEBs. */ enum { UBI_READONLY = 1, UBI_READWRITE, - UBI_EXCLUSIVE + UBI_EXCLUSIVE, + UBI_METAONLY }; /** -- cgit v1.2.3 From 9ff08979e17423f0f691c1d76f35dfec72a5e459 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Sat, 10 Jan 2015 22:52:13 +0100 Subject: UBI: Add initial support for scatter gather Adds a new set of functions to deal with scatter gather. ubi_eba_read_leb_sg() will read from a LEB into a scatter gather list. The new data structure struct ubi_sgl will be used within UBI to hold the scatter gather list itself and metadata to have a cursor within the list. Signed-off-by: Richard Weinberger Tested-by: Ezequiel Garcia Reviewed-by: Ezequiel Garcia --- drivers/mtd/ubi/eba.c | 55 ++++++++++++++++++++++++++++ drivers/mtd/ubi/kapi.c | 95 +++++++++++++++++++++++++++++++++++++++++-------- drivers/mtd/ubi/ubi.h | 3 ++ include/linux/mtd/ubi.h | 48 +++++++++++++++++++++++++ 4 files changed, 186 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/ubi/eba.c b/drivers/mtd/ubi/eba.c index a40020cf0923..f1db73da7975 100644 --- a/drivers/mtd/ubi/eba.c +++ b/drivers/mtd/ubi/eba.c @@ -479,6 +479,61 @@ out_unlock: return err; } +/** + * ubi_eba_read_leb_sg - read data into a scatter gather list. + * @ubi: UBI device description object + * @vol: volume description object + * @lnum: logical eraseblock number + * @sgl: UBI scatter gather list to store the read data + * @offset: offset from where to read + * @len: how many bytes to read + * @check: data CRC check flag + * + * This function works exactly like ubi_eba_read_leb(). But instead of + * storing the read data into a buffer it writes to an UBI scatter gather + * list. 
+ */ +int ubi_eba_read_leb_sg(struct ubi_device *ubi, struct ubi_volume *vol, + struct ubi_sgl *sgl, int lnum, int offset, int len, + int check) +{ + int to_read; + int ret; + struct scatterlist *sg; + + for (;;) { + ubi_assert(sgl->list_pos < UBI_MAX_SG_COUNT); + sg = &sgl->sg[sgl->list_pos]; + if (len < sg->length - sgl->page_pos) + to_read = len; + else + to_read = sg->length - sgl->page_pos; + + ret = ubi_eba_read_leb(ubi, vol, lnum, + sg_virt(sg) + sgl->page_pos, offset, + to_read, check); + if (ret < 0) + return ret; + + offset += to_read; + len -= to_read; + if (!len) { + sgl->page_pos += to_read; + if (sgl->page_pos == sg->length) { + sgl->list_pos++; + sgl->page_pos = 0; + } + + break; + } + + sgl->list_pos++; + sgl->page_pos = 0; + } + + return ret; +} + /** * recover_peb - recover from write failure. * @ubi: UBI device description object diff --git a/drivers/mtd/ubi/kapi.c b/drivers/mtd/ubi/kapi.c index 589c423fac2d..478e00cf2d9e 100644 --- a/drivers/mtd/ubi/kapi.c +++ b/drivers/mtd/ubi/kapi.c @@ -365,6 +365,43 @@ void ubi_close_volume(struct ubi_volume_desc *desc) } EXPORT_SYMBOL_GPL(ubi_close_volume); +/** + * leb_read_sanity_check - does sanity checks on read requests. + * @desc: volume descriptor + * @lnum: logical eraseblock number to read from + * @offset: offset within the logical eraseblock to read from + * @len: how many bytes to read + * + * This function is used by ubi_leb_read() and ubi_leb_read_sg() + * to perform sanity checks. + */ +static int leb_read_sanity_check(struct ubi_volume_desc *desc, int lnum, + int offset, int len) +{ + struct ubi_volume *vol = desc->vol; + struct ubi_device *ubi = vol->ubi; + int vol_id = vol->vol_id; + + if (vol_id < 0 || vol_id >= ubi->vtbl_slots || lnum < 0 || + lnum >= vol->used_ebs || offset < 0 || len < 0 || + offset + len > vol->usable_leb_size) + return -EINVAL; + + if (vol->vol_type == UBI_STATIC_VOLUME) { + if (vol->used_ebs == 0) + /* Empty static UBI volume */ + return 0; + if (lnum == vol->used_ebs - 1 && + offset + len > vol->last_eb_bytes) + return -EINVAL; + } + + if (vol->upd_marker) + return -EBADF; + + return 0; +} + /** * ubi_leb_read - read data. * @desc: volume descriptor @@ -401,22 +438,10 @@ int ubi_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset, dbg_gen("read %d bytes from LEB %d:%d:%d", len, vol_id, lnum, offset); - if (vol_id < 0 || vol_id >= ubi->vtbl_slots || lnum < 0 || - lnum >= vol->used_ebs || offset < 0 || len < 0 || - offset + len > vol->usable_leb_size) - return -EINVAL; - - if (vol->vol_type == UBI_STATIC_VOLUME) { - if (vol->used_ebs == 0) - /* Empty static UBI volume */ - return 0; - if (lnum == vol->used_ebs - 1 && - offset + len > vol->last_eb_bytes) - return -EINVAL; - } + err = leb_read_sanity_check(desc, lnum, offset, len); + if (err < 0) + return err; - if (vol->upd_marker) - return -EBADF; if (len == 0) return 0; @@ -430,6 +455,46 @@ int ubi_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset, } EXPORT_SYMBOL_GPL(ubi_leb_read); + +/** + * ubi_leb_read_sg - read data into a scatter gather list. + * @desc: volume descriptor + * @lnum: logical eraseblock number to read from + * @buf: buffer where to store the read data + * @offset: offset within the logical eraseblock to read from + * @len: how many bytes to read + * @check: whether UBI has to check the read data's CRC or not. + * + * This function works exactly like ubi_leb_read_sg(). But instead of + * storing the read data into a buffer it writes to an UBI scatter gather + * list. 
+ */ +int ubi_leb_read_sg(struct ubi_volume_desc *desc, int lnum, struct ubi_sgl *sgl, + int offset, int len, int check) +{ + struct ubi_volume *vol = desc->vol; + struct ubi_device *ubi = vol->ubi; + int err, vol_id = vol->vol_id; + + dbg_gen("read %d bytes from LEB %d:%d:%d", len, vol_id, lnum, offset); + + err = leb_read_sanity_check(desc, lnum, offset, len); + if (err < 0) + return err; + + if (len == 0) + return 0; + + err = ubi_eba_read_leb_sg(ubi, vol, sgl, lnum, offset, len, check); + if (err && mtd_is_eccerr(err) && vol->vol_type == UBI_STATIC_VOLUME) { + ubi_warn(ubi, "mark volume %d as corrupted", vol_id); + vol->corrupted = 1; + } + + return err; +} +EXPORT_SYMBOL_GPL(ubi_leb_read_sg); + /** * ubi_leb_write - write data. * @desc: volume descriptor diff --git a/drivers/mtd/ubi/ubi.h b/drivers/mtd/ubi/ubi.h index 20cabc060b61..d94b81fa2306 100644 --- a/drivers/mtd/ubi/ubi.h +++ b/drivers/mtd/ubi/ubi.h @@ -795,6 +795,9 @@ int ubi_eba_unmap_leb(struct ubi_device *ubi, struct ubi_volume *vol, int lnum); int ubi_eba_read_leb(struct ubi_device *ubi, struct ubi_volume *vol, int lnum, void *buf, int offset, int len, int check); +int ubi_eba_read_leb_sg(struct ubi_device *ubi, struct ubi_volume *vol, + struct ubi_sgl *sgl, int lnum, int offset, int len, + int check); int ubi_eba_write_leb(struct ubi_device *ubi, struct ubi_volume *vol, int lnum, const void *buf, int offset, int len); int ubi_eba_write_leb_st(struct ubi_device *ubi, struct ubi_volume *vol, diff --git a/include/linux/mtd/ubi.h b/include/linux/mtd/ubi.h index 8fa2753316e8..1e271cb559cd 100644 --- a/include/linux/mtd/ubi.h +++ b/include/linux/mtd/ubi.h @@ -23,11 +23,18 @@ #include #include +#include #include /* All volumes/LEBs */ #define UBI_ALL -1 +/* + * Maximum number of scatter gather list entries, + * we use only 64 to have a lower memory footprint. + */ +#define UBI_MAX_SG_COUNT 64 + /* * enum ubi_open_mode - UBI volume open mode constants. * @@ -118,6 +125,35 @@ struct ubi_volume_info { dev_t cdev; }; +/** + * struct ubi_sgl - UBI scatter gather list data structure. + * @list_pos: current position in @sg[] + * @page_pos: current position in @sg[@list_pos] + * @sg: the scatter gather list itself + * + * ubi_sgl is a wrapper around a scatter list which keeps track of the + * current position in the list and the current list item such that + * it can be used across multiple ubi_leb_read_sg() calls. + */ +struct ubi_sgl { + int list_pos; + int page_pos; + struct scatterlist sg[UBI_MAX_SG_COUNT]; +}; + +/** + * ubi_sgl_init - initialize an UBI scatter gather list data structure. + * @usgl: the UBI scatter gather struct itself + * + * Please note that you still have to use sg_init_table() or any adequate + * function to initialize the underlying struct scatterlist. + */ +static inline void ubi_sgl_init(struct ubi_sgl *usgl) +{ + usgl->list_pos = 0; + usgl->page_pos = 0; +} + /** * struct ubi_device_info - UBI device description data structure.
* @ubi_num: ubi device number @@ -213,6 +249,8 @@ int ubi_unregister_volume_notifier(struct notifier_block *nb); void ubi_close_volume(struct ubi_volume_desc *desc); int ubi_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset, int len, int check); +int ubi_leb_read_sg(struct ubi_volume_desc *desc, int lnum, struct ubi_sgl *sgl, + int offset, int len, int check); int ubi_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf, int offset, int len); int ubi_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf, @@ -233,4 +271,14 @@ static inline int ubi_read(struct ubi_volume_desc *desc, int lnum, char *buf, { return ubi_leb_read(desc, lnum, buf, offset, len, 0); } + +/* + * This function is the same as the 'ubi_leb_read_sg()' function, but it does + * not provide the checking capability. + */ +static inline int ubi_read_sg(struct ubi_volume_desc *desc, int lnum, + struct ubi_sgl *sgl, int offset, int len) +{ + return ubi_leb_read_sg(desc, lnum, sgl, offset, len, 0); +} #endif /* !__LINUX_UBI_H__ */ -- cgit v1.2.3 From 6ea22486ba46bcb665de36514094d74575cd1330 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Wed, 28 Jan 2015 12:48:53 +0000 Subject: tracing: Add array printing helper If a trace event contains an array, there is currently no standard way to format this for text output. Drivers are currently hacking around this by a) local hacks that use the trace_seq functionality directly, or b) just not printing that information. For fixed size arrays, formatting of the elements can be open-coded, but this gets cumbersome for arrays of non-trivial size. These approaches result in non-standard content of the event format description delivered to userspace, so userland tools need to be taught to understand and parse each array printing method individually. This patch implements a __print_array() helper that tracepoint implementations can use instead of reinventing it. A simple C-style syntax is used to delimit the array and its elements {like,this}. So that the helpers can be used with large static arrays as well as dynamic arrays, they take a pointer and an element count; for dynamic arrays the pointer can come from __get_dynamic_array().
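To make the interface concrete, here is a minimal sketch (not taken from this patch; the event name and field are hypothetical) of a tracepoint that records a fixed-size u32 array and prints it with the new helper:

	TRACE_EVENT(foo_regs,
		TP_PROTO(const u32 *regs),
		TP_ARGS(regs),
		TP_STRUCT__entry(
			__array(u32, regs, 4)	/* hypothetical field */
		),
		TP_fast_assign(
			memcpy(__entry->regs, regs, 4 * sizeof(u32));
		),
		TP_printk("regs=%s",
			  __print_array(__entry->regs, 4, sizeof(u32)))
	);

A dynamic array would pass its recorded byte length divided by the element size as the count, e.g. __print_array(__get_dynamic_array(buf), __get_dynamic_array_len(buf) / sizeof(u32), sizeof(u32)).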
Link: http://lkml.kernel.org/r/1422449335-8289-2-git-send-email-javi.merino@arm.com Cc: Ingo Molnar Signed-off-by: Dave Martin Signed-off-by: Javi Merino Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 4 ++++ include/trace/ftrace.h | 9 +++++++++ kernel/trace/trace_output.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 0bebb5c348b8..5aa4a9269547 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -44,6 +44,10 @@ const char *ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, const char *ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int len); +const char *ftrace_print_array_seq(struct trace_seq *p, + const void *buf, int count, + size_t el_size); + struct trace_iterator; struct trace_event; diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 139b5067345b..304901fc5f34 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -263,6 +263,14 @@ #undef __print_hex #define __print_hex(buf, buf_len) ftrace_print_hex_seq(p, buf, buf_len) +#undef __print_array +#define __print_array(array, count, el_size) \ + ({ \ + BUILD_BUG_ON(el_size != 1 && el_size != 2 && \ + el_size != 4 && el_size != 8); \ + ftrace_print_array_seq(p, array, count, el_size); \ + }) + #undef DECLARE_EVENT_CLASS #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ static notrace enum print_line_t \ @@ -674,6 +682,7 @@ static inline void ftrace_test_probe_##call(void) \ #undef __get_dynamic_array_len #undef __get_str #undef __get_bitmask +#undef __print_array #undef TP_printk #define TP_printk(fmt, args...) "\"" fmt "\", " __stringify(args) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index b77b9a697619..692bf7184c8c 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -177,6 +177,50 @@ ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) } EXPORT_SYMBOL(ftrace_print_hex_seq); +const char * +ftrace_print_array_seq(struct trace_seq *p, const void *buf, int count, + size_t el_size) +{ + const char *ret = trace_seq_buffer_ptr(p); + const char *prefix = ""; + void *ptr = (void *)buf; + size_t buf_len = count * el_size; + + trace_seq_putc(p, '{'); + + while (ptr < buf + buf_len) { + switch (el_size) { + case 1: + trace_seq_printf(p, "%s0x%x", prefix, + *(u8 *)ptr); + break; + case 2: + trace_seq_printf(p, "%s0x%x", prefix, + *(u16 *)ptr); + break; + case 4: + trace_seq_printf(p, "%s0x%x", prefix, + *(u32 *)ptr); + break; + case 8: + trace_seq_printf(p, "%s0x%llx", prefix, + *(u64 *)ptr); + break; + default: + trace_seq_printf(p, "BAD SIZE:%zu 0x%x", el_size, + *(u8 *)ptr); + el_size = 1; + } + prefix = ","; + ptr += el_size; + } + + trace_seq_putc(p, '}'); + trace_seq_putc(p, 0); + + return ret; +} +EXPORT_SYMBOL(ftrace_print_array_seq); + int ftrace_raw_output_prep(struct trace_iterator *iter, struct trace_event *trace_event) { -- cgit v1.2.3 From ad9cf3bbd18a94806314741ac8092c3422f5aebe Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 16 Dec 2014 12:54:25 -0500 Subject: block: mark blk-mq devices as stackable Commit 4ee5eaf4 ("block: add a queue flag for request stacking support") introduced the concept of "STACKABLE" and blk-mq devices fit the definition in that they establish q->mq_ops. So establish QUEUE_FLAG_STACKABLE in QUEUE_FLAG_MQ_DEFAULT.
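For context, a minimal sketch of the consumer side this enables (not part of this patch): a stacking driver such as request-based DM can gate on the existing blk_queue_stackable() helper, which simply tests this flag:

	/* e.g. while validating a device for a request-based DM table */
	struct request_queue *q = bdev_get_queue(bdev);

	if (!blk_queue_stackable(q))
		return -EINVAL;	/* queue not marked as stackable */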
While not strictly needed (DM _could_ just check for q->mq_ops to assume the device is request-based), request-based DM support for blk-mq devices benefits from the ability to consistently check for QUEUE_FLAG_STACKABLE before allowing a device to be stacked into a request-based DM table. Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6f388fd1c11c..13e16401a7ce 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -520,6 +520,7 @@ struct request_queue { (1 << QUEUE_FLAG_ADD_RANDOM)) #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ + (1 << QUEUE_FLAG_STACKABLE) | \ (1 << QUEUE_FLAG_SAME_COMP)) static inline void queue_lockdep_assert_held(struct request_queue *q) -- cgit v1.2.3 From 8c7dd8bce05345ca5fe249b64782e8feeb3b9259 Mon Sep 17 00:00:00 2001 From: James Ban Date: Wed, 28 Jan 2015 09:28:08 +0900 Subject: regulator: da9211: Add gpio control for enable/disable of buck This patch adds GPIO control for enabling/disabling the buck regulators. Signed-off-by: James Ban Signed-off-by: Mark Brown --- Documentation/devicetree/bindings/regulator/da9211.txt | 7 ++++++- drivers/regulator/da9211-regulator.c | 12 ++++++++++++ include/linux/regulator/da9211.h | 1 + 3 files changed, 19 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/regulator/da9211.txt b/Documentation/devicetree/bindings/regulator/da9211.txt index 240019a82f9a..eb618907c7de 100644 --- a/Documentation/devicetree/bindings/regulator/da9211.txt +++ b/Documentation/devicetree/bindings/regulator/da9211.txt @@ -11,6 +11,7 @@ Required properties: BUCKA and BUCKB. Optional properties: +- enable-gpios: platform gpio for control of BUCKA/BUCKB.
- Any optional property defined in regulator.txt Example 1) DA9211 @@ -27,6 +28,7 @@ Example 1) DA9211 regulator-max-microvolt = <1570000>; regulator-min-microamp = <2000000>; regulator-max-microamp = <5000000>; + enable-gpios = <&gpio 27 0>; }; BUCKB { regulator-name = "VBUCKB"; @@ -34,11 +36,12 @@ Example 1) DA9211 regulator-max-microvolt = <1570000>; regulator-min-microamp = <2000000>; regulator-max-microamp = <5000000>; + enable-gpios = <&gpio 17 0>; }; }; }; -Example 2) DA92113 +Example 2) DA9213 pmic: da9213@68 { compatible = "dlg,da9213"; reg = <0x68>; @@ -51,6 +54,7 @@ Example 2) DA92113 regulator-max-microvolt = <1570000>; regulator-min-microamp = <3000000>; regulator-max-microamp = <6000000>; + enable-gpios = <&gpio 27 0>; }; BUCKB { regulator-name = "VBUCKB"; @@ -58,6 +62,7 @@ Example 2) DA92113 regulator-max-microvolt = <1570000>; regulator-min-microamp = <3000000>; regulator-max-microamp = <6000000>; + enable-gpios = <&gpio 17 0>; }; }; }; diff --git a/drivers/regulator/da9211-regulator.c b/drivers/regulator/da9211-regulator.c index 8e6957c63a69..01343419555e 100644 --- a/drivers/regulator/da9211-regulator.c +++ b/drivers/regulator/da9211-regulator.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include "da9211-regulator.h" @@ -277,6 +278,9 @@ static struct da9211_pdata *da9211_parse_regulators_dt( pdata->init_data[n] = da9211_matches[i].init_data; pdata->reg_node[n] = da9211_matches[i].of_node; + pdata->gpio_ren[n] = + of_get_named_gpio(da9211_matches[i].of_node, + "enable-gpios", 0); n++; } @@ -366,6 +370,14 @@ static int da9211_regulator_init(struct da9211 *chip) config.regmap = chip->regmap; config.of_node = chip->pdata->reg_node[i]; + if (gpio_is_valid(chip->pdata->gpio_ren[i])) { + config.ena_gpio = chip->pdata->gpio_ren[i]; + config.ena_gpio_initialized = true; + } else { + config.ena_gpio = -EINVAL; + config.ena_gpio_initialized = false; + } + chip->rdev[i] = devm_regulator_register(chip->dev, &da9211_regulators[i], &config); if (IS_ERR(chip->rdev[i])) { diff --git a/include/linux/regulator/da9211.h b/include/linux/regulator/da9211.h index d1d9d3849bdb..5dd65acc2a69 100644 --- a/include/linux/regulator/da9211.h +++ b/include/linux/regulator/da9211.h @@ -32,6 +32,7 @@ struct da9211_pdata { * 2 : 2 phase 2 buck */ int num_buck; + int gpio_ren[DA9211_MAX_REGULATORS]; struct device_node *reg_node[DA9211_MAX_REGULATORS]; struct regulator_init_data *init_data[DA9211_MAX_REGULATORS]; }; -- cgit v1.2.3 From af0f349b2996f9f3d83e5aac1edf58fff727a0e0 Mon Sep 17 00:00:00 2001 From: Tomeu Vizoso Date: Fri, 23 Jan 2015 12:03:29 +0100 Subject: clk: Remove __clk_register As it has never been used. Signed-off-by: Tomeu Vizoso Reviewed-by: Stephen Boyd Signed-off-by: Michael Turquette --- drivers/clk/clk.c | 42 ------------------------------------------ include/linux/clk-private.h | 2 -- 2 files changed, 44 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index 05986e389bf8..b701e7c195e4 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -2031,48 +2031,6 @@ out: return ret; } -/** - * __clk_register - register a clock and return a cookie. - * - * Same as clk_register, except that the .clk field inside hw shall point to a - * preallocated (generally statically allocated) struct clk. None of the fields - * of the struct clk need to be initialized. - * - * The data pointed to by .init and .clk field shall NOT be marked as init - * data. 
- * - * __clk_register is only exposed via clk-private.h and is intended for use with - * very large numbers of clocks that need to be statically initialized. It is - * a layering violation to include clk-private.h from any code which implements - * a clock's .ops; as such any statically initialized clock data MUST be in a - * separate C file from the logic that implements its operations. Returns 0 - * on success, otherwise an error code. - */ -struct clk *__clk_register(struct device *dev, struct clk_hw *hw) -{ - int ret; - struct clk *clk; - - clk = hw->clk; - clk->name = hw->init->name; - clk->ops = hw->init->ops; - clk->hw = hw; - clk->flags = hw->init->flags; - clk->parent_names = hw->init->parent_names; - clk->num_parents = hw->init->num_parents; - if (dev && dev->driver) - clk->owner = dev->driver->owner; - else - clk->owner = NULL; - - ret = __clk_init(dev, clk); - if (ret) - return ERR_PTR(ret); - - return clk; -} -EXPORT_SYMBOL_GPL(__clk_register); - /** * clk_register - allocate a new clock, register it and return an opaque cookie * @dev: device that is registering this clock diff --git a/include/linux/clk-private.h b/include/linux/clk-private.h index 0ca5f6046920..c5f40d07686c 100644 --- a/include/linux/clk-private.h +++ b/include/linux/clk-private.h @@ -214,7 +214,5 @@ struct clk { */ int __clk_init(struct device *dev, struct clk *clk); -struct clk *__clk_register(struct device *dev, struct clk_hw *hw); - #endif /* CONFIG_COMMON_CLK */ #endif /* CLK_PRIVATE_H */ -- cgit v1.2.3 From 018d5ef2048fcab339467bcbebccf588c9bd2531 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Thu, 29 Jan 2015 08:30:29 +0900 Subject: ata: ahci_platform: fix owner module reference mismatch for scsi host The owner module reference of the ahci platform's scsi_host is initialized to libahci_platform's one, because these drivers use a scsi_host_template defined in libahci_platform. So these drivers can be unloaded even if the scsi device is being accessed. This fixes it by pushing the scsi_host_template from libahci_platform to all leaf drivers. The scsi_host_template is passed through a new argument of ahci_platform_init_host(). Signed-off-by: Akinobu Mita Signed-off-by: Tejun Heo Cc: Hans de Goede Cc: Christoph Hellwig Cc: "James E.J. Bottomley" Cc: linux-ide@vger.kernel.org Cc: linux-scsi@vger.kernel.org --- drivers/ata/ahci.h | 4 ++++ drivers/ata/ahci_da850.c | 11 +++++++++-- drivers/ata/ahci_imx.c | 11 +++++++++-- drivers/ata/ahci_mvebu.c | 11 +++++++++-- drivers/ata/ahci_platform.c | 11 +++++++++-- drivers/ata/ahci_st.c | 11 +++++++++-- drivers/ata/ahci_sunxi.c | 11 +++++++++-- drivers/ata/ahci_tegra.c | 11 +++++++++-- drivers/ata/ahci_xgene.c | 11 +++++++++-- drivers/ata/libahci_platform.c | 10 ++++------ include/linux/ahci_platform.h | 4 +++- include/linux/libata.h | 6 ++++++ 12 files changed, 89 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/ahci.h b/drivers/ata/ahci.h index 275358ae0b3f..71262e08648e 100644 --- a/drivers/ata/ahci.h +++ b/drivers/ata/ahci.h @@ -354,6 +354,10 @@ extern int ahci_ignore_sss; extern struct device_attribute *ahci_shost_attrs[]; extern struct device_attribute *ahci_sdev_attrs[]; +/* + * This must be instantiated by the edge drivers. 
Read the comments + * for ATA_BASE_SHT + */ #define AHCI_SHT(drv_name) \ ATA_NCQ_SHT(drv_name), \ .can_queue = AHCI_MAX_CMDS - 1, \ diff --git a/drivers/ata/ahci_da850.c b/drivers/ata/ahci_da850.c index ce8a7a6d6c7f..267a3d3e79f4 100644 --- a/drivers/ata/ahci_da850.c +++ b/drivers/ata/ahci_da850.c @@ -16,6 +16,8 @@ #include #include "ahci.h" +#define DRV_NAME "ahci_da850" + /* SATA PHY Control Register offset from AHCI base */ #define SATA_P0PHYCR_REG 0x178 @@ -59,6 +61,10 @@ static const struct ata_port_info ahci_da850_port_info = { .port_ops = &ahci_platform_ops, }; +static struct scsi_host_template ahci_platform_sht = { + AHCI_SHT(DRV_NAME), +}; + static int ahci_da850_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; @@ -85,7 +91,8 @@ static int ahci_da850_probe(struct platform_device *pdev) da850_sata_init(dev, pwrdn_reg, hpriv->mmio); - rc = ahci_platform_init_host(pdev, hpriv, &ahci_da850_port_info); + rc = ahci_platform_init_host(pdev, hpriv, &ahci_da850_port_info, + &ahci_platform_sht); if (rc) goto disable_resources; @@ -102,7 +109,7 @@ static struct platform_driver ahci_da850_driver = { .probe = ahci_da850_probe, .remove = ata_platform_remove_one, .driver = { - .name = "ahci_da850", + .name = DRV_NAME, .pm = &ahci_da850_pm_ops, }, }; diff --git a/drivers/ata/ahci_imx.c b/drivers/ata/ahci_imx.c index 41632e57d46f..3f3a7db208ae 100644 --- a/drivers/ata/ahci_imx.c +++ b/drivers/ata/ahci_imx.c @@ -28,6 +28,8 @@ #include #include "ahci.h" +#define DRV_NAME "ahci-imx" + enum { /* Timer 1-ms Register */ IMX_TIMER1MS = 0x00e0, @@ -520,6 +522,10 @@ static u32 imx_ahci_parse_props(struct device *dev, return reg_value; } +static struct scsi_host_template ahci_platform_sht = { + AHCI_SHT(DRV_NAME), +}; + static int imx_ahci_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; @@ -616,7 +622,8 @@ static int imx_ahci_probe(struct platform_device *pdev) reg_val = clk_get_rate(imxpriv->ahb_clk) / 1000; writel(reg_val, hpriv->mmio + IMX_TIMER1MS); - ret = ahci_platform_init_host(pdev, hpriv, &ahci_imx_port_info); + ret = ahci_platform_init_host(pdev, hpriv, &ahci_imx_port_info, + &ahci_platform_sht); if (ret) goto disable_sata; @@ -674,7 +681,7 @@ static struct platform_driver imx_ahci_driver = { .probe = imx_ahci_probe, .remove = ata_platform_remove_one, .driver = { - .name = "ahci-imx", + .name = DRV_NAME, .of_match_table = imx_ahci_of_match, .pm = &ahci_imx_pm_ops, }, diff --git a/drivers/ata/ahci_mvebu.c b/drivers/ata/ahci_mvebu.c index 64bb08432b69..23716dd8a7ec 100644 --- a/drivers/ata/ahci_mvebu.c +++ b/drivers/ata/ahci_mvebu.c @@ -19,6 +19,8 @@ #include #include "ahci.h" +#define DRV_NAME "ahci-mvebu" + #define AHCI_VENDOR_SPECIFIC_0_ADDR 0xa0 #define AHCI_VENDOR_SPECIFIC_0_DATA 0xa4 @@ -67,6 +69,10 @@ static const struct ata_port_info ahci_mvebu_port_info = { .port_ops = &ahci_platform_ops, }; +static struct scsi_host_template ahci_platform_sht = { + AHCI_SHT(DRV_NAME), +}; + static int ahci_mvebu_probe(struct platform_device *pdev) { struct ahci_host_priv *hpriv; @@ -88,7 +94,8 @@ static int ahci_mvebu_probe(struct platform_device *pdev) ahci_mvebu_mbus_config(hpriv, dram); ahci_mvebu_regret_option(hpriv); - rc = ahci_platform_init_host(pdev, hpriv, &ahci_mvebu_port_info); + rc = ahci_platform_init_host(pdev, hpriv, &ahci_mvebu_port_info, + &ahci_platform_sht); if (rc) goto disable_resources; @@ -114,7 +121,7 @@ static struct platform_driver ahci_mvebu_driver = { .probe = ahci_mvebu_probe, .remove = ata_platform_remove_one, .driver = { - .name = 
"ahci-mvebu", + .name = DRV_NAME, .of_match_table = ahci_mvebu_of_match, }, }; diff --git a/drivers/ata/ahci_platform.c b/drivers/ata/ahci_platform.c index 18d539837045..78d6ae0b90c4 100644 --- a/drivers/ata/ahci_platform.c +++ b/drivers/ata/ahci_platform.c @@ -22,6 +22,8 @@ #include #include "ahci.h" +#define DRV_NAME "ahci" + static const struct ata_port_info ahci_port_info = { .flags = AHCI_FLAG_COMMON, .pio_mask = ATA_PIO4, @@ -29,6 +31,10 @@ static const struct ata_port_info ahci_port_info = { .port_ops = &ahci_platform_ops, }; +static struct scsi_host_template ahci_platform_sht = { + AHCI_SHT(DRV_NAME), +}; + static int ahci_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; @@ -46,7 +52,8 @@ static int ahci_probe(struct platform_device *pdev) if (of_device_is_compatible(dev->of_node, "hisilicon,hisi-ahci")) hpriv->flags |= AHCI_HFLAG_NO_FBS | AHCI_HFLAG_NO_NCQ; - rc = ahci_platform_init_host(pdev, hpriv, &ahci_port_info); + rc = ahci_platform_init_host(pdev, hpriv, &ahci_port_info, + &ahci_platform_sht); if (rc) goto disable_resources; @@ -75,7 +82,7 @@ static struct platform_driver ahci_driver = { .probe = ahci_probe, .remove = ata_platform_remove_one, .driver = { - .name = "ahci", + .name = DRV_NAME, .of_match_table = ahci_of_match, .pm = &ahci_pm_ops, }, diff --git a/drivers/ata/ahci_st.c b/drivers/ata/ahci_st.c index 2f9e8317cc16..bc971af262e7 100644 --- a/drivers/ata/ahci_st.c +++ b/drivers/ata/ahci_st.c @@ -23,6 +23,8 @@ #include "ahci.h" +#define DRV_NAME "st_ahci" + #define ST_AHCI_OOBR 0xbc #define ST_AHCI_OOBR_WE BIT(31) #define ST_AHCI_OOBR_CWMIN_SHIFT 24 @@ -140,6 +142,10 @@ static const struct ata_port_info st_ahci_port_info = { .port_ops = &st_ahci_port_ops, }; +static struct scsi_host_template ahci_platform_sht = { + AHCI_SHT(DRV_NAME), +}; + static int st_ahci_probe(struct platform_device *pdev) { struct st_ahci_drv_data *drv_data; @@ -166,7 +172,8 @@ static int st_ahci_probe(struct platform_device *pdev) if (err) return err; - err = ahci_platform_init_host(pdev, hpriv, &st_ahci_port_info); + err = ahci_platform_init_host(pdev, hpriv, &st_ahci_port_info, + &ahci_platform_sht); if (err) { ahci_platform_disable_resources(hpriv); return err; @@ -229,7 +236,7 @@ MODULE_DEVICE_TABLE(of, st_ahci_match); static struct platform_driver st_ahci_driver = { .driver = { - .name = "st_ahci", + .name = DRV_NAME, .pm = &st_ahci_pm_ops, .of_match_table = of_match_ptr(st_ahci_match), }, diff --git a/drivers/ata/ahci_sunxi.c b/drivers/ata/ahci_sunxi.c index e2e0da539a2f..b26437430163 100644 --- a/drivers/ata/ahci_sunxi.c +++ b/drivers/ata/ahci_sunxi.c @@ -27,6 +27,8 @@ #include #include "ahci.h" +#define DRV_NAME "ahci-sunxi" + /* Insmod parameters */ static bool enable_pmp; module_param(enable_pmp, bool, 0); @@ -169,6 +171,10 @@ static const struct ata_port_info ahci_sunxi_port_info = { .port_ops = &ahci_platform_ops, }; +static struct scsi_host_template ahci_platform_sht = { + AHCI_SHT(DRV_NAME), +}; + static int ahci_sunxi_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; @@ -200,7 +206,8 @@ static int ahci_sunxi_probe(struct platform_device *pdev) if (!enable_pmp) hpriv->flags |= AHCI_HFLAG_NO_PMP; - rc = ahci_platform_init_host(pdev, hpriv, &ahci_sunxi_port_info); + rc = ahci_platform_init_host(pdev, hpriv, &ahci_sunxi_port_info, + &ahci_platform_sht); if (rc) goto disable_resources; @@ -251,7 +258,7 @@ static struct platform_driver ahci_sunxi_driver = { .probe = ahci_sunxi_probe, .remove = ata_platform_remove_one, .driver = { - .name = 
"ahci-sunxi", + .name = DRV_NAME, .of_match_table = ahci_sunxi_of_match, .pm = &ahci_sunxi_pm_ops, }, diff --git a/drivers/ata/ahci_tegra.c b/drivers/ata/ahci_tegra.c index 032904402c95..3a62eb246d80 100644 --- a/drivers/ata/ahci_tegra.c +++ b/drivers/ata/ahci_tegra.c @@ -31,6 +31,8 @@ #include "ahci.h" +#define DRV_NAME "tegra-ahci" + #define SATA_CONFIGURATION_0 0x180 #define SATA_CONFIGURATION_EN_FPCI BIT(0) @@ -289,6 +291,10 @@ static const struct of_device_id tegra_ahci_of_match[] = { }; MODULE_DEVICE_TABLE(of, tegra_ahci_of_match); +static struct scsi_host_template ahci_platform_sht = { + AHCI_SHT(DRV_NAME), +}; + static int tegra_ahci_probe(struct platform_device *pdev) { struct ahci_host_priv *hpriv; @@ -354,7 +360,8 @@ static int tegra_ahci_probe(struct platform_device *pdev) if (ret) return ret; - ret = ahci_platform_init_host(pdev, hpriv, &ahci_tegra_port_info); + ret = ahci_platform_init_host(pdev, hpriv, &ahci_tegra_port_info, + &ahci_platform_sht); if (ret) goto deinit_controller; @@ -370,7 +377,7 @@ static struct platform_driver tegra_ahci_driver = { .probe = tegra_ahci_probe, .remove = ata_platform_remove_one, .driver = { - .name = "tegra-ahci", + .name = DRV_NAME, .of_match_table = tegra_ahci_of_match, }, /* LP0 suspend support not implemented */ diff --git a/drivers/ata/ahci_xgene.c b/drivers/ata/ahci_xgene.c index 7f6887535c1e..e3b8750e8e9d 100644 --- a/drivers/ata/ahci_xgene.c +++ b/drivers/ata/ahci_xgene.c @@ -30,6 +30,8 @@ #include #include "ahci.h" +#define DRV_NAME "xgene-ahci" + /* Max # of disk per a controller */ #define MAX_AHCI_CHN_PERCTR 2 @@ -621,6 +623,10 @@ static int xgene_ahci_mux_select(struct xgene_ahci_context *ctx) return val & CFG_SATA_ENET_SELECT_MASK ? -1 : 0; } +static struct scsi_host_template ahci_platform_sht = { + AHCI_SHT(DRV_NAME), +}; + static int xgene_ahci_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; @@ -698,7 +704,8 @@ static int xgene_ahci_probe(struct platform_device *pdev) skip_clk_phy: hpriv->flags = AHCI_HFLAG_NO_PMP | AHCI_HFLAG_NO_NCQ; - rc = ahci_platform_init_host(pdev, hpriv, &xgene_ahci_port_info); + rc = ahci_platform_init_host(pdev, hpriv, &xgene_ahci_port_info, + &ahci_platform_sht); if (rc) goto disable_resources; @@ -720,7 +727,7 @@ static struct platform_driver xgene_ahci_driver = { .probe = xgene_ahci_probe, .remove = ata_platform_remove_one, .driver = { - .name = "xgene-ahci", + .name = DRV_NAME, .of_match_table = xgene_ahci_of_match, }, }; diff --git a/drivers/ata/libahci_platform.c b/drivers/ata/libahci_platform.c index 504d534ccbfe..077c7a261354 100644 --- a/drivers/ata/libahci_platform.c +++ b/drivers/ata/libahci_platform.c @@ -35,10 +35,6 @@ struct ata_port_operations ahci_platform_ops = { }; EXPORT_SYMBOL_GPL(ahci_platform_ops); -static struct scsi_host_template ahci_platform_sht = { - AHCI_SHT("ahci_platform"), -}; - /** * ahci_platform_enable_phys - Enable PHYs * @hpriv: host private area to store config values @@ -494,6 +490,7 @@ EXPORT_SYMBOL_GPL(ahci_platform_get_resources); * @pdev: platform device pointer for the host * @hpriv: ahci-host private data for the host * @pi_template: template for the ata_port_info to use + * @sht: scsi_host_template to use when registering * * This function does all the usual steps needed to bring up an * ahci-platform host, note any necessary resources (ie clks, phys, etc.) 
@@ -504,7 +501,8 @@ EXPORT_SYMBOL_GPL(ahci_platform_get_resources); */ int ahci_platform_init_host(struct platform_device *pdev, struct ahci_host_priv *hpriv, - const struct ata_port_info *pi_template) + const struct ata_port_info *pi_template, + struct scsi_host_template *sht) { struct device *dev = &pdev->dev; struct ata_port_info pi = *pi_template; @@ -588,7 +586,7 @@ int ahci_platform_init_host(struct platform_device *pdev, ahci_init_controller(host); ahci_print_info(host, "platform"); - return ahci_host_activate(host, irq, &ahci_platform_sht); + return ahci_host_activate(host, irq, sht); } EXPORT_SYMBOL_GPL(ahci_platform_init_host); diff --git a/include/linux/ahci_platform.h b/include/linux/ahci_platform.h index f65b33809170..a270f25ee7c7 100644 --- a/include/linux/ahci_platform.h +++ b/include/linux/ahci_platform.h @@ -21,6 +21,7 @@ struct device; struct ata_port_info; struct ahci_host_priv; struct platform_device; +struct scsi_host_template; int ahci_platform_enable_clks(struct ahci_host_priv *hpriv); void ahci_platform_disable_clks(struct ahci_host_priv *hpriv); @@ -32,7 +33,8 @@ struct ahci_host_priv *ahci_platform_get_resources( struct platform_device *pdev); int ahci_platform_init_host(struct platform_device *pdev, struct ahci_host_priv *hpriv, - const struct ata_port_info *pi_template); + const struct ata_port_info *pi_template, + struct scsi_host_template *sht); int ahci_platform_suspend_host(struct device *dev); int ahci_platform_resume_host(struct device *dev); diff --git a/include/linux/libata.h b/include/linux/libata.h index 2d182413b1db..11beb4196c32 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1338,6 +1338,12 @@ extern const struct ata_port_operations ata_base_port_ops; extern const struct ata_port_operations sata_port_ops; extern struct device_attribute *ata_common_sdev_attrs[]; +/* + * All sht initializers (BASE, PIO, BMDMA, NCQ) must be instantiated + * by the edge drivers, because the 'module' field of sht must be the + * edge driver's module reference; otherwise the driver can be unloaded + * even if the scsi_device is being accessed. + */ #define ATA_BASE_SHT(drv_name) \ .module = THIS_MODULE, \ .name = drv_name, \ -- cgit v1.2.3 From 17263905399471016cda6c1975044d14291c5ba5 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Thu, 29 Jan 2015 08:30:30 +0900 Subject: ata: pata_platform: fix owner module reference mismatch for scsi host The owner module reference of the pata_of_platform's scsi_host is initialized to pata_platform's one, because the pata_of_platform driver uses a scsi_host_template defined in pata_platform. So this driver can be unloaded even if the scsi device is being accessed. This fixes it by propagating the scsi_host_template to the pata_of_platform driver. The scsi_host_template is passed through a new argument of __pata_platform_probe(). Signed-off-by: Akinobu Mita Signed-off-by: Tejun Heo Cc: Hans de Goede Cc: Christoph Hellwig Cc: "James E.J.
Bottomley" Cc: linux-ide@vger.kernel.org Cc: linux-scsi@vger.kernel.org --- drivers/ata/pata_of_platform.c | 10 ++++++++-- drivers/ata/pata_platform.c | 8 +++++--- include/linux/ata_platform.h | 5 ++++- 3 files changed, 17 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/pata_of_platform.c b/drivers/ata/pata_of_platform.c index dcc408abe171..b6b7af894d9d 100644 --- a/drivers/ata/pata_of_platform.c +++ b/drivers/ata/pata_of_platform.c @@ -16,6 +16,12 @@ #include #include +#define DRV_NAME "pata_of_platform" + +static struct scsi_host_template pata_platform_sht = { + ATA_PIO_SHT(DRV_NAME), +}; + static int pata_of_platform_probe(struct platform_device *ofdev) { int ret; @@ -63,7 +69,7 @@ static int pata_of_platform_probe(struct platform_device *ofdev) pio_mask |= (1 << pio_mode) - 1; return __pata_platform_probe(&ofdev->dev, &io_res, &ctl_res, irq_res, - reg_shift, pio_mask); + reg_shift, pio_mask, &pata_platform_sht); } static struct of_device_id pata_of_platform_match[] = { @@ -74,7 +80,7 @@ MODULE_DEVICE_TABLE(of, pata_of_platform_match); static struct platform_driver pata_of_platform_driver = { .driver = { - .name = "pata_of_platform", + .name = DRV_NAME, .of_match_table = pata_of_platform_match, }, .probe = pata_of_platform_probe, diff --git a/drivers/ata/pata_platform.c b/drivers/ata/pata_platform.c index 1eedfe46d7c8..c503ded87bb8 100644 --- a/drivers/ata/pata_platform.c +++ b/drivers/ata/pata_platform.c @@ -78,6 +78,7 @@ static void pata_platform_setup_port(struct ata_ioports *ioaddr, * @irq_res: Resource representing IRQ and its flags * @ioport_shift: I/O port shift * @__pio_mask: PIO mask + * @sht: scsi_host_template to use when registering * * Register a platform bus IDE interface. Such interfaces are PIO and we * assume do not support IRQ sharing. @@ -99,7 +100,8 @@ static void pata_platform_setup_port(struct ata_ioports *ioaddr, */ int __pata_platform_probe(struct device *dev, struct resource *io_res, struct resource *ctl_res, struct resource *irq_res, - unsigned int ioport_shift, int __pio_mask) + unsigned int ioport_shift, int __pio_mask, + struct scsi_host_template *sht) { struct ata_host *host; struct ata_port *ap; @@ -170,7 +172,7 @@ int __pata_platform_probe(struct device *dev, struct resource *io_res, /* activate */ return ata_host_activate(host, irq, irq ? ata_sff_interrupt : NULL, - irq_flags, &pata_platform_sht); + irq_flags, sht); } EXPORT_SYMBOL_GPL(__pata_platform_probe); @@ -216,7 +218,7 @@ static int pata_platform_probe(struct platform_device *pdev) return __pata_platform_probe(&pdev->dev, io_res, ctl_res, irq_res, pp_info ? 
pp_info->ioport_shift : 0, - pio_mask); + pio_mask, &pata_platform_sht); } static struct platform_driver pata_platform_driver = { diff --git a/include/linux/ata_platform.h b/include/linux/ata_platform.h index 5c618a084225..619d9e78e644 100644 --- a/include/linux/ata_platform.h +++ b/include/linux/ata_platform.h @@ -10,12 +10,15 @@ struct pata_platform_info { unsigned int ioport_shift; }; +struct scsi_host_template; + extern int __pata_platform_probe(struct device *dev, struct resource *io_res, struct resource *ctl_res, struct resource *irq_res, unsigned int ioport_shift, - int __pio_mask); + int __pio_mask, + struct scsi_host_template *sht); /* * Marvell SATA private data -- cgit v1.2.3 From 05afcb77eb4713f46e7ebaa3cb54bc465c5d516e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 23 Jan 2015 01:08:07 -0500 Subject: new helper: iov_iter_bvec() similar to iov_iter_kvec(), for ITER_BVEC ones Signed-off-by: Al Viro --- fs/splice.c | 7 ++----- include/linux/uio.h | 4 +++- mm/iov_iter.c | 17 +++++++++++++++-- mm/page_io.c | 9 ++------- 4 files changed, 22 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/fs/splice.c b/fs/splice.c index 75c6058eabf2..7c7176f2d1e7 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1006,11 +1006,8 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, } /* ... iov_iter */ - from.type = ITER_BVEC | WRITE; - from.bvec = array; - from.nr_segs = n; - from.count = sd.total_len - left; - from.iov_offset = 0; + iov_iter_bvec(&from, ITER_BVEC | WRITE, array, n, + sd.total_len - left); /* ... and iocb */ init_sync_kiocb(&kiocb, out); diff --git a/include/linux/uio.h b/include/linux/uio.h index 1c5e453f7ea9..b447402d9737 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -88,7 +88,9 @@ size_t iov_iter_zero(size_t bytes, struct iov_iter *); unsigned long iov_iter_alignment(const struct iov_iter *i); void iov_iter_init(struct iov_iter *i, int direction, const struct iovec *iov, unsigned long nr_segs, size_t count); -void iov_iter_kvec(struct iov_iter *i, int direction, const struct kvec *iov, +void iov_iter_kvec(struct iov_iter *i, int direction, const struct kvec *kvec, + unsigned long nr_segs, size_t count); +void iov_iter_bvec(struct iov_iter *i, int direction, const struct bio_vec *bvec, unsigned long nr_segs, size_t count); ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start); diff --git a/mm/iov_iter.c b/mm/iov_iter.c index a1599ca4ab0e..827732047da1 100644 --- a/mm/iov_iter.c +++ b/mm/iov_iter.c @@ -501,18 +501,31 @@ size_t iov_iter_single_seg_count(const struct iov_iter *i) EXPORT_SYMBOL(iov_iter_single_seg_count); void iov_iter_kvec(struct iov_iter *i, int direction, - const struct kvec *iov, unsigned long nr_segs, + const struct kvec *kvec, unsigned long nr_segs, size_t count) { BUG_ON(!(direction & ITER_KVEC)); i->type = direction; - i->kvec = (struct kvec *)iov; + i->kvec = kvec; i->nr_segs = nr_segs; i->iov_offset = 0; i->count = count; } EXPORT_SYMBOL(iov_iter_kvec); +void iov_iter_bvec(struct iov_iter *i, int direction, + const struct bio_vec *bvec, unsigned long nr_segs, + size_t count) +{ + BUG_ON(!(direction & ITER_BVEC)); + i->type = direction; + i->bvec = bvec; + i->nr_segs = nr_segs; + i->iov_offset = 0; + i->count = count; +} +EXPORT_SYMBOL(iov_iter_bvec); + unsigned long iov_iter_alignment(const struct iov_iter *i) { unsigned long res = 0; diff --git a/mm/page_io.c b/mm/page_io.c index 955db8b0d497..e6045804c8d8 100644 --- a/mm/page_io.c +++ 
b/mm/page_io.c @@ -269,14 +269,9 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, .bv_len = PAGE_SIZE, .bv_offset = 0 }; - struct iov_iter from = { - .type = ITER_BVEC | WRITE, - .count = PAGE_SIZE, - .iov_offset = 0, - .nr_segs = 1, - }; - from.bvec = &bv; /* older gcc versions are broken */ + struct iov_iter from; + iov_iter_bvec(&from, ITER_BVEC | WRITE, &bv, 1, PAGE_SIZE); init_sync_kiocb(&kiocb, swap_file); kiocb.ki_pos = page_file_offset(page); kiocb.ki_nbytes = PAGE_SIZE; -- cgit v1.2.3 From dbe4e192a234cd6133d86fffb965d0f032c12ccc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 25 Jan 2015 21:11:59 +0100 Subject: fs: add vfs_iter_{read,write} helpers Simple helpers that pass an arbitrary iov_iter to filesystems. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/read_write.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ fs/splice.c | 16 ++-------------- include/linux/fs.h | 3 +++ 3 files changed, 51 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/fs/read_write.c b/fs/read_write.c index c0805c93b6fa..ab4f26a7d5cb 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -333,6 +333,52 @@ out_putf: } #endif +ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos) +{ + struct kiocb kiocb; + ssize_t ret; + + if (!file->f_op->read_iter) + return -EINVAL; + + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = *ppos; + kiocb.ki_nbytes = iov_iter_count(iter); + + iter->type |= READ; + ret = file->f_op->read_iter(&kiocb, iter); + if (ret == -EIOCBQUEUED) + ret = wait_on_sync_kiocb(&kiocb); + + if (ret > 0) + *ppos = kiocb.ki_pos; + return ret; +} +EXPORT_SYMBOL(vfs_iter_read); + +ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos) +{ + struct kiocb kiocb; + ssize_t ret; + + if (!file->f_op->write_iter) + return -EINVAL; + + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = *ppos; + kiocb.ki_nbytes = iov_iter_count(iter); + + iter->type |= WRITE; + ret = file->f_op->write_iter(&kiocb, iter); + if (ret == -EIOCBQUEUED) + ret = wait_on_sync_kiocb(&kiocb); + + if (ret > 0) + *ppos = kiocb.ki_pos; + return ret; +} +EXPORT_SYMBOL(vfs_iter_write); + /* * rw_verify_area doesn't like huge counts. We limit * them to something that fits in "int" so that others diff --git a/fs/splice.c b/fs/splice.c index 7c7176f2d1e7..7968da96bebb 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -961,7 +961,6 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, splice_from_pipe_begin(&sd); while (sd.total_len) { struct iov_iter from; - struct kiocb kiocb; size_t left; int n, idx; @@ -1005,26 +1004,15 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, left -= this_len; } - /* ... iov_iter */ iov_iter_bvec(&from, ITER_BVEC | WRITE, array, n, sd.total_len - left); - - /* ... 
and iocb */ - init_sync_kiocb(&kiocb, out); - kiocb.ki_pos = sd.pos; - kiocb.ki_nbytes = sd.total_len - left; - - /* now, send it */ - ret = out->f_op->write_iter(&kiocb, &from); - if (-EIOCBQUEUED == ret) - ret = wait_on_sync_kiocb(&kiocb); - + ret = vfs_iter_write(out, &from, &sd.pos); if (ret <= 0) break; sd.num_spliced += ret; sd.total_len -= ret; - *ppos = sd.pos = kiocb.ki_pos; + *ppos = sd.pos; /* dismiss the fully eaten buffers, adjust the partial one */ while (ret) { diff --git a/include/linux/fs.h b/include/linux/fs.h index 42efe13077b6..1f3c43952540 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2494,6 +2494,9 @@ extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t l extern ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos); extern ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos); +ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos); +ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos); + /* fs/block_dev.c */ extern ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to); extern ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from); -- cgit v1.2.3 From 61f552141c9c0e88b3fdc7046265781ffd8fa68a Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sun, 18 Jan 2015 16:45:42 +0100 Subject: ftrace: let notrace function attribute disable hotpatching if necessary gcc supports an s390 specific function attribute called "hotpatch". It can be used to specify the number of halfwords that shall be added before and after a function and which shall be filled with nops for runtime patching. s390 will use the hotpatch attribute for function tracing, therefore make sure that the notrace function attribute either disables the mcount call or, in the hotpatch case, the nop generation. Acked-by: Steven Rostedt Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- include/linux/compiler.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index d5ad7b1118fc..1ef679f4b88e 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -54,7 +54,11 @@ extern void __chk_io_ptr(const volatile void __iomem *); #include #endif +#ifdef CC_USING_HOTPATCH +#define notrace __attribute__((hotpatch(0,0))) +#else #define notrace __attribute__((no_instrument_function)) +#endif /* Intel compiler defines __GNUC__. So we will overwrite implementations * coming from above header files here -- cgit v1.2.3 From 3c313161353ad527de1a6ecde0f0d41700858fd6 Mon Sep 17 00:00:00 2001 From: Rafał Miłecki Date: Sat, 24 Jan 2015 18:47:20 +0100 Subject: bcma: detect SPROM revision 11 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extracting values from it is still unsupported, but at least we'll display some meaningful error now.
Signed-off-by: Rafał Miłecki Signed-off-by: Kalle Valo --- drivers/bcma/sprom.c | 3 ++- include/linux/ssb/ssb_regs.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/bcma/sprom.c b/drivers/bcma/sprom.c index efb037f9c98a..206edd3ba668 100644 --- a/drivers/bcma/sprom.c +++ b/drivers/bcma/sprom.c @@ -579,7 +579,8 @@ int bcma_sprom_get(struct bcma_bus *bus) u16 offset = BCMA_CC_SPROM; u16 *sprom; size_t sprom_sizes[] = { SSB_SPROMSIZE_WORDS_R4, - SSB_SPROMSIZE_WORDS_R10, }; + SSB_SPROMSIZE_WORDS_R10, + SSB_SPROMSIZE_WORDS_R11, }; int i, err = 0; if (!bus->drv_cc.core) diff --git a/include/linux/ssb/ssb_regs.h b/include/linux/ssb/ssb_regs.h index f7b9100686c3..c0f707ac192b 100644 --- a/include/linux/ssb/ssb_regs.h +++ b/include/linux/ssb/ssb_regs.h @@ -173,6 +173,7 @@ #define SSB_SPROMSIZE_BYTES_R123 (SSB_SPROMSIZE_WORDS_R123 * sizeof(u16)) #define SSB_SPROMSIZE_BYTES_R4 (SSB_SPROMSIZE_WORDS_R4 * sizeof(u16)) #define SSB_SPROMSIZE_WORDS_R10 230 +#define SSB_SPROMSIZE_WORDS_R11 234 #define SSB_SPROM_BASE1 0x1000 #define SSB_SPROM_BASE31 0x0800 #define SSB_SPROM_REVISION 0x007E -- cgit v1.2.3 From b504075f5903b969a54ef3a6ae994c0872edb259 Mon Sep 17 00:00:00 2001 From: Rafał Miłecki Date: Sun, 25 Jan 2015 11:11:14 +0100 Subject: bcma: add early_init function for PCIe core and move some fix into it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are some PCIe core fixes that need to be applied before accessing SPROM, otherwise reading it may fail. Signed-off-by: Rafał Miłecki Signed-off-by: Kalle Valo --- drivers/bcma/driver_pci.c | 66 ++++++++++++++++++++++++------------ drivers/bcma/main.c | 7 ++++ include/linux/bcma/bcma_driver_pci.h | 2 ++ 3 files changed, 53 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/drivers/bcma/driver_pci.c b/drivers/bcma/driver_pci.c index b85a505603e6..786666488a2d 100644 --- a/drivers/bcma/driver_pci.c +++ b/drivers/bcma/driver_pci.c @@ -144,6 +144,47 @@ static u16 bcma_pcie_mdio_writeread(struct bcma_drv_pci *pc, u16 device, return bcma_pcie_mdio_read(pc, device, address); } +/************************************************** + * Early init. + **************************************************/ + +static void bcma_core_pci_fixcfg(struct bcma_drv_pci *pc) +{ + struct bcma_device *core = pc->core; + u16 val16, core_index; + uint regoff; + + regoff = BCMA_CORE_PCI_SPROM(BCMA_CORE_PCI_SPROM_PI_OFFSET); + core_index = (u16)core->core_index; + + val16 = pcicore_read16(pc, regoff); + if (((val16 & BCMA_CORE_PCI_SPROM_PI_MASK) >> BCMA_CORE_PCI_SPROM_PI_SHIFT) + != core_index) { + val16 = (core_index << BCMA_CORE_PCI_SPROM_PI_SHIFT) | + (val16 & ~BCMA_CORE_PCI_SPROM_PI_MASK); + pcicore_write16(pc, regoff, val16); + } +} + +/* + * Apply some early fixes required before accessing SPROM. + * See also si_pci_fixcfg. + */ +void bcma_core_pci_early_init(struct bcma_drv_pci *pc) +{ + if (pc->early_setup_done) + return; + + pc->hostmode = bcma_core_pci_is_in_hostmode(pc); + if (pc->hostmode) + goto out; + + bcma_core_pci_fixcfg(pc); + +out: + pc->early_setup_done = true; +} + /************************************************** * Workarounds. 
**************************************************/ @@ -175,24 +216,6 @@ static void bcma_pcicore_serdes_workaround(struct bcma_drv_pci *pc) tmp & ~BCMA_CORE_PCI_PLL_CTRL_FREQDET_EN); } -static void bcma_core_pci_fixcfg(struct bcma_drv_pci *pc) -{ - struct bcma_device *core = pc->core; - u16 val16, core_index; - uint regoff; - - regoff = BCMA_CORE_PCI_SPROM(BCMA_CORE_PCI_SPROM_PI_OFFSET); - core_index = (u16)core->core_index; - - val16 = pcicore_read16(pc, regoff); - if (((val16 & BCMA_CORE_PCI_SPROM_PI_MASK) >> BCMA_CORE_PCI_SPROM_PI_SHIFT) - != core_index) { - val16 = (core_index << BCMA_CORE_PCI_SPROM_PI_SHIFT) | - (val16 & ~BCMA_CORE_PCI_SPROM_PI_MASK); - pcicore_write16(pc, regoff, val16); - } -} - /* Fix MISC config to allow coming out of L2/L3-Ready state w/o PRST */ /* Needs to happen when coming out of 'standby'/'hibernate' */ static void bcma_core_pci_config_fixup(struct bcma_drv_pci *pc) @@ -216,7 +239,6 @@ static void bcma_core_pci_config_fixup(struct bcma_drv_pci *pc) static void bcma_core_pci_clientmode_init(struct bcma_drv_pci *pc) { - bcma_core_pci_fixcfg(pc); bcma_pcicore_serdes_workaround(pc); bcma_core_pci_config_fixup(pc); } @@ -226,11 +248,11 @@ void bcma_core_pci_init(struct bcma_drv_pci *pc) if (pc->setup_done) return; - pc->hostmode = bcma_core_pci_is_in_hostmode(pc); + bcma_core_pci_early_init(pc); + if (pc->hostmode) bcma_core_pci_hostmode_init(pc); - - if (!pc->hostmode) + else bcma_core_pci_clientmode_init(pc); } diff --git a/drivers/bcma/main.c b/drivers/bcma/main.c index 73b2ee3de972..38bde6eab8a4 100644 --- a/drivers/bcma/main.c +++ b/drivers/bcma/main.c @@ -402,6 +402,13 @@ int bcma_bus_register(struct bcma_bus *bus) bcma_core_chipcommon_early_init(&bus->drv_cc); } + /* Early init PCIE core */ + core = bcma_find_core(bus, BCMA_CORE_PCIE); + if (core) { + bus->drv_pci[0].core = core; + bcma_core_pci_early_init(&bus->drv_pci[0]); + } + /* Cores providing flash access go before SPROM init */ list_for_each_entry(core, &bus->cores, list) { if (bcma_is_core_needed_early(core->id.id)) diff --git a/include/linux/bcma/bcma_driver_pci.h b/include/linux/bcma/bcma_driver_pci.h index 0333e605ea0d..3f809ae372c4 100644 --- a/include/linux/bcma/bcma_driver_pci.h +++ b/include/linux/bcma/bcma_driver_pci.h @@ -223,6 +223,7 @@ struct bcma_drv_pci_host { struct bcma_drv_pci { struct bcma_device *core; + u8 early_setup_done:1; u8 setup_done:1; u8 hostmode:1; @@ -237,6 +238,7 @@ struct bcma_drv_pci { #define pcicore_write16(pc, offset, val) bcma_write16((pc)->core, offset, val) #define pcicore_write32(pc, offset, val) bcma_write32((pc)->core, offset, val) +extern void bcma_core_pci_early_init(struct bcma_drv_pci *pc); extern void bcma_core_pci_init(struct bcma_drv_pci *pc); extern int bcma_core_pci_irq_ctl(struct bcma_drv_pci *pc, struct bcma_device *core, bool enable); -- cgit v1.2.3 From 8be08a39d498d5d93ff5149276e34ccb4ec3757f Mon Sep 17 00:00:00 2001 From: Rafał Miłecki Date: Sun, 25 Jan 2015 13:41:19 +0100 Subject: bcma: implement host code support for PCIe Gen 2 devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is still incomplete, so we don't add PCI IDs of new devices yet. The purpose of this patch is to allow testing & adjusting the rest of the code.
Signed-off-by: Rafał Miłecki Signed-off-by: Kalle Valo --- drivers/bcma/host_pci.c | 6 ++++-- include/linux/bcma/bcma.h | 1 + include/linux/bcma/bcma_regs.h | 2 ++ 3 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/bcma/host_pci.c b/drivers/bcma/host_pci.c index cd9161a8b3a1..53c6a8a58859 100644 --- a/drivers/bcma/host_pci.c +++ b/drivers/bcma/host_pci.c @@ -13,10 +13,12 @@ static void bcma_host_pci_switch_core(struct bcma_device *core) { + int win2 = core->bus->host_is_pcie2 ? + BCMA_PCIE2_BAR0_WIN2 : BCMA_PCI_BAR0_WIN2; + pci_write_config_dword(core->bus->host_pci, BCMA_PCI_BAR0_WIN, core->addr); - pci_write_config_dword(core->bus->host_pci, BCMA_PCI_BAR0_WIN2, - core->wrap); + pci_write_config_dword(core->bus->host_pci, win2, core->wrap); core->bus->mapped_core = core; bcma_debug(core->bus, "Switched to core: 0x%X\n", core->id.id); } diff --git a/include/linux/bcma/bcma.h b/include/linux/bcma/bcma.h index eb1c6a47b67f..994739da827f 100644 --- a/include/linux/bcma/bcma.h +++ b/include/linux/bcma/bcma.h @@ -318,6 +318,7 @@ struct bcma_bus { const struct bcma_host_ops *ops; enum bcma_hosttype hosttype; + bool host_is_pcie2; /* Used for BCMA_HOSTTYPE_PCI only */ union { /* Pointer to the PCI bus (only for BCMA_HOSTTYPE_PCI) */ struct pci_dev *host_pci; diff --git a/include/linux/bcma/bcma_regs.h b/include/linux/bcma/bcma_regs.h index e64ae7bf80a1..ebd5c1fcdea4 100644 --- a/include/linux/bcma/bcma_regs.h +++ b/include/linux/bcma/bcma_regs.h @@ -64,6 +64,8 @@ #define BCMA_PCI_GPIO_XTAL 0x40 /* PCI config space GPIO 14 for Xtal powerup */ #define BCMA_PCI_GPIO_PLL 0x80 /* PCI config space GPIO 15 for PLL powerdown */ +#define BCMA_PCIE2_BAR0_WIN2 0x70 + /* SiliconBackplane Address Map. * All regions may not exist on all chips. */ -- cgit v1.2.3 From 0501be6429e4eb02f417ad83eacd84b8c57b0283 Mon Sep 17 00:00:00 2001 From: Alexey Skidanov Date: Thu, 29 Jan 2015 10:49:43 +0200 Subject: mmc: Resolve BKOPS compatibility issue This patch fixes a compatibility issue with the BKOPS_EN field of EXT_CSD. In eMMC-5.1, BKOPS_EN was changed, and now it has two operational bits: Bit 0 - MANUAL_EN Bit 1 - AUTO_EN In previous eMMC revisions, only Bit 0 was supported.
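As an illustrative sketch of decoding both eMMC-5.1 bits: the patch below only defines the manual mask, so EXT_CSD_AUTO_BKOPS_MASK here is an assumed name for Bit 1 (AUTO_EN), not something this patch adds:

	#define EXT_CSD_MANUAL_BKOPS_MASK	0x01	/* Bit 0 - MANUAL_EN */
	#define EXT_CSD_AUTO_BKOPS_MASK		0x02	/* assumed name, Bit 1 - AUTO_EN */

	u8 bkops_en = ext_csd[EXT_CSD_BKOPS_EN];
	bool man_bkops_en  = bkops_en & EXT_CSD_MANUAL_BKOPS_MASK;
	bool auto_bkops_en = bkops_en & EXT_CSD_AUTO_BKOPS_MASK;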
Signed-off-by: Alexey Skidanov Signed-off-by: Ulf Hansson --- drivers/mmc/core/core.c | 2 +- drivers/mmc/core/mmc.c | 8 +++++--- include/linux/mmc/card.h | 2 +- include/linux/mmc/mmc.h | 5 +++++ 4 files changed, 12 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index 1be7055548cb..0dc64e6e00d4 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -275,7 +275,7 @@ void mmc_start_bkops(struct mmc_card *card, bool from_exception) BUG_ON(!card); - if (!card->ext_csd.bkops_en || mmc_card_doing_bkops(card)) + if (!card->ext_csd.man_bkops_en || mmc_card_doing_bkops(card)) return; err = mmc_read_bkops_status(card); diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index 1fc48a280659..1d41e8541f38 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -483,11 +483,13 @@ static int mmc_decode_ext_csd(struct mmc_card *card, u8 *ext_csd) /* check whether the eMMC card supports BKOPS */ if (ext_csd[EXT_CSD_BKOPS_SUPPORT] & 0x1) { card->ext_csd.bkops = 1; - card->ext_csd.bkops_en = ext_csd[EXT_CSD_BKOPS_EN]; + card->ext_csd.man_bkops_en = + (ext_csd[EXT_CSD_BKOPS_EN] & + EXT_CSD_MANUAL_BKOPS_MASK); card->ext_csd.raw_bkops_status = ext_csd[EXT_CSD_BKOPS_STATUS]; - if (!card->ext_csd.bkops_en) - pr_info("%s: BKOPS_EN bit is not set\n", + if (!card->ext_csd.man_bkops_en) + pr_info("%s: MAN_BKOPS_EN bit is not set\n", mmc_hostname(card->host)); } diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index 4d69c00497bd..a6cf4c063e4e 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -83,7 +83,7 @@ struct mmc_ext_csd { bool hpi; /* HPI support bit */ unsigned int hpi_cmd; /* cmd used as HPI */ bool bkops; /* background support bit */ - bool bkops_en; /* background enable bit */ + bool man_bkops_en; /* manual bkops enable bit */ unsigned int data_sector_size; /* 512 bytes or 4KB */ unsigned int data_tag_unit_size; /* DATA TAG UNIT size */ unsigned int boot_ro_lock; /* ro lock support */ diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h index fb97b5cc91cd..124f562118b8 100644 --- a/include/linux/mmc/mmc.h +++ b/include/linux/mmc/mmc.h @@ -427,6 +427,11 @@ struct _mmc_csd { */ #define EXT_CSD_BKOPS_LEVEL_2 0x2 +/* + * BKOPS modes + */ +#define EXT_CSD_MANUAL_BKOPS_MASK 0x01 + /* * MMC_SWITCH access modes */ -- cgit v1.2.3 From 3b0f1d01e501792d8d89ab4371bc9e8cd2a10032 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Wed, 28 Jan 2015 10:54:23 +0800 Subject: KVM: Rename kvm_arch_mmu_write_protect_pt_masked to be more generic for log dirty We don't have to write protect guest memory for dirty logging if architecture supports hardware dirty logging, such as PML on VMX, so rename it to be more generic. 
Signed-off-by: Kai Huang Reviewed-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/arm/kvm/mmu.c | 18 ++++++++++++++++-- arch/x86/kvm/mmu.c | 21 +++++++++++++++++++-- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 2 +- 4 files changed, 37 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 74aeabaa3c4d..6034697ede3f 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -1081,7 +1081,7 @@ void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) } /** - * kvm_arch_mmu_write_protect_pt_masked() - write protect dirty pages + * kvm_mmu_write_protect_pt_masked() - write protect dirty pages * @kvm: The KVM pointer * @slot: The memory slot associated with mask * @gfn_offset: The gfn offset in memory slot @@ -1091,7 +1091,7 @@ void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) * Walks bits set in mask write protects the associated pte's. Caller must * acquire kvm_mmu_lock. */ -void kvm_arch_mmu_write_protect_pt_masked(struct kvm *kvm, +static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) { @@ -1102,6 +1102,20 @@ void kvm_arch_mmu_write_protect_pt_masked(struct kvm *kvm, stage2_wp_range(kvm, start, end); } +/* + * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected + * dirty pages. + * + * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to + * enable dirty logging for them. + */ +void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); +} + static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_memory_slot *memslot, unsigned long hva, unsigned long fault_status) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 0ed9f795e4f0..b18e65ce3683 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1216,7 +1216,7 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, } /** - * kvm_arch_mmu_write_protect_pt_masked - write protect selected PT level pages + * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages * @kvm: kvm instance * @slot: slot to protect * @gfn_offset: start of the BITS_PER_LONG pages we care about @@ -1225,7 +1225,7 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, * Used when we do not need to care about huge page mappings: e.g. during dirty * logging we do not have any such mappings. */ -void kvm_arch_mmu_write_protect_pt_masked(struct kvm *kvm, +static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) { @@ -1241,6 +1241,23 @@ void kvm_arch_mmu_write_protect_pt_masked(struct kvm *kvm, } } +/** + * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected + * PT level pages. + * + * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to + * enable dirty logging for them. + * + * Used when we do not need to care about huge page mappings: e.g. during dirty + * logging we do not have any such mappings. 
+ */ +void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); +} + static bool rmap_write_protect(struct kvm *kvm, u64 gfn) { struct kvm_memory_slot *slot; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7d6719522f1f..32d057571bf6 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -615,7 +615,7 @@ int kvm_get_dirty_log(struct kvm *kvm, int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log, bool *is_dirty); -void kvm_arch_mmu_write_protect_pt_masked(struct kvm *kvm, +void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a8490f084483..0c281760a1c5 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1059,7 +1059,7 @@ int kvm_get_dirty_log_protect(struct kvm *kvm, dirty_bitmap_buffer[i] = mask; offset = i * BITS_PER_LONG; - kvm_arch_mmu_write_protect_pt_masked(kvm, memslot, offset, + kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset, mask); } -- cgit v1.2.3 From 80b2502cea34a965a6b3390691854e753945ca5f Mon Sep 17 00:00:00 2001 From: Peter Chen Date: Wed, 28 Jan 2015 16:32:24 +0800 Subject: usb: gadget: introduce is_selfpowered for usb_gadget Whether the gadget is self-powered or not can be determined by the composite core, so we can use a common flag to indicate whether self-powered mode is supported by the gadget, and the related private variables in individual UDC drivers can be deleted. Signed-off-by: Peter Chen Signed-off-by: Felipe Balbi --- include/linux/usb/gadget.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h index 70ddb3943b62..e2f00fd8cd47 100644 --- a/include/linux/usb/gadget.h +++ b/include/linux/usb/gadget.h @@ -523,6 +523,7 @@ struct usb_gadget_ops { * enabled HNP support. * @quirk_ep_out_aligned_size: epout requires buffer size to be aligned to * MaxPacketSize. + * @is_selfpowered: if the gadget is self-powered. * * Gadgets have a mostly-portable "gadget driver" implementing device * functions, handling all usb configurations and interfaces. Gadget @@ -563,6 +564,7 @@ struct usb_gadget { unsigned a_hnp_support:1; unsigned a_alt_hnp_support:1; unsigned quirk_ep_out_aligned_size:1; + unsigned is_selfpowered:1; }; #define work_to_gadget(w) (container_of((w), struct usb_gadget, work)) -- cgit v1.2.3 From ac3dd5bd128b1d1ce2a037775766f39d06a4848a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 22 Jan 2015 12:07:58 -0700 Subject: NVMe: avoid kmalloc/kfree for smaller IO Currently we allocate an nvme_iod for each IO, which holds the sg list, prps, and other IO related info. Set a threshold of 2 pages and/or 8KB of data, below which we can just embed this in the per-command pdu in blk-mq. For any IO at or below NVME_INT_PAGES and NVME_INT_BYTES, we save a kmalloc and kfree. For higher IOPS, this saves up to 1% of CPU time.
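The embedding is carried by a small pointer-tagging trick: iod->private stores the struct request pointer, which is at least 2-byte aligned, so bit 0 is free to mean "embedded in the pdu, do not kfree". A self-contained sketch of the scheme follows (tagged_ref and the tag_* helpers are illustrative names, not driver code); it has exactly the shape of iod_get_private() and iod_should_kfree() in the diff below.

#include <stdbool.h>	/* in-kernel, bool comes from <linux/types.h> */

struct tagged_ref {
	unsigned long private;	/* pointer value, embed flag in bit 0 */
};

static void tag_init(struct tagged_ref *t, void *obj, bool embedded)
{
	/* obj must be at least 2-byte aligned so that bit 0 is unused */
	t->private = (unsigned long)obj | (embedded ? 0x1UL : 0x0UL);
}

static void *tag_get_ptr(struct tagged_ref *t)
{
	return (void *)(t->private & ~0x1UL);
}

static bool tag_should_kfree(struct tagged_ref *t)
{
	/* an embedded object lives in preallocated space, never free it */
	return (t->private & 0x1UL) == 0;
}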
Signed-off-by: Jens Axboe Reviewed-by: Keith Busch --- drivers/block/nvme-core.c | 119 ++++++++++++++++++++++++++++++++++------------ include/linux/nvme.h | 3 +- 2 files changed, 89 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index f4aa64160838..3eaa0becc52d 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -144,8 +144,37 @@ struct nvme_cmd_info { void *ctx; int aborted; struct nvme_queue *nvmeq; + struct nvme_iod iod[0]; }; +/* + * Max size of iod being embedded in the request payload + */ +#define NVME_INT_PAGES 2 +#define NVME_INT_BYTES(dev) (NVME_INT_PAGES * (dev)->page_size) + +/* + * Will slightly overestimate the number of pages needed. This is OK + * as it only leads to a small amount of wasted memory for the lifetime of + * the I/O. + */ +static int nvme_npages(unsigned size, struct nvme_dev *dev) +{ + unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size); + return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); +} + +static unsigned int nvme_cmd_size(struct nvme_dev *dev) +{ + unsigned int ret = sizeof(struct nvme_cmd_info); + + ret += sizeof(struct nvme_iod); + ret += sizeof(__le64 *) * nvme_npages(NVME_INT_BYTES(dev), dev); + ret += sizeof(struct scatterlist) * NVME_INT_PAGES; + + return ret; +} + static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { @@ -217,6 +246,19 @@ static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx, cmd->aborted = 0; } +static void *iod_get_private(struct nvme_iod *iod) +{ + return (void *) (iod->private & ~0x1UL); +} + +/* + * If bit 0 is set, the iod is embedded in the request payload. + */ +static bool iod_should_kfree(struct nvme_iod *iod) +{ + return (iod->private & 0x01) == 0; +} + /* Special values must be less than 0x1000 */ #define CMD_CTX_BASE ((void *)POISON_POINTER_DELTA) #define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE) @@ -360,35 +402,53 @@ static __le64 **iod_list(struct nvme_iod *iod) return ((void *)iod) + iod->offset; } -/* - * Will slightly overestimate the number of pages needed. This is OK - * as it only leads to a small amount of wasted memory for the lifetime of - * the I/O. - */ -static int nvme_npages(unsigned size, struct nvme_dev *dev) +static inline void iod_init(struct nvme_iod *iod, unsigned nbytes, + unsigned nseg, unsigned long private) { - unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size); - return DIV_ROUND_UP(8 * nprps, dev->page_size - 8); + iod->private = private; + iod->offset = offsetof(struct nvme_iod, sg[nseg]); + iod->npages = -1; + iod->length = nbytes; + iod->nents = 0; } static struct nvme_iod * -nvme_alloc_iod(unsigned nseg, unsigned nbytes, struct nvme_dev *dev, gfp_t gfp) +__nvme_alloc_iod(unsigned nseg, unsigned bytes, struct nvme_dev *dev, + unsigned long priv, gfp_t gfp) { struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) + - sizeof(__le64 *) * nvme_npages(nbytes, dev) + + sizeof(__le64 *) * nvme_npages(bytes, dev) + sizeof(struct scatterlist) * nseg, gfp); - if (iod) { - iod->offset = offsetof(struct nvme_iod, sg[nseg]); - iod->npages = -1; - iod->length = nbytes; - iod->nents = 0; - iod->first_dma = 0ULL; - } + if (iod) + iod_init(iod, bytes, nseg, priv); return iod; } +static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev, + gfp_t gfp) +{ + unsigned size = !(rq->cmd_flags & REQ_DISCARD) ? 
blk_rq_bytes(rq) : + sizeof(struct nvme_dsm_range); + unsigned long mask = 0; + struct nvme_iod *iod; + + if (rq->nr_phys_segments <= NVME_INT_PAGES && + size <= NVME_INT_BYTES(dev)) { + struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(rq); + + iod = cmd->iod; + mask = 0x01; + iod_init(iod, size, rq->nr_phys_segments, + (unsigned long) rq | 0x01); + return iod; + } + + return __nvme_alloc_iod(rq->nr_phys_segments, size, dev, + (unsigned long) rq, gfp); +} + void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) { const int last_prp = dev->page_size / 8 - 1; @@ -404,7 +464,9 @@ void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) dma_pool_free(dev->prp_page_pool, prp_list, prp_dma); prp_dma = next_prp_dma; } - kfree(iod); + + if (iod_should_kfree(iod)) + kfree(iod); } static int nvme_error_status(u16 status) @@ -423,7 +485,7 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) { struct nvme_iod *iod = ctx; - struct request *req = iod->private; + struct request *req = iod_get_private(iod); struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); u16 status = le16_to_cpup(&cqe->status) >> 1; @@ -579,7 +641,7 @@ static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod, struct nvme_ns *ns) { - struct request *req = iod->private; + struct request *req = iod_get_private(iod); struct nvme_command *cmnd; u16 control = 0; u32 dsmgmt = 0; @@ -620,17 +682,12 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req = bd->rq; struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); struct nvme_iod *iod; - int psegs = req->nr_phys_segments; enum dma_data_direction dma_dir; - unsigned size = !(req->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(req) : - sizeof(struct nvme_dsm_range); - iod = nvme_alloc_iod(psegs, size, ns->dev, GFP_ATOMIC); + iod = nvme_alloc_iod(req, ns->dev, GFP_ATOMIC); if (!iod) return BLK_MQ_RQ_QUEUE_BUSY; - iod->private = req; - if (req->cmd_flags & REQ_DISCARD) { void *range; /* @@ -645,10 +702,10 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, goto retry_cmd; iod_list(iod)[0] = (__le64 *)range; iod->npages = 0; - } else if (psegs) { + } else if (req->nr_phys_segments) { dma_dir = rq_data_dir(req) ? 
DMA_TO_DEVICE : DMA_FROM_DEVICE; - sg_init_table(iod->sg, psegs); + sg_init_table(iod->sg, req->nr_phys_segments); iod->nents = blk_rq_map_sg(req->q, req, iod->sg); if (!iod->nents) goto error_cmd; @@ -1362,7 +1419,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1; dev->admin_tagset.timeout = ADMIN_TIMEOUT; dev->admin_tagset.numa_node = dev_to_node(&dev->pci_dev->dev); - dev->admin_tagset.cmd_size = sizeof(struct nvme_cmd_info); + dev->admin_tagset.cmd_size = nvme_cmd_size(dev); dev->admin_tagset.driver_data = dev; if (blk_mq_alloc_tag_set(&dev->admin_tagset)) @@ -1483,7 +1540,7 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, } err = -ENOMEM; - iod = nvme_alloc_iod(count, length, dev, GFP_KERNEL); + iod = __nvme_alloc_iod(count, length, dev, 0, GFP_KERNEL); if (!iod) goto put_pages; @@ -2109,7 +2166,7 @@ static int nvme_dev_add(struct nvme_dev *dev) dev->tagset.numa_node = dev_to_node(&dev->pci_dev->dev); dev->tagset.queue_depth = min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1; - dev->tagset.cmd_size = sizeof(struct nvme_cmd_info); + dev->tagset.cmd_size = nvme_cmd_size(dev); dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; dev->tagset.driver_data = dev; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 258945fcabf1..19a5d4b23209 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -132,13 +132,12 @@ struct nvme_ns { * allocated to store the PRP list. */ struct nvme_iod { - void *private; /* For the use of the submitter of the I/O */ + unsigned long private; /* For the use of the submitter of the I/O */ int npages; /* In the PRP list. 0 means small pool in use */ int offset; /* Of PRP list */ int nents; /* Used in scatterlist */ int length; /* Of data, in bytes */ dma_addr_t first_dma; - struct list_head node; struct scatterlist sg[0]; }; -- cgit v1.2.3 From 50dd64d57eee8aed1a86e875ce77d078e44fad00 Mon Sep 17 00:00:00 2001 From: Karol Wrona Date: Wed, 28 Jan 2015 15:05:50 +0100 Subject: iio: common: ssp_sensors: Add sensorhub driver Sensorhub is an MCU dedicated to collecting data from and managing several sensors. It is an SPI device which provides a layer for IIO devices, along with data parsing and a common mechanism for sensorhub sensors. Add a common sensorhub library for the sensorhub driver and the IIO drivers which use the sensorhub MCU to communicate with their sensors.
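As a rough sketch of how an IIO sensor driver is expected to sit on top of this library (the my_accel_* wrappers are hypothetical; only the ssp_* calls, SSP_ACCELEROMETER_SENSOR and the 200 ms default period come from the interface added here):

/*
 * Hypothetical consumer: register an IIO device for a sensor type,
 * then start and stop data acquisition through the library.
 */
static int my_accel_start(struct ssp_data *ssp, struct iio_dev *indio_dev)
{
	ssp_register_consumer(indio_dev, SSP_ACCELEROMETER_SENSOR);
	/* request a sample every 200 ms, the library's default period */
	return ssp_enable_sensor(ssp, SSP_ACCELEROMETER_SENSOR, 200);
}

static int my_accel_stop(struct ssp_data *ssp)
{
	return ssp_disable_sensor(ssp, SSP_ACCELEROMETER_SENSOR);
}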
Signed-off-by: Karol Wrona Acked-by: Kyungmin Park Signed-off-by: Jonathan Cameron --- drivers/iio/common/Kconfig | 1 + drivers/iio/common/Makefile | 1 + drivers/iio/common/ssp_sensors/Kconfig | 16 + drivers/iio/common/ssp_sensors/Makefile | 6 + drivers/iio/common/ssp_sensors/ssp.h | 257 +++++++++++ drivers/iio/common/ssp_sensors/ssp_dev.c | 712 +++++++++++++++++++++++++++++++ drivers/iio/common/ssp_sensors/ssp_spi.c | 608 ++++++++++++++++++++++++++ include/linux/iio/common/ssp_sensors.h | 82 ++++ 8 files changed, 1683 insertions(+) create mode 100644 drivers/iio/common/ssp_sensors/Kconfig create mode 100644 drivers/iio/common/ssp_sensors/Makefile create mode 100644 drivers/iio/common/ssp_sensors/ssp.h create mode 100644 drivers/iio/common/ssp_sensors/ssp_dev.c create mode 100644 drivers/iio/common/ssp_sensors/ssp_spi.c create mode 100644 include/linux/iio/common/ssp_sensors.h (limited to 'include/linux') diff --git a/drivers/iio/common/Kconfig b/drivers/iio/common/Kconfig index 0b6e97d18fa0..790f106d719c 100644 --- a/drivers/iio/common/Kconfig +++ b/drivers/iio/common/Kconfig @@ -3,4 +3,5 @@ # source "drivers/iio/common/hid-sensors/Kconfig" +source "drivers/iio/common/ssp_sensors/Kconfig" source "drivers/iio/common/st_sensors/Kconfig" diff --git a/drivers/iio/common/Makefile b/drivers/iio/common/Makefile index 3112df0060e9..b1e4d9c9591c 100644 --- a/drivers/iio/common/Makefile +++ b/drivers/iio/common/Makefile @@ -8,4 +8,5 @@ # When adding new entries keep the list in alphabetical order obj-y += hid-sensors/ +obj-y += ssp_sensors/ obj-y += st_sensors/ diff --git a/drivers/iio/common/ssp_sensors/Kconfig b/drivers/iio/common/ssp_sensors/Kconfig new file mode 100644 index 000000000000..759b975e2b40 --- /dev/null +++ b/drivers/iio/common/ssp_sensors/Kconfig @@ -0,0 +1,16 @@ +# +# SSP sensor drivers and commons configuration +# +menu "SSP Sensor Common" + +config IIO_SSP_SENSORHUB + tristate "Samsung Sensorhub driver" + depends on SPI + select MFD_CORE + help + SSP driver for sensorhub. + If you say yes here you get ssp support for sensorhub. + To compile this driver as a module, choose M here: the + module will be called sensorhub. + +endmenu diff --git a/drivers/iio/common/ssp_sensors/Makefile b/drivers/iio/common/ssp_sensors/Makefile new file mode 100644 index 000000000000..07d3d6a6763b --- /dev/null +++ b/drivers/iio/common/ssp_sensors/Makefile @@ -0,0 +1,6 @@ +# +# Makefile for SSP sensor drivers and commons. +# + +sensorhub-objs := ssp_dev.o ssp_spi.o +obj-$(CONFIG_IIO_SSP_SENSORHUB) += sensorhub.o diff --git a/drivers/iio/common/ssp_sensors/ssp.h b/drivers/iio/common/ssp_sensors/ssp.h new file mode 100644 index 000000000000..b910e91d7c0d --- /dev/null +++ b/drivers/iio/common/ssp_sensors/ssp.h @@ -0,0 +1,257 @@ +/* + * Copyright (C) 2014, Samsung Electronics Co. Ltd. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#ifndef __SSP_SENSORHUB_H__ +#define __SSP_SENSORHUB_H__ + +#include +#include +#include +#include +#include + +#define SSP_DEVICE_ID 0x55 + +#ifdef SSP_DBG +#define ssp_dbg(format, ...) 
pr_info("[SSP] "format, ##__VA_ARGS__) +#else +#define ssp_dbg(format, ...) +#endif + +#define SSP_SW_RESET_TIME 3000 +/* Sensor polling in ms */ +#define SSP_DEFAULT_POLLING_DELAY 200 +#define SSP_DEFAULT_RETRIES 3 +#define SSP_DATA_PACKET_SIZE 960 +#define SSP_HEADER_BUFFER_SIZE 4 + +enum { + SSP_KERNEL_BINARY = 0, + SSP_KERNEL_CRASHED_BINARY, +}; + +enum { + SSP_INITIALIZATION_STATE = 0, + SSP_NO_SENSOR_STATE, + SSP_ADD_SENSOR_STATE, + SSP_RUNNING_SENSOR_STATE, +}; + +/* Firmware download STATE */ +enum { + SSP_FW_DL_STATE_FAIL = -1, + SSP_FW_DL_STATE_NONE = 0, + SSP_FW_DL_STATE_NEED_TO_SCHEDULE, + SSP_FW_DL_STATE_SCHEDULED, + SSP_FW_DL_STATE_DOWNLOADING, + SSP_FW_DL_STATE_SYNC, + SSP_FW_DL_STATE_DONE, +}; + +#define SSP_INVALID_REVISION 99999 +#define SSP_INVALID_REVISION2 0xffffff + +/* AP -> SSP Instruction */ +#define SSP_MSG2SSP_INST_BYPASS_SENSOR_ADD 0xa1 +#define SSP_MSG2SSP_INST_BYPASS_SENSOR_RM 0xa2 +#define SSP_MSG2SSP_INST_REMOVE_ALL 0xa3 +#define SSP_MSG2SSP_INST_CHANGE_DELAY 0xa4 +#define SSP_MSG2SSP_INST_LIBRARY_ADD 0xb1 +#define SSP_MSG2SSP_INST_LIBRARY_REMOVE 0xb2 +#define SSP_MSG2SSP_INST_LIB_NOTI 0xb4 +#define SSP_MSG2SSP_INST_LIB_DATA 0xc1 + +#define SSP_MSG2SSP_AP_MCU_SET_GYRO_CAL 0xcd +#define SSP_MSG2SSP_AP_MCU_SET_ACCEL_CAL 0xce +#define SSP_MSG2SSP_AP_STATUS_SHUTDOWN 0xd0 +#define SSP_MSG2SSP_AP_STATUS_WAKEUP 0xd1 +#define SSP_MSG2SSP_AP_STATUS_SLEEP 0xd2 +#define SSP_MSG2SSP_AP_STATUS_RESUME 0xd3 +#define SSP_MSG2SSP_AP_STATUS_SUSPEND 0xd4 +#define SSP_MSG2SSP_AP_STATUS_RESET 0xd5 +#define SSP_MSG2SSP_AP_STATUS_POW_CONNECTED 0xd6 +#define SSP_MSG2SSP_AP_STATUS_POW_DISCONNECTED 0xd7 +#define SSP_MSG2SSP_AP_TEMPHUMIDITY_CAL_DONE 0xda +#define SSP_MSG2SSP_AP_MCU_SET_DUMPMODE 0xdb +#define SSP_MSG2SSP_AP_MCU_DUMP_CHECK 0xdc +#define SSP_MSG2SSP_AP_MCU_BATCH_FLUSH 0xdd +#define SSP_MSG2SSP_AP_MCU_BATCH_COUNT 0xdf + +#define SSP_MSG2SSP_AP_WHOAMI 0x0f +#define SSP_MSG2SSP_AP_FIRMWARE_REV 0xf0 +#define SSP_MSG2SSP_AP_SENSOR_FORMATION 0xf1 +#define SSP_MSG2SSP_AP_SENSOR_PROXTHRESHOLD 0xf2 +#define SSP_MSG2SSP_AP_SENSOR_BARCODE_EMUL 0xf3 +#define SSP_MSG2SSP_AP_SENSOR_SCANNING 0xf4 +#define SSP_MSG2SSP_AP_SET_MAGNETIC_HWOFFSET 0xf5 +#define SSP_MSG2SSP_AP_GET_MAGNETIC_HWOFFSET 0xf6 +#define SSP_MSG2SSP_AP_SENSOR_GESTURE_CURRENT 0xf7 +#define SSP_MSG2SSP_AP_GET_THERM 0xf8 +#define SSP_MSG2SSP_AP_GET_BIG_DATA 0xf9 +#define SSP_MSG2SSP_AP_SET_BIG_DATA 0xfa +#define SSP_MSG2SSP_AP_START_BIG_DATA 0xfb +#define SSP_MSG2SSP_AP_SET_MAGNETIC_STATIC_MATRIX 0xfd +#define SSP_MSG2SSP_AP_SENSOR_TILT 0xea +#define SSP_MSG2SSP_AP_MCU_SET_TIME 0xfe +#define SSP_MSG2SSP_AP_MCU_GET_TIME 0xff + +#define SSP_MSG2SSP_AP_FUSEROM 0x01 + +/* voice data */ +#define SSP_TYPE_WAKE_UP_VOICE_SERVICE 0x01 +#define SSP_TYPE_WAKE_UP_VOICE_SOUND_SOURCE_AM 0x01 +#define SSP_TYPE_WAKE_UP_VOICE_SOUND_SOURCE_GRAMMER 0x02 + +/* Factory Test */ +#define SSP_ACCELEROMETER_FACTORY 0x80 +#define SSP_GYROSCOPE_FACTORY 0x81 +#define SSP_GEOMAGNETIC_FACTORY 0x82 +#define SSP_PRESSURE_FACTORY 0x85 +#define SSP_GESTURE_FACTORY 0x86 +#define SSP_TEMPHUMIDITY_CRC_FACTORY 0x88 +#define SSP_GYROSCOPE_TEMP_FACTORY 0x8a +#define SSP_GYROSCOPE_DPS_FACTORY 0x8b +#define SSP_MCU_FACTORY 0x8c +#define SSP_MCU_SLEEP_FACTORY 0x8d + +/* SSP -> AP ACK about write CMD */ +#define SSP_MSG_ACK 0x80 /* ACK from SSP to AP */ +#define SSP_MSG_NAK 0x70 /* NAK from SSP to AP */ + +struct ssp_sensorhub_info { + char *fw_name; + char *fw_crashed_name; + unsigned int fw_rev; + const u8 * const mag_table; + const unsigned int mag_length; +}; 
+ +/* ssp_msg options bit */ +#define SSP_RW 0 +#define SSP_INDEX 3 + +#define SSP_AP2HUB_READ 0 +#define SSP_AP2HUB_WRITE 1 +#define SSP_HUB2AP_WRITE 2 +#define SSP_AP2HUB_READY 3 +#define SSP_AP2HUB_RETURN 4 + +/** + * struct ssp_data - ssp platformdata structure + * @spi: spi device + * @sensorhub_info: info about sensorhub board specific features + * @wdt_timer: watchdog timer + * @work_wdt: watchdog work + * @work_firmware: firmware upgrade work queue + * @work_refresh: refresh work queue for reset request from MCU + * @shut_down: shut down flag + * @mcu_dump_mode: mcu dump mode for debug + * @time_syncing: time syncing indication flag + * @timestamp: previous time in ns calculated for time syncing + * @check_status: status table for each sensor + * @com_fail_cnt: communication fail count + * @reset_cnt: reset count + * @timeout_cnt: timeout count + * @available_sensors: available sensors seen by sensorhub (bit array) + * @cur_firm_rev: cached current firmware revision + * @last_resume_state: last AP resume/suspend state used to handle the PM + * state of ssp + * @last_ap_state: (obsolete) sleep notification for MCU + * @sensor_enable: sensor enable mask + * @delay_buf: data acquisition intervals table + * @batch_latency_buf: yet unknown but existing in communication protocol + * @batch_opt_buf: yet unknown but existing in communication protocol + * @accel_position: yet unknown but existing in communication protocol + * @mag_position: yet unknown but existing in communication protocol + * @fw_dl_state: firmware download state + * @comm_lock: lock protecting the handshake + * @pending_lock: lock protecting pending list and completion + * @mcu_reset_gpio: mcu reset line + * @ap_mcu_gpio: ap to mcu gpio line + * @mcu_ap_gpio: mcu to ap gpio line + * @pending_list: pending list for messages queued to be sent/read + * @sensor_devs: registered IIO devices table + * @enable_refcount: enable reference count for wdt (watchdog timer) + * @header_buffer: cache aligned buffer for packet header + */ +struct ssp_data { + struct spi_device *spi; + struct ssp_sensorhub_info *sensorhub_info; + struct timer_list wdt_timer; + struct work_struct work_wdt; + struct delayed_work work_refresh; + + bool shut_down; + bool mcu_dump_mode; + bool time_syncing; + int64_t timestamp; + + int check_status[SSP_SENSOR_MAX]; + + unsigned int com_fail_cnt; + unsigned int reset_cnt; + unsigned int timeout_cnt; + + unsigned int available_sensors; + unsigned int cur_firm_rev; + + char last_resume_state; + char last_ap_state; + + unsigned int sensor_enable; + u32 delay_buf[SSP_SENSOR_MAX]; + s32 batch_latency_buf[SSP_SENSOR_MAX]; + s8 batch_opt_buf[SSP_SENSOR_MAX]; + + int accel_position; + int mag_position; + int fw_dl_state; + + struct mutex comm_lock; + struct mutex pending_lock; + + int mcu_reset_gpio; + int ap_mcu_gpio; + int mcu_ap_gpio; + + struct list_head pending_list; + + struct iio_dev *sensor_devs[SSP_SENSOR_MAX]; + atomic_t enable_refcount; + + __le16 header_buffer[SSP_HEADER_BUFFER_SIZE / sizeof(__le16)] + ____cacheline_aligned; +}; + +void ssp_clean_pending_list(struct ssp_data *data); + +int ssp_command(struct ssp_data *data, char command, int arg); + +int ssp_send_instruction(struct ssp_data *data, u8 inst, u8 sensor_type, + u8 *send_buf, u8 length); + +int ssp_irq_msg(struct ssp_data *data); + +int ssp_get_chipid(struct ssp_data *data); + +int ssp_set_magnetic_matrix(struct ssp_data *data); + +unsigned int ssp_get_sensor_scanning_info(struct ssp_data *data); + +unsigned int ssp_get_firmware_rev(struct 
ssp_data *data); + +int ssp_queue_ssp_refresh_task(struct ssp_data *data, unsigned int delay); + +#endif /* __SSP_SENSORHUB_H__ */ diff --git a/drivers/iio/common/ssp_sensors/ssp_dev.c b/drivers/iio/common/ssp_sensors/ssp_dev.c new file mode 100644 index 000000000000..52d70435f5a1 --- /dev/null +++ b/drivers/iio/common/ssp_sensors/ssp_dev.c @@ -0,0 +1,712 @@ +/* + * Copyright (C) 2014, Samsung Electronics Co. Ltd. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "ssp.h" + +#define SSP_WDT_TIME 10000 +#define SSP_LIMIT_RESET_CNT 20 +#define SSP_LIMIT_TIMEOUT_CNT 3 + +/* It is possible that it is max clk rate for version 1.0 of bootcode */ +#define SSP_BOOT_SPI_HZ 400000 + +/* + * These fields can look enigmatic but this structure is used mainly to flat + * some values and depends on command type. + */ +struct ssp_instruction { + __le32 a; + __le32 b; + u8 c; +} __attribute__((__packed__)); + +static const u8 ssp_magnitude_table[] = {110, 85, 171, 71, 203, 195, 0, 67, + 208, 56, 175, 244, 206, 213, 0, 92, 250, 0, 55, 48, 189, 252, 171, + 243, 13, 45, 250}; + +static const struct ssp_sensorhub_info ssp_rinato_info = { + .fw_name = "ssp_B2.fw", + .fw_crashed_name = "ssp_crashed.fw", + .fw_rev = 14052300, + .mag_table = ssp_magnitude_table, + .mag_length = ARRAY_SIZE(ssp_magnitude_table), +}; + +static const struct ssp_sensorhub_info ssp_thermostat_info = { + .fw_name = "thermostat_B2.fw", + .fw_crashed_name = "ssp_crashed.fw", + .fw_rev = 14080600, + .mag_table = ssp_magnitude_table, + .mag_length = ARRAY_SIZE(ssp_magnitude_table), +}; + +static const struct mfd_cell sensorhub_sensor_devs[] = { + { + .name = "ssp-accelerometer", + }, + { + .name = "ssp-gyroscope", + }, +}; + +static void ssp_toggle_mcu_reset_gpio(struct ssp_data *data) +{ + gpio_set_value(data->mcu_reset_gpio, 0); + usleep_range(1000, 1200); + gpio_set_value(data->mcu_reset_gpio, 1); + msleep(50); +} + +static void ssp_sync_available_sensors(struct ssp_data *data) +{ + int i, ret; + + for (i = 0; i < SSP_SENSOR_MAX; ++i) { + if (data->available_sensors & BIT(i)) { + ret = ssp_enable_sensor(data, i, data->delay_buf[i]); + if (ret < 0) { + dev_err(&data->spi->dev, + "Sync sensor nr: %d fail\n", i); + continue; + } + } + } + + ret = ssp_command(data, SSP_MSG2SSP_AP_MCU_SET_DUMPMODE, + data->mcu_dump_mode); + if (ret < 0) + dev_err(&data->spi->dev, + "SSP_MSG2SSP_AP_MCU_SET_DUMPMODE failed\n"); +} + +static void ssp_enable_mcu(struct ssp_data *data, bool enable) +{ + dev_info(&data->spi->dev, "current shutdown = %d, old = %d\n", enable, + data->shut_down); + + if (enable && data->shut_down) { + data->shut_down = false; + enable_irq(data->spi->irq); + enable_irq_wake(data->spi->irq); + } else if (!enable && !data->shut_down) { + data->shut_down = true; + disable_irq(data->spi->irq); + disable_irq_wake(data->spi->irq); + } else { + dev_warn(&data->spi->dev, "current shutdown = %d, old = %d\n", + enable, data->shut_down); + } +} + +/* + * This function is the 
first one which communicates with the mcu so it is + * possible that the first attempt will fail + */ +static int ssp_check_fwbl(struct ssp_data *data) +{ + int retries = 0; + + while (retries++ < 5) { + data->cur_firm_rev = ssp_get_firmware_rev(data); + if (data->cur_firm_rev == SSP_INVALID_REVISION || + data->cur_firm_rev == SSP_INVALID_REVISION2) { + dev_warn(&data->spi->dev, + "Invalid revision, trying %d time\n", retries); + } else { + break; + } + } + + if (data->cur_firm_rev == SSP_INVALID_REVISION || + data->cur_firm_rev == SSP_INVALID_REVISION2) { + dev_err(&data->spi->dev, "SSP_INVALID_REVISION\n"); + return SSP_FW_DL_STATE_NEED_TO_SCHEDULE; + } + + dev_info(&data->spi->dev, + "MCU Firm Rev : Old = %8u, New = %8u\n", + data->cur_firm_rev, + data->sensorhub_info->fw_rev); + + if (data->cur_firm_rev != data->sensorhub_info->fw_rev) + return SSP_FW_DL_STATE_NEED_TO_SCHEDULE; + + return SSP_FW_DL_STATE_NONE; +} + +static void ssp_reset_mcu(struct ssp_data *data) +{ + ssp_enable_mcu(data, false); + ssp_clean_pending_list(data); + ssp_toggle_mcu_reset_gpio(data); + ssp_enable_mcu(data, true); +} + +static void ssp_wdt_work_func(struct work_struct *work) +{ + struct ssp_data *data = container_of(work, struct ssp_data, work_wdt); + + dev_err(&data->spi->dev, "%s - Sensor state: 0x%x, RC: %u, CC: %u\n", + __func__, data->available_sensors, data->reset_cnt, + data->com_fail_cnt); + + ssp_reset_mcu(data); + data->com_fail_cnt = 0; + data->timeout_cnt = 0; +} + +static void ssp_wdt_timer_func(unsigned long ptr) +{ + struct ssp_data *data = (struct ssp_data *)ptr; + + switch (data->fw_dl_state) { + case SSP_FW_DL_STATE_FAIL: + case SSP_FW_DL_STATE_DOWNLOADING: + case SSP_FW_DL_STATE_SYNC: + goto _mod; + } + + if (data->timeout_cnt > SSP_LIMIT_TIMEOUT_CNT || + data->com_fail_cnt > SSP_LIMIT_RESET_CNT) + queue_work(system_power_efficient_wq, &data->work_wdt); +_mod: + mod_timer(&data->wdt_timer, jiffies + msecs_to_jiffies(SSP_WDT_TIME)); +} + +static void ssp_enable_wdt_timer(struct ssp_data *data) +{ + mod_timer(&data->wdt_timer, jiffies + msecs_to_jiffies(SSP_WDT_TIME)); +} + +static void ssp_disable_wdt_timer(struct ssp_data *data) +{ + del_timer_sync(&data->wdt_timer); + cancel_work_sync(&data->work_wdt); +} + +/** + * ssp_get_sensor_delay() - gets sensor data acquisition period + * @data: sensorhub structure + * @type: SSP sensor type + * + * Returns acquisition period in ms + */ +u32 ssp_get_sensor_delay(struct ssp_data *data, enum ssp_sensor_type type) +{ + return data->delay_buf[type]; +} +EXPORT_SYMBOL(ssp_get_sensor_delay); + +/** + * ssp_enable_sensor() - enables data acquisition for sensor + * @data: sensorhub structure + * @type: SSP sensor type + * @delay: delay in ms + * + * Returns 0 or negative value in case of error + */ +int ssp_enable_sensor(struct ssp_data *data, enum ssp_sensor_type type, + u32 delay) +{ + int ret; + struct ssp_instruction to_send; + + to_send.a = cpu_to_le32(delay); + to_send.b = cpu_to_le32(data->batch_latency_buf[type]); + to_send.c = data->batch_opt_buf[type]; + + switch (data->check_status[type]) { + case SSP_INITIALIZATION_STATE: + /* do calibration step, now just enable */ + case SSP_ADD_SENSOR_STATE: + ret = ssp_send_instruction(data, + SSP_MSG2SSP_INST_BYPASS_SENSOR_ADD, + type, + (u8 *)&to_send, sizeof(to_send)); + if (ret < 0) { + dev_err(&data->spi->dev, "Enabling sensor failed\n"); + data->check_status[type] = SSP_NO_SENSOR_STATE; + goto derror; + } + + data->sensor_enable |= BIT(type); + data->check_status[type] = SSP_RUNNING_SENSOR_STATE; + 
break; + case SSP_RUNNING_SENSOR_STATE: + ret = ssp_send_instruction(data, + SSP_MSG2SSP_INST_CHANGE_DELAY, type, + (u8 *)&to_send, sizeof(to_send)); + if (ret < 0) { + dev_err(&data->spi->dev, + "Changing sensor delay failed\n"); + goto derror; + } + break; + default: + data->check_status[type] = SSP_ADD_SENSOR_STATE; + break; + } + + data->delay_buf[type] = delay; + + if (atomic_inc_return(&data->enable_refcount) == 1) + ssp_enable_wdt_timer(data); + + return 0; + +derror: + return ret; +} +EXPORT_SYMBOL(ssp_enable_sensor); + +/** + * ssp_change_delay() - changes data acquisition for sensor + * @data: sensorhub structure + * @type: SSP sensor type + * @delay: delay in ms + * + * Returns 0 or negative value in case of error + */ +int ssp_change_delay(struct ssp_data *data, enum ssp_sensor_type type, + u32 delay) +{ + int ret; + struct ssp_instruction to_send; + + to_send.a = cpu_to_le32(delay); + to_send.b = cpu_to_le32(data->batch_latency_buf[type]); + to_send.c = data->batch_opt_buf[type]; + + ret = ssp_send_instruction(data, SSP_MSG2SSP_INST_CHANGE_DELAY, type, + (u8 *)&to_send, sizeof(to_send)); + if (ret < 0) { + dev_err(&data->spi->dev, "Changing sensor delay failed\n"); + return ret; + } + + data->delay_buf[type] = delay; + + return 0; +} +EXPORT_SYMBOL(ssp_change_delay); + +/** + * ssp_disable_sensor() - disables sensor + * + * @data: sensorhub structure + * @type: SSP sensor type + * + * Returns 0 or negative value in case of error + */ +int ssp_disable_sensor(struct ssp_data *data, enum ssp_sensor_type type) +{ + int ret; + __le32 command; + + if (data->sensor_enable & BIT(type)) { + command = cpu_to_le32(data->delay_buf[type]); + + ret = ssp_send_instruction(data, + SSP_MSG2SSP_INST_BYPASS_SENSOR_RM, + type, (u8 *)&command, + sizeof(command)); + if (ret < 0) { + dev_err(&data->spi->dev, "Remove sensor fail\n"); + return ret; + } + + data->sensor_enable &= ~BIT(type); + } + + data->check_status[type] = SSP_ADD_SENSOR_STATE; + + if (atomic_dec_and_test(&data->enable_refcount)) + ssp_disable_wdt_timer(data); + + return 0; +} +EXPORT_SYMBOL(ssp_disable_sensor); + +static irqreturn_t ssp_irq_thread_fn(int irq, void *dev_id) +{ + struct ssp_data *data = dev_id; + + /* + * This wrapper is done to preserve error path for ssp_irq_msg, also + * it is defined in different file. + */ + ssp_irq_msg(data); + + return IRQ_HANDLED; +} + +static int ssp_initialize_mcu(struct ssp_data *data) +{ + int ret; + + ssp_clean_pending_list(data); + + ret = ssp_get_chipid(data); + if (ret != SSP_DEVICE_ID) { + dev_err(&data->spi->dev, "%s - MCU %s ret = %d\n", __func__, + ret < 0 ? "is not working" : "identification failed", + ret); + return ret < 0 ? ret : -ENODEV; + } + + dev_info(&data->spi->dev, "MCU device ID = %d\n", ret); + + /* + * needs clarification, for now do not want to export all transfer + * methods to sensors' drivers + */ + ret = ssp_set_magnetic_matrix(data); + if (ret < 0) { + dev_err(&data->spi->dev, + "%s - ssp_set_magnetic_matrix failed\n", __func__); + return ret; + } + + data->available_sensors = ssp_get_sensor_scanning_info(data); + if (data->available_sensors == 0) { + dev_err(&data->spi->dev, + "%s - ssp_get_sensor_scanning_info failed\n", __func__); + return -EIO; + } + + data->cur_firm_rev = ssp_get_firmware_rev(data); + dev_info(&data->spi->dev, "MCU Firm Rev : New = %8u\n", + data->cur_firm_rev); + + return ssp_command(data, SSP_MSG2SSP_AP_MCU_DUMP_CHECK, 0); +} + +/* + * sensorhub can request its reinitialization as some brutal and rare error + * handling. 
It can be requested from the MCU. + */ +static void ssp_refresh_task(struct work_struct *work) +{ + struct ssp_data *data = container_of((struct delayed_work *)work, + struct ssp_data, work_refresh); + + dev_info(&data->spi->dev, "refreshing\n"); + + data->reset_cnt++; + + if (ssp_initialize_mcu(data) >= 0) { + ssp_sync_available_sensors(data); + if (data->last_ap_state != 0) + ssp_command(data, data->last_ap_state, 0); + + if (data->last_resume_state != 0) + ssp_command(data, data->last_resume_state, 0); + + data->timeout_cnt = 0; + data->com_fail_cnt = 0; + } +} + +int ssp_queue_ssp_refresh_task(struct ssp_data *data, unsigned int delay) +{ + cancel_delayed_work_sync(&data->work_refresh); + + return queue_delayed_work(system_power_efficient_wq, + &data->work_refresh, + msecs_to_jiffies(delay)); +} + +#ifdef CONFIG_OF +static struct of_device_id ssp_of_match[] = { + { + .compatible = "samsung,sensorhub-rinato", + .data = &ssp_rinato_info, + }, { + .compatible = "samsung,sensorhub-thermostat", + .data = &ssp_thermostat_info, + }, + {}, +}; +MODULE_DEVICE_TABLE(of, ssp_of_match); + +static struct ssp_data *ssp_parse_dt(struct device *dev) +{ + int ret; + struct ssp_data *data; + struct device_node *node = dev->of_node; + const struct of_device_id *match; + + data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL); + if (!data) + return NULL; + + data->mcu_ap_gpio = of_get_named_gpio(node, "mcu-ap-gpios", 0); + if (data->mcu_ap_gpio < 0) + goto err_free_pd; + + data->ap_mcu_gpio = of_get_named_gpio(node, "ap-mcu-gpios", 0); + if (data->ap_mcu_gpio < 0) + goto err_free_pd; + + data->mcu_reset_gpio = of_get_named_gpio(node, "mcu-reset-gpios", 0); + if (data->mcu_reset_gpio < 0) + goto err_free_pd; + + ret = devm_gpio_request_one(dev, data->ap_mcu_gpio, GPIOF_OUT_INIT_HIGH, + "ap-mcu-gpios"); + if (ret) + goto err_free_pd; + + ret = devm_gpio_request_one(dev, data->mcu_reset_gpio, + GPIOF_OUT_INIT_HIGH, "mcu-reset-gpios"); + if (ret) + goto err_ap_mcu; + + match = of_match_node(ssp_of_match, node); + if (!match) + goto err_mcu_reset_gpio; + + data->sensorhub_info = (struct ssp_sensorhub_info *)match->data; + + dev_set_drvdata(dev, data); + + return data; + +err_mcu_reset_gpio: + devm_gpio_free(dev, data->mcu_reset_gpio); +err_ap_mcu: + devm_gpio_free(dev, data->ap_mcu_gpio); +err_free_pd: + devm_kfree(dev, data); + return NULL; +} +#else +static struct ssp_data *ssp_parse_dt(struct device *pdev) +{ + return NULL; +} +#endif + +/** + * ssp_register_consumer() - registers iio consumer in ssp framework + * + * @indio_dev: consumer iio device + * @type: ssp sensor type + */ +void ssp_register_consumer(struct iio_dev *indio_dev, enum ssp_sensor_type type) +{ + struct ssp_data *data = dev_get_drvdata(indio_dev->dev.parent->parent); + + data->sensor_devs[type] = indio_dev; +} +EXPORT_SYMBOL(ssp_register_consumer); + +static int ssp_probe(struct spi_device *spi) +{ + int ret, i; + struct ssp_data *data; + + data = ssp_parse_dt(&spi->dev); + if (!data) { + dev_err(&spi->dev, "Failed to find platform data\n"); + return -ENODEV; + } + + ret = mfd_add_devices(&spi->dev, -1, sensorhub_sensor_devs, + ARRAY_SIZE(sensorhub_sensor_devs), NULL, 0, NULL); + if (ret < 0) { + dev_err(&spi->dev, "mfd add devices fail\n"); + return ret; + } + + spi->mode = SPI_MODE_1; + ret = spi_setup(spi); + if (ret < 0) { + dev_err(&spi->dev, "Failed to setup spi\n"); + return ret; + } + + data->fw_dl_state = SSP_FW_DL_STATE_NONE; + data->spi = spi; + spi_set_drvdata(spi, data); + + mutex_init(&data->comm_lock); + + for (i = 0; i < 
SSP_SENSOR_MAX; ++i) { + data->delay_buf[i] = SSP_DEFAULT_POLLING_DELAY; + data->batch_latency_buf[i] = 0; + data->batch_opt_buf[i] = 0; + data->check_status[i] = SSP_INITIALIZATION_STATE; + } + + data->delay_buf[SSP_BIO_HRM_LIB] = 100; + + data->time_syncing = true; + + mutex_init(&data->pending_lock); + INIT_LIST_HEAD(&data->pending_list); + + atomic_set(&data->enable_refcount, 0); + + INIT_WORK(&data->work_wdt, ssp_wdt_work_func); + INIT_DELAYED_WORK(&data->work_refresh, ssp_refresh_task); + + setup_timer(&data->wdt_timer, ssp_wdt_timer_func, (unsigned long)data); + + ret = request_threaded_irq(data->spi->irq, NULL, + ssp_irq_thread_fn, + IRQF_TRIGGER_FALLING | IRQF_ONESHOT, + "SSP_Int", data); + if (ret < 0) { + dev_err(&spi->dev, "Irq request fail\n"); + goto err_setup_irq; + } + + /* Let's start with enabled one so irq balance could be ok */ + data->shut_down = false; + + /* just to avoid unbalanced irq set wake up */ + enable_irq_wake(data->spi->irq); + + data->fw_dl_state = ssp_check_fwbl(data); + if (data->fw_dl_state == SSP_FW_DL_STATE_NONE) { + ret = ssp_initialize_mcu(data); + if (ret < 0) { + dev_err(&spi->dev, "Initialize_mcu failed\n"); + goto err_read_reg; + } + } else { + dev_err(&spi->dev, "Firmware version not supported\n"); + ret = -EPERM; + goto err_read_reg; + } + + return 0; + +err_read_reg: + free_irq(data->spi->irq, data); +err_setup_irq: + mutex_destroy(&data->pending_lock); + mutex_destroy(&data->comm_lock); + + dev_err(&spi->dev, "Probe failed!\n"); + + return ret; +} + +static int ssp_remove(struct spi_device *spi) +{ + struct ssp_data *data = spi_get_drvdata(spi); + + if (ssp_command(data, SSP_MSG2SSP_AP_STATUS_SHUTDOWN, 0) < 0) + dev_err(&data->spi->dev, + "SSP_MSG2SSP_AP_STATUS_SHUTDOWN failed\n"); + + ssp_enable_mcu(data, false); + ssp_disable_wdt_timer(data); + + ssp_clean_pending_list(data); + + free_irq(data->spi->irq, data); + + del_timer_sync(&data->wdt_timer); + cancel_work_sync(&data->work_wdt); + + mutex_destroy(&data->comm_lock); + mutex_destroy(&data->pending_lock); + + mfd_remove_devices(&spi->dev); + + return 0; +} + +static int ssp_suspend(struct device *dev) +{ + int ret; + struct ssp_data *data = spi_get_drvdata(to_spi_device(dev)); + + data->last_resume_state = SSP_MSG2SSP_AP_STATUS_SUSPEND; + + if (atomic_read(&data->enable_refcount) > 0) + ssp_disable_wdt_timer(data); + + ret = ssp_command(data, SSP_MSG2SSP_AP_STATUS_SUSPEND, 0); + if (ret < 0) { + dev_err(&data->spi->dev, + "%s SSP_MSG2SSP_AP_STATUS_SUSPEND failed\n", __func__); + + ssp_enable_wdt_timer(data); + return ret; + } + + data->time_syncing = false; + disable_irq(data->spi->irq); + + return 0; +} + +static int ssp_resume(struct device *dev) +{ + int ret; + struct ssp_data *data = spi_get_drvdata(to_spi_device(dev)); + + enable_irq(data->spi->irq); + + if (atomic_read(&data->enable_refcount) > 0) + ssp_enable_wdt_timer(data); + + ret = ssp_command(data, SSP_MSG2SSP_AP_STATUS_RESUME, 0); + if (ret < 0) { + dev_err(&data->spi->dev, + "%s SSP_MSG2SSP_AP_STATUS_RESUME failed\n", __func__); + ssp_disable_wdt_timer(data); + return ret; + } + + /* timesyncing is set by MCU */ + data->last_resume_state = SSP_MSG2SSP_AP_STATUS_RESUME; + + return 0; +} + +static const struct dev_pm_ops ssp_pm_ops = { + SET_SYSTEM_SLEEP_PM_OPS(ssp_suspend, ssp_resume) +}; + +static struct spi_driver ssp_driver = { + .probe = ssp_probe, + .remove = ssp_remove, + .driver = { + .pm = &ssp_pm_ops, + .bus = &spi_bus_type, + .owner = THIS_MODULE, + .of_match_table = of_match_ptr(ssp_of_match), + .name = 
"sensorhub" + }, +}; + +module_spi_driver(ssp_driver); + +MODULE_DESCRIPTION("ssp sensorhub driver"); +MODULE_AUTHOR("Samsung Electronics"); +MODULE_LICENSE("GPL"); diff --git a/drivers/iio/common/ssp_sensors/ssp_spi.c b/drivers/iio/common/ssp_sensors/ssp_spi.c new file mode 100644 index 000000000000..704284a475ae --- /dev/null +++ b/drivers/iio/common/ssp_sensors/ssp_spi.c @@ -0,0 +1,608 @@ +/* + * Copyright (C) 2014, Samsung Electronics Co. Ltd. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include "ssp.h" + +#define SSP_DEV (&data->spi->dev) +#define SSP_GET_MESSAGE_TYPE(data) (data & (3 << SSP_RW)) + +/* + * SSP -> AP Instruction + * They tell what packet type can be expected. In the future there will + * be less of them. BYPASS means common sensor packets with accel, gyro, + * hrm etc. data. LIBRARY and META are mock-up's for now. + */ +#define SSP_MSG2AP_INST_BYPASS_DATA 0x37 +#define SSP_MSG2AP_INST_LIBRARY_DATA 0x01 +#define SSP_MSG2AP_INST_DEBUG_DATA 0x03 +#define SSP_MSG2AP_INST_BIG_DATA 0x04 +#define SSP_MSG2AP_INST_META_DATA 0x05 +#define SSP_MSG2AP_INST_TIME_SYNC 0x06 +#define SSP_MSG2AP_INST_RESET 0x07 + +#define SSP_UNIMPLEMENTED -1 + +struct ssp_msg_header { + u8 cmd; + __le16 length; + __le16 options; + __le32 data; +} __attribute__((__packed__)); + +struct ssp_msg { + u16 length; + u16 options; + struct list_head list; + struct completion *done; + struct ssp_msg_header *h; + char *buffer; +}; + +static const int ssp_offset_map[SSP_SENSOR_MAX] = { + [SSP_ACCELEROMETER_SENSOR] = SSP_ACCELEROMETER_SIZE + + SSP_TIME_SIZE, + [SSP_GYROSCOPE_SENSOR] = SSP_GYROSCOPE_SIZE + + SSP_TIME_SIZE, + [SSP_GEOMAGNETIC_UNCALIB_SENSOR] = SSP_UNIMPLEMENTED, + [SSP_GEOMAGNETIC_RAW] = SSP_UNIMPLEMENTED, + [SSP_GEOMAGNETIC_SENSOR] = SSP_UNIMPLEMENTED, + [SSP_PRESSURE_SENSOR] = SSP_UNIMPLEMENTED, + [SSP_GESTURE_SENSOR] = SSP_UNIMPLEMENTED, + [SSP_PROXIMITY_SENSOR] = SSP_UNIMPLEMENTED, + [SSP_TEMPERATURE_HUMIDITY_SENSOR] = SSP_UNIMPLEMENTED, + [SSP_LIGHT_SENSOR] = SSP_UNIMPLEMENTED, + [SSP_PROXIMITY_RAW] = SSP_UNIMPLEMENTED, + [SSP_ORIENTATION_SENSOR] = SSP_UNIMPLEMENTED, + [SSP_STEP_DETECTOR] = SSP_UNIMPLEMENTED, + [SSP_SIG_MOTION_SENSOR] = SSP_UNIMPLEMENTED, + [SSP_GYRO_UNCALIB_SENSOR] = SSP_UNIMPLEMENTED, + [SSP_GAME_ROTATION_VECTOR] = SSP_UNIMPLEMENTED, + [SSP_ROTATION_VECTOR] = SSP_UNIMPLEMENTED, + [SSP_STEP_COUNTER] = SSP_UNIMPLEMENTED, + [SSP_BIO_HRM_RAW] = SSP_BIO_HRM_RAW_SIZE + + SSP_TIME_SIZE, + [SSP_BIO_HRM_RAW_FAC] = SSP_BIO_HRM_RAW_FAC_SIZE + + SSP_TIME_SIZE, + [SSP_BIO_HRM_LIB] = SSP_BIO_HRM_LIB_SIZE + + SSP_TIME_SIZE, +}; + +#define SSP_HEADER_SIZE (sizeof(struct ssp_msg_header)) +#define SSP_HEADER_SIZE_ALIGNED (ALIGN(SSP_HEADER_SIZE, 4)) + +static struct ssp_msg *ssp_create_msg(u8 cmd, u16 len, u16 opt, u32 data) +{ + struct ssp_msg_header h; + struct ssp_msg *msg; + + msg = kzalloc(sizeof(*msg), GFP_KERNEL); + if (!msg) + return NULL; + + h.cmd = cmd; + h.length = cpu_to_le16(len); + h.options = cpu_to_le16(opt); + h.data = cpu_to_le32(data); + + msg->buffer = 
kzalloc(SSP_HEADER_SIZE_ALIGNED + len, + GFP_KERNEL | GFP_DMA); + if (!msg->buffer) { + kfree(msg); + return NULL; + } + + msg->length = len; + msg->options = opt; + + memcpy(msg->buffer, &h, SSP_HEADER_SIZE); + + return msg; +} + +/* + * It is a bit heavy to do it this way but often the function is used to compose + * the message from smaller chunks which are placed on the stack. Often the + * chunks are small so memcpy should be optimized. + */ +static inline void ssp_fill_buffer(struct ssp_msg *m, unsigned int offset, + const void *src, unsigned int len) +{ + memcpy(&m->buffer[SSP_HEADER_SIZE_ALIGNED + offset], src, len); +} + +static inline void ssp_get_buffer(struct ssp_msg *m, unsigned int offset, + void *dest, unsigned int len) +{ + memcpy(dest, &m->buffer[SSP_HEADER_SIZE_ALIGNED + offset], len); +} + +#define SSP_GET_BUFFER_AT_INDEX(m, index) \ + (m->buffer[SSP_HEADER_SIZE_ALIGNED + index]) +#define SSP_SET_BUFFER_AT_INDEX(m, index, val) \ + (m->buffer[SSP_HEADER_SIZE_ALIGNED + index] = val) + +static void ssp_clean_msg(struct ssp_msg *m) +{ + kfree(m->buffer); + kfree(m); +} + +static int ssp_print_mcu_debug(char *data_frame, int *data_index, + int received_len) +{ + int length = data_frame[(*data_index)++]; + + if (length > received_len - *data_index || length <= 0) { + ssp_dbg("[SSP]: MSG From MCU-invalid debug length(%d/%d)\n", + length, received_len); + return length ? length : -EPROTO; + } + + ssp_dbg("[SSP]: MSG From MCU - %s\n", &data_frame[*data_index]); + + *data_index += length; + + return 0; +} + +/* + * It was designed that way - additional lines to some kind of handshake, + * please do not ask why - only the firmware guy can know it. + */ +static int ssp_check_lines(struct ssp_data *data, bool state) +{ + int delay_cnt = 0; + + gpio_set_value_cansleep(data->ap_mcu_gpio, state); + + while (gpio_get_value_cansleep(data->mcu_ap_gpio) != state) { + usleep_range(3000, 3500); + + if (data->shut_down || delay_cnt++ > 500) { + dev_err(SSP_DEV, "%s:timeout, hw ack wait fail %d\n", + __func__, state); + + if (!state) + gpio_set_value_cansleep(data->ap_mcu_gpio, 1); + + return -ETIMEDOUT; + } + } + + return 0; +} + +static int ssp_do_transfer(struct ssp_data *data, struct ssp_msg *msg, + struct completion *done, int timeout) +{ + int status; + /* + * check if this is a short one way message or the whole transfer has + * second part after an interrupt + */ + const bool use_no_irq = msg->length == 0; + + if (data->shut_down) + return -EPERM; + + msg->done = done; + + mutex_lock(&data->comm_lock); + + status = ssp_check_lines(data, false); + if (status < 0) + goto _error_locked; + + status = spi_write(data->spi, msg->buffer, SSP_HEADER_SIZE); + if (status < 0) { + gpio_set_value_cansleep(data->ap_mcu_gpio, 1); + dev_err(SSP_DEV, "%s spi_write fail\n", __func__); + goto _error_locked; + } + + if (!use_no_irq) { + mutex_lock(&data->pending_lock); + list_add_tail(&msg->list, &data->pending_list); + mutex_unlock(&data->pending_lock); + } + + status = ssp_check_lines(data, true); + if (status < 0) { + if (!use_no_irq) { + mutex_lock(&data->pending_lock); + list_del(&msg->list); + mutex_unlock(&data->pending_lock); + } + goto _error_locked; + } + + mutex_unlock(&data->comm_lock); + + if (!use_no_irq && done) + if (wait_for_completion_timeout(done, + msecs_to_jiffies(timeout)) == + 0) { + mutex_lock(&data->pending_lock); + list_del(&msg->list); + mutex_unlock(&data->pending_lock); + + data->timeout_cnt++; + return -ETIMEDOUT; + } + + return 0; + +_error_locked:
mutex_unlock(&data->comm_lock); + data->timeout_cnt++; + return status; +} + +static inline int ssp_spi_sync_command(struct ssp_data *data, + struct ssp_msg *msg) +{ + return ssp_do_transfer(data, msg, NULL, 0); +} + +static int ssp_spi_sync(struct ssp_data *data, struct ssp_msg *msg, + int timeout) +{ + DECLARE_COMPLETION_ONSTACK(done); + + if (WARN_ON(!msg->length)) + return -EPERM; + + return ssp_do_transfer(data, msg, &done, timeout); +} + +static int ssp_handle_big_data(struct ssp_data *data, char *dataframe, int *idx) +{ + /* mock-up, it will be changed with adding another sensor types */ + *idx += 8; + return 0; +} + +static int ssp_parse_dataframe(struct ssp_data *data, char *dataframe, int len) +{ + int idx, sd; + struct timespec ts; + struct ssp_sensor_data *spd; + struct iio_dev **indio_devs = data->sensor_devs; + + getnstimeofday(&ts); + + for (idx = 0; idx < len;) { + switch (dataframe[idx++]) { + case SSP_MSG2AP_INST_BYPASS_DATA: + sd = dataframe[idx++]; + if (sd < 0 || sd >= SSP_SENSOR_MAX) { + dev_err(SSP_DEV, + "Mcu data frame1 error %d\n", sd); + return -EPROTO; + } + + if (indio_devs[sd]) { + spd = iio_priv(indio_devs[sd]); + if (spd->process_data) + spd->process_data(indio_devs[sd], + &dataframe[idx], + data->timestamp); + } else { + dev_err(SSP_DEV, "no client for frame\n"); + } + + idx += ssp_offset_map[sd]; + break; + case SSP_MSG2AP_INST_DEBUG_DATA: + sd = ssp_print_mcu_debug(dataframe, &idx, len); + if (sd) { + dev_err(SSP_DEV, + "Mcu data frame3 error %d\n", sd); + return sd; + } + break; + case SSP_MSG2AP_INST_LIBRARY_DATA: + idx += len; + break; + case SSP_MSG2AP_INST_BIG_DATA: + ssp_handle_big_data(data, dataframe, &idx); + break; + case SSP_MSG2AP_INST_TIME_SYNC: + data->time_syncing = true; + break; + case SSP_MSG2AP_INST_RESET: + ssp_queue_ssp_refresh_task(data, 0); + break; + } + } + + if (data->time_syncing) + data->timestamp = ts.tv_sec * 1000000000ULL + ts.tv_nsec; + + return 0; +} + +/* threaded irq */ +int ssp_irq_msg(struct ssp_data *data) +{ + bool found = false; + char *buffer; + u8 msg_type; + int ret; + u16 length, msg_options; + struct ssp_msg *msg, *n; + + ret = spi_read(data->spi, data->header_buffer, SSP_HEADER_BUFFER_SIZE); + if (ret < 0) { + dev_err(SSP_DEV, "header read fail\n"); + return ret; + } + + length = le16_to_cpu(data->header_buffer[1]); + msg_options = le16_to_cpu(data->header_buffer[0]); + + if (length == 0) { + dev_err(SSP_DEV, "length received from mcu is 0\n"); + return -EINVAL; + } + + msg_type = SSP_GET_MESSAGE_TYPE(msg_options); + + switch (msg_type) { + case SSP_AP2HUB_READ: + case SSP_AP2HUB_WRITE: + /* + * this is a small list, a few elements - the packets can be + * received with no order + */ + mutex_lock(&data->pending_lock); + list_for_each_entry_safe(msg, n, &data->pending_list, list) { + if (msg->options == msg_options) { + list_del(&msg->list); + found = true; + break; + } + } + + if (!found) { + /* + * here can be implemented dead messages handling + * but the slave should not send such ones - it is to + * check but let's handle this + */ + buffer = kmalloc(length, GFP_KERNEL | GFP_DMA); + if (!buffer) { + ret = -ENOMEM; + goto _unlock; + } + + /* got dead packet so it is always an error */ + ret = spi_read(data->spi, buffer, length); + if (ret >= 0) + ret = -EPROTO; + + kfree(buffer); + + dev_err(SSP_DEV, "No match error %x\n", + msg_options); + + goto _unlock; + } + + if (msg_type == SSP_AP2HUB_READ) + ret = spi_read(data->spi, + &msg->buffer[SSP_HEADER_SIZE_ALIGNED], + msg->length); + + if (msg_type == 
SSP_AP2HUB_WRITE) { + ret = spi_write(data->spi, + &msg->buffer[SSP_HEADER_SIZE_ALIGNED], + msg->length); + if (msg_options & SSP_AP2HUB_RETURN) { + msg->options = + SSP_AP2HUB_READ | SSP_AP2HUB_RETURN; + msg->length = 1; + + list_add_tail(&msg->list, &data->pending_list); + goto _unlock; + } + } + + if (msg->done) + if (!completion_done(msg->done)) + complete(msg->done); +_unlock: + mutex_unlock(&data->pending_lock); + break; + case SSP_HUB2AP_WRITE: + buffer = kzalloc(length, GFP_KERNEL | GFP_DMA); + if (!buffer) + return -ENOMEM; + + ret = spi_read(data->spi, buffer, length); + if (ret < 0) { + dev_err(SSP_DEV, "spi read fail\n"); + kfree(buffer); + break; + } + + ret = ssp_parse_dataframe(data, buffer, length); + + kfree(buffer); + break; + + default: + dev_err(SSP_DEV, "unknown msg type\n"); + return -EPROTO; + } + + return ret; +} + +void ssp_clean_pending_list(struct ssp_data *data) +{ + struct ssp_msg *msg, *n; + + mutex_lock(&data->pending_lock); + list_for_each_entry_safe(msg, n, &data->pending_list, list) { + list_del(&msg->list); + + if (msg->done) + if (!completion_done(msg->done)) + complete(msg->done); + } + mutex_unlock(&data->pending_lock); +} + +int ssp_command(struct ssp_data *data, char command, int arg) +{ + int ret; + struct ssp_msg *msg; + + msg = ssp_create_msg(command, 0, SSP_AP2HUB_WRITE, arg); + if (!msg) + return -ENOMEM; + + ssp_dbg("%s - command 0x%x %d\n", __func__, command, arg); + + ret = ssp_spi_sync_command(data, msg); + ssp_clean_msg(msg); + + return ret; +} + +int ssp_send_instruction(struct ssp_data *data, u8 inst, u8 sensor_type, + u8 *send_buf, u8 length) +{ + int ret; + struct ssp_msg *msg; + + if (data->fw_dl_state == SSP_FW_DL_STATE_DOWNLOADING) { + dev_err(SSP_DEV, "%s - Skip Inst! DL state = %d\n", + __func__, data->fw_dl_state); + return -EBUSY; + } else if (!(data->available_sensors & BIT(sensor_type)) && + (inst <= SSP_MSG2SSP_INST_CHANGE_DELAY)) { + dev_err(SSP_DEV, "%s - Bypass Inst Skip! - %u\n", + __func__, sensor_type); + return -EIO; /* just fail */ + } + + msg = ssp_create_msg(inst, length + 2, SSP_AP2HUB_WRITE, 0); + if (!msg) + return -ENOMEM; + + ssp_fill_buffer(msg, 0, &sensor_type, 1); + ssp_fill_buffer(msg, 1, send_buf, length); + + ssp_dbg("%s - Inst = 0x%x, Sensor Type = 0x%x, data = %u\n", + __func__, inst, sensor_type, send_buf[1]); + + ret = ssp_spi_sync(data, msg, 1000); + ssp_clean_msg(msg); + + return ret; +} + +int ssp_get_chipid(struct ssp_data *data) +{ + int ret; + char buffer; + struct ssp_msg *msg; + + msg = ssp_create_msg(SSP_MSG2SSP_AP_WHOAMI, 1, SSP_AP2HUB_READ, 0); + if (!msg) + return -ENOMEM; + + ret = ssp_spi_sync(data, msg, 1000); + + buffer = SSP_GET_BUFFER_AT_INDEX(msg, 0); + + ssp_clean_msg(msg); + + return ret < 0 ? 
ret : buffer; +} + +int ssp_set_magnetic_matrix(struct ssp_data *data) +{ + int ret; + struct ssp_msg *msg; + + msg = ssp_create_msg(SSP_MSG2SSP_AP_SET_MAGNETIC_STATIC_MATRIX, + data->sensorhub_info->mag_length, SSP_AP2HUB_WRITE, + 0); + if (!msg) + return -ENOMEM; + + ssp_fill_buffer(msg, 0, data->sensorhub_info->mag_table, + data->sensorhub_info->mag_length); + + ret = ssp_spi_sync(data, msg, 1000); + ssp_clean_msg(msg); + + return ret; +} + +unsigned int ssp_get_sensor_scanning_info(struct ssp_data *data) +{ + int ret; + __le32 result; + u32 cpu_result = 0; + + struct ssp_msg *msg = ssp_create_msg(SSP_MSG2SSP_AP_SENSOR_SCANNING, 4, + SSP_AP2HUB_READ, 0); + if (!msg) + return 0; + + ret = ssp_spi_sync(data, msg, 1000); + if (ret < 0) { + dev_err(SSP_DEV, "%s - spi read fail %d\n", __func__, ret); + goto _exit; + } + + ssp_get_buffer(msg, 0, &result, 4); + cpu_result = le32_to_cpu(result); + + dev_info(SSP_DEV, "%s state: 0x%08x\n", __func__, cpu_result); + +_exit: + ssp_clean_msg(msg); + return cpu_result; +} + +unsigned int ssp_get_firmware_rev(struct ssp_data *data) +{ + int ret; + __le32 result; + + struct ssp_msg *msg = ssp_create_msg(SSP_MSG2SSP_AP_FIRMWARE_REV, 4, + SSP_AP2HUB_READ, 0); + if (!msg) + return SSP_INVALID_REVISION; + + ret = ssp_spi_sync(data, msg, 1000); + if (ret < 0) { + dev_err(SSP_DEV, "%s - transfer fail %d\n", __func__, ret); + ret = SSP_INVALID_REVISION; + goto _exit; + } + + ssp_get_buffer(msg, 0, &result, 4); + ret = le32_to_cpu(result); + +_exit: + ssp_clean_msg(msg); + return ret; +} diff --git a/include/linux/iio/common/ssp_sensors.h b/include/linux/iio/common/ssp_sensors.h new file mode 100644 index 000000000000..f4d1b0edb432 --- /dev/null +++ b/include/linux/iio/common/ssp_sensors.h @@ -0,0 +1,82 @@ +/* + * Copyright (C) 2014, Samsung Electronics Co. Ltd. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ +#ifndef _SSP_SENSORS_H_ +#define _SSP_SENSORS_H_ + +#include + +#define SSP_TIME_SIZE 4 +#define SSP_ACCELEROMETER_SIZE 6 +#define SSP_GYROSCOPE_SIZE 6 +#define SSP_BIO_HRM_RAW_SIZE 8 +#define SSP_BIO_HRM_RAW_FAC_SIZE 36 +#define SSP_BIO_HRM_LIB_SIZE 8 + +/** + * enum ssp_sensor_type - SSP sensor type + */ +enum ssp_sensor_type { + SSP_ACCELEROMETER_SENSOR = 0, + SSP_GYROSCOPE_SENSOR, + SSP_GEOMAGNETIC_UNCALIB_SENSOR, + SSP_GEOMAGNETIC_RAW, + SSP_GEOMAGNETIC_SENSOR, + SSP_PRESSURE_SENSOR, + SSP_GESTURE_SENSOR, + SSP_PROXIMITY_SENSOR, + SSP_TEMPERATURE_HUMIDITY_SENSOR, + SSP_LIGHT_SENSOR, + SSP_PROXIMITY_RAW, + SSP_ORIENTATION_SENSOR, + SSP_STEP_DETECTOR, + SSP_SIG_MOTION_SENSOR, + SSP_GYRO_UNCALIB_SENSOR, + SSP_GAME_ROTATION_VECTOR, + SSP_ROTATION_VECTOR, + SSP_STEP_COUNTER, + SSP_BIO_HRM_RAW, + SSP_BIO_HRM_RAW_FAC, + SSP_BIO_HRM_LIB, + SSP_SENSOR_MAX, +}; + +struct ssp_data; + +/** + * struct ssp_sensor_data - Sensor object + * @process_data: Callback to feed sensor data. + * @type: Used sensor type. + * @buffer: Received data buffer. 
+ */ +struct ssp_sensor_data { + int (*process_data)(struct iio_dev *indio_dev, void *buf, + int64_t timestamp); + enum ssp_sensor_type type; + u8 *buffer; +}; + +void ssp_register_consumer(struct iio_dev *indio_dev, + enum ssp_sensor_type type); + +int ssp_enable_sensor(struct ssp_data *data, enum ssp_sensor_type type, + u32 delay); + +int ssp_disable_sensor(struct ssp_data *data, enum ssp_sensor_type type); + +u32 ssp_get_sensor_delay(struct ssp_data *data, enum ssp_sensor_type); + +int ssp_change_delay(struct ssp_data *data, enum ssp_sensor_type type, + u32 delay); +#endif /* _SSP_SENSORS_H_ */ -- cgit v1.2.3 From 2f0ecb7c6563d711bec15268d56adf1c630e77d1 Mon Sep 17 00:00:00 2001 From: Irina Tirdea Date: Tue, 27 Jan 2015 20:41:52 +0200 Subject: iio: core: Introduce IIO_CHAN_INFO_DEBOUNCE_COUNT and _TIME The pedometer needs to filter out false steps that might be generated by tapping the foot, sitting, etc. To do that it computes the number of steps that occur in a given time and decides the user is moving only if this value is over a threshold. E.g.: the user starts moving only if he takes 4 steps in 3 seconds. This filter is applied only when the user starts moving. A device that has such pedometer functionality is Freescale's MMA9553L: http://www.freescale.com/files/sensors/doc/ref_manual/MMA9553LSWRM.pdf. To export this feature, this patch introduces IIO_CHAN_INFO_DEBOUNCE_COUNT and IIO_CHAN_INFO_DEBOUNCE_TIME. For the pedometer, in_steps_debounce_count will specify the number of steps that need to occur in in_steps_debounce_time seconds so that the pedometer decides the user is moving. Signed-off-by: Irina Tirdea Signed-off-by: Jonathan Cameron --- Documentation/ABI/testing/sysfs-bus-iio | 15 +++++++++++++++ drivers/iio/industrialio-core.c | 2 ++ include/linux/iio/iio.h | 2 ++ 3 files changed, 19 insertions(+) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio index c03a1401b9ca..b4ea9c521f69 100644 --- a/Documentation/ABI/testing/sysfs-bus-iio +++ b/Documentation/ABI/testing/sysfs-bus-iio @@ -1193,3 +1193,18 @@ Description: This attribute is used to read the current speed value of the user (which is the norm or magnitude of the velocity vector). Units after application of scale are m/s. + +What: /sys/.../iio:deviceX/in_steps_debounce_count +KernelVersion: 3.20 +Contact: linux-iio@vger.kernel.org +Description: + Specifies the number of steps that must occur within + in_steps_debounce_time for the pedometer to decide the + consumer is making steps. + +What: /sys/.../iio:deviceX/in_steps_debounce_time +KernelVersion: 3.20 +Contact: linux-iio@vger.kernel.org +Description: + Specifies the number of seconds in which we compute the steps + that occur in order to decide if the consumer is making steps.
diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index 4ee6fdfa92fe..aaba9d3d980e 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -126,6 +126,8 @@ static const char * const iio_chan_info_postfix[] = { [IIO_CHAN_INFO_ENABLE] = "en", [IIO_CHAN_INFO_CALIBHEIGHT] = "calibheight", [IIO_CHAN_INFO_CALIBWEIGHT] = "calibweight", + [IIO_CHAN_INFO_DEBOUNCE_COUNT] = "debounce_count", + [IIO_CHAN_INFO_DEBOUNCE_TIME] = "debounce_time", }; /** diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index 51f16437dacc..80d855061064 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -41,6 +41,8 @@ enum iio_chan_info_enum { IIO_CHAN_INFO_ENABLE, IIO_CHAN_INFO_CALIBHEIGHT, IIO_CHAN_INFO_CALIBWEIGHT, + IIO_CHAN_INFO_DEBOUNCE_COUNT, + IIO_CHAN_INFO_DEBOUNCE_TIME, }; enum iio_shared_by { -- cgit v1.2.3 From 33692f27597fcab536d7cbbcc8f52905133e4aa7 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 29 Jan 2015 10:51:32 -0800 Subject: vm: add VM_FAULT_SIGSEGV handling support The core VM already knows about VM_FAULT_SIGBUS, but cannot return a "you should SIGSEGV" error, because the SIGSEGV case was generally handled by the caller - usually the architecture fault handler. That results in lots of duplication - all the architecture fault handlers end up doing very similar "look up vma, check permissions, do retries etc" - but it generally works. However, there are cases where the VM actually wants to SIGSEGV, and applications _expect_ SIGSEGV. In particular, when accessing the stack guard page, libsigsegv expects a SIGSEGV. And it usually got one, because the stack growth is handled by that duplicated architecture fault handler. However, when the generic VM layer started propagating the error return from the stack expansion in commit fee7e49d4514 ("mm: propagate error from stack expansion even for guard page"), that now exposed the existing VM_FAULT_SIGBUS result to user space. And user space really expected SIGSEGV, not SIGBUS. To fix that case, we need to add a VM_FAULT_SIGSEGV, and teach all those duplicate architecture fault handlers about it. They all already have the code to handle SIGSEGV, so it's about just tying that new return value to the existing code, but it's all a bit annoying. This is the mindless minimal patch to do this. A more extensive patch would be to try to gather up the mostly shared fault handling logic into one generic helper routine, and long-term we really should do that cleanup. Just from this patch, you can generally see that most architectures just copied (directly or indirectly) the old x86 way of doing things, but in the meantime that original x86 model has been improved to hold the VM semaphore for shorter times etc and to handle VM_FAULT_RETRY and other "newer" things, so it would be a good idea to bring all those improvements to the generic case and teach other architectures about them too. 
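The user-visible contract described above can be illustrated with a small userspace sketch (not part of the patch): install handlers for both SIGSEGV and SIGBUS, fault on purpose, and report which signal the kernel delivered. The sketch faults on a PROT_NONE anonymous mapping rather than on the stack guard page, but it shows the SIGSEGV-vs-SIGBUS distinction that applications such as libsigsegv rely on.

#include <setjmp.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>

static sigjmp_buf env;
static volatile sig_atomic_t last_sig;

static void on_fault(int sig)
{
	last_sig = sig;
	siglongjmp(env, 1);	/* skip past the faulting access */
}

int main(void)
{
	struct sigaction sa = { .sa_handler = on_fault };
	char *p;

	sigemptyset(&sa.sa_mask);
	sigaction(SIGSEGV, &sa, NULL);
	sigaction(SIGBUS, &sa, NULL);

	/* A PROT_NONE page: any access to it must fault. */
	p = mmap(NULL, 4096, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return EXIT_FAILURE;
	}

	if (sigsetjmp(env, 1) == 0)
		*p = 0;	/* faults; the kernel picks the signal */

	printf("fault delivered as %s\n",
	       last_sig == SIGSEGV ? "SIGSEGV" : "SIGBUS");
	return 0;
}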
Reported-and-tested-by: Takashi Iwai Tested-by: Jan Engelhardt Acked-by: Heiko Carstens # "s390 still compiles and boots" Cc: linux-arch@vger.kernel.org Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- arch/alpha/mm/fault.c | 2 ++ arch/arc/mm/fault.c | 2 ++ arch/avr32/mm/fault.c | 2 ++ arch/cris/mm/fault.c | 2 ++ arch/frv/mm/fault.c | 2 ++ arch/ia64/mm/fault.c | 2 ++ arch/m32r/mm/fault.c | 2 ++ arch/m68k/mm/fault.c | 2 ++ arch/metag/mm/fault.c | 2 ++ arch/microblaze/mm/fault.c | 2 ++ arch/mips/mm/fault.c | 2 ++ arch/mn10300/mm/fault.c | 2 ++ arch/nios2/mm/fault.c | 2 ++ arch/openrisc/mm/fault.c | 2 ++ arch/parisc/mm/fault.c | 2 ++ arch/powerpc/mm/copro_fault.c | 2 +- arch/powerpc/mm/fault.c | 2 ++ arch/s390/mm/fault.c | 6 ++++++ arch/score/mm/fault.c | 2 ++ arch/sh/mm/fault.c | 2 ++ arch/sparc/mm/fault_32.c | 2 ++ arch/sparc/mm/fault_64.c | 2 ++ arch/tile/mm/fault.c | 2 ++ arch/um/kernel/trap.c | 2 ++ arch/x86/mm/fault.c | 2 ++ arch/xtensa/mm/fault.c | 2 ++ drivers/staging/lustre/lustre/llite/vvp_io.c | 2 +- include/linux/mm.h | 6 ++++-- mm/gup.c | 4 ++-- mm/ksm.c | 2 +- 30 files changed, 63 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index 98838a05ba6d..9d0ac091a52a 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c @@ -156,6 +156,8 @@ retry: if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; + else if (fault & VM_FAULT_SIGSEGV) + goto bad_area; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus; BUG(); diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c index 6f7e3a68803a..0f8df3b5b1b3 100644 --- a/arch/arc/mm/fault.c +++ b/arch/arc/mm/fault.c @@ -161,6 +161,8 @@ good_area: if (fault & VM_FAULT_OOM) goto out_of_memory; + else if (fault & VM_FAULT_SIGSEGV) + goto bad_area; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus; diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c index 0eca93327195..d223a8b57c1e 100644 --- a/arch/avr32/mm/fault.c +++ b/arch/avr32/mm/fault.c @@ -142,6 +142,8 @@ good_area: if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; + else if (fault & VM_FAULT_SIGSEGV) + goto bad_area; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus; BUG(); diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c index 1790f22e71a2..2686a7aa8ec8 100644 --- a/arch/cris/mm/fault.c +++ b/arch/cris/mm/fault.c @@ -176,6 +176,8 @@ retry: if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; + else if (fault & VM_FAULT_SIGSEGV) + goto bad_area; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus; BUG(); diff --git a/arch/frv/mm/fault.c b/arch/frv/mm/fault.c index 9a66372fc7c7..ec4917ddf678 100644 --- a/arch/frv/mm/fault.c +++ b/arch/frv/mm/fault.c @@ -168,6 +168,8 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; + else if (fault & VM_FAULT_SIGSEGV) + goto bad_area; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus; BUG(); diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 7225dad87094..ba5ba7accd0d 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -172,6 +172,8 @@ retry: */ if (fault & VM_FAULT_OOM) { goto out_of_memory; + } else if (fault & VM_FAULT_SIGSEGV) { + goto bad_area; } else if (fault & VM_FAULT_SIGBUS) { signal = SIGBUS; goto bad_area; diff --git a/arch/m32r/mm/fault.c b/arch/m32r/mm/fault.c index e9c6a8014bd6..e3d4d4890104 100644 ---
a/arch/m32r/mm/fault.c +++ b/arch/m32r/mm/fault.c @@ -200,6 +200,8 @@ good_area: if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; + else if (fault & VM_FAULT_SIGSEGV) + goto bad_area; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus; BUG(); diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c index 2bd7487440c4..b2f04aee46ec 100644 --- a/arch/m68k/mm/fault.c +++ b/arch/m68k/mm/fault.c @@ -145,6 +145,8 @@ good_area: if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; + else if (fault & VM_FAULT_SIGSEGV) + goto map_err; else if (fault & VM_FAULT_SIGBUS) goto bus_err; BUG(); diff --git a/arch/metag/mm/fault.c b/arch/metag/mm/fault.c index 332680e5ebf2..2de5dc695a87 100644 --- a/arch/metag/mm/fault.c +++ b/arch/metag/mm/fault.c @@ -141,6 +141,8 @@ good_area: if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; + else if (fault & VM_FAULT_SIGSEGV) + goto bad_area; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus; BUG(); diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c index fa4cf52aa7a6..d46a5ebb7570 100644 --- a/arch/microblaze/mm/fault.c +++ b/arch/microblaze/mm/fault.c @@ -224,6 +224,8 @@ good_area: if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; + else if (fault & VM_FAULT_SIGSEGV) + goto bad_area; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus; BUG(); diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index becc42bb1849..70ab5d664332 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -158,6 +158,8 @@ good_area: if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; + else if (fault & VM_FAULT_SIGSEGV) + goto bad_area; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus; BUG(); diff --git a/arch/mn10300/mm/fault.c b/arch/mn10300/mm/fault.c index 3516cbdf1ee9..0c2cc5d39c8e 100644 --- a/arch/mn10300/mm/fault.c +++ b/arch/mn10300/mm/fault.c @@ -262,6 +262,8 @@ good_area: if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; + else if (fault & VM_FAULT_SIGSEGV) + goto bad_area; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus; BUG(); diff --git a/arch/nios2/mm/fault.c b/arch/nios2/mm/fault.c index 15a0bb5fc06d..34429d5a0ccd 100644 --- a/arch/nios2/mm/fault.c +++ b/arch/nios2/mm/fault.c @@ -135,6 +135,8 @@ survive: if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; + else if (fault & VM_FAULT_SIGSEGV) + goto bad_area; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus; BUG(); diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c index 0703acf7d327..230ac20ae794 100644 --- a/arch/openrisc/mm/fault.c +++ b/arch/openrisc/mm/fault.c @@ -171,6 +171,8 @@ good_area: if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; + else if (fault & VM_FAULT_SIGSEGV) + goto bad_area; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus; BUG(); diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c index 3ca9c1131cfe..e5120e653240 100644 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c @@ -256,6 +256,8 @@ good_area: */ if (fault & VM_FAULT_OOM) goto out_of_memory; + else if (fault & VM_FAULT_SIGSEGV) + goto bad_area; else if (fault & VM_FAULT_SIGBUS) goto bad_area; BUG(); diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c index 5a236f082c78..1b5305d4bdab 100644 --- a/arch/powerpc/mm/copro_fault.c +++ b/arch/powerpc/mm/copro_fault.c @@ -76,7 +76,7 @@ int 
copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea, if (*flt & VM_FAULT_OOM) { ret = -ENOMEM; goto out_unlock; - } else if (*flt & VM_FAULT_SIGBUS) { + } else if (*flt & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) { ret = -EFAULT; goto out_unlock; } diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index eb79907f34fa..6154b0a2b063 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -437,6 +437,8 @@ good_area: */ fault = handle_mm_fault(mm, vma, address, flags); if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) { + if (fault & VM_FAULT_SIGSEGV) + goto bad_area; rc = mm_fault_error(regs, address, fault); if (rc >= MM_FAULT_RETURN) goto bail; diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 811937bb90be..9065d5aa3932 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -374,6 +374,12 @@ static noinline void do_fault_error(struct pt_regs *regs, int fault) do_no_context(regs); else pagefault_out_of_memory(); + } else if (fault & VM_FAULT_SIGSEGV) { + /* Kernel mode? Handle exceptions or die */ + if (!user_mode(regs)) + do_no_context(regs); + else + do_sigsegv(regs, SEGV_MAPERR); } else if (fault & VM_FAULT_SIGBUS) { /* Kernel mode? Handle exceptions or die */ if (!user_mode(regs)) diff --git a/arch/score/mm/fault.c b/arch/score/mm/fault.c index 52238983527d..6860beb2a280 100644 --- a/arch/score/mm/fault.c +++ b/arch/score/mm/fault.c @@ -114,6 +114,8 @@ good_area: if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; + else if (fault & VM_FAULT_SIGSEGV) + goto bad_area; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus; BUG(); diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index 541dc6101508..a58fec9b55e0 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -353,6 +353,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, } else { if (fault & VM_FAULT_SIGBUS) do_sigbus(regs, error_code, address); + else if (fault & VM_FAULT_SIGSEGV) + bad_area(regs, error_code, address); else BUG(); } diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index 908e8c17c902..70d817154fe8 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c @@ -249,6 +249,8 @@ good_area: if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; + else if (fault & VM_FAULT_SIGSEGV) + goto bad_area; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus; BUG(); diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index 18fcd7167095..479823249429 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -446,6 +446,8 @@ good_area: if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; + else if (fault & VM_FAULT_SIGSEGV) + goto bad_area; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus; BUG(); diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c index 565e25a98334..0f61a73534e6 100644 --- a/arch/tile/mm/fault.c +++ b/arch/tile/mm/fault.c @@ -442,6 +442,8 @@ good_area: if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; + else if (fault & VM_FAULT_SIGSEGV) + goto bad_area; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus; BUG(); diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index 5678c3571e7c..209617302df8 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -80,6 +80,8 @@ good_area: if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) { goto out_of_memory; + } else if (fault & VM_FAULT_SIGSEGV) { + goto out; } else if (fault & 
VM_FAULT_SIGBUS) { err = -EACCES; goto out; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 38dcec403b46..e3ff27a5b634 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -898,6 +898,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| VM_FAULT_HWPOISON_LARGE)) do_sigbus(regs, error_code, address, fault); + else if (fault & VM_FAULT_SIGSEGV) + bad_area_nosemaphore(regs, error_code, address); else BUG(); } diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c index b57c4f91f487..9e3571a6535c 100644 --- a/arch/xtensa/mm/fault.c +++ b/arch/xtensa/mm/fault.c @@ -117,6 +117,8 @@ good_area: if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; + else if (fault & VM_FAULT_SIGSEGV) + goto bad_area; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus; BUG(); diff --git a/drivers/staging/lustre/lustre/llite/vvp_io.c b/drivers/staging/lustre/lustre/llite/vvp_io.c index 930f6010203e..65d610abe06e 100644 --- a/drivers/staging/lustre/lustre/llite/vvp_io.c +++ b/drivers/staging/lustre/lustre/llite/vvp_io.c @@ -632,7 +632,7 @@ static int vvp_io_kernel_fault(struct vvp_fault_io *cfio) return 0; } - if (cfio->fault.ft_flags & VM_FAULT_SIGBUS) { + if (cfio->fault.ft_flags & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) { CDEBUG(D_PAGE, "got addr %p - SIGBUS\n", vmf->virtual_address); return -EFAULT; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 80fc92a49649..dd5ea3016fc4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1070,6 +1070,7 @@ static inline int page_mapped(struct page *page) #define VM_FAULT_WRITE 0x0008 /* Special case for get_user_pages */ #define VM_FAULT_HWPOISON 0x0010 /* Hit poisoned small page */ #define VM_FAULT_HWPOISON_LARGE 0x0020 /* Hit poisoned large page. Index encoded in upper bits */ +#define VM_FAULT_SIGSEGV 0x0040 #define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */ #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ @@ -1078,8 +1079,9 @@ static inline int page_mapped(struct page *page) #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */ -#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | \ - VM_FAULT_FALLBACK | VM_FAULT_HWPOISON_LARGE) +#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | \ + VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE | \ + VM_FAULT_FALLBACK) /* Encode hstate index for a hwpoisoned large page */ #define VM_FAULT_SET_HINDEX(x) ((x) << 12) diff --git a/mm/gup.c b/mm/gup.c index a900759cc807..8dd50ce6326f 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -296,7 +296,7 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, return -ENOMEM; if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) return *flags & FOLL_HWPOISON ? 
-EHWPOISON : -EFAULT; - if (ret & VM_FAULT_SIGBUS) + if (ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) return -EFAULT; BUG(); } @@ -571,7 +571,7 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, return -ENOMEM; if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) return -EHWPOISON; - if (ret & VM_FAULT_SIGBUS) + if (ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) return -EFAULT; BUG(); } diff --git a/mm/ksm.c b/mm/ksm.c index d247efab5073..15647fb0394f 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -376,7 +376,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) else ret = VM_FAULT_WRITE; put_page(page); - } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM))); + } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM))); /* * We must loop because handle_mm_fault() may back out if there's * any difficulty e.g. if pte accessed bit gets updated concurrently. -- cgit v1.2.3 From 7866a621043fbaca3d7389e9b9f69dd1a2e5a855 Mon Sep 17 00:00:00 2001 From: Salam Noureddine Date: Tue, 27 Jan 2015 11:35:48 -0800 Subject: dev: add per net_device packet type chains When many pf_packet listeners are created on a lot of interfaces the current implementation using global packet type lists scales poorly. This patch adds per net_device packet type lists to fix this problem. The patch was originally written by Eric Biederman for linux-2.6.29. Tested on linux-3.16. Signed-off-by: "Eric W. Biederman" Signed-off-by: Salam Noureddine Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 + net/core/dev.c | 132 +++++++++++++++++++++++++++++----------------- 2 files changed, 86 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 642d426a668f..3d37c6eb1732 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1514,6 +1514,8 @@ struct net_device { struct list_head napi_list; struct list_head unreg_list; struct list_head close_list; + struct list_head ptype_all; + struct list_head ptype_specific; struct { struct list_head upper; diff --git a/net/core/dev.c b/net/core/dev.c index 7f028d441e98..1d564d68e31a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -371,9 +371,10 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev) static inline struct list_head *ptype_head(const struct packet_type *pt) { if (pt->type == htons(ETH_P_ALL)) - return &ptype_all; + return pt->dev ? &pt->dev->ptype_all : &ptype_all; else - return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; + return pt->dev ? 
&pt->dev->ptype_specific : + &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; } /** @@ -1734,6 +1735,23 @@ static inline int deliver_skb(struct sk_buff *skb, return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } +static inline void deliver_ptype_list_skb(struct sk_buff *skb, + struct packet_type **pt, + struct net_device *dev, __be16 type, + struct list_head *ptype_list) +{ + struct packet_type *ptype, *pt_prev = *pt; + + list_for_each_entry_rcu(ptype, ptype_list, list) { + if (ptype->type != type) + continue; + if (pt_prev) + deliver_skb(skb, pt_prev, dev); + pt_prev = ptype; + } + *pt = pt_prev; +} + static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) { if (!ptype->af_packet_priv || !skb->sk) @@ -1757,45 +1775,54 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) struct packet_type *ptype; struct sk_buff *skb2 = NULL; struct packet_type *pt_prev = NULL; + struct list_head *ptype_list = &ptype_all; rcu_read_lock(); - list_for_each_entry_rcu(ptype, &ptype_all, list) { +again: + list_for_each_entry_rcu(ptype, ptype_list, list) { /* Never send packets back to the socket * they originated from - MvS (miquels@drinkel.ow.org) */ - if ((ptype->dev == dev || !ptype->dev) && - (!skb_loop_sk(ptype, skb))) { - if (pt_prev) { - deliver_skb(skb2, pt_prev, skb->dev); - pt_prev = ptype; - continue; - } + if (skb_loop_sk(ptype, skb)) + continue; - skb2 = skb_clone(skb, GFP_ATOMIC); - if (!skb2) - break; + if (pt_prev) { + deliver_skb(skb2, pt_prev, skb->dev); + pt_prev = ptype; + continue; + } - net_timestamp_set(skb2); + /* need to clone skb, done only once */ + skb2 = skb_clone(skb, GFP_ATOMIC); + if (!skb2) + goto out_unlock; - /* skb->nh should be correctly - set by sender, so that the second statement is - just protection against buggy protocols. - */ - skb_reset_mac_header(skb2); - - if (skb_network_header(skb2) < skb2->data || - skb_network_header(skb2) > skb_tail_pointer(skb2)) { - net_crit_ratelimited("protocol %04x is buggy, dev %s\n", - ntohs(skb2->protocol), - dev->name); - skb_reset_network_header(skb2); - } + net_timestamp_set(skb2); - skb2->transport_header = skb2->network_header; - skb2->pkt_type = PACKET_OUTGOING; - pt_prev = ptype; + /* skb->nh should be correctly + * set by sender, so that the second statement is + * just protection against buggy protocols. 
+ */ + skb_reset_mac_header(skb2); + + if (skb_network_header(skb2) < skb2->data || + skb_network_header(skb2) > skb_tail_pointer(skb2)) { + net_crit_ratelimited("protocol %04x is buggy, dev %s\n", + ntohs(skb2->protocol), + dev->name); + skb_reset_network_header(skb2); } + + skb2->transport_header = skb2->network_header; + skb2->pkt_type = PACKET_OUTGOING; + pt_prev = ptype; + } + + if (ptype_list == &ptype_all) { + ptype_list = &dev->ptype_all; + goto again; } +out_unlock: if (pt_prev) pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); rcu_read_unlock(); @@ -2617,7 +2644,7 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev, unsigned int len; int rc; - if (!list_empty(&ptype_all)) + if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all)) dev_queue_xmit_nit(skb, dev); len = skb->len; @@ -3615,7 +3642,6 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) struct packet_type *ptype, *pt_prev; rx_handler_func_t *rx_handler; struct net_device *orig_dev; - struct net_device *null_or_dev; bool deliver_exact = false; int ret = NET_RX_DROP; __be16 type; @@ -3658,11 +3684,15 @@ another_round: goto skip_taps; list_for_each_entry_rcu(ptype, &ptype_all, list) { - if (!ptype->dev || ptype->dev == skb->dev) { - if (pt_prev) - ret = deliver_skb(skb, pt_prev, orig_dev); - pt_prev = ptype; - } + if (pt_prev) + ret = deliver_skb(skb, pt_prev, orig_dev); + pt_prev = ptype; + } + + list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) { + if (pt_prev) + ret = deliver_skb(skb, pt_prev, orig_dev); + pt_prev = ptype; } skip_taps: @@ -3718,19 +3748,21 @@ ncls: skb->vlan_tci = 0; } + type = skb->protocol; + /* deliver only exact match when indicated */ - null_or_dev = deliver_exact ? skb->dev : NULL; + if (likely(!deliver_exact)) { + deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, + &ptype_base[ntohs(type) & + PTYPE_HASH_MASK]); + } - type = skb->protocol; - list_for_each_entry_rcu(ptype, - &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { - if (ptype->type == type && - (ptype->dev == null_or_dev || ptype->dev == skb->dev || - ptype->dev == orig_dev)) { - if (pt_prev) - ret = deliver_skb(skb, pt_prev, orig_dev); - pt_prev = ptype; - } + deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, + &orig_dev->ptype_specific); + + if (unlikely(skb->dev != orig_dev)) { + deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, + &skb->dev->ptype_specific); } if (pt_prev) { @@ -6579,6 +6611,8 @@ void netdev_run_todo(void) /* paranoia */ BUG_ON(netdev_refcnt_read(dev)); + BUG_ON(!list_empty(&dev->ptype_all)); + BUG_ON(!list_empty(&dev->ptype_specific)); WARN_ON(rcu_access_pointer(dev->ip_ptr)); WARN_ON(rcu_access_pointer(dev->ip6_ptr)); WARN_ON(dev->dn_ptr); @@ -6761,6 +6795,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, INIT_LIST_HEAD(&dev->adj_list.lower); INIT_LIST_HEAD(&dev->all_adj_list.upper); INIT_LIST_HEAD(&dev->all_adj_list.lower); + INIT_LIST_HEAD(&dev->ptype_all); + INIT_LIST_HEAD(&dev->ptype_specific); dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); -- cgit v1.2.3 From 712eddf70225ab5ae65e946e22d2dfe6b93e8dd1 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Sat, 24 Jan 2015 14:05:50 +0900 Subject: cpuidle: exynos: add coupled cpuidle support for exynos4210 The following patch adds coupled cpuidle support for Exynos4210 to an existing cpuidle-exynos driver. As a result it enables AFTR mode to be used by default on Exynos4210 without the need to hot unplug CPU1 first. 
The patch is heavily based on earlier cpuidle-exynos4210 driver from Daniel Lezcano: http://www.spinics.net/lists/linux-samsung-soc/msg28134.html Changes from Daniel's code include: - porting code to current kernels - fixing it to work on my setup (by using S5P_INFORM register instead of S5P_VA_SYSRAM one on Revision 1.1 and retrying poking CPU1 out of the BOOT ROM if necessary) - fixing rare lockup caused by waiting for CPU1 to get stuck in the BOOT ROM (CPU hotplug code in arch/arm/mach-exynos/platsmp.c doesn't require this and works fine) - moving Exynos specific code to arch/arm/mach-exynos/pm.c - using cpu_boot_reg_base() helper instead of BOOT_VECTOR macro - using exynos_cpu_*() helpers instead of accessing registers directly - using arch_send_wakeup_ipi_mask() instead of dsb_sev() (this matches CPU hotplug code in arch/arm/mach-exynos/platsmp.c) - integrating separate exynos4210-cpuidle driver into existing exynos-cpuidle one Cc: Colin Cross Cc: Kukjin Kim Cc: Krzysztof Kozlowski Cc: Tomasz Figa Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Daniel Lezcano Acked-by: Kyungmin Park Signed-off-by: Kukjin Kim --- arch/arm/mach-exynos/common.h | 4 + arch/arm/mach-exynos/exynos.c | 4 + arch/arm/mach-exynos/platsmp.c | 2 +- arch/arm/mach-exynos/pm.c | 122 +++++++++++++++++++++++++++ drivers/cpuidle/Kconfig.arm | 1 + drivers/cpuidle/cpuidle-exynos.c | 76 +++++++++++++++-- include/linux/platform_data/cpuidle-exynos.h | 20 +++++ 7 files changed, 223 insertions(+), 6 deletions(-) create mode 100644 include/linux/platform_data/cpuidle-exynos.h (limited to 'include/linux') diff --git a/arch/arm/mach-exynos/common.h b/arch/arm/mach-exynos/common.h index 865f878063cc..f70eca7ee705 100644 --- a/arch/arm/mach-exynos/common.h +++ b/arch/arm/mach-exynos/common.h @@ -13,6 +13,7 @@ #define __ARCH_ARM_MACH_EXYNOS_COMMON_H #include +#include #define EXYNOS3250_SOC_ID 0xE3472000 #define EXYNOS3_SOC_MASK 0xFFFFF000 @@ -150,8 +151,11 @@ extern void exynos_pm_central_suspend(void); extern int exynos_pm_central_resume(void); extern void exynos_enter_aftr(void); +extern struct cpuidle_exynos_data cpuidle_coupled_exynos_data; + extern void s5p_init_cpu(void __iomem *cpuid_addr); extern unsigned int samsung_rev(void); +extern void __iomem *cpu_boot_reg_base(void); static inline void pmu_raw_writel(u32 val, u32 offset) { diff --git a/arch/arm/mach-exynos/exynos.c b/arch/arm/mach-exynos/exynos.c index c13d0837fa8c..509f2e5a2d70 100644 --- a/arch/arm/mach-exynos/exynos.c +++ b/arch/arm/mach-exynos/exynos.c @@ -246,6 +246,10 @@ static void __init exynos_dt_machine_init(void) if (!IS_ENABLED(CONFIG_SMP)) exynos_sysram_init(); +#ifdef CONFIG_ARM_EXYNOS_CPUIDLE + if (of_machine_is_compatible("samsung,exynos4210")) + exynos_cpuidle.dev.platform_data = &cpuidle_coupled_exynos_data; +#endif if (of_machine_is_compatible("samsung,exynos4210") || of_machine_is_compatible("samsung,exynos4212") || (of_machine_is_compatible("samsung,exynos4412") && diff --git a/arch/arm/mach-exynos/platsmp.c b/arch/arm/mach-exynos/platsmp.c index 7a1ebfeeeeb8..3f32c47a6d74 100644 --- a/arch/arm/mach-exynos/platsmp.c +++ b/arch/arm/mach-exynos/platsmp.c @@ -194,7 +194,7 @@ int exynos_cluster_power_state(int cluster) S5P_CORE_LOCAL_PWR_EN); } -static inline void __iomem *cpu_boot_reg_base(void) +void __iomem *cpu_boot_reg_base(void) { if (soc_is_exynos4210() && samsung_rev() == EXYNOS4210_REV_1_1) return pmu_base_addr + S5P_INFORM5; diff --git a/arch/arm/mach-exynos/pm.c b/arch/arm/mach-exynos/pm.c index 159eb4c9779e..1f9cbd434c60 100644
--- a/arch/arm/mach-exynos/pm.c +++ b/arch/arm/mach-exynos/pm.c @@ -179,3 +179,125 @@ void exynos_enter_aftr(void) cpu_pm_exit(); } + +static atomic_t cpu1_wakeup = ATOMIC_INIT(0); + +static int exynos_cpu0_enter_aftr(void) +{ + int ret = -1; + + /* + * If the other cpu is powered on, we have to power it off, because + * the AFTR state won't work otherwise + */ + if (cpu_online(1)) { + /* + * We reach a sync point with the coupled idle state, we know + * the other cpu will power down itself or will abort the + * sequence, let's wait for one of these to happen + */ + while (exynos_cpu_power_state(1)) { + /* + * The other cpu may skip idle and boot back + * up again + */ + if (atomic_read(&cpu1_wakeup)) + goto abort; + + /* + * The other cpu may bounce through idle and + * boot back up again, getting stuck in the + * boot rom code + */ + if (__raw_readl(cpu_boot_reg_base()) == 0) + goto abort; + + cpu_relax(); + } + } + + exynos_enter_aftr(); + ret = 0; + +abort: + if (cpu_online(1)) { + /* + * Set the boot vector to something non-zero + */ + __raw_writel(virt_to_phys(exynos_cpu_resume), + cpu_boot_reg_base()); + dsb(); + + /* + * Turn on cpu1 and wait for it to be on + */ + exynos_cpu_power_up(1); + while (exynos_cpu_power_state(1) != S5P_CORE_LOCAL_PWR_EN) + cpu_relax(); + + while (!atomic_read(&cpu1_wakeup)) { + /* + * Poke cpu1 out of the boot rom + */ + __raw_writel(virt_to_phys(exynos_cpu_resume), + cpu_boot_reg_base()); + + arch_send_wakeup_ipi_mask(cpumask_of(1)); + } + } + + return ret; +} + +static int exynos_wfi_finisher(unsigned long flags) +{ + cpu_do_idle(); + + return -1; +} + +static int exynos_cpu1_powerdown(void) +{ + int ret = -1; + + /* + * Idle sequence for cpu1 + */ + if (cpu_pm_enter()) + goto cpu1_aborted; + + /* + * Turn off cpu 1 + */ + exynos_cpu_power_down(1); + + ret = cpu_suspend(0, exynos_wfi_finisher); + + cpu_pm_exit(); + +cpu1_aborted: + dsb(); + /* + * Notify cpu 0 that cpu 1 is awake + */ + atomic_set(&cpu1_wakeup, 1); + + return ret; +} + +static void exynos_pre_enter_aftr(void) +{ + __raw_writel(virt_to_phys(exynos_cpu_resume), cpu_boot_reg_base()); +} + +static void exynos_post_enter_aftr(void) +{ + atomic_set(&cpu1_wakeup, 0); +} + +struct cpuidle_exynos_data cpuidle_coupled_exynos_data = { + .cpu0_enter_aftr = exynos_cpu0_enter_aftr, + .cpu1_powerdown = exynos_cpu1_powerdown, + .pre_enter_aftr = exynos_pre_enter_aftr, + .post_enter_aftr = exynos_post_enter_aftr, +}; diff --git a/drivers/cpuidle/Kconfig.arm b/drivers/cpuidle/Kconfig.arm index 8c16ab20fb15..8e07c9419153 100644 --- a/drivers/cpuidle/Kconfig.arm +++ b/drivers/cpuidle/Kconfig.arm @@ -55,6 +55,7 @@ config ARM_AT91_CPUIDLE config ARM_EXYNOS_CPUIDLE bool "Cpu Idle Driver for the Exynos processors" depends on ARCH_EXYNOS + select ARCH_NEEDS_CPU_IDLE_COUPLED if SMP help Select this to enable cpuidle for Exynos processors diff --git a/drivers/cpuidle/cpuidle-exynos.c b/drivers/cpuidle/cpuidle-exynos.c index 4003a3160865..26f5f29fdb03 100644 --- a/drivers/cpuidle/cpuidle-exynos.c +++ b/drivers/cpuidle/cpuidle-exynos.c @@ -1,8 +1,11 @@ -/* linux/arch/arm/mach-exynos/cpuidle.c - * - * Copyright (c) 2011 Samsung Electronics Co., Ltd. +/* + * Copyright (c) 2011-2014 Samsung Electronics Co., Ltd. * http://www.samsung.com * + * Coupled cpuidle support based on the work of: + * Colin Cross + * Daniel Lezcano + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. 
@@ -13,13 +16,49 @@ #include #include #include +#include +#include #include #include #include +static atomic_t exynos_idle_barrier; + +static struct cpuidle_exynos_data *exynos_cpuidle_pdata; static void (*exynos_enter_aftr)(void); +static int exynos_enter_coupled_lowpower(struct cpuidle_device *dev, + struct cpuidle_driver *drv, + int index) +{ + int ret; + + exynos_cpuidle_pdata->pre_enter_aftr(); + + /* + * Waiting all cpus to reach this point at the same moment + */ + cpuidle_coupled_parallel_barrier(dev, &exynos_idle_barrier); + + /* + * Both cpus will reach this point at the same time + */ + ret = dev->cpu ? exynos_cpuidle_pdata->cpu1_powerdown() + : exynos_cpuidle_pdata->cpu0_enter_aftr(); + if (ret) + index = ret; + + /* + * Waiting all cpus to finish the power sequence before going further + */ + cpuidle_coupled_parallel_barrier(dev, &exynos_idle_barrier); + + exynos_cpuidle_pdata->post_enter_aftr(); + + return index; +} + static int exynos_enter_lowpower(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) @@ -55,13 +94,40 @@ static struct cpuidle_driver exynos_idle_driver = { .safe_state_index = 0, }; +static struct cpuidle_driver exynos_coupled_idle_driver = { + .name = "exynos_coupled_idle", + .owner = THIS_MODULE, + .states = { + [0] = ARM_CPUIDLE_WFI_STATE, + [1] = { + .enter = exynos_enter_coupled_lowpower, + .exit_latency = 5000, + .target_residency = 10000, + .flags = CPUIDLE_FLAG_COUPLED | + CPUIDLE_FLAG_TIMER_STOP, + .name = "C1", + .desc = "ARM power down", + }, + }, + .state_count = 2, + .safe_state_index = 0, +}; + static int exynos_cpuidle_probe(struct platform_device *pdev) { int ret; - exynos_enter_aftr = (void *)(pdev->dev.platform_data); + if (of_machine_is_compatible("samsung,exynos4210")) { + exynos_cpuidle_pdata = pdev->dev.platform_data; + + ret = cpuidle_register(&exynos_coupled_idle_driver, + cpu_possible_mask); + } else { + exynos_enter_aftr = (void *)(pdev->dev.platform_data); + + ret = cpuidle_register(&exynos_idle_driver, NULL); + } - ret = cpuidle_register(&exynos_idle_driver, NULL); if (ret) { dev_err(&pdev->dev, "failed to register cpuidle driver\n"); return ret; diff --git a/include/linux/platform_data/cpuidle-exynos.h b/include/linux/platform_data/cpuidle-exynos.h new file mode 100644 index 000000000000..bfa40e4c5d5f --- /dev/null +++ b/include/linux/platform_data/cpuidle-exynos.h @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2014 Samsung Electronics Co., Ltd. + * http://www.samsung.com + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. +*/ + +#ifndef __CPUIDLE_EXYNOS_H +#define __CPUIDLE_EXYNOS_H + +struct cpuidle_exynos_data { + int (*cpu0_enter_aftr)(void); + int (*cpu1_powerdown)(void); + void (*pre_enter_aftr)(void); + void (*post_enter_aftr)(void); +}; + +#endif -- cgit v1.2.3 From f262f28c147051e7aa6daaf4fb5996833ffadff4 Mon Sep 17 00:00:00 2001 From: Chanwoo Choi Date: Mon, 26 Jan 2015 13:16:27 +0900 Subject: PM / devfreq: event: Add devfreq_event class This patch adds a new class in devfreq, devfreq_event, which provides raw data (e.g., memory bus utilization, GPU utilization) for devfreq governors. - devfreq_event device : Provides raw data for a governor of a devfreq device - devfreq device : Monitors device state and changes frequency/voltage of the device using the raw data from its devfreq_event device. 
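The split described in this list implies a simple consumer-side flow: resolve the event device from a device-tree phandle, enable it, open a counting window with set_event, and read back the load/total counts with get_event. A rough sketch using the API added by devfreq-event.c later in this patch follows; the function name example_read_busy_percent is invented, the 100 ms window is arbitrary, and the field types of struct devfreq_event_data are assumed from the dev_dbg() format used by the exynos-ppmu driver below.

#include <linux/delay.h>
#include <linux/devfreq-event.h>
#include <linux/device.h>
#include <linux/err.h>

static int example_read_busy_percent(struct device *dev)
{
	struct devfreq_event_dev *edev;
	struct devfreq_event_data edata;
	int ret;

	/* Resolve the "devfreq-events" phandle (index 0) in our DT node. */
	edev = devfreq_event_get_edev_by_phandle(dev, 0);
	if (IS_ERR(edev))
		return PTR_ERR(edev);

	ret = devfreq_event_enable_edev(edev);
	if (ret < 0)
		return ret;

	devfreq_event_set_event(edev);			/* start counting */
	msleep(100);					/* sampling window */
	ret = devfreq_event_get_event(edev, &edata);	/* stop and read */
	if (!ret && edata.total_count)
		ret = edata.load_count * 100 / edata.total_count;

	devfreq_event_disable_edev(edev);
	return ret;
}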
A devfreq device determines performance states (normally the frequency and the voltage values) based on the results of its designated devfreq governor: e.g., ondemand, performance, powersave. In order to give such results required by a devfreq device, the devfreq governor requires data that indicates the performance requirement given to the devfreq device. The conventional (previous) implementation of the devfreq subsystem requires a devfreq device driver to implement its own mechanism to acquire the performance requirement for its governor. However, there have been issues with such requirements: 1. Although the performance requirement of such devices is usually acquired from common devices (PMU/PPMU), we do not have any abstract structure to represent them properly. 2. Such performance requirement devices (PMU/PPMU) are actual hardware pieces that may be represented by Device Tree directly, while the devfreq device itself is a virtual entity that is not considered to be represented by Device Tree according to Device Tree folks. In order to address such issues, a devfreq_event device (represented by this patch) provides a template for device drivers representing a performance monitoring unit, which gives the basic or raw data for the performance requirement, which, in turn, is required by devfreq governors. The following description explains the features of the two kinds of devfreq class: - devfreq class (existing) : a devfreq consumer device uses raw data from a devfreq_event device to determine the proper current system state and changes voltage/frequency dynamically using various governors. - devfreq_event class (new) : Provides measured raw data to the devfreq device for its governor Cc: MyungJoo Ham Cc: Kyungmin Park Signed-off-by: Chanwoo Choi [Commit message rewritten & conflict resolved by MyungJoo] Signed-off-by: MyungJoo Ham --- drivers/devfreq/Kconfig | 2 + drivers/devfreq/Makefile | 3 + drivers/devfreq/devfreq-event.c | 494 ++++++++++++++++++++++++++++++++++++ drivers/devfreq/event/Kconfig | 25 ++ drivers/devfreq/event/Makefile | 2 + drivers/devfreq/event/exynos-ppmu.c | 374 +++++++++++++++++++++++++++ drivers/devfreq/event/exynos-ppmu.h | 93 +++++++ include/linux/devfreq-event.h | 196 ++++++++++++++ 8 files changed, 1189 insertions(+) create mode 100644 drivers/devfreq/devfreq-event.c create mode 100644 drivers/devfreq/event/Kconfig create mode 100644 drivers/devfreq/event/Makefile create mode 100644 drivers/devfreq/event/exynos-ppmu.c create mode 100644 drivers/devfreq/event/exynos-ppmu.h create mode 100644 include/linux/devfreq-event.h (limited to 'include/linux') diff --git a/drivers/devfreq/Kconfig b/drivers/devfreq/Kconfig index 4aab799712bb..51dccb3620ea 100644 --- a/drivers/devfreq/Kconfig +++ b/drivers/devfreq/Kconfig @@ -97,4 +97,6 @@ config ARM_TEGRA_DEVFREQ It reads ACTMON counters of memory controllers and adjusts the operating frequencies and voltages with OPP support.
+source "drivers/devfreq/event/Kconfig" + endif # PM_DEVFREQ diff --git a/drivers/devfreq/Makefile b/drivers/devfreq/Makefile index 0ea991f3a483..9256c35bcce9 100644 --- a/drivers/devfreq/Makefile +++ b/drivers/devfreq/Makefile @@ -8,3 +8,6 @@ obj-$(CONFIG_DEVFREQ_GOV_USERSPACE) += governor_userspace.o obj-$(CONFIG_ARM_EXYNOS4_BUS_DEVFREQ) += exynos/ obj-$(CONFIG_ARM_EXYNOS5_BUS_DEVFREQ) += exynos/ obj-$(CONFIG_ARM_TEGRA_DEVFREQ) += tegra-devfreq.o + +# DEVFREQ Event Drivers +obj-$(CONFIG_PM_DEVFREQ_EVENT) += event/ diff --git a/drivers/devfreq/devfreq-event.c b/drivers/devfreq/devfreq-event.c new file mode 100644 index 000000000000..f304a0289eda --- /dev/null +++ b/drivers/devfreq/devfreq-event.c @@ -0,0 +1,494 @@ +/* + * devfreq-event: a framework to provide raw data and events of devfreq devices + * + * Copyright (C) 2015 Samsung Electronics + * Author: Chanwoo Choi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This driver is based on drivers/devfreq/devfreq.c. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static struct class *devfreq_event_class; + +/* The list of all devfreq event list */ +static LIST_HEAD(devfreq_event_list); +static DEFINE_MUTEX(devfreq_event_list_lock); + +#define to_devfreq_event(DEV) container_of(DEV, struct devfreq_event_dev, dev) + +/** + * devfreq_event_enable_edev() - Enable the devfreq-event dev and increase + * the enable_count of devfreq-event dev. + * @edev : the devfreq-event device + * + * Note that this function increase the enable_count and enable the + * devfreq-event device. The devfreq-event device should be enabled before + * using it by devfreq device. + */ +int devfreq_event_enable_edev(struct devfreq_event_dev *edev) +{ + int ret = 0; + + if (!edev || !edev->desc) + return -EINVAL; + + mutex_lock(&edev->lock); + if (edev->desc->ops && edev->desc->ops->enable + && edev->enable_count == 0) { + ret = edev->desc->ops->enable(edev); + if (ret < 0) + goto err; + } + edev->enable_count++; +err: + mutex_unlock(&edev->lock); + + return ret; +} +EXPORT_SYMBOL_GPL(devfreq_event_enable_edev); + +/** + * devfreq_event_disable_edev() - Disable the devfreq-event dev and decrease + * the enable_count of the devfreq-event dev. + * @edev : the devfreq-event device + * + * Note that this function decrease the enable_count and disable the + * devfreq-event device. After the devfreq-event device is disabled, + * devfreq device can't use the devfreq-event device for get/set/reset + * operations. + */ +int devfreq_event_disable_edev(struct devfreq_event_dev *edev) +{ + int ret = 0; + + if (!edev || !edev->desc) + return -EINVAL; + + mutex_lock(&edev->lock); + if (edev->enable_count <= 0) { + dev_warn(&edev->dev, "unbalanced enable_count\n"); + ret = -EIO; + goto err; + } + + if (edev->desc->ops && edev->desc->ops->disable + && edev->enable_count == 1) { + ret = edev->desc->ops->disable(edev); + if (ret < 0) + goto err; + } + edev->enable_count--; +err: + mutex_unlock(&edev->lock); + + return ret; +} +EXPORT_SYMBOL_GPL(devfreq_event_disable_edev); + +/** + * devfreq_event_is_enabled() - Check whether devfreq-event dev is enabled or + * not. + * @edev : the devfreq-event device + * + * Note that this function check whether devfreq-event dev is enabled or not. + * If return true, the devfreq-event dev is enabeld. If return false, the + * devfreq-event dev is disabled. 
+ */ +bool devfreq_event_is_enabled(struct devfreq_event_dev *edev) +{ + bool enabled = false; + + if (!edev || !edev->desc) + return enabled; + + mutex_lock(&edev->lock); + + if (edev->enable_count > 0) + enabled = true; + + mutex_unlock(&edev->lock); + + return enabled; +} +EXPORT_SYMBOL_GPL(devfreq_event_is_enabled); + +/** + * devfreq_event_set_event() - Set event to devfreq-event dev to start. + * @edev : the devfreq-event device + * + * Note that this function sets the event on the devfreq-event device to start + * collecting event data, which could be of various event types. + */ +int devfreq_event_set_event(struct devfreq_event_dev *edev) +{ + int ret; + + if (!edev || !edev->desc) + return -EINVAL; + + if (!edev->desc->ops || !edev->desc->ops->set_event) + return -EINVAL; + + if (!devfreq_event_is_enabled(edev)) + return -EPERM; + + mutex_lock(&edev->lock); + ret = edev->desc->ops->set_event(edev); + mutex_unlock(&edev->lock); + + return ret; +} +EXPORT_SYMBOL_GPL(devfreq_event_set_event); + +/** + * devfreq_event_get_event() - Get {load|total}_count from devfreq-event dev. + * @edev : the devfreq-event device + * @edata : the calculated data of the devfreq-event device + * + * Note that this function gets the calculated event data from the + * devfreq-event dev after stopping the progress of the whole sequence of the + * devfreq-event dev. + */ +int devfreq_event_get_event(struct devfreq_event_dev *edev, + struct devfreq_event_data *edata) +{ + int ret; + + if (!edev || !edev->desc) + return -EINVAL; + + if (!edev->desc->ops || !edev->desc->ops->get_event) + return -EINVAL; + + if (!devfreq_event_is_enabled(edev)) + return -EINVAL; + + edata->total_count = edata->load_count = 0; + + mutex_lock(&edev->lock); + ret = edev->desc->ops->get_event(edev, edata); + if (ret < 0) + edata->total_count = edata->load_count = 0; + mutex_unlock(&edev->lock); + + return ret; +} +EXPORT_SYMBOL_GPL(devfreq_event_get_event); + +/** + * devfreq_event_reset_event() - Reset all operations of devfreq-event dev. + * @edev : the devfreq-event device + * + * Note that this function stops all operations of the devfreq-event dev and + * resets the current event data to return the devfreq-event device to its + * initial state. + */ +int devfreq_event_reset_event(struct devfreq_event_dev *edev) +{ + int ret = 0; + + if (!edev || !edev->desc) + return -EINVAL; + + if (!devfreq_event_is_enabled(edev)) + return -EPERM; + + mutex_lock(&edev->lock); + if (edev->desc->ops && edev->desc->ops->reset) + ret = edev->desc->ops->reset(edev); + mutex_unlock(&edev->lock); + + return ret; +} +EXPORT_SYMBOL_GPL(devfreq_event_reset_event); + +/** + * devfreq_event_get_edev_by_phandle() - Get the devfreq-event dev from + * devicetree. + * @dev : the pointer to the given device + * @index : the index into the list of devfreq-event devices + * + * Note that this function returns a pointer to the devfreq-event device.
+ */ +struct devfreq_event_dev *devfreq_event_get_edev_by_phandle(struct device *dev, + int index) +{ + struct device_node *node; + struct devfreq_event_dev *edev; + + if (!dev->of_node) { + dev_err(dev, "device does not have a device node entry\n"); + return ERR_PTR(-EINVAL); + } + + node = of_parse_phandle(dev->of_node, "devfreq-events", index); + if (!node) { + dev_err(dev, "failed to get phandle in %s node\n", + dev->of_node->full_name); + return ERR_PTR(-ENODEV); + } + + mutex_lock(&devfreq_event_list_lock); + list_for_each_entry(edev, &devfreq_event_list, node) { + if (!strcmp(edev->desc->name, node->name)) + goto out; + } + edev = NULL; +out: + mutex_unlock(&devfreq_event_list_lock); + + if (!edev) { + dev_err(dev, "unable to get devfreq-event device : %s\n", + node->name); + of_node_put(node); + return ERR_PTR(-ENODEV); + } + + of_node_put(node); + + return edev; +} +EXPORT_SYMBOL_GPL(devfreq_event_get_edev_by_phandle); + +/** + * devfreq_event_get_edev_count() - Get the count of devfreq-event dev + * @dev : the pointer to the given device + * + * Note that this function returns the count of devfreq-event devices. + */ +int devfreq_event_get_edev_count(struct device *dev) +{ + int count; + + if (!dev->of_node) { + dev_err(dev, "device does not have a device node entry\n"); + return -EINVAL; + } + + count = of_property_count_elems_of_size(dev->of_node, "devfreq-events", + sizeof(u32)); + if (count < 0) { + dev_err(dev, + "failed to get the count of devfreq-event in %s node\n", + dev->of_node->full_name); + return count; + } + + return count; +} +EXPORT_SYMBOL_GPL(devfreq_event_get_edev_count); + +static void devfreq_event_release_edev(struct device *dev) +{ + struct devfreq_event_dev *edev = to_devfreq_event(dev); + + kfree(edev); +} + +/** + * devfreq_event_add_edev() - Add new devfreq-event device. + * @dev : the device owning the devfreq-event device being created + * @desc : the devfreq-event device's descriptor which includes essential + * data for the devfreq-event device. + * + * Note that this function adds a new devfreq-event device to the + * devfreq-event class list and registers the device of the devfreq-event + * device. + */ +struct devfreq_event_dev *devfreq_event_add_edev(struct device *dev, + struct devfreq_event_desc *desc) +{ + struct devfreq_event_dev *edev; + static atomic_t event_no = ATOMIC_INIT(0); + int ret; + + if (!dev || !desc) + return ERR_PTR(-EINVAL); + + if (!desc->name || !desc->ops) + return ERR_PTR(-EINVAL); + + if (!desc->ops->set_event || !desc->ops->get_event) + return ERR_PTR(-EINVAL); + + edev = kzalloc(sizeof(struct devfreq_event_dev), GFP_KERNEL); + if (!edev) + return ERR_PTR(-ENOMEM); + + mutex_init(&edev->lock); + edev->desc = desc; + edev->enable_count = 0; + edev->dev.parent = dev; + edev->dev.class = devfreq_event_class; + edev->dev.release = devfreq_event_release_edev; + + dev_set_name(&edev->dev, "event.%d", atomic_inc_return(&event_no) - 1); + ret = device_register(&edev->dev); + if (ret < 0) { + put_device(&edev->dev); + return ERR_PTR(ret); + } + dev_set_drvdata(&edev->dev, edev); + + INIT_LIST_HEAD(&edev->node); + + mutex_lock(&devfreq_event_list_lock); + list_add(&edev->node, &devfreq_event_list); + mutex_unlock(&devfreq_event_list_lock); + + return edev; +} +EXPORT_SYMBOL_GPL(devfreq_event_add_edev); + +/** + * devfreq_event_remove_edev() - Remove the registered devfreq-event device. + * @edev : the devfreq-event device + * + * Note that this function removes the registered devfreq-event device.
+ */ +int devfreq_event_remove_edev(struct devfreq_event_dev *edev) +{ + if (!edev) + return -EINVAL; + + WARN_ON(edev->enable_count); + + mutex_lock(&devfreq_event_list_lock); + list_del(&edev->node); + mutex_unlock(&devfreq_event_list_lock); + + device_unregister(&edev->dev); + + return 0; +} +EXPORT_SYMBOL_GPL(devfreq_event_remove_edev); + +static int devm_devfreq_event_match(struct device *dev, void *res, void *data) +{ + struct devfreq_event_dev **r = res; + + if (WARN_ON(!r || !*r)) + return 0; + + return *r == data; +} + +static void devm_devfreq_event_release(struct device *dev, void *res) +{ + devfreq_event_remove_edev(*(struct devfreq_event_dev **)res); +} + +/** + * devm_devfreq_event_add_edev() - Resource-managed devfreq_event_add_edev() + * @dev : the device owning the devfreq-event device being created + * @desc : the devfreq-event device's descriptor which includes essential + * data for the devfreq-event device. + * + * Note that this function automatically manages the memory of the + * devfreq-event device using device resource management and simplifies the + * free operation for the memory of the devfreq-event device. + */ +struct devfreq_event_dev *devm_devfreq_event_add_edev(struct device *dev, + struct devfreq_event_desc *desc) +{ + struct devfreq_event_dev **ptr, *edev; + + ptr = devres_alloc(devm_devfreq_event_release, sizeof(*ptr), GFP_KERNEL); + if (!ptr) + return ERR_PTR(-ENOMEM); + + edev = devfreq_event_add_edev(dev, desc); + if (IS_ERR(edev)) { + devres_free(ptr); + return ERR_PTR(-ENOMEM); + } + + *ptr = edev; + devres_add(dev, ptr); + + return edev; +} +EXPORT_SYMBOL_GPL(devm_devfreq_event_add_edev); + +/** + * devm_devfreq_event_remove_edev() - Resource-managed devfreq_event_remove_edev() + * @dev : the device owning the devfreq-event device being created + * @edev : the devfreq-event device + * + * Note that this function automatically manages the memory of the + * devfreq-event device using device resource management. + */ +void devm_devfreq_event_remove_edev(struct device *dev, + struct devfreq_event_dev *edev) +{ + WARN_ON(devres_release(dev, devm_devfreq_event_release, + devm_devfreq_event_match, edev)); +} +EXPORT_SYMBOL_GPL(devm_devfreq_event_remove_edev); + +/* + * Device attributes for devfreq-event class.
+ */ +static ssize_t name_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct devfreq_event_dev *edev = to_devfreq_event(dev); + + if (!edev || !edev->desc) + return -EINVAL; + + return sprintf(buf, "%s\n", edev->desc->name); +} +static DEVICE_ATTR_RO(name); + +static ssize_t enable_count_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct devfreq_event_dev *edev = to_devfreq_event(dev); + + if (!edev || !edev->desc) + return -EINVAL; + + return sprintf(buf, "%d\n", edev->enable_count); +} +static DEVICE_ATTR_RO(enable_count); + +static struct attribute *devfreq_event_attrs[] = { + &dev_attr_name.attr, + &dev_attr_enable_count.attr, + NULL, +}; +ATTRIBUTE_GROUPS(devfreq_event); + +static int __init devfreq_event_init(void) +{ + devfreq_event_class = class_create(THIS_MODULE, "devfreq-event"); + if (IS_ERR(devfreq_event_class)) { + pr_err("%s: couldn't create class\n", __FILE__); + return PTR_ERR(devfreq_event_class); + } + + devfreq_event_class->dev_groups = devfreq_event_groups; + + return 0; +} +subsys_initcall(devfreq_event_init); + +static void __exit devfreq_event_exit(void) +{ + class_destroy(devfreq_event_class); +} +module_exit(devfreq_event_exit); + +MODULE_AUTHOR("Chanwoo Choi "); +MODULE_DESCRIPTION("DEVFREQ-Event class support"); +MODULE_LICENSE("GPL"); diff --git a/drivers/devfreq/event/Kconfig b/drivers/devfreq/event/Kconfig new file mode 100644 index 000000000000..a11720affc31 --- /dev/null +++ b/drivers/devfreq/event/Kconfig @@ -0,0 +1,25 @@ +menuconfig PM_DEVFREQ_EVENT + bool "DEVFREQ-Event device Support" + help + The devfreq-event device provides the raw data and events which + indicate the current state of the devfreq-event device. The data + provided by the devfreq-event device is used to monitor the state + of the device and determine the suitable size of resource to + reduce wasted resources. + + The devfreq-event device can support various types of events + (e.g., raw data, utilization, latency, bandwidth). The events + may be used by a devfreq governor and other subsystems. + +if PM_DEVFREQ_EVENT + +config DEVFREQ_EVENT_EXYNOS_PPMU + bool "EXYNOS PPMU (Platform Performance Monitoring Unit) DEVFREQ event Driver" + depends on ARCH_EXYNOS + select PM_OPP + help + This adds the devfreq-event driver for Exynos SoCs. It provides PPMU + (Platform Performance Monitoring Unit) counters to estimate the + utilization of each module. + +endif # PM_DEVFREQ_EVENT diff --git a/drivers/devfreq/event/Makefile b/drivers/devfreq/event/Makefile new file mode 100644 index 000000000000..be146ead79cf --- /dev/null +++ b/drivers/devfreq/event/Makefile @@ -0,0 +1,2 @@ +# Exynos DEVFREQ Event Drivers +obj-$(CONFIG_DEVFREQ_EVENT_EXYNOS_PPMU) += exynos-ppmu.o diff --git a/drivers/devfreq/event/exynos-ppmu.c b/drivers/devfreq/event/exynos-ppmu.c new file mode 100644 index 000000000000..135be0aada9d --- /dev/null +++ b/drivers/devfreq/event/exynos-ppmu.c @@ -0,0 +1,374 @@ +/* + * exynos_ppmu.c - EXYNOS PPMU (Platform Performance Monitoring Unit) support + * + * Copyright (c) 2014 Samsung Electronics Co., Ltd. + * Author : Chanwoo Choi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation.
+ * + * This driver is based on drivers/devfreq/exynos/exynos_ppmu.c + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "exynos-ppmu.h" + +struct exynos_ppmu_data { + void __iomem *base; + struct clk *clk; +}; + +struct exynos_ppmu { + struct devfreq_event_dev **edev; + struct devfreq_event_desc *desc; + unsigned int num_events; + + struct device *dev; + struct mutex lock; + + struct exynos_ppmu_data ppmu; +}; + +#define PPMU_EVENT(name) \ + { "ppmu-event0-"#name, PPMU_PMNCNT0 }, \ + { "ppmu-event1-"#name, PPMU_PMNCNT1 }, \ + { "ppmu-event2-"#name, PPMU_PMNCNT2 }, \ + { "ppmu-event3-"#name, PPMU_PMNCNT3 } + +struct __exynos_ppmu_events { + char *name; + int id; +} ppmu_events[] = { + /* For Exynos3250, Exynos4 and Exynos5260 */ + PPMU_EVENT(g3d), + PPMU_EVENT(fsys), + + /* For Exynos4 SoCs and Exynos3250 */ + PPMU_EVENT(dmc0), + PPMU_EVENT(dmc1), + PPMU_EVENT(cpu), + PPMU_EVENT(rightbus), + PPMU_EVENT(leftbus), + PPMU_EVENT(lcd0), + PPMU_EVENT(camif), + + /* Only for Exynos3250 and Exynos5260 */ + PPMU_EVENT(mfc), + + /* Only for Exynos4 SoCs */ + PPMU_EVENT(mfc-left), + PPMU_EVENT(mfc-right), + + /* Only for Exynos5260 SoCs */ + PPMU_EVENT(drex0-s0), + PPMU_EVENT(drex0-s1), + PPMU_EVENT(drex1-s0), + PPMU_EVENT(drex1-s1), + PPMU_EVENT(eagle), + PPMU_EVENT(kfc), + PPMU_EVENT(isp), + PPMU_EVENT(fimc), + PPMU_EVENT(gscl), + PPMU_EVENT(mscl), + PPMU_EVENT(fimd0x), + PPMU_EVENT(fimd1x), + { /* sentinel */ }, +}; + +static int exynos_ppmu_find_ppmu_id(struct devfreq_event_dev *edev) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(ppmu_events); i++) + if (!strcmp(edev->desc->name, ppmu_events[i].name)) + return ppmu_events[i].id; + + return -EINVAL; +} + +static int exynos_ppmu_disable(struct devfreq_event_dev *edev) +{ + struct exynos_ppmu *info = devfreq_event_get_drvdata(edev); + u32 pmnc; + + /* Disable all counters */ + __raw_writel(PPMU_CCNT_MASK | + PPMU_PMCNT0_MASK | + PPMU_PMCNT1_MASK | + PPMU_PMCNT2_MASK | + PPMU_PMCNT3_MASK, + info->ppmu.base + PPMU_CNTENC); + + /* Disable PPMU */ + pmnc = __raw_readl(info->ppmu.base + PPMU_PMNC); + pmnc &= ~PPMU_PMNC_ENABLE_MASK; + __raw_writel(pmnc, info->ppmu.base + PPMU_PMNC); + + return 0; +} + +static int exynos_ppmu_set_event(struct devfreq_event_dev *edev) +{ + struct exynos_ppmu *info = devfreq_event_get_drvdata(edev); + int id = exynos_ppmu_find_ppmu_id(edev); + u32 pmnc, cntens; + + if (id < 0) + return id; + + /* Enable specific counter */ + cntens = __raw_readl(info->ppmu.base + PPMU_CNTENS); + cntens |= (PPMU_CCNT_MASK | (PPMU_ENABLE << id)); + __raw_writel(cntens, info->ppmu.base + PPMU_CNTENS); + + /* Set the event of Read/Write data count */ + __raw_writel(PPMU_RO_DATA_CNT | PPMU_WO_DATA_CNT, + info->ppmu.base + PPMU_BEVTxSEL(id)); + + /* Reset cycle counter/performance counter and enable PPMU */ + pmnc = __raw_readl(info->ppmu.base + PPMU_PMNC); + pmnc &= ~(PPMU_PMNC_ENABLE_MASK + | PPMU_PMNC_COUNTER_RESET_MASK + | PPMU_PMNC_CC_RESET_MASK); + pmnc |= (PPMU_ENABLE << PPMU_PMNC_ENABLE_SHIFT); + pmnc |= (PPMU_ENABLE << PPMU_PMNC_COUNTER_RESET_SHIFT); + pmnc |= (PPMU_ENABLE << PPMU_PMNC_CC_RESET_SHIFT); + __raw_writel(pmnc, info->ppmu.base + PPMU_PMNC); + + return 0; +} + +static int exynos_ppmu_get_event(struct devfreq_event_dev *edev, + struct devfreq_event_data *edata) +{ + struct exynos_ppmu *info = devfreq_event_get_drvdata(edev); + int id = exynos_ppmu_find_ppmu_id(edev); + u32 pmnc, cntenc; + + if (id < 0) + return -EINVAL; + + /* Disable PPMU */ + pmnc = 
__raw_readl(info->ppmu.base + PPMU_PMNC); + pmnc &= ~PPMU_PMNC_ENABLE_MASK; + __raw_writel(pmnc, info->ppmu.base + PPMU_PMNC); + + /* Read cycle count */ + edata->total_count = __raw_readl(info->ppmu.base + PPMU_CCNT); + + /* Read performance count */ + switch (id) { + case PPMU_PMNCNT0: + case PPMU_PMNCNT1: + case PPMU_PMNCNT2: + edata->load_count + = __raw_readl(info->ppmu.base + PPMU_PMNCT(id)); + break; + case PPMU_PMNCNT3: + edata->load_count = + ((__raw_readl(info->ppmu.base + PPMU_PMCNT3_HIGH) << 8) + | __raw_readl(info->ppmu.base + PPMU_PMCNT3_LOW)); + break; + default: + return -EINVAL; + } + + /* Disable specific counter */ + cntenc = __raw_readl(info->ppmu.base + PPMU_CNTENC); + cntenc |= (PPMU_CCNT_MASK | (PPMU_ENABLE << id)); + __raw_writel(cntenc, info->ppmu.base + PPMU_CNTENC); + + dev_dbg(&edev->dev, "%s (event: %ld/%ld)\n", edev->desc->name, + edata->load_count, edata->total_count); + + return 0; +} + +static struct devfreq_event_ops exynos_ppmu_ops = { + .disable = exynos_ppmu_disable, + .set_event = exynos_ppmu_set_event, + .get_event = exynos_ppmu_get_event, +}; + +static int of_get_devfreq_events(struct device_node *np, + struct exynos_ppmu *info) +{ + struct devfreq_event_desc *desc; + struct device *dev = info->dev; + struct device_node *events_np, *node; + int i, j, count; + + events_np = of_get_child_by_name(np, "events"); + if (!events_np) { + dev_err(dev, + "failed to get child node of devfreq-event devices\n"); + return -EINVAL; + } + + count = of_get_child_count(events_np); + desc = devm_kzalloc(dev, sizeof(*desc) * count, GFP_KERNEL); + if (!desc) + return -ENOMEM; + info->num_events = count; + + j = 0; + for_each_child_of_node(events_np, node) { + for (i = 0; i < ARRAY_SIZE(ppmu_events); i++) { + if (!ppmu_events[i].name) + continue; + + if (!of_node_cmp(node->name, ppmu_events[i].name)) + break; + } + + if (i == ARRAY_SIZE(ppmu_events)) { + dev_warn(dev, + "don't know how to configure events : %s\n", + node->name); + continue; + } + + desc[j].ops = &exynos_ppmu_ops; + desc[j].driver_data = info; + + of_property_read_string(node, "event-name", &desc[j].name); + + j++; + + of_node_put(node); + } + info->desc = desc; + + of_node_put(events_np); + + return 0; +} + +static int exynos_ppmu_parse_dt(struct exynos_ppmu *info) +{ + struct device *dev = info->dev; + struct device_node *np = dev->of_node; + int ret = 0; + + if (!np) { + dev_err(dev, "failed to find devicetree node\n"); + return -EINVAL; + } + + /* Maps the memory mapped IO to control PPMU register */ + info->ppmu.base = of_iomap(np, 0); + if (IS_ERR_OR_NULL(info->ppmu.base)) { + dev_err(dev, "failed to map memory region\n"); + return -ENOMEM; + } + + info->ppmu.clk = devm_clk_get(dev, "ppmu"); + if (IS_ERR(info->ppmu.clk)) { + info->ppmu.clk = NULL; + dev_warn(dev, "cannot get PPMU clock\n"); + } + + ret = of_get_devfreq_events(np, info); + if (ret < 0) { + dev_err(dev, "failed to parse exynos ppmu dt node\n"); + goto err; + } + + return 0; + +err: + iounmap(info->ppmu.base); + + return ret; +} + +static int exynos_ppmu_probe(struct platform_device *pdev) +{ + struct exynos_ppmu *info; + struct devfreq_event_dev **edev; + struct devfreq_event_desc *desc; + int i, ret = 0, size; + + info = devm_kzalloc(&pdev->dev, sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + + mutex_init(&info->lock); + info->dev = &pdev->dev; + + /* Parse dt data to get resource */ + ret = exynos_ppmu_parse_dt(info); + if (ret < 0) { + dev_err(&pdev->dev, + "failed to parse devicetree for resource\n"); + return ret; + } 
+ desc = info->desc; + + size = sizeof(struct devfreq_event_dev *) * info->num_events; + info->edev = devm_kzalloc(&pdev->dev, size, GFP_KERNEL); + if (!info->edev) { + dev_err(&pdev->dev, + "failed to allocate memory for devfreq-event devices\n"); + return -ENOMEM; + } + edev = info->edev; + platform_set_drvdata(pdev, info); + + for (i = 0; i < info->num_events; i++) { + edev[i] = devm_devfreq_event_add_edev(&pdev->dev, &desc[i]); + if (IS_ERR(edev[i])) { + ret = PTR_ERR(edev[i]); + dev_err(&pdev->dev, + "failed to add devfreq-event device\n"); + goto err; + } + } + + clk_prepare_enable(info->ppmu.clk); + + return 0; +err: + iounmap(info->ppmu.base); + + return ret; +} + +static int exynos_ppmu_remove(struct platform_device *pdev) +{ + struct exynos_ppmu *info = platform_get_drvdata(pdev); + + clk_disable_unprepare(info->ppmu.clk); + iounmap(info->ppmu.base); + + return 0; +} + +static struct of_device_id exynos_ppmu_id_match[] = { + { .compatible = "samsung,exynos-ppmu", }, + { /* sentinel */ }, +}; + +static struct platform_driver exynos_ppmu_driver = { + .probe = exynos_ppmu_probe, + .remove = exynos_ppmu_remove, + .driver = { + .name = "exynos-ppmu", + .of_match_table = exynos_ppmu_id_match, + }, +}; +module_platform_driver(exynos_ppmu_driver); + +MODULE_DESCRIPTION("Exynos PPMU(Platform Performance Monitoring Unit) driver"); +MODULE_AUTHOR("Chanwoo Choi "); +MODULE_LICENSE("GPL"); diff --git a/drivers/devfreq/event/exynos-ppmu.h b/drivers/devfreq/event/exynos-ppmu.h new file mode 100644 index 000000000000..4e831d48c138 --- /dev/null +++ b/drivers/devfreq/event/exynos-ppmu.h @@ -0,0 +1,93 @@ +/* + * exynos_ppmu.h - EXYNOS PPMU header file + * + * Copyright (c) 2015 Samsung Electronics Co., Ltd. + * Author : Chanwoo Choi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation.
+ */ + +#ifndef __EXYNOS_PPMU_H__ +#define __EXYNOS_PPMU_H__ + +enum ppmu_state { + PPMU_DISABLE = 0, + PPMU_ENABLE, +}; + +enum ppmu_counter { + PPMU_PMNCNT0 = 0, + PPMU_PMNCNT1, + PPMU_PMNCNT2, + PPMU_PMNCNT3, + + PPMU_PMNCNT_MAX, +}; + +enum ppmu_event_type { + PPMU_RO_BUSY_CYCLE_CNT = 0x0, + PPMU_WO_BUSY_CYCLE_CNT = 0x1, + PPMU_RW_BUSY_CYCLE_CNT = 0x2, + PPMU_RO_REQUEST_CNT = 0x3, + PPMU_WO_REQUEST_CNT = 0x4, + PPMU_RO_DATA_CNT = 0x5, + PPMU_WO_DATA_CNT = 0x6, + PPMU_RO_LATENCY = 0x12, + PPMU_WO_LATENCY = 0x16, +}; + +enum ppmu_reg { + /* PPC control register */ + PPMU_PMNC = 0x00, + PPMU_CNTENS = 0x10, + PPMU_CNTENC = 0x20, + PPMU_INTENS = 0x30, + PPMU_INTENC = 0x40, + PPMU_FLAG = 0x50, + + /* Cycle Counter and Performance Event Counter Register */ + PPMU_CCNT = 0x100, + PPMU_PMCNT0 = 0x110, + PPMU_PMCNT1 = 0x120, + PPMU_PMCNT2 = 0x130, + PPMU_PMCNT3_HIGH = 0x140, + PPMU_PMCNT3_LOW = 0x150, + + /* Bus Event Generator */ + PPMU_BEVT0SEL = 0x1000, + PPMU_BEVT1SEL = 0x1100, + PPMU_BEVT2SEL = 0x1200, + PPMU_BEVT3SEL = 0x1300, + PPMU_COUNTER_RESET = 0x1810, + PPMU_READ_OVERFLOW_CNT = 0x1810, + PPMU_READ_UNDERFLOW_CNT = 0x1814, + PPMU_WRITE_OVERFLOW_CNT = 0x1850, + PPMU_WRITE_UNDERFLOW_CNT = 0x1854, + PPMU_READ_PENDING_CNT = 0x1880, + PPMU_WRITE_PENDING_CNT = 0x1884 +}; + +/* PMNC register */ +#define PPMU_PMNC_CC_RESET_SHIFT 2 +#define PPMU_PMNC_COUNTER_RESET_SHIFT 1 +#define PPMU_PMNC_ENABLE_SHIFT 0 +#define PPMU_PMNC_START_MODE_MASK BIT(16) +#define PPMU_PMNC_CC_DIVIDER_MASK BIT(3) +#define PPMU_PMNC_CC_RESET_MASK BIT(2) +#define PPMU_PMNC_COUNTER_RESET_MASK BIT(1) +#define PPMU_PMNC_ENABLE_MASK BIT(0) + +/* CNTENS/CNTENC/INTENS/INTENC/FLAG register */ +#define PPMU_CCNT_MASK BIT(31) +#define PPMU_PMCNT3_MASK BIT(3) +#define PPMU_PMCNT2_MASK BIT(2) +#define PPMU_PMCNT1_MASK BIT(1) +#define PPMU_PMCNT0_MASK BIT(0) + +/* PPMU_PMNCTx/PPMU_BETxSEL registers */ +#define PPMU_PMNCT(x) (PPMU_PMCNT0 + (0x10 * x)) +#define PPMU_BEVTxSEL(x) (PPMU_BEVT0SEL + (0x100 * x)) + +#endif /* __EXYNOS_PPMU_H__ */ diff --git a/include/linux/devfreq-event.h b/include/linux/devfreq-event.h new file mode 100644 index 000000000000..602fbbfcfeed --- /dev/null +++ b/include/linux/devfreq-event.h @@ -0,0 +1,196 @@ +/* + * devfreq-event: a framework to provide raw data and events of devfreq devices + * + * Copyright (C) 2014 Samsung Electronics + * Author: Chanwoo Choi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __LINUX_DEVFREQ_EVENT_H__ +#define __LINUX_DEVFREQ_EVENT_H__ + +#include + +/** + * struct devfreq_event_dev - the devfreq-event device + * + * @node : Contain the devfreq-event device that have been registered. + * @dev : the device registered by devfreq-event class. dev.parent is + * the device using devfreq-event. + * @lock : a mutex to protect accessing devfreq-event. + * @enable_count: the number of enable function have been called. + * @desc : the description for devfreq-event device. + * + * This structure contains devfreq-event device information. + */ +struct devfreq_event_dev { + struct list_head node; + + struct device dev; + struct mutex lock; + u32 enable_count; + + const struct devfreq_event_desc *desc; +}; + +/** + * struct devfreq_event_data - the devfreq-event data + * + * @load_count : load count of devfreq-event device for the given period. + * @total_count : total count of devfreq-event device for the given period. 
+ * each count may represent a clock cycle, a time unit + * (ns/us/...), or anything the device driver wants. + * Generally, utilization is load_count / total_count. + * + * This structure contains the data of devfreq-event device for polling period. + */ +struct devfreq_event_data { + unsigned long load_count; + unsigned long total_count; +}; + +/** + * struct devfreq_event_ops - the operations of devfreq-event device + * + * @enable : Enable the devfreq-event device. + * @disable : Disable the devfreq-event device. + * @reset : Reset all setting of the devfreq-event device. + * @set_event : Set the specific event type for the devfreq-event device. + * @get_event : Get the result of the devfreq-event device with specific + * event type. + * + * This structure contains devfreq-event device operations which can be + * implemented by devfreq-event device drivers. + */ +struct devfreq_event_ops { + /* Optional functions */ + int (*enable)(struct devfreq_event_dev *edev); + int (*disable)(struct devfreq_event_dev *edev); + int (*reset)(struct devfreq_event_dev *edev); + + /* Mandatory functions */ + int (*set_event)(struct devfreq_event_dev *edev); + int (*get_event)(struct devfreq_event_dev *edev, + struct devfreq_event_data *edata); +}; + +/** + * struct devfreq_event_desc - the descriptor of devfreq-event device + * + * @name : the name of devfreq-event device. + * @driver_data : the private data for devfreq-event driver. + * @ops : the operation to control devfreq-event device. + * + * Each devfreq-event device is described with this structure. + * This structure contains the various data for devfreq-event device. + */ +struct devfreq_event_desc { + const char *name; + void *driver_data; + + struct devfreq_event_ops *ops; +}; + +#if defined(CONFIG_PM_DEVFREQ_EVENT) +extern int devfreq_event_enable_edev(struct devfreq_event_dev *edev); +extern int devfreq_event_disable_edev(struct devfreq_event_dev *edev); +extern bool devfreq_event_is_enabled(struct devfreq_event_dev *edev); +extern int devfreq_event_set_event(struct devfreq_event_dev *edev); +extern int devfreq_event_get_event(struct devfreq_event_dev *edev, + struct devfreq_event_data *edata); +extern int devfreq_event_reset_event(struct devfreq_event_dev *edev); +extern struct devfreq_event_dev *devfreq_event_get_edev_by_phandle( + struct device *dev, int index); +extern int devfreq_event_get_edev_count(struct device *dev); +extern struct devfreq_event_dev *devfreq_event_add_edev(struct device *dev, + struct devfreq_event_desc *desc); +extern int devfreq_event_remove_edev(struct devfreq_event_dev *edev); +extern struct devfreq_event_dev *devm_devfreq_event_add_edev(struct device *dev, + struct devfreq_event_desc *desc); +extern void devm_devfreq_event_remove_edev(struct device *dev, + struct devfreq_event_dev *edev); +static inline void *devfreq_event_get_drvdata(struct devfreq_event_dev *edev) +{ + return edev->desc->driver_data; +} +#else +static inline int devfreq_event_enable_edev(struct devfreq_event_dev *edev) +{ + return -EINVAL; +} + +static inline int devfreq_event_disable_edev(struct devfreq_event_dev *edev) +{ + return -EINVAL; +} + +static inline bool devfreq_event_is_enabled(struct devfreq_event_dev *edev) +{ + return false; +} + +static inline int devfreq_event_set_event(struct devfreq_event_dev *edev) +{ + return -EINVAL; +} + +static inline int devfreq_event_get_event(struct devfreq_event_dev *edev, + struct devfreq_event_data *edata) +{ + return -EINVAL; +} + +static inline int devfreq_event_reset_event(struct
devfreq_event_dev *edev) +{ + return -EINVAL; +} + +static inline void *devfreq_event_get_drvdata(struct devfreq_event_dev *edev) +{ + return ERR_PTR(-EINVAL); +} + +static inline struct devfreq_event_dev *devfreq_event_get_edev_by_phandle( + struct device *dev, int index) +{ + return ERR_PTR(-EINVAL); +} + +static inline int devfreq_event_get_edev_count(struct device *dev) +{ + return -EINVAL; +} + +static inline struct devfreq_event_dev *devfreq_event_add_edev(struct device *dev, + struct devfreq_event_desc *desc) +{ + return ERR_PTR(-EINVAL); +} + +static inline int devfreq_event_remove_edev(struct devfreq_event_dev *edev) +{ + return -EINVAL; +} + +static inline struct devfreq_event_dev *devm_devfreq_event_add_edev( + struct device *dev, + struct devfreq_event_desc *desc) +{ + return ERR_PTR(-EINVAL); +} + +static inline void devm_devfreq_event_remove_edev(struct device *dev, + struct devfreq_event_dev *edev) +{ +} +#endif /* CONFIG_PM_DEVFREQ_EVENT */ + +#endif /* __LINUX_DEVFREQ_EVENT_H__ */ -- cgit v1.2.3 From e4b3d38088df6f3acd40259c8ec32c9bd3bfe3da Mon Sep 17 00:00:00 2001 From: Sylwester Nawrocki Date: Fri, 16 Jan 2015 18:30:37 +0100 Subject: phy: exynos-video-mipi: Fix regression by adding support for PMU regmap After the Exynos Power Management Unit (PMU) driver was converted to a platform device driver in commit 14fc8b93d47323561edf5d482 ("ARM: EXYNOS: Add platform driver support for Exynos PMU"), and PMU device nodes were then added to Exynos4 DTs in commit 7b9613aca42a5522d269 ("ARM: dts: add PMU syscon node for exynos4"), the mipi video phy driver started failing to probe due to overlapping memory-mapped register region resources. Now all the Exynos peripheral devices which have registers in the PMU region are supposed to use the regmap provided by the syscon driver. So support for the regmap is added in this patch; this unfortunately creates yet another indirection in that supposedly trivial driver. The additional mutex is required because a single register is used by each PHY pair (the two PHYs share bits in one register). An improvement here could be to allow a PHY instance to be created with a driver-supplied mutex, which would then be common for each PHY pair. This would eliminate one of the 3 mutexes which need to be taken in the phy_power_on/phy_power_off code path. However, I tried to keep this bug fix patch as simple as possible. This change is needed to make MIPI DSI displays and MIPI CSI-2 camera sensors work again on Exynos4 boards.
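For reference, with the regmap in place the power-on path for one PHY reduces to a mutex-protected read-modify-write of the shared MIPI_PHYn_CONTROL word. A minimal sketch of enabling a CSIS PHY (error handling omitted; "state" and "id" are the same variables __set_phy_state() below operates on, and the EXYNOS4_MIPI_PHY_* macros are the ones this patch adds):

	u32 val;

	mutex_lock(&state->mutex);
	regmap_read(state->regmap, EXYNOS4_MIPI_PHY_CONTROL(id / 2), &val);
	val |= EXYNOS4_MIPI_PHY_SRESETN;	/* de-assert the CSIS PHY reset */
	regmap_write(state->regmap, EXYNOS4_MIPI_PHY_CONTROL(id / 2), val);
	val |= EXYNOS4_MIPI_PHY_ENABLE;		/* enable bit shared by the PHY pair */
	regmap_write(state->regmap, EXYNOS4_MIPI_PHY_CONTROL(id / 2), val);
	mutex_unlock(&state->mutex);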
Cc: Pankaj Dubey Cc: Kukjin Kim Signed-off-by: Sylwester Nawrocki Signed-off-by: Kishon Vijay Abraham I --- .../devicetree/bindings/phy/samsung-phy.txt | 2 +- drivers/phy/phy-exynos-mipi-video.c | 89 ++++++++++++++-------- include/linux/mfd/syscon/exynos4-pmu.h | 21 +++++ 3 files changed, 79 insertions(+), 33 deletions(-) create mode 100644 include/linux/mfd/syscon/exynos4-pmu.h (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/phy/samsung-phy.txt b/Documentation/devicetree/bindings/phy/samsung-phy.txt index d5bad920827f..91e38cfe1f8f 100644 --- a/Documentation/devicetree/bindings/phy/samsung-phy.txt +++ b/Documentation/devicetree/bindings/phy/samsung-phy.txt @@ -3,8 +3,8 @@ Samsung S5P/EXYNOS SoC series MIPI CSIS/DSIM DPHY Required properties: - compatible : should be "samsung,s5pv210-mipi-video-phy"; -- reg : offset and length of the MIPI DPHY register set; - #phy-cells : from the generic phy bindings, must be 1; +- syscon - phandle to the PMU system controller; For "samsung,s5pv210-mipi-video-phy" compatible PHYs the second cell in the PHY specifier identifies the PHY and its meaning is as follows: diff --git a/drivers/phy/phy-exynos-mipi-video.c b/drivers/phy/phy-exynos-mipi-video.c index 943e0f88a120..f017b2f2a54e 100644 --- a/drivers/phy/phy-exynos-mipi-video.c +++ b/drivers/phy/phy-exynos-mipi-video.c @@ -12,19 +12,18 @@ #include #include #include +#include #include #include #include #include #include +#include #include +#include -/* MIPI_PHYn_CONTROL register offset: n = 0..1 */ +/* MIPI_PHYn_CONTROL reg. offset (for base address from ioremap): n = 0..1 */ #define EXYNOS_MIPI_PHY_CONTROL(n) ((n) * 4) -#define EXYNOS_MIPI_PHY_ENABLE (1 << 0) -#define EXYNOS_MIPI_PHY_SRESETN (1 << 1) -#define EXYNOS_MIPI_PHY_MRESETN (1 << 2) -#define EXYNOS_MIPI_PHY_RESET_MASK (3 << 1) enum exynos_mipi_phy_id { EXYNOS_MIPI_PHY_ID_CSIS0, @@ -38,43 +37,62 @@ enum exynos_mipi_phy_id { ((id) == EXYNOS_MIPI_PHY_ID_DSIM0 || (id) == EXYNOS_MIPI_PHY_ID_DSIM1) struct exynos_mipi_video_phy { - spinlock_t slock; struct video_phy_desc { struct phy *phy; unsigned int index; } phys[EXYNOS_MIPI_PHYS_NUM]; + spinlock_t slock; void __iomem *regs; + struct mutex mutex; + struct regmap *regmap; }; static int __set_phy_state(struct exynos_mipi_video_phy *state, enum exynos_mipi_phy_id id, unsigned int on) { + const unsigned int offset = EXYNOS4_MIPI_PHY_CONTROL(id / 2); void __iomem *addr; - u32 reg, reset; - - addr = state->regs + EXYNOS_MIPI_PHY_CONTROL(id / 2); + u32 val, reset; if (is_mipi_dsim_phy_id(id)) - reset = EXYNOS_MIPI_PHY_MRESETN; - else - reset = EXYNOS_MIPI_PHY_SRESETN; - - spin_lock(&state->slock); - reg = readl(addr); - if (on) - reg |= reset; + reset = EXYNOS4_MIPI_PHY_MRESETN; else - reg &= ~reset; - writel(reg, addr); - - /* Clear ENABLE bit only if MRESETN, SRESETN bits are not set. 
*/ - if (on) - reg |= EXYNOS_MIPI_PHY_ENABLE; - else if (!(reg & EXYNOS_MIPI_PHY_RESET_MASK)) - reg &= ~EXYNOS_MIPI_PHY_ENABLE; + reset = EXYNOS4_MIPI_PHY_SRESETN; + + if (state->regmap) { + mutex_lock(&state->mutex); + regmap_read(state->regmap, offset, &val); + if (on) + val |= reset; + else + val &= ~reset; + regmap_write(state->regmap, offset, val); + if (on) + val |= EXYNOS4_MIPI_PHY_ENABLE; + else if (!(val & EXYNOS4_MIPI_PHY_RESET_MASK)) + val &= ~EXYNOS4_MIPI_PHY_ENABLE; + regmap_write(state->regmap, offset, val); + mutex_unlock(&state->mutex); + } else { + addr = state->regs + EXYNOS_MIPI_PHY_CONTROL(id / 2); + + spin_lock(&state->slock); + val = readl(addr); + if (on) + val |= reset; + else + val &= ~reset; + writel(val, addr); + /* Clear ENABLE bit only if MRESETN, SRESETN bits are not set */ + if (on) + val |= EXYNOS4_MIPI_PHY_ENABLE; + else if (!(val & EXYNOS4_MIPI_PHY_RESET_MASK)) + val &= ~EXYNOS4_MIPI_PHY_ENABLE; + + writel(val, addr); + spin_unlock(&state->slock); + } - writel(reg, addr); - spin_unlock(&state->slock); return 0; } @@ -118,7 +136,6 @@ static int exynos_mipi_video_phy_probe(struct platform_device *pdev) { struct exynos_mipi_video_phy *state; struct device *dev = &pdev->dev; - struct resource *res; struct phy_provider *phy_provider; unsigned int i; @@ -126,14 +143,22 @@ static int exynos_mipi_video_phy_probe(struct platform_device *pdev) if (!state) return -ENOMEM; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + state->regmap = syscon_regmap_lookup_by_phandle(dev->of_node, "syscon"); + if (IS_ERR(state->regmap)) { + struct resource *res; - state->regs = devm_ioremap_resource(dev, res); - if (IS_ERR(state->regs)) - return PTR_ERR(state->regs); + dev_info(dev, "regmap lookup failed: %ld\n", + PTR_ERR(state->regmap)); + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + state->regs = devm_ioremap_resource(dev, res); + if (IS_ERR(state->regs)) + return PTR_ERR(state->regs); + } dev_set_drvdata(dev, state); spin_lock_init(&state->slock); + mutex_init(&state->mutex); for (i = 0; i < EXYNOS_MIPI_PHYS_NUM; i++) { struct phy *phy = devm_phy_create(dev, NULL, diff --git a/include/linux/mfd/syscon/exynos4-pmu.h b/include/linux/mfd/syscon/exynos4-pmu.h new file mode 100644 index 000000000000..278b1b1549e9 --- /dev/null +++ b/include/linux/mfd/syscon/exynos4-pmu.h @@ -0,0 +1,21 @@ +/* + * Copyright (C) 2015 Samsung Electronics Co., Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _LINUX_MFD_SYSCON_PMU_EXYNOS4_H_ +#define _LINUX_MFD_SYSCON_PMU_EXYNOS4_H_ + +/* Exynos4 PMU register definitions */ + +/* MIPI_PHYn_CONTROL register offset: n = 0..1 */ +#define EXYNOS4_MIPI_PHY_CONTROL(n) (0x710 + (n) * 4) +#define EXYNOS4_MIPI_PHY_ENABLE (1 << 0) +#define EXYNOS4_MIPI_PHY_SRESETN (1 << 1) +#define EXYNOS4_MIPI_PHY_MRESETN (1 << 2) +#define EXYNOS4_MIPI_PHY_RESET_MASK (3 << 1) + +#endif /* _LINUX_MFD_SYSCON_PMU_EXYNOS4_H_ */ -- cgit v1.2.3 From 38e478c4489a845a5e8baf7849c286af5fed5b66 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 8 Oct 2014 15:56:21 +0200 Subject: quota: Split ->set_xstate callback into two Split ->set_xstate callback into two callbacks - one for turning quotas on (->quota_enable) and one for turning quotas off (->quota_disable). That way we don't have to pass quotactl command into the callback which seems cleaner. 
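In terms of struct quotactl_ops, the command-multiplexed hook is replaced by two single-purpose hooks; a sketch of just the affected members (the full hunk is in include/linux/quota.h below):

	/* before: one callback, with the quotactl command passed in */
	int (*set_xstate)(struct super_block *, unsigned int, int);

	/* after: one callback per operation, flags only */
	int (*quota_enable)(struct super_block *, unsigned int);
	int (*quota_disable)(struct super_block *, unsigned int);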
Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara --- fs/quota/quota.c | 20 +++++++++++++---- fs/xfs/xfs_quotaops.c | 59 ++++++++++++++++++++++++++++++++------------------- include/linux/quota.h | 3 ++- 3 files changed, 55 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 6f3856328eea..e2ae2b99e555 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -208,15 +208,26 @@ static int quota_setquota(struct super_block *sb, int type, qid_t id, return sb->s_qcop->set_dqblk(sb, qid, &fdq); } -static int quota_setxstate(struct super_block *sb, int cmd, void __user *addr) +static int quota_enable(struct super_block *sb, void __user *addr) { __u32 flags; if (copy_from_user(&flags, addr, sizeof(flags))) return -EFAULT; - if (!sb->s_qcop->set_xstate) + if (!sb->s_qcop->quota_enable) return -ENOSYS; - return sb->s_qcop->set_xstate(sb, flags, cmd); + return sb->s_qcop->quota_enable(sb, flags); +} + +static int quota_disable(struct super_block *sb, void __user *addr) +{ + __u32 flags; + + if (copy_from_user(&flags, addr, sizeof(flags))) + return -EFAULT; + if (!sb->s_qcop->quota_disable) + return -ENOSYS; + return sb->s_qcop->quota_disable(sb, flags); } static int quota_getxstate(struct super_block *sb, void __user *addr) @@ -447,8 +458,9 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, return -ENOSYS; return sb->s_qcop->quota_sync(sb, type); case Q_XQUOTAON: + return quota_enable(sb, addr); case Q_XQUOTAOFF: - return quota_setxstate(sb, cmd, addr); + return quota_disable(sb, addr); case Q_XQUOTARM: return quota_rmxquota(sb, addr); case Q_XGETQSTAT: diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index a226203fa46a..6923905ab33d 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -64,19 +64,10 @@ xfs_fs_get_xstatev( return xfs_qm_scall_getqstatv(mp, fqs); } -STATIC int -xfs_fs_set_xstate( - struct super_block *sb, - unsigned int uflags, - int op) +static unsigned int +xfs_quota_flags(unsigned int uflags) { - struct xfs_mount *mp = XFS_M(sb); - unsigned int flags = 0; - - if (sb->s_flags & MS_RDONLY) - return -EROFS; - if (!XFS_IS_QUOTA_RUNNING(mp)) - return -ENOSYS; + unsigned int flags = 0; if (uflags & FS_QUOTA_UDQ_ACCT) flags |= XFS_UQUOTA_ACCT; @@ -91,16 +82,39 @@ xfs_fs_set_xstate( if (uflags & FS_QUOTA_PDQ_ENFD) flags |= XFS_PQUOTA_ENFD; - switch (op) { - case Q_XQUOTAON: - return xfs_qm_scall_quotaon(mp, flags); - case Q_XQUOTAOFF: - if (!XFS_IS_QUOTA_ON(mp)) - return -EINVAL; - return xfs_qm_scall_quotaoff(mp, flags); - } + return flags; +} + +STATIC int +xfs_quota_enable( + struct super_block *sb, + unsigned int uflags) +{ + struct xfs_mount *mp = XFS_M(sb); + + if (sb->s_flags & MS_RDONLY) + return -EROFS; + if (!XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + + return xfs_qm_scall_quotaon(mp, xfs_quota_flags(uflags)); +} + +STATIC int +xfs_quota_disable( + struct super_block *sb, + unsigned int uflags) +{ + struct xfs_mount *mp = XFS_M(sb); + + if (sb->s_flags & MS_RDONLY) + return -EROFS; + if (!XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + if (!XFS_IS_QUOTA_ON(mp)) + return -EINVAL; - return -EINVAL; + return xfs_qm_scall_quotaoff(mp, xfs_quota_flags(uflags)); } STATIC int @@ -166,7 +180,8 @@ xfs_fs_set_dqblk( const struct quotactl_ops xfs_quotactl_operations = { .get_xstatev = xfs_fs_get_xstatev, .get_xstate = xfs_fs_get_xstate, - .set_xstate = xfs_fs_set_xstate, + .quota_enable = xfs_quota_enable, + .quota_disable = xfs_quota_disable, .rm_xquota = xfs_fs_rm_xquota, 
.get_dqblk = xfs_fs_get_dqblk, .set_dqblk = xfs_fs_set_dqblk, diff --git a/include/linux/quota.h b/include/linux/quota.h index 227f37f463c9..4da497b807c4 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -371,13 +371,14 @@ struct quotactl_ops { int (*quota_on)(struct super_block *, int, int, struct path *); int (*quota_on_meta)(struct super_block *, int, int); int (*quota_off)(struct super_block *, int); + int (*quota_enable)(struct super_block *, unsigned int); + int (*quota_disable)(struct super_block *, unsigned int); int (*quota_sync)(struct super_block *, int); int (*get_info)(struct super_block *, int, struct if_dqinfo *); int (*set_info)(struct super_block *, int, struct if_dqinfo *); int (*get_dqblk)(struct super_block *, struct kqid, struct qc_dqblk *); int (*set_dqblk)(struct super_block *, struct kqid, struct qc_dqblk *); int (*get_xstate)(struct super_block *, struct fs_quota_stat *); - int (*set_xstate)(struct super_block *, unsigned int, int); int (*get_xstatev)(struct super_block *, struct fs_quota_statv *); int (*rm_xquota)(struct super_block *, unsigned int); }; -- cgit v1.2.3 From d3b863248577504f6eecca2a464d6ddf86b71584 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 8 Oct 2014 16:07:12 +0200 Subject: quota: Wire up ->quota_{enable,disable} callbacks into Q_QUOTA{ON,OFF} Make Q_QUOTAON / Q_QUOTAOFF quotactl call ->quota_enable / ->quota_disable callback when provided. To match current behavior of ocfs2 & ext4 we make these quotactls turn on / off quota enforcement for appropriate quota type. Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara --- fs/quota/quota.c | 31 +++++++++++++++++++++++++++---- include/linux/quotaops.h | 2 ++ 2 files changed, 29 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/quota/quota.c b/fs/quota/quota.c index e2ae2b99e555..ce78a70a596f 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -66,18 +66,43 @@ static int quota_sync_all(int type) return ret; } +unsigned int qtype_enforce_flag(int type) +{ + switch (type) { + case USRQUOTA: + return FS_QUOTA_UDQ_ENFD; + case GRPQUOTA: + return FS_QUOTA_GDQ_ENFD; + case PRJQUOTA: + return FS_QUOTA_PDQ_ENFD; + } + return 0; +} + static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id, struct path *path) { - if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_on_meta) + if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_on_meta && + !sb->s_qcop->quota_enable) return -ENOSYS; if (sb->s_qcop->quota_on_meta) return sb->s_qcop->quota_on_meta(sb, type, id); + if (sb->s_qcop->quota_enable) + return sb->s_qcop->quota_enable(sb, qtype_enforce_flag(type)); if (IS_ERR(path)) return PTR_ERR(path); return sb->s_qcop->quota_on(sb, type, id, path); } +static int quota_quotaoff(struct super_block *sb, int type) +{ + if (!sb->s_qcop->quota_off && !sb->s_qcop->quota_disable) + return -ENOSYS; + if (sb->s_qcop->quota_disable) + return sb->s_qcop->quota_disable(sb, qtype_enforce_flag(type)); + return sb->s_qcop->quota_off(sb, type); +} + static int quota_getfmt(struct super_block *sb, int type, void __user *addr) { __u32 fmt; @@ -440,9 +465,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, case Q_QUOTAON: return quota_quotaon(sb, type, cmd, id, path); case Q_QUOTAOFF: - if (!sb->s_qcop->quota_off) - return -ENOSYS; - return sb->s_qcop->quota_off(sb, type); + return quota_quotaoff(sb, type); case Q_GETFMT: return quota_getfmt(sb, type, addr); case Q_GETINFO: diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 
29e3455f7d41..ff0b665591d0 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -386,4 +386,6 @@ static inline void dquot_release_reservation_block(struct inode *inode, __dquot_free_space(inode, nr << inode->i_blkbits, DQUOT_SPACE_RESERVE); } +unsigned int qtype_enforce_flag(int type); + #endif /* _LINUX_QUOTAOPS_ */ -- cgit v1.2.3 From 3e2af67e66ff025796af1a8a1fcbb4236304f90c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 6 Oct 2014 18:40:51 +0200 Subject: quota: Add ->quota_{enable,disable} callbacks for VFS quotas Add functions which translate ->quota_enable / ->quota_disable calls into appropriate changes in VFS quota. This will enable filesystems supporting VFS quota files in system inodes to be controlled via Q_XQUOTA[ON|OFF] quotactls for better userspace compatibility. Also provide a vector for quotactl using these functions which can be used by filesystems with quota files stored in hidden system files. Signed-off-by: Jan Kara --- fs/quota/dquot.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/quotaops.h | 1 + 2 files changed, 92 insertions(+) (limited to 'include/linux') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 29eb9dc5728a..b47d0c17ea6f 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2385,6 +2385,86 @@ out: } EXPORT_SYMBOL(dquot_quota_on_mount); +static int dquot_quota_enable(struct super_block *sb, unsigned int flags) +{ + int ret; + int type; + struct quota_info *dqopt = sb_dqopt(sb); + + if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) + return -ENOSYS; + /* Accounting cannot be turned on while fs is mounted */ + flags &= ~(FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT | FS_QUOTA_PDQ_ACCT); + if (!flags) + return -EINVAL; + for (type = 0; type < MAXQUOTAS; type++) { + if (!(flags & qtype_enforce_flag(type))) + continue; + /* Can't enforce without accounting */ + if (!sb_has_quota_usage_enabled(sb, type)) + return -EINVAL; + ret = dquot_enable(dqopt->files[type], type, + dqopt->info[type].dqi_fmt_id, + DQUOT_LIMITS_ENABLED); + if (ret < 0) + goto out_err; + } + return 0; +out_err: + /* Backout enforcement enablement we already did */ + for (type--; type >= 0; type--) { + if (flags & qtype_enforce_flag(type)) + dquot_disable(sb, type, DQUOT_LIMITS_ENABLED); + } + /* Error code translation for better compatibility with XFS */ + if (ret == -EBUSY) + ret = -EEXIST; + return ret; +} + +static int dquot_quota_disable(struct super_block *sb, unsigned int flags) +{ + int ret; + int type; + struct quota_info *dqopt = sb_dqopt(sb); + + if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) + return -ENOSYS; + /* + * We don't support turning off accounting via quotactl. In principle + * quota infrastructure can do this but filesystems don't expect + * userspace to be able to do it. + */ + if (flags & + (FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT | FS_QUOTA_PDQ_ACCT)) + return -EOPNOTSUPP; + + /* Filter out limits not enabled */ + for (type = 0; type < MAXQUOTAS; type++) + if (!sb_has_quota_limits_enabled(sb, type)) + flags &= ~qtype_enforce_flag(type); + /* Nothing left? 
*/ + if (!flags) + return -EEXIST; + for (type = 0; type < MAXQUOTAS; type++) { + if (flags & qtype_enforce_flag(type)) { + ret = dquot_disable(sb, type, DQUOT_LIMITS_ENABLED); + if (ret < 0) + goto out_err; + } + } + return 0; +out_err: + /* Backout enforcement disabling we already did */ + for (type--; type >= 0; type--) { + if (flags & qtype_enforce_flag(type)) + dquot_enable(dqopt->files[type], type, + dqopt->info[type].dqi_fmt_id, + DQUOT_LIMITS_ENABLED); + } + return ret; +} + static inline qsize_t qbtos(qsize_t blocks) { return blocks << QIF_DQBLKSIZE_BITS; @@ -2614,6 +2694,17 @@ const struct quotactl_ops dquot_quotactl_ops = { }; EXPORT_SYMBOL(dquot_quotactl_ops); +const struct quotactl_ops dquot_quotactl_sysfile_ops = { + .quota_enable = dquot_quota_enable, + .quota_disable = dquot_quota_disable, + .quota_sync = dquot_quota_sync, + .get_info = dquot_get_dqinfo, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk +}; +EXPORT_SYMBOL(dquot_quotactl_sysfile_ops); + static int do_proc_dqstats(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index ff0b665591d0..df73258cca47 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -166,6 +166,7 @@ static inline bool sb_has_quota_active(struct super_block *sb, int type) */ extern const struct dquot_operations dquot_operations; extern const struct quotactl_ops dquot_quotactl_ops; +extern const struct quotactl_ops dquot_quotactl_sysfile_ops; #else -- cgit v1.2.3 From aaa3daed156ff3c6acb28c8b18028f8b57d6c91b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 8 Oct 2014 18:35:31 +0200 Subject: quota: Remove quota_on_meta callback There are no more users for quota_on_meta callback. Just remove it. Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara --- fs/quota/quota.c | 5 +---- include/linux/quota.h | 1 - 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/quota/quota.c b/fs/quota/quota.c index ce78a70a596f..d14a799c7785 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -82,11 +82,8 @@ unsigned int qtype_enforce_flag(int type) static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id, struct path *path) { - if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_on_meta && - !sb->s_qcop->quota_enable) + if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_enable) return -ENOSYS; - if (sb->s_qcop->quota_on_meta) - return sb->s_qcop->quota_on_meta(sb, type, id); if (sb->s_qcop->quota_enable) return sb->s_qcop->quota_enable(sb, qtype_enforce_flag(type)); if (IS_ERR(path)) diff --git a/include/linux/quota.h b/include/linux/quota.h index 4da497b807c4..0938159d65c8 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -369,7 +369,6 @@ struct qc_dqblk { /* Operations handling requests from userspace */ struct quotactl_ops { int (*quota_on)(struct super_block *, int, int, struct path *); - int (*quota_on_meta)(struct super_block *, int, int); int (*quota_off)(struct super_block *, int); int (*quota_enable)(struct super_block *, unsigned int); int (*quota_disable)(struct super_block *, unsigned int); -- cgit v1.2.3 From b10a08194c2b615955dfab2300331a90ae9344c7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 9 Oct 2014 16:54:13 +0200 Subject: quota: Store maximum space limit in bytes Currently maximum space limit quota format supports is in blocks however since we store space limits in bytes, this is somewhat confusing. 
So store the maximum limit in bytes as well. Also rename the field to match the new unit and related inode field to match the new naming scheme. Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara --- fs/ocfs2/quota_local.c | 4 ++-- fs/quota/dquot.c | 18 ++++-------------- fs/quota/quota_v1.c | 4 ++-- fs/quota/quota_v2.c | 10 +++++----- include/linux/quota.h | 4 ++-- 5 files changed, 15 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 55f832f553cb..89c0b2620814 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -701,8 +701,8 @@ static int ocfs2_local_read_info(struct super_block *sb, int type) /* We don't need the lock and we have to acquire quota file locks * which will later depend on this lock */ mutex_unlock(&sb_dqopt(sb)->dqio_mutex); - info->dqi_maxblimit = 0x7fffffffffffffffLL; - info->dqi_maxilimit = 0x7fffffffffffffffLL; + info->dqi_max_spc_limit = 0x7fffffffffffffffLL; + info->dqi_max_ino_limit = 0x7fffffffffffffffLL; oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS); if (!oinfo) { mlog(ML_ERROR, "failed to allocate memory for ocfs2 quota" diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index b47d0c17ea6f..0ccd4ba3a246 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2465,16 +2465,6 @@ out_err: return ret; } -static inline qsize_t qbtos(qsize_t blocks) -{ - return blocks << QIF_DQBLKSIZE_BITS; -} - -static inline qsize_t stoqb(qsize_t space) -{ - return (space + QIF_DQBLKSIZE - 1) >> QIF_DQBLKSIZE_BITS; -} - /* Generic routine for getting common part of quota structure */ static void do_get_dqblk(struct dquot *dquot, struct qc_dqblk *di) { @@ -2524,13 +2514,13 @@ static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di) return -EINVAL; if (((di->d_fieldmask & QC_SPC_SOFT) && - stoqb(di->d_spc_softlimit) > dqi->dqi_maxblimit) || + di->d_spc_softlimit > dqi->dqi_max_spc_limit) || ((di->d_fieldmask & QC_SPC_HARD) && - stoqb(di->d_spc_hardlimit) > dqi->dqi_maxblimit) || + di->d_spc_hardlimit > dqi->dqi_max_spc_limit) || ((di->d_fieldmask & QC_INO_SOFT) && - (di->d_ino_softlimit > dqi->dqi_maxilimit)) || + (di->d_ino_softlimit > dqi->dqi_max_ino_limit)) || ((di->d_fieldmask & QC_INO_HARD) && - (di->d_ino_hardlimit > dqi->dqi_maxilimit))) + (di->d_ino_hardlimit > dqi->dqi_max_ino_limit))) return -ERANGE; spin_lock(&dq_data_lock); diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c index 469c6848b322..8fe79beced5c 100644 --- a/fs/quota/quota_v1.c +++ b/fs/quota/quota_v1.c @@ -169,8 +169,8 @@ static int v1_read_file_info(struct super_block *sb, int type) } ret = 0; /* limits are stored as unsigned 32-bit data */ - dqopt->info[type].dqi_maxblimit = 0xffffffff; - dqopt->info[type].dqi_maxilimit = 0xffffffff; + dqopt->info[type].dqi_max_spc_limit = 0xffffffffULL << QUOTABLOCK_BITS; + dqopt->info[type].dqi_max_ino_limit = 0xffffffff; dqopt->info[type].dqi_igrace = dqblk.dqb_itime ? 
dqblk.dqb_itime : MAX_IQ_TIME; dqopt->info[type].dqi_bgrace = diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c index 54cac436ac72..9cb10d7197f7 100644 --- a/fs/quota/quota_v2.c +++ b/fs/quota/quota_v2.c @@ -117,12 +117,12 @@ static int v2_read_file_info(struct super_block *sb, int type) qinfo = info->dqi_priv; if (version == 0) { /* limits are stored as unsigned 32-bit data */ - info->dqi_maxblimit = 0xffffffff; - info->dqi_maxilimit = 0xffffffff; + info->dqi_max_spc_limit = 0xffffffffULL << QUOTABLOCK_BITS; + info->dqi_max_ino_limit = 0xffffffff; } else { - /* used space is stored as unsigned 64-bit value */ - info->dqi_maxblimit = 0xffffffffffffffffULL; /* 2^64-1 */ - info->dqi_maxilimit = 0xffffffffffffffffULL; + /* used space is stored as unsigned 64-bit value in bytes */ + info->dqi_max_spc_limit = 0xffffffffffffffffULL; /* 2^64-1 */ + info->dqi_max_ino_limit = 0xffffffffffffffffULL; } info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace); diff --git a/include/linux/quota.h b/include/linux/quota.h index 0938159d65c8..d534e8ed308a 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -216,8 +216,8 @@ struct mem_dqinfo { unsigned long dqi_flags; unsigned int dqi_bgrace; unsigned int dqi_igrace; - qsize_t dqi_maxblimit; - qsize_t dqi_maxilimit; + qsize_t dqi_max_spc_limit; + qsize_t dqi_max_ino_limit; void *dqi_priv; }; -- cgit v1.2.3 From 284f4902a632584e8d73cf7d9363f819adf7240c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 21 Jan 2015 11:02:13 -0500 Subject: xprtrdma: Modernize htonl and ntohl Clean up: Replace htonl and ntohl with the be32 equivalents. Signed-off-by: Chuck Lever Reviewed-by: Steve Wise Signed-off-by: Anna Schumaker --- include/linux/sunrpc/rpc_rdma.h | 9 ++++++++ include/linux/sunrpc/svc_rdma.h | 2 -- net/sunrpc/xprtrdma/rpc_rdma.c | 48 ++++++++++++++++++++++------------------- 3 files changed, 35 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/rpc_rdma.h b/include/linux/sunrpc/rpc_rdma.h index b78f16b1dea3..1578ed241c19 100644 --- a/include/linux/sunrpc/rpc_rdma.h +++ b/include/linux/sunrpc/rpc_rdma.h @@ -42,6 +42,9 @@ #include +#define RPCRDMA_VERSION 1 +#define rpcrdma_version cpu_to_be32(RPCRDMA_VERSION) + struct rpcrdma_segment { __be32 rs_handle; /* Registered memory handle */ __be32 rs_length; /* Length of the chunk in bytes */ @@ -115,4 +118,10 @@ enum rpcrdma_proc { RDMA_ERROR = 4 /* An RPC RDMA encoding error */ }; +#define rdma_msg cpu_to_be32(RDMA_MSG) +#define rdma_nomsg cpu_to_be32(RDMA_NOMSG) +#define rdma_msgp cpu_to_be32(RDMA_MSGP) +#define rdma_done cpu_to_be32(RDMA_DONE) +#define rdma_error cpu_to_be32(RDMA_ERROR) + #endif /* _LINUX_SUNRPC_RPC_RDMA_H */ diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 975da754c778..ddfe88f52219 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -63,8 +63,6 @@ extern atomic_t rdma_stat_rq_prod; extern atomic_t rdma_stat_sq_poll; extern atomic_t rdma_stat_sq_prod; -#define RPCRDMA_VERSION 1 - /* * Contexts are built when an RDMA request is created and are a * record of the resources that can be recovered when the request diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index df01d124936c..a6fb30b0a8cc 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -209,9 +209,11 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, if (cur_rchunk) { /* read */ 
cur_rchunk->rc_discrim = xdr_one; /* all read chunks have the same "position" */ - cur_rchunk->rc_position = htonl(pos); - cur_rchunk->rc_target.rs_handle = htonl(seg->mr_rkey); - cur_rchunk->rc_target.rs_length = htonl(seg->mr_len); + cur_rchunk->rc_position = cpu_to_be32(pos); + cur_rchunk->rc_target.rs_handle = + cpu_to_be32(seg->mr_rkey); + cur_rchunk->rc_target.rs_length = + cpu_to_be32(seg->mr_len); xdr_encode_hyper( (__be32 *)&cur_rchunk->rc_target.rs_offset, seg->mr_base); @@ -222,8 +224,10 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, cur_rchunk++; r_xprt->rx_stats.read_chunk_count++; } else { /* write/reply */ - cur_wchunk->wc_target.rs_handle = htonl(seg->mr_rkey); - cur_wchunk->wc_target.rs_length = htonl(seg->mr_len); + cur_wchunk->wc_target.rs_handle = + cpu_to_be32(seg->mr_rkey); + cur_wchunk->wc_target.rs_length = + cpu_to_be32(seg->mr_len); xdr_encode_hyper( (__be32 *)&cur_wchunk->wc_target.rs_offset, seg->mr_base); @@ -257,7 +261,7 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, *iptr++ = xdr_zero; /* encode a NULL reply chunk */ } else { warray->wc_discrim = xdr_one; - warray->wc_nchunks = htonl(nchunks); + warray->wc_nchunks = cpu_to_be32(nchunks); iptr = (__be32 *) cur_wchunk; if (type == rpcrdma_writech) { *iptr++ = xdr_zero; /* finish the write chunk list */ @@ -404,11 +408,11 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) /* build RDMA header in private area at front */ headerp = (struct rpcrdma_msg *) req->rl_base; - /* don't htonl XID, it's already done in request */ + /* don't byte-swap XID, it's already done in request */ headerp->rm_xid = rqst->rq_xid; - headerp->rm_vers = xdr_one; - headerp->rm_credit = htonl(r_xprt->rx_buf.rb_max_requests); - headerp->rm_type = htonl(RDMA_MSG); + headerp->rm_vers = rpcrdma_version; + headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_max_requests); + headerp->rm_type = rdma_msg; /* * Chunks needed for results? 
@@ -482,11 +486,11 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) RPCRDMA_INLINE_PAD_VALUE(rqst)); if (padlen) { - headerp->rm_type = htonl(RDMA_MSGP); + headerp->rm_type = rdma_msgp; headerp->rm_body.rm_padded.rm_align = - htonl(RPCRDMA_INLINE_PAD_VALUE(rqst)); + cpu_to_be32(RPCRDMA_INLINE_PAD_VALUE(rqst)); headerp->rm_body.rm_padded.rm_thresh = - htonl(RPCRDMA_INLINE_PAD_THRESH); + cpu_to_be32(RPCRDMA_INLINE_PAD_THRESH); headerp->rm_body.rm_padded.rm_pempty[0] = xdr_zero; headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero; headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero; @@ -570,7 +574,7 @@ rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __b unsigned int i, total_len; struct rpcrdma_write_chunk *cur_wchunk; - i = ntohl(**iptrp); /* get array count */ + i = be32_to_cpu(**iptrp); if (i > max) return -1; cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1); @@ -582,11 +586,11 @@ rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __b xdr_decode_hyper((__be32 *)&seg->rs_offset, &off); dprintk("RPC: %s: chunk %d@0x%llx:0x%x\n", __func__, - ntohl(seg->rs_length), + be32_to_cpu(seg->rs_length), (unsigned long long)off, - ntohl(seg->rs_handle)); + be32_to_cpu(seg->rs_handle)); } - total_len += ntohl(seg->rs_length); + total_len += be32_to_cpu(seg->rs_length); ++cur_wchunk; } /* check and adjust for properly terminated write chunk */ @@ -749,9 +753,9 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) goto repost; } headerp = (struct rpcrdma_msg *) rep->rr_base; - if (headerp->rm_vers != xdr_one) { + if (headerp->rm_vers != rpcrdma_version) { dprintk("RPC: %s: invalid version %d\n", - __func__, ntohl(headerp->rm_vers)); + __func__, be32_to_cpu(headerp->rm_vers)); goto repost; } @@ -793,7 +797,7 @@ repost: /* check for expected message types */ /* The order of some of these tests is important. */ switch (headerp->rm_type) { - case htonl(RDMA_MSG): + case rdma_msg: /* never expect read chunks */ /* never expect reply chunks (two ways to check) */ /* never expect write chunks without having offered RDMA */ @@ -832,7 +836,7 @@ repost: rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen); break; - case htonl(RDMA_NOMSG): + case rdma_nomsg: /* never expect read or write chunks, always reply chunks */ if (headerp->rm_body.rm_chunks[0] != xdr_zero || headerp->rm_body.rm_chunks[1] != xdr_zero || @@ -853,7 +857,7 @@ badheader: dprintk("%s: invalid rpcrdma reply header (type %d):" " chunks[012] == %d %d %d" " expected chunks <= %d\n", - __func__, ntohl(headerp->rm_type), + __func__, be32_to_cpu(headerp->rm_type), headerp->rm_body.rm_chunks[0], headerp->rm_body.rm_chunks[1], headerp->rm_body.rm_chunks[2], -- cgit v1.2.3 From f2846481b4bf758cf7c3fe8f24b35950306f1db2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 21 Jan 2015 11:02:29 -0500 Subject: xprtrdma: Clean up hdrlen Clean up: Replace naked integers with a documenting macro. 
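The naked integer in question is 28: the smallest RPC-over-RDMA header is four fixed XDR words (rm_xid, rm_vers, rm_credit, rm_type) followed by three empty chunk-list discriminators, each 4 bytes wide, hence:

	/* 4 fixed header words + 3 empty chunk lists, each a 4-byte XDR word */
	#define RPCRDMA_HDRLEN_MIN	(sizeof(__be32) * 7)	/* 7 * 4 == 28 bytes */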
Signed-off-by: Chuck Lever Reviewed-by: Steve Wise Signed-off-by: Anna Schumaker --- include/linux/sunrpc/rpc_rdma.h | 5 ++++- net/sunrpc/xprtrdma/rpc_rdma.c | 12 +++++++----- 2 files changed, 11 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/rpc_rdma.h b/include/linux/sunrpc/rpc_rdma.h index 1578ed241c19..f33c5a4d6fe4 100644 --- a/include/linux/sunrpc/rpc_rdma.h +++ b/include/linux/sunrpc/rpc_rdma.h @@ -98,7 +98,10 @@ struct rpcrdma_msg { } rm_body; }; -#define RPCRDMA_HDRLEN_MIN 28 +/* + * Smallest RPC/RDMA header: rm_xid through rm_type, then rm_nochunks + */ +#define RPCRDMA_HDRLEN_MIN (sizeof(__be32) * 7) enum rpcrdma_errcode { ERR_VERS = 1, diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 150dd7641803..dcf5ebc3d373 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -472,7 +472,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) return -EIO; } - hdrlen = 28; /*sizeof *headerp;*/ + hdrlen = RPCRDMA_HDRLEN_MIN; padlen = 0; /* @@ -748,7 +748,7 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) } return; } - if (rep->rr_len < 28) { + if (rep->rr_len < RPCRDMA_HDRLEN_MIN) { dprintk("RPC: %s: short/invalid reply\n", __func__); goto repost; } @@ -830,8 +830,9 @@ repost: } else { /* else ordinary inline */ rdmalen = 0; - iptr = (__be32 *)((unsigned char *)headerp + 28); - rep->rr_len -= 28; /*sizeof *headerp;*/ + iptr = (__be32 *)((unsigned char *)headerp + + RPCRDMA_HDRLEN_MIN); + rep->rr_len -= RPCRDMA_HDRLEN_MIN; status = rep->rr_len; } /* Fix up the rpc results for upper layer */ @@ -845,7 +846,8 @@ repost: headerp->rm_body.rm_chunks[2] != xdr_one || req->rl_nchunks == 0) goto badheader; - iptr = (__be32 *)((unsigned char *)headerp + 28); + iptr = (__be32 *)((unsigned char *)headerp + + RPCRDMA_HDRLEN_MIN); rdmalen = rpcrdma_count_chunks(rep, req->rl_nchunks, 0, &iptr); if (rdmalen < 0) goto badheader; -- cgit v1.2.3 From 74807dffcdd72bb4c0786214e8486ef9bb088156 Mon Sep 17 00:00:00 2001 From: Tero Kristo Date: Thu, 29 Jan 2015 22:25:40 +0200 Subject: clk: ti: add omap3 legacy clock data Introduces omap3 legacy clock data under clock driver. The clock data is also in new format, which makes it possible to get rid of the clk-private.h header. This patch also introduces SoC specific init functions that shall be called from the low level init. The data format used in this file has two possible evolution paths; it can either be removed completely once no longer needed, or it will be possible to retain the format and modify the TI clock driver to be a loadable module at some point. The actual path to be followed will be decided later. 
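To give an idea of the data format, each clock is a generic struct ti_clk wrapper carrying a name, a type tag and a pointer to a type-specific data struct; for example, the fixed-rate reference clock from the hunk below:

	static struct ti_clk_fixed virt_12m_ck_data = {
		.frequency = 12000000,
	};

	static struct ti_clk virt_12m_ck = {
		.name = "virt_12m_ck",
		.type = TI_CLK_FIXED,
		.data = &virt_12m_ck_data,
	};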
Signed-off-by: Tero Kristo Acked-by: Tony Lindgren Signed-off-by: Michael Turquette --- drivers/clk/ti/Makefile | 3 +- drivers/clk/ti/clk-3xxx-legacy.c | 4653 ++++++++++++++++++++++++++++++++++++++ include/linux/clk/ti.h | 12 + 3 files changed, 4667 insertions(+), 1 deletion(-) create mode 100644 drivers/clk/ti/clk-3xxx-legacy.c (limited to 'include/linux') diff --git a/drivers/clk/ti/Makefile b/drivers/clk/ti/Makefile index ed4d0aaf8916..d8a770ae508d 100644 --- a/drivers/clk/ti/Makefile +++ b/drivers/clk/ti/Makefile @@ -4,7 +4,8 @@ clk-common = dpll.o composite.o divider.o gate.o \ fixed-factor.o mux.o apll.o obj-$(CONFIG_SOC_AM33XX) += $(clk-common) clk-33xx.o obj-$(CONFIG_ARCH_OMAP2) += $(clk-common) interface.o clk-2xxx.o -obj-$(CONFIG_ARCH_OMAP3) += $(clk-common) interface.o clk-3xxx.o +obj-$(CONFIG_ARCH_OMAP3) += $(clk-common) interface.o \ + clk-3xxx.o clk-3xxx-legacy.o obj-$(CONFIG_ARCH_OMAP4) += $(clk-common) clk-44xx.o obj-$(CONFIG_SOC_OMAP5) += $(clk-common) clk-54xx.o obj-$(CONFIG_SOC_DRA7XX) += $(clk-common) clk-7xx.o \ diff --git a/drivers/clk/ti/clk-3xxx-legacy.c b/drivers/clk/ti/clk-3xxx-legacy.c new file mode 100644 index 000000000000..e0732a4c8f26 --- /dev/null +++ b/drivers/clk/ti/clk-3xxx-legacy.c @@ -0,0 +1,4653 @@ +/* + * OMAP3 Legacy clock data + * + * Copyright (C) 2014 Texas Instruments, Inc + * Tero Kristo (t-kristo@ti.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation version 2. + * + * This program is distributed "as is" WITHOUT ANY WARRANTY of any + * kind, whether express or implied; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#include +#include +#include + +#include "clock.h" + +static struct ti_clk_fixed virt_12m_ck_data = { + .frequency = 12000000, +}; + +static struct ti_clk virt_12m_ck = { + .name = "virt_12m_ck", + .type = TI_CLK_FIXED, + .data = &virt_12m_ck_data, +}; + +static struct ti_clk_fixed virt_13m_ck_data = { + .frequency = 13000000, +}; + +static struct ti_clk virt_13m_ck = { + .name = "virt_13m_ck", + .type = TI_CLK_FIXED, + .data = &virt_13m_ck_data, +}; + +static struct ti_clk_fixed virt_19200000_ck_data = { + .frequency = 19200000, +}; + +static struct ti_clk virt_19200000_ck = { + .name = "virt_19200000_ck", + .type = TI_CLK_FIXED, + .data = &virt_19200000_ck_data, +}; + +static struct ti_clk_fixed virt_26000000_ck_data = { + .frequency = 26000000, +}; + +static struct ti_clk virt_26000000_ck = { + .name = "virt_26000000_ck", + .type = TI_CLK_FIXED, + .data = &virt_26000000_ck_data, +}; + +static struct ti_clk_fixed virt_38_4m_ck_data = { + .frequency = 38400000, +}; + +static struct ti_clk virt_38_4m_ck = { + .name = "virt_38_4m_ck", + .type = TI_CLK_FIXED, + .data = &virt_38_4m_ck_data, +}; + +static struct ti_clk_fixed virt_16_8m_ck_data = { + .frequency = 16800000, +}; + +static struct ti_clk virt_16_8m_ck = { + .name = "virt_16_8m_ck", + .type = TI_CLK_FIXED, + .data = &virt_16_8m_ck_data, +}; + +static const char *osc_sys_ck_parents[] = { + "virt_12m_ck", + "virt_13m_ck", + "virt_19200000_ck", + "virt_26000000_ck", + "virt_38_4m_ck", + "virt_16_8m_ck", +}; + +static struct ti_clk_mux osc_sys_ck_data = { + .num_parents = ARRAY_SIZE(osc_sys_ck_parents), + .reg = 0xd40, + .module = TI_CLKM_PRM, + .parents = osc_sys_ck_parents, +}; + +static struct ti_clk osc_sys_ck = { + .name = "osc_sys_ck", + .type = TI_CLK_MUX, + .data = &osc_sys_ck_data, +}; + +static struct ti_clk_divider sys_ck_data = { + .parent = "osc_sys_ck", + .bit_shift = 6, + .max_div = 3, + .reg = 0x1270, + .module = TI_CLKM_PRM, + .flags = CLKF_INDEX_STARTS_AT_ONE, +}; + +static struct ti_clk sys_ck = { + .name = "sys_ck", + .type = TI_CLK_DIVIDER, + .data = &sys_ck_data, +}; + +static const char *dpll3_ck_parents[] = { + "sys_ck", + "sys_ck", +}; + +static struct ti_clk_dpll dpll3_ck_data = { + .num_parents = ARRAY_SIZE(dpll3_ck_parents), + .control_reg = 0xd00, + .idlest_reg = 0xd20, + .mult_div1_reg = 0xd40, + .autoidle_reg = 0xd30, + .module = TI_CLKM_CM, + .parents = dpll3_ck_parents, + .flags = CLKF_CORE, + .freqsel_mask = 0xf0, + .div1_mask = 0x7f00, + .idlest_mask = 0x1, + .auto_recal_bit = 0x3, + .max_divider = 0x80, + .min_divider = 0x1, + .recal_en_bit = 0x5, + .max_multiplier = 0x7ff, + .enable_mask = 0x7, + .mult_mask = 0x7ff0000, + .recal_st_bit = 0x5, + .autoidle_mask = 0x7, +}; + +static struct ti_clk dpll3_ck = { + .name = "dpll3_ck", + .clkdm_name = "dpll3_clkdm", + .type = TI_CLK_DPLL, + .data = &dpll3_ck_data, +}; + +static struct ti_clk_divider dpll3_m2_ck_data = { + .parent = "dpll3_ck", + .bit_shift = 27, + .max_div = 31, + .reg = 0xd40, + .module = TI_CLKM_CM, + .flags = CLKF_INDEX_STARTS_AT_ONE, +}; + +static struct ti_clk dpll3_m2_ck = { + .name = "dpll3_m2_ck", + .type = TI_CLK_DIVIDER, + .data = &dpll3_m2_ck_data, +}; + +static struct ti_clk_fixed_factor core_ck_data = { + .parent = "dpll3_m2_ck", + .div = 1, + .mult = 1, +}; + +static struct ti_clk core_ck = { + .name = "core_ck", + .type = TI_CLK_FIXED_FACTOR, + .data = &core_ck_data, +}; + +static struct ti_clk_divider l3_ick_data = { + .parent = "core_ck", + .max_div = 3, + .reg = 0xa40, + .module = TI_CLKM_CM, + .flags = 
CLKF_INDEX_STARTS_AT_ONE,
+};
+
+static struct ti_clk l3_ick = {
+	.name = "l3_ick",
+	.type = TI_CLK_DIVIDER,
+	.data = &l3_ick_data,
+};
+
+static struct ti_clk_fixed_factor security_l3_ick_data = {
+	.parent = "l3_ick",
+	.div = 1,
+	.mult = 1,
+};
+
+static struct ti_clk security_l3_ick = {
+	.name = "security_l3_ick",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &security_l3_ick_data,
+};
+
+static struct ti_clk_fixed_factor wkup_l4_ick_data = {
+	.parent = "sys_ck",
+	.div = 1,
+	.mult = 1,
+};
+
+static struct ti_clk wkup_l4_ick = {
+	.name = "wkup_l4_ick",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &wkup_l4_ick_data,
+};
+
+static struct ti_clk_gate usim_ick_data = {
+	.parent = "wkup_l4_ick",
+	.bit_shift = 9,
+	.reg = 0xc10,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_OMAP3 | CLKF_INTERFACE,
+};
+
+static struct ti_clk usim_ick = {
+	.name = "usim_ick",
+	.clkdm_name = "wkup_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &usim_ick_data,
+};
+
+static struct ti_clk_gate dss2_alwon_fck_data = {
+	.parent = "sys_ck",
+	.bit_shift = 1,
+	.reg = 0xe00,
+	.module = TI_CLKM_CM,
+};
+
+static struct ti_clk dss2_alwon_fck = {
+	.name = "dss2_alwon_fck",
+	.clkdm_name = "dss_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &dss2_alwon_fck_data,
+};
+
+static struct ti_clk_divider l4_ick_data = {
+	.parent = "l3_ick",
+	.bit_shift = 2,
+	.max_div = 3,
+	.reg = 0xa40,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_INDEX_STARTS_AT_ONE,
+};
+
+static struct ti_clk l4_ick = {
+	.name = "l4_ick",
+	.type = TI_CLK_DIVIDER,
+	.data = &l4_ick_data,
+};
+
+static struct ti_clk_fixed_factor core_l4_ick_data = {
+	.parent = "l4_ick",
+	.div = 1,
+	.mult = 1,
+};
+
+static struct ti_clk core_l4_ick = {
+	.name = "core_l4_ick",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &core_l4_ick_data,
+};
+
+static struct ti_clk_gate mmchs2_ick_data = {
+	.parent = "core_l4_ick",
+	.bit_shift = 25,
+	.reg = 0xa10,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_OMAP3 | CLKF_INTERFACE,
+};
+
+static struct ti_clk mmchs2_ick = {
+	.name = "mmchs2_ick",
+	.clkdm_name = "core_l4_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &mmchs2_ick_data,
+};
+
+static const char *dpll4_ck_parents[] = {
+	"sys_ck",
+	"sys_ck",
+};
+
+static struct ti_clk_dpll dpll4_ck_data = {
+	.num_parents = ARRAY_SIZE(dpll4_ck_parents),
+	.control_reg = 0xd00,
+	.idlest_reg = 0xd20,
+	.mult_div1_reg = 0xd44,
+	.autoidle_reg = 0xd30,
+	.module = TI_CLKM_CM,
+	.parents = dpll4_ck_parents,
+	.flags = CLKF_PER,
+	.freqsel_mask = 0xf00000,
+	.modes = 0x82,
+	.div1_mask = 0x7f,
+	.idlest_mask = 0x2,
+	.auto_recal_bit = 0x13,
+	.max_divider = 0x80,
+	.min_divider = 0x1,
+	.recal_en_bit = 0x6,
+	.max_multiplier = 0x7ff,
+	.enable_mask = 0x70000,
+	.mult_mask = 0x7ff00,
+	.recal_st_bit = 0x6,
+	.autoidle_mask = 0x38,
+};
+
+static struct ti_clk dpll4_ck = {
+	.name = "dpll4_ck",
+	.clkdm_name = "dpll4_clkdm",
+	.type = TI_CLK_DPLL,
+	.data = &dpll4_ck_data,
+};
+
+static struct ti_clk_divider dpll4_m2_ck_data = {
+	.parent = "dpll4_ck",
+	.max_div = 63,
+	.reg = 0xd48,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_INDEX_STARTS_AT_ONE,
+};
+
+static struct ti_clk dpll4_m2_ck = {
+	.name = "dpll4_m2_ck",
+	.type = TI_CLK_DIVIDER,
+	.data = &dpll4_m2_ck_data,
+};
+
+static struct ti_clk_fixed_factor dpll4_m2x2_mul_ck_data = {
+	.parent = "dpll4_m2_ck",
+	.div = 1,
+	.mult = 2,
+};
+
+static struct ti_clk dpll4_m2x2_mul_ck = {
+	.name = "dpll4_m2x2_mul_ck",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &dpll4_m2x2_mul_ck_data,
+};
+
+static struct ti_clk_gate dpll4_m2x2_ck_data = {
+	.parent = "dpll4_m2x2_mul_ck",
+	.bit_shift = 0x1b,
+	.reg = 0xd00,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_SET_BIT_TO_DISABLE,
+};
+
+static struct ti_clk dpll4_m2x2_ck = {
+	.name = "dpll4_m2x2_ck",
+	.type = TI_CLK_GATE,
+	.data = &dpll4_m2x2_ck_data,
+};
+
+static struct ti_clk_fixed_factor omap_96m_alwon_fck_data = {
+	.parent = "dpll4_m2x2_ck",
+	.div = 1,
+	.mult = 1,
+};
+
+static struct ti_clk omap_96m_alwon_fck = {
+	.name = "omap_96m_alwon_fck",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &omap_96m_alwon_fck_data,
+};
+
+static struct ti_clk_fixed_factor cm_96m_fck_data = {
+	.parent = "omap_96m_alwon_fck",
+	.div = 1,
+	.mult = 1,
+};
+
+static struct ti_clk cm_96m_fck = {
+	.name = "cm_96m_fck",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &cm_96m_fck_data,
+};
+
+static const char *omap_96m_fck_parents[] = {
+	"cm_96m_fck",
+	"sys_ck",
+};
+
+static struct ti_clk_mux omap_96m_fck_data = {
+	.bit_shift = 6,
+	.num_parents = ARRAY_SIZE(omap_96m_fck_parents),
+	.reg = 0xd40,
+	.module = TI_CLKM_CM,
+	.parents = omap_96m_fck_parents,
+};
+
+static struct ti_clk omap_96m_fck = {
+	.name = "omap_96m_fck",
+	.type = TI_CLK_MUX,
+	.data = &omap_96m_fck_data,
+};
+
+static struct ti_clk_fixed_factor core_96m_fck_data = {
+	.parent = "omap_96m_fck",
+	.div = 1,
+	.mult = 1,
+};
+
+static struct ti_clk core_96m_fck = {
+	.name = "core_96m_fck",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &core_96m_fck_data,
+};
+
+static struct ti_clk_gate mspro_fck_data = {
+	.parent = "core_96m_fck",
+	.bit_shift = 23,
+	.reg = 0xa00,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_WAIT,
+};
+
+static struct ti_clk mspro_fck = {
+	.name = "mspro_fck",
+	.clkdm_name = "core_l4_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &mspro_fck_data,
+};
+
+static struct ti_clk_gate dss_ick_3430es2_data = {
+	.parent = "l4_ick",
+	.bit_shift = 0,
+	.reg = 0xe10,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_DSS | CLKF_OMAP3 | CLKF_INTERFACE,
+};
+
+static struct ti_clk dss_ick_3430es2 = {
+	.name = "dss_ick",
+	.clkdm_name = "dss_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &dss_ick_3430es2_data,
+};
+
+static struct ti_clk_gate uart4_ick_am35xx_data = {
+	.parent = "core_l4_ick",
+	.bit_shift = 23,
+	.reg = 0xa10,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_OMAP3 | CLKF_INTERFACE,
+};
+
+static struct ti_clk uart4_ick_am35xx = {
+	.name = "uart4_ick_am35xx",
+	.clkdm_name = "core_l4_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &uart4_ick_am35xx_data,
+};
+
+static struct ti_clk_fixed_factor security_l4_ick2_data = {
+	.parent = "l4_ick",
+	.div = 1,
+	.mult = 1,
+};
+
+static struct ti_clk security_l4_ick2 = {
+	.name = "security_l4_ick2",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &security_l4_ick2_data,
+};
+
+static struct ti_clk_gate aes1_ick_data = {
+	.parent = "security_l4_ick2",
+	.bit_shift = 3,
+	.reg = 0xa14,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_OMAP3 | CLKF_INTERFACE,
+};
+
+static struct ti_clk aes1_ick = {
+	.name = "aes1_ick",
+	.type = TI_CLK_GATE,
+	.data = &aes1_ick_data,
+};
+
+static const char *dpll5_ck_parents[] = {
+	"sys_ck",
+	"sys_ck",
+};
+
+static struct ti_clk_dpll dpll5_ck_data = {
+	.num_parents = ARRAY_SIZE(dpll5_ck_parents),
+	.control_reg = 0xd04,
+	.idlest_reg = 0xd24,
+	.mult_div1_reg = 0xd4c,
+	.autoidle_reg = 0xd34,
+	.module = TI_CLKM_CM,
+	.parents = dpll5_ck_parents,
+	.freqsel_mask = 0xf0,
+	.modes = 0x82,
+	.div1_mask = 0x7f,
+	.idlest_mask = 0x1,
+	.auto_recal_bit = 0x3,
+	.max_divider = 0x80,
+	.min_divider = 0x1,
+	.recal_en_bit = 0x19,
+	.max_multiplier = 0x7ff,
+	.enable_mask = 0x7,
+	.mult_mask = 0x7ff00,
+	.recal_st_bit = 0x19,
+	.autoidle_mask = 0x7,
+};
+
+static struct ti_clk dpll5_ck = {
+	.name = "dpll5_ck",
+	.clkdm_name = "dpll5_clkdm",
+	.type = TI_CLK_DPLL,
+	.data = &dpll5_ck_data,
+};
+
+static struct ti_clk_divider dpll5_m2_ck_data = {
+	.parent = "dpll5_ck",
+	.max_div = 31,
+	.reg = 0xd50,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_INDEX_STARTS_AT_ONE,
+};
+
+static struct ti_clk dpll5_m2_ck = {
+	.name = "dpll5_m2_ck",
+	.type = TI_CLK_DIVIDER,
+	.data = &dpll5_m2_ck_data,
+};
+
+static struct ti_clk_gate usbhost_120m_fck_data = {
+	.parent = "dpll5_m2_ck",
+	.bit_shift = 1,
+	.reg = 0x1400,
+	.module = TI_CLKM_CM,
+};
+
+static struct ti_clk usbhost_120m_fck = {
+	.name = "usbhost_120m_fck",
+	.clkdm_name = "usbhost_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &usbhost_120m_fck_data,
+};
+
+static struct ti_clk_fixed_factor cm_96m_d2_fck_data = {
+	.parent = "cm_96m_fck",
+	.div = 2,
+	.mult = 1,
+};
+
+static struct ti_clk cm_96m_d2_fck = {
+	.name = "cm_96m_d2_fck",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &cm_96m_d2_fck_data,
+};
+
+static struct ti_clk_fixed sys_altclk_data = {
+	.frequency = 0x0,
+};
+
+static struct ti_clk sys_altclk = {
+	.name = "sys_altclk",
+	.type = TI_CLK_FIXED,
+	.data = &sys_altclk_data,
+};
+
+static const char *omap_48m_fck_parents[] = {
+	"cm_96m_d2_fck",
+	"sys_altclk",
+};
+
+static struct ti_clk_mux omap_48m_fck_data = {
+	.bit_shift = 3,
+	.num_parents = ARRAY_SIZE(omap_48m_fck_parents),
+	.reg = 0xd40,
+	.module = TI_CLKM_CM,
+	.parents = omap_48m_fck_parents,
+};
+
+static struct ti_clk omap_48m_fck = {
+	.name = "omap_48m_fck",
+	.type = TI_CLK_MUX,
+	.data = &omap_48m_fck_data,
+};
+
+static struct ti_clk_fixed_factor core_48m_fck_data = {
+	.parent = "omap_48m_fck",
+	.div = 1,
+	.mult = 1,
+};
+
+static struct ti_clk core_48m_fck = {
+	.name = "core_48m_fck",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &core_48m_fck_data,
+};
+
+static struct ti_clk_fixed mcbsp_clks_data = {
+	.frequency = 0x0,
+};
+
+static struct ti_clk mcbsp_clks = {
+	.name = "mcbsp_clks",
+	.type = TI_CLK_FIXED,
+	.data = &mcbsp_clks_data,
+};
+
+static struct ti_clk_gate mcbsp2_gate_fck_data = {
+	.parent = "mcbsp_clks",
+	.bit_shift = 0,
+	.reg = 0x1000,
+	.module = TI_CLKM_CM,
+};
+
+static struct ti_clk_fixed_factor per_96m_fck_data = {
+	.parent = "omap_96m_alwon_fck",
+	.div = 1,
+	.mult = 1,
+};
+
+static struct ti_clk per_96m_fck = {
+	.name = "per_96m_fck",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &per_96m_fck_data,
+};
+
+static const char *mcbsp2_mux_fck_parents[] = {
+	"per_96m_fck",
+	"mcbsp_clks",
+};
+
+static struct ti_clk_mux mcbsp2_mux_fck_data = {
+	.bit_shift = 6,
+	.num_parents = ARRAY_SIZE(mcbsp2_mux_fck_parents),
+	.reg = 0x274,
+	.module = TI_CLKM_SCRM,
+	.parents = mcbsp2_mux_fck_parents,
+};
+
+static struct ti_clk_composite mcbsp2_fck_data = {
+	.mux = &mcbsp2_mux_fck_data,
+	.gate = &mcbsp2_gate_fck_data,
+};
+
+static struct ti_clk mcbsp2_fck = {
+	.name = "mcbsp2_fck",
+	.type = TI_CLK_COMPOSITE,
+	.data = &mcbsp2_fck_data,
+};
+
+static struct ti_clk_fixed_factor dpll3_m2x2_ck_data = {
+	.parent = "dpll3_m2_ck",
+	.div = 1,
+	.mult = 2,
+};
+
+static struct ti_clk dpll3_m2x2_ck = {
+	.name = "dpll3_m2x2_ck",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &dpll3_m2x2_ck_data,
+};
+
+static struct ti_clk_fixed_factor corex2_fck_data = {
+	.parent = "dpll3_m2x2_ck",
+	.div = 1,
+	.mult = 1,
+};
+
+static struct ti_clk corex2_fck = {
+	.name = "corex2_fck",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &corex2_fck_data,
+};
+
+static struct ti_clk_gate ssi_ssr_gate_fck_3430es1_data = {
+	.parent = "corex2_fck",
+	.bit_shift = 0,
+	.reg = 0xa00,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_NO_WAIT,
+};
+
+static int ssi_ssr_div_fck_3430es1_divs[] = {
+	0,
+	1,
+	2,
+	3,
+	4,
+	0,
+	6,
+	0,
+	8,
+};
+
+static struct ti_clk_divider ssi_ssr_div_fck_3430es1_data = {
+	.num_dividers = ARRAY_SIZE(ssi_ssr_div_fck_3430es1_divs),
+	.parent = "corex2_fck",
+	.bit_shift = 8,
+	.dividers = ssi_ssr_div_fck_3430es1_divs,
+	.reg = 0xa40,
+	.module = TI_CLKM_CM,
+};
+
+static struct ti_clk_composite ssi_ssr_fck_3430es1_data = {
+	.gate = &ssi_ssr_gate_fck_3430es1_data,
+	.divider = &ssi_ssr_div_fck_3430es1_data,
+};
+
+static struct ti_clk ssi_ssr_fck_3430es1 = {
+	.name = "ssi_ssr_fck",
+	.type = TI_CLK_COMPOSITE,
+	.data = &ssi_ssr_fck_3430es1_data,
+};
+
+static struct ti_clk_fixed_factor ssi_sst_fck_3430es1_data = {
+	.parent = "ssi_ssr_fck",
+	.div = 2,
+	.mult = 1,
+};
+
+static struct ti_clk ssi_sst_fck_3430es1 = {
+	.name = "ssi_sst_fck",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &ssi_sst_fck_3430es1_data,
+};
+
+static struct ti_clk_fixed omap_32k_fck_data = {
+	.frequency = 32768,
+};
+
+static struct ti_clk omap_32k_fck = {
+	.name = "omap_32k_fck",
+	.type = TI_CLK_FIXED,
+	.data = &omap_32k_fck_data,
+};
+
+static struct ti_clk_fixed_factor per_32k_alwon_fck_data = {
+	.parent = "omap_32k_fck",
+	.div = 1,
+	.mult = 1,
+};
+
+static struct ti_clk per_32k_alwon_fck = {
+	.name = "per_32k_alwon_fck",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &per_32k_alwon_fck_data,
+};
+
+static struct ti_clk_gate gpio5_dbck_data = {
+	.parent = "per_32k_alwon_fck",
+	.bit_shift = 16,
+	.reg = 0x1000,
+	.module = TI_CLKM_CM,
+};
+
+static struct ti_clk gpio5_dbck = {
+	.name = "gpio5_dbck",
+	.clkdm_name = "per_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &gpio5_dbck_data,
+};
+
+static struct ti_clk_gate gpt1_ick_data = {
+	.parent = "wkup_l4_ick",
+	.bit_shift = 0,
+	.reg = 0xc10,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_OMAP3 | CLKF_INTERFACE,
+};
+
+static struct ti_clk gpt1_ick = {
+	.name = "gpt1_ick",
+	.clkdm_name = "wkup_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &gpt1_ick_data,
+};
+
+static struct ti_clk_gate mcspi3_fck_data = {
+	.parent = "core_48m_fck",
+	.bit_shift = 20,
+	.reg = 0xa00,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_WAIT,
+};
+
+static struct ti_clk mcspi3_fck = {
+	.name = "mcspi3_fck",
+	.clkdm_name = "core_l4_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &mcspi3_fck_data,
+};
+
+static struct ti_clk_gate gpt2_gate_fck_data = {
+	.parent = "sys_ck",
+	.bit_shift = 3,
+	.reg = 0x1000,
+	.module = TI_CLKM_CM,
+};
+
+static const char *gpt2_mux_fck_parents[] = {
+	"omap_32k_fck",
+	"sys_ck",
+};
+
+static struct ti_clk_mux gpt2_mux_fck_data = {
+	.num_parents = ARRAY_SIZE(gpt2_mux_fck_parents),
+	.reg = 0x1040,
+	.module = TI_CLKM_CM,
+	.parents = gpt2_mux_fck_parents,
+};
+
+static struct ti_clk_composite gpt2_fck_data = {
+	.mux = &gpt2_mux_fck_data,
+	.gate = &gpt2_gate_fck_data,
+};
+
+static struct ti_clk gpt2_fck = {
+	.name = "gpt2_fck",
+	.type = TI_CLK_COMPOSITE,
+	.data = &gpt2_fck_data,
+};
+
+static struct ti_clk_gate gpt10_ick_data = {
+	.parent = "core_l4_ick",
+	.bit_shift = 11,
+	.reg = 0xa10,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_OMAP3 | CLKF_INTERFACE,
+};
+
+static struct ti_clk gpt10_ick = {
+	.name = "gpt10_ick",
+	.clkdm_name = "core_l4_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &gpt10_ick_data,
+};
+
+static struct ti_clk_gate uart2_fck_data = {
+	.parent = "core_48m_fck",
+	.bit_shift = 14,
+	.reg = 0xa00,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_WAIT,
+};
+
+static struct ti_clk uart2_fck = {
+	.name = "uart2_fck",
+	.clkdm_name = "core_l4_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &uart2_fck_data,
+};
+
+static struct ti_clk_fixed_factor sr_l4_ick_data = {
+	.parent = "l4_ick",
+	.div = 1,
+	.mult = 1,
+};
+
+static struct ti_clk sr_l4_ick = {
+	.name = "sr_l4_ick",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &sr_l4_ick_data,
+};
+
+static struct ti_clk_fixed_factor omap_96m_d8_fck_data = {
+	.parent = "omap_96m_fck",
+	.div = 8,
+	.mult = 1,
+};
+
+static struct ti_clk omap_96m_d8_fck = {
+	.name = "omap_96m_d8_fck",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &omap_96m_d8_fck_data,
+};
+
+static struct ti_clk_divider dpll4_m5_ck_data = {
+	.parent = "dpll4_ck",
+	.max_div = 63,
+	.reg = 0xf40,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_INDEX_STARTS_AT_ONE,
+};
+
+static struct ti_clk dpll4_m5_ck = {
+	.name = "dpll4_m5_ck",
+	.type = TI_CLK_DIVIDER,
+	.data = &dpll4_m5_ck_data,
+};
+
+static struct ti_clk_fixed_factor dpll4_m5x2_mul_ck_data = {
+	.parent = "dpll4_m5_ck",
+	.div = 1,
+	.mult = 2,
+	.flags = CLKF_SET_RATE_PARENT,
+};
+
+static struct ti_clk dpll4_m5x2_mul_ck = {
+	.name = "dpll4_m5x2_mul_ck",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &dpll4_m5x2_mul_ck_data,
+};
+
+static struct ti_clk_gate dpll4_m5x2_ck_data = {
+	.parent = "dpll4_m5x2_mul_ck",
+	.bit_shift = 0x1e,
+	.reg = 0xd00,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_SET_BIT_TO_DISABLE,
+};
+
+static struct ti_clk dpll4_m5x2_ck = {
+	.name = "dpll4_m5x2_ck",
+	.type = TI_CLK_GATE,
+	.data = &dpll4_m5x2_ck_data,
+};
+
+static struct ti_clk_gate cam_mclk_data = {
+	.parent = "dpll4_m5x2_ck",
+	.bit_shift = 0,
+	.reg = 0xf00,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_SET_RATE_PARENT,
+};
+
+static struct ti_clk cam_mclk = {
+	.name = "cam_mclk",
+	.type = TI_CLK_GATE,
+	.data = &cam_mclk_data,
+};
+
+static struct ti_clk_gate mcbsp3_gate_fck_data = {
+	.parent = "mcbsp_clks",
+	.bit_shift = 1,
+	.reg = 0x1000,
+	.module = TI_CLKM_CM,
+};
+
+static const char *mcbsp3_mux_fck_parents[] = {
+	"per_96m_fck",
+	"mcbsp_clks",
+};
+
+static struct ti_clk_mux mcbsp3_mux_fck_data = {
+	.num_parents = ARRAY_SIZE(mcbsp3_mux_fck_parents),
+	.reg = 0x2d8,
+	.module = TI_CLKM_SCRM,
+	.parents = mcbsp3_mux_fck_parents,
+};
+
+static struct ti_clk_composite mcbsp3_fck_data = {
+	.mux = &mcbsp3_mux_fck_data,
+	.gate = &mcbsp3_gate_fck_data,
+};
+
+static struct ti_clk mcbsp3_fck = {
+	.name = "mcbsp3_fck",
+	.type = TI_CLK_COMPOSITE,
+	.data = &mcbsp3_fck_data,
+};
+
+static struct ti_clk_gate csi2_96m_fck_data = {
+	.parent = "core_96m_fck",
+	.bit_shift = 1,
+	.reg = 0xf00,
+	.module = TI_CLKM_CM,
+};
+
+static struct ti_clk csi2_96m_fck = {
+	.name = "csi2_96m_fck",
+	.clkdm_name = "cam_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &csi2_96m_fck_data,
+};
+
+static struct ti_clk_gate gpt9_gate_fck_data = {
+	.parent = "sys_ck",
+	.bit_shift = 10,
+	.reg = 0x1000,
+	.module = TI_CLKM_CM,
+};
+
+static const char *gpt9_mux_fck_parents[] = {
+	"omap_32k_fck",
+	"sys_ck",
+};
+
+static struct ti_clk_mux gpt9_mux_fck_data = {
+	.bit_shift = 7,
+	.num_parents = ARRAY_SIZE(gpt9_mux_fck_parents),
+	.reg = 0x1040,
+	.module = TI_CLKM_CM,
+	.parents = gpt9_mux_fck_parents,
+};
+
+static struct ti_clk_composite gpt9_fck_data = {
+	.mux = &gpt9_mux_fck_data,
+	.gate = &gpt9_gate_fck_data,
+};
+
+static struct ti_clk gpt9_fck = {
+	.name = "gpt9_fck",
+	.type = TI_CLK_COMPOSITE,
+	.data = &gpt9_fck_data,
+};
+
+static struct ti_clk_divider dpll3_m3_ck_data = {
+	.parent = "dpll3_ck",
+	.bit_shift = 16,
+	.max_div = 31,
+	.reg = 0x1140,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_INDEX_STARTS_AT_ONE,
+};
+
+static struct ti_clk dpll3_m3_ck = {
+	.name = "dpll3_m3_ck",
+	.type = TI_CLK_DIVIDER,
+	.data = &dpll3_m3_ck_data,
+};
+
+static struct ti_clk_fixed_factor dpll3_m3x2_mul_ck_data = {
+	.parent = "dpll3_m3_ck",
+	.div = 1,
+	.mult = 2,
+};
+
+static struct ti_clk dpll3_m3x2_mul_ck = {
+	.name = "dpll3_m3x2_mul_ck",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &dpll3_m3x2_mul_ck_data,
+};
+
+static struct ti_clk_gate sr2_fck_data = {
+	.parent = "sys_ck",
+	.bit_shift = 7,
+	.reg = 0xc00,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_WAIT,
+};
+
+static struct ti_clk sr2_fck = {
+	.name = "sr2_fck",
+	.clkdm_name = "wkup_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &sr2_fck_data,
+};
+
+static struct ti_clk_fixed pclk_ck_data = {
+	.frequency = 27000000,
+};
+
+static struct ti_clk pclk_ck = {
+	.name = "pclk_ck",
+	.type = TI_CLK_FIXED,
+	.data = &pclk_ck_data,
+};
+
+static struct ti_clk_gate wdt2_ick_data = {
+	.parent = "wkup_l4_ick",
+	.bit_shift = 5,
+	.reg = 0xc10,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_OMAP3 | CLKF_INTERFACE,
+};
+
+static struct ti_clk wdt2_ick = {
+	.name = "wdt2_ick",
+	.clkdm_name = "wkup_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &wdt2_ick_data,
+};
+
+static struct ti_clk_fixed_factor core_l3_ick_data = {
+	.parent = "l3_ick",
+	.div = 1,
+	.mult = 1,
+};
+
+static struct ti_clk core_l3_ick = {
+	.name = "core_l3_ick",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &core_l3_ick_data,
+};
+
+static struct ti_clk_gate mcspi4_fck_data = {
+	.parent = "core_48m_fck",
+	.bit_shift = 21,
+	.reg = 0xa00,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_WAIT,
+};
+
+static struct ti_clk mcspi4_fck = {
+	.name = "mcspi4_fck",
+	.clkdm_name = "core_l4_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &mcspi4_fck_data,
+};
+
+static struct ti_clk_fixed_factor per_48m_fck_data = {
+	.parent = "omap_48m_fck",
+	.div = 1,
+	.mult = 1,
+};
+
+static struct ti_clk per_48m_fck = {
+	.name = "per_48m_fck",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &per_48m_fck_data,
+};
+
+static struct ti_clk_gate uart4_fck_data = {
+	.parent = "per_48m_fck",
+	.bit_shift = 18,
+	.reg = 0x1000,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_WAIT,
+};
+
+static struct ti_clk uart4_fck = {
+	.name = "uart4_fck",
+	.clkdm_name = "per_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &uart4_fck_data,
+};
+
+static struct ti_clk_fixed_factor omap_96m_d10_fck_data = {
+	.parent = "omap_96m_fck",
+	.div = 10,
+	.mult = 1,
+};
+
+static struct ti_clk omap_96m_d10_fck = {
+	.name = "omap_96m_d10_fck",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &omap_96m_d10_fck_data,
+};
+
+static struct ti_clk_gate usim_gate_fck_data = {
+	.parent = "omap_96m_fck",
+	.bit_shift = 9,
+	.reg = 0xc00,
+	.module = TI_CLKM_CM,
+};
+
+static struct ti_clk_fixed_factor per_l4_ick_data = {
+	.parent = "l4_ick",
+	.div = 1,
+	.mult = 1,
+};
+
+static struct ti_clk per_l4_ick = {
+	.name = "per_l4_ick",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &per_l4_ick_data,
+};
+
+static struct ti_clk_gate gpt5_ick_data = {
+	.parent = "per_l4_ick",
+	.bit_shift = 6,
+	.reg = 0x1010,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_OMAP3 | CLKF_INTERFACE,
+};
+
+static struct ti_clk gpt5_ick = {
+	.name = "gpt5_ick",
+	.clkdm_name = "per_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &gpt5_ick_data,
+};
+
+static struct ti_clk_gate mcspi2_ick_data = {
+	.parent = "core_l4_ick",
+	.bit_shift = 19,
+	.reg = 0xa10,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_OMAP3 | CLKF_INTERFACE,
+};
+
+static struct ti_clk mcspi2_ick = {
+	.name = "mcspi2_ick",
+	.clkdm_name = "core_l4_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &mcspi2_ick_data,
+};
+
+static struct ti_clk_fixed_factor ssi_l4_ick_data = {
+	.parent = "l4_ick",
+	.div = 1,
+	.mult = 1,
+};
+
+static struct ti_clk ssi_l4_ick = {
+	.name = "ssi_l4_ick",
+	.clkdm_name = "core_l4_clkdm",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &ssi_l4_ick_data,
+};
+
+static struct ti_clk_gate ssi_ick_3430es1_data = {
+	.parent = "ssi_l4_ick",
+	.bit_shift = 0,
+	.reg = 0xa10,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_OMAP3 | CLKF_NO_WAIT | CLKF_INTERFACE,
+};
+
+static struct ti_clk ssi_ick_3430es1 = {
+	.name = "ssi_ick",
+	.clkdm_name = "core_l4_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &ssi_ick_3430es1_data,
+};
+
+static struct ti_clk_gate i2c2_fck_data = {
+	.parent = "core_96m_fck",
+	.bit_shift = 16,
+	.reg = 0xa00,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_WAIT,
+};
+
+static struct ti_clk i2c2_fck = {
+	.name = "i2c2_fck",
+	.clkdm_name = "core_l4_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &i2c2_fck_data,
+};
+
+static struct ti_clk_divider dpll1_fck_data = {
+	.parent = "core_ck",
+	.bit_shift = 19,
+	.max_div = 7,
+	.reg = 0x940,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_INDEX_STARTS_AT_ONE,
+};
+
+static struct ti_clk dpll1_fck = {
+	.name = "dpll1_fck",
+	.type = TI_CLK_DIVIDER,
+	.data = &dpll1_fck_data,
+};
+
+static const char *dpll1_ck_parents[] = {
+	"sys_ck",
+	"dpll1_fck",
+};
+
+static struct ti_clk_dpll dpll1_ck_data = {
+	.num_parents = ARRAY_SIZE(dpll1_ck_parents),
+	.control_reg = 0x904,
+	.idlest_reg = 0x924,
+	.mult_div1_reg = 0x940,
+	.autoidle_reg = 0x934,
+	.module = TI_CLKM_CM,
+	.parents = dpll1_ck_parents,
+	.freqsel_mask = 0xf0,
+	.modes = 0xa0,
+	.div1_mask = 0x7f,
+	.idlest_mask = 0x1,
+	.auto_recal_bit = 0x3,
+	.max_divider = 0x80,
+	.min_divider = 0x1,
+	.recal_en_bit = 0x7,
+	.max_multiplier = 0x7ff,
+	.enable_mask = 0x7,
+	.mult_mask = 0x7ff00,
+	.recal_st_bit = 0x7,
+	.autoidle_mask = 0x7,
+};
+
+static struct ti_clk dpll1_ck = {
+	.name = "dpll1_ck",
+	.clkdm_name = "dpll1_clkdm",
+	.type = TI_CLK_DPLL,
+	.data = &dpll1_ck_data,
+};
+
+static struct ti_clk_fixed secure_32k_fck_data = {
+	.frequency = 32768,
+};
+
+static struct ti_clk secure_32k_fck = {
+	.name = "secure_32k_fck",
+	.type = TI_CLK_FIXED,
+	.data = &secure_32k_fck_data,
+};
+
+static struct ti_clk_gate gpio5_ick_data = {
+	.parent = "per_l4_ick",
+	.bit_shift = 16,
+	.reg = 0x1010,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_OMAP3 | CLKF_INTERFACE,
+};
+
+static struct ti_clk gpio5_ick = {
+	.name = "gpio5_ick",
+	.clkdm_name = "per_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &gpio5_ick_data,
+};
+
+static struct ti_clk_divider dpll4_m4_ck_data = {
+	.parent = "dpll4_ck",
+	.max_div = 32,
+	.reg = 0xe40,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_INDEX_STARTS_AT_ONE,
+};
+
+static struct ti_clk dpll4_m4_ck = {
+	.name = "dpll4_m4_ck",
+	.type = TI_CLK_DIVIDER,
+	.data = &dpll4_m4_ck_data,
+};
+
+static struct ti_clk_fixed_factor dpll4_m4x2_mul_ck_data = {
+	.parent = "dpll4_m4_ck",
+	.div = 1,
+	.mult = 2,
+	.flags = CLKF_SET_RATE_PARENT,
+};
+
+static struct ti_clk dpll4_m4x2_mul_ck = {
+	.name = "dpll4_m4x2_mul_ck",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &dpll4_m4x2_mul_ck_data,
+};
+
+static struct ti_clk_gate dpll4_m4x2_ck_data = {
+	.parent = "dpll4_m4x2_mul_ck",
+	.bit_shift = 0x1d,
+	.reg = 0xd00,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_SET_RATE_PARENT | CLKF_SET_BIT_TO_DISABLE,
+};
+
+static struct ti_clk dpll4_m4x2_ck = {
+	.name = "dpll4_m4x2_ck",
+	.type = TI_CLK_GATE,
+	.data = &dpll4_m4x2_ck_data,
+};
+
+static struct ti_clk_gate dss1_alwon_fck_3430es2_data = {
+	.parent = "dpll4_m4x2_ck",
+	.bit_shift = 0,
+	.reg = 0xe00,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_DSS | CLKF_SET_RATE_PARENT,
+};
+
+static struct ti_clk dss1_alwon_fck_3430es2 = {
+	.name = "dss1_alwon_fck",
+	.clkdm_name = "dss_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &dss1_alwon_fck_3430es2_data,
+};
+
+static struct ti_clk_gate uart3_ick_data = {
+	.parent = "per_l4_ick",
+	.bit_shift = 11,
+	.reg = 0x1010,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_OMAP3 | CLKF_INTERFACE,
+};
+
+static struct ti_clk uart3_ick = {
+	.name = "uart3_ick",
+	.clkdm_name = "per_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &uart3_ick_data,
+};
+
+static struct ti_clk_divider dpll4_m3_ck_data = {
+	.parent = "dpll4_ck",
+	.bit_shift = 8,
+	.max_div = 32,
+	.reg = 0xe40,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_INDEX_STARTS_AT_ONE,
+};
+
+static struct ti_clk dpll4_m3_ck = {
+	.name = "dpll4_m3_ck",
+	.type = TI_CLK_DIVIDER,
+	.data = &dpll4_m3_ck_data,
+};
+
+static struct ti_clk_gate mcbsp3_ick_data = {
+	.parent = "per_l4_ick",
+	.bit_shift = 1,
+	.reg = 0x1010,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_OMAP3 | CLKF_INTERFACE,
+};
+
+static struct ti_clk mcbsp3_ick = {
+	.name = "mcbsp3_ick",
+	.clkdm_name = "per_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &mcbsp3_ick_data,
+};
+
+static struct ti_clk_gate gpio3_dbck_data = {
+	.parent = "per_32k_alwon_fck",
+	.bit_shift = 14,
+	.reg = 0x1000,
+	.module = TI_CLKM_CM,
+};
+
+static struct ti_clk gpio3_dbck = {
+	.name = "gpio3_dbck",
+	.clkdm_name = "per_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &gpio3_dbck_data,
+};
+
+static struct ti_clk_gate fac_ick_data = {
+	.parent = "core_l4_ick",
+	.bit_shift = 8,
+	.reg = 0xa10,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_OMAP3 | CLKF_INTERFACE,
+};
+
+static struct ti_clk fac_ick = {
+	.name = "fac_ick",
+	.clkdm_name = "core_l4_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &fac_ick_data,
+};
+
+static struct ti_clk_gate clkout2_src_gate_ck_data = {
+	.parent = "core_ck",
+	.bit_shift = 7,
+	.reg = 0xd70,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_NO_WAIT,
+};
+
+static struct ti_clk_fixed_factor dpll4_m3x2_mul_ck_data = {
+	.parent = "dpll4_m3_ck",
+	.div = 1,
+	.mult = 2,
+};
+
+static struct ti_clk dpll4_m3x2_mul_ck = {
+	.name = "dpll4_m3x2_mul_ck",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &dpll4_m3x2_mul_ck_data,
+};
+
+static struct ti_clk_gate dpll4_m3x2_ck_data = {
+	.parent = "dpll4_m3x2_mul_ck",
+	.bit_shift = 0x1c,
+	.reg = 0xd00,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_SET_BIT_TO_DISABLE,
+};
+
+static struct ti_clk dpll4_m3x2_ck = {
+	.name = "dpll4_m3x2_ck",
+	.type = TI_CLK_GATE,
+	.data = &dpll4_m3x2_ck_data,
+};
+
+static const char *omap_54m_fck_parents[] = {
+	"dpll4_m3x2_ck",
+	"sys_altclk",
+};
+
+static struct ti_clk_mux omap_54m_fck_data = {
+	.bit_shift = 5,
+	.num_parents = ARRAY_SIZE(omap_54m_fck_parents),
+	.reg = 0xd40,
+	.module = TI_CLKM_CM,
+	.parents = omap_54m_fck_parents,
+};
+
+static struct ti_clk omap_54m_fck = {
+	.name = "omap_54m_fck",
+	.type = TI_CLK_MUX,
+	.data = &omap_54m_fck_data,
+};
+
+static const char *clkout2_src_mux_ck_parents[] = {
+	"core_ck",
+	"sys_ck",
+	"cm_96m_fck",
+	"omap_54m_fck",
+};
+
+static struct ti_clk_mux clkout2_src_mux_ck_data = {
+	.num_parents = ARRAY_SIZE(clkout2_src_mux_ck_parents),
+	.reg = 0xd70,
+	.module = TI_CLKM_CM,
+	.parents = clkout2_src_mux_ck_parents,
+};
+
+static struct ti_clk_composite clkout2_src_ck_data = {
+	.mux = &clkout2_src_mux_ck_data,
+	.gate = &clkout2_src_gate_ck_data,
+};
+
+static struct ti_clk clkout2_src_ck = {
+	.name = "clkout2_src_ck",
+	.type = TI_CLK_COMPOSITE,
+	.data = &clkout2_src_ck_data,
+};
+
+static struct ti_clk_gate i2c1_fck_data = {
+	.parent = "core_96m_fck",
+	.bit_shift = 15,
+	.reg = 0xa00,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_WAIT,
+};
+
+static struct ti_clk i2c1_fck = {
+	.name = "i2c1_fck",
+	.clkdm_name = "core_l4_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &i2c1_fck_data,
+};
+
+static struct ti_clk_gate wdt3_fck_data = {
+	.parent = "per_32k_alwon_fck",
+	.bit_shift = 12,
+	.reg = 0x1000,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_WAIT,
+};
+
+static struct ti_clk wdt3_fck = {
+	.name = "wdt3_fck",
+	.clkdm_name = "per_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &wdt3_fck_data,
+};
+
+static struct ti_clk_gate gpt7_gate_fck_data = {
+	.parent = "sys_ck",
+	.bit_shift = 8,
+	.reg = 0x1000,
+	.module = TI_CLKM_CM,
+};
+
+static const char *gpt7_mux_fck_parents[] = {
+	"omap_32k_fck",
+	"sys_ck",
+};
+
+static struct ti_clk_mux gpt7_mux_fck_data = {
+	.bit_shift = 5,
+	.num_parents = ARRAY_SIZE(gpt7_mux_fck_parents),
+	.reg = 0x1040,
+	.module = TI_CLKM_CM,
+	.parents = gpt7_mux_fck_parents,
+};
+
+static struct ti_clk_composite gpt7_fck_data = {
+	.mux = &gpt7_mux_fck_data,
+	.gate = &gpt7_gate_fck_data,
+};
+
+static struct ti_clk gpt7_fck = {
+	.name = "gpt7_fck",
+	.type = TI_CLK_COMPOSITE,
+	.data = &gpt7_fck_data,
+};
+
+static struct ti_clk_gate usb_l4_gate_ick_data = {
+	.parent = "l4_ick",
+	.bit_shift = 5,
+	.reg = 0xa10,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_INTERFACE,
+};
+
+static struct ti_clk_divider usb_l4_div_ick_data = {
+	.parent = "l4_ick",
+	.bit_shift = 4,
+	.max_div = 1,
+	.reg = 0xa40,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_INDEX_STARTS_AT_ONE,
+};
+
+static struct ti_clk_composite usb_l4_ick_data = {
+	.gate = &usb_l4_gate_ick_data,
+	.divider = &usb_l4_div_ick_data,
+};
+
+static struct ti_clk usb_l4_ick = {
+	.name = "usb_l4_ick",
+	.type = TI_CLK_COMPOSITE,
+	.data = &usb_l4_ick_data,
+};
+
+static struct ti_clk_gate uart4_ick_data = {
+	.parent = "per_l4_ick",
+	.bit_shift = 18,
+	.reg = 0x1010,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_OMAP3 | CLKF_INTERFACE,
+};
+
+static struct ti_clk uart4_ick = {
+	.name = "uart4_ick",
+	.clkdm_name = "per_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &uart4_ick_data,
+};
+
+static struct ti_clk_fixed dummy_ck_data = {
+	.frequency = 0,
+};
+
+static struct ti_clk dummy_ck = {
+	.name = "dummy_ck",
+	.type = TI_CLK_FIXED,
+	.data = &dummy_ck_data,
+};
+
+static const char *gpt3_mux_fck_parents[] = {
+	"omap_32k_fck",
+	"sys_ck",
+};
+
+static struct ti_clk_mux gpt3_mux_fck_data = {
+	.bit_shift = 1,
+	.num_parents = ARRAY_SIZE(gpt3_mux_fck_parents),
+	.reg = 0x1040,
+	.module = TI_CLKM_CM,
+	.parents = gpt3_mux_fck_parents,
+};
+
+static struct ti_clk_gate gpt9_ick_data = {
+	.parent = "per_l4_ick",
+	.bit_shift = 10,
+	.reg = 0x1010,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_OMAP3 | CLKF_INTERFACE,
+};
+
+static struct ti_clk gpt9_ick = {
+	.name = "gpt9_ick",
+	.clkdm_name = "per_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &gpt9_ick_data,
+};
+
+static struct ti_clk_gate gpt10_gate_fck_data = {
+	.parent = "sys_ck",
+	.bit_shift = 11,
+	.reg = 0xa00,
+	.module = TI_CLKM_CM,
+};
+
+static struct ti_clk_gate dss_ick_3430es1_data = {
+	.parent = "l4_ick",
+	.bit_shift = 0,
+	.reg = 0xe10,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_OMAP3 | CLKF_NO_WAIT | CLKF_INTERFACE,
+};
+
+static struct ti_clk dss_ick_3430es1 = {
+	.name = "dss_ick",
+	.clkdm_name = "dss_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &dss_ick_3430es1_data,
+};
+
+static struct ti_clk_gate gpt11_ick_data = {
+	.parent = "core_l4_ick",
+	.bit_shift = 12,
+	.reg = 0xa10,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_OMAP3 | CLKF_INTERFACE,
+};
+
+static struct ti_clk gpt11_ick = {
+	.name = "gpt11_ick",
+	.clkdm_name = "core_l4_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &gpt11_ick_data,
+};
+
+static struct ti_clk_divider dpll2_fck_data = {
+	.parent = "core_ck",
+	.bit_shift = 19,
+	.max_div = 7,
+	.reg = 0x40,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_INDEX_STARTS_AT_ONE,
+};
+
+static struct ti_clk dpll2_fck = {
+	.name = "dpll2_fck",
+	.type = TI_CLK_DIVIDER,
+	.data = &dpll2_fck_data,
+};
+
+static struct ti_clk_gate uart1_fck_data = {
+	.parent = "core_48m_fck",
+	.bit_shift = 13,
+	.reg = 0xa00,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_WAIT,
+};
+
+static struct ti_clk uart1_fck = {
+	.name = "uart1_fck",
+	.clkdm_name = "core_l4_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &uart1_fck_data,
+};
+
+static struct ti_clk_gate hsotgusb_ick_3430es1_data = {
+	.parent = "core_l3_ick",
+	.bit_shift = 4,
+	.reg = 0xa10,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_OMAP3 | CLKF_NO_WAIT | CLKF_INTERFACE,
+};
+
+static struct ti_clk hsotgusb_ick_3430es1 = {
+	.name = "hsotgusb_ick_3430es1",
+	.clkdm_name = "core_l3_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &hsotgusb_ick_3430es1_data,
+};
+
+static struct ti_clk_gate gpio2_ick_data = {
+	.parent = "per_l4_ick",
+	.bit_shift = 13,
+	.reg = 0x1010,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_OMAP3 | CLKF_INTERFACE,
+};
+
+static struct ti_clk gpio2_ick = {
+	.name = "gpio2_ick",
+	.clkdm_name = "per_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &gpio2_ick_data,
+};
+
+static struct ti_clk_gate mmchs1_ick_data = {
+	.parent = "core_l4_ick",
+	.bit_shift = 24,
+	.reg = 0xa10,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_OMAP3 | CLKF_INTERFACE,
+};
+
+static struct ti_clk mmchs1_ick = {
+	.name = "mmchs1_ick",
+	.clkdm_name = "core_l4_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &mmchs1_ick_data,
+};
+
+static struct ti_clk_gate modem_fck_data = {
+	.parent = "sys_ck",
+	.bit_shift = 31,
+	.reg = 0xa00,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_OMAP3 | CLKF_INTERFACE,
+};
+
+static struct ti_clk modem_fck = {
+	.name = "modem_fck",
+	.clkdm_name = "d2d_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &modem_fck_data,
+};
+
+static struct ti_clk_gate mcbsp4_ick_data = {
+	.parent = "per_l4_ick",
+	.bit_shift = 2,
+	.reg = 0x1010,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_OMAP3 | CLKF_INTERFACE,
+};
+
+static struct ti_clk mcbsp4_ick = {
+	.name = "mcbsp4_ick",
+	.clkdm_name = "per_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &mcbsp4_ick_data,
+};
+
+static struct ti_clk_gate gpio1_ick_data = {
+	.parent = "wkup_l4_ick",
+	.bit_shift = 3,
+	.reg = 0xc10,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_OMAP3 | CLKF_INTERFACE,
+};
+
+static struct ti_clk gpio1_ick = {
+	.name = "gpio1_ick",
+	.clkdm_name = "wkup_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &gpio1_ick_data,
+};
+
+static const char *gpt6_mux_fck_parents[] = {
+	"omap_32k_fck",
+	"sys_ck",
+};
+
+static struct ti_clk_mux gpt6_mux_fck_data = {
+	.bit_shift = 4,
+	.num_parents = ARRAY_SIZE(gpt6_mux_fck_parents),
+	.reg = 0x1040,
+	.module = TI_CLKM_CM,
+	.parents = gpt6_mux_fck_parents,
+};
+
+static struct ti_clk_fixed_factor dpll1_x2_ck_data = {
+	.parent = "dpll1_ck",
+	.div = 1,
+	.mult = 2,
+};
+
+static struct ti_clk dpll1_x2_ck = {
+	.name = "dpll1_x2_ck",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &dpll1_x2_ck_data,
+};
+
+static struct ti_clk_divider dpll1_x2m2_ck_data = {
+	.parent = "dpll1_x2_ck",
+	.max_div = 31,
+	.reg = 0x944,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_INDEX_STARTS_AT_ONE,
+};
+
+static struct ti_clk dpll1_x2m2_ck = {
+	.name = "dpll1_x2m2_ck",
+	.type = TI_CLK_DIVIDER,
+	.data = &dpll1_x2m2_ck_data,
+};
+
+static struct ti_clk_fixed_factor mpu_ck_data = {
+	.parent = "dpll1_x2m2_ck",
+	.div = 1,
+	.mult = 1,
+};
+
+static struct ti_clk mpu_ck = {
+	.name = "mpu_ck",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &mpu_ck_data,
+};
+
+static struct ti_clk_divider arm_fck_data = {
+	.parent = "mpu_ck",
+	.max_div = 2,
+	.reg = 0x924,
+	.module = TI_CLKM_CM,
+};
+
+static struct ti_clk arm_fck = {
+	.name = "arm_fck",
+	.type = TI_CLK_DIVIDER,
+	.data = &arm_fck_data,
+};
+
+static struct ti_clk_fixed_factor core_d3_ck_data = {
+	.parent = "core_ck",
+	.div = 3,
+	.mult = 1,
+};
+
+static struct ti_clk core_d3_ck = {
+	.name = "core_d3_ck",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &core_d3_ck_data,
+};
+
+static struct ti_clk_gate gpt11_gate_fck_data = {
+	.parent = "sys_ck",
+	.bit_shift = 12,
+	.reg = 0xa00,
+	.module = TI_CLKM_CM,
+};
+
+static const char *gpt11_mux_fck_parents[] = {
+	"omap_32k_fck",
+	"sys_ck",
+};
+
+static struct ti_clk_mux gpt11_mux_fck_data = {
+	.bit_shift = 7,
+	.num_parents = ARRAY_SIZE(gpt11_mux_fck_parents),
+	.reg = 0xa40,
+	.module = TI_CLKM_CM,
+	.parents = gpt11_mux_fck_parents,
+};
+
+static struct ti_clk_composite gpt11_fck_data = {
+	.mux = &gpt11_mux_fck_data,
+	.gate = &gpt11_gate_fck_data,
+};
+
+static struct ti_clk gpt11_fck = {
+	.name = "gpt11_fck",
+	.type = TI_CLK_COMPOSITE,
+	.data = &gpt11_fck_data,
+};
+
+static struct ti_clk_fixed_factor core_d6_ck_data = {
+	.parent = "core_ck",
+	.div = 6,
+	.mult = 1,
+};
+
+static struct ti_clk core_d6_ck = {
+	.name = "core_d6_ck",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &core_d6_ck_data,
+};
+
+static struct ti_clk_gate uart4_fck_am35xx_data = {
+	.parent = "core_48m_fck",
+	.bit_shift = 23,
+	.reg = 0xa00,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_WAIT,
+};
+
+static struct ti_clk uart4_fck_am35xx = {
+	.name = "uart4_fck_am35xx",
+	.clkdm_name = "core_l4_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &uart4_fck_am35xx_data,
+};
+
+static struct ti_clk_gate dpll3_m3x2_ck_data = {
+	.parent = "dpll3_m3x2_mul_ck",
+	.bit_shift = 0xc,
+	.reg = 0xd00,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_SET_BIT_TO_DISABLE,
+};
+
+static struct ti_clk dpll3_m3x2_ck = {
+	.name = "dpll3_m3x2_ck",
+	.type = TI_CLK_GATE,
+	.data = &dpll3_m3x2_ck_data,
+};
+
+static struct ti_clk_fixed_factor emu_core_alwon_ck_data = {
+	.parent = "dpll3_m3x2_ck",
+	.div = 1,
+	.mult = 1,
+};
+
+static struct ti_clk emu_core_alwon_ck = {
+	.name = "emu_core_alwon_ck",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &emu_core_alwon_ck_data,
+};
+
+static struct ti_clk_divider dpll4_m6_ck_data = {
+	.parent = "dpll4_ck",
+	.bit_shift = 24,
+	.max_div = 63,
+	.reg = 0x1140,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_INDEX_STARTS_AT_ONE,
+};
+
+static struct ti_clk dpll4_m6_ck = {
+	.name = "dpll4_m6_ck",
+	.type = TI_CLK_DIVIDER,
+	.data = &dpll4_m6_ck_data,
+};
+
+static struct ti_clk_fixed_factor dpll4_m6x2_mul_ck_data = {
+	.parent = "dpll4_m6_ck",
+	.div = 1,
+	.mult = 2,
+};
+
+static struct ti_clk dpll4_m6x2_mul_ck = {
+	.name = "dpll4_m6x2_mul_ck",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &dpll4_m6x2_mul_ck_data,
+};
+
+static struct ti_clk_gate dpll4_m6x2_ck_data = {
+	.parent = "dpll4_m6x2_mul_ck",
+	.bit_shift = 0x1f,
+	.reg = 0xd00,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_SET_BIT_TO_DISABLE,
+};
+
+static struct ti_clk dpll4_m6x2_ck = {
+	.name = "dpll4_m6x2_ck",
+	.type = TI_CLK_GATE,
+	.data = &dpll4_m6x2_ck_data,
+};
+
+static struct ti_clk_fixed_factor emu_per_alwon_ck_data = {
+	.parent = "dpll4_m6x2_ck",
+	.div = 1,
+	.mult = 1,
+};
+
+static struct ti_clk emu_per_alwon_ck = {
+	.name = "emu_per_alwon_ck",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &emu_per_alwon_ck_data,
+};
+
+static struct ti_clk_fixed_factor emu_mpu_alwon_ck_data = {
+	.parent = "mpu_ck",
+	.div = 1,
+	.mult = 1,
+};
+
+static struct ti_clk emu_mpu_alwon_ck = {
+	.name = "emu_mpu_alwon_ck",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &emu_mpu_alwon_ck_data,
+};
+
+static const char *emu_src_mux_ck_parents[] = {
+	"sys_ck",
+	"emu_core_alwon_ck",
+	"emu_per_alwon_ck",
+	"emu_mpu_alwon_ck",
+};
+
+static struct ti_clk_mux emu_src_mux_ck_data = {
+	.num_parents = ARRAY_SIZE(emu_src_mux_ck_parents),
+	.reg = 0x1140,
+	.module = TI_CLKM_CM,
+	.parents = emu_src_mux_ck_parents,
+};
+
+static struct ti_clk emu_src_mux_ck = {
+	.name = "emu_src_mux_ck",
+	.type = TI_CLK_MUX,
+	.data = &emu_src_mux_ck_data,
+};
+
+static struct ti_clk_gate emu_src_ck_data = {
+	.parent = "emu_src_mux_ck",
+	.flags = CLKF_CLKDM,
+};
+
+static struct ti_clk emu_src_ck = {
+	.name = "emu_src_ck",
+	.clkdm_name = "emu_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &emu_src_ck_data,
+};
+
+static struct ti_clk_divider atclk_fck_data = {
+	.parent = "emu_src_ck",
+	.bit_shift = 4,
+	.max_div = 3,
+	.reg = 0x1140,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_INDEX_STARTS_AT_ONE,
+};
+
+static struct ti_clk atclk_fck = {
+	.name = "atclk_fck",
+	.type = TI_CLK_DIVIDER,
+	.data = &atclk_fck_data,
+};
+
+static struct ti_clk_gate ipss_ick_data = {
+	.parent = "core_l3_ick",
+	.bit_shift = 4,
+	.reg = 0xa10,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_AM35XX | CLKF_INTERFACE,
+};
+
+static struct ti_clk ipss_ick = {
+	.name = "ipss_ick",
+	.clkdm_name = "core_l3_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &ipss_ick_data,
+};
+
+static struct ti_clk_gate emac_ick_data = {
+	.parent = "ipss_ick",
+	.bit_shift = 1,
+	.reg = 0x59c,
+	.module = TI_CLKM_SCRM,
+	.flags = CLKF_AM35XX,
+};
+
+static struct ti_clk emac_ick = {
+	.name = "emac_ick",
+	.clkdm_name = "core_l3_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &emac_ick_data,
+};
+
+static struct ti_clk_gate vpfe_ick_data = {
+	.parent = "ipss_ick",
+	.bit_shift = 2,
+	.reg = 0x59c,
+	.module = TI_CLKM_SCRM,
+	.flags = CLKF_AM35XX,
+};
+
+static struct ti_clk vpfe_ick = {
+	.name = "vpfe_ick",
+	.clkdm_name = "core_l3_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &vpfe_ick_data,
+};
+
+static const char *dpll2_ck_parents[] = {
+	"sys_ck",
+	"dpll2_fck",
+};
+
+static struct ti_clk_dpll dpll2_ck_data = {
+	.num_parents = ARRAY_SIZE(dpll2_ck_parents),
+	.control_reg = 0x4,
+	.idlest_reg = 0x24,
+	.mult_div1_reg = 0x40,
+	.autoidle_reg = 0x34,
+	.module = TI_CLKM_CM,
+	.parents = dpll2_ck_parents,
+	.freqsel_mask = 0xf0,
+	.modes = 0xa2,
+	.div1_mask = 0x7f,
+	.idlest_mask = 0x1,
+	.auto_recal_bit = 0x3,
+	.max_divider = 0x80,
+	.min_divider = 0x1,
+	.recal_en_bit = 0x8,
+	.max_multiplier = 0x7ff,
+	.enable_mask = 0x7,
+	.mult_mask = 0x7ff00,
+	.recal_st_bit = 0x8,
+	.autoidle_mask = 0x7,
+};
+
+static struct ti_clk dpll2_ck = {
+	.name = "dpll2_ck",
+	.clkdm_name = "dpll2_clkdm",
+	.type = TI_CLK_DPLL,
+	.data = &dpll2_ck_data,
+};
+
+static struct ti_clk_divider dpll2_m2_ck_data = {
+	.parent = "dpll2_ck",
+	.max_div = 31,
+	.reg = 0x44,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_INDEX_STARTS_AT_ONE,
+};
+
+static struct ti_clk dpll2_m2_ck = {
+	.name = "dpll2_m2_ck",
+	.type = TI_CLK_DIVIDER,
+	.data = &dpll2_m2_ck_data,
+};
+
+static const char *mcbsp4_mux_fck_parents[] = {
+	"per_96m_fck",
+	"mcbsp_clks",
+};
+
+static struct ti_clk_mux mcbsp4_mux_fck_data = {
+	.bit_shift = 2,
+	.num_parents = ARRAY_SIZE(mcbsp4_mux_fck_parents),
+	.reg = 0x2d8,
+	.module = TI_CLKM_SCRM,
+	.parents = mcbsp4_mux_fck_parents,
+};
+
+static const char *mcbsp1_mux_fck_parents[] = {
+	"core_96m_fck",
+	"mcbsp_clks",
+};
+
+static struct ti_clk_mux mcbsp1_mux_fck_data = {
+	.bit_shift = 2,
+	.num_parents = ARRAY_SIZE(mcbsp1_mux_fck_parents),
+	.reg = 0x274,
+	.module = TI_CLKM_SCRM,
+	.parents = mcbsp1_mux_fck_parents,
+};
+
+static struct ti_clk_gate gpt8_gate_fck_data = {
+	.parent = "sys_ck",
+	.bit_shift = 9,
+	.reg = 0x1000,
+	.module = TI_CLKM_CM,
+};
+
+static struct ti_clk_gate gpt8_ick_data = {
+	.parent = "per_l4_ick",
+	.bit_shift = 9,
+	.reg = 0x1010,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_OMAP3 | CLKF_INTERFACE,
+};
+
+static struct ti_clk gpt8_ick = {
+	.name = "gpt8_ick",
+	.clkdm_name = "per_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &gpt8_ick_data,
+};
+
+static const char *gpt10_mux_fck_parents[] = {
+	"omap_32k_fck",
+	"sys_ck",
+};
+
+static struct ti_clk_mux gpt10_mux_fck_data = {
+	.bit_shift = 6,
+	.num_parents = ARRAY_SIZE(gpt10_mux_fck_parents),
+	.reg = 0xa40,
+	.module = TI_CLKM_CM,
+	.parents = gpt10_mux_fck_parents,
+};
+
+static struct ti_clk_gate mmchs3_ick_data = {
+	.parent = "core_l4_ick",
+	.bit_shift = 30,
+	.reg = 0xa10,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_OMAP3 | CLKF_INTERFACE,
+};
+
+static struct ti_clk mmchs3_ick = {
+	.name = "mmchs3_ick",
+	.clkdm_name = "core_l4_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &mmchs3_ick_data,
+};
+
+static struct ti_clk_gate gpio3_ick_data = {
+	.parent = "per_l4_ick",
+	.bit_shift = 14,
+	.reg = 0x1010,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_OMAP3 | CLKF_INTERFACE,
+};
+
+static struct ti_clk gpio3_ick = {
+	.name = "gpio3_ick",
+	.clkdm_name = "per_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &gpio3_ick_data,
+};
+
+static const char *traceclk_src_fck_parents[] = {
+	"sys_ck",
+	"emu_core_alwon_ck",
+	"emu_per_alwon_ck",
+	"emu_mpu_alwon_ck",
+};
+
+static struct ti_clk_mux traceclk_src_fck_data = {
+	.bit_shift = 2,
+	.num_parents = ARRAY_SIZE(traceclk_src_fck_parents),
+	.reg = 0x1140,
+	.module = TI_CLKM_CM,
+	.parents = traceclk_src_fck_parents,
+};
+
+static struct ti_clk traceclk_src_fck = {
+	.name = "traceclk_src_fck",
+	.type = TI_CLK_MUX,
+	.data = &traceclk_src_fck_data,
+};
+
+static struct ti_clk_divider traceclk_fck_data = {
+	.parent = "traceclk_src_fck",
+	.bit_shift = 11,
+	.max_div = 7,
+	.reg = 0x1140,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_INDEX_STARTS_AT_ONE,
+};
+
+static struct ti_clk traceclk_fck = {
+	.name = "traceclk_fck",
+	.type = TI_CLK_DIVIDER,
+	.data = &traceclk_fck_data,
+};
+
+static struct ti_clk_gate mcbsp5_gate_fck_data = {
+	.parent = "mcbsp_clks",
+	.bit_shift = 10,
+	.reg = 0xa00,
+	.module = TI_CLKM_CM,
+};
+
+static struct ti_clk_gate sad2d_ick_data = {
+	.parent = "l3_ick",
+	.bit_shift = 3,
+	.reg = 0xa10,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_OMAP3 | CLKF_INTERFACE,
+};
+
+static struct ti_clk sad2d_ick = {
+	.name = "sad2d_ick",
+	.clkdm_name = "d2d_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &sad2d_ick_data,
+};
+
+static const char *gpt1_mux_fck_parents[] = {
+	"omap_32k_fck",
+	"sys_ck",
+};
+
+static struct ti_clk_mux gpt1_mux_fck_data = {
+	.num_parents = ARRAY_SIZE(gpt1_mux_fck_parents),
+	.reg = 0xc40,
+	.module = TI_CLKM_CM,
+	.parents = gpt1_mux_fck_parents,
+};
+
+static struct ti_clk_gate hecc_ck_data = {
+	.parent = "sys_ck",
+	.bit_shift = 3,
+	.reg = 0x59c,
+	.module = TI_CLKM_SCRM,
+	.flags = CLKF_AM35XX,
+};
+
+static struct ti_clk hecc_ck = {
+	.name = "hecc_ck",
+	.clkdm_name = "core_l3_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &hecc_ck_data,
+};
+
+static struct ti_clk_gate gpt1_gate_fck_data = {
+	.parent = "sys_ck",
+	.bit_shift = 0,
+	.reg = 0xc00,
+	.module = TI_CLKM_CM,
+};
+
+static struct ti_clk_composite gpt1_fck_data = {
+	.mux = &gpt1_mux_fck_data,
+	.gate = &gpt1_gate_fck_data,
+};
+
+static struct ti_clk gpt1_fck = {
+	.name = "gpt1_fck",
+	.type = TI_CLK_COMPOSITE,
+	.data = &gpt1_fck_data,
+};
+
+static struct ti_clk_gate dpll4_m2x2_ck_omap36xx_data = {
+	.parent = "dpll4_m2x2_mul_ck",
+	.bit_shift = 0x1b,
+	.reg = 0xd00,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_HSDIV | CLKF_SET_BIT_TO_DISABLE,
+};
+
+static struct ti_clk dpll4_m2x2_ck_omap36xx = {
+	.name = "dpll4_m2x2_ck",
+	.type = TI_CLK_GATE,
+	.data = &dpll4_m2x2_ck_omap36xx_data,
+	.patch = &dpll4_m2x2_ck,
+};
+
+static struct ti_clk_divider gfx_l3_fck_data = {
+	.parent = "l3_ick",
+	.max_div = 7,
+	.reg = 0xb40,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_INDEX_STARTS_AT_ONE,
+};
+
+static struct ti_clk gfx_l3_fck = {
+	.name = "gfx_l3_fck",
+	.type = TI_CLK_DIVIDER,
+	.data = &gfx_l3_fck_data,
+};
+
+static struct ti_clk_gate gfx_cg1_ck_data = {
+	.parent = "gfx_l3_fck",
+	.bit_shift = 1,
+	.reg = 0xb00,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_WAIT,
+};
+
+static struct ti_clk gfx_cg1_ck = {
+	.name = "gfx_cg1_ck",
+	.clkdm_name = "gfx_3430es1_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &gfx_cg1_ck_data,
+};
+
+static struct ti_clk_gate mailboxes_ick_data = {
+	.parent = "core_l4_ick",
+	.bit_shift = 7,
+	.reg = 0xa10,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_OMAP3 | CLKF_INTERFACE,
+};
+
+static struct ti_clk mailboxes_ick = {
+	.name = "mailboxes_ick",
+	.clkdm_name = "core_l4_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &mailboxes_ick_data,
+};
+
+static struct ti_clk_gate sha11_ick_data = {
+	.parent = "security_l4_ick2",
+	.bit_shift = 1,
+	.reg = 0xa14,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_OMAP3 | CLKF_INTERFACE,
+};
+
+static struct ti_clk sha11_ick = {
+	.name = "sha11_ick",
+	.type = TI_CLK_GATE,
+	.data = &sha11_ick_data,
+};
+
+static struct ti_clk_gate hsotgusb_ick_am35xx_data = {
+	.parent = "ipss_ick",
+	.bit_shift = 0,
+	.reg = 0x59c,
+	.module = TI_CLKM_SCRM,
+	.flags = CLKF_AM35XX,
+};
+
+static struct ti_clk hsotgusb_ick_am35xx = {
+	.name = "hsotgusb_ick_am35xx",
+	.clkdm_name = "core_l3_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &hsotgusb_ick_am35xx_data,
+};
+
+static struct ti_clk_gate mmchs3_fck_data = {
+	.parent = "core_96m_fck",
+	.bit_shift = 30,
+	.reg = 0xa00,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_WAIT,
+};
+
+static struct ti_clk mmchs3_fck = {
+	.name = "mmchs3_fck",
+	.clkdm_name = "core_l4_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &mmchs3_fck_data,
+};
+
+static struct ti_clk_divider pclk_fck_data = {
+	.parent = "emu_src_ck",
+	.bit_shift = 8,
+	.max_div = 7,
+	.reg = 0x1140,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_INDEX_STARTS_AT_ONE,
+};
+
+static struct ti_clk pclk_fck = {
+	.name = "pclk_fck",
+	.type = TI_CLK_DIVIDER,
+	.data = &pclk_fck_data,
+};
+
+static const char *dpll4_ck_omap36xx_parents[] = {
+	"sys_ck",
+	"sys_ck",
+};
+
+static struct ti_clk_dpll dpll4_ck_omap36xx_data = {
+	.num_parents = ARRAY_SIZE(dpll4_ck_omap36xx_parents),
+	.control_reg = 0xd00,
+	.idlest_reg = 0xd20,
+	.mult_div1_reg = 0xd44,
+	.autoidle_reg = 0xd30,
+	.module = TI_CLKM_CM,
+	.parents = dpll4_ck_omap36xx_parents,
+	.modes = 0x82,
+	.div1_mask = 0x7f,
+	.idlest_mask = 0x2,
+	.auto_recal_bit = 0x13,
+	.max_divider = 0x80,
+	.min_divider = 0x1,
+	.recal_en_bit = 0x6,
+	.max_multiplier = 0xfff,
+	.enable_mask = 0x70000,
+	.mult_mask = 0xfff00,
+	.recal_st_bit = 0x6,
+	.autoidle_mask = 0x38,
+	.sddiv_mask = 0xff000000,
+	.dco_mask = 0xe00000,
+	.flags = CLKF_PER | CLKF_J_TYPE,
+};
+
+static struct ti_clk dpll4_ck_omap36xx = {
+	.name = "dpll4_ck",
+	.type = TI_CLK_DPLL,
+	.data = &dpll4_ck_omap36xx_data,
+	.patch = &dpll4_ck,
+};
+
+static struct ti_clk_gate uart3_fck_data = {
+	.parent = "per_48m_fck",
+	.bit_shift = 11,
+	.reg = 0x1000,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_WAIT,
+};
+
+static struct ti_clk uart3_fck = {
+	.name = "uart3_fck",
+	.clkdm_name = "per_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &uart3_fck_data,
+};
+
+static struct ti_clk_fixed_factor wkup_32k_fck_data = {
+	.parent = "omap_32k_fck",
+	.div = 1,
+	.mult = 1,
+};
+
+static struct ti_clk wkup_32k_fck = {
+	.name = "wkup_32k_fck",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &wkup_32k_fck_data,
+};
+
+static struct ti_clk_gate sys_clkout1_data = {
+	.parent = "osc_sys_ck",
+	.bit_shift = 7,
+	.reg = 0xd70,
+	.module = TI_CLKM_PRM,
+};
+
+static struct ti_clk sys_clkout1 = {
+	.name = "sys_clkout1",
+	.type = TI_CLK_GATE,
+	.data = &sys_clkout1_data,
+};
+
+static struct ti_clk_fixed_factor gpmc_fck_data = {
+	.parent = "core_l3_ick",
+	.div = 1,
+	.mult = 1,
+};
+
+static struct ti_clk gpmc_fck = {
+	.name = "gpmc_fck",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &gpmc_fck_data,
+};
+
+static struct ti_clk_fixed_factor dpll5_m2_d20_ck_data = {
+	.parent = "dpll5_m2_ck",
+	.div = 20,
+	.mult = 1,
+};
+
+static struct ti_clk dpll5_m2_d20_ck = {
+	.name = "dpll5_m2_d20_ck",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &dpll5_m2_d20_ck_data,
+};
+
+static struct ti_clk_gate dpll4_m5x2_ck_omap36xx_data = {
+	.parent = "dpll4_m5x2_mul_ck",
+	.bit_shift = 0x1e,
+	.reg = 0xd00,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_HSDIV | CLKF_SET_RATE_PARENT | CLKF_SET_BIT_TO_DISABLE,
+};
+
+static struct ti_clk dpll4_m5x2_ck_omap36xx = {
+	.name = "dpll4_m5x2_ck",
+	.type = TI_CLK_GATE,
+	.data = &dpll4_m5x2_ck_omap36xx_data,
+	.patch = &dpll4_m5x2_ck,
+};
+
+static struct ti_clk_gate ssi_ssr_gate_fck_3430es2_data = {
+	.parent = "corex2_fck",
+	.bit_shift = 0,
+	.reg = 0xa00,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_NO_WAIT,
+};
+
+static struct ti_clk_gate uart1_ick_data = {
+	.parent = "core_l4_ick",
+	.bit_shift = 13,
+	.reg = 0xa10,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_OMAP3 | CLKF_INTERFACE,
+};
+
+static struct ti_clk uart1_ick = {
+	.name = "uart1_ick",
+	.clkdm_name = "core_l4_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &uart1_ick_data,
+};
+
+static struct ti_clk_gate iva2_ck_data = {
+	.parent = "dpll2_m2_ck",
+	.bit_shift = 0,
+	.reg = 0x0,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_WAIT,
+};
+
+static struct ti_clk iva2_ck = {
+	.name = "iva2_ck",
+	.clkdm_name = "iva2_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &iva2_ck_data,
+};
+
+static struct ti_clk_gate pka_ick_data = {
+	.parent = "security_l3_ick",
+	.bit_shift = 4,
+	.reg = 0xa14,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_OMAP3 | CLKF_INTERFACE,
+};
+
+static struct ti_clk pka_ick = {
+	.name = "pka_ick",
+	.type = TI_CLK_GATE,
+	.data = &pka_ick_data,
+};
+
+static struct ti_clk_gate gpt12_ick_data = {
+	.parent = "wkup_l4_ick",
+	.bit_shift = 1,
+	.reg = 0xc10,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_OMAP3 | CLKF_INTERFACE,
+};
+
+static struct ti_clk gpt12_ick = {
+	.name = "gpt12_ick",
+	.clkdm_name = "wkup_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &gpt12_ick_data,
+};
+
+static const char *mcbsp5_mux_fck_parents[] = {
+	"core_96m_fck",
+	"mcbsp_clks",
+};
+
+static struct ti_clk_mux mcbsp5_mux_fck_data = {
+	.bit_shift = 4,
+	.num_parents = ARRAY_SIZE(mcbsp5_mux_fck_parents),
+	.reg = 0x2d8,
+	.module = TI_CLKM_SCRM,
+	.parents = mcbsp5_mux_fck_parents,
+};
+
+static struct ti_clk_composite mcbsp5_fck_data = {
+	.mux = &mcbsp5_mux_fck_data,
+	.gate = &mcbsp5_gate_fck_data,
+};
+
+static struct ti_clk mcbsp5_fck = {
+	.name = "mcbsp5_fck",
+	.type = TI_CLK_COMPOSITE,
+	.data = &mcbsp5_fck_data,
+};
+
+static struct ti_clk_gate usbhost_48m_fck_data = {
+	.parent = "omap_48m_fck",
+	.bit_shift = 0,
+	.reg = 0x1400,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_DSS,
+};
+
+static struct ti_clk usbhost_48m_fck = {
+	.name = "usbhost_48m_fck",
+	.clkdm_name = "usbhost_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &usbhost_48m_fck_data,
+};
+
+static struct ti_clk_gate des1_ick_data = {
+	.parent = "security_l4_ick2",
+	.bit_shift = 0,
+	.reg = 0xa14,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_OMAP3 | CLKF_INTERFACE,
+};
+
+static struct ti_clk des1_ick = {
+	.name = "des1_ick",
+	.type = TI_CLK_GATE,
+	.data = &des1_ick_data,
+};
+
+static struct ti_clk_gate sgx_gate_fck_data = {
+	.parent = "core_ck",
+	.bit_shift = 1,
+	.reg = 0xb00,
+	.module = TI_CLKM_CM,
+};
+
+static struct ti_clk_fixed_factor core_d4_ck_data = {
+	.parent = "core_ck",
+	.div = 4,
+	.mult = 1,
+};
+
+static struct ti_clk core_d4_ck = {
+	.name = "core_d4_ck",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &core_d4_ck_data,
+};
+
+static struct ti_clk_fixed_factor omap_192m_alwon_fck_data = {
+	.parent = "dpll4_m2x2_ck",
+	.div = 1,
+	.mult = 1,
+};
+
+static struct ti_clk omap_192m_alwon_fck = {
+	.name = "omap_192m_alwon_fck",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &omap_192m_alwon_fck_data,
+};
+
+static struct ti_clk_fixed_factor core_d2_ck_data = {
+	.parent = "core_ck",
+	.div = 2,
+	.mult = 1,
+};
+
+static struct ti_clk core_d2_ck = {
+	.name = "core_d2_ck",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &core_d2_ck_data,
+};
+
+static struct ti_clk_fixed_factor corex2_d3_fck_data = {
+	.parent = "corex2_fck",
+	.div = 3,
+	.mult = 1,
+};
+
+static struct ti_clk corex2_d3_fck = {
+	.name = "corex2_d3_fck",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &corex2_d3_fck_data,
+};
+
+static struct ti_clk_fixed_factor corex2_d5_fck_data = {
+	.parent = "corex2_fck",
+	.div = 5,
+	.mult = 1,
+};
+
+static struct ti_clk corex2_d5_fck = {
+	.name = "corex2_d5_fck",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &corex2_d5_fck_data,
+};
+
+static const char *sgx_mux_fck_parents[] = {
+	"core_d3_ck",
+	"core_d4_ck",
+	"core_d6_ck",
+	"cm_96m_fck",
+	"omap_192m_alwon_fck",
+	"core_d2_ck",
+	"corex2_d3_fck",
+	"corex2_d5_fck",
+};
+
+static struct ti_clk_mux sgx_mux_fck_data = {
+	.num_parents = ARRAY_SIZE(sgx_mux_fck_parents),
+	.reg = 0xb40,
+	.module = TI_CLKM_CM,
+	.parents = sgx_mux_fck_parents,
+};
+
+static struct ti_clk_composite sgx_fck_data = {
+	.mux = &sgx_mux_fck_data,
+	.gate = &sgx_gate_fck_data,
+};
+
+static struct ti_clk sgx_fck = {
+	.name = "sgx_fck",
+	.type = TI_CLK_COMPOSITE,
+	.data = &sgx_fck_data,
+};
+
+static struct ti_clk_gate mcspi1_fck_data = {
+	.parent = "core_48m_fck",
+	.bit_shift = 18,
+	.reg = 0xa00,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_WAIT,
+};
+
+static struct ti_clk mcspi1_fck = {
+	.name = "mcspi1_fck",
+	.clkdm_name = "core_l4_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &mcspi1_fck_data,
+};
+
+static struct ti_clk_gate mmchs2_fck_data = {
+	.parent = "core_96m_fck",
+	.bit_shift = 25,
+	.reg = 0xa00,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_WAIT,
+};
+
+static struct ti_clk mmchs2_fck = {
+	.name = "mmchs2_fck",
+	.clkdm_name = "core_l4_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &mmchs2_fck_data,
+};
+
+static struct ti_clk_gate mcspi2_fck_data = {
+	.parent = "core_48m_fck",
+	.bit_shift = 19,
+	.reg = 0xa00,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_WAIT,
+};
+
+static struct ti_clk mcspi2_fck = {
+	.name = "mcspi2_fck",
+	.clkdm_name = "core_l4_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &mcspi2_fck_data,
+};
+
+static struct ti_clk_gate vpfe_fck_data = {
+	.parent = "pclk_ck",
+	.bit_shift = 10,
+	.reg = 0x59c,
+	.module = TI_CLKM_SCRM,
+};
+
+static struct ti_clk vpfe_fck = {
+	.name = "vpfe_fck",
+	.type = TI_CLK_GATE,
+	.data = &vpfe_fck_data,
+};
+
+static struct ti_clk_gate gpt4_gate_fck_data = {
+	.parent = "sys_ck",
+	.bit_shift = 5,
+	.reg = 0x1000,
+	.module = TI_CLKM_CM,
+};
+
+static struct ti_clk_gate mcbsp1_gate_fck_data = {
+	.parent = "mcbsp_clks",
+	.bit_shift = 9,
+	.reg = 0xa00,
+	.module = TI_CLKM_CM,
+};
+
+static struct ti_clk_gate gpt5_gate_fck_data = {
+	.parent = "sys_ck",
+	.bit_shift = 6,
+	.reg = 0x1000,
+	.module = TI_CLKM_CM,
+};
+
+static const char *gpt5_mux_fck_parents[] = {
+	"omap_32k_fck",
+	"sys_ck",
+};
+
+static struct ti_clk_mux gpt5_mux_fck_data = {
+	.bit_shift = 3,
+	.num_parents = ARRAY_SIZE(gpt5_mux_fck_parents),
+	.reg = 0x1040,
+	.module = TI_CLKM_CM,
+	.parents = gpt5_mux_fck_parents,
+};
+
+static struct ti_clk_composite gpt5_fck_data = {
+	.mux = &gpt5_mux_fck_data,
+	.gate = &gpt5_gate_fck_data,
+};
+
+static struct ti_clk gpt5_fck = {
+	.name = "gpt5_fck",
+	.type = TI_CLK_COMPOSITE,
+	.data = &gpt5_fck_data,
+};
+
+static struct ti_clk_gate ts_fck_data = {
+	.parent = "omap_32k_fck",
+	.bit_shift = 1,
+	.reg = 0xa08,
+	.module = TI_CLKM_CM,
+};
+
+static struct ti_clk ts_fck = {
+	.name = "ts_fck",
+	.clkdm_name = "core_l4_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &ts_fck_data,
+};
+
+static struct ti_clk_fixed_factor wdt1_fck_data = {
+	.parent = "secure_32k_fck",
+	.div = 1,
+	.mult = 1,
+};
+
+static struct ti_clk wdt1_fck = {
+	.name = "wdt1_fck",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &wdt1_fck_data,
+};
+
+static struct ti_clk_gate dpll4_m6x2_ck_omap36xx_data = {
+	.parent = "dpll4_m6x2_mul_ck",
+	.bit_shift = 0x1f,
+	.reg = 0xd00,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_HSDIV | CLKF_SET_BIT_TO_DISABLE,
+};
+
+static struct ti_clk dpll4_m6x2_ck_omap36xx = {
+	.name = "dpll4_m6x2_ck",
+	.type = TI_CLK_GATE,
+	.data = &dpll4_m6x2_ck_omap36xx_data,
+	.patch = &dpll4_m6x2_ck,
+};
+
+static const char *gpt4_mux_fck_parents[] = {
+	"omap_32k_fck",
+	"sys_ck",
+};
+
+static struct ti_clk_mux gpt4_mux_fck_data = {
+	.bit_shift = 2,
+	.num_parents = ARRAY_SIZE(gpt4_mux_fck_parents),
+	.reg = 0x1040,
+	.module = TI_CLKM_CM,
+	.parents = gpt4_mux_fck_parents,
+};
+
+static struct ti_clk_gate usbhost_ick_data = {
+	.parent = "l4_ick",
+	.bit_shift = 0,
+	.reg = 0x1410,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_DSS | CLKF_OMAP3 | CLKF_INTERFACE,
+};
+
+static struct ti_clk usbhost_ick = {
+	.name = "usbhost_ick",
+	.clkdm_name = "usbhost_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &usbhost_ick_data,
+};
+
+static struct ti_clk_gate mcbsp2_ick_data = {
+	.parent = "per_l4_ick",
+	.bit_shift = 0,
+	.reg = 0x1010,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_OMAP3 | CLKF_INTERFACE,
+};
+
+static struct ti_clk mcbsp2_ick = {
+	.name = "mcbsp2_ick",
+	.clkdm_name = "per_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &mcbsp2_ick_data,
+};
+
+static struct ti_clk_gate omapctrl_ick_data = {
+	.parent = "core_l4_ick",
+	.bit_shift = 6,
+	.reg = 0xa10,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_OMAP3 | CLKF_INTERFACE,
+};
+
+static struct ti_clk omapctrl_ick = {
+	.name = "omapctrl_ick",
+	.clkdm_name = "core_l4_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &omapctrl_ick_data,
+};
+
+static struct ti_clk_fixed_factor omap_96m_d4_fck_data = {
+	.parent = "omap_96m_fck",
+	.div = 4,
+	.mult = 1,
+};
+
+static struct ti_clk omap_96m_d4_fck = {
+	.name = "omap_96m_d4_fck",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &omap_96m_d4_fck_data,
+};
+
+static struct ti_clk_gate gpt6_ick_data = {
+	.parent = "per_l4_ick",
+	.bit_shift = 7,
+	.reg = 0x1010,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_OMAP3 | CLKF_INTERFACE,
+};
+
+static struct ti_clk gpt6_ick = {
+	.name = "gpt6_ick",
+	.clkdm_name = "per_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &gpt6_ick_data,
+};
+
+static struct ti_clk_gate dpll3_m3x2_ck_omap36xx_data = {
+	.parent = "dpll3_m3x2_mul_ck",
+	.bit_shift = 0xc,
+	.reg = 0xd00,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_HSDIV | CLKF_SET_BIT_TO_DISABLE,
+};
+
+static struct ti_clk dpll3_m3x2_ck_omap36xx = {
+	.name = "dpll3_m3x2_ck",
+	.type = TI_CLK_GATE,
+	.data = &dpll3_m3x2_ck_omap36xx_data,
+	.patch = &dpll3_m3x2_ck,
+};
+
+static struct ti_clk_gate i2c3_ick_data = {
+	.parent = "core_l4_ick",
+	.bit_shift = 17,
+	.reg = 0xa10,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_OMAP3 | CLKF_INTERFACE,
+};
+
+static struct ti_clk i2c3_ick = {
+	.name = "i2c3_ick",
+	.clkdm_name = "core_l4_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &i2c3_ick_data,
+};
+
+static struct ti_clk_gate gpio6_ick_data = {
+	.parent = "per_l4_ick",
+	.bit_shift = 17,
+	.reg = 0x1010,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_OMAP3 | CLKF_INTERFACE,
+};
+
+static struct ti_clk gpio6_ick = {
+	.name = "gpio6_ick",
+	.clkdm_name = "per_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &gpio6_ick_data,
+};
+
+static struct ti_clk_gate mspro_ick_data = {
+	.parent = "core_l4_ick",
+	.bit_shift = 23,
+	.reg = 0xa10,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_OMAP3 | CLKF_INTERFACE,
+};
+
+static struct ti_clk mspro_ick = {
+	.name = "mspro_ick",
+	.clkdm_name = "core_l4_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &mspro_ick_data,
+};
+
+static struct ti_clk_composite mcbsp1_fck_data = {
+	.mux = &mcbsp1_mux_fck_data,
+	.gate = &mcbsp1_gate_fck_data,
+};
+
+static struct ti_clk mcbsp1_fck = {
+	.name = "mcbsp1_fck",
+	.type = TI_CLK_COMPOSITE,
+	.data = &mcbsp1_fck_data,
+};
+
+static struct ti_clk_gate gpt3_gate_fck_data = {
+	.parent = "sys_ck",
+	.bit_shift = 4,
+	.reg = 0x1000,
+	.module = TI_CLKM_CM,
+};
+
+static struct ti_clk_fixed rmii_ck_data = {
+	.frequency = 50000000,
+};
+
+static struct ti_clk rmii_ck = {
+	.name = "rmii_ck",
+	.type = TI_CLK_FIXED,
+	.data = &rmii_ck_data,
+};
+
+static struct ti_clk_gate gpt6_gate_fck_data = {
+	.parent = "sys_ck",
+	.bit_shift = 7,
+	.reg = 0x1000,
+	.module = TI_CLKM_CM,
+};
+
+static struct ti_clk_composite gpt6_fck_data = {
+	.mux = &gpt6_mux_fck_data,
+	.gate = &gpt6_gate_fck_data,
+};
+
+static struct ti_clk gpt6_fck = {
+	.name = "gpt6_fck",
+	.type = TI_CLK_COMPOSITE,
+	.data = &gpt6_fck_data,
+};
+
+static struct ti_clk_fixed_factor dpll5_m2_d4_ck_data = {
+	.parent = "dpll5_m2_ck",
+	.div = 4,
+	.mult = 1,
+};
+
+static struct ti_clk dpll5_m2_d4_ck = {
+	.name = "dpll5_m2_d4_ck",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &dpll5_m2_d4_ck_data,
+};
+
+static struct ti_clk_fixed_factor sys_d2_ck_data = {
+	.parent = "sys_ck",
+	.div = 2,
+	.mult = 1,
+};
+
+static struct ti_clk sys_d2_ck = {
+	.name = "sys_d2_ck",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &sys_d2_ck_data,
+};
+
+static struct ti_clk_fixed_factor omap_96m_d2_fck_data = {
+	.parent = "omap_96m_fck",
+	.div = 2,
+	.mult = 1,
+};
+
+static struct ti_clk omap_96m_d2_fck = {
+	.name = "omap_96m_d2_fck",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &omap_96m_d2_fck_data,
+};
+
+static struct ti_clk_fixed_factor dpll5_m2_d8_ck_data = {
+	.parent = "dpll5_m2_ck",
+	.div = 8,
+	.mult = 1,
+};
+
+static struct ti_clk dpll5_m2_d8_ck = {
+	.name = "dpll5_m2_d8_ck",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &dpll5_m2_d8_ck_data,
+};
+
+static struct ti_clk_fixed_factor dpll5_m2_d16_ck_data = {
+	.parent = "dpll5_m2_ck",
+	.div = 16,
+	.mult = 1,
+};
+
+static struct ti_clk dpll5_m2_d16_ck = {
+	.name = "dpll5_m2_d16_ck",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &dpll5_m2_d16_ck_data,
+};
+
+static const char *usim_mux_fck_parents[] = {
+	"sys_ck",
+	"sys_d2_ck",
+	"omap_96m_d2_fck",
+	"omap_96m_d4_fck",
+	"omap_96m_d8_fck",
+	"omap_96m_d10_fck",
+	"dpll5_m2_d4_ck",
+	"dpll5_m2_d8_ck",
+	"dpll5_m2_d16_ck",
+	"dpll5_m2_d20_ck",
+};
+
+static struct ti_clk_mux usim_mux_fck_data = {
+	.bit_shift = 3,
+	.num_parents = ARRAY_SIZE(usim_mux_fck_parents),
+	.reg = 0xc40,
+	.module = TI_CLKM_CM,
+	.parents = usim_mux_fck_parents,
+	.flags = CLKF_INDEX_STARTS_AT_ONE,
+};
+
+static struct ti_clk_composite usim_fck_data = {
+	.mux = &usim_mux_fck_data,
+	.gate = &usim_gate_fck_data,
+};
+
+static struct ti_clk usim_fck = {
+	.name = "usim_fck",
+	.type = TI_CLK_COMPOSITE,
+	.data = &usim_fck_data,
+};
+
+static int ssi_ssr_div_fck_3430es2_divs[] = {
+	0,
+	1,
+	2,
+	3,
+	4,
+	0,
+	6,
+	0,
+	8,
+};
+
+static struct ti_clk_divider ssi_ssr_div_fck_3430es2_data = {
+	.num_dividers = ARRAY_SIZE(ssi_ssr_div_fck_3430es2_divs),
+	.parent = "corex2_fck",
+	.bit_shift = 8,
+	.dividers = ssi_ssr_div_fck_3430es2_divs,
+	.reg = 0xa40,
+	.module = TI_CLKM_CM,
+};
+
+static struct ti_clk_composite ssi_ssr_fck_3430es2_data = {
+	.gate = &ssi_ssr_gate_fck_3430es2_data,
+	.divider = &ssi_ssr_div_fck_3430es2_data,
+};
+
+static struct ti_clk ssi_ssr_fck_3430es2 = {
+	.name = "ssi_ssr_fck",
+	.type = TI_CLK_COMPOSITE,
+	.data = &ssi_ssr_fck_3430es2_data,
+};
+
+static struct ti_clk_gate dss1_alwon_fck_3430es1_data = {
+	.parent = "dpll4_m4x2_ck",
+	.bit_shift = 0,
+	.reg = 0xe00,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_SET_RATE_PARENT,
+};
+
+static struct ti_clk dss1_alwon_fck_3430es1 = {
+	.name = "dss1_alwon_fck",
+	.clkdm_name = "dss_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &dss1_alwon_fck_3430es1_data,
+};
+
+static struct ti_clk_gate gpt3_ick_data = {
+	.parent = "per_l4_ick",
+	.bit_shift = 4,
+	.reg = 0x1010,
+	.module = TI_CLKM_CM,
+	.flags = CLKF_OMAP3 | CLKF_INTERFACE,
+};
+
+static struct ti_clk gpt3_ick = {
+	.name = "gpt3_ick",
+	.clkdm_name = "per_clkdm",
+	.type = TI_CLK_GATE,
+	.data = &gpt3_ick_data,
+};
+
+static struct ti_clk_fixed_factor omap_12m_fck_data = {
+	.parent = "omap_48m_fck",
+	.div = 4,
+	.mult = 1,
+};
+
+static struct ti_clk omap_12m_fck = {
+	.name = "omap_12m_fck",
+	.type = TI_CLK_FIXED_FACTOR,
+	.data = &omap_12m_fck_data,
+};
+
+static struct ti_clk_fixed_factor core_12m_fck_data = {
+
.parent = "omap_12m_fck", + .div = 1, + .mult = 1, +}; + +static struct ti_clk core_12m_fck = { + .name = "core_12m_fck", + .type = TI_CLK_FIXED_FACTOR, + .data = &core_12m_fck_data, +}; + +static struct ti_clk_gate hdq_fck_data = { + .parent = "core_12m_fck", + .bit_shift = 22, + .reg = 0xa00, + .module = TI_CLKM_CM, + .flags = CLKF_WAIT, +}; + +static struct ti_clk hdq_fck = { + .name = "hdq_fck", + .clkdm_name = "core_l4_clkdm", + .type = TI_CLK_GATE, + .data = &hdq_fck_data, +}; + +static struct ti_clk_gate usbtll_fck_data = { + .parent = "dpll5_m2_ck", + .bit_shift = 2, + .reg = 0xa08, + .module = TI_CLKM_CM, + .flags = CLKF_WAIT, +}; + +static struct ti_clk usbtll_fck = { + .name = "usbtll_fck", + .clkdm_name = "core_l4_clkdm", + .type = TI_CLK_GATE, + .data = &usbtll_fck_data, +}; + +static struct ti_clk_gate hsotgusb_fck_am35xx_data = { + .parent = "sys_ck", + .bit_shift = 8, + .reg = 0x59c, + .module = TI_CLKM_SCRM, +}; + +static struct ti_clk hsotgusb_fck_am35xx = { + .name = "hsotgusb_fck_am35xx", + .clkdm_name = "core_l3_clkdm", + .type = TI_CLK_GATE, + .data = &hsotgusb_fck_am35xx_data, +}; + +static struct ti_clk_gate hsotgusb_ick_3430es2_data = { + .parent = "core_l3_ick", + .bit_shift = 4, + .reg = 0xa10, + .module = TI_CLKM_CM, + .flags = CLKF_HSOTGUSB | CLKF_OMAP3 | CLKF_INTERFACE, +}; + +static struct ti_clk hsotgusb_ick_3430es2 = { + .name = "hsotgusb_ick_3430es2", + .clkdm_name = "core_l3_clkdm", + .type = TI_CLK_GATE, + .data = &hsotgusb_ick_3430es2_data, +}; + +static struct ti_clk_gate gfx_l3_ck_data = { + .parent = "l3_ick", + .bit_shift = 0, + .reg = 0xb10, + .module = TI_CLKM_CM, + .flags = CLKF_WAIT, +}; + +static struct ti_clk gfx_l3_ck = { + .name = "gfx_l3_ck", + .clkdm_name = "gfx_3430es1_clkdm", + .type = TI_CLK_GATE, + .data = &gfx_l3_ck_data, +}; + +static struct ti_clk_fixed_factor gfx_l3_ick_data = { + .parent = "gfx_l3_ck", + .div = 1, + .mult = 1, +}; + +static struct ti_clk gfx_l3_ick = { + .name = "gfx_l3_ick", + .type = TI_CLK_FIXED_FACTOR, + .data = &gfx_l3_ick_data, +}; + +static struct ti_clk_gate mcbsp1_ick_data = { + .parent = "core_l4_ick", + .bit_shift = 9, + .reg = 0xa10, + .module = TI_CLKM_CM, + .flags = CLKF_OMAP3 | CLKF_INTERFACE, +}; + +static struct ti_clk mcbsp1_ick = { + .name = "mcbsp1_ick", + .clkdm_name = "core_l4_clkdm", + .type = TI_CLK_GATE, + .data = &mcbsp1_ick_data, +}; + +static struct ti_clk_fixed_factor gpt12_fck_data = { + .parent = "secure_32k_fck", + .div = 1, + .mult = 1, +}; + +static struct ti_clk gpt12_fck = { + .name = "gpt12_fck", + .type = TI_CLK_FIXED_FACTOR, + .data = &gpt12_fck_data, +}; + +static struct ti_clk_gate gfx_cg2_ck_data = { + .parent = "gfx_l3_fck", + .bit_shift = 2, + .reg = 0xb00, + .module = TI_CLKM_CM, + .flags = CLKF_WAIT, +}; + +static struct ti_clk gfx_cg2_ck = { + .name = "gfx_cg2_ck", + .clkdm_name = "gfx_3430es1_clkdm", + .type = TI_CLK_GATE, + .data = &gfx_cg2_ck_data, +}; + +static struct ti_clk_gate i2c2_ick_data = { + .parent = "core_l4_ick", + .bit_shift = 16, + .reg = 0xa10, + .module = TI_CLKM_CM, + .flags = CLKF_OMAP3 | CLKF_INTERFACE, +}; + +static struct ti_clk i2c2_ick = { + .name = "i2c2_ick", + .clkdm_name = "core_l4_clkdm", + .type = TI_CLK_GATE, + .data = &i2c2_ick_data, +}; + +static struct ti_clk_gate gpio4_dbck_data = { + .parent = "per_32k_alwon_fck", + .bit_shift = 15, + .reg = 0x1000, + .module = TI_CLKM_CM, +}; + +static struct ti_clk gpio4_dbck = { + .name = "gpio4_dbck", + .clkdm_name = "per_clkdm", + .type = TI_CLK_GATE, + .data = &gpio4_dbck_data, +}; + +static 
struct ti_clk_gate i2c3_fck_data = { + .parent = "core_96m_fck", + .bit_shift = 17, + .reg = 0xa00, + .module = TI_CLKM_CM, + .flags = CLKF_WAIT, +}; + +static struct ti_clk i2c3_fck = { + .name = "i2c3_fck", + .clkdm_name = "core_l4_clkdm", + .type = TI_CLK_GATE, + .data = &i2c3_fck_data, +}; + +static struct ti_clk_composite gpt3_fck_data = { + .mux = &gpt3_mux_fck_data, + .gate = &gpt3_gate_fck_data, +}; + +static struct ti_clk gpt3_fck = { + .name = "gpt3_fck", + .type = TI_CLK_COMPOSITE, + .data = &gpt3_fck_data, +}; + +static struct ti_clk_gate i2c1_ick_data = { + .parent = "core_l4_ick", + .bit_shift = 15, + .reg = 0xa10, + .module = TI_CLKM_CM, + .flags = CLKF_OMAP3 | CLKF_INTERFACE, +}; + +static struct ti_clk i2c1_ick = { + .name = "i2c1_ick", + .clkdm_name = "core_l4_clkdm", + .type = TI_CLK_GATE, + .data = &i2c1_ick_data, +}; + +static struct ti_clk_gate omap_32ksync_ick_data = { + .parent = "wkup_l4_ick", + .bit_shift = 2, + .reg = 0xc10, + .module = TI_CLKM_CM, + .flags = CLKF_OMAP3 | CLKF_INTERFACE, +}; + +static struct ti_clk omap_32ksync_ick = { + .name = "omap_32ksync_ick", + .clkdm_name = "wkup_clkdm", + .type = TI_CLK_GATE, + .data = &omap_32ksync_ick_data, +}; + +static struct ti_clk_gate aes2_ick_data = { + .parent = "core_l4_ick", + .bit_shift = 28, + .reg = 0xa10, + .module = TI_CLKM_CM, + .flags = CLKF_OMAP3 | CLKF_INTERFACE, +}; + +static struct ti_clk aes2_ick = { + .name = "aes2_ick", + .clkdm_name = "core_l4_clkdm", + .type = TI_CLK_GATE, + .data = &aes2_ick_data, +}; + +static const char *gpt8_mux_fck_parents[] = { + "omap_32k_fck", + "sys_ck", +}; + +static struct ti_clk_mux gpt8_mux_fck_data = { + .bit_shift = 6, + .num_parents = ARRAY_SIZE(gpt8_mux_fck_parents), + .reg = 0x1040, + .module = TI_CLKM_CM, + .parents = gpt8_mux_fck_parents, +}; + +static struct ti_clk_composite gpt8_fck_data = { + .mux = &gpt8_mux_fck_data, + .gate = &gpt8_gate_fck_data, +}; + +static struct ti_clk gpt8_fck = { + .name = "gpt8_fck", + .type = TI_CLK_COMPOSITE, + .data = &gpt8_fck_data, +}; + +static struct ti_clk_gate mcbsp4_gate_fck_data = { + .parent = "mcbsp_clks", + .bit_shift = 2, + .reg = 0x1000, + .module = TI_CLKM_CM, +}; + +static struct ti_clk_composite mcbsp4_fck_data = { + .mux = &mcbsp4_mux_fck_data, + .gate = &mcbsp4_gate_fck_data, +}; + +static struct ti_clk mcbsp4_fck = { + .name = "mcbsp4_fck", + .type = TI_CLK_COMPOSITE, + .data = &mcbsp4_fck_data, +}; + +static struct ti_clk_gate gpio2_dbck_data = { + .parent = "per_32k_alwon_fck", + .bit_shift = 13, + .reg = 0x1000, + .module = TI_CLKM_CM, +}; + +static struct ti_clk gpio2_dbck = { + .name = "gpio2_dbck", + .clkdm_name = "per_clkdm", + .type = TI_CLK_GATE, + .data = &gpio2_dbck_data, +}; + +static struct ti_clk_gate usbtll_ick_data = { + .parent = "core_l4_ick", + .bit_shift = 2, + .reg = 0xa18, + .module = TI_CLKM_CM, + .flags = CLKF_OMAP3 | CLKF_INTERFACE, +}; + +static struct ti_clk usbtll_ick = { + .name = "usbtll_ick", + .clkdm_name = "core_l4_clkdm", + .type = TI_CLK_GATE, + .data = &usbtll_ick_data, +}; + +static struct ti_clk_gate mcspi4_ick_data = { + .parent = "core_l4_ick", + .bit_shift = 21, + .reg = 0xa10, + .module = TI_CLKM_CM, + .flags = CLKF_OMAP3 | CLKF_INTERFACE, +}; + +static struct ti_clk mcspi4_ick = { + .name = "mcspi4_ick", + .clkdm_name = "core_l4_clkdm", + .type = TI_CLK_GATE, + .data = &mcspi4_ick_data, +}; + +static struct ti_clk_gate dss_96m_fck_data = { + .parent = "omap_96m_fck", + .bit_shift = 2, + .reg = 0xe00, + .module = TI_CLKM_CM, +}; + +static struct ti_clk dss_96m_fck = 
{ + .name = "dss_96m_fck", + .clkdm_name = "dss_clkdm", + .type = TI_CLK_GATE, + .data = &dss_96m_fck_data, +}; + +static struct ti_clk_divider rm_ick_data = { + .parent = "l4_ick", + .bit_shift = 1, + .max_div = 3, + .reg = 0xc40, + .module = TI_CLKM_CM, + .flags = CLKF_INDEX_STARTS_AT_ONE, +}; + +static struct ti_clk rm_ick = { + .name = "rm_ick", + .type = TI_CLK_DIVIDER, + .data = &rm_ick_data, +}; + +static struct ti_clk_gate hdq_ick_data = { + .parent = "core_l4_ick", + .bit_shift = 22, + .reg = 0xa10, + .module = TI_CLKM_CM, + .flags = CLKF_OMAP3 | CLKF_INTERFACE, +}; + +static struct ti_clk hdq_ick = { + .name = "hdq_ick", + .clkdm_name = "core_l4_clkdm", + .type = TI_CLK_GATE, + .data = &hdq_ick_data, +}; + +static struct ti_clk_fixed_factor dpll3_x2_ck_data = { + .parent = "dpll3_ck", + .div = 1, + .mult = 2, +}; + +static struct ti_clk dpll3_x2_ck = { + .name = "dpll3_x2_ck", + .type = TI_CLK_FIXED_FACTOR, + .data = &dpll3_x2_ck_data, +}; + +static struct ti_clk_gate mad2d_ick_data = { + .parent = "l3_ick", + .bit_shift = 3, + .reg = 0xa18, + .module = TI_CLKM_CM, + .flags = CLKF_OMAP3 | CLKF_INTERFACE, +}; + +static struct ti_clk mad2d_ick = { + .name = "mad2d_ick", + .clkdm_name = "d2d_clkdm", + .type = TI_CLK_GATE, + .data = &mad2d_ick_data, +}; + +static struct ti_clk_gate fshostusb_fck_data = { + .parent = "core_48m_fck", + .bit_shift = 5, + .reg = 0xa00, + .module = TI_CLKM_CM, + .flags = CLKF_WAIT, +}; + +static struct ti_clk fshostusb_fck = { + .name = "fshostusb_fck", + .clkdm_name = "core_l4_clkdm", + .type = TI_CLK_GATE, + .data = &fshostusb_fck_data, +}; + +static struct ti_clk_gate sr1_fck_data = { + .parent = "sys_ck", + .bit_shift = 6, + .reg = 0xc00, + .module = TI_CLKM_CM, + .flags = CLKF_WAIT, +}; + +static struct ti_clk sr1_fck = { + .name = "sr1_fck", + .clkdm_name = "wkup_clkdm", + .type = TI_CLK_GATE, + .data = &sr1_fck_data, +}; + +static struct ti_clk_gate des2_ick_data = { + .parent = "core_l4_ick", + .bit_shift = 26, + .reg = 0xa10, + .module = TI_CLKM_CM, + .flags = CLKF_OMAP3 | CLKF_INTERFACE, +}; + +static struct ti_clk des2_ick = { + .name = "des2_ick", + .clkdm_name = "core_l4_clkdm", + .type = TI_CLK_GATE, + .data = &des2_ick_data, +}; + +static struct ti_clk_gate sdrc_ick_data = { + .parent = "core_l3_ick", + .bit_shift = 1, + .reg = 0xa10, + .module = TI_CLKM_CM, + .flags = CLKF_WAIT, +}; + +static struct ti_clk sdrc_ick = { + .name = "sdrc_ick", + .clkdm_name = "core_l3_clkdm", + .type = TI_CLK_GATE, + .data = &sdrc_ick_data, +}; + +static struct ti_clk_composite gpt4_fck_data = { + .mux = &gpt4_mux_fck_data, + .gate = &gpt4_gate_fck_data, +}; + +static struct ti_clk gpt4_fck = { + .name = "gpt4_fck", + .type = TI_CLK_COMPOSITE, + .data = &gpt4_fck_data, +}; + +static struct ti_clk_gate dpll4_m3x2_ck_omap36xx_data = { + .parent = "dpll4_m3x2_mul_ck", + .bit_shift = 0x1c, + .reg = 0xd00, + .module = TI_CLKM_CM, + .flags = CLKF_HSDIV | CLKF_SET_BIT_TO_DISABLE, +}; + +static struct ti_clk dpll4_m3x2_ck_omap36xx = { + .name = "dpll4_m3x2_ck", + .type = TI_CLK_GATE, + .data = &dpll4_m3x2_ck_omap36xx_data, + .patch = &dpll4_m3x2_ck, +}; + +static struct ti_clk_gate cpefuse_fck_data = { + .parent = "sys_ck", + .bit_shift = 0, + .reg = 0xa08, + .module = TI_CLKM_CM, +}; + +static struct ti_clk cpefuse_fck = { + .name = "cpefuse_fck", + .clkdm_name = "core_l4_clkdm", + .type = TI_CLK_GATE, + .data = &cpefuse_fck_data, +}; + +static struct ti_clk_gate mcspi3_ick_data = { + .parent = "core_l4_ick", + .bit_shift = 20, + .reg = 0xa10, + .module = 
TI_CLKM_CM, + .flags = CLKF_OMAP3 | CLKF_INTERFACE, +}; + +static struct ti_clk mcspi3_ick = { + .name = "mcspi3_ick", + .clkdm_name = "core_l4_clkdm", + .type = TI_CLK_GATE, + .data = &mcspi3_ick_data, +}; + +static struct ti_clk_fixed_factor ssi_sst_fck_3430es2_data = { + .parent = "ssi_ssr_fck", + .div = 2, + .mult = 1, +}; + +static struct ti_clk ssi_sst_fck_3430es2 = { + .name = "ssi_sst_fck", + .type = TI_CLK_FIXED_FACTOR, + .data = &ssi_sst_fck_3430es2_data, +}; + +static struct ti_clk_gate gpio1_dbck_data = { + .parent = "wkup_32k_fck", + .bit_shift = 3, + .reg = 0xc00, + .module = TI_CLKM_CM, +}; + +static struct ti_clk gpio1_dbck = { + .name = "gpio1_dbck", + .clkdm_name = "wkup_clkdm", + .type = TI_CLK_GATE, + .data = &gpio1_dbck_data, +}; + +static struct ti_clk_gate gpt4_ick_data = { + .parent = "per_l4_ick", + .bit_shift = 5, + .reg = 0x1010, + .module = TI_CLKM_CM, + .flags = CLKF_OMAP3 | CLKF_INTERFACE, +}; + +static struct ti_clk gpt4_ick = { + .name = "gpt4_ick", + .clkdm_name = "per_clkdm", + .type = TI_CLK_GATE, + .data = &gpt4_ick_data, +}; + +static struct ti_clk_gate gpt2_ick_data = { + .parent = "per_l4_ick", + .bit_shift = 3, + .reg = 0x1010, + .module = TI_CLKM_CM, + .flags = CLKF_OMAP3 | CLKF_INTERFACE, +}; + +static struct ti_clk gpt2_ick = { + .name = "gpt2_ick", + .clkdm_name = "per_clkdm", + .type = TI_CLK_GATE, + .data = &gpt2_ick_data, +}; + +static struct ti_clk_gate mmchs1_fck_data = { + .parent = "core_96m_fck", + .bit_shift = 24, + .reg = 0xa00, + .module = TI_CLKM_CM, + .flags = CLKF_WAIT, +}; + +static struct ti_clk mmchs1_fck = { + .name = "mmchs1_fck", + .clkdm_name = "core_l4_clkdm", + .type = TI_CLK_GATE, + .data = &mmchs1_fck_data, +}; + +static struct ti_clk_fixed dummy_apb_pclk_data = { + .frequency = 0x0, +}; + +static struct ti_clk dummy_apb_pclk = { + .name = "dummy_apb_pclk", + .type = TI_CLK_FIXED, + .data = &dummy_apb_pclk_data, +}; + +static struct ti_clk_gate gpio6_dbck_data = { + .parent = "per_32k_alwon_fck", + .bit_shift = 17, + .reg = 0x1000, + .module = TI_CLKM_CM, +}; + +static struct ti_clk gpio6_dbck = { + .name = "gpio6_dbck", + .clkdm_name = "per_clkdm", + .type = TI_CLK_GATE, + .data = &gpio6_dbck_data, +}; + +static struct ti_clk_gate uart2_ick_data = { + .parent = "core_l4_ick", + .bit_shift = 14, + .reg = 0xa10, + .module = TI_CLKM_CM, + .flags = CLKF_OMAP3 | CLKF_INTERFACE, +}; + +static struct ti_clk uart2_ick = { + .name = "uart2_ick", + .clkdm_name = "core_l4_clkdm", + .type = TI_CLK_GATE, + .data = &uart2_ick_data, +}; + +static struct ti_clk_fixed_factor dpll4_x2_ck_data = { + .parent = "dpll4_ck", + .div = 1, + .mult = 2, +}; + +static struct ti_clk dpll4_x2_ck = { + .name = "dpll4_x2_ck", + .type = TI_CLK_FIXED_FACTOR, + .data = &dpll4_x2_ck_data, +}; + +static struct ti_clk_gate gpt7_ick_data = { + .parent = "per_l4_ick", + .bit_shift = 8, + .reg = 0x1010, + .module = TI_CLKM_CM, + .flags = CLKF_OMAP3 | CLKF_INTERFACE, +}; + +static struct ti_clk gpt7_ick = { + .name = "gpt7_ick", + .clkdm_name = "per_clkdm", + .type = TI_CLK_GATE, + .data = &gpt7_ick_data, +}; + +static struct ti_clk_gate dss_tv_fck_data = { + .parent = "omap_54m_fck", + .bit_shift = 2, + .reg = 0xe00, + .module = TI_CLKM_CM, +}; + +static struct ti_clk dss_tv_fck = { + .name = "dss_tv_fck", + .clkdm_name = "dss_clkdm", + .type = TI_CLK_GATE, + .data = &dss_tv_fck_data, +}; + +static struct ti_clk_gate mcbsp5_ick_data = { + .parent = "core_l4_ick", + .bit_shift = 10, + .reg = 0xa10, + .module = TI_CLKM_CM, + .flags = CLKF_OMAP3 | CLKF_INTERFACE, 
+}; + +static struct ti_clk mcbsp5_ick = { + .name = "mcbsp5_ick", + .clkdm_name = "core_l4_clkdm", + .type = TI_CLK_GATE, + .data = &mcbsp5_ick_data, +}; + +static struct ti_clk_gate mcspi1_ick_data = { + .parent = "core_l4_ick", + .bit_shift = 18, + .reg = 0xa10, + .module = TI_CLKM_CM, + .flags = CLKF_OMAP3 | CLKF_INTERFACE, +}; + +static struct ti_clk mcspi1_ick = { + .name = "mcspi1_ick", + .clkdm_name = "core_l4_clkdm", + .type = TI_CLK_GATE, + .data = &mcspi1_ick_data, +}; + +static struct ti_clk_gate d2d_26m_fck_data = { + .parent = "sys_ck", + .bit_shift = 3, + .reg = 0xa00, + .module = TI_CLKM_CM, + .flags = CLKF_WAIT, +}; + +static struct ti_clk d2d_26m_fck = { + .name = "d2d_26m_fck", + .clkdm_name = "d2d_clkdm", + .type = TI_CLK_GATE, + .data = &d2d_26m_fck_data, +}; + +static struct ti_clk_gate wdt3_ick_data = { + .parent = "per_l4_ick", + .bit_shift = 12, + .reg = 0x1010, + .module = TI_CLKM_CM, + .flags = CLKF_OMAP3 | CLKF_INTERFACE, +}; + +static struct ti_clk wdt3_ick = { + .name = "wdt3_ick", + .clkdm_name = "per_clkdm", + .type = TI_CLK_GATE, + .data = &wdt3_ick_data, +}; + +static struct ti_clk_divider pclkx2_fck_data = { + .parent = "emu_src_ck", + .bit_shift = 6, + .max_div = 3, + .reg = 0x1140, + .module = TI_CLKM_CM, + .flags = CLKF_INDEX_STARTS_AT_ONE, +}; + +static struct ti_clk pclkx2_fck = { + .name = "pclkx2_fck", + .type = TI_CLK_DIVIDER, + .data = &pclkx2_fck_data, +}; + +static struct ti_clk_gate sha12_ick_data = { + .parent = "core_l4_ick", + .bit_shift = 27, + .reg = 0xa10, + .module = TI_CLKM_CM, + .flags = CLKF_OMAP3 | CLKF_INTERFACE, +}; + +static struct ti_clk sha12_ick = { + .name = "sha12_ick", + .clkdm_name = "core_l4_clkdm", + .type = TI_CLK_GATE, + .data = &sha12_ick_data, +}; + +static struct ti_clk_gate emac_fck_data = { + .parent = "rmii_ck", + .bit_shift = 9, + .reg = 0x59c, + .module = TI_CLKM_SCRM, +}; + +static struct ti_clk emac_fck = { + .name = "emac_fck", + .type = TI_CLK_GATE, + .data = &emac_fck_data, +}; + +static struct ti_clk_composite gpt10_fck_data = { + .mux = &gpt10_mux_fck_data, + .gate = &gpt10_gate_fck_data, +}; + +static struct ti_clk gpt10_fck = { + .name = "gpt10_fck", + .type = TI_CLK_COMPOSITE, + .data = &gpt10_fck_data, +}; + +static struct ti_clk_gate wdt2_fck_data = { + .parent = "wkup_32k_fck", + .bit_shift = 5, + .reg = 0xc00, + .module = TI_CLKM_CM, + .flags = CLKF_WAIT, +}; + +static struct ti_clk wdt2_fck = { + .name = "wdt2_fck", + .clkdm_name = "wkup_clkdm", + .type = TI_CLK_GATE, + .data = &wdt2_fck_data, +}; + +static struct ti_clk_gate cam_ick_data = { + .parent = "l4_ick", + .bit_shift = 0, + .reg = 0xf10, + .module = TI_CLKM_CM, + .flags = CLKF_OMAP3 | CLKF_NO_WAIT | CLKF_INTERFACE, +}; + +static struct ti_clk cam_ick = { + .name = "cam_ick", + .clkdm_name = "cam_clkdm", + .type = TI_CLK_GATE, + .data = &cam_ick_data, +}; + +static struct ti_clk_gate ssi_ick_3430es2_data = { + .parent = "ssi_l4_ick", + .bit_shift = 0, + .reg = 0xa10, + .module = TI_CLKM_CM, + .flags = CLKF_SSI | CLKF_OMAP3 | CLKF_INTERFACE, +}; + +static struct ti_clk ssi_ick_3430es2 = { + .name = "ssi_ick", + .clkdm_name = "core_l4_clkdm", + .type = TI_CLK_GATE, + .data = &ssi_ick_3430es2_data, +}; + +static struct ti_clk_gate gpio4_ick_data = { + .parent = "per_l4_ick", + .bit_shift = 15, + .reg = 0x1010, + .module = TI_CLKM_CM, + .flags = CLKF_OMAP3 | CLKF_INTERFACE, +}; + +static struct ti_clk gpio4_ick = { + .name = "gpio4_ick", + .clkdm_name = "per_clkdm", + .type = TI_CLK_GATE, + .data = &gpio4_ick_data, +}; + +static struct 
ti_clk_gate wdt1_ick_data = { + .parent = "wkup_l4_ick", + .bit_shift = 4, + .reg = 0xc10, + .module = TI_CLKM_CM, + .flags = CLKF_OMAP3 | CLKF_INTERFACE, +}; + +static struct ti_clk wdt1_ick = { + .name = "wdt1_ick", + .clkdm_name = "wkup_clkdm", + .type = TI_CLK_GATE, + .data = &wdt1_ick_data, +}; + +static struct ti_clk_gate rng_ick_data = { + .parent = "security_l4_ick2", + .bit_shift = 2, + .reg = 0xa14, + .module = TI_CLKM_CM, + .flags = CLKF_OMAP3 | CLKF_INTERFACE, +}; + +static struct ti_clk rng_ick = { + .name = "rng_ick", + .type = TI_CLK_GATE, + .data = &rng_ick_data, +}; + +static struct ti_clk_gate icr_ick_data = { + .parent = "core_l4_ick", + .bit_shift = 29, + .reg = 0xa10, + .module = TI_CLKM_CM, + .flags = CLKF_OMAP3 | CLKF_INTERFACE, +}; + +static struct ti_clk icr_ick = { + .name = "icr_ick", + .clkdm_name = "core_l4_clkdm", + .type = TI_CLK_GATE, + .data = &icr_ick_data, +}; + +static struct ti_clk_gate sgx_ick_data = { + .parent = "l3_ick", + .bit_shift = 0, + .reg = 0xb10, + .module = TI_CLKM_CM, + .flags = CLKF_WAIT, +}; + +static struct ti_clk sgx_ick = { + .name = "sgx_ick", + .clkdm_name = "sgx_clkdm", + .type = TI_CLK_GATE, + .data = &sgx_ick_data, +}; + +static struct ti_clk_divider sys_clkout2_data = { + .parent = "clkout2_src_ck", + .bit_shift = 3, + .max_div = 64, + .reg = 0xd70, + .module = TI_CLKM_CM, + .flags = CLKF_INDEX_POWER_OF_TWO, +}; + +static struct ti_clk sys_clkout2 = { + .name = "sys_clkout2", + .type = TI_CLK_DIVIDER, + .data = &sys_clkout2_data, +}; + +static struct ti_clk_alias omap34xx_omap36xx_clks[] = { + CLK(NULL, "security_l4_ick2", &security_l4_ick2), + CLK(NULL, "aes1_ick", &aes1_ick), + CLK("omap_rng", "ick", &rng_ick), + CLK("omap3-rom-rng", "ick", &rng_ick), + CLK(NULL, "sha11_ick", &sha11_ick), + CLK(NULL, "des1_ick", &des1_ick), + CLK(NULL, "cam_mclk", &cam_mclk), + CLK(NULL, "cam_ick", &cam_ick), + CLK(NULL, "csi2_96m_fck", &csi2_96m_fck), + CLK(NULL, "security_l3_ick", &security_l3_ick), + CLK(NULL, "pka_ick", &pka_ick), + CLK(NULL, "icr_ick", &icr_ick), + CLK(NULL, "des2_ick", &des2_ick), + CLK(NULL, "mspro_ick", &mspro_ick), + CLK(NULL, "mailboxes_ick", &mailboxes_ick), + CLK(NULL, "ssi_l4_ick", &ssi_l4_ick), + CLK(NULL, "sr1_fck", &sr1_fck), + CLK(NULL, "sr2_fck", &sr2_fck), + CLK(NULL, "sr_l4_ick", &sr_l4_ick), + CLK(NULL, "dpll2_fck", &dpll2_fck), + CLK(NULL, "dpll2_ck", &dpll2_ck), + CLK(NULL, "dpll2_m2_ck", &dpll2_m2_ck), + CLK(NULL, "iva2_ck", &iva2_ck), + CLK(NULL, "modem_fck", &modem_fck), + CLK(NULL, "sad2d_ick", &sad2d_ick), + CLK(NULL, "mad2d_ick", &mad2d_ick), + CLK(NULL, "mspro_fck", &mspro_fck), + { NULL }, +}; + +static struct ti_clk_alias omap36xx_omap3430es2plus_clks[] = { + CLK(NULL, "ssi_ssr_fck", &ssi_ssr_fck_3430es2), + CLK(NULL, "ssi_sst_fck", &ssi_sst_fck_3430es2), + CLK("musb-omap2430", "ick", &hsotgusb_ick_3430es2), + CLK(NULL, "hsotgusb_ick", &hsotgusb_ick_3430es2), + CLK(NULL, "ssi_ick", &ssi_ick_3430es2), + CLK(NULL, "sys_d2_ck", &sys_d2_ck), + CLK(NULL, "omap_96m_d2_fck", &omap_96m_d2_fck), + CLK(NULL, "omap_96m_d4_fck", &omap_96m_d4_fck), + CLK(NULL, "omap_96m_d8_fck", &omap_96m_d8_fck), + CLK(NULL, "omap_96m_d10_fck", &omap_96m_d10_fck), + CLK(NULL, "dpll5_m2_d4_ck", &dpll5_m2_d4_ck), + CLK(NULL, "dpll5_m2_d8_ck", &dpll5_m2_d8_ck), + CLK(NULL, "dpll5_m2_d16_ck", &dpll5_m2_d16_ck), + CLK(NULL, "dpll5_m2_d20_ck", &dpll5_m2_d20_ck), + CLK(NULL, "usim_fck", &usim_fck), + CLK(NULL, "usim_ick", &usim_ick), + { NULL }, +}; + +static struct ti_clk_alias omap3xxx_clks[] = { + CLK(NULL, "apb_pclk", 
&dummy_apb_pclk), + CLK(NULL, "omap_32k_fck", &omap_32k_fck), + CLK(NULL, "virt_12m_ck", &virt_12m_ck), + CLK(NULL, "virt_13m_ck", &virt_13m_ck), + CLK(NULL, "virt_19200000_ck", &virt_19200000_ck), + CLK(NULL, "virt_26000000_ck", &virt_26000000_ck), + CLK(NULL, "virt_38_4m_ck", &virt_38_4m_ck), + CLK(NULL, "virt_16_8m_ck", &virt_16_8m_ck), + CLK(NULL, "osc_sys_ck", &osc_sys_ck), + CLK("twl", "fck", &osc_sys_ck), + CLK(NULL, "sys_ck", &sys_ck), + CLK(NULL, "timer_sys_ck", &sys_ck), + CLK(NULL, "dpll4_ck", &dpll4_ck), + CLK(NULL, "dpll4_m2_ck", &dpll4_m2_ck), + CLK(NULL, "dpll4_m2x2_mul_ck", &dpll4_m2x2_mul_ck), + CLK(NULL, "dpll4_m2x2_ck", &dpll4_m2x2_ck), + CLK(NULL, "omap_96m_alwon_fck", &omap_96m_alwon_fck), + CLK(NULL, "dpll3_ck", &dpll3_ck), + CLK(NULL, "dpll3_m3_ck", &dpll3_m3_ck), + CLK(NULL, "dpll3_m3x2_mul_ck", &dpll3_m3x2_mul_ck), + CLK(NULL, "dpll3_m3x2_ck", &dpll3_m3x2_ck), + CLK("etb", "emu_core_alwon_ck", &emu_core_alwon_ck), + CLK(NULL, "sys_altclk", &sys_altclk), + CLK(NULL, "mcbsp_clks", &mcbsp_clks), + CLK(NULL, "sys_clkout1", &sys_clkout1), + CLK(NULL, "dpll3_m2_ck", &dpll3_m2_ck), + CLK(NULL, "core_ck", &core_ck), + CLK(NULL, "dpll1_fck", &dpll1_fck), + CLK(NULL, "dpll1_ck", &dpll1_ck), + CLK(NULL, "cpufreq_ck", &dpll1_ck), + CLK(NULL, "dpll1_x2_ck", &dpll1_x2_ck), + CLK(NULL, "dpll1_x2m2_ck", &dpll1_x2m2_ck), + CLK(NULL, "dpll3_x2_ck", &dpll3_x2_ck), + CLK(NULL, "dpll3_m2x2_ck", &dpll3_m2x2_ck), + CLK(NULL, "dpll4_x2_ck", &dpll4_x2_ck), + CLK(NULL, "cm_96m_fck", &cm_96m_fck), + CLK(NULL, "omap_96m_fck", &omap_96m_fck), + CLK(NULL, "dpll4_m3_ck", &dpll4_m3_ck), + CLK(NULL, "dpll4_m3x2_mul_ck", &dpll4_m3x2_mul_ck), + CLK(NULL, "dpll4_m3x2_ck", &dpll4_m3x2_ck), + CLK(NULL, "omap_54m_fck", &omap_54m_fck), + CLK(NULL, "cm_96m_d2_fck", &cm_96m_d2_fck), + CLK(NULL, "omap_48m_fck", &omap_48m_fck), + CLK(NULL, "omap_12m_fck", &omap_12m_fck), + CLK(NULL, "dpll4_m4_ck", &dpll4_m4_ck), + CLK(NULL, "dpll4_m4x2_mul_ck", &dpll4_m4x2_mul_ck), + CLK(NULL, "dpll4_m4x2_ck", &dpll4_m4x2_ck), + CLK(NULL, "dpll4_m5_ck", &dpll4_m5_ck), + CLK(NULL, "dpll4_m5x2_mul_ck", &dpll4_m5x2_mul_ck), + CLK(NULL, "dpll4_m5x2_ck", &dpll4_m5x2_ck), + CLK(NULL, "dpll4_m6_ck", &dpll4_m6_ck), + CLK(NULL, "dpll4_m6x2_mul_ck", &dpll4_m6x2_mul_ck), + CLK(NULL, "dpll4_m6x2_ck", &dpll4_m6x2_ck), + CLK("etb", "emu_per_alwon_ck", &emu_per_alwon_ck), + CLK(NULL, "clkout2_src_ck", &clkout2_src_ck), + CLK(NULL, "sys_clkout2", &sys_clkout2), + CLK(NULL, "corex2_fck", &corex2_fck), + CLK(NULL, "mpu_ck", &mpu_ck), + CLK(NULL, "arm_fck", &arm_fck), + CLK("etb", "emu_mpu_alwon_ck", &emu_mpu_alwon_ck), + CLK(NULL, "l3_ick", &l3_ick), + CLK(NULL, "l4_ick", &l4_ick), + CLK(NULL, "rm_ick", &rm_ick), + CLK(NULL, "timer_32k_ck", &omap_32k_fck), + CLK(NULL, "gpt10_fck", &gpt10_fck), + CLK(NULL, "gpt11_fck", &gpt11_fck), + CLK(NULL, "core_96m_fck", &core_96m_fck), + CLK(NULL, "mmchs2_fck", &mmchs2_fck), + CLK(NULL, "mmchs1_fck", &mmchs1_fck), + CLK(NULL, "i2c3_fck", &i2c3_fck), + CLK(NULL, "i2c2_fck", &i2c2_fck), + CLK(NULL, "i2c1_fck", &i2c1_fck), + CLK(NULL, "mcbsp5_fck", &mcbsp5_fck), + CLK(NULL, "mcbsp1_fck", &mcbsp1_fck), + CLK(NULL, "core_48m_fck", &core_48m_fck), + CLK(NULL, "mcspi4_fck", &mcspi4_fck), + CLK(NULL, "mcspi3_fck", &mcspi3_fck), + CLK(NULL, "mcspi2_fck", &mcspi2_fck), + CLK(NULL, "mcspi1_fck", &mcspi1_fck), + CLK(NULL, "uart2_fck", &uart2_fck), + CLK(NULL, "uart1_fck", &uart1_fck), + CLK(NULL, "core_12m_fck", &core_12m_fck), + CLK("omap_hdq.0", "fck", &hdq_fck), + CLK(NULL, "hdq_fck", &hdq_fck), + CLK(NULL, 
"core_l3_ick", &core_l3_ick), + CLK(NULL, "sdrc_ick", &sdrc_ick), + CLK(NULL, "gpmc_fck", &gpmc_fck), + CLK(NULL, "core_l4_ick", &core_l4_ick), + CLK("omap_hsmmc.1", "ick", &mmchs2_ick), + CLK("omap_hsmmc.0", "ick", &mmchs1_ick), + CLK(NULL, "mmchs2_ick", &mmchs2_ick), + CLK(NULL, "mmchs1_ick", &mmchs1_ick), + CLK("omap_hdq.0", "ick", &hdq_ick), + CLK(NULL, "hdq_ick", &hdq_ick), + CLK("omap2_mcspi.4", "ick", &mcspi4_ick), + CLK("omap2_mcspi.3", "ick", &mcspi3_ick), + CLK("omap2_mcspi.2", "ick", &mcspi2_ick), + CLK("omap2_mcspi.1", "ick", &mcspi1_ick), + CLK(NULL, "mcspi4_ick", &mcspi4_ick), + CLK(NULL, "mcspi3_ick", &mcspi3_ick), + CLK(NULL, "mcspi2_ick", &mcspi2_ick), + CLK(NULL, "mcspi1_ick", &mcspi1_ick), + CLK("omap_i2c.3", "ick", &i2c3_ick), + CLK("omap_i2c.2", "ick", &i2c2_ick), + CLK("omap_i2c.1", "ick", &i2c1_ick), + CLK(NULL, "i2c3_ick", &i2c3_ick), + CLK(NULL, "i2c2_ick", &i2c2_ick), + CLK(NULL, "i2c1_ick", &i2c1_ick), + CLK(NULL, "uart2_ick", &uart2_ick), + CLK(NULL, "uart1_ick", &uart1_ick), + CLK(NULL, "gpt11_ick", &gpt11_ick), + CLK(NULL, "gpt10_ick", &gpt10_ick), + CLK("omap-mcbsp.5", "ick", &mcbsp5_ick), + CLK("omap-mcbsp.1", "ick", &mcbsp1_ick), + CLK(NULL, "mcbsp5_ick", &mcbsp5_ick), + CLK(NULL, "mcbsp1_ick", &mcbsp1_ick), + CLK(NULL, "omapctrl_ick", &omapctrl_ick), + CLK(NULL, "dss_tv_fck", &dss_tv_fck), + CLK(NULL, "dss_96m_fck", &dss_96m_fck), + CLK(NULL, "dss2_alwon_fck", &dss2_alwon_fck), + CLK(NULL, "init_60m_fclk", &dummy_ck), + CLK(NULL, "gpt1_fck", &gpt1_fck), + CLK(NULL, "aes2_ick", &aes2_ick), + CLK(NULL, "wkup_32k_fck", &wkup_32k_fck), + CLK(NULL, "gpio1_dbck", &gpio1_dbck), + CLK(NULL, "sha12_ick", &sha12_ick), + CLK(NULL, "wdt2_fck", &wdt2_fck), + CLK(NULL, "wkup_l4_ick", &wkup_l4_ick), + CLK("omap_wdt", "ick", &wdt2_ick), + CLK(NULL, "wdt2_ick", &wdt2_ick), + CLK(NULL, "wdt1_ick", &wdt1_ick), + CLK(NULL, "gpio1_ick", &gpio1_ick), + CLK(NULL, "omap_32ksync_ick", &omap_32ksync_ick), + CLK(NULL, "gpt12_ick", &gpt12_ick), + CLK(NULL, "gpt1_ick", &gpt1_ick), + CLK(NULL, "per_96m_fck", &per_96m_fck), + CLK(NULL, "per_48m_fck", &per_48m_fck), + CLK(NULL, "uart3_fck", &uart3_fck), + CLK(NULL, "gpt2_fck", &gpt2_fck), + CLK(NULL, "gpt3_fck", &gpt3_fck), + CLK(NULL, "gpt4_fck", &gpt4_fck), + CLK(NULL, "gpt5_fck", &gpt5_fck), + CLK(NULL, "gpt6_fck", &gpt6_fck), + CLK(NULL, "gpt7_fck", &gpt7_fck), + CLK(NULL, "gpt8_fck", &gpt8_fck), + CLK(NULL, "gpt9_fck", &gpt9_fck), + CLK(NULL, "per_32k_alwon_fck", &per_32k_alwon_fck), + CLK(NULL, "gpio6_dbck", &gpio6_dbck), + CLK(NULL, "gpio5_dbck", &gpio5_dbck), + CLK(NULL, "gpio4_dbck", &gpio4_dbck), + CLK(NULL, "gpio3_dbck", &gpio3_dbck), + CLK(NULL, "gpio2_dbck", &gpio2_dbck), + CLK(NULL, "wdt3_fck", &wdt3_fck), + CLK(NULL, "per_l4_ick", &per_l4_ick), + CLK(NULL, "gpio6_ick", &gpio6_ick), + CLK(NULL, "gpio5_ick", &gpio5_ick), + CLK(NULL, "gpio4_ick", &gpio4_ick), + CLK(NULL, "gpio3_ick", &gpio3_ick), + CLK(NULL, "gpio2_ick", &gpio2_ick), + CLK(NULL, "wdt3_ick", &wdt3_ick), + CLK(NULL, "uart3_ick", &uart3_ick), + CLK(NULL, "uart4_ick", &uart4_ick), + CLK(NULL, "gpt9_ick", &gpt9_ick), + CLK(NULL, "gpt8_ick", &gpt8_ick), + CLK(NULL, "gpt7_ick", &gpt7_ick), + CLK(NULL, "gpt6_ick", &gpt6_ick), + CLK(NULL, "gpt5_ick", &gpt5_ick), + CLK(NULL, "gpt4_ick", &gpt4_ick), + CLK(NULL, "gpt3_ick", &gpt3_ick), + CLK(NULL, "gpt2_ick", &gpt2_ick), + CLK("omap-mcbsp.2", "ick", &mcbsp2_ick), + CLK("omap-mcbsp.3", "ick", &mcbsp3_ick), + CLK("omap-mcbsp.4", "ick", &mcbsp4_ick), + CLK(NULL, "mcbsp4_ick", &mcbsp2_ick), + CLK(NULL, "mcbsp3_ick", 
&mcbsp3_ick), + CLK(NULL, "mcbsp2_ick", &mcbsp4_ick), + CLK(NULL, "mcbsp2_fck", &mcbsp2_fck), + CLK(NULL, "mcbsp3_fck", &mcbsp3_fck), + CLK(NULL, "mcbsp4_fck", &mcbsp4_fck), + CLK(NULL, "emu_src_mux_ck", &emu_src_mux_ck), + CLK("etb", "emu_src_ck", &emu_src_ck), + CLK(NULL, "emu_src_mux_ck", &emu_src_mux_ck), + CLK(NULL, "emu_src_ck", &emu_src_ck), + CLK(NULL, "pclk_fck", &pclk_fck), + CLK(NULL, "pclkx2_fck", &pclkx2_fck), + CLK(NULL, "atclk_fck", &atclk_fck), + CLK(NULL, "traceclk_src_fck", &traceclk_src_fck), + CLK(NULL, "traceclk_fck", &traceclk_fck), + CLK(NULL, "secure_32k_fck", &secure_32k_fck), + CLK(NULL, "gpt12_fck", &gpt12_fck), + CLK(NULL, "wdt1_fck", &wdt1_fck), + { NULL }, +}; + +static struct ti_clk_alias omap36xx_am35xx_omap3430es2plus_clks[] = { + CLK(NULL, "dpll5_ck", &dpll5_ck), + CLK(NULL, "dpll5_m2_ck", &dpll5_m2_ck), + CLK(NULL, "core_d3_ck", &core_d3_ck), + CLK(NULL, "core_d4_ck", &core_d4_ck), + CLK(NULL, "core_d6_ck", &core_d6_ck), + CLK(NULL, "omap_192m_alwon_fck", &omap_192m_alwon_fck), + CLK(NULL, "core_d2_ck", &core_d2_ck), + CLK(NULL, "corex2_d3_fck", &corex2_d3_fck), + CLK(NULL, "corex2_d5_fck", &corex2_d5_fck), + CLK(NULL, "sgx_fck", &sgx_fck), + CLK(NULL, "sgx_ick", &sgx_ick), + CLK(NULL, "cpefuse_fck", &cpefuse_fck), + CLK(NULL, "ts_fck", &ts_fck), + CLK(NULL, "usbtll_fck", &usbtll_fck), + CLK(NULL, "usbtll_ick", &usbtll_ick), + CLK("omap_hsmmc.2", "ick", &mmchs3_ick), + CLK(NULL, "mmchs3_ick", &mmchs3_ick), + CLK(NULL, "mmchs3_fck", &mmchs3_fck), + CLK(NULL, "dss1_alwon_fck", &dss1_alwon_fck_3430es2), + CLK("omapdss_dss", "ick", &dss_ick_3430es2), + CLK(NULL, "dss_ick", &dss_ick_3430es2), + CLK(NULL, "usbhost_120m_fck", &usbhost_120m_fck), + CLK(NULL, "usbhost_48m_fck", &usbhost_48m_fck), + CLK(NULL, "usbhost_ick", &usbhost_ick), + { NULL }, +}; + +static struct ti_clk_alias omap3430es1_clks[] = { + CLK(NULL, "gfx_l3_ck", &gfx_l3_ck), + CLK(NULL, "gfx_l3_fck", &gfx_l3_fck), + CLK(NULL, "gfx_l3_ick", &gfx_l3_ick), + CLK(NULL, "gfx_cg1_ck", &gfx_cg1_ck), + CLK(NULL, "gfx_cg2_ck", &gfx_cg2_ck), + CLK(NULL, "d2d_26m_fck", &d2d_26m_fck), + CLK(NULL, "fshostusb_fck", &fshostusb_fck), + CLK(NULL, "ssi_ssr_fck", &ssi_ssr_fck_3430es1), + CLK(NULL, "ssi_sst_fck", &ssi_sst_fck_3430es1), + CLK("musb-omap2430", "ick", &hsotgusb_ick_3430es1), + CLK(NULL, "hsotgusb_ick", &hsotgusb_ick_3430es1), + CLK(NULL, "fac_ick", &fac_ick), + CLK(NULL, "ssi_ick", &ssi_ick_3430es1), + CLK(NULL, "usb_l4_ick", &usb_l4_ick), + CLK(NULL, "dss1_alwon_fck", &dss1_alwon_fck_3430es1), + CLK("omapdss_dss", "ick", &dss_ick_3430es1), + CLK(NULL, "dss_ick", &dss_ick_3430es1), + { NULL }, +}; + +static struct ti_clk_alias omap36xx_clks[] = { + CLK(NULL, "uart4_fck", &uart4_fck), + { NULL }, +}; + +static struct ti_clk_alias am35xx_clks[] = { + CLK(NULL, "ipss_ick", &ipss_ick), + CLK(NULL, "rmii_ck", &rmii_ck), + CLK(NULL, "pclk_ck", &pclk_ck), + CLK(NULL, "emac_ick", &emac_ick), + CLK(NULL, "emac_fck", &emac_fck), + CLK("davinci_emac.0", NULL, &emac_ick), + CLK("davinci_mdio.0", NULL, &emac_fck), + CLK("vpfe-capture", "master", &vpfe_ick), + CLK("vpfe-capture", "slave", &vpfe_fck), + CLK(NULL, "hsotgusb_ick", &hsotgusb_ick_am35xx), + CLK(NULL, "hsotgusb_fck", &hsotgusb_fck_am35xx), + CLK(NULL, "hecc_ck", &hecc_ck), + CLK(NULL, "uart4_ick", &uart4_ick_am35xx), + CLK(NULL, "uart4_fck", &uart4_fck_am35xx), + { NULL }, +}; + +static struct ti_clk *omap36xx_clk_patches[] = { + &dpll4_m3x2_ck_omap36xx, + &dpll3_m3x2_ck_omap36xx, + &dpll4_m6x2_ck_omap36xx, + &dpll4_m2x2_ck_omap36xx, + 
&dpll4_m5x2_ck_omap36xx, + &dpll4_ck_omap36xx, + NULL, +}; + +static const char *enable_init_clks[] = { + "sdrc_ick", + "gpmc_fck", + "omapctrl_ick", +}; + +static void __init omap3_clk_legacy_common_init(void) +{ + omap2_clk_disable_autoidle_all(); + + omap2_clk_enable_init_clocks(enable_init_clks, + ARRAY_SIZE(enable_init_clks)); + + pr_info("Clocking rate (Crystal/Core/MPU): %ld.%01ld/%ld/%ld MHz\n", + (clk_get_rate(osc_sys_ck.clk) / 1000000), + (clk_get_rate(osc_sys_ck.clk) / 100000) % 10, + (clk_get_rate(core_ck.clk) / 1000000), + (clk_get_rate(arm_fck.clk) / 1000000)); +} + +int __init omap3430es1_clk_legacy_init(void) +{ + int r; + + r = ti_clk_register_legacy_clks(omap3430es1_clks); + r |= ti_clk_register_legacy_clks(omap34xx_omap36xx_clks); + r |= ti_clk_register_legacy_clks(omap3xxx_clks); + + omap3_clk_legacy_common_init(); + + return r; +} + +int __init omap3430_clk_legacy_init(void) +{ + int r; + + r = ti_clk_register_legacy_clks(omap34xx_omap36xx_clks); + r |= ti_clk_register_legacy_clks(omap36xx_omap3430es2plus_clks); + r |= ti_clk_register_legacy_clks(omap36xx_am35xx_omap3430es2plus_clks); + r |= ti_clk_register_legacy_clks(omap3xxx_clks); + + omap3_clk_legacy_common_init(); + omap3_clk_lock_dpll5(); + + return r; +} + +int __init omap36xx_clk_legacy_init(void) +{ + int r; + + ti_clk_patch_legacy_clks(omap36xx_clk_patches); + r = ti_clk_register_legacy_clks(omap36xx_clks); + r |= ti_clk_register_legacy_clks(omap36xx_omap3430es2plus_clks); + r |= ti_clk_register_legacy_clks(omap34xx_omap36xx_clks); + r |= ti_clk_register_legacy_clks(omap36xx_am35xx_omap3430es2plus_clks); + r |= ti_clk_register_legacy_clks(omap3xxx_clks); + + omap3_clk_legacy_common_init(); + omap3_clk_lock_dpll5(); + + return r; +} + +int __init am35xx_clk_legacy_init(void) +{ + int r; + + r = ti_clk_register_legacy_clks(am35xx_clks); + r |= ti_clk_register_legacy_clks(omap36xx_am35xx_omap3430es2plus_clks); + r |= ti_clk_register_legacy_clks(omap3xxx_clks); + + omap3_clk_legacy_common_init(); + omap3_clk_lock_dpll5(); + + return r; +} diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h index 55ef529a0dbf..13a20bf9e1a5 100644 --- a/include/linux/clk/ti.h +++ b/include/linux/clk/ti.h @@ -217,6 +217,13 @@ struct ti_dt_clk { /* Maximum number of clock memmaps */ #define CLK_MAX_MEMMAPS 4 +/* Static memmap indices */ +enum { + TI_CLKM_CM = 0, + TI_CLKM_PRM, + TI_CLKM_SCRM, +}; + typedef void (*ti_of_clk_init_cb_t)(struct clk_hw *, struct device_node *); /** @@ -348,4 +355,9 @@ extern const struct clk_hw_omap_ops clkhwops_omap3430es2_iclk_ssi_wait; extern const struct clk_hw_omap_ops clkhwops_omap3430es2_iclk_dss_usbhost_wait; extern const struct clk_hw_omap_ops clkhwops_omap3430es2_iclk_hsotgusb_wait; +int omap3430_clk_legacy_init(void); +int omap3430es1_clk_legacy_init(void); +int omap36xx_clk_legacy_init(void); +int am35xx_clk_legacy_init(void); + #endif -- cgit v1.2.3 From d4bcef3fbe887ff93b58da4fcf6df1eee416e8fa Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Thu, 29 Jan 2015 20:37:07 +0900 Subject: net: Fix vlan_get_protocol for stacked vlan vlan_get_protocol() could not get the network protocol if an skb has an 802.1ad vlan tag or multiple vlans, which caused incorrect checksum calculation in several drivers. Fix vlan_get_protocol() to retrieve the network protocol instead of the incorrect vlan protocol. As the logic is the same as skb_network_protocol(), create a common helper function __vlan_get_protocol() and call it from existing functions.
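For illustration, a minimal driver-side sketch of the fixed accessor in use — hypothetical code, not part of the patch; foo_tx_csum() and the foo_setup_*_csum() hooks are invented names:

/* Hypothetical TX-checksum path: with the fix, vlan_get_protocol() yields
 * the L3 EtherType even for 802.1ad or stacked-vlan frames, so the driver
 * can program the right offload instead of bailing out on a vlan EtherType.
 */
static int foo_tx_csum(struct sk_buff *skb)
{
	switch (vlan_get_protocol(skb)) {
	case htons(ETH_P_IP):
		return foo_setup_ipv4_csum(skb);	/* assumed driver hook */
	case htons(ETH_P_IPV6):
		return foo_setup_ipv6_csum(skb);	/* assumed driver hook */
	default:
		return skb_checksum_help(skb);	/* fall back to software csum */
	}
}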
Signed-off-by: Toshiaki Makita Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 60 +++++++++++++++++++++++++++++++++++++------------ net/core/dev.c | 31 +------------------------ 2 files changed, 47 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 515a35e2a48a..960e666c51e4 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -472,27 +472,59 @@ static inline int vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) /** * vlan_get_protocol - get protocol EtherType. * @skb: skbuff to query + * @type: first vlan protocol + * @depth: buffer to store length of eth and vlan tags in bytes * * Returns the EtherType of the packet, regardless of whether it is * vlan encapsulated (normal or hardware accelerated) or not. */ -static inline __be16 vlan_get_protocol(const struct sk_buff *skb) +static inline __be16 __vlan_get_protocol(struct sk_buff *skb, __be16 type, + int *depth) { - __be16 protocol = 0; - - if (vlan_tx_tag_present(skb) || - skb->protocol != cpu_to_be16(ETH_P_8021Q)) - protocol = skb->protocol; - else { - __be16 proto, *protop; - protop = skb_header_pointer(skb, offsetof(struct vlan_ethhdr, - h_vlan_encapsulated_proto), - sizeof(proto), &proto); - if (likely(protop)) - protocol = *protop; + unsigned int vlan_depth = skb->mac_len; + + /* if type is 802.1Q/AD then the header should already be + * present at mac_len - VLAN_HLEN (if mac_len > 0), or at + * ETH_HLEN otherwise + */ + if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) { + if (vlan_depth) { + if (WARN_ON(vlan_depth < VLAN_HLEN)) + return 0; + vlan_depth -= VLAN_HLEN; + } else { + vlan_depth = ETH_HLEN; + } + do { + struct vlan_hdr *vh; + + if (unlikely(!pskb_may_pull(skb, + vlan_depth + VLAN_HLEN))) + return 0; + + vh = (struct vlan_hdr *)(skb->data + vlan_depth); + type = vh->h_vlan_encapsulated_proto; + vlan_depth += VLAN_HLEN; + } while (type == htons(ETH_P_8021Q) || + type == htons(ETH_P_8021AD)); } - return protocol; + if (depth) + *depth = vlan_depth; + + return type; +} + +/** + * vlan_get_protocol - get protocol EtherType. + * @skb: skbuff to query + * + * Returns the EtherType of the packet, regardless of whether it is + * vlan encapsulated (normal or hardware accelerated) or not. + */ +static inline __be16 vlan_get_protocol(struct sk_buff *skb) +{ + return __vlan_get_protocol(skb, skb->protocol, NULL); } static inline void vlan_set_encap_proto(struct sk_buff *skb, diff --git a/net/core/dev.c b/net/core/dev.c index 171420e75b03..c87a2264a02b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2352,7 +2352,6 @@ EXPORT_SYMBOL(skb_checksum_help); __be16 skb_network_protocol(struct sk_buff *skb, int *depth) { - unsigned int vlan_depth = skb->mac_len; __be16 type = skb->protocol; /* Tunnel gso handlers can set protocol to ethernet.
*/ @@ -2366,35 +2365,7 @@ __be16 skb_network_protocol(struct sk_buff *skb, int *depth) type = eth->h_proto; } - /* if skb->protocol is 802.1Q/AD then the header should already be - * present at mac_len - VLAN_HLEN (if mac_len > 0), or at - * ETH_HLEN otherwise - */ - if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) { - if (vlan_depth) { - if (WARN_ON(vlan_depth < VLAN_HLEN)) - return 0; - vlan_depth -= VLAN_HLEN; - } else { - vlan_depth = ETH_HLEN; - } - do { - struct vlan_hdr *vh; - - if (unlikely(!pskb_may_pull(skb, - vlan_depth + VLAN_HLEN))) - return 0; - - vh = (struct vlan_hdr *)(skb->data + vlan_depth); - type = vh->h_vlan_encapsulated_proto; - vlan_depth += VLAN_HLEN; - } while (type == htons(ETH_P_8021Q) || - type == htons(ETH_P_8021AD)); - } - - *depth = vlan_depth; - - return type; + return __vlan_get_protocol(skb, type, depth); } /** -- cgit v1.2.3 From 074f9dd55f9cab1b82690ed7e44bcf38b9616ce0 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 29 Jan 2015 15:05:04 -0500 Subject: USB: add flag for HCDs that can't receive wakeup requests (isp1760-hcd) Currently the USB stack assumes that all host controller drivers are capable of receiving wakeup requests from downstream devices. However, this isn't true for the isp1760-hcd driver, which means that it isn't safe to do a runtime suspend of any device attached to a root-hub port if the device requires wakeup. This patch adds a "cant_recv_wakeups" flag to the usb_hcd structure and sets the flag in isp1760-hcd. The core is modified to prevent a direct child of the root hub from being put into runtime suspend with wakeup enabled if the flag is set. Signed-off-by: Alan Stern Tested-by: Nicolas Pitre CC: Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/driver.c | 12 ++++++++++++ drivers/usb/host/isp1760-hcd.c | 3 +++ include/linux/usb/hcd.h | 2 ++ 3 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c index c76ec9758ce3..818369afff63 100644 --- a/drivers/usb/core/driver.c +++ b/drivers/usb/core/driver.c @@ -1780,6 +1780,18 @@ static int autosuspend_check(struct usb_device *udev) dev_dbg(&udev->dev, "remote wakeup needed for autosuspend\n"); return -EOPNOTSUPP; } + + /* + * If the device is a direct child of the root hub and the HCD + * doesn't handle wakeup requests, don't allow autosuspend when + * wakeup is needed. 
+ */ + if (w && udev->parent == udev->bus->root_hub && + bus_to_hcd(udev->bus)->cant_recv_wakeups) { + dev_dbg(&udev->dev, "HCD doesn't handle wakeup requests\n"); + return -EOPNOTSUPP; + } + udev->do_remote_wakeup = w; return 0; } diff --git a/drivers/usb/host/isp1760-hcd.c b/drivers/usb/host/isp1760-hcd.c index dbba455884f4..cecf39a220e7 100644 --- a/drivers/usb/host/isp1760-hcd.c +++ b/drivers/usb/host/isp1760-hcd.c @@ -2245,6 +2245,9 @@ struct usb_hcd *isp1760_register(phys_addr_t res_start, resource_size_t res_len, hcd->rsrc_start = res_start; hcd->rsrc_len = res_len; + /* This driver doesn't support wakeup requests */ + hcd->cant_recv_wakeups = 1; + ret = usb_add_hcd(hcd, irq, irqflags); if (ret) goto err_unmap; diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h index 8968f616e414..68b1e836dff1 100644 --- a/include/linux/usb/hcd.h +++ b/include/linux/usb/hcd.h @@ -146,6 +146,8 @@ struct usb_hcd { unsigned amd_resume_bug:1; /* AMD remote wakeup quirk */ unsigned can_do_streams:1; /* HC supports streams */ unsigned tpl_support:1; /* OTG & EH TPL support */ + unsigned cant_recv_wakeups:1; + /* wakeup requests from downstream aren't received */ unsigned int irq; /* irq allocated */ void __iomem *regs; /* device memory/io */ -- cgit v1.2.3 From 448c7f3830ca283e485aa943279acea6bde8b270 Mon Sep 17 00:00:00 2001 From: Henrik Rydberg Date: Sun, 1 Feb 2015 11:25:14 -0800 Subject: Input: MT - add support for balanced slot assignment Some devices are not fast enough to differentiate between a fast-moving contact and a new contact. This problem cannot be fully resolved because information is truly missing, but it is possible to safe-guard against obvious mistakes by restricting movement with a maximum displacement. The new problem formulation for dmax > 0 cannot benefit from the speedup for positive definite matrices, but since the convergence is faster, the result is about the same. For a handful of contacts, the latency difference is truly negligible. 
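As an illustration of the new argument, a minimal hypothetical driver fragment — not from this patch; FOO_MAX_CONTACTS and the dmax value of 64 device units are assumptions, and the device is presumed to have been initialized with input_mt_init_slots() and the INPUT_MT_TRACK flag so that slot tracking is enabled:

/* Report n contacts, letting the core balance slot assignment so that no
 * tracked contact is assumed to have moved farther than dmax units between
 * frames; dmax == 0 keeps the old unrestricted behaviour.
 */
static void foo_report_contacts(struct input_dev *dev,
				const struct input_mt_pos *pos, int n)
{
	int slots[FOO_MAX_CONTACTS];	/* FOO_MAX_CONTACTS is an assumption */
	int i;

	/* dmax = 64: assumed per-frame displacement limit in device units */
	input_mt_assign_slots(dev, slots, pos, n, 64);

	for (i = 0; i < n; i++) {
		input_mt_slot(dev, slots[i]);
		input_mt_report_slot_state(dev, MT_TOOL_FINGER, true);
		input_report_abs(dev, ABS_MT_POSITION_X, pos[i].x);
		input_report_abs(dev, ABS_MT_POSITION_Y, pos[i].y);
	}

	input_mt_sync_frame(dev);
	input_sync(dev);
}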
Suggested-by: Benjamin Tissoires Tested-by: Benjamin Tissoires Signed-off-by: Henrik Rydberg Signed-off-by: Dmitry Torokhov --- drivers/input/input-mt.c | 31 ++++++++++++++++++++----------- drivers/input/mouse/alps.c | 2 +- drivers/input/mouse/bcm5974.c | 2 +- drivers/input/mouse/cypress_ps2.c | 2 +- drivers/input/mouse/synaptics.c | 2 +- drivers/input/touchscreen/pixcir_i2c_ts.c | 2 +- include/linux/input/mt.h | 3 ++- 7 files changed, 27 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/input/input-mt.c b/drivers/input/input-mt.c index fbe29fcb15c5..cb150a1dbaff 100644 --- a/drivers/input/input-mt.c +++ b/drivers/input/input-mt.c @@ -293,7 +293,7 @@ void input_mt_sync_frame(struct input_dev *dev) } EXPORT_SYMBOL(input_mt_sync_frame); -static int adjust_dual(int *begin, int step, int *end, int eq) +static int adjust_dual(int *begin, int step, int *end, int eq, int mu) { int f, *p, s, c; @@ -311,9 +311,10 @@ static int adjust_dual(int *begin, int step, int *end, int eq) s = *p; c = (f + s + 1) / 2; - if (c == 0 || (c > 0 && !eq)) + if (c == 0 || (c > mu && (!eq || mu > 0))) return 0; - if (s < 0) + /* Improve convergence for positive matrices by penalizing overcovers */ + if (s < 0 && mu <= 0) c *= 2; for (p = begin; p != end; p += step) @@ -322,23 +323,24 @@ static int adjust_dual(int *begin, int step, int *end, int eq) return (c < s && s <= 0) || (f >= 0 && f < c); } -static void find_reduced_matrix(int *w, int nr, int nc, int nrc) +static void find_reduced_matrix(int *w, int nr, int nc, int nrc, int mu) { int i, k, sum; for (k = 0; k < nrc; k++) { for (i = 0; i < nr; i++) - adjust_dual(w + i, nr, w + i + nrc, nr <= nc); + adjust_dual(w + i, nr, w + i + nrc, nr <= nc, mu); sum = 0; for (i = 0; i < nrc; i += nr) - sum += adjust_dual(w + i, 1, w + i + nr, nc <= nr); + sum += adjust_dual(w + i, 1, w + i + nr, nc <= nr, mu); if (!sum) break; } } static int input_mt_set_matrix(struct input_mt *mt, - const struct input_mt_pos *pos, int num_pos) + const struct input_mt_pos *pos, int num_pos, + int mu) { const struct input_mt_pos *p; struct input_mt_slot *s; @@ -352,7 +354,7 @@ static int input_mt_set_matrix(struct input_mt *mt, y = input_mt_get_value(s, ABS_MT_POSITION_Y); for (p = pos; p != pos + num_pos; p++) { int dx = x - p->x, dy = y - p->y; - *w++ = dx * dx + dy * dy; + *w++ = dx * dx + dy * dy - mu; } } @@ -393,17 +395,24 @@ static void input_mt_set_slots(struct input_mt *mt, * @slots: the slot assignment to be filled * @pos: the position array to match * @num_pos: number of positions + * @dmax: maximum ABS_MT_POSITION displacement (zero for infinite) * * Performs a best match against the current contacts and returns * the slot assignment list. New contacts are assigned to unused * slots. * + * The assignments are balanced so that all coordinate displacements are + * below the euclidian distance dmax. If no such assignment can be found, + * some contacts are assigned to unused slots. + * * Returns zero on success, or negative error in case of failure. 
*/ int input_mt_assign_slots(struct input_dev *dev, int *slots, - const struct input_mt_pos *pos, int num_pos) + const struct input_mt_pos *pos, int num_pos, + int dmax) { struct input_mt *mt = dev->mt; + int mu = 2 * dmax * dmax; int nrc; if (!mt || !mt->red) @@ -413,8 +422,8 @@ int input_mt_assign_slots(struct input_dev *dev, int *slots, if (num_pos < 1) return 0; - nrc = input_mt_set_matrix(mt, pos, num_pos); - find_reduced_matrix(mt->red, num_pos, nrc / num_pos, nrc); + nrc = input_mt_set_matrix(mt, pos, num_pos, mu); + find_reduced_matrix(mt->red, num_pos, nrc / num_pos, nrc, mu); input_mt_set_slots(mt, slots, num_pos); return 0; diff --git a/drivers/input/mouse/alps.c b/drivers/input/mouse/alps.c index f719f28d370c..f205b8be2ce4 100644 --- a/drivers/input/mouse/alps.c +++ b/drivers/input/mouse/alps.c @@ -435,7 +435,7 @@ static void alps_report_mt_data(struct psmouse *psmouse, int n) struct alps_fields *f = &priv->f; int i, slot[MAX_TOUCHES]; - input_mt_assign_slots(dev, slot, f->mt, n); + input_mt_assign_slots(dev, slot, f->mt, n, 0); for (i = 0; i < n; i++) alps_set_slot(dev, slot[i], f->mt[i].x, f->mt[i].y); diff --git a/drivers/input/mouse/bcm5974.c b/drivers/input/mouse/bcm5974.c index c329cdb0b91a..b10709f04615 100644 --- a/drivers/input/mouse/bcm5974.c +++ b/drivers/input/mouse/bcm5974.c @@ -564,7 +564,7 @@ static int report_tp_state(struct bcm5974 *dev, int size) dev->index[n++] = &f[i]; } - input_mt_assign_slots(input, dev->slots, dev->pos, n); + input_mt_assign_slots(input, dev->slots, dev->pos, n, 0); for (i = 0; i < n; i++) report_finger_data(input, dev->slots[i], diff --git a/drivers/input/mouse/cypress_ps2.c b/drivers/input/mouse/cypress_ps2.c index 8af34ffe208b..9118a1861a45 100644 --- a/drivers/input/mouse/cypress_ps2.c +++ b/drivers/input/mouse/cypress_ps2.c @@ -538,7 +538,7 @@ static void cypress_process_packet(struct psmouse *psmouse, bool zero_pkt) pos[i].y = contact->y; } - input_mt_assign_slots(input, slots, pos, n); + input_mt_assign_slots(input, slots, pos, n, 0); for (i = 0; i < n; i++) { contact = &report_data.contacts[i]; diff --git a/drivers/input/mouse/synaptics.c b/drivers/input/mouse/synaptics.c index f89de8971bf8..a3692e3b7cab 100644 --- a/drivers/input/mouse/synaptics.c +++ b/drivers/input/mouse/synaptics.c @@ -809,7 +809,7 @@ static void synaptics_report_mt_data(struct psmouse *psmouse, pos[i].y = synaptics_invert_y(hw[i]->y); } - input_mt_assign_slots(dev, slot, pos, nsemi); + input_mt_assign_slots(dev, slot, pos, nsemi, 0); for (i = 0; i < nsemi; i++) { input_mt_slot(dev, slot[i]); diff --git a/drivers/input/touchscreen/pixcir_i2c_ts.c b/drivers/input/touchscreen/pixcir_i2c_ts.c index 4fb5537fdd42..2c2107147319 100644 --- a/drivers/input/touchscreen/pixcir_i2c_ts.c +++ b/drivers/input/touchscreen/pixcir_i2c_ts.c @@ -126,7 +126,7 @@ static void pixcir_ts_report(struct pixcir_i2c_ts_data *ts, pos[i].y = touch->y; } - input_mt_assign_slots(ts->input, slots, pos, n); + input_mt_assign_slots(ts->input, slots, pos, n, 0); } for (i = 0; i < n; i++) { diff --git a/include/linux/input/mt.h b/include/linux/input/mt.h index f583ff639776..d7188de4db96 100644 --- a/include/linux/input/mt.h +++ b/include/linux/input/mt.h @@ -119,7 +119,8 @@ struct input_mt_pos { }; int input_mt_assign_slots(struct input_dev *dev, int *slots, - const struct input_mt_pos *pos, int num_pos); + const struct input_mt_pos *pos, int num_pos, + int dmax); int input_mt_get_slot_by_key(struct input_dev *dev, int key); -- cgit v1.2.3 From 00845eb968ead28007338b2bb852b8beef816583 Mon Sep 17 
00:00:00 2001 From: Linus Torvalds Date: Sun, 1 Feb 2015 12:23:32 -0800 Subject: sched: don't cause task state changes in nested sleep debugging MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 8eb23b9f35aa ("sched: Debug nested sleeps") added code to report on nested sleep conditions, which we generally want to avoid because the inner sleeping operation can re-set the thread state to TASK_RUNNING, but that will then cause the outer sleep loop not actually sleep when it calls schedule. However, that's actually valid traditional behavior, with the inner sleep being some fairly rare case (like taking a sleeping lock that normally doesn't actually need to sleep). And the debug code would actually change the state of the task to TASK_RUNNING internally, which makes that kind of traditional and working code not work at all, because now the nested sleep doesn't just sometimes cause the outer one to not block, but will cause it to happen every time. In particular, it will cause the cardbus kernel daemon (pccardd) to basically busy-loop doing scheduling, converting a laptop into a heater, as reported by Bruno Prémont. But there may be other legacy uses of that nested sleep model in other drivers that are also likely to never get converted to the new model. This fixes both cases: - don't set TASK_RUNNING when the nested condition happens (note: even if WARN_ONCE() only _warns_ once, the return value isn't whether the warning happened, but whether the condition for the warning was true. So despite the warning only happening once, the "if (WARN_ON(..))" would trigger for every nested sleep. - in the cases where we knowingly disable the warning by using "sched_annotate_sleep()", don't change the task state (that is used for all core scheduling decisions), instead use '->task_state_change' that is used for the debugging decision itself. (Credit for the second part of the fix goes to Oleg Nesterov: "Can't we avoid this subtle change in behaviour DEBUG_ATOMIC_SLEEP adds?" with the suggested change to use 'task_state_change' as part of the test) Reported-and-bisected-by: Bruno Prémont Tested-by: Rafael J Wysocki Acked-by: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner , Cc: Ilya Dryomov , Cc: Mike Galbraith Cc: Ingo Molnar Cc: Peter Hurley , Cc: Davidlohr Bueso , Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 2 +- kernel/sched/core.c | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 5449d2f4a1ef..64ce58bee6f5 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -176,7 +176,7 @@ extern int _cond_resched(void); */ # define might_sleep() \ do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) -# define sched_annotate_sleep() __set_current_state(TASK_RUNNING) +# define sched_annotate_sleep() (current->task_state_change = 0) #else static inline void ___might_sleep(const char *file, int line, int preempt_offset) { } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c0accc00566e..e628cb11b560 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7292,13 +7292,12 @@ void __might_sleep(const char *file, int line, int preempt_offset) * since we will exit with TASK_RUNNING make sure we enter with it, * otherwise we will destroy state. 
*/ - if (WARN_ONCE(current->state != TASK_RUNNING, + WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, "do not call blocking ops when !TASK_RUNNING; " "state=%lx set at [<%p>] %pS\n", current->state, (void *)current->task_state_change, - (void *)current->task_state_change)) - __set_current_state(TASK_RUNNING); + (void *)current->task_state_change); ___might_sleep(file, line, preempt_offset); } -- cgit v1.2.3 From fd979c0132074856975a6e79bc2226b99435ec5b Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Fri, 30 Jan 2015 13:45:57 -0800 Subject: perf: provide sysfs_show for struct perf_pmu_events_attr (struct perf_pmu_events_attr) is defined in include/linux/perf_event.h, but the only "show" for it is in x86 and contains x86-specific stuff. Make a generic one for those of us who are just using the event_str. Signed-off-by: Cody P Schafer Signed-off-by: Sukadev Bhattiprolu Acked-by: Jiri Olsa Signed-off-by: Michael Ellerman --- include/linux/perf_event.h | 3 +++ kernel/events/core.c | 12 ++++++++++++ 2 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 486e84ccb1f9..58f59bdb590b 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -897,6 +897,9 @@ struct perf_pmu_events_attr { const char *event_str; }; +ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr, + char *page); + #define PMU_EVENT_ATTR(_name, _var, _id, _show) \ static struct perf_pmu_events_attr _var = { \ .attr = __ATTR(_name, 0444, _show, NULL), \ diff --git a/kernel/events/core.c b/kernel/events/core.c index 4c1ee7f2bebc..934687f8d51b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8276,6 +8276,18 @@ void __init perf_event_init(void) != 1024); } +ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr, + char *page) +{ + struct perf_pmu_events_attr *pmu_attr = + container_of(attr, struct perf_pmu_events_attr, attr); + + if (pmu_attr->event_str) + return sprintf(page, "%s\n", pmu_attr->event_str); + + return 0; +} + static int __init perf_event_sysfs_init(void) { struct pmu *pmu; -- cgit v1.2.3 From f0405b816149665393cc62b9e5082fc2d79714df Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Fri, 30 Jan 2015 13:45:58 -0800 Subject: perf: add PMU_EVENT_ATTR_STRING() helper Helper for constructing static struct perf_pmu_events_attr instances. Signed-off-by: Cody P Schafer Signed-off-by: Sukadev Bhattiprolu Acked-by: Jiri Olsa Signed-off-by: Michael Ellerman --- include/linux/perf_event.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 58f59bdb590b..1d3631448b91 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -906,6 +906,13 @@ static struct perf_pmu_events_attr _var = { \ .id = _id, \ }; +#define PMU_EVENT_ATTR_STRING(_name, _var, _str) \ +static struct perf_pmu_events_attr _var = { \ + .attr = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \ + .id = 0, \ + .event_str = _str, \ +}; + #define PMU_FORMAT_ATTR(_name, _format) \ static ssize_t \ _name##_show(struct device *dev, \ -- cgit v1.2.3
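A short sketch of the PMU_EVENT_ATTR_STRING() helper in use — hypothetical PMU driver code; the event name, the "event=0x3c,umask=0x01" encoding, and the foo_* identifiers are invented for the example:

/* One line declares the attribute; its sysfs "show" is the generic
 * perf_event_sysfs_show(), which just prints the event string.
 */
PMU_EVENT_ATTR_STRING(mem-loads, evattr_mem_loads, "event=0x3c,umask=0x01");

static struct attribute *foo_events_attrs[] = {
	&evattr_mem_loads.attr.attr,
	NULL,
};

static struct attribute_group foo_pmu_events_group = {
	.name = "events",	/* appears as .../<pmu>/events/mem-loads */
	.attrs = foo_events_attrs,
};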
Logical devices like bridges, bonds and vxlans can inherit this flag from their slaves/ports. The patch also adds the flag to NETIF_F_ONE_FOR_ALL, so that it gets propagated to the upper devices (bridges and bonds). Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/linux/netdev_features.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 8e30685affeb..7d59dc6ab789 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -66,6 +66,7 @@ enum { NETIF_F_HW_VLAN_STAG_FILTER_BIT,/* Receive filtering on VLAN STAGs */ NETIF_F_HW_L2FW_DOFFLOAD_BIT, /* Allow L2 Forwarding in Hardware */ NETIF_F_BUSY_POLL_BIT, /* Busy poll */ + NETIF_F_HW_SWITCH_OFFLOAD_BIT, /* HW switch offload */ /* * Add your fresh new feature above and remember to update @@ -124,6 +125,7 @@ enum { #define NETIF_F_HW_VLAN_STAG_TX __NETIF_F(HW_VLAN_STAG_TX) #define NETIF_F_HW_L2FW_DOFFLOAD __NETIF_F(HW_L2FW_DOFFLOAD) #define NETIF_F_BUSY_POLL __NETIF_F(BUSY_POLL) +#define NETIF_F_HW_SWITCH_OFFLOAD __NETIF_F(HW_SWITCH_OFFLOAD) /* Features valid for ethtool to change */ /* = all defined minus driver/device-class-related */ @@ -159,7 +161,9 @@ enum { */ #define NETIF_F_ONE_FOR_ALL (NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ROBUST | \ NETIF_F_SG | NETIF_F_HIGHDMA | \ - NETIF_F_FRAGLIST | NETIF_F_VLAN_CHALLENGED) + NETIF_F_FRAGLIST | NETIF_F_VLAN_CHALLENGED | \ + NETIF_F_HW_SWITCH_OFFLOAD) + /* * If one device doesn't support one of these features, then disable it * for all in netdev_increment_features. -- cgit v1.2.3 From add511b38266aa10c1079f9248854e6a415c4dc2 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Thu, 29 Jan 2015 22:40:12 -0800 Subject: bridge: add flags argument to ndo_bridge_setlink and ndo_bridge_dellink Bridge flags are needed inside ndo_bridge_setlink/dellink handlers to avoid another call to parse IFLA_AF_SPEC inside these handlers. This is used later in this series. Signed-off-by: Roopa Prabhu Signed-off-by: David S.
Miller --- drivers/net/ethernet/emulex/benet/be_main.c | 3 ++- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 2 +- drivers/net/ethernet/rocker/rocker.c | 2 +- include/linux/netdevice.h | 6 ++++-- net/bridge/br_netlink.c | 4 ++-- net/bridge/br_private.h | 4 ++-- net/core/rtnetlink.c | 10 ++++++---- 7 files changed, 18 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 598c5070c629..efed92c7b731 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -4327,7 +4327,8 @@ fw_exit: return status; } -static int be_ndo_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh) +static int be_ndo_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh, + u16 flags) { struct be_adapter *adapter = netdev_priv(dev); struct nlattr *attr, *br_spec; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 7bb421bfd84e..e4086fea4be2 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -7786,7 +7786,7 @@ static int ixgbe_ndo_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], } static int ixgbe_ndo_bridge_setlink(struct net_device *dev, - struct nlmsghdr *nlh) + struct nlmsghdr *nlh, u16 flags) { struct ixgbe_adapter *adapter = netdev_priv(dev); struct nlattr *attr, *br_spec; diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c index 11f4ffcc113d..f0d607ca5e73 100644 --- a/drivers/net/ethernet/rocker/rocker.c +++ b/drivers/net/ethernet/rocker/rocker.c @@ -3722,7 +3722,7 @@ skip: } static int rocker_port_bridge_setlink(struct net_device *dev, - struct nlmsghdr *nlh) + struct nlmsghdr *nlh, u16 flags) { struct rocker_port *rocker_port = netdev_priv(dev); struct nlattr *protinfo; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3d37c6eb1732..16251e96e6aa 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1154,13 +1154,15 @@ struct net_device_ops { int idx); int (*ndo_bridge_setlink)(struct net_device *dev, - struct nlmsghdr *nlh); + struct nlmsghdr *nlh, + u16 flags); int (*ndo_bridge_getlink)(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, u32 filter_mask); int (*ndo_bridge_dellink)(struct net_device *dev, - struct nlmsghdr *nlh); + struct nlmsghdr *nlh, + u16 flags); int (*ndo_change_carrier)(struct net_device *dev, bool new_carrier); int (*ndo_get_phys_port_id)(struct net_device *dev, diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index e08b260f33fe..088e80203845 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -494,7 +494,7 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) } /* Change state and parameters on port. 
*/ -int br_setlink(struct net_device *dev, struct nlmsghdr *nlh) +int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) { struct nlattr *protinfo; struct nlattr *afspec; @@ -550,7 +550,7 @@ out: } /* Delete port information */ -int br_dellink(struct net_device *dev, struct nlmsghdr *nlh) +int br_dellink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) { struct nlattr *afspec; struct net_bridge_port *p; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index e8e3f3681680..de0919975a25 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -819,8 +819,8 @@ extern struct rtnl_link_ops br_link_ops; int br_netlink_init(void); void br_netlink_fini(void); void br_ifinfo_notify(int event, struct net_bridge_port *port); -int br_setlink(struct net_device *dev, struct nlmsghdr *nlmsg); -int br_dellink(struct net_device *dev, struct nlmsghdr *nlmsg); +int br_setlink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags); +int br_dellink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags); int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, u32 filter_mask); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index fedd7ab4085a..673cb4c6f391 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2991,7 +2991,7 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh) goto out; } - err = br_dev->netdev_ops->ndo_bridge_setlink(dev, nlh); + err = br_dev->netdev_ops->ndo_bridge_setlink(dev, nlh, flags); if (err) goto out; @@ -3002,7 +3002,8 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh) if (!dev->netdev_ops->ndo_bridge_setlink) err = -EOPNOTSUPP; else - err = dev->netdev_ops->ndo_bridge_setlink(dev, nlh); + err = dev->netdev_ops->ndo_bridge_setlink(dev, nlh, + flags); if (!err) { flags &= ~BRIDGE_FLAGS_SELF; @@ -3064,7 +3065,7 @@ static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh) goto out; } - err = br_dev->netdev_ops->ndo_bridge_dellink(dev, nlh); + err = br_dev->netdev_ops->ndo_bridge_dellink(dev, nlh, flags); if (err) goto out; @@ -3075,7 +3076,8 @@ static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh) if (!dev->netdev_ops->ndo_bridge_dellink) err = -EOPNOTSUPP; else - err = dev->netdev_ops->ndo_bridge_dellink(dev, nlh); + err = dev->netdev_ops->ndo_bridge_dellink(dev, nlh, + flags); if (!err) { flags &= ~BRIDGE_FLAGS_SELF; -- cgit v1.2.3 From d0a57bd5b53d6b7fe7a6c626023737436b5df630 Mon Sep 17 00:00:00 2001 From: Peter De Schrijver Date: Tue, 16 Dec 2014 12:38:27 -0800 Subject: clk: tegra: make tegra_clocks_apply_init_table() arch_initcall tegra_clocks_apply_init_table() needs to be called after the udelay loop has been calibrated (see commit 441f199a37cfd66c5dd8dd45490bd3ea6971117d ("clk: tegra: defer application of init table") for why that is). On existing Tegra SoCs this was done by calling tegra_clocks_apply_init_table() from tegra_dt_init(). To make this also work on ARM64, we need to change this into an initcall. tegra_dt_init() is called from customize_machine which is an arch_initcall. Therefore this should also work on existing 32bit Tegra SoCs. Tested on Tegra20 (ventana), Tegra30 (beaverboard), Tegra124 (jetson TK1) and Tegra132. 
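
A minimal sketch of the initcall ordering this commit relies on (the function name is hypothetical): arch_initcalls run after the udelay loop has been calibrated, which is exactly the ordering needed here on both 32-bit and 64-bit Tegra.

#include <linux/init.h>

/* Hypothetical example: registered as an arch_initcall, this runs
 * after calibrate_delay(), i.e. late enough for udelay()-dependent
 * clock initialization.
 */
static int __init demo_clocks_late_init(void)
{
	/* delay-sensitive init-table application would go here */
	return 0;
}
arch_initcall(demo_clocks_late_init);
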
Signed-off-by: Peter De Schrijver [paul@pwsan.com: tweaked the commit message] Signed-off-by: Paul Walmsley Cc: Thierry Reding Cc: Prashant Gaikwad Cc: Mike Turquette Cc: Stephen Boyd Cc: Stephen Warren Cc: Alexandre Courbot --- arch/arm/mach-tegra/tegra.c | 2 -- drivers/clk/tegra/clk.c | 7 +++++-- include/linux/clk/tegra.h | 2 -- 3 files changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-tegra/tegra.c b/arch/arm/mach-tegra/tegra.c index ef016af1c9e7..914341bcef25 100644 --- a/arch/arm/mach-tegra/tegra.c +++ b/arch/arm/mach-tegra/tegra.c @@ -91,8 +91,6 @@ static void __init tegra_dt_init(void) struct soc_device *soc_dev; struct device *parent = NULL; - tegra_clocks_apply_init_table(); - soc_dev_attr = kzalloc(sizeof(*soc_dev_attr), GFP_KERNEL); if (!soc_dev_attr) goto out; diff --git a/drivers/clk/tegra/clk.c b/drivers/clk/tegra/clk.c index 97dc8595c3cd..9ddb7547cb43 100644 --- a/drivers/clk/tegra/clk.c +++ b/drivers/clk/tegra/clk.c @@ -302,10 +302,13 @@ struct clk ** __init tegra_lookup_dt_id(int clk_id, tegra_clk_apply_init_table_func tegra_clk_apply_init_table; -void __init tegra_clocks_apply_init_table(void) +static int __init tegra_clocks_apply_init_table(void) { if (!tegra_clk_apply_init_table) - return; + return 0; tegra_clk_apply_init_table(); + + return 0; } +arch_initcall(tegra_clocks_apply_init_table); diff --git a/include/linux/clk/tegra.h b/include/linux/clk/tegra.h index 3ca9fca827a2..19c4208f4752 100644 --- a/include/linux/clk/tegra.h +++ b/include/linux/clk/tegra.h @@ -120,6 +120,4 @@ static inline void tegra_cpu_clock_resume(void) } #endif -void tegra_clocks_apply_init_table(void); - #endif /* __LINUX_CLK_TEGRA_H_ */ -- cgit v1.2.3 From 15d0f5ea348b9c4e6d41df294dde38a56a39c7bf Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 2 Feb 2015 10:07:59 -0700 Subject: Make super_blocks and sb_lock static The only user outside of fs/super.c is gone now. Signed-off-by: Al Viro Acked-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/super.c | 4 ++-- include/linux/fs.h | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/super.c b/fs/super.c index 3b4dadafdd60..05a021638b11 100644 --- a/fs/super.c +++ b/fs/super.c @@ -36,8 +36,8 @@ #include "internal.h" -LIST_HEAD(super_blocks); -DEFINE_SPINLOCK(sb_lock); +static LIST_HEAD(super_blocks); +static DEFINE_SPINLOCK(sb_lock); static char *sb_writers_name[SB_FREEZE_LEVELS] = { "sb_writers", diff --git a/include/linux/fs.h b/include/linux/fs.h index 65d02de342e1..2f717baefdf8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1183,8 +1183,6 @@ struct mm_struct; #define UMOUNT_NOFOLLOW 0x00000008 /* Don't follow symlink on umount */ #define UMOUNT_UNUSED 0x80000000 /* Flag guaranteed to be unused */ -extern struct list_head super_blocks; -extern spinlock_t sb_lock; /* Possible states of 'frozen' field */ enum { -- cgit v1.2.3 From 6cae0a4648c0db2a74efb816cd2ce84390c90480 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 16 Aug 2014 13:31:51 +0200 Subject: nfs: add LAYOUT_TYPE_MAX enum value This gives us a nice upper bound for later use in nfsd.
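
A sketch of what a sentinel value like LAYOUT_TYPE_MAX enables: sizing a dispatch table and bounds-checking layout types from the wire. The real nfsd4_layout_ops table appears later in this series; the helper below is hypothetical.

/* Hypothetical validation helper: valid layout types are 1..MAX-1,
 * so the sentinel both sizes the table and bounds the index.
 */
static const void *demo_layout_ops[LAYOUT_TYPE_MAX];

static bool demo_layout_type_valid(u32 type)
{
	return type > 0 && type < LAYOUT_TYPE_MAX && demo_layout_ops[type];
}
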
Signed-off-by: Christoph Hellwig --- include/linux/nfs4.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 022b761dbf0a..8a3589c2542c 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -516,6 +516,7 @@ enum pnfs_layouttype { LAYOUT_NFSV4_1_FILES = 1, LAYOUT_OSD2_OBJECTS = 2, LAYOUT_BLOCK_VOLUME = 3, + LAYOUT_TYPE_MAX }; /* used for both layout return and recall */ -- cgit v1.2.3 From 11afe9f76e121e960445deee5b7f26f0787a1990 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 21 Jan 2015 19:17:03 +0100 Subject: fs: add FL_LAYOUT lease type This (ab-)uses the file locking code to allow filesystems to recall outstanding pNFS layouts on a file. This new lease type is similar but not quite the same as FL_DELEG. An FL_LAYOUT lease can always be granted, and a per-filesystem lock (XFS iolock for the initial implementation) ensures no FL_LAYOUT leases are granted when we would need to recall them. Also included are changes that allow multiple outstanding read leases of different types on the same file as long as they have a different owner. This wasn't a problem until now as nfsd never set FL_LEASE leases, and no one else used FL_DELEG leases, but given that nfsd will also issue FL_LAYOUT leases we will have to handle it now. Signed-off-by: Christoph Hellwig --- fs/locks.c | 14 ++++++++++---- include/linux/fs.h | 16 ++++++++++++++++ 2 files changed, 26 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/locks.c b/fs/locks.c index 22ac7694cc84..4753218f308e 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -137,7 +137,7 @@ #define IS_POSIX(fl) (fl->fl_flags & FL_POSIX) #define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) -#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG)) +#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT)) #define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK) static bool lease_breaking(struct file_lock *fl) @@ -1371,6 +1371,8 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose) static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker) { + if ((breaker->fl_flags & FL_LAYOUT) != (lease->fl_flags & FL_LAYOUT)) + return false; if ((breaker->fl_flags & FL_DELEG) && (lease->fl_flags & FL_LEASE)) return false; return locks_conflict(breaker, lease); @@ -1594,11 +1596,14 @@ int fcntl_getlease(struct file *filp) * conflict with the lease we're trying to set. */ static int -check_conflicting_open(const struct dentry *dentry, const long arg) +check_conflicting_open(const struct dentry *dentry, const long arg, int flags) { int ret = 0; struct inode *inode = dentry->d_inode; + if (flags & FL_LAYOUT) + return 0; + if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) return -EAGAIN; @@ -1647,7 +1652,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); - error = check_conflicting_open(dentry, arg); + error = check_conflicting_open(dentry, arg, lease->fl_flags); if (error) goto out; @@ -1703,7 +1708,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr * precedes these checks.
*/ smp_mb(); - error = check_conflicting_open(dentry, arg); + error = check_conflicting_open(dentry, arg, lease->fl_flags); if (error) { locks_unlink_lock_ctx(lease, &ctx->flc_lease_cnt); goto out; @@ -1787,6 +1792,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp, WARN_ON_ONCE(1); return -ENOLCK; } + return generic_add_lease(filp, arg, flp, priv); default: return -EINVAL; diff --git a/include/linux/fs.h b/include/linux/fs.h index ddd2fa7cefd3..84740145f835 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -875,6 +875,7 @@ static inline struct file *get_file(struct file *f) #define FL_DOWNGRADE_PENDING 256 /* Lease is being downgraded */ #define FL_UNLOCK_PENDING 512 /* Lease is being broken */ #define FL_OFDLCK 1024 /* lock is "owned" by struct file */ +#define FL_LAYOUT 2048 /* outstanding pNFS layout */ /* * Special return value from posix_lock_file() and vfs_lock_file() for @@ -2037,6 +2038,16 @@ static inline int break_deleg_wait(struct inode **delegated_inode) return ret; } +static inline int break_layout(struct inode *inode, bool wait) +{ + smp_mb(); + if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) + return __break_lease(inode, + wait ? O_WRONLY : O_WRONLY | O_NONBLOCK, + FL_LAYOUT); + return 0; +} + #else /* !CONFIG_FILE_LOCKING */ static inline int locks_mandatory_locked(struct file *file) { @@ -2092,6 +2103,11 @@ static inline int break_deleg_wait(struct inode **delegated_inode) return 0; } +static inline int break_layout(struct inode *inode, bool wait) +{ + return 0; +} + #endif /* CONFIG_FILE_LOCKING */ /* fs/open.c */ -- cgit v1.2.3 From 9cf514ccfacb301f3b1b4509a8ce25dffad55880 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 5 May 2014 13:11:59 +0200 Subject: nfsd: implement pNFS operations Add support for the GETDEVICEINFO, LAYOUTGET, LAYOUTCOMMIT and LAYOUTRETURN NFSv4.1 operations, as well as backing code to manage outstanding layouts and devices. Layout management is very straightforward, with an nfs4_layout_stateid structure that extends nfs4_stid to manage layout stateids as the top-level structure. It is linked into the nfs4_file and nfs4_client structures like the other stateids, and contains a linked list of layouts that hang off the stateid. The actual layout operations are implemented in layout drivers that are not part of this commit, but will be added later. The worst part of this commit is the management of the pNFS device IDs, which suffers from a specification that is not sanely implementable due to the fact that the device-IDs are global and not bound to an export, and have a small enough size so that we can't store the fsid portion of a file handle, and must never be reused. As we still do need to perform all export authentication and validation checks on a device ID passed to GETDEVICEINFO we are caught between a rock and a hard place. To work around this issue we add a new hash that maps from a 64-bit integer to an fsid so that we can look up the export to authenticate against it, a 32-bit integer as a generation that we can bump when changing the device, and a currently unused 32-bit integer that could be used in the future to handle more than a single device per export. Entries in this hash table are never deleted as we can't reuse the ids anyway, and would have a severe lifetime problem anyway as Linux export structures are temporary structures that can go away under load.
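
In sketch form, the indirection described above looks roughly like this; the struct name is hypothetical, and the fields mirror the nfsd4_deviceid_map and nfsd4_deviceid definitions that appear later in this commit.

#include <linux/list.h>
#include <linux/types.h>

/* Hedged sketch: the wire device ID carries only a 64-bit index plus a
 * 32-bit generation; the index resolves to an export fsid through a
 * hash table whose entries are never deleted, so indices are never
 * reused even though export structures come and go.
 */
struct demo_devid_map {
	struct list_head hash;	/* chained by hash(idx) */
	u64 idx;		/* handed out on the wire, never reused */
	int fsid_type;
	u32 fsid[];		/* key to find and authenticate the export */
};
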
Parts of the XDR data, structures and marshaling/unmarshaling code, as well as many concepts are derived from the old pNFS server implementation from Andy Adamson, Benny Halevy, Dean Hildebrand, Marc Eshel, Fred Isaman, Mike Sager, Ricardo Labiaga and many others. Signed-off-by: Christoph Hellwig --- fs/nfsd/Kconfig | 10 + fs/nfsd/Makefile | 1 + fs/nfsd/export.c | 8 + fs/nfsd/export.h | 2 + fs/nfsd/nfs4layouts.c | 487 +++++++++++++++++++++++++++++++++++++++ fs/nfsd/nfs4proc.c | 302 ++++++++++++++++++++++++ fs/nfsd/nfs4state.c | 16 +- fs/nfsd/nfs4xdr.c | 312 +++++++++++++++++++++++++ fs/nfsd/nfsctl.c | 9 +- fs/nfsd/nfsd.h | 16 +- fs/nfsd/pnfs.h | 80 +++++++ fs/nfsd/state.h | 21 ++ fs/nfsd/xdr4.h | 59 +++++ include/linux/nfs4.h | 1 + include/uapi/linux/nfsd/debug.h | 1 + include/uapi/linux/nfsd/export.h | 4 +- 16 files changed, 1324 insertions(+), 5 deletions(-) create mode 100644 fs/nfsd/nfs4layouts.c create mode 100644 fs/nfsd/pnfs.h (limited to 'include/linux') diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 73395156bdb4..683bf718aead 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -82,6 +82,16 @@ config NFSD_V4 If unsure, say N. +config NFSD_PNFS + bool "NFSv4.1 server support for Parallel NFS (pNFS)" + depends on NFSD_V4 + help + This option enables support for the parallel NFS features of the + minor version 1 of the NFSv4 protocol (RFC5661) in the kernel's NFS + server. + + If unsure, say N. + config NFSD_V4_SECURITY_LABEL bool "Provide Security Label support for NFSv4 server" depends on NFSD_V4 && SECURITY diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile index af32ef06b4fe..5806270a8567 100644 --- a/fs/nfsd/Makefile +++ b/fs/nfsd/Makefile @@ -12,3 +12,4 @@ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ nfs4acl.o nfs4callback.o nfs4recover.o +nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 30a739d896ff..c3e3b6e55ae2 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -20,6 +20,7 @@ #include "nfsd.h" #include "nfsfh.h" #include "netns.h" +#include "pnfs.h" #define NFSDDBG_FACILITY NFSDDBG_EXPORT @@ -545,6 +546,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) exp.ex_client = dom; exp.cd = cd; + exp.ex_devid_map = NULL; /* expiry */ err = -EINVAL; @@ -621,6 +623,8 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) if (!gid_valid(exp.ex_anon_gid)) goto out4; err = 0; + + nfsd4_setup_layout_type(&exp); } expp = svc_export_lookup(&exp); @@ -703,6 +707,7 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem) new->ex_fslocs.locations = NULL; new->ex_fslocs.locations_count = 0; new->ex_fslocs.migrated = 0; + new->ex_layout_type = 0; new->ex_uuid = NULL; new->cd = item->cd; } @@ -717,6 +722,8 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem) new->ex_anon_uid = item->ex_anon_uid; new->ex_anon_gid = item->ex_anon_gid; new->ex_fsid = item->ex_fsid; + new->ex_devid_map = item->ex_devid_map; + item->ex_devid_map = NULL; new->ex_uuid = item->ex_uuid; item->ex_uuid = NULL; new->ex_fslocs.locations = item->ex_fslocs.locations; @@ -725,6 +732,7 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem) item->ex_fslocs.locations_count = 0; new->ex_fslocs.migrated = item->ex_fslocs.migrated; item->ex_fslocs.migrated = 0; + new->ex_layout_type = item->ex_layout_type; new->ex_nflavors = 
item->ex_nflavors; for (i = 0; i < MAX_SECINFO_LIST; i++) { new->ex_flavors[i] = item->ex_flavors[i]; diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h index 04dc8c167b0c..1f52bfcc436f 100644 --- a/fs/nfsd/export.h +++ b/fs/nfsd/export.h @@ -56,6 +56,8 @@ struct svc_export { struct nfsd4_fs_locations ex_fslocs; uint32_t ex_nflavors; struct exp_flavor_info ex_flavors[MAX_SECINFO_LIST]; + enum pnfs_layouttype ex_layout_type; + struct nfsd4_deviceid_map *ex_devid_map; struct cache_detail *cd; }; diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c new file mode 100644 index 000000000000..8273270418b1 --- /dev/null +++ b/fs/nfsd/nfs4layouts.c @@ -0,0 +1,487 @@ +/* + * Copyright (c) 2014 Christoph Hellwig. + */ +#include +#include + +#include "pnfs.h" +#include "netns.h" + +#define NFSDDBG_FACILITY NFSDDBG_PNFS + +struct nfs4_layout { + struct list_head lo_perstate; + struct nfs4_layout_stateid *lo_state; + struct nfsd4_layout_seg lo_seg; +}; + +static struct kmem_cache *nfs4_layout_cache; +static struct kmem_cache *nfs4_layout_stateid_cache; + +const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = { +}; + +/* pNFS device ID to export fsid mapping */ +#define DEVID_HASH_BITS 8 +#define DEVID_HASH_SIZE (1 << DEVID_HASH_BITS) +#define DEVID_HASH_MASK (DEVID_HASH_SIZE - 1) +static u64 nfsd_devid_seq = 1; +static struct list_head nfsd_devid_hash[DEVID_HASH_SIZE]; +static DEFINE_SPINLOCK(nfsd_devid_lock); + +static inline u32 devid_hashfn(u64 idx) +{ + return jhash_2words(idx, idx >> 32, 0) & DEVID_HASH_MASK; +} + +static void +nfsd4_alloc_devid_map(const struct svc_fh *fhp) +{ + const struct knfsd_fh *fh = &fhp->fh_handle; + size_t fsid_len = key_len(fh->fh_fsid_type); + struct nfsd4_deviceid_map *map, *old; + int i; + + map = kzalloc(sizeof(*map) + fsid_len, GFP_KERNEL); + if (!map) + return; + + map->fsid_type = fh->fh_fsid_type; + memcpy(&map->fsid, fh->fh_fsid, fsid_len); + + spin_lock(&nfsd_devid_lock); + if (fhp->fh_export->ex_devid_map) + goto out_unlock; + + for (i = 0; i < DEVID_HASH_SIZE; i++) { + list_for_each_entry(old, &nfsd_devid_hash[i], hash) { + if (old->fsid_type != fh->fh_fsid_type) + continue; + if (memcmp(old->fsid, fh->fh_fsid, + key_len(old->fsid_type))) + continue; + + fhp->fh_export->ex_devid_map = old; + goto out_unlock; + } + } + + map->idx = nfsd_devid_seq++; + list_add_tail_rcu(&map->hash, &nfsd_devid_hash[devid_hashfn(map->idx)]); + fhp->fh_export->ex_devid_map = map; + map = NULL; + +out_unlock: + spin_unlock(&nfsd_devid_lock); + kfree(map); +} + +struct nfsd4_deviceid_map * +nfsd4_find_devid_map(int idx) +{ + struct nfsd4_deviceid_map *map, *ret = NULL; + + rcu_read_lock(); + list_for_each_entry_rcu(map, &nfsd_devid_hash[devid_hashfn(idx)], hash) + if (map->idx == idx) + ret = map; + rcu_read_unlock(); + + return ret; +} + +int +nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp, + u32 device_generation) +{ + if (!fhp->fh_export->ex_devid_map) { + nfsd4_alloc_devid_map(fhp); + if (!fhp->fh_export->ex_devid_map) + return -ENOMEM; + } + + id->fsid_idx = fhp->fh_export->ex_devid_map->idx; + id->generation = device_generation; + id->pad = 0; + return 0; +} + +void nfsd4_setup_layout_type(struct svc_export *exp) +{ + if (exp->ex_flags & NFSEXP_NOPNFS) + return; +} + +static void +nfsd4_free_layout_stateid(struct nfs4_stid *stid) +{ + struct nfs4_layout_stateid *ls = layoutstateid(stid); + struct nfs4_client *clp = ls->ls_stid.sc_client; + struct nfs4_file *fp = ls->ls_stid.sc_file; + + spin_lock(&clp->cl_lock); + 
list_del_init(&ls->ls_perclnt); + spin_unlock(&clp->cl_lock); + + spin_lock(&fp->fi_lock); + list_del_init(&ls->ls_perfile); + spin_unlock(&fp->fi_lock); + + kmem_cache_free(nfs4_layout_stateid_cache, ls); +} + +static struct nfs4_layout_stateid * +nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, + struct nfs4_stid *parent, u32 layout_type) +{ + struct nfs4_client *clp = cstate->clp; + struct nfs4_file *fp = parent->sc_file; + struct nfs4_layout_stateid *ls; + struct nfs4_stid *stp; + + stp = nfs4_alloc_stid(cstate->clp, nfs4_layout_stateid_cache); + if (!stp) + return NULL; + stp->sc_free = nfsd4_free_layout_stateid; + get_nfs4_file(fp); + stp->sc_file = fp; + + ls = layoutstateid(stp); + INIT_LIST_HEAD(&ls->ls_perclnt); + INIT_LIST_HEAD(&ls->ls_perfile); + spin_lock_init(&ls->ls_lock); + INIT_LIST_HEAD(&ls->ls_layouts); + ls->ls_layout_type = layout_type; + + spin_lock(&clp->cl_lock); + stp->sc_type = NFS4_LAYOUT_STID; + list_add(&ls->ls_perclnt, &clp->cl_lo_states); + spin_unlock(&clp->cl_lock); + + spin_lock(&fp->fi_lock); + list_add(&ls->ls_perfile, &fp->fi_lo_states); + spin_unlock(&fp->fi_lock); + + return ls; +} + +__be32 +nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, stateid_t *stateid, + bool create, u32 layout_type, struct nfs4_layout_stateid **lsp) +{ + struct nfs4_layout_stateid *ls; + struct nfs4_stid *stid; + unsigned char typemask = NFS4_LAYOUT_STID; + __be32 status; + + if (create) + typemask |= (NFS4_OPEN_STID | NFS4_LOCK_STID | NFS4_DELEG_STID); + + status = nfsd4_lookup_stateid(cstate, stateid, typemask, &stid, + net_generic(SVC_NET(rqstp), nfsd_net_id)); + if (status) + goto out; + + if (!fh_match(&cstate->current_fh.fh_handle, + &stid->sc_file->fi_fhandle)) { + status = nfserr_bad_stateid; + goto out_put_stid; + } + + if (stid->sc_type != NFS4_LAYOUT_STID) { + ls = nfsd4_alloc_layout_stateid(cstate, stid, layout_type); + nfs4_put_stid(stid); + + status = nfserr_jukebox; + if (!ls) + goto out; + } else { + ls = container_of(stid, struct nfs4_layout_stateid, ls_stid); + + status = nfserr_bad_stateid; + if (stateid->si_generation > stid->sc_stateid.si_generation) + goto out_put_stid; + if (layout_type != ls->ls_layout_type) + goto out_put_stid; + } + + *lsp = ls; + return 0; + +out_put_stid: + nfs4_put_stid(stid); +out: + return status; +} + +static inline u64 +layout_end(struct nfsd4_layout_seg *seg) +{ + u64 end = seg->offset + seg->length; + return end >= seg->offset ? 
end : NFS4_MAX_UINT64; +} + +static void +layout_update_len(struct nfsd4_layout_seg *lo, u64 end) +{ + if (end == NFS4_MAX_UINT64) + lo->length = NFS4_MAX_UINT64; + else + lo->length = end - lo->offset; +} + +static bool +layouts_overlapping(struct nfs4_layout *lo, struct nfsd4_layout_seg *s) +{ + if (s->iomode != IOMODE_ANY && s->iomode != lo->lo_seg.iomode) + return false; + if (layout_end(&lo->lo_seg) <= s->offset) + return false; + if (layout_end(s) <= lo->lo_seg.offset) + return false; + return true; +} + +static bool +layouts_try_merge(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *new) +{ + if (lo->iomode != new->iomode) + return false; + if (layout_end(new) < lo->offset) + return false; + if (layout_end(lo) < new->offset) + return false; + + lo->offset = min(lo->offset, new->offset); + layout_update_len(lo, max(layout_end(lo), layout_end(new))); + return true; +} + +__be32 +nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls) +{ + struct nfsd4_layout_seg *seg = &lgp->lg_seg; + struct nfs4_layout *lp, *new = NULL; + + spin_lock(&ls->ls_lock); + list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) { + if (layouts_try_merge(&lp->lo_seg, seg)) + goto done; + } + spin_unlock(&ls->ls_lock); + + new = kmem_cache_alloc(nfs4_layout_cache, GFP_KERNEL); + if (!new) + return nfserr_jukebox; + memcpy(&new->lo_seg, seg, sizeof(lp->lo_seg)); + new->lo_state = ls; + + spin_lock(&ls->ls_lock); + list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) { + if (layouts_try_merge(&lp->lo_seg, seg)) + goto done; + } + + atomic_inc(&ls->ls_stid.sc_count); + list_add_tail(&new->lo_perstate, &ls->ls_layouts); + new = NULL; +done: + update_stateid(&ls->ls_stid.sc_stateid); + memcpy(&lgp->lg_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t)); + spin_unlock(&ls->ls_lock); + if (new) + kmem_cache_free(nfs4_layout_cache, new); + return nfs_ok; +} + +static void +nfsd4_free_layouts(struct list_head *reaplist) +{ + while (!list_empty(reaplist)) { + struct nfs4_layout *lp = list_first_entry(reaplist, + struct nfs4_layout, lo_perstate); + + list_del(&lp->lo_perstate); + nfs4_put_stid(&lp->lo_state->ls_stid); + kmem_cache_free(nfs4_layout_cache, lp); + } +} + +static void +nfsd4_return_file_layout(struct nfs4_layout *lp, struct nfsd4_layout_seg *seg, + struct list_head *reaplist) +{ + struct nfsd4_layout_seg *lo = &lp->lo_seg; + u64 end = layout_end(lo); + + if (seg->offset <= lo->offset) { + if (layout_end(seg) >= end) { + list_move_tail(&lp->lo_perstate, reaplist); + return; + } + end = seg->offset; + } else { + /* retain the whole layout segment on a split. 
*/ + if (layout_end(seg) < end) { + dprintk("%s: split not supported\n", __func__); + return; + } + + lo->offset = layout_end(seg); + } + + layout_update_len(lo, end); +} + +__be32 +nfsd4_return_file_layouts(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_layoutreturn *lrp) +{ + struct nfs4_layout_stateid *ls; + struct nfs4_layout *lp, *n; + LIST_HEAD(reaplist); + __be32 nfserr; + int found = 0; + + nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lrp->lr_sid, + false, lrp->lr_layout_type, + &ls); + if (nfserr) + return nfserr; + + spin_lock(&ls->ls_lock); + list_for_each_entry_safe(lp, n, &ls->ls_layouts, lo_perstate) { + if (layouts_overlapping(lp, &lrp->lr_seg)) { + nfsd4_return_file_layout(lp, &lrp->lr_seg, &reaplist); + found++; + } + } + if (!list_empty(&ls->ls_layouts)) { + if (found) { + update_stateid(&ls->ls_stid.sc_stateid); + memcpy(&lrp->lr_sid, &ls->ls_stid.sc_stateid, + sizeof(stateid_t)); + } + lrp->lrs_present = 1; + } else { + nfs4_unhash_stid(&ls->ls_stid); + lrp->lrs_present = 0; + } + spin_unlock(&ls->ls_lock); + + nfs4_put_stid(&ls->ls_stid); + nfsd4_free_layouts(&reaplist); + return nfs_ok; +} + +__be32 +nfsd4_return_client_layouts(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_layoutreturn *lrp) +{ + struct nfs4_layout_stateid *ls, *n; + struct nfs4_client *clp = cstate->clp; + struct nfs4_layout *lp, *t; + LIST_HEAD(reaplist); + + lrp->lrs_present = 0; + + spin_lock(&clp->cl_lock); + list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt) { + if (lrp->lr_return_type == RETURN_FSID && + !fh_fsid_match(&ls->ls_stid.sc_file->fi_fhandle, + &cstate->current_fh.fh_handle)) + continue; + + spin_lock(&ls->ls_lock); + list_for_each_entry_safe(lp, t, &ls->ls_layouts, lo_perstate) { + if (lrp->lr_seg.iomode == IOMODE_ANY || + lrp->lr_seg.iomode == lp->lo_seg.iomode) + list_move_tail(&lp->lo_perstate, &reaplist); + } + spin_unlock(&ls->ls_lock); + } + spin_unlock(&clp->cl_lock); + + nfsd4_free_layouts(&reaplist); + return 0; +} + +static void +nfsd4_return_all_layouts(struct nfs4_layout_stateid *ls, + struct list_head *reaplist) +{ + spin_lock(&ls->ls_lock); + list_splice_init(&ls->ls_layouts, reaplist); + spin_unlock(&ls->ls_lock); +} + +void +nfsd4_return_all_client_layouts(struct nfs4_client *clp) +{ + struct nfs4_layout_stateid *ls, *n; + LIST_HEAD(reaplist); + + spin_lock(&clp->cl_lock); + list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt) + nfsd4_return_all_layouts(ls, &reaplist); + spin_unlock(&clp->cl_lock); + + nfsd4_free_layouts(&reaplist); +} + +void +nfsd4_return_all_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp) +{ + struct nfs4_layout_stateid *ls, *n; + LIST_HEAD(reaplist); + + spin_lock(&fp->fi_lock); + list_for_each_entry_safe(ls, n, &fp->fi_lo_states, ls_perfile) { + if (ls->ls_stid.sc_client == clp) + nfsd4_return_all_layouts(ls, &reaplist); + } + spin_unlock(&fp->fi_lock); + + nfsd4_free_layouts(&reaplist); +} + +int +nfsd4_init_pnfs(void) +{ + int i; + + for (i = 0; i < DEVID_HASH_SIZE; i++) + INIT_LIST_HEAD(&nfsd_devid_hash[i]); + + nfs4_layout_cache = kmem_cache_create("nfs4_layout", + sizeof(struct nfs4_layout), 0, 0, NULL); + if (!nfs4_layout_cache) + return -ENOMEM; + + nfs4_layout_stateid_cache = kmem_cache_create("nfs4_layout_stateid", + sizeof(struct nfs4_layout_stateid), 0, 0, NULL); + if (!nfs4_layout_stateid_cache) { + kmem_cache_destroy(nfs4_layout_cache); + return -ENOMEM; + } + return 0; +} + +void +nfsd4_exit_pnfs(void) +{ + int i; + + 
kmem_cache_destroy(nfs4_layout_cache); + kmem_cache_destroy(nfs4_layout_stateid_cache); + + for (i = 0; i < DEVID_HASH_SIZE; i++) { + struct nfsd4_deviceid_map *map, *n; + + list_for_each_entry_safe(map, n, &nfsd_devid_hash[i], hash) + kfree(map); + } +} diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index ac71d13c69ef..2b91443497cc 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -43,6 +43,7 @@ #include "current_stateid.h" #include "netns.h" #include "acl.h" +#include "pnfs.h" #ifdef CONFIG_NFSD_V4_SECURITY_LABEL #include @@ -1178,6 +1179,252 @@ nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return status == nfserr_same ? nfs_ok : status; } +#ifdef CONFIG_NFSD_PNFS +static const struct nfsd4_layout_ops * +nfsd4_layout_verify(struct svc_export *exp, unsigned int layout_type) +{ + if (!exp->ex_layout_type) { + dprintk("%s: export does not support pNFS\n", __func__); + return NULL; + } + + if (exp->ex_layout_type != layout_type) { + dprintk("%s: layout type %d not supported\n", + __func__, layout_type); + return NULL; + } + + return nfsd4_layout_ops[layout_type]; +} + +static __be32 +nfsd4_getdeviceinfo(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_getdeviceinfo *gdp) +{ + const struct nfsd4_layout_ops *ops; + struct nfsd4_deviceid_map *map; + struct svc_export *exp; + __be32 nfserr; + + dprintk("%s: layout_type %u dev_id [0x%llx:0x%x] maxcnt %u\n", + __func__, + gdp->gd_layout_type, + gdp->gd_devid.fsid_idx, gdp->gd_devid.generation, + gdp->gd_maxcount); + + map = nfsd4_find_devid_map(gdp->gd_devid.fsid_idx); + if (!map) { + dprintk("%s: couldn't find device ID to export mapping!\n", + __func__); + return nfserr_noent; + } + + exp = rqst_exp_find(rqstp, map->fsid_type, map->fsid); + if (IS_ERR(exp)) { + dprintk("%s: could not find device id\n", __func__); + return nfserr_noent; + } + + nfserr = nfserr_layoutunavailable; + ops = nfsd4_layout_verify(exp, gdp->gd_layout_type); + if (!ops) + goto out; + + nfserr = nfs_ok; + if (gdp->gd_maxcount != 0) + nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb, gdp); + + gdp->gd_notify_types &= ops->notify_types; + exp_put(exp); +out: + return nfserr; +} + +static __be32 +nfsd4_layoutget(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_layoutget *lgp) +{ + struct svc_fh *current_fh = &cstate->current_fh; + const struct nfsd4_layout_ops *ops; + struct nfs4_layout_stateid *ls; + __be32 nfserr; + int accmode; + + switch (lgp->lg_seg.iomode) { + case IOMODE_READ: + accmode = NFSD_MAY_READ; + break; + case IOMODE_RW: + accmode = NFSD_MAY_READ | NFSD_MAY_WRITE; + break; + default: + dprintk("%s: invalid iomode %d\n", + __func__, lgp->lg_seg.iomode); + nfserr = nfserr_badiomode; + goto out; + } + + nfserr = fh_verify(rqstp, current_fh, 0, accmode); + if (nfserr) + goto out; + + nfserr = nfserr_layoutunavailable; + ops = nfsd4_layout_verify(current_fh->fh_export, lgp->lg_layout_type); + if (!ops) + goto out; + + /* + * Verify minlength and range as per RFC5661: + * o If loga_length is less than loga_minlength, + * the metadata server MUST return NFS4ERR_INVAL. + * o If the sum of loga_offset and loga_minlength exceeds + * NFS4_UINT64_MAX, and loga_minlength is not + * NFS4_UINT64_MAX, the error NFS4ERR_INVAL MUST result. + * o If the sum of loga_offset and loga_length exceeds + * NFS4_UINT64_MAX, and loga_length is not NFS4_UINT64_MAX, + * the error NFS4ERR_INVAL MUST result. 
+ */ + nfserr = nfserr_inval; + if (lgp->lg_seg.length < lgp->lg_minlength || + (lgp->lg_minlength != NFS4_MAX_UINT64 && + lgp->lg_minlength > NFS4_MAX_UINT64 - lgp->lg_seg.offset) || + (lgp->lg_seg.length != NFS4_MAX_UINT64 && + lgp->lg_seg.length > NFS4_MAX_UINT64 - lgp->lg_seg.offset)) + goto out; + if (lgp->lg_seg.length == 0) + goto out; + + nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lgp->lg_sid, + true, lgp->lg_layout_type, &ls); + if (nfserr) + goto out; + + nfserr = ops->proc_layoutget(current_fh->fh_dentry->d_inode, + current_fh, lgp); + if (nfserr) + goto out_put_stid; + + nfserr = nfsd4_insert_layout(lgp, ls); + +out_put_stid: + nfs4_put_stid(&ls->ls_stid); +out: + return nfserr; +} + +static __be32 +nfsd4_layoutcommit(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_layoutcommit *lcp) +{ + const struct nfsd4_layout_seg *seg = &lcp->lc_seg; + struct svc_fh *current_fh = &cstate->current_fh; + const struct nfsd4_layout_ops *ops; + loff_t new_size = lcp->lc_last_wr + 1; + struct inode *inode; + struct nfs4_layout_stateid *ls; + __be32 nfserr; + + nfserr = fh_verify(rqstp, current_fh, 0, NFSD_MAY_WRITE); + if (nfserr) + goto out; + + nfserr = nfserr_layoutunavailable; + ops = nfsd4_layout_verify(current_fh->fh_export, lcp->lc_layout_type); + if (!ops) + goto out; + inode = current_fh->fh_dentry->d_inode; + + nfserr = nfserr_inval; + if (new_size <= seg->offset) { + dprintk("pnfsd: last write before layout segment\n"); + goto out; + } + if (new_size > seg->offset + seg->length) { + dprintk("pnfsd: last write beyond layout segment\n"); + goto out; + } + if (!lcp->lc_newoffset && new_size > i_size_read(inode)) { + dprintk("pnfsd: layoutcommit beyond EOF\n"); + goto out; + } + + nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lcp->lc_sid, + false, lcp->lc_layout_type, + &ls); + if (nfserr) { + /* fixup error code as per RFC5661 */ + if (nfserr == nfserr_bad_stateid) + nfserr = nfserr_badlayout; + goto out; + } + + nfserr = ops->proc_layoutcommit(inode, lcp); + if (nfserr) + goto out_put_stid; + + if (new_size > i_size_read(inode)) { + lcp->lc_size_chg = 1; + lcp->lc_newsize = new_size; + } else { + lcp->lc_size_chg = 0; + } + +out_put_stid: + nfs4_put_stid(&ls->ls_stid); +out: + return nfserr; +} + +static __be32 +nfsd4_layoutreturn(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_layoutreturn *lrp) +{ + struct svc_fh *current_fh = &cstate->current_fh; + __be32 nfserr; + + nfserr = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); + if (nfserr) + goto out; + + nfserr = nfserr_layoutunavailable; + if (!nfsd4_layout_verify(current_fh->fh_export, lrp->lr_layout_type)) + goto out; + + switch (lrp->lr_seg.iomode) { + case IOMODE_READ: + case IOMODE_RW: + case IOMODE_ANY: + break; + default: + dprintk("%s: invalid iomode %d\n", __func__, + lrp->lr_seg.iomode); + nfserr = nfserr_inval; + goto out; + } + + switch (lrp->lr_return_type) { + case RETURN_FILE: + nfserr = nfsd4_return_file_layouts(rqstp, cstate, lrp); + break; + case RETURN_FSID: + case RETURN_ALL: + nfserr = nfsd4_return_client_layouts(rqstp, cstate, lrp); + break; + default: + dprintk("%s: invalid return_type %d\n", __func__, + lrp->lr_return_type); + nfserr = nfserr_inval; + break; + } +out: + return nfserr; +} +#endif /* CONFIG_NFSD_PNFS */ + /* * NULL call. 
*/ @@ -1679,6 +1926,36 @@ static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd op_encode_channel_attrs_maxsz) * sizeof(__be32); } +#ifdef CONFIG_NFSD_PNFS +/* + * At this stage we don't really know what layout driver will handle the request, + * so we need to define an arbitrary upper bound here. + */ +#define MAX_LAYOUT_SIZE 128 +static inline u32 nfsd4_layoutget_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + + 1 /* logr_return_on_close */ + + op_encode_stateid_maxsz + + 1 /* nr of layouts */ + + MAX_LAYOUT_SIZE) * sizeof(__be32); +} + +static inline u32 nfsd4_layoutcommit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + + 1 /* locr_newsize */ + + 2 /* ns_size */) * sizeof(__be32); +} + +static inline u32 nfsd4_layoutreturn_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + + 1 /* lrs_stateid */ + + op_encode_stateid_maxsz) * sizeof(__be32); +} +#endif /* CONFIG_NFSD_PNFS */ + static struct nfsd4_operation nfsd4_ops[] = { [OP_ACCESS] = { .op_func = (nfsd4op_func)nfsd4_access, @@ -1966,6 +2243,31 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid, .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, +#ifdef CONFIG_NFSD_PNFS + [OP_GETDEVICEINFO] = { + .op_func = (nfsd4op_func)nfsd4_getdeviceinfo, + .op_flags = ALLOWED_WITHOUT_FH, + .op_name = "OP_GETDEVICEINFO", + }, + [OP_LAYOUTGET] = { + .op_func = (nfsd4op_func)nfsd4_layoutget, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_LAYOUTGET", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutget_rsize, + }, + [OP_LAYOUTCOMMIT] = { + .op_func = (nfsd4op_func)nfsd4_layoutcommit, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_LAYOUTCOMMIT", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutcommit_rsize, + }, + [OP_LAYOUTRETURN] = { + .op_func = (nfsd4op_func)nfsd4_layoutreturn, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_LAYOUTRETURN", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutreturn_rsize, + }, +#endif /* CONFIG_NFSD_PNFS */ /* NFSv4.2 operations */ [OP_ALLOCATE] = { diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index eefd29ec43f2..c89f79dc69e2 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -48,6 +48,7 @@ #include "current_stateid.h" #include "netns.h" +#include "pnfs.h" #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -1539,6 +1540,9 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name) INIT_LIST_HEAD(&clp->cl_lru); INIT_LIST_HEAD(&clp->cl_callbacks); INIT_LIST_HEAD(&clp->cl_revoked); +#ifdef CONFIG_NFSD_PNFS + INIT_LIST_HEAD(&clp->cl_lo_states); +#endif spin_lock_init(&clp->cl_lock); rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); return clp; @@ -1643,6 +1647,7 @@ __destroy_client(struct nfs4_client *clp) nfs4_get_stateowner(&oo->oo_owner); release_openowner(oo); } + nfsd4_return_all_client_layouts(clp); nfsd4_shutdown_callback(clp); if (clp->cl_cb_conn.cb_xprt) svc_xprt_put(clp->cl_cb_conn.cb_xprt); @@ -2126,8 +2131,11 @@ nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, static void nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid) { - /* pNFS is not supported */ +#ifdef CONFIG_NFSD_PNFS + new->cl_exchange_flags |= EXCHGID4_FLAG_USE_PNFS_MDS; +#else new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS; +#endif /* Referrals are supported, Migration is not. 
*/ new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER; @@ -3055,6 +3063,9 @@ static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval, fp->fi_share_deny = 0; memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); memset(fp->fi_access, 0, sizeof(fp->fi_access)); +#ifdef CONFIG_NFSD_PNFS + INIT_LIST_HEAD(&fp->fi_lo_states); +#endif hlist_add_head_rcu(&fp->fi_hash, &file_hashtbl[hashval]); } @@ -4841,6 +4852,9 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, update_stateid(&stp->st_stid.sc_stateid); memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); + nfsd4_return_all_file_layouts(stp->st_stateowner->so_client, + stp->st_stid.sc_file); + nfsd4_close_open_stateid(stp); /* put reference from nfs4_preprocess_seqid_op */ diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 974533e5a427..df5e66caf100 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -47,6 +47,7 @@ #include "state.h" #include "cache.h" #include "netns.h" +#include "pnfs.h" #ifdef CONFIG_NFSD_V4_SECURITY_LABEL #include @@ -1522,6 +1523,127 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, str DECODE_TAIL; } +#ifdef CONFIG_NFSD_PNFS +static __be32 +nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp, + struct nfsd4_getdeviceinfo *gdev) +{ + DECODE_HEAD; + u32 num, i; + + READ_BUF(sizeof(struct nfsd4_deviceid) + 3 * 4); + COPYMEM(&gdev->gd_devid, sizeof(struct nfsd4_deviceid)); + gdev->gd_layout_type = be32_to_cpup(p++); + gdev->gd_maxcount = be32_to_cpup(p++); + num = be32_to_cpup(p++); + if (num) { + READ_BUF(4 * num); + gdev->gd_notify_types = be32_to_cpup(p++); + for (i = 1; i < num; i++) { + if (be32_to_cpup(p++)) { + status = nfserr_inval; + goto out; + } + } + } + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp, + struct nfsd4_layoutget *lgp) +{ + DECODE_HEAD; + + READ_BUF(36); + lgp->lg_signal = be32_to_cpup(p++); + lgp->lg_layout_type = be32_to_cpup(p++); + lgp->lg_seg.iomode = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &lgp->lg_seg.offset); + p = xdr_decode_hyper(p, &lgp->lg_seg.length); + p = xdr_decode_hyper(p, &lgp->lg_minlength); + nfsd4_decode_stateid(argp, &lgp->lg_sid); + READ_BUF(4); + lgp->lg_maxcount = be32_to_cpup(p++); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp, + struct nfsd4_layoutcommit *lcp) +{ + DECODE_HEAD; + u32 timechange; + + READ_BUF(20); + p = xdr_decode_hyper(p, &lcp->lc_seg.offset); + p = xdr_decode_hyper(p, &lcp->lc_seg.length); + lcp->lc_reclaim = be32_to_cpup(p++); + nfsd4_decode_stateid(argp, &lcp->lc_sid); + READ_BUF(4); + lcp->lc_newoffset = be32_to_cpup(p++); + if (lcp->lc_newoffset) { + READ_BUF(8); + p = xdr_decode_hyper(p, &lcp->lc_last_wr); + } else + lcp->lc_last_wr = 0; + READ_BUF(4); + timechange = be32_to_cpup(p++); + if (timechange) { + status = nfsd4_decode_time(argp, &lcp->lc_mtime); + if (status) + return status; + } else { + lcp->lc_mtime.tv_nsec = UTIME_NOW; + } + READ_BUF(8); + lcp->lc_layout_type = be32_to_cpup(p++); + + /* + * Save the layout update in XDR format and let the layout driver deal + * with it later. 
+ */ + lcp->lc_up_len = be32_to_cpup(p++); + if (lcp->lc_up_len > 0) { + READ_BUF(lcp->lc_up_len); + READMEM(lcp->lc_up_layout, lcp->lc_up_len); + } + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp, + struct nfsd4_layoutreturn *lrp) +{ + DECODE_HEAD; + + READ_BUF(16); + lrp->lr_reclaim = be32_to_cpup(p++); + lrp->lr_layout_type = be32_to_cpup(p++); + lrp->lr_seg.iomode = be32_to_cpup(p++); + lrp->lr_return_type = be32_to_cpup(p++); + if (lrp->lr_return_type == RETURN_FILE) { + READ_BUF(16); + p = xdr_decode_hyper(p, &lrp->lr_seg.offset); + p = xdr_decode_hyper(p, &lrp->lr_seg.length); + nfsd4_decode_stateid(argp, &lrp->lr_sid); + READ_BUF(4); + lrp->lrf_body_len = be32_to_cpup(p++); + if (lrp->lrf_body_len > 0) { + READ_BUF(lrp->lrf_body_len); + READMEM(lrp->lrf_body, lrp->lrf_body_len); + } + } else { + lrp->lr_seg.offset = 0; + lrp->lr_seg.length = NFS4_MAX_UINT64; + } + + DECODE_TAIL; +} +#endif /* CONFIG_NFSD_PNFS */ + static __be32 nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp, struct nfsd4_fallocate *fallocate) @@ -1616,11 +1738,19 @@ static nfsd4_dec nfsd4_dec_ops[] = { [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session, [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_free_stateid, [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, +#ifdef CONFIG_NFSD_PNFS + [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_getdeviceinfo, + [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_layoutcommit, + [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_layoutget, + [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_layoutreturn, +#else [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp, +#endif [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name, [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence, [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, @@ -2548,6 +2678,30 @@ out_acl: get_parent_attributes(exp, &stat); p = xdr_encode_hyper(p, stat.ino); } +#ifdef CONFIG_NFSD_PNFS + if ((bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) || + (bmval2 & FATTR4_WORD2_LAYOUT_TYPES)) { + if (exp->ex_layout_type) { + p = xdr_reserve_space(xdr, 8); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(1); + *p++ = cpu_to_be32(exp->ex_layout_type); + } else { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(0); + } + } + + if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(stat.blksize); + } +#endif /* CONFIG_NFSD_PNFS */ if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) { status = nfsd4_encode_security_label(xdr, rqstp, context, contextlen); @@ -3824,6 +3978,156 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr, return nfserr; } +#ifdef CONFIG_NFSD_PNFS +static __be32 +nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_getdeviceinfo *gdev) +{ + struct xdr_stream *xdr = &resp->xdr; + const struct nfsd4_layout_ops *ops = + nfsd4_layout_ops[gdev->gd_layout_type]; + u32 starting_len = xdr->buf->len, needed_len; + __be32 *p; + + dprintk("%s: err %d\n", __func__, nfserr); + if (nfserr) + goto out; + + nfserr = nfserr_resource; + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out; + + *p++ = cpu_to_be32(gdev->gd_layout_type); + + /* If maxcount is 0 
then just update notifications */ + if (gdev->gd_maxcount != 0) { + nfserr = ops->encode_getdeviceinfo(xdr, gdev); + if (nfserr) { + /* + * We don't bother to burden the layout drivers with + * enforcing gd_maxcount, just tell the client to + * come back with a bigger buffer if it's not enough. + */ + if (xdr->buf->len + 4 > gdev->gd_maxcount) + goto toosmall; + goto out; + } + } + + nfserr = nfserr_resource; + if (gdev->gd_notify_types) { + p = xdr_reserve_space(xdr, 4 + 4); + if (!p) + goto out; + *p++ = cpu_to_be32(1); /* bitmap length */ + *p++ = cpu_to_be32(gdev->gd_notify_types); + } else { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out; + *p++ = 0; + } + + nfserr = 0; +out: + kfree(gdev->gd_device); + dprintk("%s: done: %d\n", __func__, be32_to_cpu(nfserr)); + return nfserr; + +toosmall: + dprintk("%s: maxcount too small\n", __func__); + needed_len = xdr->buf->len + 4 /* notifications */; + xdr_truncate_encode(xdr, starting_len); + p = xdr_reserve_space(xdr, 4); + if (!p) { + nfserr = nfserr_resource; + } else { + *p++ = cpu_to_be32(needed_len); + nfserr = nfserr_toosmall; + } + goto out; +} + +static __be32 +nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_layoutget *lgp) +{ + struct xdr_stream *xdr = &resp->xdr; + const struct nfsd4_layout_ops *ops = + nfsd4_layout_ops[lgp->lg_layout_type]; + __be32 *p; + + dprintk("%s: err %d\n", __func__, nfserr); + if (nfserr) + goto out; + + nfserr = nfserr_resource; + p = xdr_reserve_space(xdr, 36 + sizeof(stateid_opaque_t)); + if (!p) + goto out; + + *p++ = cpu_to_be32(1); /* we always set return-on-close */ + *p++ = cpu_to_be32(lgp->lg_sid.si_generation); + p = xdr_encode_opaque_fixed(p, &lgp->lg_sid.si_opaque, + sizeof(stateid_opaque_t)); + + *p++ = cpu_to_be32(1); /* we always return a single layout */ + p = xdr_encode_hyper(p, lgp->lg_seg.offset); + p = xdr_encode_hyper(p, lgp->lg_seg.length); + *p++ = cpu_to_be32(lgp->lg_seg.iomode); + *p++ = cpu_to_be32(lgp->lg_layout_type); + + nfserr = ops->encode_layoutget(xdr, lgp); +out: + kfree(lgp->lg_content); + return nfserr; +} + +static __be32 +nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_layoutcommit *lcp) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + if (nfserr) + return nfserr; + + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(lcp->lc_size_chg); + if (lcp->lc_size_chg) { + p = xdr_reserve_space(xdr, 8); + if (!p) + return nfserr_resource; + p = xdr_encode_hyper(p, lcp->lc_newsize); + } + + return nfs_ok; +} + +static __be32 +nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_layoutreturn *lrp) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + if (nfserr) + return nfserr; + + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(lrp->lrs_present); + if (lrp->lrs_present) + nfsd4_encode_stateid(xdr, &lrp->lr_sid); + return nfs_ok; +} +#endif /* CONFIG_NFSD_PNFS */ + static __be32 nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_seek *seek) @@ -3900,11 +4204,19 @@ static nfsd4_enc nfsd4_enc_ops[] = { [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_noop, [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop, [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, +#ifdef CONFIG_NFSD_PNFS + [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_getdeviceinfo, + [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTCOMMIT] = 
(nfsd4_enc)nfsd4_encode_layoutcommit, + [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_layoutget, + [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_layoutreturn, +#else [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop, [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, +#endif [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name, [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 19ace74d35f6..aa47d75ddb26 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -21,6 +21,7 @@ #include "cache.h" #include "state.h" #include "netns.h" +#include "pnfs.h" /* * We have a single directory with several nodes in it. @@ -1258,9 +1259,12 @@ static int __init init_nfsd(void) retval = nfsd4_init_slabs(); if (retval) goto out_unregister_pernet; - retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */ + retval = nfsd4_init_pnfs(); if (retval) goto out_free_slabs; + retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */ + if (retval) + goto out_exit_pnfs; nfsd_stat_init(); /* Statistics */ retval = nfsd_reply_cache_init(); if (retval) @@ -1282,6 +1286,8 @@ out_free_lockd: out_free_stat: nfsd_stat_shutdown(); nfsd_fault_inject_cleanup(); +out_exit_pnfs: + nfsd4_exit_pnfs(); out_free_slabs: nfsd4_free_slabs(); out_unregister_pernet: @@ -1299,6 +1305,7 @@ static void __exit exit_nfsd(void) nfsd_stat_shutdown(); nfsd_lockd_shutdown(); nfsd4_free_slabs(); + nfsd4_exit_pnfs(); nfsd_fault_inject_cleanup(); unregister_filesystem(&nfsd_fs_type); unregister_pernet_subsys(&nfsd_net_ops); diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 33a46a8dfaf7..565c4da1a9eb 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -325,15 +325,27 @@ void nfsd_lockd_shutdown(void); #define NFSD4_SUPPORTED_ATTRS_WORD2 0 +/* 4.1 */ +#ifdef CONFIG_NFSD_PNFS +#define PNFSD_SUPPORTED_ATTRS_WORD1 FATTR4_WORD1_FS_LAYOUT_TYPES +#define PNFSD_SUPPORTED_ATTRS_WORD2 \ +(FATTR4_WORD2_LAYOUT_BLKSIZE | FATTR4_WORD2_LAYOUT_TYPES) +#else +#define PNFSD_SUPPORTED_ATTRS_WORD1 0 +#define PNFSD_SUPPORTED_ATTRS_WORD2 0 +#endif /* CONFIG_NFSD_PNFS */ + #define NFSD4_1_SUPPORTED_ATTRS_WORD0 \ NFSD4_SUPPORTED_ATTRS_WORD0 #define NFSD4_1_SUPPORTED_ATTRS_WORD1 \ - NFSD4_SUPPORTED_ATTRS_WORD1 + (NFSD4_SUPPORTED_ATTRS_WORD1 | PNFSD_SUPPORTED_ATTRS_WORD1) #define NFSD4_1_SUPPORTED_ATTRS_WORD2 \ - (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT) + (NFSD4_SUPPORTED_ATTRS_WORD2 | PNFSD_SUPPORTED_ATTRS_WORD2 | \ + FATTR4_WORD2_SUPPATTR_EXCLCREAT) +/* 4.2 */ #ifdef CONFIG_NFSD_V4_SECURITY_LABEL #define NFSD4_2_SECURITY_ATTRS FATTR4_WORD2_SECURITY_LABEL #else diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h new file mode 100644 index 000000000000..a9616a4e13cd --- /dev/null +++ b/fs/nfsd/pnfs.h @@ -0,0 +1,80 @@ +#ifndef _FS_NFSD_PNFS_H +#define _FS_NFSD_PNFS_H 1 + +#include +#include + +#include "state.h" +#include "xdr4.h" + +struct xdr_stream; + +struct nfsd4_deviceid_map { + struct list_head hash; + u64 idx; + int fsid_type; + u32 fsid[]; +}; + +struct nfsd4_layout_ops { + u32 notify_types; + + __be32 (*proc_getdeviceinfo)(struct super_block *sb, + struct nfsd4_getdeviceinfo *gdevp); + __be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr, + struct nfsd4_getdeviceinfo *gdevp); + + __be32 (*proc_layoutget)(struct inode *, const struct svc_fh *fhp, + struct nfsd4_layoutget *lgp); + 
__be32 (*encode_layoutget)(struct xdr_stream *, + struct nfsd4_layoutget *lgp); + + __be32 (*proc_layoutcommit)(struct inode *inode, + struct nfsd4_layoutcommit *lcp); +}; + +extern const struct nfsd4_layout_ops *nfsd4_layout_ops[]; + +__be32 nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, stateid_t *stateid, + bool create, u32 layout_type, struct nfs4_layout_stateid **lsp); +__be32 nfsd4_insert_layout(struct nfsd4_layoutget *lgp, + struct nfs4_layout_stateid *ls); +__be32 nfsd4_return_file_layouts(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_layoutreturn *lrp); +__be32 nfsd4_return_client_layouts(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_layoutreturn *lrp); +int nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp, + u32 device_generation); +struct nfsd4_deviceid_map *nfsd4_find_devid_map(int idx); + +#ifdef CONFIG_NFSD_PNFS +void nfsd4_setup_layout_type(struct svc_export *exp); +void nfsd4_return_all_client_layouts(struct nfs4_client *); +void nfsd4_return_all_file_layouts(struct nfs4_client *clp, + struct nfs4_file *fp); +int nfsd4_init_pnfs(void); +void nfsd4_exit_pnfs(void); +#else +static inline void nfsd4_setup_layout_type(struct svc_export *exp) +{ +} + +static inline void nfsd4_return_all_client_layouts(struct nfs4_client *clp) +{ +} +static inline void nfsd4_return_all_file_layouts(struct nfs4_client *clp, + struct nfs4_file *fp) +{ +} +static inline void nfsd4_exit_pnfs(void) +{ +} +static inline int nfsd4_init_pnfs(void) +{ + return 0; +} +#endif /* CONFIG_NFSD_PNFS */ +#endif /* _FS_NFSD_PNFS_H */ diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 38ebb1268b59..5f66b7fd0297 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -92,6 +92,7 @@ struct nfs4_stid { /* For a deleg stateid kept around only to process free_stateid's: */ #define NFS4_REVOKED_DELEG_STID 16 #define NFS4_CLOSED_DELEG_STID 32 +#define NFS4_LAYOUT_STID 64 unsigned char sc_type; stateid_t sc_stateid; struct nfs4_client *sc_client; @@ -297,6 +298,9 @@ struct nfs4_client { struct list_head cl_delegations; struct list_head cl_revoked; /* unacknowledged, revoked 4.1 state */ struct list_head cl_lru; /* tail queue */ +#ifdef CONFIG_NFSD_PNFS + struct list_head cl_lo_states; /* outstanding layout states */ +#endif struct xdr_netobj cl_name; /* id generated by client */ nfs4_verifier cl_verifier; /* generated by client */ time_t cl_time; /* time of last lease renewal */ @@ -496,6 +500,9 @@ struct nfs4_file { int fi_delegees; struct knfsd_fh fi_fhandle; bool fi_had_conflict; +#ifdef CONFIG_NFSD_PNFS + struct list_head fi_lo_states; +#endif }; /* @@ -528,6 +535,20 @@ static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s) return container_of(s, struct nfs4_ol_stateid, st_stid); } +struct nfs4_layout_stateid { + struct nfs4_stid ls_stid; + struct list_head ls_perclnt; + struct list_head ls_perfile; + spinlock_t ls_lock; + struct list_head ls_layouts; + u32 ls_layout_type; +}; + +static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s) +{ + return container_of(s, struct nfs4_layout_stateid, ls_stid); +} + /* flags for preprocess_seqid_op() */ #define RD_STATE 0x00000010 #define WR_STATE 0x00000020 diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 90a5925bd6ab..0bda93e58e1b 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -428,6 +428,61 @@ struct nfsd4_reclaim_complete { u32 rca_one_fs; }; +struct nfsd4_deviceid { + u64 fsid_idx; + u32 
generation; + u32 pad; +}; + +struct nfsd4_layout_seg { + u32 iomode; + u64 offset; + u64 length; +}; + +struct nfsd4_getdeviceinfo { + struct nfsd4_deviceid gd_devid; /* request */ + u32 gd_layout_type; /* request */ + u32 gd_maxcount; /* request */ + u32 gd_notify_types;/* request - response */ + void *gd_device; /* response */ +}; + +struct nfsd4_layoutget { + u64 lg_minlength; /* request */ + u32 lg_signal; /* request */ + u32 lg_layout_type; /* request */ + u32 lg_maxcount; /* request */ + stateid_t lg_sid; /* request/response */ + struct nfsd4_layout_seg lg_seg; /* request/response */ + void *lg_content; /* response */ +}; + +struct nfsd4_layoutcommit { + stateid_t lc_sid; /* request */ + struct nfsd4_layout_seg lc_seg; /* request */ + u32 lc_reclaim; /* request */ + u32 lc_newoffset; /* request */ + u64 lc_last_wr; /* request */ + struct timespec lc_mtime; /* request */ + u32 lc_layout_type; /* request */ + u32 lc_up_len; /* layout length */ + void *lc_up_layout; /* decoded by callback */ + u32 lc_size_chg; /* boolean for response */ + u64 lc_newsize; /* response */ +}; + +struct nfsd4_layoutreturn { + u32 lr_return_type; /* request */ + u32 lr_layout_type; /* request */ + struct nfsd4_layout_seg lr_seg; /* request */ + u32 lr_reclaim; /* request */ + u32 lrf_body_len; /* request */ + void *lrf_body; /* request */ + stateid_t lr_sid; /* request/response */ + u32 lrs_present; /* response */ +}; + struct nfsd4_fallocate { /* request */ stateid_t falloc_stateid; @@ -491,6 +546,10 @@ struct nfsd4_op { struct nfsd4_reclaim_complete reclaim_complete; struct nfsd4_test_stateid test_stateid; struct nfsd4_free_stateid free_stateid; + struct nfsd4_getdeviceinfo getdeviceinfo; + struct nfsd4_layoutget layoutget; + struct nfsd4_layoutcommit layoutcommit; + struct nfsd4_layoutreturn layoutreturn; /* NFSv4.2 */ struct nfsd4_fallocate allocate; diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 8a3589c2542c..bc10d687f2ce 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -411,6 +411,7 @@ enum lock_type4 { #define FATTR4_WORD1_TIME_MODIFY_SET (1UL << 22) #define FATTR4_WORD1_MOUNTED_ON_FILEID (1UL << 23) #define FATTR4_WORD1_FS_LAYOUT_TYPES (1UL << 30) +#define FATTR4_WORD2_LAYOUT_TYPES (1UL << 0) #define FATTR4_WORD2_LAYOUT_BLKSIZE (1UL << 1) #define FATTR4_WORD2_MDSTHRESHOLD (1UL << 4) #define FATTR4_WORD2_SECURITY_LABEL (1UL << 16) diff --git a/include/uapi/linux/nfsd/debug.h b/include/uapi/linux/nfsd/debug.h index 1fdc95bb2375..0bf130a1c58d 100644 --- a/include/uapi/linux/nfsd/debug.h +++ b/include/uapi/linux/nfsd/debug.h @@ -32,6 +32,7 @@ #define NFSDDBG_REPCACHE 0x0080 #define NFSDDBG_XDR 0x0100 #define NFSDDBG_LOCKD 0x0200 +#define NFSDDBG_PNFS 0x0400 #define NFSDDBG_ALL 0x7FFF #define NFSDDBG_NOCHANGE 0xFFFF diff --git a/include/uapi/linux/nfsd/export.h b/include/uapi/linux/nfsd/export.h index 584b6ef3a5e8..4742f2cb42f2 100644 --- a/include/uapi/linux/nfsd/export.h +++ b/include/uapi/linux/nfsd/export.h @@ -47,8 +47,10 @@ * exported filesystem. */ #define NFSEXP_V4ROOT 0x10000 +#define NFSEXP_NOPNFS 0x20000 + /* All flags that we claim to support. (Note we don't support NOACL.) 
*/ -#define NFSEXP_ALLFLAGS 0x1FE7F +#define NFSEXP_ALLFLAGS 0x3FE7F /* The flags that may vary depending on security flavor: */ #define NFSEXP_SECINFO_FLAGS (NFSEXP_READONLY | NFSEXP_ROOTSQUASH \ -- cgit v1.2.3 From 67dc0d4758e5ced978bb30e8af11bc42cabfa77c Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Thu, 29 Jan 2015 14:11:25 +1000 Subject: vt_buffer: drop console buffer copying optimisations These two macros copy to/from VGA memory; however, on the Silicon Motion SMI750 VGA card on a 64-bit system they cause console corruption. This is due to the hw being buggy and not handling a 64-bit transaction correctly. We could try to create a 32-bit version of these routines, but I'm not sure the optimisation is worth much today. Fixes https://bugzilla.redhat.com/show_bug.cgi?id=1132826 Tested-by: Huawei engineering. Signed-off-by: Dave Airlie Cc: Linus Torvalds Cc: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- include/linux/vt_buffer.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vt_buffer.h b/include/linux/vt_buffer.h index 057db7d2f448..f38c10ba3ff5 100644 --- a/include/linux/vt_buffer.h +++ b/include/linux/vt_buffer.h @@ -21,10 +21,6 @@ #ifndef VT_BUF_HAVE_RW #define scr_writew(val, addr) (*(addr) = (val)) #define scr_readw(addr) (*(addr)) -#define scr_memcpyw(d, s, c) memcpy(d, s, c) -#define scr_memmovew(d, s, c) memmove(d, s, c) -#define VT_BUF_HAVE_MEMCPYW -#define VT_BUF_HAVE_MEMMOVEW #endif #ifndef VT_BUF_HAVE_MEMSETW -- cgit v1.2.3 From 01395d798452435f19de3bfe5d04325db4e49677 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Thu, 22 Jan 2015 11:50:24 -0500 Subject: PNP: Allow console to override ACPI device sleep If the serial console is an ACPI PNP device, the PNP bus always powers down the device at system suspend, even though the no_console_suspend command line parameter is specified (e.g., when debugging suspend/resume). Add a PNP_CONSOLE capability which, when set, prevents calling both the ->disable() and ->suspend() PNP protocol methods if console suspend is disabled.
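As a rough sketch (simplified, with error handling and the bound driver's own suspend callback omitted), the suspend-path decision the new capability encodes reduces to the following; pnp_can_disable() and pnp_can_suspend() are the macros changed below, and both evaluate to false for a console device while console suspend is disabled:

static int example_bus_suspend(struct pnp_dev *pnp_dev, pm_message_t state)
{
	/* both checks fail for a console device under no_console_suspend,
	 * so the port is left powered for continued console output */
	if (pnp_can_disable(pnp_dev))
		pnp_stop_dev(pnp_dev);

	if (pnp_can_suspend(pnp_dev))
		pnp_dev->protocol->suspend(pnp_dev, state);

	return 0;
}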
Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- drivers/pnp/driver.c | 2 +- include/linux/pnp.h | 12 ++++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/pnp/driver.c b/drivers/pnp/driver.c index f748cc8cbb03..4e57d3370368 100644 --- a/drivers/pnp/driver.c +++ b/drivers/pnp/driver.c @@ -182,7 +182,7 @@ static int __pnp_bus_suspend(struct device *dev, pm_message_t state) return error; } - if (pnp_dev->protocol->suspend) + if (pnp_can_suspend(pnp_dev)) pnp_dev->protocol->suspend(pnp_dev, state); return 0; } diff --git a/include/linux/pnp.h b/include/linux/pnp.h index 195aafc6cd07..6512e9cbc6d5 100644 --- a/include/linux/pnp.h +++ b/include/linux/pnp.h @@ -12,6 +12,7 @@ #include #include #include +#include #define PNP_NAME_LEN 50 @@ -309,15 +310,22 @@ struct pnp_fixup { #define PNP_DISABLE 0x0004 #define PNP_CONFIGURABLE 0x0008 #define PNP_REMOVABLE 0x0010 +#define PNP_CONSOLE 0x0020 #define pnp_can_read(dev) (((dev)->protocol->get) && \ ((dev)->capabilities & PNP_READ)) #define pnp_can_write(dev) (((dev)->protocol->set) && \ ((dev)->capabilities & PNP_WRITE)) -#define pnp_can_disable(dev) (((dev)->protocol->disable) && \ - ((dev)->capabilities & PNP_DISABLE)) +#define pnp_can_disable(dev) (((dev)->protocol->disable) && \ + ((dev)->capabilities & PNP_DISABLE) && \ + (!((dev)->capabilities & PNP_CONSOLE) || \ + console_suspend_enabled)) #define pnp_can_configure(dev) ((!(dev)->active) && \ ((dev)->capabilities & PNP_CONFIGURABLE)) +#define pnp_can_suspend(dev) (((dev)->protocol->suspend) && \ + (!((dev)->capabilities & PNP_CONSOLE) || \ + console_suspend_enabled)) + #ifdef CONFIG_ISAPNP extern struct pnp_protocol isapnp_protocol; -- cgit v1.2.3 From 3abf87cd3e70009ed70a7f28c2914888a7e27332 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Sat, 17 Jan 2015 15:42:04 -0500 Subject: tty: Make lock subclasses available for other tty locks Besides nested legacy_mutex locking which is required on pty pair teardown, other nested pty operations require lock subclassing. Move lock subclass definition to tty interface header, include/linux/tty.h, and document its use. Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- drivers/tty/tty_mutex.c | 12 +----------- include/linux/tty.h | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/tty_mutex.c b/drivers/tty/tty_mutex.c index a872389dc0bc..0efcf713b756 100644 --- a/drivers/tty/tty_mutex.c +++ b/drivers/tty/tty_mutex.c @@ -4,18 +4,8 @@ #include #include -/* - * Nested tty locks are necessary for releasing pty pairs. - * The stable lock order is master pty first, then slave pty. - */ - /* Legacy tty mutex glue */ -enum { - TTY_MUTEX_NORMAL, - TTY_MUTEX_SLAVE, -}; - /* * Getting the big tty mutex. */ @@ -58,5 +48,5 @@ void __lockfunc tty_unlock_slave(struct tty_struct *tty) void tty_set_lock_subclass(struct tty_struct *tty) { - lockdep_set_subclass(&tty->legacy_mutex, TTY_MUTEX_SLAVE); + lockdep_set_subclass(&tty->legacy_mutex, TTY_LOCK_SLAVE); } diff --git a/include/linux/tty.h b/include/linux/tty.h index 7d66ae508e5c..849659908f23 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -14,6 +14,23 @@ #include +/* + * Lock subclasses for tty locks + * + * TTY_LOCK_NORMAL is for normal ttys and master ptys. + * TTY_LOCK_SLAVE is for slave ptys only. + * + * Lock subclasses are necessary for handling nested locking with pty pairs. 
+ * tty locks which use nested locking: + * + * legacy_mutex - Nested tty locks are necessary for releasing pty pairs. + * The stable lock order is master pty first, then slave pty. + */ + +enum { + TTY_LOCK_NORMAL = 0, + TTY_LOCK_SLAVE, +}; /* * (Note: the *_driver.minor_start values 1, 64, 128, 192 are -- cgit v1.2.3 From 1d1d14da12e79a6c05fbe1a975401f0f56c93316 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Sat, 17 Jan 2015 15:42:05 -0500 Subject: pty: Fix buffer flush deadlock The pty driver does not clear its write buffer when commanded. This is to avoid an apparent deadlock between parallel flushes from both pty ends, specifically when handling either BRK or INTR input. However, parallel flushes from this source are not possible since the pty master can never be set to BRKINT or ISIG. Parallel flushes from other sources are possible, but these do not threaten deadlocks. Annotate the tty buffer mutex for lockdep to represent the nested tty_buffer locking which occurs when the pty slave is processing input (its buffer mutex held) and receives INTR or BRK and acquires the linked tty buffer mutex via tty_buffer_flush(). Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- drivers/tty/pty.c | 10 +++++++++- drivers/tty/tty_buffer.c | 6 ++++++ include/linux/tty.h | 4 ++++ 3 files changed, 19 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c index f882ac81b93a..0e273158edac 100644 --- a/drivers/tty/pty.c +++ b/drivers/tty/pty.c @@ -212,10 +212,16 @@ static int pty_signal(struct tty_struct *tty, int sig) static void pty_flush_buffer(struct tty_struct *tty) { struct tty_struct *to = tty->link; + struct tty_ldisc *ld; if (!to) return; - /* tty_buffer_flush(to); FIXME */ + + ld = tty_ldisc_ref(to); + tty_buffer_flush(to, ld); + if (ld) + tty_ldisc_deref(ld); + if (to->packet) { spin_lock_irq(&tty->ctrl_lock); tty->ctrl_status |= TIOCPKT_FLUSHWRITE; @@ -425,6 +431,8 @@ static int pty_common_install(struct tty_driver *driver, struct tty_struct *tty, tty->port = ports[1]; o_tty->port->itty = o_tty; + tty_buffer_set_lock_subclass(o_tty->port); + tty_driver_kref_get(driver); tty->count++; o_tty->count++; diff --git a/drivers/tty/tty_buffer.c b/drivers/tty/tty_buffer.c index 3605103fc1ac..75661641f5fe 100644 --- a/drivers/tty/tty_buffer.c +++ b/drivers/tty/tty_buffer.c @@ -557,3 +557,9 @@ int tty_buffer_set_limit(struct tty_port *port, int limit) return 0; } EXPORT_SYMBOL_GPL(tty_buffer_set_limit); + +/* slave ptys can claim nested buffer lock when handling BRK and INTR */ +void tty_buffer_set_lock_subclass(struct tty_port *port) +{ + lockdep_set_subclass(&port->buf.lock, TTY_LOCK_SLAVE); +} diff --git a/include/linux/tty.h b/include/linux/tty.h index 849659908f23..55964b9490dd 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -25,6 +25,9 @@ * * legacy_mutex - Nested tty locks are necessary for releasing pty pairs. * The stable lock order is master pty first, then slave pty. + * tty_buffer lock - slave ptys can claim nested buffer lock when handling + * signal chars. The stable lock order is slave pty, then + * master.
*/ enum { @@ -460,6 +463,7 @@ extern void tty_flush_to_ldisc(struct tty_struct *tty); extern void tty_buffer_free_all(struct tty_port *port); extern void tty_buffer_flush(struct tty_struct *tty, struct tty_ldisc *ld); extern void tty_buffer_init(struct tty_port *port); +extern void tty_buffer_set_lock_subclass(struct tty_port *port); extern speed_t tty_termios_baud_rate(struct ktermios *termios); extern speed_t tty_termios_input_baud_rate(struct ktermios *termios); extern void tty_termios_encode_baud_rate(struct ktermios *termios, -- cgit v1.2.3 From d2b6f44779d3be22d32a5697bd30b59367fd2b33 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Sat, 17 Jan 2015 15:42:06 -0500 Subject: n_tty: Fix signal handling flushes BRKINT and ISIG require input and output flush when a signal char is received. However, the order of operations is significant since parallel i/o may be ongoing. Merge the signal handling for BRKINT with ISIG handling. Process the signal first. This ensures any ongoing i/o is aborted; without this, a waiting writer may continue writing after the flush occurs and after the signal char has been echoed. Write lock the termios_rwsem, which excludes parallel writers from pushing new i/o until after the output buffers are flushed; claiming the write lock is necessary anyway to exclude parallel readers while the read buffer is flushed. Subclass the termios_rwsem for ptys since the slave pty performing the flush may appear to reorder the termios_rwsem -> tty buffer lock lock order; adding annotation clarifies that slave tty_buffer lock -> slave termios_rwsem -> master tty_buffer lock is a valid lock order. Flush the echo buffer. In this context, the echo buffer is 'output'. Otherwise, the output will appear discontinuous because the output buffer, which contains older output than the echo buffer, was cleared. Open-code the read buffer flush since the input worker does not need kicking (this is the input worker). Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- drivers/tty/n_tty.c | 45 ++++++++++++++++++++++++++++--------------- drivers/tty/pty.c | 1 + include/linux/tty.h | 3 +++ 3 files changed, 34 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c index 5793aa0d3bfb..cf6e0f2e1331 100644 --- a/drivers/tty/n_tty.c +++ b/drivers/tty/n_tty.c @@ -1090,16 +1090,45 @@ static void eraser(unsigned char c, struct tty_struct *tty) * Called when a signal is being sent due to terminal input. * Called from the driver receive_buf path so serialized. * + * Performs input and output flush if !NOFLSH. In this context, the echo + * buffer is 'output'. The signal is processed first to alert any current + * readers or writers to discontinue and exit their i/o loops.
+ * * Locking: ctrl_lock */ static void isig(int sig, struct tty_struct *tty) { + struct n_tty_data *ldata = tty->disc_data; struct pid *tty_pgrp = tty_get_pgrp(tty); if (tty_pgrp) { kill_pgrp(tty_pgrp, sig, 1); put_pid(tty_pgrp); } + + if (!L_NOFLSH(tty)) { + up_read(&tty->termios_rwsem); + down_write(&tty->termios_rwsem); + + /* clear echo buffer */ + mutex_lock(&ldata->output_lock); + ldata->echo_head = ldata->echo_tail = 0; + ldata->echo_mark = ldata->echo_commit = 0; + mutex_unlock(&ldata->output_lock); + + /* clear output buffer */ + tty_driver_flush_buffer(tty); + + /* clear input buffer */ + reset_buffer_flags(tty->disc_data); + + /* notify pty master of flush */ + if (tty->link) + n_tty_packet_mode_flush(tty); + + up_write(&tty->termios_rwsem); + down_read(&tty->termios_rwsem); + } } /** @@ -1123,13 +1152,6 @@ static void n_tty_receive_break(struct tty_struct *tty) return; if (I_BRKINT(tty)) { isig(SIGINT, tty); - if (!L_NOFLSH(tty)) { - /* flushing needs exclusive termios_rwsem */ - up_read(&tty->termios_rwsem); - n_tty_flush_buffer(tty); - tty_driver_flush_buffer(tty); - down_read(&tty->termios_rwsem); - } return; } if (I_PARMRK(tty)) { @@ -1203,13 +1225,7 @@ static void n_tty_receive_parity_error(struct tty_struct *tty, unsigned char c) static void n_tty_receive_signal_char(struct tty_struct *tty, int signal, unsigned char c) { - if (!L_NOFLSH(tty)) { - /* flushing needs exclusive termios_rwsem */ - up_read(&tty->termios_rwsem); - n_tty_flush_buffer(tty); - tty_driver_flush_buffer(tty); - down_read(&tty->termios_rwsem); - } + isig(signal, tty); if (I_IXON(tty)) start_tty(tty); if (L_ECHO(tty)) { @@ -1217,7 +1233,6 @@ n_tty_receive_signal_char(struct tty_struct *tty, int signal, unsigned char c) commit_echoes(tty); } else process_echoes(tty); - isig(signal, tty); return; } diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c index 0e273158edac..e72ee629cead 100644 --- a/drivers/tty/pty.c +++ b/drivers/tty/pty.c @@ -395,6 +395,7 @@ static int pty_common_install(struct tty_driver *driver, struct tty_struct *tty, goto err_put_module; tty_set_lock_subclass(o_tty); + lockdep_set_subclass(&o_tty->termios_rwsem, TTY_LOCK_SLAVE); if (legacy) { /* We always use new tty termios data so we can do this diff --git a/include/linux/tty.h b/include/linux/tty.h index 55964b9490dd..fe5623c9af71 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -25,6 +25,9 @@ * * legacy_mutex - Nested tty locks are necessary for releasing pty pairs. * The stable lock order is master pty first, then slave pty. + * termios_rwsem - The stable lock order is tty_buffer lock->termios_rwsem. + * Subclassing this lock enables the slave pty to hold its + * termios_rwsem when claiming the master tty_buffer lock. * tty_buffer lock - slave ptys can claim nested buffer lock when handling * signal chars. The stable lock order is slave pty, then * master. -- cgit v1.2.3 From 4516d50aabedbe5ae334155193e4d35c02390d9a Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Thu, 22 Jan 2015 12:24:30 -0500 Subject: serial: 8250: Use canary to restart console after suspend When using no_console_suspend, the serial console may be powered off anyway during system sleep. Upon resume, the port may be in its default power-on state, but is expected to continue console i/o before the device has received its pm callback. The resultant garbage i/o can cause all kinds of havoc on the remote end. Use the scratch register as a canary to discover if the console has been powered-off. 
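The check reduces to roughly the following sketch (the example_* names are invented for illustration; serial_port_in()/serial_port_out() and UART_SCR are the existing 8250 port accessors and scratch-register offset):

#define EXAMPLE_CANARY 0xa5	/* any non-zero value works */

/* at suspend: park a known value in the scratch register */
static void example_arm_canary(struct uart_8250_port *up)
{
	serial_port_out(&up->port, UART_SCR, EXAMPLE_CANARY);
	up->canary = EXAMPLE_CANARY;
}

/* before console i/o: the scratch register reads back 0 (not the
 * canary) if the UART lost power and reset during system sleep */
static bool example_port_lost_power(struct uart_8250_port *up)
{
	return up->canary &&
	       serial_port_in(&up->port, UART_SCR) != up->canary;
}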
Write a non-zero value to the scratch register at port suspend and reprogram the port before any console i/o if the scratch register != canary before port resume. This workaround is disabled for omap_8250 (which uses different divisor programming). Credit to Doug Anderson for the idea of using the scratch register canary to discover port power-down. Cc: Doug Anderson Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_core.c | 35 ++++++++++++++++++++++++++++++++++- include/linux/serial_8250.h | 3 +++ 2 files changed, 37 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c index c8ecfaf49eb4..1449c56506b7 100644 --- a/drivers/tty/serial/8250/8250_core.c +++ b/drivers/tty/serial/8250/8250_core.c @@ -3267,6 +3267,27 @@ serial8250_console_write(struct console *co, const char *s, unsigned int count) else serial_port_out(port, UART_IER, 0); + /* check scratch reg to see if port powered off during system sleep */ + if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) { + struct ktermios termios; + unsigned int baud, quot, frac = 0; + + termios.c_cflag = port->cons->cflag; + if (port->state->port.tty && termios.c_cflag == 0) + termios.c_cflag = port->state->port.tty->termios.c_cflag; + + baud = uart_get_baud_rate(port, &termios, NULL, + port->uartclk / 16 / 0xffff, + port->uartclk / 16); + quot = serial8250_get_divisor(up, baud, &frac); + + serial8250_set_divisor(port, baud, quot, frac); + serial_port_out(port, UART_LCR, up->lcr); + serial_port_out(port, UART_MCR, UART_MCR_DTR | UART_MCR_RTS); + + up->canary = 0; + } + uart_console_write(port, s, count, serial8250_console_putchar); /* @@ -3417,7 +3438,17 @@ int __init early_serial_setup(struct uart_port *port) */ void serial8250_suspend_port(int line) { - uart_suspend_port(&serial8250_reg, &serial8250_ports[line].port); + struct uart_8250_port *up = &serial8250_ports[line]; + struct uart_port *port = &up->port; + + if (!console_suspend_enabled && uart_console(port) && + port->type != PORT_8250) { + unsigned char canary = 0xa5; + serial_out(up, UART_SCR, canary); + up->canary = canary; + } + + uart_suspend_port(&serial8250_reg, port); } /** @@ -3431,6 +3462,8 @@ void serial8250_resume_port(int line) struct uart_8250_port *up = &serial8250_ports[line]; struct uart_port *port = &up->port; + up->canary = 0; + if (up->capabilities & UART_NATSEMI) { /* Ensure it's still in high speed mode */ serial_port_out(port, UART_LCR, 0xE0); diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h index 245b959f1ff6..a8efa235b7c1 100644 --- a/include/linux/serial_8250.h +++ b/include/linux/serial_8250.h @@ -85,6 +85,9 @@ struct uart_8250_port { unsigned char mcr_force; /* mask of forced bits */ unsigned char cur_iotype; /* Running I/O type */ unsigned int rpm_tx_active; + unsigned char canary; /* non-zero during system sleep + * if no_console_suspend + */ /* * Some bits in registers are cleared on a read, so they must -- cgit v1.2.3 From 391f93f2ec9f857c83bdd21a14dcf7e699f38579 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Sun, 25 Jan 2015 14:44:51 -0500 Subject: serial: core: Rework hw-assisted flow control support hw-assisted flow control support was added to the serial core in v3.8 with commits, dba05832cbe4f ("SERIAL: core: add hardware assisted h/w flow control support") 2cbacafd7af0f ("SERIAL: core: add hardware assisted s/w flow control support") 9aba8d5b01119 ("SERIAL: core: add 
throttle/unthrottle callbacks for hardware assisted flow control") Since then, additional requirements for serial core support have arisen. Specifically:
1. Separate tx and rx flow control settings for UARTs which only support tx flow control (i.e., autoCTS).
2. Disable sw-assisted CTS flow control in autoCTS mode.
3. Support for RTS flow control by serial core and userspace in autoRTS mode.
Distinguish mode from capability; introduce UPSTAT_AUTORTS, UPSTAT_AUTOCTS and UPSTAT_AUTOXOFF which, when set by the uart driver, enable serial core support for hw-assisted rx, hw-assisted tx and hw-assisted in-band/IXOFF rx flow control, respectively. [Note: hw-assisted in-band/IXON tx flow control does not require serial core support/intervention and can be enabled by the uart driver when required.] These modes must be set/reset in the driver's set_termios() method, based on termios settings, and thus can be safely queried in any context in which one of the port lock, port mutex or termios rwsem is held. Set these modes in the 2 in-tree drivers, omap-serial and 8250_omap, which currently use UPF_HARD_FLOW/UPF_SOFT_FLOW support. Retain UPF_HARD_FLOW and UPF_SOFT_FLOW as capabilities; re-define UPF_HARD_FLOW as both UPF_AUTO_RTS and UPF_AUTO_CTS to allow for distinct and separate rx and tx flow control capabilities. Disable sw-assisted CTS flow control when UPSTAT_AUTOCTS is enabled. Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_omap.c | 7 +++- drivers/tty/serial/omap-serial.c | 7 +++- drivers/tty/serial/serial_core.c | 76 ++++++++++++++----------------- include/linux/serial_core.h | 21 ++++++++-- 4 files changed, 59 insertions(+), 52 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/serial/8250/8250_omap.c b/drivers/tty/serial/8250/8250_omap.c index 9191d05cfaf0..6e4ff7148ead 100644 --- a/drivers/tty/serial/8250/8250_omap.c +++ b/drivers/tty/serial/8250/8250_omap.c @@ -421,8 +421,11 @@ static void omap_8250_set_termios(struct uart_port *port, priv->efr = 0; up->mcr &= ~(UART_MCR_RTS | UART_MCR_XONANY); + up->port.status &= ~(UPSTAT_AUTOCTS | UPSTAT_AUTORTS | UPSTAT_AUTOXOFF); + if (termios->c_cflag & CRTSCTS && up->port.flags & UPF_HARD_FLOW) { /* Enable AUTORTS and AUTOCTS */ + up->port.status |= UPSTAT_AUTOCTS | UPSTAT_AUTORTS; priv->efr |= UART_EFR_CTS | UART_EFR_RTS; } else if (up->port.flags & UPF_SOFT_FLOW) { /* @@ -438,8 +441,10 @@ static void omap_8250_set_termios(struct uart_port *port, * Enable XON/XOFF flow control on output. * Transmit XON1, XOFF1 */ - if (termios->c_iflag & IXOFF) + if (termios->c_iflag & IXOFF) { + up->port.status |= UPSTAT_AUTOXOFF; priv->efr |= OMAP_UART_SW_TX; + } /* * IXANY Flag: diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c index b1cf9a3b673a..6129fe515932 100644 --- a/drivers/tty/serial/omap-serial.c +++ b/drivers/tty/serial/omap-serial.c @@ -1053,8 +1053,11 @@ serial_omap_set_termios(struct uart_port *port, struct ktermios *termios, serial_out(up, UART_TI752_TCR, OMAP_UART_TCR_TRIG); + up->port.status &= ~(UPSTAT_AUTOCTS | UPSTAT_AUTORTS | UPSTAT_AUTOXOFF); + if (termios->c_cflag & CRTSCTS && up->port.flags & UPF_HARD_FLOW) { /* Enable AUTORTS and AUTOCTS */ + up->port.status |= UPSTAT_AUTOCTS | UPSTAT_AUTORTS; up->efr |= UART_EFR_CTS | UART_EFR_RTS; /* Ensure MCR RTS is asserted */ @@ -1081,8 +1084,10 @@ serial_omap_set_termios(struct uart_port *port, struct ktermios *termios, * Enable XON/XOFF flow control on output.
* Transmit XON1, XOFF1 */ - if (termios->c_iflag & IXOFF) + if (termios->c_iflag & IXOFF) { + up->port.status |= UPSTAT_AUTOXOFF; up->efr |= OMAP_UART_SW_TX; + } /* * IXANY Flag: diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index 2c67a077042a..6a1055ae3437 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -179,14 +179,6 @@ static int uart_port_startup(struct tty_struct *tty, struct uart_state *state, if (tty->termios.c_cflag & CBAUD) uart_set_mctrl(uport, TIOCM_RTS | TIOCM_DTR); } - - spin_lock_irq(&uport->lock); - if (uart_cts_enabled(uport) && - !(uport->ops->get_mctrl(uport) & TIOCM_CTS)) - uport->hw_stopped = 1; - else - uport->hw_stopped = 0; - spin_unlock_irq(&uport->lock); } /* @@ -442,6 +434,7 @@ static void uart_change_speed(struct tty_struct *tty, struct uart_state *state, { struct uart_port *uport = state->uart_port; struct ktermios *termios; + int hw_stopped; /* * If we have no tty, termios, or the port does not exist, @@ -466,6 +459,18 @@ static void uart_change_speed(struct tty_struct *tty, struct uart_state *state, uport->status &= ~UPSTAT_DCD_ENABLE; else uport->status |= UPSTAT_DCD_ENABLE; + + /* reset sw-assisted CTS flow control based on (possibly) new mode */ + hw_stopped = uport->hw_stopped; + uport->hw_stopped = uart_softcts_mode(uport) && + !(uport->ops->get_mctrl(uport) & TIOCM_CTS); + if (uport->hw_stopped) { + if (!hw_stopped) + uport->ops->stop_tx(uport); + } else { + if (hw_stopped) + __uart_start(tty); + } spin_unlock_irq(&uport->lock); } @@ -619,22 +624,22 @@ static void uart_throttle(struct tty_struct *tty) { struct uart_state *state = tty->driver_data; struct uart_port *port = state->uart_port; - upf_t mask = 0; + upstat_t mask = 0; if (I_IXOFF(tty)) - mask |= UPF_SOFT_FLOW; + mask |= UPSTAT_AUTOXOFF; if (tty->termios.c_cflag & CRTSCTS) - mask |= UPF_HARD_FLOW; + mask |= UPSTAT_AUTORTS; - if (port->flags & mask) { + if (port->status & mask) { port->ops->throttle(port); - mask &= ~port->flags; + mask &= ~port->status; } - if (mask & UPF_SOFT_FLOW) + if (mask & UPSTAT_AUTOXOFF) uart_send_xchar(tty, STOP_CHAR(tty)); - if (mask & UPF_HARD_FLOW) + if (mask & UPSTAT_AUTORTS) uart_clear_mctrl(port, TIOCM_RTS); } @@ -642,22 +647,22 @@ static void uart_unthrottle(struct tty_struct *tty) { struct uart_state *state = tty->driver_data; struct uart_port *port = state->uart_port; - upf_t mask = 0; + upstat_t mask = 0; if (I_IXOFF(tty)) - mask |= UPF_SOFT_FLOW; + mask |= UPSTAT_AUTOXOFF; if (tty->termios.c_cflag & CRTSCTS) - mask |= UPF_HARD_FLOW; + mask |= UPSTAT_AUTORTS; - if (port->flags & mask) { + if (port->status & mask) { port->ops->unthrottle(port); - mask &= ~port->flags; + mask &= ~port->status; } - if (mask & UPF_SOFT_FLOW) + if (mask & UPSTAT_AUTOXOFF) uart_send_xchar(tty, START_CHAR(tty)); - if (mask & UPF_HARD_FLOW) + if (mask & UPSTAT_AUTORTS) uart_set_mctrl(port, TIOCM_RTS); } @@ -1351,30 +1356,6 @@ static void uart_set_termios(struct tty_struct *tty, mask |= TIOCM_RTS; uart_set_mctrl(uport, mask); } - - /* - * If the port is doing h/w assisted flow control, do nothing. - * We assume that port->hw_stopped has never been set. 
- */ - if (uport->flags & UPF_HARD_FLOW) - return; - - /* Handle turning off CRTSCTS */ - if ((old_termios->c_cflag & CRTSCTS) && !(cflag & CRTSCTS)) { - spin_lock_irq(&uport->lock); - uport->hw_stopped = 0; - __uart_start(tty); - spin_unlock_irq(&uport->lock); - } - /* Handle turning on CRTSCTS */ - else if (!(old_termios->c_cflag & CRTSCTS) && (cflag & CRTSCTS)) { - spin_lock_irq(&uport->lock); - if (!(uport->ops->get_mctrl(uport) & TIOCM_CTS)) { - uport->hw_stopped = 1; - uport->ops->stop_tx(uport); - } - spin_unlock_irq(&uport->lock); - } } /* @@ -2855,7 +2836,7 @@ void uart_handle_cts_change(struct uart_port *uport, unsigned int status) uport->icount.cts++; - if (uart_cts_enabled(uport)) { + if (uart_softcts_mode(uport)) { if (uport->hw_stopped) { if (status) { uport->hw_stopped = 0; @@ -2868,6 +2849,7 @@ void uart_handle_cts_change(struct uart_port *uport, unsigned int status) uport->ops->stop_tx(uport); } } + } } EXPORT_SYMBOL_GPL(uart_handle_cts_change); diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index a0c7033d5f91..baf3e1d08416 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -191,8 +191,10 @@ struct uart_port { #define UPF_NO_TXEN_TEST ((__force upf_t) (1 << 15)) #define UPF_MAGIC_MULTIPLIER ((__force upf_t) ASYNC_MAGIC_MULTIPLIER /* 16 */ ) -/* Port has hardware-assisted h/w flow control (iow, auto-RTS *not* auto-CTS) */ -#define UPF_HARD_FLOW ((__force upf_t) (1 << 21)) +/* Port has hardware-assisted h/w flow control */ +#define UPF_AUTO_CTS ((__force upf_t) (1 << 20)) +#define UPF_AUTO_RTS ((__force upf_t) (1 << 21)) +#define UPF_HARD_FLOW ((__force upf_t) (UPF_AUTO_CTS | UPF_AUTO_RTS)) /* Port has hardware-assisted s/w flow control */ #define UPF_SOFT_FLOW ((__force upf_t) (1 << 22)) #define UPF_CONS_FLOW ((__force upf_t) (1 << 23)) @@ -214,11 +216,17 @@ struct uart_port { #error Change mask not equivalent to userspace-visible bit defines #endif - /* status must be updated while holding port lock */ + /* + * Must hold termios_rwsem, port mutex and port lock to change; + * can hold any one lock to read. + */ upstat_t status; #define UPSTAT_CTS_ENABLE ((__force upstat_t) (1 << 0)) #define UPSTAT_DCD_ENABLE ((__force upstat_t) (1 << 1)) +#define UPSTAT_AUTORTS ((__force upstat_t) (1 << 2)) +#define UPSTAT_AUTOCTS ((__force upstat_t) (1 << 3)) +#define UPSTAT_AUTOXOFF ((__force upstat_t) (1 << 4)) int hw_stopped; /* sw-assisted CTS flow state */ unsigned int mctrl; /* current modem ctrl settings */ @@ -392,6 +400,13 @@ static inline bool uart_cts_enabled(struct uart_port *uport) return !!(uport->status & UPSTAT_CTS_ENABLE); } +static inline bool uart_softcts_mode(struct uart_port *uport) +{ + upstat_t mask = UPSTAT_CTS_ENABLE | UPSTAT_AUTOCTS; + + return ((uport->status & mask) == UPSTAT_CTS_ENABLE); +} + /* * The following are helper functions for the low level drivers. */ -- cgit v1.2.3 From 632f32e2107d37598e3f6816dcf00c7cab4081ca Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Sun, 25 Jan 2015 14:44:54 -0500 Subject: tty: Remove external interface for tty_set_termios() tty_set_termios() is an internal helper intended for file scope use. UART drivers which are capable of driving the RTS pin must properly handle the tiocmset() method, regardless of termios settings. A failure to do so is a UART driver bug and should be fixed there. Do not use this interface to work around UART driver bugs.
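To make the expected driver contract concrete, a hypothetical set_mctrl() implementation (the uart_ops hook ultimately behind tiocmset()) could look like the sketch below; the function name is invented, while TIOCM_*, UART_MCR_* and serial_port_out() are the stock kernel definitions:

static void example_set_mctrl(struct uart_port *port, unsigned int mctrl)
{
	unsigned int mcr = 0;

	/* honor the RTS request even when auto-RTS flow control is on */
	if (mctrl & TIOCM_RTS)
		mcr |= UART_MCR_RTS;
	if (mctrl & TIOCM_DTR)
		mcr |= UART_MCR_DTR;

	serial_port_out(port, UART_MCR, mcr);
}

With that contract honored in the driver, callers such as hci_ath below can simply toggle RTS through tiocmset() without detouring through termios.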
Cc: Johan Hedberg Cc: Signed-off-by: Peter Hurley Acked-by: Marcel Holtmann Signed-off-by: Greg Kroah-Hartman --- drivers/bluetooth/hci_ath.c | 15 ++------------- drivers/tty/tty_ioctl.c | 3 +-- include/linux/tty.h | 1 - 3 files changed, 3 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/bluetooth/hci_ath.c b/drivers/bluetooth/hci_ath.c index 2ff6dfd2d3f0..9c4dcf4c62ea 100644 --- a/drivers/bluetooth/hci_ath.c +++ b/drivers/bluetooth/hci_ath.c @@ -51,33 +51,22 @@ struct ath_struct { static int ath_wakeup_ar3k(struct tty_struct *tty) { - struct ktermios ktermios; int status = tty->driver->ops->tiocmget(tty); if (status & TIOCM_CTS) return status; - /* Disable Automatic RTSCTS */ - ktermios = tty->termios; - ktermios.c_cflag &= ~CRTSCTS; - tty_set_termios(tty, &ktermios); - /* Clear RTS first */ - status = tty->driver->ops->tiocmget(tty); + tty->driver->ops->tiocmget(tty); tty->driver->ops->tiocmset(tty, 0x00, TIOCM_RTS); mdelay(20); /* Set RTS, wake up board */ - status = tty->driver->ops->tiocmget(tty); + tty->driver->ops->tiocmget(tty); tty->driver->ops->tiocmset(tty, TIOCM_RTS, 0x00); mdelay(20); status = tty->driver->ops->tiocmget(tty); - - /* Enable Automatic RTSCTS */ - ktermios.c_cflag |= CRTSCTS; - status = tty_set_termios(tty, &ktermios); - return status; } diff --git a/drivers/tty/tty_ioctl.c b/drivers/tty/tty_ioctl.c index 1787fa4d9448..a5cf253b2544 100644 --- a/drivers/tty/tty_ioctl.c +++ b/drivers/tty/tty_ioctl.c @@ -530,7 +530,7 @@ EXPORT_SYMBOL(tty_termios_hw_change); * Locking: termios_rwsem */ -int tty_set_termios(struct tty_struct *tty, struct ktermios *new_termios) +static int tty_set_termios(struct tty_struct *tty, struct ktermios *new_termios) { struct ktermios old_termios; struct tty_ldisc *ld; @@ -563,7 +563,6 @@ int tty_set_termios(struct tty_struct *tty, struct ktermios *new_termios) up_write(&tty->termios_rwsem); return 0; } -EXPORT_SYMBOL_GPL(tty_set_termios); /** * set_termios - set termios values for a tty diff --git a/include/linux/tty.h b/include/linux/tty.h index fe5623c9af71..358a337af598 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -491,7 +491,6 @@ static inline speed_t tty_get_baud_rate(struct tty_struct *tty) extern void tty_termios_copy_hw(struct ktermios *new, struct ktermios *old); extern int tty_termios_hw_change(struct ktermios *a, struct ktermios *b); -extern int tty_set_termios(struct tty_struct *tty, struct ktermios *kt); extern struct tty_ldisc *tty_ldisc_ref(struct tty_struct *); extern void tty_ldisc_deref(struct tty_ldisc *); -- cgit v1.2.3 From 035a61c314eb3dab5bcc5683afaf4d412689858a Mon Sep 17 00:00:00 2001 From: Tomeu Vizoso Date: Fri, 23 Jan 2015 12:03:30 +0100 Subject: clk: Make clk API return per-user struct clk instances Moves clock state to struct clk_core, but takes care to change as little API as possible. struct clk_hw still has a pointer to a struct clk, which is the implementation's per-user clk instance, for backwards compatibility. The struct clk that clk_get_parent() returns isn't owned by the caller, but by the clock implementation, so the former shouldn't call clk_put() on it. Because some boards in mach-omap2 still register clocks statically, their clock registration had to be updated to take into account that the clock information is stored in struct clk_core now. 
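In outline, the split looks like the sketch below; the field lists are trimmed to the essentials rather than being the full definitions:

struct clk_core {
	const char		*name;
	const struct clk_ops	*ops;
	struct clk_hw		*hw;
	struct clk_core		*parent;
	const char		**parent_names;
	u8			num_parents;
	unsigned long		rate;
	unsigned long		flags;
	/* ... prepare/enable counts, children list, debugfs entry ... */
};

struct clk {
	struct clk_core	*core;	/* shared state, owned by the framework */
	/* per-user fields (e.g. rate constraints) are added by later work */
};

This is why the statically-registered OMAP clocks below now define a _core object and a one-member struct clk wrapper pointing at it.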
Signed-off-by: Tomeu Vizoso Reviewed-by: Stephen Boyd Tested-by: Tony Lindgren Signed-off-by: Michael Turquette [mturquette@linaro.org: adapted clk_has_parent to struct clk_core applied OMAP3+ DPLL fix from Tero & Tony] --- arch/arm/mach-omap2/cclock3xxx_data.c | 111 ++++-- arch/arm/mach-omap2/clock.h | 11 +- arch/arm/mach-omap2/clock_common_data.c | 5 +- arch/arm/mach-omap2/dpll3xxx.c | 11 +- drivers/clk/clk.c | 631 ++++++++++++++++++++------------ drivers/clk/clk.h | 5 + drivers/clk/clkdev.c | 84 ++++- include/linux/clk-private.h | 35 +- include/linux/clk-provider.h | 12 +- 9 files changed, 602 insertions(+), 303 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-omap2/cclock3xxx_data.c b/arch/arm/mach-omap2/cclock3xxx_data.c index 644ff3231bb8..adb4e6437204 100644 --- a/arch/arm/mach-omap2/cclock3xxx_data.c +++ b/arch/arm/mach-omap2/cclock3xxx_data.c @@ -82,7 +82,7 @@ DEFINE_CLK_MUX(osc_sys_ck, osc_sys_ck_parent_names, NULL, 0x0, OMAP3430_PRM_CLKSEL, OMAP3430_SYS_CLKIN_SEL_SHIFT, OMAP3430_SYS_CLKIN_SEL_WIDTH, 0x0, NULL); -DEFINE_CLK_DIVIDER(sys_ck, "osc_sys_ck", &osc_sys_ck, 0x0, +DEFINE_CLK_DIVIDER(sys_ck, "osc_sys_ck", &osc_sys_ck_core, 0x0, OMAP3430_PRM_CLKSRC_CTRL, OMAP_SYSCLKDIV_SHIFT, OMAP_SYSCLKDIV_WIDTH, CLK_DIVIDER_ONE_BASED, NULL); @@ -132,7 +132,7 @@ static struct clk_hw_omap dpll3_ck_hw = { DEFINE_STRUCT_CLK(dpll3_ck, dpll3_ck_parent_names, dpll3_ck_ops); -DEFINE_CLK_DIVIDER(dpll3_m2_ck, "dpll3_ck", &dpll3_ck, 0x0, +DEFINE_CLK_DIVIDER(dpll3_m2_ck, "dpll3_ck", &dpll3_ck_core, 0x0, OMAP_CM_REGADDR(PLL_MOD, CM_CLKSEL1), OMAP3430_CORE_DPLL_CLKOUT_DIV_SHIFT, OMAP3430_CORE_DPLL_CLKOUT_DIV_WIDTH, @@ -149,12 +149,12 @@ static const struct clk_ops core_ck_ops = {}; DEFINE_STRUCT_CLK_HW_OMAP(core_ck, NULL); DEFINE_STRUCT_CLK(core_ck, core_ck_parent_names, core_ck_ops); -DEFINE_CLK_DIVIDER(l3_ick, "core_ck", &core_ck, 0x0, +DEFINE_CLK_DIVIDER(l3_ick, "core_ck", &core_ck_core, 0x0, OMAP_CM_REGADDR(CORE_MOD, CM_CLKSEL), OMAP3430_CLKSEL_L3_SHIFT, OMAP3430_CLKSEL_L3_WIDTH, CLK_DIVIDER_ONE_BASED, NULL); -DEFINE_CLK_DIVIDER(l4_ick, "l3_ick", &l3_ick, 0x0, +DEFINE_CLK_DIVIDER(l4_ick, "l3_ick", &l3_ick_core, 0x0, OMAP_CM_REGADDR(CORE_MOD, CM_CLKSEL), OMAP3430_CLKSEL_L4_SHIFT, OMAP3430_CLKSEL_L4_WIDTH, CLK_DIVIDER_ONE_BASED, NULL); @@ -275,9 +275,9 @@ static struct clk_hw_omap dpll1_ck_hw = { DEFINE_STRUCT_CLK(dpll1_ck, dpll3_ck_parent_names, dpll1_ck_ops); -DEFINE_CLK_FIXED_FACTOR(dpll1_x2_ck, "dpll1_ck", &dpll1_ck, 0x0, 2, 1); +DEFINE_CLK_FIXED_FACTOR(dpll1_x2_ck, "dpll1_ck", &dpll1_ck_core, 0x0, 2, 1); -DEFINE_CLK_DIVIDER(dpll1_x2m2_ck, "dpll1_x2_ck", &dpll1_x2_ck, 0x0, +DEFINE_CLK_DIVIDER(dpll1_x2m2_ck, "dpll1_x2_ck", &dpll1_x2_ck_core, 0x0, OMAP_CM_REGADDR(MPU_MOD, OMAP3430_CM_CLKSEL2_PLL), OMAP3430_MPU_DPLL_CLKOUT_DIV_SHIFT, OMAP3430_MPU_DPLL_CLKOUT_DIV_WIDTH, @@ -292,7 +292,7 @@ static const char *mpu_ck_parent_names[] = { DEFINE_STRUCT_CLK_HW_OMAP(mpu_ck, "mpu_clkdm"); DEFINE_STRUCT_CLK(mpu_ck, mpu_ck_parent_names, core_l4_ick_ops); -DEFINE_CLK_DIVIDER(arm_fck, "mpu_ck", &mpu_ck, 0x0, +DEFINE_CLK_DIVIDER(arm_fck, "mpu_ck", &mpu_ck_core, 0x0, OMAP_CM_REGADDR(MPU_MOD, OMAP3430_CM_IDLEST_PLL), OMAP3430_ST_MPU_CLK_SHIFT, OMAP3430_ST_MPU_CLK_WIDTH, 0x0, NULL); @@ -424,7 +424,7 @@ static const struct clk_div_table dpll4_mx_ck_div_table[] = { { .div = 0 }, }; -DEFINE_CLK_DIVIDER(dpll4_m5_ck, "dpll4_ck", &dpll4_ck, 0x0, +DEFINE_CLK_DIVIDER(dpll4_m5_ck, "dpll4_ck", &dpll4_ck_core, 0x0, OMAP_CM_REGADDR(OMAP3430_CAM_MOD, CM_CLKSEL), OMAP3430_CLKSEL_CAM_SHIFT, 
OMAP3630_CLKSEL_CAM_WIDTH, CLK_DIVIDER_ONE_BASED, NULL); @@ -466,7 +466,7 @@ static struct clk_hw_omap dpll4_m5x2_ck_hw = { DEFINE_STRUCT_CLK_FLAGS(dpll4_m5x2_ck, dpll4_m5x2_ck_parent_names, dpll4_m5x2_ck_ops, CLK_SET_RATE_PARENT); -static struct clk dpll4_m5x2_ck_3630 = { +static struct clk_core dpll4_m5x2_ck_3630_core = { .name = "dpll4_m5x2_ck", .hw = &dpll4_m5x2_ck_hw.hw, .parent_names = dpll4_m5x2_ck_parent_names, @@ -475,6 +475,10 @@ static struct clk dpll4_m5x2_ck_3630 = { .flags = CLK_SET_RATE_PARENT, }; +static struct clk dpll4_m5x2_ck_3630 = { + .core = &dpll4_m5x2_ck_3630_core, +}; + static struct clk cam_mclk; static const char *cam_mclk_parent_names[] = { @@ -490,7 +494,7 @@ static struct clk_hw_omap cam_mclk_hw = { .clkdm_name = "cam_clkdm", }; -static struct clk cam_mclk = { +static struct clk_core cam_mclk_core = { .name = "cam_mclk", .hw = &cam_mclk_hw.hw, .parent_names = cam_mclk_parent_names, @@ -499,6 +503,10 @@ static struct clk cam_mclk = { .flags = CLK_SET_RATE_PARENT, }; +static struct clk cam_mclk = { + .core = &cam_mclk_core, +}; + static const struct clksel_rate clkout2_src_core_rates[] = { { .div = 1, .val = 0, .flags = RATE_IN_3XXX }, { .div = 0 } @@ -514,7 +522,7 @@ static const struct clksel_rate clkout2_src_96m_rates[] = { { .div = 0 } }; -DEFINE_CLK_DIVIDER(dpll4_m2_ck, "dpll4_ck", &dpll4_ck, 0x0, +DEFINE_CLK_DIVIDER(dpll4_m2_ck, "dpll4_ck", &dpll4_ck_core, 0x0, OMAP_CM_REGADDR(PLL_MOD, OMAP3430_CM_CLKSEL3), OMAP3430_DIV_96M_SHIFT, OMAP3630_DIV_96M_WIDTH, CLK_DIVIDER_ONE_BASED, NULL); @@ -538,7 +546,7 @@ static struct clk_hw_omap dpll4_m2x2_ck_hw = { DEFINE_STRUCT_CLK(dpll4_m2x2_ck, dpll4_m2x2_ck_parent_names, dpll4_m5x2_ck_ops); -static struct clk dpll4_m2x2_ck_3630 = { +static struct clk_core dpll4_m2x2_ck_3630_core = { .name = "dpll4_m2x2_ck", .hw = &dpll4_m2x2_ck_hw.hw, .parent_names = dpll4_m2x2_ck_parent_names, @@ -546,6 +554,10 @@ static struct clk dpll4_m2x2_ck_3630 = { .ops = &dpll4_m5x2_ck_3630_ops, }; +static struct clk dpll4_m2x2_ck_3630 = { + .core = &dpll4_m2x2_ck_3630_core, +}; + static struct clk omap_96m_alwon_fck; static const char *omap_96m_alwon_fck_parent_names[] = { @@ -570,7 +582,7 @@ static const struct clksel_rate clkout2_src_54m_rates[] = { { .div = 0 } }; -DEFINE_CLK_DIVIDER_TABLE(dpll4_m3_ck, "dpll4_ck", &dpll4_ck, 0x0, +DEFINE_CLK_DIVIDER_TABLE(dpll4_m3_ck, "dpll4_ck", &dpll4_ck_core, 0x0, OMAP_CM_REGADDR(OMAP3430_DSS_MOD, CM_CLKSEL), OMAP3430_CLKSEL_TV_SHIFT, OMAP3630_CLKSEL_TV_WIDTH, 0, dpll4_mx_ck_div_table, NULL); @@ -594,7 +606,7 @@ static struct clk_hw_omap dpll4_m3x2_ck_hw = { DEFINE_STRUCT_CLK(dpll4_m3x2_ck, dpll4_m3x2_ck_parent_names, dpll4_m5x2_ck_ops); -static struct clk dpll4_m3x2_ck_3630 = { +static struct clk_core dpll4_m3x2_ck_3630_core = { .name = "dpll4_m3x2_ck", .hw = &dpll4_m3x2_ck_hw.hw, .parent_names = dpll4_m3x2_ck_parent_names, @@ -602,6 +614,10 @@ static struct clk dpll4_m3x2_ck_3630 = { .ops = &dpll4_m5x2_ck_3630_ops, }; +static struct clk dpll4_m3x2_ck_3630 = { + .core = &dpll4_m3x2_ck_3630_core, +}; + static const char *omap_54m_fck_parent_names[] = { "dpll4_m3x2_ck", "sys_altclk", }; @@ -677,7 +693,8 @@ static struct clk_hw_omap omap_48m_fck_hw = { DEFINE_STRUCT_CLK(omap_48m_fck, omap_48m_fck_parent_names, omap_48m_fck_ops); -DEFINE_CLK_FIXED_FACTOR(omap_12m_fck, "omap_48m_fck", &omap_48m_fck, 0x0, 1, 4); +DEFINE_CLK_FIXED_FACTOR(omap_12m_fck, "omap_48m_fck", &omap_48m_fck_core, 0x0, + 1, 4); static struct clk core_12m_fck; @@ -723,7 +740,8 @@ static const char *core_l3_ick_parent_names[] = { 
DEFINE_STRUCT_CLK_HW_OMAP(core_l3_ick, "core_l3_clkdm"); DEFINE_STRUCT_CLK(core_l3_ick, core_l3_ick_parent_names, core_l4_ick_ops); -DEFINE_CLK_FIXED_FACTOR(dpll3_m2x2_ck, "dpll3_m2_ck", &dpll3_m2_ck, 0x0, 2, 1); +DEFINE_CLK_FIXED_FACTOR(dpll3_m2x2_ck, "dpll3_m2_ck", &dpll3_m2_ck_core, 0x0, + 2, 1); static struct clk corex2_fck; @@ -809,7 +827,7 @@ static struct clk_hw_omap des2_ick_hw = { DEFINE_STRUCT_CLK(des2_ick, aes2_ick_parent_names, aes2_ick_ops); -DEFINE_CLK_DIVIDER(dpll1_fck, "core_ck", &core_ck, 0x0, +DEFINE_CLK_DIVIDER(dpll1_fck, "core_ck", &core_ck_core, 0x0, OMAP_CM_REGADDR(MPU_MOD, OMAP3430_CM_CLKSEL1_PLL), OMAP3430_MPU_CLK_SRC_SHIFT, OMAP3430_MPU_CLK_SRC_WIDTH, CLK_DIVIDER_ONE_BASED, NULL); @@ -852,18 +870,18 @@ static struct clk_hw_omap dpll2_ck_hw = { DEFINE_STRUCT_CLK(dpll2_ck, dpll3_ck_parent_names, dpll1_ck_ops); -DEFINE_CLK_DIVIDER(dpll2_fck, "core_ck", &core_ck, 0x0, +DEFINE_CLK_DIVIDER(dpll2_fck, "core_ck", &core_ck_core, 0x0, OMAP_CM_REGADDR(OMAP3430_IVA2_MOD, OMAP3430_CM_CLKSEL1_PLL), OMAP3430_IVA2_CLK_SRC_SHIFT, OMAP3430_IVA2_CLK_SRC_WIDTH, CLK_DIVIDER_ONE_BASED, NULL); -DEFINE_CLK_DIVIDER(dpll2_m2_ck, "dpll2_ck", &dpll2_ck, 0x0, +DEFINE_CLK_DIVIDER(dpll2_m2_ck, "dpll2_ck", &dpll2_ck_core, 0x0, OMAP_CM_REGADDR(OMAP3430_IVA2_MOD, OMAP3430_CM_CLKSEL2_PLL), OMAP3430_IVA2_DPLL_CLKOUT_DIV_SHIFT, OMAP3430_IVA2_DPLL_CLKOUT_DIV_WIDTH, CLK_DIVIDER_ONE_BASED, NULL); -DEFINE_CLK_DIVIDER(dpll3_m3_ck, "dpll3_ck", &dpll3_ck, 0x0, +DEFINE_CLK_DIVIDER(dpll3_m3_ck, "dpll3_ck", &dpll3_ck_core, 0x0, OMAP_CM_REGADDR(OMAP3430_EMU_MOD, CM_CLKSEL1), OMAP3430_DIV_DPLL3_SHIFT, OMAP3430_DIV_DPLL3_WIDTH, CLK_DIVIDER_ONE_BASED, NULL); @@ -887,7 +905,7 @@ static struct clk_hw_omap dpll3_m3x2_ck_hw = { DEFINE_STRUCT_CLK(dpll3_m3x2_ck, dpll3_m3x2_ck_parent_names, dpll4_m5x2_ck_ops); -static struct clk dpll3_m3x2_ck_3630 = { +static struct clk_core dpll3_m3x2_ck_3630_core = { .name = "dpll3_m3x2_ck", .hw = &dpll3_m3x2_ck_hw.hw, .parent_names = dpll3_m3x2_ck_parent_names, @@ -895,9 +913,13 @@ static struct clk dpll3_m3x2_ck_3630 = { .ops = &dpll4_m5x2_ck_3630_ops, }; -DEFINE_CLK_FIXED_FACTOR(dpll3_x2_ck, "dpll3_ck", &dpll3_ck, 0x0, 2, 1); +static struct clk dpll3_m3x2_ck_3630 = { + .core = &dpll3_m3x2_ck_3630_core, +}; + +DEFINE_CLK_FIXED_FACTOR(dpll3_x2_ck, "dpll3_ck", &dpll3_ck_core, 0x0, 2, 1); -DEFINE_CLK_DIVIDER_TABLE(dpll4_m4_ck, "dpll4_ck", &dpll4_ck, 0x0, +DEFINE_CLK_DIVIDER_TABLE(dpll4_m4_ck, "dpll4_ck", &dpll4_ck_core, 0x0, OMAP_CM_REGADDR(OMAP3430_DSS_MOD, CM_CLKSEL), OMAP3430_CLKSEL_DSS1_SHIFT, OMAP3630_CLKSEL_DSS1_WIDTH, 0, dpll4_mx_ck_div_table, NULL); @@ -922,7 +944,7 @@ static struct clk_hw_omap dpll4_m4x2_ck_hw = { DEFINE_STRUCT_CLK_FLAGS(dpll4_m4x2_ck, dpll4_m4x2_ck_parent_names, dpll4_m5x2_ck_ops, CLK_SET_RATE_PARENT); -static struct clk dpll4_m4x2_ck_3630 = { +static struct clk_core dpll4_m4x2_ck_3630_core = { .name = "dpll4_m4x2_ck", .hw = &dpll4_m4x2_ck_hw.hw, .parent_names = dpll4_m4x2_ck_parent_names, @@ -931,7 +953,11 @@ static struct clk dpll4_m4x2_ck_3630 = { .flags = CLK_SET_RATE_PARENT, }; -DEFINE_CLK_DIVIDER(dpll4_m6_ck, "dpll4_ck", &dpll4_ck, 0x0, +static struct clk dpll4_m4x2_ck_3630 = { + .core = &dpll4_m4x2_ck_3630_core, +}; + +DEFINE_CLK_DIVIDER(dpll4_m6_ck, "dpll4_ck", &dpll4_ck_core, 0x0, OMAP_CM_REGADDR(OMAP3430_EMU_MOD, CM_CLKSEL1), OMAP3430_DIV_DPLL4_SHIFT, OMAP3630_DIV_DPLL4_WIDTH, CLK_DIVIDER_ONE_BASED, NULL); @@ -955,7 +981,7 @@ static struct clk_hw_omap dpll4_m6x2_ck_hw = { DEFINE_STRUCT_CLK(dpll4_m6x2_ck, dpll4_m6x2_ck_parent_names, dpll4_m5x2_ck_ops); 
-static struct clk dpll4_m6x2_ck_3630 = { +static struct clk_core dpll4_m6x2_ck_3630_core = { .name = "dpll4_m6x2_ck", .hw = &dpll4_m6x2_ck_hw.hw, .parent_names = dpll4_m6x2_ck_parent_names, @@ -963,7 +989,11 @@ static struct clk dpll4_m6x2_ck_3630 = { .ops = &dpll4_m5x2_ck_3630_ops, }; -DEFINE_CLK_FIXED_FACTOR(dpll4_x2_ck, "dpll4_ck", &dpll4_ck, 0x0, 2, 1); +static struct clk dpll4_m6x2_ck_3630 = { + .core = &dpll4_m6x2_ck_3630_core, +}; + +DEFINE_CLK_FIXED_FACTOR(dpll4_x2_ck, "dpll4_ck", &dpll4_ck_core, 0x0, 2, 1); static struct dpll_data dpll5_dd = { .mult_div1_reg = OMAP_CM_REGADDR(PLL_MOD, OMAP3430ES2_CM_CLKSEL4), @@ -1000,7 +1030,7 @@ static struct clk_hw_omap dpll5_ck_hw = { DEFINE_STRUCT_CLK(dpll5_ck, dpll3_ck_parent_names, dpll1_ck_ops); -DEFINE_CLK_DIVIDER(dpll5_m2_ck, "dpll5_ck", &dpll5_ck, 0x0, +DEFINE_CLK_DIVIDER(dpll5_m2_ck, "dpll5_ck", &dpll5_ck_core, 0x0, OMAP_CM_REGADDR(PLL_MOD, OMAP3430ES2_CM_CLKSEL5), OMAP3430ES2_DIV_120M_SHIFT, OMAP3430ES2_DIV_120M_WIDTH, CLK_DIVIDER_ONE_BASED, NULL); @@ -1247,7 +1277,7 @@ static struct clk_hw_omap emu_src_ck_hw = { DEFINE_STRUCT_CLK(emu_src_ck, emu_src_ck_parent_names, emu_src_ck_ops); -DEFINE_CLK_DIVIDER(atclk_fck, "emu_src_ck", &emu_src_ck, 0x0, +DEFINE_CLK_DIVIDER(atclk_fck, "emu_src_ck", &emu_src_ck_core, 0x0, OMAP_CM_REGADDR(OMAP3430_EMU_MOD, CM_CLKSEL1), OMAP3430_CLKSEL_ATCLK_SHIFT, OMAP3430_CLKSEL_ATCLK_WIDTH, CLK_DIVIDER_ONE_BASED, NULL); @@ -1298,7 +1328,7 @@ static struct clk_hw_omap gfx_l3_ck_hw = { DEFINE_STRUCT_CLK(gfx_l3_ck, core_l3_ick_parent_names, aes1_ick_ops); -DEFINE_CLK_DIVIDER(gfx_l3_fck, "l3_ick", &l3_ick, 0x0, +DEFINE_CLK_DIVIDER(gfx_l3_fck, "l3_ick", &l3_ick_core, 0x0, OMAP_CM_REGADDR(GFX_MOD, CM_CLKSEL), OMAP_CLKSEL_GFX_SHIFT, OMAP_CLKSEL_GFX_WIDTH, CLK_DIVIDER_ONE_BASED, NULL); @@ -2498,7 +2528,7 @@ static struct clk_hw_omap omap_96m_alwon_fck_3630_hw = { .clksel_mask = OMAP3630_CLKSEL_96M_MASK, }; -static struct clk omap_96m_alwon_fck_3630 = { +static struct clk_core omap_96m_alwon_fck_3630_core = { .name = "omap_96m_alwon_fck", .hw = &omap_96m_alwon_fck_3630_hw.hw, .parent_names = omap_96m_alwon_fck_3630_parent_names, @@ -2506,6 +2536,10 @@ static struct clk omap_96m_alwon_fck_3630 = { .ops = &omap_96m_alwon_fck_3630_ops, }; +static struct clk omap_96m_alwon_fck_3630 = { + .core = &omap_96m_alwon_fck_3630_core, +}; + static struct clk omapctrl_ick; static struct clk_hw_omap omapctrl_ick_hw = { @@ -2521,12 +2555,12 @@ static struct clk_hw_omap omapctrl_ick_hw = { DEFINE_STRUCT_CLK(omapctrl_ick, aes2_ick_parent_names, aes2_ick_ops); -DEFINE_CLK_DIVIDER(pclk_fck, "emu_src_ck", &emu_src_ck, 0x0, +DEFINE_CLK_DIVIDER(pclk_fck, "emu_src_ck", &emu_src_ck_core, 0x0, OMAP_CM_REGADDR(OMAP3430_EMU_MOD, CM_CLKSEL1), OMAP3430_CLKSEL_PCLK_SHIFT, OMAP3430_CLKSEL_PCLK_WIDTH, CLK_DIVIDER_ONE_BASED, NULL); -DEFINE_CLK_DIVIDER(pclkx2_fck, "emu_src_ck", &emu_src_ck, 0x0, +DEFINE_CLK_DIVIDER(pclkx2_fck, "emu_src_ck", &emu_src_ck_core, 0x0, OMAP_CM_REGADDR(OMAP3430_EMU_MOD, CM_CLKSEL1), OMAP3430_CLKSEL_PCLKX2_SHIFT, OMAP3430_CLKSEL_PCLKX2_WIDTH, CLK_DIVIDER_ONE_BASED, NULL); @@ -2558,7 +2592,7 @@ static struct clk_hw_omap pka_ick_hw = { DEFINE_STRUCT_CLK(pka_ick, pka_ick_parent_names, aes1_ick_ops); -DEFINE_CLK_DIVIDER(rm_ick, "l4_ick", &l4_ick, 0x0, +DEFINE_CLK_DIVIDER(rm_ick, "l4_ick", &l4_ick_core, 0x0, OMAP_CM_REGADDR(WKUP_MOD, CM_CLKSEL), OMAP3430_CLKSEL_RM_SHIFT, OMAP3430_CLKSEL_RM_WIDTH, CLK_DIVIDER_ONE_BASED, NULL); @@ -2819,10 +2853,10 @@ DEFINE_CLK_OMAP_MUX_GATE(ssi_ssr_fck_3430es2, "core_l4_clkdm", 
ssi_ssr_fck_3430es1_ops); DEFINE_CLK_FIXED_FACTOR(ssi_sst_fck_3430es1, "ssi_ssr_fck_3430es1", - &ssi_ssr_fck_3430es1, 0x0, 1, 2); + &ssi_ssr_fck_3430es1_core, 0x0, 1, 2); DEFINE_CLK_FIXED_FACTOR(ssi_sst_fck_3430es2, "ssi_ssr_fck_3430es2", - &ssi_ssr_fck_3430es2, 0x0, 1, 2); + &ssi_ssr_fck_3430es2_core, 0x0, 1, 2); static struct clk sys_clkout1; @@ -2840,7 +2874,7 @@ static struct clk_hw_omap sys_clkout1_hw = { DEFINE_STRUCT_CLK(sys_clkout1, sys_clkout1_parent_names, aes1_ick_ops); -DEFINE_CLK_DIVIDER(sys_clkout2, "clkout2_src_ck", &clkout2_src_ck, 0x0, +DEFINE_CLK_DIVIDER(sys_clkout2, "clkout2_src_ck", &clkout2_src_ck_core, 0x0, OMAP3430_CM_CLKOUT_CTRL, OMAP3430_CLKOUT2_DIV_SHIFT, OMAP3430_CLKOUT2_DIV_WIDTH, CLK_DIVIDER_POWER_OF_TWO, NULL); @@ -2849,7 +2883,8 @@ DEFINE_CLK_MUX(traceclk_src_fck, emu_src_ck_parent_names, NULL, 0x0, OMAP3430_TRACE_MUX_CTRL_SHIFT, OMAP3430_TRACE_MUX_CTRL_WIDTH, 0x0, NULL); -DEFINE_CLK_DIVIDER(traceclk_fck, "traceclk_src_fck", &traceclk_src_fck, 0x0, +DEFINE_CLK_DIVIDER(traceclk_fck, "traceclk_src_fck", &traceclk_src_fck_core, + 0x0, OMAP_CM_REGADDR(OMAP3430_EMU_MOD, CM_CLKSEL1), OMAP3430_CLKSEL_TRACECLK_SHIFT, OMAP3430_CLKSEL_TRACECLK_WIDTH, CLK_DIVIDER_ONE_BASED, NULL); diff --git a/arch/arm/mach-omap2/clock.h b/arch/arm/mach-omap2/clock.h index a4282e79143e..c5b3a7f3e41d 100644 --- a/arch/arm/mach-omap2/clock.h +++ b/arch/arm/mach-omap2/clock.h @@ -40,23 +40,29 @@ struct omap_clk { struct clockdomain; #define DEFINE_STRUCT_CLK(_name, _parent_array_name, _clkops_name) \ - static struct clk _name = { \ + static struct clk_core _name##_core = { \ .name = #_name, \ .hw = &_name##_hw.hw, \ .parent_names = _parent_array_name, \ .num_parents = ARRAY_SIZE(_parent_array_name), \ .ops = &_clkops_name, \ + }; \ + static struct clk _name = { \ + .core = &_name##_core, \ }; #define DEFINE_STRUCT_CLK_FLAGS(_name, _parent_array_name, \ _clkops_name, _flags) \ - static struct clk _name = { \ + static struct clk_core _name##_core = { \ .name = #_name, \ .hw = &_name##_hw.hw, \ .parent_names = _parent_array_name, \ .num_parents = ARRAY_SIZE(_parent_array_name), \ .ops = &_clkops_name, \ .flags = _flags, \ + }; \ + static struct clk _name = { \ + .core = &_name##_core, \ }; #define DEFINE_STRUCT_CLK_HW_OMAP(_name, _clkdm_name) \ @@ -248,6 +254,7 @@ extern const struct clksel_rate gpt_32k_rates[]; extern const struct clksel_rate gpt_sys_rates[]; extern const struct clksel_rate gfx_l3_rates[]; extern const struct clksel_rate dsp_ick_rates[]; +extern struct clk_core dummy_ck_core; extern struct clk dummy_ck; extern const struct clk_hw_omap_ops clkhwops_iclk_wait; diff --git a/arch/arm/mach-omap2/clock_common_data.c b/arch/arm/mach-omap2/clock_common_data.c index ef4d21bfb964..febd0a279224 100644 --- a/arch/arm/mach-omap2/clock_common_data.c +++ b/arch/arm/mach-omap2/clock_common_data.c @@ -119,8 +119,11 @@ const struct clksel_rate div31_1to31_rates[] = { static struct clk_ops dummy_ck_ops = {}; -struct clk dummy_ck = { +struct clk_core dummy_ck_core = { .name = "dummy_clk", .ops = &dummy_ck_ops, .flags = CLK_IS_BASIC, }; +struct clk dummy_ck = { + .core = &dummy_ck_core, +}; diff --git a/arch/arm/mach-omap2/dpll3xxx.c b/arch/arm/mach-omap2/dpll3xxx.c index c2da2a0fe5ad..49752d77f5bc 100644 --- a/arch/arm/mach-omap2/dpll3xxx.c +++ b/arch/arm/mach-omap2/dpll3xxx.c @@ -410,7 +410,7 @@ int omap3_noncore_dpll_enable(struct clk_hw *hw) struct clk_hw_omap *clk = to_clk_hw_omap(hw); int r; struct dpll_data *dd; - struct clk *parent; + struct clk_hw *parent; dd = clk->dpll_data; if (!dd) 
@@ -427,13 +427,13 @@ int omap3_noncore_dpll_enable(struct clk_hw *hw) } } - parent = __clk_get_parent(hw->clk); + parent = __clk_get_hw(__clk_get_parent(hw->clk)); if (__clk_get_rate(hw->clk) == __clk_get_rate(dd->clk_bypass)) { - WARN_ON(parent != dd->clk_bypass); + WARN_ON(parent != __clk_get_hw(dd->clk_bypass)); r = _omap3_noncore_dpll_bypass(clk); } else { - WARN_ON(parent != dd->clk_ref); + WARN_ON(parent != __clk_get_hw(dd->clk_ref)); r = _omap3_noncore_dpll_lock(clk); } @@ -549,7 +549,8 @@ int omap3_noncore_dpll_set_rate(struct clk_hw *hw, unsigned long rate, if (!dd) return -EINVAL; - if (__clk_get_parent(hw->clk) != dd->clk_ref) + if (__clk_get_hw(__clk_get_parent(hw->clk)) != + __clk_get_hw(dd->clk_ref)) return -EINVAL; if (dd->last_rounded_rate == 0) diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index b701e7c195e4..d60c193b81aa 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -37,6 +37,15 @@ static HLIST_HEAD(clk_root_list); static HLIST_HEAD(clk_orphan_list); static LIST_HEAD(clk_notifier_list); +static long clk_core_get_accuracy(struct clk_core *clk); +static unsigned long clk_core_get_rate(struct clk_core *clk); +static int clk_core_get_phase(struct clk_core *clk); +static bool clk_core_is_prepared(struct clk_core *clk); +static bool clk_core_is_enabled(struct clk_core *clk); +static unsigned long clk_core_round_rate_nolock(struct clk_core *clk, + unsigned long rate); +static struct clk_core *clk_core_lookup(const char *name); + /*** locking ***/ static void clk_prepare_lock(void) { @@ -114,7 +123,8 @@ static struct hlist_head *orphan_list[] = { NULL, }; -static void clk_summary_show_one(struct seq_file *s, struct clk *c, int level) +static void clk_summary_show_one(struct seq_file *s, struct clk_core *c, + int level) { if (!c) return; @@ -122,14 +132,14 @@ static void clk_summary_show_one(struct seq_file *s, struct clk *c, int level) seq_printf(s, "%*s%-*s %11d %12d %11lu %10lu %-3d\n", level * 3 + 1, "", 30 - level * 3, c->name, - c->enable_count, c->prepare_count, clk_get_rate(c), - clk_get_accuracy(c), clk_get_phase(c)); + c->enable_count, c->prepare_count, clk_core_get_rate(c), + clk_core_get_accuracy(c), clk_core_get_phase(c)); } -static void clk_summary_show_subtree(struct seq_file *s, struct clk *c, +static void clk_summary_show_subtree(struct seq_file *s, struct clk_core *c, int level) { - struct clk *child; + struct clk_core *child; if (!c) return; @@ -142,7 +152,7 @@ static void clk_summary_show_subtree(struct seq_file *s, struct clk *c, static int clk_summary_show(struct seq_file *s, void *data) { - struct clk *c; + struct clk_core *c; struct hlist_head **lists = (struct hlist_head **)s->private; seq_puts(s, " clock enable_cnt prepare_cnt rate accuracy phase\n"); @@ -172,7 +182,7 @@ static const struct file_operations clk_summary_fops = { .release = single_release, }; -static void clk_dump_one(struct seq_file *s, struct clk *c, int level) +static void clk_dump_one(struct seq_file *s, struct clk_core *c, int level) { if (!c) return; @@ -180,14 +190,14 @@ static void clk_dump_one(struct seq_file *s, struct clk *c, int level) seq_printf(s, "\"%s\": { ", c->name); seq_printf(s, "\"enable_count\": %d,", c->enable_count); seq_printf(s, "\"prepare_count\": %d,", c->prepare_count); - seq_printf(s, "\"rate\": %lu", clk_get_rate(c)); - seq_printf(s, "\"accuracy\": %lu", clk_get_accuracy(c)); - seq_printf(s, "\"phase\": %d", clk_get_phase(c)); + seq_printf(s, "\"rate\": %lu", clk_core_get_rate(c)); + seq_printf(s, "\"accuracy\": %lu", 
clk_core_get_accuracy(c)); + seq_printf(s, "\"phase\": %d", clk_core_get_phase(c)); } -static void clk_dump_subtree(struct seq_file *s, struct clk *c, int level) +static void clk_dump_subtree(struct seq_file *s, struct clk_core *c, int level) { - struct clk *child; + struct clk_core *child; if (!c) return; @@ -204,7 +214,7 @@ static void clk_dump_subtree(struct seq_file *s, struct clk *c, int level) static int clk_dump(struct seq_file *s, void *data) { - struct clk *c; + struct clk_core *c; bool first_node = true; struct hlist_head **lists = (struct hlist_head **)s->private; @@ -240,7 +250,7 @@ static const struct file_operations clk_dump_fops = { .release = single_release, }; -static int clk_debug_create_one(struct clk *clk, struct dentry *pdentry) +static int clk_debug_create_one(struct clk_core *clk, struct dentry *pdentry) { struct dentry *d; int ret = -ENOMEM; @@ -315,7 +325,7 @@ out: * initialized. Otherwise it bails out early since the debugfs clk tree * will be created lazily by clk_debug_init as part of a late_initcall. */ -static int clk_debug_register(struct clk *clk) +static int clk_debug_register(struct clk_core *clk) { int ret = 0; @@ -340,7 +350,7 @@ unlock: * debugfs clk tree if clk->dentry points to debugfs created by * clk_debug_register in __clk_init. */ -static void clk_debug_unregister(struct clk *clk) +static void clk_debug_unregister(struct clk_core *clk) { mutex_lock(&clk_debug_lock); hlist_del_init(&clk->debug_node); @@ -354,8 +364,9 @@ struct dentry *clk_debugfs_add_file(struct clk_hw *hw, char *name, umode_t mode, { struct dentry *d = NULL; - if (hw->clk->dentry) - d = debugfs_create_file(name, mode, hw->clk->dentry, data, fops); + if (hw->core->dentry) + d = debugfs_create_file(name, mode, hw->core->dentry, data, + fops); return d; } @@ -375,7 +386,7 @@ EXPORT_SYMBOL_GPL(clk_debugfs_add_file); */ static int __init clk_debug_init(void) { - struct clk *clk; + struct clk_core *clk; struct dentry *d; rootdir = debugfs_create_dir("clk", NULL); @@ -414,19 +425,20 @@ static int __init clk_debug_init(void) } late_initcall(clk_debug_init); #else -static inline int clk_debug_register(struct clk *clk) { return 0; } -static inline void clk_debug_reparent(struct clk *clk, struct clk *new_parent) +static inline int clk_debug_register(struct clk_core *clk) { return 0; } +static inline void clk_debug_reparent(struct clk_core *clk, + struct clk_core *new_parent) { } -static inline void clk_debug_unregister(struct clk *clk) +static inline void clk_debug_unregister(struct clk_core *clk) { } #endif /* caller must hold prepare_lock */ -static void clk_unprepare_unused_subtree(struct clk *clk) +static void clk_unprepare_unused_subtree(struct clk_core *clk) { - struct clk *child; + struct clk_core *child; hlist_for_each_entry(child, &clk->children, child_node) clk_unprepare_unused_subtree(child); @@ -437,7 +449,7 @@ static void clk_unprepare_unused_subtree(struct clk *clk) if (clk->flags & CLK_IGNORE_UNUSED) return; - if (__clk_is_prepared(clk)) { + if (clk_core_is_prepared(clk)) { if (clk->ops->unprepare_unused) clk->ops->unprepare_unused(clk->hw); else if (clk->ops->unprepare) @@ -446,9 +458,9 @@ static void clk_unprepare_unused_subtree(struct clk *clk) } /* caller must hold prepare_lock */ -static void clk_disable_unused_subtree(struct clk *clk) +static void clk_disable_unused_subtree(struct clk_core *clk) { - struct clk *child; + struct clk_core *child; unsigned long flags; hlist_for_each_entry(child, &clk->children, child_node) @@ -467,7 +479,7 @@ static void 
clk_disable_unused_subtree(struct clk *clk) * sequence. call .disable_unused if available, otherwise fall * back to .disable */ - if (__clk_is_enabled(clk)) { + if (clk_core_is_enabled(clk)) { if (clk->ops->disable_unused) clk->ops->disable_unused(clk->hw); else if (clk->ops->disable) @@ -488,7 +500,7 @@ __setup("clk_ignore_unused", clk_ignore_unused_setup); static int clk_disable_unused(void) { - struct clk *clk; + struct clk_core *clk; if (clk_ignore_unused) { pr_warn("clk: Not disabling unused clocks\n"); @@ -519,48 +531,65 @@ late_initcall_sync(clk_disable_unused); const char *__clk_get_name(struct clk *clk) { - return !clk ? NULL : clk->name; + return !clk ? NULL : clk->core->name; } EXPORT_SYMBOL_GPL(__clk_get_name); struct clk_hw *__clk_get_hw(struct clk *clk) { - return !clk ? NULL : clk->hw; + return !clk ? NULL : clk->core->hw; } EXPORT_SYMBOL_GPL(__clk_get_hw); u8 __clk_get_num_parents(struct clk *clk) { - return !clk ? 0 : clk->num_parents; + return !clk ? 0 : clk->core->num_parents; } EXPORT_SYMBOL_GPL(__clk_get_num_parents); struct clk *__clk_get_parent(struct clk *clk) { - return !clk ? NULL : clk->parent; + if (!clk) + return NULL; + + /* TODO: Create a per-user clk and change callers to call clk_put */ + return !clk->core->parent ? NULL : clk->core->parent->hw->clk; } EXPORT_SYMBOL_GPL(__clk_get_parent); -struct clk *clk_get_parent_by_index(struct clk *clk, u8 index) +static struct clk_core *clk_core_get_parent_by_index(struct clk_core *clk, + u8 index) { if (!clk || index >= clk->num_parents) return NULL; else if (!clk->parents) - return __clk_lookup(clk->parent_names[index]); + return clk_core_lookup(clk->parent_names[index]); else if (!clk->parents[index]) return clk->parents[index] = - __clk_lookup(clk->parent_names[index]); + clk_core_lookup(clk->parent_names[index]); else return clk->parents[index]; } + +struct clk *clk_get_parent_by_index(struct clk *clk, u8 index) +{ + struct clk_core *parent; + + if (!clk) + return NULL; + + parent = clk_core_get_parent_by_index(clk->core, index); + + return !parent ? NULL : parent->hw->clk; +} EXPORT_SYMBOL_GPL(clk_get_parent_by_index); unsigned int __clk_get_enable_count(struct clk *clk) { - return !clk ? 0 : clk->enable_count; + return !clk ? 0 : clk->core->enable_count; } -unsigned long __clk_get_rate(struct clk *clk) +static unsigned long clk_core_get_rate_nolock(struct clk_core *clk) { unsigned long ret; @@ -580,9 +609,17 @@ unsigned long __clk_get_rate(struct clk *clk) out: return ret; } + +unsigned long __clk_get_rate(struct clk *clk) +{ + if (!clk) + return 0; + + return clk_core_get_rate_nolock(clk->core); +} EXPORT_SYMBOL_GPL(__clk_get_rate); -static unsigned long __clk_get_accuracy(struct clk *clk) +static unsigned long __clk_get_accuracy(struct clk_core *clk) { if (!clk) return 0; @@ -592,11 +629,11 @@ static unsigned long __clk_get_accuracy(struct clk *clk) unsigned long __clk_get_flags(struct clk *clk) { - return !clk ? 0 : clk->flags; + return !clk ? 
0 : clk->core->flags; } EXPORT_SYMBOL_GPL(__clk_get_flags); -bool __clk_is_prepared(struct clk *clk) +static bool clk_core_is_prepared(struct clk_core *clk) { int ret; @@ -617,7 +654,15 @@ out: return !!ret; } -bool __clk_is_enabled(struct clk *clk) +bool __clk_is_prepared(struct clk *clk) +{ + if (!clk) + return false; + + return clk_core_is_prepared(clk->core); +} + +static bool clk_core_is_enabled(struct clk_core *clk) { int ret; @@ -637,12 +682,21 @@ bool __clk_is_enabled(struct clk *clk) out: return !!ret; } + +bool __clk_is_enabled(struct clk *clk) +{ + if (!clk) + return false; + + return clk_core_is_enabled(clk->core); +} EXPORT_SYMBOL_GPL(__clk_is_enabled); -static struct clk *__clk_lookup_subtree(const char *name, struct clk *clk) +static struct clk_core *__clk_lookup_subtree(const char *name, + struct clk_core *clk) { - struct clk *child; - struct clk *ret; + struct clk_core *child; + struct clk_core *ret; if (!strcmp(clk->name, name)) return clk; @@ -656,10 +710,10 @@ static struct clk *__clk_lookup_subtree(const char *name, struct clk *clk) return NULL; } -struct clk *__clk_lookup(const char *name) +static struct clk_core *clk_core_lookup(const char *name) { - struct clk *root_clk; - struct clk *ret; + struct clk_core *root_clk; + struct clk_core *ret; if (!name) return NULL; @@ -696,32 +750,32 @@ clk_mux_determine_rate_flags(struct clk_hw *hw, unsigned long rate, struct clk_hw **best_parent_p, unsigned long flags) { - struct clk *clk = hw->clk, *parent, *best_parent = NULL; + struct clk_core *core = hw->core, *parent, *best_parent = NULL; int i, num_parents; unsigned long parent_rate, best = 0; /* if NO_REPARENT flag set, pass through to current parent */ - if (clk->flags & CLK_SET_RATE_NO_REPARENT) { - parent = clk->parent; - if (clk->flags & CLK_SET_RATE_PARENT) - best = __clk_round_rate(parent, rate); + if (core->flags & CLK_SET_RATE_NO_REPARENT) { + parent = core->parent; + if (core->flags & CLK_SET_RATE_PARENT) + best = clk_core_round_rate_nolock(parent, rate); else if (parent) - best = __clk_get_rate(parent); + best = clk_core_get_rate_nolock(parent); else - best = __clk_get_rate(clk); + best = clk_core_get_rate_nolock(core); goto out; } /* find the parent that can provide the fastest rate <= rate */ - num_parents = clk->num_parents; + num_parents = core->num_parents; for (i = 0; i < num_parents; i++) { - parent = clk_get_parent_by_index(clk, i); + parent = clk_core_get_parent_by_index(core, i); if (!parent) continue; - if (clk->flags & CLK_SET_RATE_PARENT) - parent_rate = __clk_round_rate(parent, rate); + if (core->flags & CLK_SET_RATE_PARENT) + parent_rate = clk_core_round_rate_nolock(parent, rate); else - parent_rate = __clk_get_rate(parent); + parent_rate = clk_core_get_rate_nolock(parent); if (mux_is_better_rate(rate, parent_rate, best, flags)) { best_parent = parent; best = parent_rate; @@ -736,6 +790,13 @@ out: return best; } +struct clk *__clk_lookup(const char *name) +{ + struct clk_core *core = clk_core_lookup(name); + + return !core ? NULL : core->hw->clk; +} + /* * Helper for finding best parent to provide a given frequency. This can be used * directly as a determine_rate callback (e.g. 
for a mux), or from a more @@ -762,7 +823,7 @@ EXPORT_SYMBOL_GPL(__clk_mux_determine_rate_closest); /*** clk api ***/ -void __clk_unprepare(struct clk *clk) +static void clk_core_unprepare(struct clk_core *clk) { if (!clk) return; @@ -778,7 +839,7 @@ void __clk_unprepare(struct clk *clk) if (clk->ops->unprepare) clk->ops->unprepare(clk->hw); - __clk_unprepare(clk->parent); + clk_core_unprepare(clk->parent); } /** @@ -798,12 +859,12 @@ void clk_unprepare(struct clk *clk) return; clk_prepare_lock(); - __clk_unprepare(clk); + clk_core_unprepare(clk->core); clk_prepare_unlock(); } EXPORT_SYMBOL_GPL(clk_unprepare); -int __clk_prepare(struct clk *clk) +static int clk_core_prepare(struct clk_core *clk) { int ret = 0; @@ -811,14 +872,14 @@ int __clk_prepare(struct clk *clk) return 0; if (clk->prepare_count == 0) { - ret = __clk_prepare(clk->parent); + ret = clk_core_prepare(clk->parent); if (ret) return ret; if (clk->ops->prepare) { ret = clk->ops->prepare(clk->hw); if (ret) { - __clk_unprepare(clk->parent); + clk_core_unprepare(clk->parent); return ret; } } @@ -845,15 +906,18 @@ int clk_prepare(struct clk *clk) { int ret; + if (!clk) + return 0; + clk_prepare_lock(); - ret = __clk_prepare(clk); + ret = clk_core_prepare(clk->core); clk_prepare_unlock(); return ret; } EXPORT_SYMBOL_GPL(clk_prepare); -static void __clk_disable(struct clk *clk) +static void clk_core_disable(struct clk_core *clk) { if (!clk) return; @@ -867,7 +931,15 @@ static void __clk_disable(struct clk *clk) if (clk->ops->disable) clk->ops->disable(clk->hw); - __clk_disable(clk->parent); + clk_core_disable(clk->parent); +} + +static void __clk_disable(struct clk *clk) +{ + if (!clk) + return; + + clk_core_disable(clk->core); } /** @@ -895,7 +967,7 @@ void clk_disable(struct clk *clk) } EXPORT_SYMBOL_GPL(clk_disable); -static int __clk_enable(struct clk *clk) +static int clk_core_enable(struct clk_core *clk) { int ret = 0; @@ -906,7 +978,7 @@ static int __clk_enable(struct clk *clk) return -ESHUTDOWN; if (clk->enable_count == 0) { - ret = __clk_enable(clk->parent); + ret = clk_core_enable(clk->parent); if (ret) return ret; @@ -914,7 +986,7 @@ static int __clk_enable(struct clk *clk) if (clk->ops->enable) { ret = clk->ops->enable(clk->hw); if (ret) { - __clk_disable(clk->parent); + clk_core_disable(clk->parent); return ret; } } @@ -924,6 +996,14 @@ static int __clk_enable(struct clk *clk) return 0; } +static int __clk_enable(struct clk *clk) +{ + if (!clk) + return 0; + + return clk_core_enable(clk->core); +} + /** * clk_enable - ungate a clock * @clk: the clk being ungated @@ -950,17 +1030,11 @@ int clk_enable(struct clk *clk) } EXPORT_SYMBOL_GPL(clk_enable); -/** - * __clk_round_rate - round the given rate for a clk - * @clk: round the rate of this clock - * @rate: the rate which is to be rounded - * - * Caller must hold prepare_lock. 
Useful for clk_ops such as .set_rate - */ -unsigned long __clk_round_rate(struct clk *clk, unsigned long rate) +static unsigned long clk_core_round_rate_nolock(struct clk_core *clk, + unsigned long rate) { unsigned long parent_rate = 0; - struct clk *parent; + struct clk_core *parent; struct clk_hw *parent_hw; if (!clk) @@ -977,10 +1051,25 @@ unsigned long __clk_round_rate(struct clk *clk, unsigned long rate) } else if (clk->ops->round_rate) return clk->ops->round_rate(clk->hw, rate, &parent_rate); else if (clk->flags & CLK_SET_RATE_PARENT) - return __clk_round_rate(clk->parent, rate); + return clk_core_round_rate_nolock(clk->parent, rate); else return clk->rate; } + +/** + * __clk_round_rate - round the given rate for a clk + * @clk: round the rate of this clock + * @rate: the rate which is to be rounded + * + * Caller must hold prepare_lock. Useful for clk_ops such as .set_rate + */ +unsigned long __clk_round_rate(struct clk *clk, unsigned long rate) +{ + if (!clk) + return 0; + + return clk_core_round_rate_nolock(clk->core, rate); +} EXPORT_SYMBOL_GPL(__clk_round_rate); /** @@ -996,8 +1085,11 @@ long clk_round_rate(struct clk *clk, unsigned long rate) { unsigned long ret; + if (!clk) + return 0; + clk_prepare_lock(); - ret = __clk_round_rate(clk, rate); + ret = clk_core_round_rate_nolock(clk->core, rate); clk_prepare_unlock(); return ret; @@ -1018,22 +1110,21 @@ EXPORT_SYMBOL_GPL(clk_round_rate); * called if all went well, or NOTIFY_STOP or NOTIFY_BAD immediately if * a driver returns that. */ -static int __clk_notify(struct clk *clk, unsigned long msg, +static int __clk_notify(struct clk_core *clk, unsigned long msg, unsigned long old_rate, unsigned long new_rate) { struct clk_notifier *cn; struct clk_notifier_data cnd; int ret = NOTIFY_DONE; - cnd.clk = clk; cnd.old_rate = old_rate; cnd.new_rate = new_rate; list_for_each_entry(cn, &clk_notifier_list, node) { - if (cn->clk == clk) { + if (cn->clk->core == clk) { + cnd.clk = cn->clk; ret = srcu_notifier_call_chain(&cn->notifier_head, msg, &cnd); - break; } } @@ -1051,10 +1142,10 @@ static int __clk_notify(struct clk *clk, unsigned long msg, * * Caller must hold prepare_lock. 
*/ -static void __clk_recalc_accuracies(struct clk *clk) +static void __clk_recalc_accuracies(struct clk_core *clk) { unsigned long parent_accuracy = 0; - struct clk *child; + struct clk_core *child; if (clk->parent) parent_accuracy = clk->parent->accuracy; @@ -1069,6 +1160,20 @@ static void __clk_recalc_accuracies(struct clk *clk) __clk_recalc_accuracies(child); } +static long clk_core_get_accuracy(struct clk_core *clk) +{ + unsigned long accuracy; + + clk_prepare_lock(); + if (clk && (clk->flags & CLK_GET_ACCURACY_NOCACHE)) + __clk_recalc_accuracies(clk); + + accuracy = __clk_get_accuracy(clk); + clk_prepare_unlock(); + + return accuracy; +} + /** * clk_get_accuracy - return the accuracy of clk * @clk: the clk whose accuracy is being returned @@ -1080,20 +1185,15 @@ static void __clk_recalc_accuracies(struct clk *clk) */ long clk_get_accuracy(struct clk *clk) { - unsigned long accuracy; - - clk_prepare_lock(); - if (clk && (clk->flags & CLK_GET_ACCURACY_NOCACHE)) - __clk_recalc_accuracies(clk); - - accuracy = __clk_get_accuracy(clk); - clk_prepare_unlock(); + if (!clk) + return 0; - return accuracy; + return clk_core_get_accuracy(clk->core); } EXPORT_SYMBOL_GPL(clk_get_accuracy); -static unsigned long clk_recalc(struct clk *clk, unsigned long parent_rate) +static unsigned long clk_recalc(struct clk_core *clk, + unsigned long parent_rate) { if (clk->ops->recalc_rate) return clk->ops->recalc_rate(clk->hw, parent_rate); @@ -1114,11 +1214,11 @@ static unsigned long clk_recalc(struct clk *clk, unsigned long parent_rate) * * Caller must hold prepare_lock. */ -static void __clk_recalc_rates(struct clk *clk, unsigned long msg) +static void __clk_recalc_rates(struct clk_core *clk, unsigned long msg) { unsigned long old_rate; unsigned long parent_rate = 0; - struct clk *child; + struct clk_core *child; old_rate = clk->rate; @@ -1138,15 +1238,7 @@ static void __clk_recalc_rates(struct clk *clk, unsigned long msg) __clk_recalc_rates(child, msg); } -/** - * clk_get_rate - return the rate of clk - * @clk: the clk whose rate is being returned - * - * Simply returns the cached rate of the clk, unless CLK_GET_RATE_NOCACHE flag - * is set, which means a recalc_rate will be issued. - * If clk is NULL then returns 0. - */ -unsigned long clk_get_rate(struct clk *clk) +static unsigned long clk_core_get_rate(struct clk_core *clk) { unsigned long rate; @@ -1155,14 +1247,32 @@ unsigned long clk_get_rate(struct clk *clk) if (clk && (clk->flags & CLK_GET_RATE_NOCACHE)) __clk_recalc_rates(clk, 0); - rate = __clk_get_rate(clk); + rate = clk_core_get_rate_nolock(clk); clk_prepare_unlock(); return rate; } +EXPORT_SYMBOL_GPL(clk_core_get_rate); + +/** + * clk_get_rate - return the rate of clk + * @clk: the clk whose rate is being returned + * + * Simply returns the cached rate of the clk, unless CLK_GET_RATE_NOCACHE flag + * is set, which means a recalc_rate will be issued. + * If clk is NULL then returns 0. + */ +unsigned long clk_get_rate(struct clk *clk) +{ + if (!clk) + return 0; + + return clk_core_get_rate(clk->core); +} EXPORT_SYMBOL_GPL(clk_get_rate); -static int clk_fetch_parent_index(struct clk *clk, struct clk *parent) +static int clk_fetch_parent_index(struct clk_core *clk, + struct clk_core *parent) { int i; @@ -1176,7 +1286,7 @@ static int clk_fetch_parent_index(struct clk *clk, struct clk *parent) /* * find index of new parent clock using cached parent ptrs, * or if not yet cached, use string name comparison and cache - * them now to avoid future calls to __clk_lookup. 
+ * them now to avoid future calls to clk_core_lookup. */ for (i = 0; i < clk->num_parents; i++) { if (clk->parents[i] == parent) @@ -1186,7 +1296,7 @@ static int clk_fetch_parent_index(struct clk *clk, struct clk *parent) continue; if (!strcmp(clk->parent_names[i], parent->name)) { - clk->parents[i] = __clk_lookup(parent->name); + clk->parents[i] = clk_core_lookup(parent->name); return i; } } @@ -1194,7 +1304,7 @@ static int clk_fetch_parent_index(struct clk *clk, struct clk *parent) return -EINVAL; } -static void clk_reparent(struct clk *clk, struct clk *new_parent) +static void clk_reparent(struct clk_core *clk, struct clk_core *new_parent) { hlist_del(&clk->child_node); @@ -1211,10 +1321,11 @@ static void clk_reparent(struct clk *clk, struct clk *new_parent) clk->parent = new_parent; } -static struct clk *__clk_set_parent_before(struct clk *clk, struct clk *parent) +static struct clk_core *__clk_set_parent_before(struct clk_core *clk, + struct clk_core *parent) { unsigned long flags; - struct clk *old_parent = clk->parent; + struct clk_core *old_parent = clk->parent; /* * Migrate prepare state between parents and prevent race with @@ -1234,9 +1345,9 @@ static struct clk *__clk_set_parent_before(struct clk *clk, struct clk *parent) * See also: Comment for clk_set_parent() below. */ if (clk->prepare_count) { - __clk_prepare(parent); - clk_enable(parent); - clk_enable(clk); + clk_core_prepare(parent); + clk_core_enable(parent); + clk_core_enable(clk); } /* update the clk tree topology */ @@ -1247,25 +1358,27 @@ static struct clk *__clk_set_parent_before(struct clk *clk, struct clk *parent) return old_parent; } -static void __clk_set_parent_after(struct clk *clk, struct clk *parent, - struct clk *old_parent) +static void __clk_set_parent_after(struct clk_core *core, + struct clk_core *parent, + struct clk_core *old_parent) { /* * Finish the migration of prepare state and undo the changes done * for preventing a race with clk_enable(). */ - if (clk->prepare_count) { - clk_disable(clk); - clk_disable(old_parent); - __clk_unprepare(old_parent); + if (core->prepare_count) { + clk_core_disable(core); + clk_core_disable(old_parent); + clk_core_unprepare(old_parent); } } -static int __clk_set_parent(struct clk *clk, struct clk *parent, u8 p_index) +static int __clk_set_parent(struct clk_core *clk, struct clk_core *parent, + u8 p_index) { unsigned long flags; int ret = 0; - struct clk *old_parent; + struct clk_core *old_parent; old_parent = __clk_set_parent_before(clk, parent); @@ -1279,9 +1392,9 @@ static int __clk_set_parent(struct clk *clk, struct clk *parent, u8 p_index) clk_enable_unlock(flags); if (clk->prepare_count) { - clk_disable(clk); - clk_disable(parent); - __clk_unprepare(parent); + clk_core_disable(clk); + clk_core_disable(parent); + clk_core_unprepare(parent); } return ret; } @@ -1307,9 +1420,10 @@ static int __clk_set_parent(struct clk *clk, struct clk *parent, u8 p_index) * * Caller must hold prepare_lock. 
*/ -static int __clk_speculate_rates(struct clk *clk, unsigned long parent_rate) +static int __clk_speculate_rates(struct clk_core *clk, + unsigned long parent_rate) { - struct clk *child; + struct clk_core *child; unsigned long new_rate; int ret = NOTIFY_DONE; @@ -1335,10 +1449,10 @@ out: return ret; } -static void clk_calc_subtree(struct clk *clk, unsigned long new_rate, - struct clk *new_parent, u8 p_index) +static void clk_calc_subtree(struct clk_core *clk, unsigned long new_rate, + struct clk_core *new_parent, u8 p_index) { - struct clk *child; + struct clk_core *child; clk->new_rate = new_rate; clk->new_parent = new_parent; @@ -1358,10 +1472,11 @@ static void clk_calc_subtree(struct clk *clk, unsigned long new_rate, * calculate the new rates returning the topmost clock that has to be * changed. */ -static struct clk *clk_calc_new_rates(struct clk *clk, unsigned long rate) +static struct clk_core *clk_calc_new_rates(struct clk_core *clk, + unsigned long rate) { - struct clk *top = clk; - struct clk *old_parent, *parent; + struct clk_core *top = clk; + struct clk_core *old_parent, *parent; struct clk_hw *parent_hw; unsigned long best_parent_rate = 0; unsigned long new_rate; @@ -1382,7 +1497,7 @@ static struct clk *clk_calc_new_rates(struct clk *clk, unsigned long rate) new_rate = clk->ops->determine_rate(clk->hw, rate, &best_parent_rate, &parent_hw); - parent = parent_hw ? parent_hw->clk : NULL; + parent = parent_hw ? parent_hw->core : NULL; } else if (clk->ops->round_rate) { new_rate = clk->ops->round_rate(clk->hw, rate, &best_parent_rate); @@ -1430,9 +1545,10 @@ out: * so that in case of an error we can walk down the whole tree again and * abort the change. */ -static struct clk *clk_propagate_rate_change(struct clk *clk, unsigned long event) +static struct clk_core *clk_propagate_rate_change(struct clk_core *clk, + unsigned long event) { - struct clk *child, *tmp_clk, *fail_clk = NULL; + struct clk_core *child, *tmp_clk, *fail_clk = NULL; int ret = NOTIFY_DONE; if (clk->rate == clk->new_rate) @@ -1467,14 +1583,14 @@ static struct clk *clk_propagate_rate_change(struct clk *clk, unsigned long even * walk down a subtree and set the new rates notifying the rate * change on the way */ -static void clk_change_rate(struct clk *clk) +static void clk_change_rate(struct clk_core *clk) { - struct clk *child; + struct clk_core *child; struct hlist_node *tmp; unsigned long old_rate; unsigned long best_parent_rate = 0; bool skip_set_rate = false; - struct clk *old_parent; + struct clk_core *old_parent; old_rate = clk->rate; @@ -1545,7 +1661,7 @@ static void clk_change_rate(struct clk *clk) */ int clk_set_rate(struct clk *clk, unsigned long rate) { - struct clk *top, *fail_clk; + struct clk_core *top, *fail_clk; int ret = 0; if (!clk) @@ -1558,13 +1674,14 @@ int clk_set_rate(struct clk *clk, unsigned long rate) if (rate == clk_get_rate(clk)) goto out; - if ((clk->flags & CLK_SET_RATE_GATE) && clk->prepare_count) { + if ((clk->core->flags & CLK_SET_RATE_GATE) && + clk->core->prepare_count) { ret = -EBUSY; goto out; } /* calculate new rates and get the topmost changed clock */ - top = clk_calc_new_rates(clk, rate); + top = clk_calc_new_rates(clk->core, rate); if (!top) { ret = -EINVAL; goto out; @@ -1615,11 +1732,11 @@ EXPORT_SYMBOL_GPL(clk_get_parent); * * For single-parent clocks without .get_parent, first check to see if the * .parents array exists, and if so use it to avoid an expensive tree - * traversal. If .parents does not exist then walk the tree with __clk_lookup. + * traversal. 
If .parents does not exist then walk the tree. */ -static struct clk *__clk_init_parent(struct clk *clk) +static struct clk_core *__clk_init_parent(struct clk_core *clk) { - struct clk *ret = NULL; + struct clk_core *ret = NULL; u8 index; /* handle the trivial cases */ @@ -1629,7 +1746,7 @@ static struct clk *__clk_init_parent(struct clk *clk) if (clk->num_parents == 1) { if (IS_ERR_OR_NULL(clk->parent)) - clk->parent = __clk_lookup(clk->parent_names[0]); + clk->parent = clk_core_lookup(clk->parent_names[0]); ret = clk->parent; goto out; } @@ -1643,8 +1760,8 @@ static struct clk *__clk_init_parent(struct clk *clk) /* * Do our best to cache parent clocks in clk->parents. This prevents - * unnecessary and expensive calls to __clk_lookup. We don't set - * clk->parent here; that is done by the calling function + * unnecessary and expensive lookups. We don't set clk->parent here; + * that is done by the calling function. */ index = clk->ops->get_parent(clk->hw); @@ -1654,13 +1771,14 @@ static struct clk *__clk_init_parent(struct clk *clk) kcalloc(clk->num_parents, sizeof(struct clk *), GFP_KERNEL); - ret = clk_get_parent_by_index(clk, index); + ret = clk_core_get_parent_by_index(clk, index); out: return ret; } -void __clk_reparent(struct clk *clk, struct clk *new_parent) +static void clk_core_reparent(struct clk_core *clk, + struct clk_core *new_parent) { clk_reparent(clk, new_parent); __clk_recalc_accuracies(clk); @@ -1679,42 +1797,29 @@ void __clk_reparent(struct clk *clk, struct clk *new_parent) */ bool clk_has_parent(struct clk *clk, struct clk *parent) { + struct clk_core *core, *parent_core; unsigned int i; /* NULL clocks should be nops, so return success if either is NULL. */ if (!clk || !parent) return true; + core = clk->core; + parent_core = parent->core; + /* Optimize for the case where the parent is already the parent. */ - if (clk->parent == parent) + if (core->parent == parent_core) return true; - for (i = 0; i < clk->num_parents; i++) - if (strcmp(clk->parent_names[i], parent->name) == 0) + for (i = 0; i < core->num_parents; i++) + if (strcmp(core->parent_names[i], parent_core->name) == 0) return true; return false; } EXPORT_SYMBOL_GPL(clk_has_parent); -/** - * clk_set_parent - switch the parent of a mux clk - * @clk: the mux clk whose input we are switching - * @parent: the new input to clk - * - * Re-parent clk to use parent as its new input source. If clk is in - * prepared state, the clk will get enabled for the duration of this call. If - * that's not acceptable for a specific clk (Eg: the consumer can't handle - * that, the reparenting is glitchy in hardware, etc), use the - * CLK_SET_PARENT_GATE flag to allow reparenting only when clk is unprepared. - * - * After successfully changing clk's parent clk_set_parent will update the - * clk topology, sysfs topology and propagate rate recalculation via - * __clk_recalc_rates. - * - * Returns 0 on success, -EERROR otherwise. - */ -int clk_set_parent(struct clk *clk, struct clk *parent) +static int clk_core_set_parent(struct clk_core *clk, struct clk_core *parent) { int ret = 0; int p_index = 0; @@ -1774,6 +1879,31 @@ out: return ret; } + +/** + * clk_set_parent - switch the parent of a mux clk + * @clk: the mux clk whose input we are switching + * @parent: the new input to clk + * + * Re-parent clk to use parent as its new input source. If clk is in + * prepared state, the clk will get enabled for the duration of this call. 
If + * that's not acceptable for a specific clk (Eg: the consumer can't handle + * that, the reparenting is glitchy in hardware, etc), use the + * CLK_SET_PARENT_GATE flag to allow reparenting only when clk is unprepared. + * + * After successfully changing clk's parent clk_set_parent will update the + * clk topology, sysfs topology and propagate rate recalculation via + * __clk_recalc_rates. + * + * Returns 0 on success, -EERROR otherwise. + */ +int clk_set_parent(struct clk *clk, struct clk *parent) +{ + if (!clk) + return 0; + + return clk_core_set_parent(clk->core, parent ? parent->core : NULL); +} EXPORT_SYMBOL_GPL(clk_set_parent); /** @@ -1810,13 +1940,13 @@ int clk_set_phase(struct clk *clk, int degrees) clk_prepare_lock(); - if (!clk->ops->set_phase) + if (!clk->core->ops->set_phase) goto out_unlock; - ret = clk->ops->set_phase(clk->hw, degrees); + ret = clk->core->ops->set_phase(clk->core->hw, degrees); if (!ret) - clk->phase = degrees; + clk->core->phase = degrees; out_unlock: clk_prepare_unlock(); @@ -1826,14 +1956,7 @@ out: } EXPORT_SYMBOL_GPL(clk_set_phase); -/** - * clk_get_phase - return the phase shift of a clock signal - * @clk: clock signal source - * - * Returns the phase shift of a clock node in degrees, otherwise returns - * -EERROR. - */ -int clk_get_phase(struct clk *clk) +static int clk_core_get_phase(struct clk_core *clk) { int ret = 0; @@ -1849,27 +1972,45 @@ out: } EXPORT_SYMBOL_GPL(clk_get_phase); +/** + * clk_get_phase - return the phase shift of a clock signal + * @clk: clock signal source + * + * Returns the phase shift of a clock node in degrees, otherwise returns + * -EERROR. + */ +int clk_get_phase(struct clk *clk) +{ + if (!clk) + return 0; + + return clk_core_get_phase(clk->core); +} + /** * __clk_init - initialize the data structures in a struct clk * @dev: device initializing this clk, placeholder for now * @clk: clk being initialized * - * Initializes the lists in struct clk, queries the hardware for the + * Initializes the lists in struct clk_core, queries the hardware for the * parent and rate and sets them both. */ -int __clk_init(struct device *dev, struct clk *clk) +int __clk_init(struct device *dev, struct clk *clk_user) { int i, ret = 0; - struct clk *orphan; + struct clk_core *orphan; struct hlist_node *tmp2; + struct clk_core *clk; - if (!clk) + if (!clk_user) return -EINVAL; + clk = clk_user->core; + clk_prepare_lock(); /* check to see if a clock with this name is already registered */ - if (__clk_lookup(clk->name)) { + if (clk_core_lookup(clk->name)) { pr_debug("%s: clk %s already initialized\n", __func__, clk->name); ret = -EEXIST; @@ -1921,7 +2062,7 @@ int __clk_init(struct device *dev, struct clk *clk) clk->parents = kcalloc(clk->num_parents, sizeof(struct clk *), GFP_KERNEL); /* - * __clk_lookup returns NULL for parents that have not been + * clk_core_lookup returns NULL for parents that have not been * clk_init'd; thus any access to clk->parents[] must check * for a NULL pointer. We can always perform lazy lookups for * missing parents later on. 
@@ -1929,7 +2070,7 @@ int __clk_init(struct device *dev, struct clk *clk) if (clk->parents) for (i = 0; i < clk->num_parents; i++) clk->parents[i] = - __clk_lookup(clk->parent_names[i]); + clk_core_lookup(clk->parent_names[i]); } clk->parent = __clk_init_parent(clk); @@ -1985,7 +2126,7 @@ int __clk_init(struct device *dev, struct clk *clk) */ if (clk->ops->recalc_rate) clk->rate = clk->ops->recalc_rate(clk->hw, - __clk_get_rate(clk->parent)); + clk_core_get_rate_nolock(clk->parent)); else if (clk->parent) clk->rate = clk->parent->rate; else @@ -1999,13 +2140,13 @@ int __clk_init(struct device *dev, struct clk *clk) if (orphan->num_parents && orphan->ops->get_parent) { i = orphan->ops->get_parent(orphan->hw); if (!strcmp(clk->name, orphan->parent_names[i])) - __clk_reparent(orphan, clk); + clk_core_reparent(orphan, clk); continue; } for (i = 0; i < orphan->num_parents; i++) if (!strcmp(clk->name, orphan->parent_names[i])) { - __clk_reparent(orphan, clk); + clk_core_reparent(orphan, clk); break; } } @@ -2031,6 +2172,26 @@ out: return ret; } +struct clk *__clk_create_clk(struct clk_hw *hw, const char *dev_id, + const char *con_id) +{ + struct clk *clk; + + /* This is to allow this function to be chained to others */ + if (!hw || IS_ERR(hw)) + return (struct clk *) hw; + + clk = kzalloc(sizeof(*clk), GFP_KERNEL); + if (!clk) + return ERR_PTR(-ENOMEM); + + clk->core = hw->core; + clk->dev_id = dev_id; + clk->con_id = con_id; + + return clk; +} + /** * clk_register - allocate a new clock, register it and return an opaque cookie * @dev: device that is registering this clock @@ -2045,7 +2206,7 @@ out: struct clk *clk_register(struct device *dev, struct clk_hw *hw) { int i, ret; - struct clk *clk; + struct clk_core *clk; clk = kzalloc(sizeof(*clk), GFP_KERNEL); if (!clk) { @@ -2066,7 +2227,7 @@ struct clk *clk_register(struct device *dev, struct clk_hw *hw) clk->hw = hw; clk->flags = hw->init->flags; clk->num_parents = hw->init->num_parents; - hw->clk = clk; + hw->core = clk; /* allocate local copy in case parent_names is __initdata */ clk->parent_names = kcalloc(clk->num_parents, sizeof(char *), @@ -2090,10 +2251,19 @@ struct clk *clk_register(struct device *dev, struct clk_hw *hw) } } - ret = __clk_init(dev, clk); + hw->clk = __clk_create_clk(hw, NULL, NULL); + if (IS_ERR(hw->clk)) { + pr_err("%s: could not allocate per-user clk\n", __func__); + ret = PTR_ERR(hw->clk); + goto fail_parent_names_copy; + } + + ret = __clk_init(dev, hw->clk); if (!ret) - return clk; + return hw->clk; + kfree(hw->clk); + hw->clk = NULL; fail_parent_names_copy: while (--i >= 0) kfree(clk->parent_names[i]); @@ -2113,7 +2283,7 @@ EXPORT_SYMBOL_GPL(clk_register); */ static void __clk_release(struct kref *ref) { - struct clk *clk = container_of(ref, struct clk, ref); + struct clk_core *clk = container_of(ref, struct clk_core, ref); int i = clk->num_parents; kfree(clk->parents); @@ -2171,12 +2341,13 @@ void clk_unregister(struct clk *clk) if (!clk || WARN_ON_ONCE(IS_ERR(clk))) return; - clk_debug_unregister(clk); + clk_debug_unregister(clk->core); clk_prepare_lock(); - if (clk->ops == &clk_nodrv_ops) { - pr_err("%s: unregistered clock: %s\n", __func__, clk->name); + if (clk->core->ops == &clk_nodrv_ops) { + pr_err("%s: unregistered clock: %s\n", __func__, + clk->core->name); return; } /* @@ -2184,24 +2355,25 @@ void clk_unregister(struct clk *clk) * a reference to this clock. 
*/ flags = clk_enable_lock(); - clk->ops = &clk_nodrv_ops; + clk->core->ops = &clk_nodrv_ops; clk_enable_unlock(flags); - if (!hlist_empty(&clk->children)) { - struct clk *child; + if (!hlist_empty(&clk->core->children)) { + struct clk_core *child; struct hlist_node *t; /* Reparent all children to the orphan list. */ - hlist_for_each_entry_safe(child, t, &clk->children, child_node) - clk_set_parent(child, NULL); + hlist_for_each_entry_safe(child, t, &clk->core->children, + child_node) + clk_core_set_parent(child, NULL); } - hlist_del_init(&clk->child_node); + hlist_del_init(&clk->core->child_node); - if (clk->prepare_count) + if (clk->core->prepare_count) pr_warn("%s: unregistering prepared clock: %s\n", - __func__, clk->name); - kref_put(&clk->ref, __clk_release); + __func__, clk->core->name); + kref_put(&clk->core->ref, __clk_release); clk_prepare_unlock(); } @@ -2269,30 +2441,39 @@ EXPORT_SYMBOL_GPL(devm_clk_unregister); */ int __clk_get(struct clk *clk) { - if (clk) { - if (!try_module_get(clk->owner)) + struct clk_core *core = !clk ? NULL : clk->core; + + if (core) { + if (!try_module_get(core->owner)) return 0; - kref_get(&clk->ref); + kref_get(&core->ref); } return 1; } -void __clk_put(struct clk *clk) +static void clk_core_put(struct clk_core *core) { struct module *owner; - if (!clk || WARN_ON_ONCE(IS_ERR(clk))) - return; + owner = core->owner; clk_prepare_lock(); - owner = clk->owner; - kref_put(&clk->ref, __clk_release); + kref_put(&core->ref, __clk_release); clk_prepare_unlock(); module_put(owner); } +void __clk_put(struct clk *clk) +{ + if (!clk || WARN_ON_ONCE(IS_ERR(clk))) + return; + + clk_core_put(clk->core); + kfree(clk); +} + /*** clk rate change notifiers ***/ /** @@ -2345,7 +2526,7 @@ int clk_notifier_register(struct clk *clk, struct notifier_block *nb) ret = srcu_notifier_chain_register(&cn->notifier_head, nb); - clk->notifier_count++; + clk->core->notifier_count++; out: clk_prepare_unlock(); @@ -2382,7 +2563,7 @@ int clk_notifier_unregister(struct clk *clk, struct notifier_block *nb) if (cn->clk == clk) { ret = srcu_notifier_chain_unregister(&cn->notifier_head, nb); - clk->notifier_count--; + clk->core->notifier_count--; /* XXX the notifier code should handle this better */ if (!cn->notifier_head.head) { diff --git a/drivers/clk/clk.h b/drivers/clk/clk.h index c798138f023f..23c44e51df69 100644 --- a/drivers/clk/clk.h +++ b/drivers/clk/clk.h @@ -9,9 +9,14 @@ * published by the Free Software Foundation. 
*/ +struct clk_hw; + #if defined(CONFIG_OF) && defined(CONFIG_COMMON_CLK) struct clk *of_clk_get_by_clkspec(struct of_phandle_args *clkspec); struct clk *__of_clk_get_from_provider(struct of_phandle_args *clkspec); void of_clk_lock(void); void of_clk_unlock(void); #endif + +struct clk *__clk_create_clk(struct clk_hw *hw, const char *dev_id, + const char *con_id); diff --git a/drivers/clk/clkdev.c b/drivers/clk/clkdev.c index da4bda8b7fc7..901d242f38ea 100644 --- a/drivers/clk/clkdev.c +++ b/drivers/clk/clkdev.c @@ -19,6 +19,7 @@ #include <linux/mutex.h> #include <linux/clk.h> #include <linux/clkdev.h> +#include <linux/clk-provider.h> #include <linux/of.h> #include "clk.h" @@ -53,7 +54,7 @@ struct clk *of_clk_get_by_clkspec(struct of_phandle_args *clkspec) return clk; } -struct clk *of_clk_get(struct device_node *np, int index) +static struct clk *__of_clk_get(struct device_node *np, int index) { struct of_phandle_args clkspec; struct clk *clk; @@ -69,20 +70,24 @@ struct clk *of_clk_get(struct device_node *np, int index) clk = of_clk_get_by_clkspec(&clkspec); of_node_put(clkspec.np); + + return clk; +} + +struct clk *of_clk_get(struct device_node *np, int index) +{ + struct clk *clk = __of_clk_get(np, index); + + if (!IS_ERR(clk)) + clk = __clk_create_clk(__clk_get_hw(clk), np->full_name, NULL); + return clk; } EXPORT_SYMBOL(of_clk_get); -/** - * of_clk_get_by_name() - Parse and lookup a clock referenced by a device node - * @np: pointer to clock consumer node - * @name: name of consumer's clock input, or NULL for the first clock reference - * - * This function parses the clocks and clock-names properties, - * and uses them to look up the struct clk from the registered list of clock - * providers. - */ -struct clk *of_clk_get_by_name(struct device_node *np, const char *name) +static struct clk *__of_clk_get_by_name(struct device_node *np, + const char *dev_id, + const char *name) { struct clk *clk = ERR_PTR(-ENOENT); @@ -97,9 +102,11 @@ */ if (name) index = of_property_match_string(np, "clock-names", name); - clk = of_clk_get(np, index); - if (!IS_ERR(clk)) + clk = __of_clk_get(np, index); + if (!IS_ERR(clk)) { + clk = __clk_create_clk(__clk_get_hw(clk), dev_id, name); break; + } else if (name && index >= 0) { if (PTR_ERR(clk) != -EPROBE_DEFER) pr_err("ERROR: could not get clock %s:%s(%i)\n", @@ -119,7 +126,33 @@ return clk; }
+ */ +struct clk *of_clk_get_by_name(struct device_node *np, const char *name) +{ + if (!np) + return ERR_PTR(-ENOENT); + + return __of_clk_get_by_name(np, np->full_name, name); +} EXPORT_SYMBOL(of_clk_get_by_name); + +#else /* defined(CONFIG_OF) && defined(CONFIG_COMMON_CLK) */ + +static struct clk *__of_clk_get_by_name(struct device_node *np, + const char *dev_id, + const char *name) +{ + return ERR_PTR(-ENOENT); +} #endif /* @@ -168,14 +201,29 @@ static struct clk_lookup *clk_find(const char *dev_id, const char *con_id) struct clk *clk_get_sys(const char *dev_id, const char *con_id) { struct clk_lookup *cl; + struct clk *clk = NULL; mutex_lock(&clocks_mutex); + cl = clk_find(dev_id, con_id); - if (cl && !__clk_get(cl->clk)) + if (!cl) + goto out; + + if (!__clk_get(cl->clk)) { cl = NULL; + goto out; + } + +#if defined(CONFIG_COMMON_CLK) + clk = __clk_create_clk(__clk_get_hw(cl->clk), dev_id, con_id); +#else + clk = cl->clk; +#endif + +out: mutex_unlock(&clocks_mutex); - return cl ? cl->clk : ERR_PTR(-ENOENT); + return cl ? clk : ERR_PTR(-ENOENT); } EXPORT_SYMBOL(clk_get_sys); @@ -185,10 +233,8 @@ struct clk *clk_get(struct device *dev, const char *con_id) struct clk *clk; if (dev) { - clk = of_clk_get_by_name(dev->of_node, con_id); - if (!IS_ERR(clk)) - return clk; - if (PTR_ERR(clk) == -EPROBE_DEFER) + clk = __of_clk_get_by_name(dev->of_node, dev_id, con_id); + if (!IS_ERR(clk) || PTR_ERR(clk) == -EPROBE_DEFER) return clk; } diff --git a/include/linux/clk-private.h b/include/linux/clk-private.h index c5f40d07686c..ae55d99a2313 100644 --- a/include/linux/clk-private.h +++ b/include/linux/clk-private.h @@ -28,20 +28,20 @@ struct module; -struct clk { +struct clk_core { const char *name; const struct clk_ops *ops; struct clk_hw *hw; struct module *owner; - struct clk *parent; + struct clk_core *parent; const char **parent_names; - struct clk **parents; + struct clk_core **parents; u8 num_parents; u8 new_parent_index; unsigned long rate; unsigned long new_rate; - struct clk *new_parent; - struct clk *new_child; + struct clk_core *new_parent; + struct clk_core *new_child; unsigned long flags; unsigned int enable_count; unsigned int prepare_count; @@ -57,6 +57,12 @@ struct clk { struct kref ref; }; +struct clk { + struct clk_core *core; + const char *dev_id; + const char *con_id; +}; + /* * DOC: Basic clock implementations common to many platforms * @@ -69,6 +75,9 @@ struct clk { #define DEFINE_CLK(_name, _ops, _flags, _parent_names, \ _parents) \ static struct clk _name = { \ + .core = &_name##_core \ + }; \ + static struct clk_core _name##_core = { \ .name = #_name, \ .ops = &_ops, \ .hw = &_name##_hw.hw, \ @@ -81,9 +90,11 @@ struct clk { #define DEFINE_CLK_FIXED_RATE(_name, _flags, _rate, \ _fixed_rate_flags) \ static struct clk _name; \ + static struct clk_core _name##_core; \ static const char *_name##_parent_names[] = {}; \ static struct clk_fixed_rate _name##_hw = { \ .hw = { \ + .core = &_name##_core, \ .clk = &_name, \ }, \ .fixed_rate = _rate, \ @@ -96,14 +107,16 @@ struct clk { _flags, _reg, _bit_idx, \ _gate_flags, _lock) \ static struct clk _name; \ + static struct clk_core _name##_core; \ static const char *_name##_parent_names[] = { \ _parent_name, \ }; \ - static struct clk *_name##_parents[] = { \ + static struct clk_core *_name##_parents[] = { \ _parent_ptr, \ }; \ static struct clk_gate _name##_hw = { \ .hw = { \ + .core = &_name##_core, \ .clk = &_name, \ }, \ .reg = _reg, \ @@ -118,14 +131,16 @@ struct clk { _flags, _reg, _shift, _width, \ _divider_flags, _table, _lock) 
\ static struct clk _name; \ + static struct clk_core _name##_core; \ static const char *_name##_parent_names[] = { \ _parent_name, \ }; \ - static struct clk *_name##_parents[] = { \ + static struct clk_core *_name##_parents[] = { \ _parent_ptr, \ }; \ static struct clk_divider _name##_hw = { \ .hw = { \ + .core = &_name##_core, \ .clk = &_name, \ }, \ .reg = _reg, \ @@ -157,8 +172,10 @@ struct clk { _reg, _shift, _width, \ _mux_flags, _lock) \ static struct clk _name; \ + static struct clk_core _name##_core; \ static struct clk_mux _name##_hw = { \ .hw = { \ + .core = &_name##_core, \ .clk = &_name, \ }, \ .reg = _reg, \ @@ -174,14 +191,16 @@ struct clk { _parent_ptr, _flags, \ _mult, _div) \ static struct clk _name; \ + static struct clk_core _name##_core; \ static const char *_name##_parent_names[] = { \ _parent_name, \ }; \ - static struct clk *_name##_parents[] = { \ + static struct clk_core *_name##_parents[] = { \ _parent_ptr, \ }; \ static struct clk_fixed_factor _name##_hw = { \ .hw = { \ + .core = &_name##_core, \ .clk = &_name, \ }, \ .mult = _mult, \ diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 0ed5bf2209ad..12f13b0673af 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -33,6 +33,7 @@ #define CLK_GET_ACCURACY_NOCACHE BIT(8) /* do not use the cached clk accuracy */ struct clk_hw; +struct clk_core; struct dentry; /** @@ -216,13 +217,17 @@ struct clk_init_data { * clk_foo and then referenced by the struct clk instance that uses struct * clk_foo's clk_ops * - * @clk: pointer to the struct clk instance that points back to this struct - * clk_hw instance + * @core: pointer to the struct clk_core instance that points back to this + * struct clk_hw instance + * + * @clk: pointer to the per-user struct clk instance that can be used to call + * into the clk API * * @init: pointer to struct clk_init_data that contains the init data shared * with the common clock framework. */ struct clk_hw { + struct clk_core *core; struct clk *clk; const struct clk_init_data *init; }; @@ -577,9 +582,6 @@ long __clk_mux_determine_rate_closest(struct clk_hw *hw, unsigned long rate, /* * FIXME clock api without lock protection */ -int __clk_prepare(struct clk *clk); -void __clk_unprepare(struct clk *clk); -void __clk_reparent(struct clk *clk, struct clk *new_parent); unsigned long __clk_round_rate(struct clk *clk, unsigned long rate); struct of_device_id; -- cgit v1.2.3 From b09d6d99102504a929cfaba4cd0e07658d7f01d1 Mon Sep 17 00:00:00 2001 From: Michael Turquette Date: Thu, 29 Jan 2015 14:22:50 -0800 Subject: clk: remove clk-private.h Private clock framework data structures should be private, surprisingly. Now that all platforms and drivers have been updated to remove static initializations of struct clk and struct clk_core objects, and all references to clk-private.h have been removed, we can move the definitions of these structures into drivers/clk/clk.c and delete the header. Additionally, the ugly DEFINE_CLK macros have been removed. Those were used for static definitions of struct clk objects; that practice is no longer allowed. Finally, __clk_init is staticized, as it is no longer declared in any header.
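To illustrate the replacement idiom, here is a minimal, hypothetical sketch (not taken from any in-tree driver; the foo_* names, register offset, and gate bit are invented) of registering at probe time what DEFINE_CLK_GATE used to define statically:

#include <linux/clk-provider.h>
#include <linux/err.h>
#include <linux/io.h>
#include <linux/platform_device.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(foo_gate_lock);

static int foo_clk_probe(struct platform_device *pdev)
{
	struct resource *res;
	void __iomem *base;
	struct clk *clk;

	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
	base = devm_ioremap_resource(&pdev->dev, res);
	if (IS_ERR(base))
		return PTR_ERR(base);

	/*
	 * clk_register_gate() fills in a struct clk_init_data and calls
	 * clk_register(), which now allocates the backing struct clk_core
	 * itself; no clock data is initialized statically.
	 */
	clk = clk_register_gate(&pdev->dev, "foo_gate", "foo_parent", 0,
				base + 0x10, 3, 0, &foo_gate_lock);
	return PTR_ERR_OR_ZERO(clk);
}

Drivers that cannot use a basic-type helper build the struct clk_init_data themselves and point hw->init at it before calling clk_register().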
Reviewed-by: Stephen Boyd Signed-off-by: Michael Turquette --- drivers/clk/clk.c | 41 +++++++- include/linux/clk-private.h | 237 -------------------------------------------- 2 files changed, 39 insertions(+), 239 deletions(-) delete mode 100644 include/linux/clk-private.h (limited to 'include/linux') diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index d60c193b81aa..cdc1fa58e4f1 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -9,7 +9,7 @@ * Standard functionality for the common clock API. See Documentation/clk.txt */ -#include <linux/clk-private.h> +#include <linux/clk-provider.h> #include <linux/clk/clk-conf.h> #include <linux/module.h> #include <linux/mutex.h> @@ -46,6 +46,43 @@ static unsigned long clk_core_round_rate_nolock(struct clk_core *clk, unsigned long rate); static struct clk_core *clk_core_lookup(const char *name); +/*** private data structures ***/ + +struct clk_core { + const char *name; + const struct clk_ops *ops; + struct clk_hw *hw; + struct module *owner; + struct clk_core *parent; + const char **parent_names; + struct clk_core **parents; + u8 num_parents; + u8 new_parent_index; + unsigned long rate; + unsigned long new_rate; + struct clk_core *new_parent; + struct clk_core *new_child; + unsigned long flags; + unsigned int enable_count; + unsigned int prepare_count; + unsigned long accuracy; + int phase; + struct hlist_head children; + struct hlist_node child_node; + struct hlist_node debug_node; + unsigned int notifier_count; +#ifdef CONFIG_DEBUG_FS + struct dentry *dentry; +#endif + struct kref ref; +}; + +struct clk { + struct clk_core *core; + const char *dev_id; + const char *con_id; +}; + /*** locking ***/ static void clk_prepare_lock(void) { @@ -1995,7 +2032,7 @@ int clk_get_phase(struct clk *clk) * Initializes the lists in struct clk_core, queries the hardware for the * parent and rate and sets them both. */ -int __clk_init(struct device *dev, struct clk *clk_user) +static int __clk_init(struct device *dev, struct clk *clk_user) { int i, ret = 0; struct clk_core *orphan; diff --git a/include/linux/clk-private.h b/include/linux/clk-private.h deleted file mode 100644 index ae55d99a2313..000000000000 --- a/include/linux/clk-private.h +++ /dev/null @@ -1,237 +0,0 @@ -/* - * linux/include/linux/clk-private.h - * - * Copyright (c) 2010-2011 Jeremy Kerr - * Copyright (C) 2011-2012 Linaro Ltd - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#ifndef __LINUX_CLK_PRIVATE_H -#define __LINUX_CLK_PRIVATE_H - -#include <linux/clk-provider.h> -#include <linux/kref.h> -#include <linux/list.h> - -/* - * WARNING: Do not include clk-private.h from any file that implements struct - * clk_ops. Doing so is a layering violation! - * - * This header exists only to allow for statically initialized clock data. Any - * static clock data must be defined in a separate file from the logic that - * implements the clock operations for that same data.
- */ - -#ifdef CONFIG_COMMON_CLK - -struct module; - -struct clk_core { - const char *name; - const struct clk_ops *ops; - struct clk_hw *hw; - struct module *owner; - struct clk_core *parent; - const char **parent_names; - struct clk_core **parents; - u8 num_parents; - u8 new_parent_index; - unsigned long rate; - unsigned long new_rate; - struct clk_core *new_parent; - struct clk_core *new_child; - unsigned long flags; - unsigned int enable_count; - unsigned int prepare_count; - unsigned long accuracy; - int phase; - struct hlist_head children; - struct hlist_node child_node; - struct hlist_node debug_node; - unsigned int notifier_count; -#ifdef CONFIG_DEBUG_FS - struct dentry *dentry; -#endif - struct kref ref; -}; - -struct clk { - struct clk_core *core; - const char *dev_id; - const char *con_id; -}; - -/* - * DOC: Basic clock implementations common to many platforms - * - * Each basic clock hardware type is comprised of a structure describing the - * clock hardware, implementations of the relevant callbacks in struct clk_ops, - * unique flags for that hardware type, a registration function and an - * alternative macro for static initialization - */ - -#define DEFINE_CLK(_name, _ops, _flags, _parent_names, \ - _parents) \ - static struct clk _name = { \ - .core = &_name##_core \ - }; \ - static struct clk_core _name##_core = { \ - .name = #_name, \ - .ops = &_ops, \ - .hw = &_name##_hw.hw, \ - .parent_names = _parent_names, \ - .num_parents = ARRAY_SIZE(_parent_names), \ - .parents = _parents, \ - .flags = _flags | CLK_IS_BASIC, \ - } - -#define DEFINE_CLK_FIXED_RATE(_name, _flags, _rate, \ - _fixed_rate_flags) \ - static struct clk _name; \ - static struct clk_core _name##_core; \ - static const char *_name##_parent_names[] = {}; \ - static struct clk_fixed_rate _name##_hw = { \ - .hw = { \ - .core = &_name##_core, \ - .clk = &_name, \ - }, \ - .fixed_rate = _rate, \ - .flags = _fixed_rate_flags, \ - }; \ - DEFINE_CLK(_name, clk_fixed_rate_ops, _flags, \ - _name##_parent_names, NULL); - -#define DEFINE_CLK_GATE(_name, _parent_name, _parent_ptr, \ - _flags, _reg, _bit_idx, \ - _gate_flags, _lock) \ - static struct clk _name; \ - static struct clk_core _name##_core; \ - static const char *_name##_parent_names[] = { \ - _parent_name, \ - }; \ - static struct clk_core *_name##_parents[] = { \ - _parent_ptr, \ - }; \ - static struct clk_gate _name##_hw = { \ - .hw = { \ - .core = &_name##_core, \ - .clk = &_name, \ - }, \ - .reg = _reg, \ - .bit_idx = _bit_idx, \ - .flags = _gate_flags, \ - .lock = _lock, \ - }; \ - DEFINE_CLK(_name, clk_gate_ops, _flags, \ - _name##_parent_names, _name##_parents); - -#define _DEFINE_CLK_DIVIDER(_name, _parent_name, _parent_ptr, \ - _flags, _reg, _shift, _width, \ - _divider_flags, _table, _lock) \ - static struct clk _name; \ - static struct clk_core _name##_core; \ - static const char *_name##_parent_names[] = { \ - _parent_name, \ - }; \ - static struct clk_core *_name##_parents[] = { \ - _parent_ptr, \ - }; \ - static struct clk_divider _name##_hw = { \ - .hw = { \ - .core = &_name##_core, \ - .clk = &_name, \ - }, \ - .reg = _reg, \ - .shift = _shift, \ - .width = _width, \ - .flags = _divider_flags, \ - .table = _table, \ - .lock = _lock, \ - }; \ - DEFINE_CLK(_name, clk_divider_ops, _flags, \ - _name##_parent_names, _name##_parents); - -#define DEFINE_CLK_DIVIDER(_name, _parent_name, _parent_ptr, \ - _flags, _reg, _shift, _width, \ - _divider_flags, _lock) \ - _DEFINE_CLK_DIVIDER(_name, _parent_name, _parent_ptr, \ - _flags, _reg, _shift, _width, \ 
- _divider_flags, NULL, _lock) -#define DEFINE_CLK_DIVIDER_TABLE(_name, _parent_name, \ - _parent_ptr, _flags, _reg, \ - _shift, _width, _divider_flags, \ - _table, _lock) \ - _DEFINE_CLK_DIVIDER(_name, _parent_name, _parent_ptr, \ - _flags, _reg, _shift, _width, \ - _divider_flags, _table, _lock) \ - -#define DEFINE_CLK_MUX(_name, _parent_names, _parents, _flags, \ - _reg, _shift, _width, \ - _mux_flags, _lock) \ - static struct clk _name; \ - static struct clk_core _name##_core; \ - static struct clk_mux _name##_hw = { \ - .hw = { \ - .core = &_name##_core, \ - .clk = &_name, \ - }, \ - .reg = _reg, \ - .shift = _shift, \ - .mask = BIT(_width) - 1, \ - .flags = _mux_flags, \ - .lock = _lock, \ - }; \ - DEFINE_CLK(_name, clk_mux_ops, _flags, _parent_names, \ - _parents); - -#define DEFINE_CLK_FIXED_FACTOR(_name, _parent_name, \ - _parent_ptr, _flags, \ - _mult, _div) \ - static struct clk _name; \ - static struct clk_core _name##_core; \ - static const char *_name##_parent_names[] = { \ - _parent_name, \ - }; \ - static struct clk_core *_name##_parents[] = { \ - _parent_ptr, \ - }; \ - static struct clk_fixed_factor _name##_hw = { \ - .hw = { \ - .core = &_name##_core, \ - .clk = &_name, \ - }, \ - .mult = _mult, \ - .div = _div, \ - }; \ - DEFINE_CLK(_name, clk_fixed_factor_ops, _flags, \ - _name##_parent_names, _name##_parents); - -/** - * __clk_init - initialize the data structures in a struct clk - * @dev: device initializing this clk, placeholder for now - * @clk: clk being initialized - * - * Initializes the lists in struct clk, queries the hardware for the - * parent and rate and sets them both. - * - * Any struct clk passed into __clk_init must have the following members - * populated: - * .name - * .ops - * .hw - * .parent_names - * .num_parents - * .flags - * - * It is not necessary to call clk_register if __clk_init is used directly with - * statically initialized clock data. - * - * Returns 0 on success, otherwise an error code. - */ -int __clk_init(struct device *dev, struct clk *clk); - -#endif /* CONFIG_COMMON_CLK */ -#endif /* CLK_PRIVATE_H */ -- cgit v1.2.3 From 1c8e600440c7f5036bd9a94526d01e9c7cb68dca Mon Sep 17 00:00:00 2001 From: Tomeu Vizoso Date: Fri, 23 Jan 2015 12:03:31 +0100 Subject: clk: Add rate constraints to clocks Adds a way for clock consumers to set maximum and minimum rates. This can be used by thermal drivers to set maximum rates, or by misc. drivers to set minimum rates to assure a minimum performance level. Changes the signature of the determine_rate callback by adding the parameters min_rate and max_rate.
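As a consumer-side illustration, a minimal sketch (the foo_thermal_throttle() hook and its device binding are hypothetical) of capping a clock with the constraint calls this patch adds to the clk API:

#include <linux/clk.h>
#include <linux/device.h>
#include <linux/err.h>

/* Hypothetical cooling hook: narrow one consumer's allowed rate window. */
static int foo_thermal_throttle(struct device *dev, unsigned long ceiling)
{
	struct clk *clk;
	int ret;

	clk = clk_get(dev, NULL);
	if (IS_ERR(clk))
		return PTR_ERR(clk);

	/*
	 * Constraints are per-user: only this struct clk's window is
	 * narrowed, and the framework intersects the windows of all
	 * users of the underlying clk_core before picking a rate.
	 */
	ret = clk_set_max_rate(clk, ceiling);

	clk_put(clk);
	return ret;
}

A floor is requested the same way with clk_set_min_rate(), or both bounds at once with clk_set_rate_range(); the aggregation happens in clk_core_get_boundaries() below.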
Signed-off-by: Tomeu Vizoso Signed-off-by: Stephen Boyd [sboyd@codeaurora.org: set req_rate in __clk_init] Signed-off-by: Michael Turquette [mturquette@linaro.org: min/max rate for sun6i_ahb1_clk_determine_rate migrated clk-private.h changes to clk.c] --- Documentation/clk.txt | 2 + arch/arm/mach-omap2/dpll3xxx.c | 2 + arch/arm/mach-omap2/dpll44xx.c | 2 + arch/mips/alchemy/common/clock.c | 8 ++ drivers/clk/at91/clk-programmable.c | 2 + drivers/clk/bcm/clk-kona.c | 2 + drivers/clk/clk-composite.c | 9 +- drivers/clk/clk.c | 272 +++++++++++++++++++++++++++++------- drivers/clk/hisilicon/clk-hi3620.c | 2 + drivers/clk/mmp/clk-mix.c | 2 + drivers/clk/qcom/clk-pll.c | 1 + drivers/clk/qcom/clk-rcg.c | 10 +- drivers/clk/qcom/clk-rcg2.c | 6 + drivers/clk/sunxi/clk-factors.c | 2 + drivers/clk/sunxi/clk-sun6i-ar100.c | 2 + drivers/clk/sunxi/clk-sunxi.c | 2 + include/linux/clk-provider.h | 17 ++- include/linux/clk.h | 28 ++++ include/linux/clk/ti.h | 4 + 19 files changed, 314 insertions(+), 61 deletions(-) (limited to 'include/linux') diff --git a/Documentation/clk.txt b/Documentation/clk.txt index 4ff84623d5e1..0e4f90aa1c13 100644 --- a/Documentation/clk.txt +++ b/Documentation/clk.txt @@ -73,6 +73,8 @@ the operations defined in clk.h: unsigned long *parent_rate); long (*determine_rate)(struct clk_hw *hw, unsigned long rate, + unsigned long min_rate, + unsigned long max_rate, unsigned long *best_parent_rate, struct clk_hw **best_parent_clk); int (*set_parent)(struct clk_hw *hw, u8 index); diff --git a/arch/arm/mach-omap2/dpll3xxx.c b/arch/arm/mach-omap2/dpll3xxx.c index 49752d77f5bc..44e57ec225d4 100644 --- a/arch/arm/mach-omap2/dpll3xxx.c +++ b/arch/arm/mach-omap2/dpll3xxx.c @@ -473,6 +473,8 @@ void omap3_noncore_dpll_disable(struct clk_hw *hw) * in failure. */ long omap3_noncore_dpll_determine_rate(struct clk_hw *hw, unsigned long rate, + unsigned long min_rate, + unsigned long max_rate, unsigned long *best_parent_rate, struct clk_hw **best_parent_clk) { diff --git a/arch/arm/mach-omap2/dpll44xx.c b/arch/arm/mach-omap2/dpll44xx.c index 0e58e5a85d53..acacb900a58b 100644 --- a/arch/arm/mach-omap2/dpll44xx.c +++ b/arch/arm/mach-omap2/dpll44xx.c @@ -222,6 +222,8 @@ out: * in failure. 
*/ long omap4_dpll_regm4xen_determine_rate(struct clk_hw *hw, unsigned long rate, + unsigned long min_rate, + unsigned long max_rate, unsigned long *best_parent_rate, struct clk_hw **best_parent_clk) { diff --git a/arch/mips/alchemy/common/clock.c b/arch/mips/alchemy/common/clock.c index 48a9dfc55b51..4e65404b3ba3 100644 --- a/arch/mips/alchemy/common/clock.c +++ b/arch/mips/alchemy/common/clock.c @@ -373,6 +373,8 @@ static long alchemy_calc_div(unsigned long rate, unsigned long prate, } static long alchemy_clk_fgcs_detr(struct clk_hw *hw, unsigned long rate, + unsigned long min_rate, + unsigned long max_rate, unsigned long *best_parent_rate, struct clk_hw **best_parent_clk, int scale, int maxdiv) @@ -546,6 +548,8 @@ static unsigned long alchemy_clk_fgv1_recalc(struct clk_hw *hw, } static long alchemy_clk_fgv1_detr(struct clk_hw *hw, unsigned long rate, + unsigned long min_rate, + unsigned long max_rate, unsigned long *best_parent_rate, struct clk_hw **best_parent_clk) { @@ -678,6 +682,8 @@ static unsigned long alchemy_clk_fgv2_recalc(struct clk_hw *hw, } static long alchemy_clk_fgv2_detr(struct clk_hw *hw, unsigned long rate, + unsigned long min_rate, + unsigned long max_rate, unsigned long *best_parent_rate, struct clk_hw **best_parent_clk) { @@ -897,6 +903,8 @@ static int alchemy_clk_csrc_setr(struct clk_hw *hw, unsigned long rate, } static long alchemy_clk_csrc_detr(struct clk_hw *hw, unsigned long rate, + unsigned long min_rate, + unsigned long max_rate, unsigned long *best_parent_rate, struct clk_hw **best_parent_clk) { diff --git a/drivers/clk/at91/clk-programmable.c b/drivers/clk/at91/clk-programmable.c index bbdb1b985c91..86c8a073dcc3 100644 --- a/drivers/clk/at91/clk-programmable.c +++ b/drivers/clk/at91/clk-programmable.c @@ -56,6 +56,8 @@ static unsigned long clk_programmable_recalc_rate(struct clk_hw *hw, static long clk_programmable_determine_rate(struct clk_hw *hw, unsigned long rate, + unsigned long min_rate, + unsigned long max_rate, unsigned long *best_parent_rate, struct clk_hw **best_parent_hw) { diff --git a/drivers/clk/bcm/clk-kona.c b/drivers/clk/bcm/clk-kona.c index 1c06f6f3a8c5..05abae89262e 100644 --- a/drivers/clk/bcm/clk-kona.c +++ b/drivers/clk/bcm/clk-kona.c @@ -1032,6 +1032,8 @@ static long kona_peri_clk_round_rate(struct clk_hw *hw, unsigned long rate, } static long kona_peri_clk_determine_rate(struct clk_hw *hw, unsigned long rate, + unsigned long min_rate, + unsigned long max_rate, unsigned long *best_parent_rate, struct clk_hw **best_parent) { struct kona_clk *bcm_clk = to_kona_clk(hw); diff --git a/drivers/clk/clk-composite.c b/drivers/clk/clk-composite.c index 4386697236a7..dee81b83c4b3 100644 --- a/drivers/clk/clk-composite.c +++ b/drivers/clk/clk-composite.c @@ -56,6 +56,8 @@ static unsigned long clk_composite_recalc_rate(struct clk_hw *hw, } static long clk_composite_determine_rate(struct clk_hw *hw, unsigned long rate, + unsigned long min_rate, + unsigned long max_rate, unsigned long *best_parent_rate, struct clk_hw **best_parent_p) { @@ -73,7 +75,9 @@ static long clk_composite_determine_rate(struct clk_hw *hw, unsigned long rate, if (rate_hw && rate_ops && rate_ops->determine_rate) { rate_hw->clk = hw->clk; - return rate_ops->determine_rate(rate_hw, rate, best_parent_rate, + return rate_ops->determine_rate(rate_hw, rate, min_rate, + max_rate, + best_parent_rate, best_parent_p); } else if (rate_hw && rate_ops && rate_ops->round_rate && mux_hw && mux_ops && mux_ops->set_parent) { @@ -117,7 +121,8 @@ static long clk_composite_determine_rate(struct 
clk_hw *hw, unsigned long rate, return best_rate; } else if (mux_hw && mux_ops && mux_ops->determine_rate) { mux_hw->clk = hw->clk; - return mux_ops->determine_rate(mux_hw, rate, best_parent_rate, + return mux_ops->determine_rate(mux_hw, rate, min_rate, + max_rate, best_parent_rate, best_parent_p); } else { pr_err("clk: clk_composite_determine_rate function called, but no mux or rate callback set!\n"); diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index cdc1fa58e4f1..113456030d66 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -42,8 +42,6 @@ static unsigned long clk_core_get_rate(struct clk_core *clk); static int clk_core_get_phase(struct clk_core *clk); static bool clk_core_is_prepared(struct clk_core *clk); static bool clk_core_is_enabled(struct clk_core *clk); -static unsigned long clk_core_round_rate_nolock(struct clk_core *clk, - unsigned long rate); static struct clk_core *clk_core_lookup(const char *name); /*** private data structures ***/ @@ -59,6 +57,7 @@ struct clk_core { u8 num_parents; u8 new_parent_index; unsigned long rate; + unsigned long req_rate; unsigned long new_rate; struct clk_core *new_parent; struct clk_core *new_child; @@ -70,6 +69,7 @@ struct clk_core { struct hlist_head children; struct hlist_node child_node; struct hlist_node debug_node; + struct hlist_head clks; unsigned int notifier_count; #ifdef CONFIG_DEBUG_FS struct dentry *dentry; @@ -81,6 +81,9 @@ struct clk { struct clk_core *core; const char *dev_id; const char *con_id; + unsigned long min_rate; + unsigned long max_rate; + struct hlist_node child_node; }; /*** locking ***/ @@ -783,6 +786,8 @@ static bool mux_is_better_rate(unsigned long rate, unsigned long now, static long clk_mux_determine_rate_flags(struct clk_hw *hw, unsigned long rate, + unsigned long min_rate, + unsigned long max_rate, unsigned long *best_parent_rate, struct clk_hw **best_parent_p, unsigned long flags) @@ -795,7 +800,8 @@ clk_mux_determine_rate_flags(struct clk_hw *hw, unsigned long rate, if (core->flags & CLK_SET_RATE_NO_REPARENT) { parent = core->parent; if (core->flags & CLK_SET_RATE_PARENT) - best = clk_core_round_rate_nolock(parent, rate); + best = __clk_determine_rate(parent->hw, rate, + min_rate, max_rate); else if (parent) best = clk_core_get_rate_nolock(parent); else @@ -810,7 +816,9 @@ clk_mux_determine_rate_flags(struct clk_hw *hw, unsigned long rate, if (!parent) continue; if (core->flags & CLK_SET_RATE_PARENT) - parent_rate = clk_core_round_rate_nolock(parent, rate); + parent_rate = __clk_determine_rate(parent->hw, rate, + min_rate, + max_rate); else parent_rate = clk_core_get_rate_nolock(parent); if (mux_is_better_rate(rate, parent_rate, best, flags)) { @@ -834,25 +842,47 @@ struct clk *__clk_lookup(const char *name) return !core ? NULL : core->hw->clk; } +static void clk_core_get_boundaries(struct clk_core *clk, + unsigned long *min_rate, + unsigned long *max_rate) +{ + struct clk *clk_user; + + *min_rate = 0; + *max_rate = ULONG_MAX; + + hlist_for_each_entry(clk_user, &clk->clks, child_node) + *min_rate = max(*min_rate, clk_user->min_rate); + + hlist_for_each_entry(clk_user, &clk->clks, child_node) + *max_rate = min(*max_rate, clk_user->max_rate); +} + /* * Helper for finding best parent to provide a given frequency. This can be used * directly as a determine_rate callback (e.g. for a mux), or from a more * complex clock that may combine a mux with other operations. 
*/ long __clk_mux_determine_rate(struct clk_hw *hw, unsigned long rate, + unsigned long min_rate, + unsigned long max_rate, unsigned long *best_parent_rate, struct clk_hw **best_parent_p) { - return clk_mux_determine_rate_flags(hw, rate, best_parent_rate, + return clk_mux_determine_rate_flags(hw, rate, min_rate, max_rate, + best_parent_rate, best_parent_p, 0); } EXPORT_SYMBOL_GPL(__clk_mux_determine_rate); long __clk_mux_determine_rate_closest(struct clk_hw *hw, unsigned long rate, + unsigned long min_rate, + unsigned long max_rate, unsigned long *best_parent_rate, struct clk_hw **best_parent_p) { - return clk_mux_determine_rate_flags(hw, rate, best_parent_rate, + return clk_mux_determine_rate_flags(hw, rate, min_rate, max_rate, + best_parent_rate, best_parent_p, CLK_MUX_ROUND_CLOSEST); } @@ -1068,7 +1098,9 @@ int clk_enable(struct clk *clk) EXPORT_SYMBOL_GPL(clk_enable); static unsigned long clk_core_round_rate_nolock(struct clk_core *clk, - unsigned long rate) + unsigned long rate, + unsigned long min_rate, + unsigned long max_rate) { unsigned long parent_rate = 0; struct clk_core *parent; @@ -1083,16 +1115,40 @@ static unsigned long clk_core_round_rate_nolock(struct clk_core *clk, if (clk->ops->determine_rate) { parent_hw = parent ? parent->hw : NULL; - return clk->ops->determine_rate(clk->hw, rate, &parent_rate, - &parent_hw); + return clk->ops->determine_rate(clk->hw, rate, + min_rate, max_rate, + &parent_rate, &parent_hw); } else if (clk->ops->round_rate) return clk->ops->round_rate(clk->hw, rate, &parent_rate); else if (clk->flags & CLK_SET_RATE_PARENT) - return clk_core_round_rate_nolock(clk->parent, rate); + return clk_core_round_rate_nolock(clk->parent, rate, min_rate, + max_rate); else return clk->rate; } +/** + * __clk_determine_rate - get the closest rate actually supported by a clock + * @hw: determine the rate of this clock + * @rate: target rate + * @min_rate: returned rate must be greater than this rate + * @max_rate: returned rate must be less than this rate + * + * Caller must hold prepare_lock. Useful for clk_ops such as .set_rate and + * .determine_rate. 
+ */ +unsigned long __clk_determine_rate(struct clk_hw *hw, + unsigned long rate, + unsigned long min_rate, + unsigned long max_rate) +{ + if (!hw) + return 0; + + return clk_core_round_rate_nolock(hw->core, rate, min_rate, max_rate); +} +EXPORT_SYMBOL_GPL(__clk_determine_rate); + /** * __clk_round_rate - round the given rate for a clk * @clk: round the rate of this clock @@ -1102,10 +1158,15 @@ static unsigned long clk_core_round_rate_nolock(struct clk_core *clk, */ unsigned long __clk_round_rate(struct clk *clk, unsigned long rate) { + unsigned long min_rate; + unsigned long max_rate; + if (!clk) return 0; - return clk_core_round_rate_nolock(clk->core, rate); + clk_core_get_boundaries(clk->core, &min_rate, &max_rate); + + return clk_core_round_rate_nolock(clk->core, rate, min_rate, max_rate); } EXPORT_SYMBOL_GPL(__clk_round_rate); @@ -1126,7 +1187,7 @@ long clk_round_rate(struct clk *clk, unsigned long rate) return 0; clk_prepare_lock(); - ret = clk_core_round_rate_nolock(clk->core, rate); + ret = __clk_round_rate(clk, rate); clk_prepare_unlock(); return ret; @@ -1517,6 +1578,8 @@ static struct clk_core *clk_calc_new_rates(struct clk_core *clk, struct clk_hw *parent_hw; unsigned long best_parent_rate = 0; unsigned long new_rate; + unsigned long min_rate; + unsigned long max_rate; int p_index = 0; /* sanity */ @@ -1528,16 +1591,22 @@ static struct clk_core *clk_calc_new_rates(struct clk_core *clk, if (parent) best_parent_rate = parent->rate; + clk_core_get_boundaries(clk, &min_rate, &max_rate); + /* find the closest rate and parent clk/rate */ if (clk->ops->determine_rate) { parent_hw = parent ? parent->hw : NULL; new_rate = clk->ops->determine_rate(clk->hw, rate, + min_rate, + max_rate, &best_parent_rate, &parent_hw); parent = parent_hw ? parent_hw->core : NULL; } else if (clk->ops->round_rate) { new_rate = clk->ops->round_rate(clk->hw, rate, &best_parent_rate); + if (new_rate < min_rate || new_rate > max_rate) + return NULL; } else if (!parent || !(clk->flags & CLK_SET_RATE_PARENT)) { /* pass-through clock without adjustable parent */ clk->new_rate = clk->rate; @@ -1675,6 +1744,45 @@ static void clk_change_rate(struct clk_core *clk) clk_change_rate(clk->new_child); } +static int clk_core_set_rate_nolock(struct clk_core *clk, + unsigned long req_rate) +{ + struct clk_core *top, *fail_clk; + unsigned long rate = req_rate; + int ret = 0; + + if (!clk) + return 0; + + /* bail early if nothing to do */ + if (rate == clk_core_get_rate_nolock(clk)) + return 0; + + if ((clk->flags & CLK_SET_RATE_GATE) && clk->prepare_count) + return -EBUSY; + + /* calculate new rates and get the topmost changed clock */ + top = clk_calc_new_rates(clk, rate); + if (!top) + return -EINVAL; + + /* notify that we are about to change rates */ + fail_clk = clk_propagate_rate_change(top, PRE_RATE_CHANGE); + if (fail_clk) { + pr_debug("%s: failed to set %s rate\n", __func__, + fail_clk->name); + clk_propagate_rate_change(top, ABORT_RATE_CHANGE); + return -EBUSY; + } + + /* change the rates */ + clk_change_rate(top); + + clk->req_rate = req_rate; + + return ret; +} + /** * clk_set_rate - specify a new rate for clk * @clk: the clk whose rate is being changed @@ -1698,8 +1806,7 @@ static void clk_change_rate(struct clk_core *clk) */ int clk_set_rate(struct clk *clk, unsigned long rate) { - struct clk_core *top, *fail_clk; - int ret = 0; + int ret; if (!clk) return 0; @@ -1707,42 +1814,81 @@ int clk_set_rate(struct clk *clk, unsigned long rate) /* prevent racing with updates to the clock topology */ clk_prepare_lock(); - 
/* bail early if nothing to do */ - if (rate == clk_get_rate(clk)) - goto out; + ret = clk_core_set_rate_nolock(clk->core, rate); - if ((clk->core->flags & CLK_SET_RATE_GATE) && - clk->core->prepare_count) { - ret = -EBUSY; - goto out; - } + clk_prepare_unlock(); - /* calculate new rates and get the topmost changed clock */ - top = clk_calc_new_rates(clk->core, rate); - if (!top) { - ret = -EINVAL; - goto out; - } + return ret; +} +EXPORT_SYMBOL_GPL(clk_set_rate); - /* notify that we are about to change rates */ - fail_clk = clk_propagate_rate_change(top, PRE_RATE_CHANGE); - if (fail_clk) { - pr_debug("%s: failed to set %s rate\n", __func__, - fail_clk->name); - clk_propagate_rate_change(top, ABORT_RATE_CHANGE); - ret = -EBUSY; - goto out; +/** + * clk_set_rate_range - set a rate range for a clock source + * @clk: clock source + * @min: desired minimum clock rate in Hz, inclusive + * @max: desired maximum clock rate in Hz, inclusive + * + * Returns success (0) or negative errno. + */ +int clk_set_rate_range(struct clk *clk, unsigned long min, unsigned long max) +{ + int ret = 0; + + if (!clk) + return 0; + + if (min > max) { + pr_err("%s: clk %s dev %s con %s: invalid range [%lu, %lu]\n", + __func__, clk->core->name, clk->dev_id, clk->con_id, + min, max); + return -EINVAL; } - /* change the rates */ - clk_change_rate(top); + clk_prepare_lock(); + + if (min != clk->min_rate || max != clk->max_rate) { + clk->min_rate = min; + clk->max_rate = max; + ret = clk_core_set_rate_nolock(clk->core, clk->core->req_rate); + } -out: clk_prepare_unlock(); return ret; } -EXPORT_SYMBOL_GPL(clk_set_rate); +EXPORT_SYMBOL_GPL(clk_set_rate_range); + +/** + * clk_set_min_rate - set a minimum clock rate for a clock source + * @clk: clock source + * @rate: desired minimum clock rate in Hz, inclusive + * + * Returns success (0) or negative errno. + */ +int clk_set_min_rate(struct clk *clk, unsigned long rate) +{ + if (!clk) + return 0; + + return clk_set_rate_range(clk, rate, clk->max_rate); +} +EXPORT_SYMBOL_GPL(clk_set_min_rate); + +/** + * clk_set_max_rate - set a maximum clock rate for a clock source + * @clk: clock source + * @rate: desired maximum clock rate in Hz, inclusive + * + * Returns success (0) or negative errno. + */ +int clk_set_max_rate(struct clk *clk, unsigned long rate) +{ + if (!clk) + return 0; + + return clk_set_rate_range(clk, clk->min_rate, rate); +} +EXPORT_SYMBOL_GPL(clk_set_max_rate); /** * clk_get_parent - return the parent of a clk @@ -2038,6 +2184,7 @@ static int __clk_init(struct device *dev, struct clk *clk_user) struct clk_core *orphan; struct hlist_node *tmp2; struct clk_core *clk; + unsigned long rate; if (!clk_user) return -EINVAL; @@ -2162,12 +2309,13 @@ static int __clk_init(struct device *dev, struct clk *clk_user) * then rate is set to zero. 
*/ if (clk->ops->recalc_rate) - clk->rate = clk->ops->recalc_rate(clk->hw, + rate = clk->ops->recalc_rate(clk->hw, clk_core_get_rate_nolock(clk->parent)); else if (clk->parent) - clk->rate = clk->parent->rate; + rate = clk->parent->rate; else - clk->rate = 0; + rate = 0; + clk->rate = clk->req_rate = rate; /* * walk the list of orphan clocks and reparent any that are children of @@ -2225,10 +2373,24 @@ struct clk *__clk_create_clk(struct clk_hw *hw, const char *dev_id, clk->core = hw->core; clk->dev_id = dev_id; clk->con_id = con_id; + clk->max_rate = ULONG_MAX; + + clk_prepare_lock(); + hlist_add_head(&clk->child_node, &hw->core->clks); + clk_prepare_unlock(); return clk; } +static void __clk_free_clk(struct clk *clk) +{ + clk_prepare_lock(); + hlist_del(&clk->child_node); + clk_prepare_unlock(); + + kfree(clk); +} + /** * clk_register - allocate a new clock, register it and return an opaque cookie * @dev: device that is registering this clock @@ -2288,6 +2450,8 @@ struct clk *clk_register(struct device *dev, struct clk_hw *hw) } } + INIT_HLIST_HEAD(&clk->clks); + hw->clk = __clk_create_clk(hw, NULL, NULL); if (IS_ERR(hw->clk)) { pr_err("%s: could not allocate per-user clk\n", __func__); @@ -2299,8 +2463,9 @@ struct clk *clk_register(struct device *dev, struct clk_hw *hw) if (!ret) return hw->clk; - kfree(hw->clk); + __clk_free_clk(hw->clk); hw->clk = NULL; + fail_parent_names_copy: while (--i >= 0) kfree(clk->parent_names[i]); @@ -2489,25 +2654,24 @@ int __clk_get(struct clk *clk) return 1; } -static void clk_core_put(struct clk_core *core) +void __clk_put(struct clk *clk) { struct module *owner; - owner = core->owner; + if (!clk || WARN_ON_ONCE(IS_ERR(clk))) + return; clk_prepare_lock(); - kref_put(&core->ref, __clk_release); + + hlist_del(&clk->child_node); + clk_core_set_rate_nolock(clk->core, clk->core->req_rate); + owner = clk->core->owner; + kref_put(&clk->core->ref, __clk_release); + clk_prepare_unlock(); module_put(owner); -} - -void __clk_put(struct clk *clk) -{ - if (!clk || WARN_ON_ONCE(IS_ERR(clk))) - return; - clk_core_put(clk->core); kfree(clk); } diff --git a/drivers/clk/hisilicon/clk-hi3620.c b/drivers/clk/hisilicon/clk-hi3620.c index 007144f81f50..2e4f6d432beb 100644 --- a/drivers/clk/hisilicon/clk-hi3620.c +++ b/drivers/clk/hisilicon/clk-hi3620.c @@ -295,6 +295,8 @@ static unsigned long mmc_clk_recalc_rate(struct clk_hw *hw, } static long mmc_clk_determine_rate(struct clk_hw *hw, unsigned long rate, + unsigned long min_rate, + unsigned long max_rate, unsigned long *best_parent_rate, struct clk_hw **best_parent_p) { diff --git a/drivers/clk/mmp/clk-mix.c b/drivers/clk/mmp/clk-mix.c index 48fa53c7ce5e..de6a873175d2 100644 --- a/drivers/clk/mmp/clk-mix.c +++ b/drivers/clk/mmp/clk-mix.c @@ -202,6 +202,8 @@ error: } static long mmp_clk_mix_determine_rate(struct clk_hw *hw, unsigned long rate, + unsigned long min_rate, + unsigned long max_rate, unsigned long *best_parent_rate, struct clk_hw **best_parent_clk) { diff --git a/drivers/clk/qcom/clk-pll.c b/drivers/clk/qcom/clk-pll.c index 60873a7f45d9..b4325f65a1bf 100644 --- a/drivers/clk/qcom/clk-pll.c +++ b/drivers/clk/qcom/clk-pll.c @@ -141,6 +141,7 @@ struct pll_freq_tbl *find_freq(const struct pll_freq_tbl *f, unsigned long rate) static long clk_pll_determine_rate(struct clk_hw *hw, unsigned long rate, + unsigned long min_rate, unsigned long max_rate, unsigned long *p_rate, struct clk_hw **p) { struct clk_pll *pll = to_clk_pll(hw); diff --git a/drivers/clk/qcom/clk-rcg.c b/drivers/clk/qcom/clk-rcg.c index 
0b93972c8807..0039bd7d3965 100644 --- a/drivers/clk/qcom/clk-rcg.c +++ b/drivers/clk/qcom/clk-rcg.c @@ -368,6 +368,7 @@ clk_dyn_rcg_recalc_rate(struct clk_hw *hw, unsigned long parent_rate) static long _freq_tbl_determine_rate(struct clk_hw *hw, const struct freq_tbl *f, unsigned long rate, + unsigned long min_rate, unsigned long max_rate, unsigned long *p_rate, struct clk_hw **p_hw) { unsigned long clk_flags; @@ -397,22 +398,27 @@ static long _freq_tbl_determine_rate(struct clk_hw *hw, } static long clk_rcg_determine_rate(struct clk_hw *hw, unsigned long rate, + unsigned long min_rate, unsigned long max_rate, unsigned long *p_rate, struct clk_hw **p) { struct clk_rcg *rcg = to_clk_rcg(hw); - return _freq_tbl_determine_rate(hw, rcg->freq_tbl, rate, p_rate, p); + return _freq_tbl_determine_rate(hw, rcg->freq_tbl, rate, min_rate, + max_rate, p_rate, p); } static long clk_dyn_rcg_determine_rate(struct clk_hw *hw, unsigned long rate, + unsigned long min_rate, unsigned long max_rate, unsigned long *p_rate, struct clk_hw **p) { struct clk_dyn_rcg *rcg = to_clk_dyn_rcg(hw); - return _freq_tbl_determine_rate(hw, rcg->freq_tbl, rate, p_rate, p); + return _freq_tbl_determine_rate(hw, rcg->freq_tbl, rate, min_rate, + max_rate, p_rate, p); } static long clk_rcg_bypass_determine_rate(struct clk_hw *hw, unsigned long rate, + unsigned long min_rate, unsigned long max_rate, unsigned long *p_rate, struct clk_hw **p_hw) { struct clk_rcg *rcg = to_clk_rcg(hw); diff --git a/drivers/clk/qcom/clk-rcg2.c b/drivers/clk/qcom/clk-rcg2.c index 08b8b3729f53..742acfa18d63 100644 --- a/drivers/clk/qcom/clk-rcg2.c +++ b/drivers/clk/qcom/clk-rcg2.c @@ -208,6 +208,7 @@ static long _freq_tbl_determine_rate(struct clk_hw *hw, } static long clk_rcg2_determine_rate(struct clk_hw *hw, unsigned long rate, + unsigned long min_rate, unsigned long max_rate, unsigned long *p_rate, struct clk_hw **p) { struct clk_rcg2 *rcg = to_clk_rcg2(hw); @@ -361,6 +362,8 @@ static int clk_edp_pixel_set_rate_and_parent(struct clk_hw *hw, } static long clk_edp_pixel_determine_rate(struct clk_hw *hw, unsigned long rate, + unsigned long min_rate, + unsigned long max_rate, unsigned long *p_rate, struct clk_hw **p) { struct clk_rcg2 *rcg = to_clk_rcg2(hw); @@ -412,6 +415,7 @@ const struct clk_ops clk_edp_pixel_ops = { EXPORT_SYMBOL_GPL(clk_edp_pixel_ops); static long clk_byte_determine_rate(struct clk_hw *hw, unsigned long rate, + unsigned long min_rate, unsigned long max_rate, unsigned long *p_rate, struct clk_hw **p_hw) { struct clk_rcg2 *rcg = to_clk_rcg2(hw); @@ -476,6 +480,8 @@ static const struct frac_entry frac_table_pixel[] = { }; static long clk_pixel_determine_rate(struct clk_hw *hw, unsigned long rate, + unsigned long min_rate, + unsigned long max_rate, unsigned long *p_rate, struct clk_hw **p) { struct clk_rcg2 *rcg = to_clk_rcg2(hw); diff --git a/drivers/clk/sunxi/clk-factors.c b/drivers/clk/sunxi/clk-factors.c index a9ebbd207d58..8c20190a3e9f 100644 --- a/drivers/clk/sunxi/clk-factors.c +++ b/drivers/clk/sunxi/clk-factors.c @@ -80,6 +80,8 @@ static long clk_factors_round_rate(struct clk_hw *hw, unsigned long rate, } static long clk_factors_determine_rate(struct clk_hw *hw, unsigned long rate, + unsigned long min_rate, + unsigned long max_rate, unsigned long *best_parent_rate, struct clk_hw **best_parent_p) { diff --git a/drivers/clk/sunxi/clk-sun6i-ar100.c b/drivers/clk/sunxi/clk-sun6i-ar100.c index 3d282fb8f85c..63cf149195ae 100644 --- a/drivers/clk/sunxi/clk-sun6i-ar100.c +++ b/drivers/clk/sunxi/clk-sun6i-ar100.c @@ -45,6 +45,8 @@ 
static unsigned long ar100_recalc_rate(struct clk_hw *hw, } static long ar100_determine_rate(struct clk_hw *hw, unsigned long rate, + unsigned long min_rate, + unsigned long max_rate, unsigned long *best_parent_rate, struct clk_hw **best_parent_clk) { diff --git a/drivers/clk/sunxi/clk-sunxi.c b/drivers/clk/sunxi/clk-sunxi.c index 9b79f8907cc5..69937eaba795 100644 --- a/drivers/clk/sunxi/clk-sunxi.c +++ b/drivers/clk/sunxi/clk-sunxi.c @@ -119,6 +119,8 @@ static long sun6i_ahb1_clk_round(unsigned long rate, u8 *divp, u8 *pre_divp, } static long sun6i_ahb1_clk_determine_rate(struct clk_hw *hw, unsigned long rate, + unsigned long min_rate, + unsigned long max_rate, unsigned long *best_parent_rate, struct clk_hw **best_parent_clk) { diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 12f13b0673af..17dd6e9439d1 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -175,9 +175,12 @@ struct clk_ops { unsigned long parent_rate); long (*round_rate)(struct clk_hw *hw, unsigned long rate, unsigned long *parent_rate); - long (*determine_rate)(struct clk_hw *hw, unsigned long rate, - unsigned long *best_parent_rate, - struct clk_hw **best_parent_hw); + long (*determine_rate)(struct clk_hw *hw, + unsigned long rate, + unsigned long min_rate, + unsigned long max_rate, + unsigned long *best_parent_rate, + struct clk_hw **best_parent_hw); int (*set_parent)(struct clk_hw *hw, u8 index); u8 (*get_parent)(struct clk_hw *hw); int (*set_rate)(struct clk_hw *hw, unsigned long rate, @@ -573,9 +576,17 @@ bool __clk_is_prepared(struct clk *clk); bool __clk_is_enabled(struct clk *clk); struct clk *__clk_lookup(const char *name); long __clk_mux_determine_rate(struct clk_hw *hw, unsigned long rate, + unsigned long min_rate, + unsigned long max_rate, unsigned long *best_parent_rate, struct clk_hw **best_parent_p); +unsigned long __clk_determine_rate(struct clk_hw *core, + unsigned long rate, + unsigned long min_rate, + unsigned long max_rate); long __clk_mux_determine_rate_closest(struct clk_hw *hw, unsigned long rate, + unsigned long min_rate, + unsigned long max_rate, unsigned long *best_parent_rate, struct clk_hw **best_parent_p); diff --git a/include/linux/clk.h b/include/linux/clk.h index ba7e9eda4347..8381bbfbc308 100644 --- a/include/linux/clk.h +++ b/include/linux/clk.h @@ -313,6 +313,34 @@ int clk_set_rate(struct clk *clk, unsigned long rate); */ bool clk_has_parent(struct clk *clk, struct clk *parent); +/** + * clk_set_rate_range - set a rate range for a clock source + * @clk: clock source + * @min: desired minimum clock rate in Hz, inclusive + * @max: desired maximum clock rate in Hz, inclusive + * + * Returns success (0) or negative errno. + */ +int clk_set_rate_range(struct clk *clk, unsigned long min, unsigned long max); + +/** + * clk_set_min_rate - set a minimum clock rate for a clock source + * @clk: clock source + * @rate: desired minimum clock rate in Hz, inclusive + * + * Returns success (0) or negative errno. + */ +int clk_set_min_rate(struct clk *clk, unsigned long rate); + +/** + * clk_set_max_rate - set a maximum clock rate for a clock source + * @clk: clock source + * @rate: desired maximum clock rate in Hz, inclusive + * + * Returns success (0) or negative errno. 
+ */ +int clk_set_max_rate(struct clk *clk, unsigned long rate); + /** * clk_set_parent - set the parent clock source for this clock * @clk: clock source diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h index 310122dcd9b5..0eac65054283 100644 --- a/include/linux/clk/ti.h +++ b/include/linux/clk/ti.h @@ -271,6 +271,8 @@ int omap3_noncore_dpll_set_rate_and_parent(struct clk_hw *hw, u8 index); long omap3_noncore_dpll_determine_rate(struct clk_hw *hw, unsigned long rate, + unsigned long min_rate, + unsigned long max_rate, unsigned long *best_parent_rate, struct clk_hw **best_parent_clk); unsigned long omap4_dpll_regm4xen_recalc(struct clk_hw *hw, @@ -280,6 +282,8 @@ long omap4_dpll_regm4xen_round_rate(struct clk_hw *hw, unsigned long *parent_rate); long omap4_dpll_regm4xen_determine_rate(struct clk_hw *hw, unsigned long rate, + unsigned long min_rate, + unsigned long max_rate, unsigned long *best_parent_rate, struct clk_hw **best_parent_clk); u8 omap2_init_dpll_parent(struct clk_hw *hw); -- cgit v1.2.3 From 366e41d9774d7010cb63112b6db2fce6dc7809c0 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Sat, 31 Jan 2015 10:40:13 -0500 Subject: ipv6: pull cork initialization into its own function. Pull IPv6 cork initialization into its own function that can be re-used. IPv6-specific cork data did not have an explicit data structure. This patch creates one so that just the IPv6 cork data can be passed as arguments. Also, since IPv6 tries to save the flow label into the inet_cork_full structure, pass the full cork. Adjust ip6_cork_release() to take the cork data structures. Signed-off-by: Vladislav Yasevich Signed-off-by: David S. Miller --- include/linux/ipv6.h | 12 ++-- net/ipv6/ip6_output.c | 158 ++++++++++++++++++++++++++++---------------------- 2 files changed, 96 insertions(+), 74 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 2805062c013f..4d5169f5d7d1 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -125,6 +125,12 @@ struct ipv6_mc_socklist; struct ipv6_ac_socklist; struct ipv6_fl_socklist; +struct inet6_cork { + struct ipv6_txoptions *opt; + u8 hop_limit; + u8 tclass; +}; + /** * struct ipv6_pinfo - ipv6 private area * @@ -217,11 +223,7 @@ struct ipv6_pinfo { struct ipv6_txoptions *opt; struct sk_buff *pktoptions; struct sk_buff *rxpmtu; - struct { - struct ipv6_txoptions *opt; - u8 hop_limit; - u8 tclass; - } cork; + struct inet6_cork cork; }; /* WARNING: don't change the layout of the members in {raw,udp,tcp}6_sock!
*/ diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index ce69a12ae48c..f9f08c43c6e1 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1135,6 +1135,74 @@ static void ip6_append_data_mtu(unsigned int *mtu, } } +static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, + struct inet6_cork *v6_cork, + int hlimit, int tclass, struct ipv6_txoptions *opt, + struct rt6_info *rt, struct flowi6 *fl6) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + unsigned int mtu; + + /* + * setup for corking + */ + if (opt) { + if (WARN_ON(v6_cork->opt)) + return -EINVAL; + + v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation); + if (unlikely(v6_cork->opt == NULL)) + return -ENOBUFS; + + v6_cork->opt->tot_len = opt->tot_len; + v6_cork->opt->opt_flen = opt->opt_flen; + v6_cork->opt->opt_nflen = opt->opt_nflen; + + v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt, + sk->sk_allocation); + if (opt->dst0opt && !v6_cork->opt->dst0opt) + return -ENOBUFS; + + v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt, + sk->sk_allocation); + if (opt->dst1opt && !v6_cork->opt->dst1opt) + return -ENOBUFS; + + v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt, + sk->sk_allocation); + if (opt->hopopt && !v6_cork->opt->hopopt) + return -ENOBUFS; + + v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt, + sk->sk_allocation); + if (opt->srcrt && !v6_cork->opt->srcrt) + return -ENOBUFS; + + /* need source address above miyazawa*/ + } + dst_hold(&rt->dst); + cork->base.dst = &rt->dst; + cork->fl.u.ip6 = *fl6; + v6_cork->hop_limit = hlimit; + v6_cork->tclass = tclass; + if (rt->dst.flags & DST_XFRM_TUNNEL) + mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? + rt->dst.dev->mtu : dst_mtu(&rt->dst); + else + mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? + rt->dst.dev->mtu : dst_mtu(rt->dst.path); + if (np->frag_size < mtu) { + if (np->frag_size) + mtu = np->frag_size; + } + cork->base.fragsize = mtu; + if (dst_allfrag(rt->dst.path)) + cork->base.flags |= IPCORK_ALLFRAG; + cork->base.length = 0; + + return 0; +} + int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, @@ -1162,59 +1230,10 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, /* * setup for corking */ - if (opt) { - if (WARN_ON(np->cork.opt)) - return -EINVAL; - - np->cork.opt = kzalloc(opt->tot_len, sk->sk_allocation); - if (unlikely(np->cork.opt == NULL)) - return -ENOBUFS; - - np->cork.opt->tot_len = opt->tot_len; - np->cork.opt->opt_flen = opt->opt_flen; - np->cork.opt->opt_nflen = opt->opt_nflen; - - np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt, - sk->sk_allocation); - if (opt->dst0opt && !np->cork.opt->dst0opt) - return -ENOBUFS; - - np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt, - sk->sk_allocation); - if (opt->dst1opt && !np->cork.opt->dst1opt) - return -ENOBUFS; - - np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt, - sk->sk_allocation); - if (opt->hopopt && !np->cork.opt->hopopt) - return -ENOBUFS; - - np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt, - sk->sk_allocation); - if (opt->srcrt && !np->cork.opt->srcrt) - return -ENOBUFS; - - /* need source address above miyazawa*/ - } - dst_hold(&rt->dst); - cork->dst = &rt->dst; - inet->cork.fl.u.ip6 = *fl6; - np->cork.hop_limit = hlimit; - np->cork.tclass = tclass; - if (rt->dst.flags & DST_XFRM_TUNNEL) - mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? - rt->dst.dev->mtu : dst_mtu(&rt->dst); - else - mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? 
- rt->dst.dev->mtu : dst_mtu(rt->dst.path); - if (np->frag_size < mtu) { - if (np->frag_size) - mtu = np->frag_size; - } - cork->fragsize = mtu; - if (dst_allfrag(rt->dst.path)) - cork->flags |= IPCORK_ALLFRAG; - cork->length = 0; + err = ip6_setup_cork(sk, &inet->cork, &np->cork, hlimit, + tclass, opt, rt, fl6); + if (err) + return err; exthdrlen = (opt ? opt->opt_flen : 0); length += exthdrlen; transhdrlen += exthdrlen; @@ -1226,8 +1245,8 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, transhdrlen = 0; exthdrlen = 0; dst_exthdrlen = 0; - mtu = cork->fragsize; } + mtu = cork->fragsize; orig_mtu = mtu; hh_len = LL_RESERVED_SPACE(rt->dst.dev); @@ -1503,23 +1522,24 @@ error: } EXPORT_SYMBOL_GPL(ip6_append_data); -static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np) +static void ip6_cork_release(struct inet_cork_full *cork, + struct inet6_cork *v6_cork) { - if (np->cork.opt) { - kfree(np->cork.opt->dst0opt); - kfree(np->cork.opt->dst1opt); - kfree(np->cork.opt->hopopt); - kfree(np->cork.opt->srcrt); - kfree(np->cork.opt); - np->cork.opt = NULL; + if (v6_cork->opt) { + kfree(v6_cork->opt->dst0opt); + kfree(v6_cork->opt->dst1opt); + kfree(v6_cork->opt->hopopt); + kfree(v6_cork->opt->srcrt); + kfree(v6_cork->opt); + v6_cork->opt = NULL; } - if (inet->cork.base.dst) { - dst_release(inet->cork.base.dst); - inet->cork.base.dst = NULL; - inet->cork.base.flags &= ~IPCORK_ALLFRAG; + if (cork->base.dst) { + dst_release(cork->base.dst); + cork->base.dst = NULL; + cork->base.flags &= ~IPCORK_ALLFRAG; } - memset(&inet->cork.fl, 0, sizeof(inet->cork.fl)); + memset(&cork->fl, 0, sizeof(cork->fl)); } int ip6_push_pending_frames(struct sock *sk) @@ -1599,7 +1619,7 @@ int ip6_push_pending_frames(struct sock *sk) } out: - ip6_cork_release(inet, np); + ip6_cork_release(&inet->cork, &np->cork); return err; error: IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); @@ -1618,6 +1638,6 @@ void ip6_flush_pending_frames(struct sock *sk) kfree_skb(skb); } - ip6_cork_release(inet_sk(sk), inet6_sk(sk)); + ip6_cork_release(&inet_sk(sk)->cork, &inet6_sk(sk)->cork); } EXPORT_SYMBOL_GPL(ip6_flush_pending_frames); -- cgit v1.2.3 From 5a2e87b16875f9b83b7e9494cf1fce8e17dc764a Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Mon, 2 Feb 2015 15:18:42 +0200 Subject: net/mlx4_core: Fix kernel Oops (mem corruption) when working with more than 80 VFs Commit de966c592802 (net/mlx4_core: Support more than 64 VFs) was meant to allow up to 126 VFs. However, because MLX4_MFUNC_MAX was left too low, requesting more than 80 VFs resulted in memory corruptions (and Oopses). In addition, the number of slaves was left too high. This commit fixes these issues. Fixes: de966c592802 ("net/mlx4_core: Support more than 64 VFs") Signed-off-by: Jack Morgenstein Signed-off-by: Amir Vadai Signed-off-by: David S.
Miller --- drivers/net/ethernet/mellanox/mlx4/mlx4.h | 3 ++- include/linux/mlx4/device.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index bdd4eea2247c..210691c89b6c 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -235,7 +235,8 @@ do { \ extern int mlx4_log_num_mgm_entry_size; extern int log_mtts_per_seg; -#define MLX4_MAX_NUM_SLAVES (MLX4_MAX_NUM_PF + MLX4_MAX_NUM_VF) +#define MLX4_MAX_NUM_SLAVES (min(MLX4_MAX_NUM_PF + MLX4_MAX_NUM_VF, \ + MLX4_MFUNC_MAX)) #define ALL_SLAVES 0xff struct mlx4_bitmap { diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 25c791e295fd..5f3a9aa7225d 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -97,7 +97,7 @@ enum { MLX4_MAX_NUM_PF = 16, MLX4_MAX_NUM_VF = 126, MLX4_MAX_NUM_VF_P_PORT = 64, - MLX4_MFUNC_MAX = 80, + MLX4_MFUNC_MAX = 128, MLX4_MAX_EQ_NUM = 1024, MLX4_MFUNC_EQ_NUM = 4, MLX4_MFUNC_MAX_EQES = 8, -- cgit v1.2.3 From 3e87523897e18a3e17fc8955ed795188be737ff1 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 2 Feb 2015 09:39:02 -0500 Subject: sched/wait: Remove might_sleep() from wait_event_cmd() The patch e22b886a8a43 ("sched/wait: Add might_sleep() checks") introduced a bug in the raid5 subsystem. The function raid5_quiesce() (and resize_stripes()) uses the 'cmd' part to release and acquire a spinlock (so we call the sleep primitives in atomic context), and therefore we cannot do the might_sleep() check. Remove it. Fixes: e22b886a8a43 ("sched/wait: Add might_sleep() checks") Signed-off-by: Mikulas Patocka Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/alpine.LRH.2.02.1502020935580.13510@file01.intranet.prod.int.rdu2.redhat.com Signed-off-by: Ingo Molnar --- include/linux/wait.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index 2232ed16635a..37423e0e1379 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -363,7 +363,6 @@ do { \ */ #define wait_event_cmd(wq, condition, cmd1, cmd2) \ do { \ - might_sleep(); \ if (condition) \ break; \ __wait_event_cmd(wq, condition, cmd1, cmd2); \ -- cgit v1.2.3 From f0b66a2cf68ed3613fe72fe01ed309f998e2bbb3 Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Tue, 3 Feb 2015 09:29:52 -0600 Subject: PCI: Add pci_device_to_OF_node() stub for !CONFIG_OF Add a stub for pci_device_to_OF_node() so drivers don't need to use #ifdef CONFIG_OF around calls to it. 
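As a brief illustration (the foo driver and its probe routine are hypothetical), a driver can now call the helper with no preprocessor guard:

    #include <linux/pci.h>
    #include <linux/of.h>

    static int foo_probe(struct pci_dev *pdev, const struct pci_device_id *id)
    {
        struct device_node *np = pci_device_to_OF_node(pdev);

        /* With CONFIG_OF disabled, the new stub simply returns NULL. */
        if (np)
            dev_info(&pdev->dev, "bound to OF node %s\n", np->full_name);

        return 0;
    }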
Signed-off-by: Kevin Hao Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 360a966a97a5..fbb5795a8d86 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1847,6 +1847,8 @@ static inline void pci_set_of_node(struct pci_dev *dev) { } static inline void pci_release_of_node(struct pci_dev *dev) { } static inline void pci_set_bus_of_node(struct pci_bus *bus) { } static inline void pci_release_bus_of_node(struct pci_bus *bus) { } +static inline struct device_node * +pci_device_to_OF_node(const struct pci_dev *pdev) { return NULL; } #endif /* CONFIG_OF */ #ifdef CONFIG_EEH -- cgit v1.2.3 From 6793a30a0646d2cc269e66782ca30c6025c92e1f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 3 Feb 2015 17:59:32 +0100 Subject: clk: omap: compile legacy omap3 clocks conditionally The 'ARM: OMAP3: legacy clock data move under clk driver' patch series causes build errors when CONFIG_ARCH_OMAP3 is not set: drivers/clk/ti/dpll.c: In function 'ti_clk_register_dpll': drivers/clk/ti/dpll.c:199:31: error: 'omap3_dpll_ck_ops' undeclared (first use in this function) const struct clk_ops *ops = &omap3_dpll_ck_ops; ^ drivers/clk/ti/dpll.c:199:31: note: each undeclared identifier is reported only once for each function it appears in drivers/clk/ti/dpll.c:259:10: error: 'omap3_dpll_per_ck_ops' undeclared (first use in this function) ops = &omap3_dpll_per_ck_ops; ^ drivers/built-in.o: In function `ti_clk_register_gate': drivers/clk/ti/gate.c:179: undefined reference to `clkhwops_omap3430es2_dss_usbhost_wait' drivers/clk/ti/gate.c:179: undefined reference to `clkhwops_am35xx_ipss_module_wait' drivers/built-in.o: In function `ti_clk_register_interface': drivers/clk/ti/interface.c:100: undefined reference to `clkhwops_omap3430es2_iclk_hsotgusb_wait' drivers/clk/ti/interface.c:100: undefined reference to `clkhwops_omap3430es2_iclk_dss_usbhost_wait' drivers/clk/ti/interface.c:100: undefined reference to `clkhwops_omap3430es2_iclk_ssi_wait' drivers/clk/ti/interface.c:100: undefined reference to `clkhwops_am35xx_ipss_wait' drivers/built-in.o: In function `ti_clk_register_composite': :(.text+0x3da768): undefined reference to `ti_clk_build_component_gate' In order to fix that problem, this patch makes the omap3 legacy code compile only when both CONFIG_ARCH_OMAP3 and CONFIG_ATAGS are set.
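Because the patch adds -ENXIO-returning static inline stubs for the legacy init functions (see the clk/ti.h hunk below), callers need no guards of their own. A hedged sketch of a hypothetical caller:

    #include <linux/clk/ti.h>
    #include <linux/errno.h>
    #include <linux/printk.h>

    static int __init example_omap3_clk_init(void)
    {
        int ret = omap3430_clk_legacy_init();

        /* Without CONFIG_ATAGS the static inline stub returns -ENXIO. */
        if (ret == -ENXIO)
            pr_info("legacy OMAP3 clock data not built in\n");

        return ret;
    }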
Signed-off-by: Arnd Bergmann Acked-by: Tony Lindgren Signed-off-by: Michael Turquette --- drivers/clk/ti/Makefile | 6 ++++-- drivers/clk/ti/clk.c | 2 ++ drivers/clk/ti/composite.c | 2 ++ drivers/clk/ti/dpll.c | 2 ++ drivers/clk/ti/gate.c | 2 ++ drivers/clk/ti/interface.c | 2 ++ include/linux/clk/ti.h | 8 ++++++++ 7 files changed, 22 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/ti/Makefile b/drivers/clk/ti/Makefile index 14e6686a5eea..105ffd0f5e79 100644 --- a/drivers/clk/ti/Makefile +++ b/drivers/clk/ti/Makefile @@ -1,4 +1,3 @@ -ifneq ($(CONFIG_OF),) obj-y += clk.o autoidle.o clockdomain.o clk-common = dpll.o composite.o divider.o gate.o \ fixed-factor.o mux.o apll.o @@ -6,10 +5,13 @@ obj-$(CONFIG_SOC_AM33XX) += $(clk-common) clk-33xx.o obj-$(CONFIG_SOC_TI81XX) += $(clk-common) fapll.o clk-816x.o obj-$(CONFIG_ARCH_OMAP2) += $(clk-common) interface.o clk-2xxx.o obj-$(CONFIG_ARCH_OMAP3) += $(clk-common) interface.o \ - clk-3xxx.o clk-3xxx-legacy.o + clk-3xxx.o obj-$(CONFIG_ARCH_OMAP4) += $(clk-common) clk-44xx.o obj-$(CONFIG_SOC_OMAP5) += $(clk-common) clk-54xx.o obj-$(CONFIG_SOC_DRA7XX) += $(clk-common) clk-7xx.o \ clk-dra7-atl.o obj-$(CONFIG_SOC_AM43XX) += $(clk-common) clk-43xx.o + +ifdef CONFIG_ATAGS +obj-$(CONFIG_ARCH_OMAP3) += clk-3xxx-legacy.o endif diff --git a/drivers/clk/ti/clk.c b/drivers/clk/ti/clk.c index 546dae405402..e22b95646e09 100644 --- a/drivers/clk/ti/clk.c +++ b/drivers/clk/ti/clk.c @@ -186,6 +186,7 @@ void ti_dt_clk_init_retry_clks(void) } } +#if defined(CONFIG_ARCH_OMAP3) && defined(CONFIG_ATAGS) void __init ti_clk_patch_legacy_clks(struct ti_clk **patch) { while (*patch) { @@ -308,3 +309,4 @@ int __init ti_clk_register_legacy_clks(struct ti_clk_alias *clks) return 0; } +#endif diff --git a/drivers/clk/ti/composite.c b/drivers/clk/ti/composite.c index 3a9665fce041..3654f61912eb 100644 --- a/drivers/clk/ti/composite.c +++ b/drivers/clk/ti/composite.c @@ -118,6 +118,7 @@ static inline struct clk_hw *_get_hw(struct clk_hw_omap_comp *clk, int idx) #define to_clk_hw_comp(_hw) container_of(_hw, struct clk_hw_omap_comp, hw) +#if defined(CONFIG_ARCH_OMAP3) && defined(CONFIG_ATAGS) struct clk *ti_clk_register_composite(struct ti_clk *setup) { struct ti_clk_composite *comp; @@ -153,6 +154,7 @@ struct clk *ti_clk_register_composite(struct ti_clk *setup) return clk; } +#endif static void __init _register_composite(struct clk_hw *hw, struct device_node *node) diff --git a/drivers/clk/ti/dpll.c b/drivers/clk/ti/dpll.c index 47ebff772b13..81dc4698dc41 100644 --- a/drivers/clk/ti/dpll.c +++ b/drivers/clk/ti/dpll.c @@ -176,6 +176,7 @@ cleanup: kfree(clk_hw); } +#if defined(CONFIG_ARCH_OMAP3) && defined(CONFIG_ATAGS) void __iomem *_get_reg(u8 module, u16 offset) { u32 reg; @@ -271,6 +272,7 @@ cleanup: kfree(clk_hw); return clk; } +#endif #if defined(CONFIG_ARCH_OMAP4) || defined(CONFIG_SOC_OMAP5) || \ defined(CONFIG_SOC_DRA7XX) || defined(CONFIG_SOC_AM33XX) || \ diff --git a/drivers/clk/ti/gate.c b/drivers/clk/ti/gate.c index d4f6cb20e16e..d493307b73f4 100644 --- a/drivers/clk/ti/gate.c +++ b/drivers/clk/ti/gate.c @@ -130,6 +130,7 @@ static struct clk *_register_gate(struct device *dev, const char *name, return clk; } +#if defined(CONFIG_ARCH_OMAP3) && defined(CONFIG_ATAGS) struct clk *ti_clk_register_gate(struct ti_clk *setup) { const struct clk_ops *ops = &omap_gate_clk_ops; @@ -208,6 +209,7 @@ struct clk_hw *ti_clk_build_component_gate(struct ti_clk_gate *setup) return &gate->hw; } +#endif static void __init _of_ti_gate_clk_setup(struct 
device_node *node, const struct clk_ops *ops, diff --git a/drivers/clk/ti/interface.c b/drivers/clk/ti/interface.c index d71cd9b5de46..265d91f071c5 100644 --- a/drivers/clk/ti/interface.c +++ b/drivers/clk/ti/interface.c @@ -68,6 +68,7 @@ static struct clk *_register_interface(struct device *dev, const char *name, return clk; } +#if defined(CONFIG_ARCH_OMAP3) && defined(CONFIG_ATAGS) struct clk *ti_clk_register_interface(struct ti_clk *setup) { const struct clk_hw_omap_ops *ops = &clkhwops_iclk_wait; @@ -98,6 +99,7 @@ struct clk *ti_clk_register_interface(struct ti_clk *setup) return _register_interface(NULL, setup->name, gate->parent, (void __iomem *)reg, gate->bit_shift, ops); } +#endif static void __init _of_ti_interface_clk_setup(struct device_node *node, const struct clk_hw_omap_ops *ops) diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h index 0eac65054283..67844003493d 100644 --- a/include/linux/clk/ti.h +++ b/include/linux/clk/ti.h @@ -360,9 +360,17 @@ extern const struct clk_hw_omap_ops clkhwops_omap3430es2_iclk_ssi_wait; extern const struct clk_hw_omap_ops clkhwops_omap3430es2_iclk_dss_usbhost_wait; extern const struct clk_hw_omap_ops clkhwops_omap3430es2_iclk_hsotgusb_wait; +#ifdef CONFIG_ATAGS int omap3430_clk_legacy_init(void); int omap3430es1_clk_legacy_init(void); int omap36xx_clk_legacy_init(void); int am35xx_clk_legacy_init(void); +#else +static inline int omap3430_clk_legacy_init(void) { return -ENXIO; } +static inline int omap3430es1_clk_legacy_init(void) { return -ENXIO; } +static inline int omap36xx_clk_legacy_init(void) { return -ENXIO; } +static inline int am35xx_clk_legacy_init(void) { return -ENXIO; } +#endif + #endif -- cgit v1.2.3 From 1a04c6e1a26a43305fe124a0978a3e4be861af89 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Fri, 30 May 2014 18:15:57 +0800 Subject: nfsv3: introduce nfs3_set_ds_client The flexfiles layout wants to create DS connections over NFSv3. Add nfs3_set_ds_client to allow that to happen.
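For illustration only, a layout driver's connect path might call the new helper as below; the wrapper, the transport choice, and the timeout/retrans values are assumptions, while the nfs3_set_ds_client() signature is the one added by this patch (declared in fs/nfs/internal.h):

    #include <linux/nfs_fs_sb.h>
    #include <linux/sunrpc/clnt.h>
    #include <linux/sunrpc/xprt.h>

    static struct nfs_client *
    example_connect_ds_v3(struct nfs_client *mds_clp,
                          const struct sockaddr *ds_addr, int ds_addrlen)
    {
        /* Timeouts are in tenths of a second: 60s, 5 retransmits. */
        return nfs3_set_ds_client(mds_clp, ds_addr, ds_addrlen,
                                  XPRT_TRANSPORT_TCP, 600, 5,
                                  RPC_AUTH_UNIX);
    }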
Signed-off-by: Peng Tao Signed-off-by: Tom Haynes --- fs/nfs/internal.h | 4 ++++ fs/nfs/nfs3_fs.h | 2 ++ fs/nfs/nfs3client.c | 34 ++++++++++++++++++++++++++++++++++ fs/nfs/nfs3super.c | 2 +- include/linux/nfs_fs_sb.h | 9 +++++---- 5 files changed, 46 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 7d7c36ff09fa..7332ba1f693b 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -193,6 +193,10 @@ extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, rpc_authflavor_t au_flavor); extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *, struct inode *); +extern struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp, + const struct sockaddr *ds_addr, int ds_addrlen, + int ds_proto, unsigned int ds_timeo, + unsigned int ds_retrans, rpc_authflavor_t au_flavor); #ifdef CONFIG_PROC_FS extern int __init nfs_fs_proc_init(void); extern void nfs_fs_proc_exit(void); diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h index 333ae4068506..e134d6548ab7 100644 --- a/fs/nfs/nfs3_fs.h +++ b/fs/nfs/nfs3_fs.h @@ -30,5 +30,7 @@ struct nfs_server *nfs3_create_server(struct nfs_mount_info *, struct nfs_subver struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, rpc_authflavor_t); +/* nfs3super.c */ +extern struct nfs_subversion nfs_v3; #endif /* __LINUX_FS_NFS_NFS3_FS_H */ diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c index 8c1b437c5403..52e2344bf9a1 100644 --- a/fs/nfs/nfs3client.c +++ b/fs/nfs/nfs3client.c @@ -64,3 +64,37 @@ struct nfs_server *nfs3_clone_server(struct nfs_server *source, nfs_init_server_aclclient(server); return server; } + +/* + * Set up a pNFS Data Server client over NFSv3. + * + * Return any existing nfs_client that matches server address,port,version + * and minorversion. + * + * For a new nfs_client, use a soft mount (default), a low retrans and a + * low timeout interval so that if a connection is lost, we retry through + * the MDS. + */ +struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp, + const struct sockaddr *ds_addr, int ds_addrlen, + int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, + rpc_authflavor_t au_flavor) +{ + struct nfs_client_initdata cl_init = { + .addr = ds_addr, + .addrlen = ds_addrlen, + .nfs_mod = &nfs_v3, + .proto = ds_proto, + .net = mds_clp->cl_net, + }; + struct rpc_timeout ds_timeout; + struct nfs_client *clp; + + /* Use the MDS nfs_client cl_ipaddr. */ + nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); + clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr, + au_flavor); + + return clp; +} +EXPORT_SYMBOL_GPL(nfs3_set_ds_client); diff --git a/fs/nfs/nfs3super.c b/fs/nfs/nfs3super.c index 6af29c2da352..5c4394e4656b 100644 --- a/fs/nfs/nfs3super.c +++ b/fs/nfs/nfs3super.c @@ -7,7 +7,7 @@ #include "nfs3_fs.h" #include "nfs.h" -static struct nfs_subversion nfs_v3 = { +struct nfs_subversion nfs_v3 = { .owner = THIS_MODULE, .nfs_fs = &nfs_fs_type, .rpc_vers = &nfs_version3, diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index ddea982355f3..5e1273d4de14 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -77,10 +77,6 @@ struct nfs_client { /* Client owner identifier */ const char * cl_owner_id; - /* Our own IP address, as a null-terminated string. - * This is used to generate the mv0 callback address. 
- */ - char cl_ipaddr[48]; u32 cl_cb_ident; /* v4.0 callback identifier */ const struct nfs4_minor_version_ops *cl_mvops; unsigned long cl_mig_gen; @@ -108,6 +104,11 @@ struct nfs_client { #define NFS_SP4_MACH_CRED_COMMIT 6 /* COMMIT */ #endif /* CONFIG_NFS_V4 */ + /* Our own IP address, as a null-terminated string. + * This is used to generate the mv0 callback address. + */ + char cl_ipaddr[48]; + #ifdef CONFIG_NFS_FSCACHE struct fscache_cookie *fscache; /* client index cache cookie */ #endif -- cgit v1.2.3 From abde71f4d3c027a30f8d725e1e22001313b4481a Mon Sep 17 00:00:00 2001 From: Tom Haynes Date: Mon, 9 Jun 2014 13:12:20 -0700 Subject: pnfs: Add nfs_rpc_ops in calls to nfs_initiate_pgio Signed-off-by: Tom Haynes --- fs/nfs/filelayout/filelayout.c | 4 ++-- fs/nfs/internal.h | 1 + fs/nfs/pagelist.c | 6 ++++-- fs/nfs/read.c | 3 ++- fs/nfs/write.c | 6 +++--- include/linux/nfs_page.h | 1 + 6 files changed, 13 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index bc36ed350a68..25c4896887ca 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -501,7 +501,7 @@ filelayout_read_pagelist(struct nfs_pgio_header *hdr) hdr->mds_offset = offset; /* Perform an asynchronous read to ds */ - nfs_initiate_pgio(ds_clnt, hdr, + nfs_initiate_pgio(ds_clnt, hdr, NFS_PROTO(hdr->inode), &filelayout_read_call_ops, 0, RPC_TASK_SOFTCONN); return PNFS_ATTEMPTED; } @@ -542,7 +542,7 @@ filelayout_write_pagelist(struct nfs_pgio_header *hdr, int sync) hdr->args.offset = filelayout_get_dserver_offset(lseg, offset); /* Perform an asynchronous write */ - nfs_initiate_pgio(ds_clnt, hdr, + nfs_initiate_pgio(ds_clnt, hdr, NFS_PROTO(hdr->inode), &filelayout_write_call_ops, sync, RPC_TASK_SOFTCONN); return PNFS_ATTEMPTED; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 5543850268d2..1d15ffa94937 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -251,6 +251,7 @@ void nfs_pgio_header_free(struct nfs_pgio_header *); void nfs_pgio_data_destroy(struct nfs_pgio_header *); int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *); int nfs_initiate_pgio(struct rpc_clnt *, struct nfs_pgio_header *, + const struct nfs_rpc_ops *, const struct rpc_call_ops *, int, int); void nfs_free_request(struct nfs_page *req); diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 2b5e769beb16..35a2626a6922 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -597,6 +597,7 @@ static void nfs_pgio_prepare(struct rpc_task *task, void *calldata) } int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, + const struct nfs_rpc_ops *rpc_ops, const struct rpc_call_ops *call_ops, int how, int flags) { struct rpc_task *task; @@ -616,7 +617,7 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, }; int ret = 0; - hdr->rw_ops->rw_initiate(hdr, &msg, &task_setup_data, how); + hdr->rw_ops->rw_initiate(hdr, &msg, rpc_ops, &task_setup_data, how); dprintk("NFS: %5u initiated pgio call " "(req %s/%llu, %u bytes @ offset %llu)\n", @@ -792,7 +793,8 @@ static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc) ret = nfs_generic_pgio(desc, hdr); if (ret == 0) ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode), - hdr, desc->pg_rpc_callops, + hdr, NFS_PROTO(hdr->inode), + desc->pg_rpc_callops, desc->pg_ioflags, 0); return ret; } diff --git a/fs/nfs/read.c b/fs/nfs/read.c index c91a4799c562..092ab499f2b6 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -168,13 +168,14 
@@ out: static void nfs_initiate_read(struct nfs_pgio_header *hdr, struct rpc_message *msg, + const struct nfs_rpc_ops *rpc_ops, struct rpc_task_setup *task_setup_data, int how) { struct inode *inode = hdr->inode; int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0; task_setup_data->flags |= swap_flags; - NFS_PROTO(inode)->read_setup(hdr, msg); + rpc_ops->read_setup(hdr, msg); } static void diff --git a/fs/nfs/write.c b/fs/nfs/write.c index af3af685a9e3..54d4857e0e2b 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1240,15 +1240,15 @@ static int flush_task_priority(int how) static void nfs_initiate_write(struct nfs_pgio_header *hdr, struct rpc_message *msg, + const struct nfs_rpc_ops *rpc_ops, struct rpc_task_setup *task_setup_data, int how) { - struct inode *inode = hdr->inode; int priority = flush_task_priority(how); task_setup_data->priority = priority; - NFS_PROTO(inode)->write_setup(hdr, msg); + rpc_ops->write_setup(hdr, msg); - nfs4_state_protect_write(NFS_SERVER(inode)->nfs_client, + nfs4_state_protect_write(NFS_SERVER(hdr->inode)->nfs_client, &task_setup_data->rpc_client, msg, hdr); } diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 6c3e06ee2fb7..4c3aa809ab95 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -69,6 +69,7 @@ struct nfs_rw_ops { struct inode *); void (*rw_result)(struct rpc_task *, struct nfs_pgio_header *); void (*rw_initiate)(struct nfs_pgio_header *, struct rpc_message *, + const struct nfs_rpc_ops *, struct rpc_task_setup *, int); }; -- cgit v1.2.3 From 840210fc4872bcbc17ab4f435f28021dce9d0aff Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Tue, 24 Jun 2014 10:59:52 -0400 Subject: sunrpc: add rpc_count_iostats_idx Add a call to tally stats for a task under a different statsidx than what's contained in the task structure. This is needed to properly account for pnfs reads/writes when the DS nfs version != the MDS version. 
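A hedged sketch of the intended use; the wrapper and the choice of index are illustrative, while cl_metrics is the existing per-client stats array of struct rpc_clnt:

    #include <linux/sunrpc/clnt.h>
    #include <linux/sunrpc/metrics.h>

    static void example_count_ds_io(struct rpc_task *task,
                                    struct rpc_clnt *mds_clnt,
                                    unsigned int mds_statidx)
    {
        /*
         * The task ran against a DS whose procedure table differs from
         * the MDS one, so task->tk_msg.rpc_proc->p_statidx would index
         * the wrong slot; tally it under the caller-chosen index instead.
         */
        rpc_count_iostats_metrics(task, &mds_clnt->cl_metrics[mds_statidx]);
    }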
Signed-off-by: Weston Andros Adamson Signed-off-by: Tom Haynes --- include/linux/sunrpc/metrics.h | 2 ++ net/sunrpc/stats.c | 26 +++++++++++++++++++------- 2 files changed, 21 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/metrics.h b/include/linux/sunrpc/metrics.h index eecb5a71e6c0..89f2ca178873 100644 --- a/include/linux/sunrpc/metrics.h +++ b/include/linux/sunrpc/metrics.h @@ -79,6 +79,8 @@ struct rpc_clnt; struct rpc_iostats * rpc_alloc_iostats(struct rpc_clnt *); void rpc_count_iostats(const struct rpc_task *, struct rpc_iostats *); +void rpc_count_iostats_metrics(const struct rpc_task *, + struct rpc_iostats *); void rpc_print_iostats(struct seq_file *, struct rpc_clnt *); void rpc_free_iostats(struct rpc_iostats *); diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c index 9711a155bc50..2ecb994314c1 100644 --- a/net/sunrpc/stats.c +++ b/net/sunrpc/stats.c @@ -140,22 +140,20 @@ void rpc_free_iostats(struct rpc_iostats *stats) EXPORT_SYMBOL_GPL(rpc_free_iostats); /** - * rpc_count_iostats - tally up per-task stats + * rpc_count_iostats_metrics - tally up per-task stats * @task: completed rpc_task - * @stats: array of stat structures + * @op_metrics: stat structure for OP that will accumulate stats from @task */ -void rpc_count_iostats(const struct rpc_task *task, struct rpc_iostats *stats) +void rpc_count_iostats_metrics(const struct rpc_task *task, + struct rpc_iostats *op_metrics) { struct rpc_rqst *req = task->tk_rqstp; - struct rpc_iostats *op_metrics; ktime_t delta, now; - if (!stats || !req) + if (!op_metrics || !req) return; now = ktime_get(); - op_metrics = &stats[task->tk_msg.rpc_proc->p_statidx]; - spin_lock(&op_metrics->om_lock); op_metrics->om_ops++; @@ -175,6 +173,20 @@ void rpc_count_iostats(const struct rpc_task *task, struct rpc_iostats *stats) spin_unlock(&op_metrics->om_lock); } +EXPORT_SYMBOL_GPL(rpc_count_iostats_metrics); + +/** + * rpc_count_iostats - tally up per-task stats + * @task: completed rpc_task + * @stats: array of stat structures + * + * Uses the statidx from @task + */ +void rpc_count_iostats(const struct rpc_task *task, struct rpc_iostats *stats) +{ + rpc_count_iostats_metrics(task, + &stats[task->tk_msg.rpc_proc->p_statidx]); +} EXPORT_SYMBOL_GPL(rpc_count_iostats); static void _print_name(struct seq_file *seq, unsigned int op, -- cgit v1.2.3 From aabff4ddcac0d36dd26546f5b905c27682e7bf89 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Wed, 27 Aug 2014 10:47:14 +0800 Subject: nfs: save server READ/WRITE/COMMIT status Flexfiles layout would want to use them to report DS IO status. 
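A hedged sketch of a consumer of the saved status (the callback and the fallback policy are hypothetical; op_status is the field this patch adds to nfs_pgio_res):

    #include <linux/nfs.h>
    #include <linux/nfs_xdr.h>
    #include <linux/errno.h>

    static int example_ds_read_done(struct rpc_task *task,
                                    struct nfs_pgio_header *hdr)
    {
        /* op_status is the NFS-level status, not the RPC status. */
        if (hdr->res.op_status != NFS_OK)
            return -EAGAIN;    /* e.g. resend the I/O through the MDS */

        return 0;
    }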
Signed-off-by: Peng Tao Signed-off-by: Tom Haynes --- fs/nfs/nfs2xdr.c | 10 +++++++--- fs/nfs/nfs3xdr.c | 3 +++ fs/nfs/nfs4xdr.c | 3 +++ include/linux/nfs_xdr.h | 2 ++ 4 files changed, 15 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 5f61b83f4a1c..b4e03ed8599d 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -481,7 +481,8 @@ out_overflow: * void; * }; */ -static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result) +static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result, + __u32 *op_status) { enum nfs_stat status; int error; @@ -489,6 +490,8 @@ static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result) error = decode_stat(xdr, &status); if (unlikely(error)) goto out; + if (op_status) + *op_status = status; if (status != NFS_OK) goto out_default; error = decode_fattr(xdr, result); @@ -808,7 +811,7 @@ out_default: static int nfs2_xdr_dec_attrstat(struct rpc_rqst *req, struct xdr_stream *xdr, struct nfs_fattr *result) { - return decode_attrstat(xdr, result); + return decode_attrstat(xdr, result, NULL); } static int nfs2_xdr_dec_diropres(struct rpc_rqst *req, struct xdr_stream *xdr, @@ -865,6 +868,7 @@ static int nfs2_xdr_dec_readres(struct rpc_rqst *req, struct xdr_stream *xdr, error = decode_stat(xdr, &status); if (unlikely(error)) goto out; + result->op_status = status; if (status != NFS_OK) goto out_default; error = decode_fattr(xdr, result->fattr); @@ -882,7 +886,7 @@ static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, struct xdr_stream *xdr, { /* All NFSv2 writes are "file sync" writes */ result->verf->committed = NFS_FILE_SYNC; - return decode_attrstat(xdr, result->fattr); + return decode_attrstat(xdr, result->fattr, &result->op_status); } /** diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 8f4cbe7f4aa8..2a932fdc57cb 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -1636,6 +1636,7 @@ static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr, error = decode_post_op_attr(xdr, result->fattr); if (unlikely(error)) goto out; + result->op_status = status; if (status != NFS3_OK) goto out_status; error = decode_read3resok(xdr, result); @@ -1708,6 +1709,7 @@ static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr, error = decode_wcc_data(xdr, result->fattr); if (unlikely(error)) goto out; + result->op_status = status; if (status != NFS3_OK) goto out_status; error = decode_write3resok(xdr, result); @@ -2323,6 +2325,7 @@ static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req, error = decode_wcc_data(xdr, result->fattr); if (unlikely(error)) goto out; + result->op_status = status; if (status != NFS3_OK) goto out_status; error = decode_writeverf3(xdr, &result->verf->verifier); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index cb4376b78ed9..7d8d7a47f771 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -6567,6 +6567,7 @@ static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, struct xdr_stream *xdr, int status; status = decode_compound_hdr(xdr, &hdr); + res->op_status = hdr.status; if (status) goto out; status = decode_sequence(xdr, &res->seq_res, rqstp); @@ -6592,6 +6593,7 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr, int status; status = decode_compound_hdr(xdr, &hdr); + res->op_status = hdr.status; if (status) goto out; status = decode_sequence(xdr, &res->seq_res, rqstp); @@ -6621,6 +6623,7 @@ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream 
*xdr, int status; status = decode_compound_hdr(xdr, &hdr); + res->op_status = hdr.status; if (status) goto out; status = decode_sequence(xdr, &res->seq_res, rqstp); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 467c84efb596..962f461c065d 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -513,6 +513,7 @@ struct nfs_pgio_res { struct nfs4_sequence_res seq_res; struct nfs_fattr * fattr; __u32 count; + __u32 op_status; int eof; /* used by read */ struct nfs_writeverf * verf; /* used by write */ const struct nfs_server *server; /* used by write */ @@ -532,6 +533,7 @@ struct nfs_commitargs { struct nfs_commitres { struct nfs4_sequence_res seq_res; + __u32 op_status; struct nfs_fattr *fattr; struct nfs_writeverf *verf; const struct nfs_server *server; -- cgit v1.2.3 From 4579d6b897ee1b2557517fd536fb17eeb13481ad Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Sat, 6 Sep 2014 00:53:21 +0800 Subject: nfs41: pass iomode through layoutreturn args So that it is possible to return layouts of a specific iomode. Signed-off-by: Peng Tao Signed-off-by: Tom Haynes --- fs/nfs/nfs4xdr.c | 2 +- fs/nfs/pnfs.c | 1 + include/linux/nfs_xdr.h | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 7d8d7a47f771..3c3ff633dd17 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -2012,7 +2012,7 @@ encode_layoutreturn(struct xdr_stream *xdr, p = reserve_space(xdr, 16); *p++ = cpu_to_be32(0); /* reclaim. always 0 for now */ *p++ = cpu_to_be32(args->layout_type); - *p++ = cpu_to_be32(IOMODE_ANY); + *p++ = cpu_to_be32(args->iomode); *p = cpu_to_be32(RETURN_FILE); p = reserve_space(xdr, 16); p = xdr_encode_hyper(p, 0); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 7e1bac189d1c..1b544c1a746c 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -914,6 +914,7 @@ _pnfs_return_layout(struct inode *ino) lrp->args.stateid = stateid; lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id; lrp->args.inode = ino; + lrp->args.iomode = IOMODE_ANY; lrp->args.layout = lo; lrp->clp = NFS_SERVER(ino)->nfs_client; lrp->cred = lo->plh_lc_cred; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 962f461c065d..4fd7793d45d1 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -293,6 +293,7 @@ struct nfs4_layoutreturn_args { struct nfs4_sequence_args seq_args; struct pnfs_layout_hdr *layout; struct inode *inode; + enum pnfs_iomode iomode; nfs4_stateid stateid; __u32 layout_type; }; -- cgit v1.2.3 From 2176bf4269a37a7742230ed6c91668241bfe1b2b Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Wed, 10 Sep 2014 15:44:18 -0400 Subject: nfs: introduce pg_cleanup op for pgio descriptors Add a new operation to nfs_pageio_ops that is called on nfs_pageio_complete.
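A minimal sketch of a pgio caller opting in (my_pg_doio() and my_pg_cleanup() are hypothetical stand-ins; only the .pg_cleanup slot is what this patch adds):

#include <linux/nfs_page.h>

static int my_pg_doio(struct nfs_pageio_descriptor *desc)
{
	/* submit whatever has been coalesced; caller-specific */
	return 0;
}

static void my_pg_cleanup(struct nfs_pageio_descriptor *desc)
{
	/* release per-descriptor state taken in ->pg_init; pnfs uses
	 * this point to drop its layout segment reference, for example */
}

static const struct nfs_pageio_ops my_pageio_ops = {
	.pg_test	= nfs_generic_pg_test,
	.pg_doio	= my_pg_doio,
	.pg_cleanup	= my_pg_cleanup,	/* new, optional hook */
};

The hook is optional: nfs_pageio_complete() invokes it only when the ops table sets it, so existing callers are unchanged.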
Signed-off-by: Weston Andros Adamson --- fs/nfs/pagelist.c | 5 ++++- include/linux/nfs_page.h | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index c4d175829880..1c031878c752 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -1050,7 +1050,7 @@ int nfs_pageio_resend(struct nfs_pageio_descriptor *desc, EXPORT_SYMBOL_GPL(nfs_pageio_resend); /** - * nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor + * nfs_pageio_complete - Complete I/O then cleanup an nfs_pageio_descriptor * @desc: pointer to io descriptor */ void nfs_pageio_complete(struct nfs_pageio_descriptor *desc) @@ -1062,6 +1062,9 @@ void nfs_pageio_complete(struct nfs_pageio_descriptor *desc) if (!nfs_do_recoalesce(desc)) break; } + + if (desc->pg_ops->pg_cleanup) + desc->pg_ops->pg_cleanup(desc); } /** diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 4c3aa809ab95..479c566c4ddc 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -58,6 +58,7 @@ struct nfs_pageio_ops { size_t (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); int (*pg_doio)(struct nfs_pageio_descriptor *); + void (*pg_cleanup)(struct nfs_pageio_descriptor *); }; struct nfs_rw_ops { -- cgit v1.2.3 From 6cccbb6f52dceec5f4faed8846ac05ae830640e6 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Tue, 16 Sep 2014 17:35:51 -0400 Subject: nfs: rename pgio header ds_idx to ds_commit_idx 'ds_commit_idx' is a better name - it is used to select the right commit bucket for pnfs. Signed-off-by: Weston Andros Adamson --- fs/nfs/direct.c | 14 ++++++-------- fs/nfs/filelayout/filelayout.c | 4 ++-- include/linux/nfs_xdr.h | 2 +- 3 files changed, 9 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index e84f764b9dcd..d7c2d430b04d 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -112,22 +112,22 @@ static inline int put_dreq(struct nfs_direct_req *dreq) * nfs_direct_select_verf - select the right verifier * @dreq - direct request possibly spanning multiple servers * @ds_clp - nfs_client of data server or NULL if MDS / non-pnfs - * @ds_idx - index of data server in data server list, only valid if ds_clp set + * @commit_idx - commit bucket index for the DS * * returns the correct verifier to use given the role of the server */ static struct nfs_writeverf * nfs_direct_select_verf(struct nfs_direct_req *dreq, struct nfs_client *ds_clp, - int ds_idx) + int commit_idx) { struct nfs_writeverf *verfp = &dreq->verf; #ifdef CONFIG_NFS_V4_1 if (ds_clp) { /* pNFS is in use, use the DS verf */ - if (ds_idx >= 0 && ds_idx < dreq->ds_cinfo.nbuckets) - verfp = &dreq->ds_cinfo.buckets[ds_idx].direct_verf; + if (commit_idx >= 0 && commit_idx < dreq->ds_cinfo.nbuckets) + verfp = &dreq->ds_cinfo.buckets[commit_idx].direct_verf; else WARN_ON_ONCE(1); } @@ -148,8 +148,7 @@ static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq, { struct nfs_writeverf *verfp; - verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, - hdr->ds_idx); + verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, hdr->ds_commit_idx); WARN_ON_ONCE(verfp->committed >= 0); memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); WARN_ON_ONCE(verfp->committed < 0); @@ -169,8 +168,7 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq, { struct nfs_writeverf *verfp; - verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, - hdr->ds_idx); + verfp = nfs_direct_select_verf(dreq, 
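The shape of the new callback, as a hedged sketch (the fixed fan-out of 2 is assumed purely for illustration; the flexfiles driver added later in this series derives the count from the layout segment instead):

#include <linux/nfs_page.h>

static unsigned int
example_pg_get_mirror_count(struct nfs_pageio_descriptor *pgio,
			    struct nfs_page *req)
{
	/* must be at least 1 and at most NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX
	 * (16); anything else makes nfs_pageio_setup_mirroring() fail
	 * with -EINVAL */
	return 2;
}

Merely setting .pg_get_mirror_count in an nfs_pageio_ops table makes nfs_pageio_init() switch to the dynamically allocated mirror array; callers that leave it NULL keep the single static mirror and see no change in behavior.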
hdr->ds_clp, hdr->ds_commit_idx); if (verfp->committed < 0) { nfs_direct_set_hdr_verf(dreq, hdr); return 0; diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 2af32fc39d60..520cbc53e035 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -492,7 +492,7 @@ filelayout_read_pagelist(struct nfs_pgio_header *hdr) /* No multipath support. Use first DS */ atomic_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; - hdr->ds_idx = idx; + hdr->ds_commit_idx = idx; fh = nfs4_fl_select_ds_fh(lseg, j); if (fh) hdr->args.fh = fh; @@ -536,7 +536,7 @@ filelayout_write_pagelist(struct nfs_pgio_header *hdr, int sync) hdr->pgio_done_cb = filelayout_write_done_cb; atomic_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; - hdr->ds_idx = idx; + hdr->ds_commit_idx = idx; fh = nfs4_fl_select_ds_fh(lseg, j); if (fh) hdr->args.fh = fh; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 4fd7793d45d1..5bc99f04a550 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1328,7 +1328,7 @@ struct nfs_pgio_header { __u64 mds_offset; /* Filelayout dense stripe */ struct nfs_page_array page_array; struct nfs_client *ds_clp; /* pNFS data server */ - int ds_idx; /* ds index if ds_clp is set */ + int ds_commit_idx; /* ds index if ds_clp is set */ }; struct nfs_mds_commit_info { -- cgit v1.2.3 From a7d42ddb3099727f58366fa006f850a219cce6c8 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Fri, 19 Sep 2014 10:55:07 -0400 Subject: nfs: add mirroring support to pgio layer This patch adds mirrored write support to the pgio layer. The default is to use one mirror, but pgio callers may define callbacks to change this to any value up to the (arbitrarily selected) limit of 16. The basic idea is to break out members of nfs_pageio_descriptor that cannot be shared between mirrored DSes and put them in a new structure. 
Signed-off-by: Weston Andros Adamson --- fs/nfs/direct.c | 17 ++- fs/nfs/internal.h | 1 + fs/nfs/objlayout/objio_osd.c | 3 +- fs/nfs/pagelist.c | 270 +++++++++++++++++++++++++++++++++++-------- fs/nfs/pnfs.c | 26 +++-- fs/nfs/read.c | 30 ++++- fs/nfs/write.c | 10 +- include/linux/nfs_page.h | 20 +++- include/linux/nfs_xdr.h | 1 + 9 files changed, 311 insertions(+), 67 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 1ee41d74c31c..0178d4fe8ab7 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -360,8 +360,14 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr) spin_lock(&dreq->lock); if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes == 0)) dreq->error = hdr->error; - else - dreq->count += hdr->good_bytes; + else { + /* + * FIXME: right now this only accounts for bytes written + * to the first mirror + */ + if (hdr->pgio_mirror_idx == 0) + dreq->count += hdr->good_bytes; + } spin_unlock(&dreq->lock); while (!list_empty(&hdr->pages)) { @@ -724,7 +730,12 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) dreq->error = hdr->error; } if (dreq->error == 0) { - dreq->count += hdr->good_bytes; + /* + * FIXME: right now this only accounts for bytes written + * to the first mirror + */ + if (hdr->pgio_mirror_idx == 0) + dreq->count += hdr->good_bytes; if (nfs_write_need_commit(hdr)) { if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) request_commit = true; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 05f9a87cdab4..ef1c703e487b 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -469,6 +469,7 @@ void nfs_init_cinfo(struct nfs_commit_info *cinfo, struct nfs_direct_req *dreq); int nfs_key_timeout_notify(struct file *filp, struct inode *inode); bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx); +void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio); #ifdef CONFIG_MIGRATION extern int nfs_migrate_page(struct address_space *, diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index d00778077df1..9a5f2ee6001f 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -537,11 +537,12 @@ int objio_write_pagelist(struct nfs_pgio_header *hdr, int how) static size_t objio_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req) { + struct nfs_pgio_mirror *mirror = &pgio->pg_mirrors[pgio->pg_mirror_idx]; unsigned int size; size = pnfs_generic_pg_test(pgio, prev, req); - if (!size || pgio->pg_count + req->wb_bytes > + if (!size || mirror->pg_count + req->wb_bytes > (unsigned long)pgio->pg_layout_private) return 0; diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 1c031878c752..eec12b75c232 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -46,17 +46,22 @@ void nfs_pgheader_init(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr, void (*release)(struct nfs_pgio_header *hdr)) { - hdr->req = nfs_list_entry(desc->pg_list.next); + struct nfs_pgio_mirror *mirror = &desc->pg_mirrors[desc->pg_mirror_idx]; + + + hdr->req = nfs_list_entry(mirror->pg_list.next); hdr->inode = desc->pg_inode; hdr->cred = hdr->req->wb_context->cred; hdr->io_start = req_offset(hdr->req); - hdr->good_bytes = desc->pg_count; + hdr->good_bytes = mirror->pg_count; hdr->dreq = desc->pg_dreq; hdr->layout_private = desc->pg_layout_private; hdr->release = release; hdr->completion_ops = desc->pg_completion_ops; if (hdr->completion_ops->init_hdr) hdr->completion_ops->init_hdr(hdr); + + hdr->pgio_mirror_idx = 
desc->pg_mirror_idx; } EXPORT_SYMBOL_GPL(nfs_pgheader_init); @@ -480,7 +485,10 @@ nfs_wait_on_request(struct nfs_page *req) size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_page *prev, struct nfs_page *req) { - if (desc->pg_count > desc->pg_bsize) { + struct nfs_pgio_mirror *mirror = &desc->pg_mirrors[desc->pg_mirror_idx]; + + + if (mirror->pg_count > mirror->pg_bsize) { /* should never happen */ WARN_ON_ONCE(1); return 0; @@ -490,11 +498,11 @@ size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, * Limit the request size so that we can still allocate a page array * for it without upsetting the slab allocator. */ - if (((desc->pg_count + req->wb_bytes) >> PAGE_SHIFT) * + if (((mirror->pg_count + req->wb_bytes) >> PAGE_SHIFT) * sizeof(struct page) > PAGE_SIZE) return 0; - return min(desc->pg_bsize - desc->pg_count, (size_t)req->wb_bytes); + return min(mirror->pg_bsize - mirror->pg_count, (size_t)req->wb_bytes); } EXPORT_SYMBOL_GPL(nfs_generic_pg_test); @@ -651,10 +659,18 @@ EXPORT_SYMBOL_GPL(nfs_initiate_pgio); static int nfs_pgio_error(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) { + struct nfs_pgio_mirror *mirror; + u32 midx; + set_bit(NFS_IOHDR_REDO, &hdr->flags); nfs_pgio_data_destroy(hdr); hdr->completion_ops->completion(hdr); - desc->pg_completion_ops->error_cleanup(&desc->pg_list); + /* TODO: Make sure it's right to clean up all mirrors here + * and not just hdr->pgio_mirror_idx */ + for (midx = 0; midx < desc->pg_mirror_count; midx++) { + mirror = &desc->pg_mirrors[midx]; + desc->pg_completion_ops->error_cleanup(&mirror->pg_list); + } return -ENOMEM; } @@ -671,6 +687,17 @@ static void nfs_pgio_release(void *calldata) hdr->completion_ops->completion(hdr); } +static void nfs_pageio_mirror_init(struct nfs_pgio_mirror *mirror, + unsigned int bsize) +{ + INIT_LIST_HEAD(&mirror->pg_list); + mirror->pg_bytes_written = 0; + mirror->pg_count = 0; + mirror->pg_bsize = bsize; + mirror->pg_base = 0; + mirror->pg_recoalesce = 0; +} + /** * nfs_pageio_init - initialise a page io descriptor * @desc: pointer to descriptor @@ -687,13 +714,10 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, size_t bsize, int io_flags) { - INIT_LIST_HEAD(&desc->pg_list); - desc->pg_bytes_written = 0; - desc->pg_count = 0; - desc->pg_bsize = bsize; - desc->pg_base = 0; + struct nfs_pgio_mirror *new; + int i; + desc->pg_moreio = 0; - desc->pg_recoalesce = 0; desc->pg_inode = inode; desc->pg_ops = pg_ops; desc->pg_completion_ops = compl_ops; @@ -703,6 +727,26 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, desc->pg_lseg = NULL; desc->pg_dreq = NULL; desc->pg_layout_private = NULL; + desc->pg_bsize = bsize; + + desc->pg_mirror_count = 1; + desc->pg_mirror_idx = 0; + + if (pg_ops->pg_get_mirror_count) { + /* until we have a request, we don't have an lseg and no + * idea how many mirrors there will be */ + new = kcalloc(NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX, + sizeof(struct nfs_pgio_mirror), GFP_KERNEL); + desc->pg_mirrors_dynamic = new; + desc->pg_mirrors = new; + + for (i = 0; i < NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX; i++) + nfs_pageio_mirror_init(&desc->pg_mirrors[i], bsize); + } else { + desc->pg_mirrors_dynamic = NULL; + desc->pg_mirrors = desc->pg_mirrors_static; + nfs_pageio_mirror_init(&desc->pg_mirrors[0], bsize); + } } EXPORT_SYMBOL_GPL(nfs_pageio_init); @@ -738,14 +782,16 @@ static void nfs_pgio_result(struct rpc_task *task, void *calldata) int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) { + struct 
nfs_pgio_mirror *mirror = &desc->pg_mirrors[desc->pg_mirror_idx]; + struct nfs_page *req; struct page **pages, *last_page; - struct list_head *head = &desc->pg_list; + struct list_head *head = &mirror->pg_list; struct nfs_commit_info cinfo; unsigned int pagecount, pageused; - pagecount = nfs_page_array_len(desc->pg_base, desc->pg_count); + pagecount = nfs_page_array_len(mirror->pg_base, mirror->pg_count); if (!nfs_pgarray_set(&hdr->page_array, pagecount)) return nfs_pgio_error(desc, hdr); @@ -773,7 +819,7 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, desc->pg_ioflags &= ~FLUSH_COND_STABLE; /* Set up the argument struct */ - nfs_pgio_rpcsetup(hdr, desc->pg_count, 0, desc->pg_ioflags, &cinfo); + nfs_pgio_rpcsetup(hdr, mirror->pg_count, 0, desc->pg_ioflags, &cinfo); desc->pg_rpc_callops = &nfs_pgio_common_ops; return 0; } @@ -781,12 +827,17 @@ EXPORT_SYMBOL_GPL(nfs_generic_pgio); static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc) { + struct nfs_pgio_mirror *mirror; struct nfs_pgio_header *hdr; int ret; + mirror = &desc->pg_mirrors[desc->pg_mirror_idx]; + hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); if (!hdr) { - desc->pg_completion_ops->error_cleanup(&desc->pg_list); + /* TODO: make sure this is right with mirroring - or + * should it back out all mirrors? */ + desc->pg_completion_ops->error_cleanup(&mirror->pg_list); return -ENOMEM; } nfs_pgheader_init(desc, hdr, nfs_pgio_header_free); @@ -801,6 +852,49 @@ static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc) return ret; } +/* + * nfs_pageio_setup_mirroring - determine if mirroring is to be used + * by calling the pg_get_mirror_count op + */ +static int nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req) +{ + int mirror_count = 1; + + if (!pgio->pg_ops->pg_get_mirror_count) + return 0; + + mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req); + + if (!mirror_count || mirror_count > NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX) + return -EINVAL; + + if (WARN_ON_ONCE(!pgio->pg_mirrors_dynamic)) + return -EINVAL; + + pgio->pg_mirror_count = mirror_count; + + return 0; +} + +/* + * nfs_pageio_stop_mirroring - stop using mirroring (set mirror count to 1) + */ +void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio) +{ + pgio->pg_mirror_count = 1; + pgio->pg_mirror_idx = 0; +} + +static void nfs_pageio_cleanup_mirroring(struct nfs_pageio_descriptor *pgio) +{ + pgio->pg_mirror_count = 1; + pgio->pg_mirror_idx = 0; + pgio->pg_mirrors = pgio->pg_mirrors_static; + kfree(pgio->pg_mirrors_dynamic); + pgio->pg_mirrors_dynamic = NULL; +} + static bool nfs_match_open_context(const struct nfs_open_context *ctx1, const struct nfs_open_context *ctx2) { @@ -867,19 +961,22 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev, static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, struct nfs_page *req) { + struct nfs_pgio_mirror *mirror = &desc->pg_mirrors[desc->pg_mirror_idx]; + struct nfs_page *prev = NULL; - if (desc->pg_count != 0) { - prev = nfs_list_entry(desc->pg_list.prev); + + if (mirror->pg_count != 0) { + prev = nfs_list_entry(mirror->pg_list.prev); } else { if (desc->pg_ops->pg_init) desc->pg_ops->pg_init(desc, req); - desc->pg_base = req->wb_pgbase; + mirror->pg_base = req->wb_pgbase; } if (!nfs_can_coalesce_requests(prev, req, desc)) return 0; nfs_list_remove_request(req); - nfs_list_add_request(req, &desc->pg_list); - desc->pg_count += req->wb_bytes; + nfs_list_add_request(req, &mirror->pg_list); + mirror->pg_count += req->wb_bytes; 
return 1; } @@ -888,16 +985,19 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, */ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) { - if (!list_empty(&desc->pg_list)) { + struct nfs_pgio_mirror *mirror = &desc->pg_mirrors[desc->pg_mirror_idx]; + + + if (!list_empty(&mirror->pg_list)) { int error = desc->pg_ops->pg_doio(desc); if (error < 0) desc->pg_error = error; else - desc->pg_bytes_written += desc->pg_count; + mirror->pg_bytes_written += mirror->pg_count; } - if (list_empty(&desc->pg_list)) { - desc->pg_count = 0; - desc->pg_base = 0; + if (list_empty(&mirror->pg_list)) { + mirror->pg_count = 0; + mirror->pg_base = 0; } } @@ -915,10 +1015,14 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, struct nfs_page *req) { + struct nfs_pgio_mirror *mirror = &desc->pg_mirrors[desc->pg_mirror_idx]; + struct nfs_page *subreq; unsigned int bytes_left = 0; unsigned int offset, pgbase; + WARN_ON_ONCE(desc->pg_mirror_idx >= desc->pg_mirror_count); + nfs_page_group_lock(req, false); subreq = req; @@ -938,7 +1042,7 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, nfs_pageio_doio(desc); if (desc->pg_error < 0) return 0; - if (desc->pg_recoalesce) + if (mirror->pg_recoalesce) return 0; /* retry add_request for this subreq */ nfs_page_group_lock(req, false); @@ -976,14 +1080,16 @@ err_ptr: static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) { + struct nfs_pgio_mirror *mirror = &desc->pg_mirrors[desc->pg_mirror_idx]; LIST_HEAD(head); do { - list_splice_init(&desc->pg_list, &head); - desc->pg_bytes_written -= desc->pg_count; - desc->pg_count = 0; - desc->pg_base = 0; - desc->pg_recoalesce = 0; + list_splice_init(&mirror->pg_list, &head); + mirror->pg_bytes_written -= mirror->pg_count; + mirror->pg_count = 0; + mirror->pg_base = 0; + mirror->pg_recoalesce = 0; + desc->pg_moreio = 0; while (!list_empty(&head)) { @@ -997,11 +1103,11 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) return 0; break; } - } while (desc->pg_recoalesce); + } while (mirror->pg_recoalesce); return 1; } -int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, +static int nfs_pageio_add_request_mirror(struct nfs_pageio_descriptor *desc, struct nfs_page *req) { int ret; @@ -1014,9 +1120,78 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, break; ret = nfs_do_recoalesce(desc); } while (ret); + return ret; } +int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, + struct nfs_page *req) +{ + u32 midx; + unsigned int pgbase, offset, bytes; + struct nfs_page *dupreq, *lastreq; + + pgbase = req->wb_pgbase; + offset = req->wb_offset; + bytes = req->wb_bytes; + + nfs_pageio_setup_mirroring(desc, req); + + for (midx = 0; midx < desc->pg_mirror_count; midx++) { + if (midx) { + nfs_page_group_lock(req, false); + + /* find the last request */ + for (lastreq = req->wb_head; + lastreq->wb_this_page != req->wb_head; + lastreq = lastreq->wb_this_page) + ; + + dupreq = nfs_create_request(req->wb_context, + req->wb_page, lastreq, pgbase, bytes); + + if (IS_ERR(dupreq)) { + nfs_page_group_unlock(req); + return 0; + } + + nfs_lock_request(dupreq); + nfs_page_group_unlock(req); + dupreq->wb_offset = offset; + dupreq->wb_index = req->wb_index; + } else + dupreq = req; + + desc->pg_mirror_idx = midx; + if (!nfs_pageio_add_request_mirror(desc, dupreq)) + return 0; + } + + return 1; +} + +/* + * nfs_pageio_complete_mirror - Complete I/O on the 
current mirror of an + * nfs_pageio_descriptor + * @desc: pointer to io descriptor + */ +static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc, + u32 mirror_idx) +{ + struct nfs_pgio_mirror *mirror = &desc->pg_mirrors[mirror_idx]; + u32 restore_idx = desc->pg_mirror_idx; + + desc->pg_mirror_idx = mirror_idx; + for (;;) { + nfs_pageio_doio(desc); + if (!mirror->pg_recoalesce) + break; + if (!nfs_do_recoalesce(desc)) + break; + } + desc->pg_mirror_idx = restore_idx; +} + /* * nfs_pageio_resend - Transfer requests to new descriptor and resend * @hdr - the pgio header to move request from @@ -1055,16 +1230,14 @@ EXPORT_SYMBOL_GPL(nfs_pageio_resend); */ void nfs_pageio_complete(struct nfs_pageio_descriptor *desc) { - for (;;) { - nfs_pageio_doio(desc); - if (!desc->pg_recoalesce) - break; - if (!nfs_do_recoalesce(desc)) - break; - } + u32 midx; + + for (midx = 0; midx < desc->pg_mirror_count; midx++) + nfs_pageio_complete_mirror(desc, midx); if (desc->pg_ops->pg_cleanup) desc->pg_ops->pg_cleanup(desc); + nfs_pageio_cleanup_mirroring(desc); } /** @@ -1080,10 +1253,17 @@ void nfs_pageio_complete(struct nfs_pageio_descriptor *desc) */ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index) { - if (!list_empty(&desc->pg_list)) { - struct nfs_page *prev = nfs_list_entry(desc->pg_list.prev); - if (index != prev->wb_index + 1) - nfs_pageio_complete(desc); + struct nfs_pgio_mirror *mirror; + struct nfs_page *prev; + u32 midx; + + for (midx = 0; midx < desc->pg_mirror_count; midx++) { + mirror = &desc->pg_mirrors[midx]; + if (!list_empty(&mirror->pg_list)) { + prev = nfs_list_entry(mirror->pg_list.prev); + if (index != prev->wb_index + 1) + nfs_pageio_complete_mirror(desc, midx); + } } } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 2da2e771fefe..5f7c422ebb5d 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1646,8 +1646,8 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_cleanup); * of bytes (maximum @req->wb_bytes) that can be coalesced. 
*/ size_t -pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, - struct nfs_page *req) +pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, + struct nfs_page *prev, struct nfs_page *req) { unsigned int size; u64 seg_end, req_start, seg_left; @@ -1729,10 +1729,12 @@ static void pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) { + struct nfs_pgio_mirror *mirror = &desc->pg_mirrors[desc->pg_mirror_idx]; + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { - list_splice_tail_init(&hdr->pages, &desc->pg_list); + list_splice_tail_init(&hdr->pages, &mirror->pg_list); nfs_pageio_reset_write_mds(desc); - desc->pg_recoalesce = 1; + mirror->pg_recoalesce = 1; } nfs_pgio_data_destroy(hdr); } @@ -1781,12 +1783,14 @@ EXPORT_SYMBOL_GPL(pnfs_writehdr_free); int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) { + struct nfs_pgio_mirror *mirror = &desc->pg_mirrors[desc->pg_mirror_idx]; + struct nfs_pgio_header *hdr; int ret; hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); if (!hdr) { - desc->pg_completion_ops->error_cleanup(&desc->pg_list); + desc->pg_completion_ops->error_cleanup(&mirror->pg_list); return -ENOMEM; } nfs_pgheader_init(desc, hdr, pnfs_writehdr_free); @@ -1795,6 +1799,7 @@ pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) ret = nfs_generic_pgio(desc, hdr); if (!ret) pnfs_do_write(desc, hdr, desc->pg_ioflags); + return ret; } EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages); @@ -1839,10 +1844,13 @@ static void pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) { + struct nfs_pgio_mirror *mirror = &desc->pg_mirrors[desc->pg_mirror_idx]; + + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { - list_splice_tail_init(&hdr->pages, &desc->pg_list); + list_splice_tail_init(&hdr->pages, &mirror->pg_list); nfs_pageio_reset_read_mds(desc); - desc->pg_recoalesce = 1; + mirror->pg_recoalesce = 1; } nfs_pgio_data_destroy(hdr); } @@ -1893,12 +1901,14 @@ EXPORT_SYMBOL_GPL(pnfs_readhdr_free); int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) { + struct nfs_pgio_mirror *mirror = &desc->pg_mirrors[desc->pg_mirror_idx]; + struct nfs_pgio_header *hdr; int ret; hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); if (!hdr) { - desc->pg_completion_ops->error_cleanup(&desc->pg_list); + desc->pg_completion_ops->error_cleanup(&mirror->pg_list); return -ENOMEM; } nfs_pgheader_init(desc, hdr, pnfs_readhdr_free); diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 092ab499f2b6..568ecf0a880f 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -70,8 +70,15 @@ EXPORT_SYMBOL_GPL(nfs_pageio_init_read); void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio) { + struct nfs_pgio_mirror *mirror; + pgio->pg_ops = &nfs_pgio_rw_ops; - pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->rsize; + + /* read path should never have more than one mirror */ + WARN_ON_ONCE(pgio->pg_mirror_count != 1); + + mirror = &pgio->pg_mirrors[0]; + mirror->pg_bsize = NFS_SERVER(pgio->pg_inode)->rsize; } EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds); @@ -81,6 +88,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, struct nfs_page *new; unsigned int len; struct nfs_pageio_descriptor pgio; + struct nfs_pgio_mirror *pgm; len = nfs_page_length(page); if (len == 0) @@ -97,7 +105,13 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, &nfs_async_read_completion_ops); nfs_pageio_add_request(&pgio, new); nfs_pageio_complete(&pgio); - NFS_I(inode)->read_io += 
pgio.pg_bytes_written; + + /* It doesn't make sense to do mirrored reads! */ + WARN_ON_ONCE(pgio.pg_mirror_count != 1); + + pgm = &pgio.pg_mirrors[0]; + NFS_I(inode)->read_io += pgm->pg_bytes_written; + return 0; } @@ -352,6 +366,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { struct nfs_pageio_descriptor pgio; + struct nfs_pgio_mirror *pgm; struct nfs_readdesc desc = { .pgio = &pgio, }; @@ -387,10 +402,15 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, &nfs_async_read_completion_ops); ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); - nfs_pageio_complete(&pgio); - NFS_I(inode)->read_io += pgio.pg_bytes_written; - npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + + /* It doesn't make sense to do mirrored reads! */ + WARN_ON_ONCE(pgio.pg_mirror_count != 1); + + pgm = &pgio.pg_mirrors[0]; + NFS_I(inode)->read_io += pgm->pg_bytes_written; + npages = (pgm->pg_bytes_written + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; nfs_add_stats(inode, NFSIOS_READPAGES, npages); read_complete: put_nfs_open_context(desc.ctx); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 2bee165fddcf..ceacfeeb28c2 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -906,7 +906,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr) if (nfs_write_need_commit(hdr)) { memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf)); nfs_mark_request_commit(req, hdr->lseg, &cinfo, - 0); + hdr->pgio_mirror_idx); goto next; } remove_req: @@ -1304,8 +1304,14 @@ EXPORT_SYMBOL_GPL(nfs_pageio_init_write); void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio) { + struct nfs_pgio_mirror *mirror; + pgio->pg_ops = &nfs_pgio_rw_ops; - pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize; + + nfs_pageio_stop_mirroring(pgio); + + mirror = &pgio->pg_mirrors[0]; + mirror->pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize; } EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds); diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 479c566c4ddc..3eb072dbce83 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -58,6 +58,8 @@ struct nfs_pageio_ops { size_t (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); int (*pg_doio)(struct nfs_pageio_descriptor *); + unsigned int (*pg_get_mirror_count)(struct nfs_pageio_descriptor *, + struct nfs_page *); void (*pg_cleanup)(struct nfs_pageio_descriptor *); }; @@ -74,15 +76,17 @@ struct nfs_rw_ops { struct rpc_task_setup *, int); }; -struct nfs_pageio_descriptor { +struct nfs_pgio_mirror { struct list_head pg_list; unsigned long pg_bytes_written; size_t pg_count; size_t pg_bsize; unsigned int pg_base; - unsigned char pg_moreio : 1, - pg_recoalesce : 1; + unsigned char pg_recoalesce : 1; +}; +struct nfs_pageio_descriptor { + unsigned char pg_moreio : 1; struct inode *pg_inode; const struct nfs_pageio_ops *pg_ops; const struct nfs_rw_ops *pg_rw_ops; @@ -93,8 +97,18 @@ struct nfs_pageio_descriptor { struct pnfs_layout_segment *pg_lseg; struct nfs_direct_req *pg_dreq; void *pg_layout_private; + unsigned int pg_bsize; /* default bsize for mirrors */ + + u32 pg_mirror_count; + struct nfs_pgio_mirror *pg_mirrors; + struct nfs_pgio_mirror pg_mirrors_static[1]; + struct nfs_pgio_mirror *pg_mirrors_dynamic; + u32 pg_mirror_idx; /* current mirror */ }; +/* arbitrarily selected limit to number of mirrors */ +#define NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX 16 + #define NFS_WBACK_BUSY(req) 
(test_bit(PG_BUSY,&(req)->wb_flags)) extern struct nfs_page *nfs_create_request(struct nfs_open_context *ctx, diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 5bc99f04a550..6400a1e01aa4 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1329,6 +1329,7 @@ struct nfs_pgio_header { struct nfs_page_array page_array; struct nfs_client *ds_clp; /* pNFS data server */ int ds_commit_idx; /* ds index if ds_clp is set */ + int pgio_mirror_idx;/* mirror index in pgio layer */ }; struct nfs_mds_commit_info { -- cgit v1.2.3 From 15eb67c15342d212b0c8a540b6d6bd2dfad52a63 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Mon, 17 Nov 2014 09:30:36 +0800 Subject: nfs41: add range to layoutreturn args So that callers can specify which range to return. Signed-off-by: Peng Tao Signed-off-by: Tom Haynes --- fs/nfs/nfs4xdr.c | 6 +++--- fs/nfs/pnfs.c | 4 +++- include/linux/nfs_xdr.h | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 3c3ff633dd17..56d4c91a48f3 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -2012,11 +2012,11 @@ encode_layoutreturn(struct xdr_stream *xdr, p = reserve_space(xdr, 16); *p++ = cpu_to_be32(0); /* reclaim. always 0 for now */ *p++ = cpu_to_be32(args->layout_type); - *p++ = cpu_to_be32(args->iomode); + *p++ = cpu_to_be32(args->range.iomode); *p = cpu_to_be32(RETURN_FILE); p = reserve_space(xdr, 16); - p = xdr_encode_hyper(p, 0); - p = xdr_encode_hyper(p, NFS4_MAX_UINT64); + p = xdr_encode_hyper(p, args->range.offset); + p = xdr_encode_hyper(p, args->range.length); spin_lock(&args->inode->i_lock); encode_nfs4_stateid(xdr, &args->stateid); spin_unlock(&args->inode->i_lock); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 685af4fb39ca..9549b89e494b 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -916,7 +916,9 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid, lrp->args.stateid = stateid; lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id; lrp->args.inode = ino; - lrp->args.iomode = iomode; + lrp->args.range.iomode = iomode; + lrp->args.range.offset = 0; + lrp->args.range.length = NFS4_MAX_UINT64; lrp->args.layout = lo; lrp->clp = NFS_SERVER(ino)->nfs_client; lrp->cred = lo->plh_lc_cred; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 6400a1e01aa4..363792356d25 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -293,7 +293,7 @@ struct nfs4_layoutreturn_args { struct nfs4_sequence_args seq_args; struct pnfs_layout_hdr *layout; struct inode *inode; - enum pnfs_iomode iomode; + struct pnfs_layout_range range; nfs4_stateid stateid; __u32 layout_type; }; -- cgit v1.2.3 From d67ae825a59d639e4d8b82413af84d854617a87e Mon Sep 17 00:00:00 2001 From: Tom Haynes Date: Thu, 11 Dec 2014 17:02:04 -0500 Subject: pnfs/flexfiles: Add the FlexFile Layout Driver The flexfile layout is a new layout that extends the file layout. 
It is currently being drafted as a specification at https://datatracker.ietf.org/doc/draft-ietf-nfsv4-layout-types/ Signed-off-by: Weston Andros Adamson Signed-off-by: Tom Haynes Signed-off-by: Tao Peng --- fs/nfs/Kconfig | 5 + fs/nfs/Makefile | 1 + fs/nfs/flexfilelayout/Makefile | 5 + fs/nfs/flexfilelayout/flexfilelayout.c | 1574 +++++++++++++++++++++++++++++ fs/nfs/flexfilelayout/flexfilelayout.h | 155 +++ fs/nfs/flexfilelayout/flexfilelayoutdev.c | 552 ++++++++++ fs/nfs/idmap.c | 3 +- fs/nfs/nfs4proc.c | 4 +- fs/nfs/pnfs.c | 32 +- fs/nfs/pnfs.h | 1 + include/linux/nfs4.h | 1 + include/linux/nfs_idmap.h | 2 + include/linux/sunrpc/metrics.h | 2 + 13 files changed, 2325 insertions(+), 12 deletions(-) create mode 100644 fs/nfs/flexfilelayout/Makefile create mode 100644 fs/nfs/flexfilelayout/flexfilelayout.c create mode 100644 fs/nfs/flexfilelayout/flexfilelayout.h create mode 100644 fs/nfs/flexfilelayout/flexfilelayoutdev.c (limited to 'include/linux') diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 3dece03f2fc8..c7abc10279af 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -128,6 +128,11 @@ config PNFS_OBJLAYOUT depends on NFS_V4_1 && SCSI_OSD_ULD default NFS_V4 +config PNFS_FLEXFILE_LAYOUT + tristate + depends on NFS_V4_1 && NFS_V3 + default m + config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN string "NFSv4.1 Implementation ID Domain" depends on NFS_V4_1 diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 23abffa8a4ce..1e987acf20c9 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -33,3 +33,4 @@ nfsv4-$(CONFIG_NFS_V4_2) += nfs42proc.o obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/ obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ +obj-$(CONFIG_PNFS_FLEXFILE_LAYOUT) += flexfilelayout/ diff --git a/fs/nfs/flexfilelayout/Makefile b/fs/nfs/flexfilelayout/Makefile new file mode 100644 index 000000000000..1d2c9f6bbcd4 --- /dev/null +++ b/fs/nfs/flexfilelayout/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the pNFS Flexfile Layout Driver kernel module +# +obj-$(CONFIG_PNFS_FLEXFILE_LAYOUT) += nfs_layout_flexfiles.o +nfs_layout_flexfiles-y := flexfilelayout.o flexfilelayoutdev.o diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c new file mode 100644 index 000000000000..f29fb7d7e8f8 --- /dev/null +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -0,0 +1,1574 @@ +/* + * Module for pnfs flexfile layout driver. + * + * Copyright (c) 2014, Primary Data, Inc. All rights reserved. 
+ * + * Tao Peng + */ + +#include +#include +#include + +#include +#include + +#include "flexfilelayout.h" +#include "../nfs4session.h" +#include "../internal.h" +#include "../delegation.h" +#include "../nfs4trace.h" +#include "../iostat.h" +#include "../nfs.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +#define FF_LAYOUT_POLL_RETRY_MAX (15*HZ) + +static struct pnfs_layout_hdr * +ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) +{ + struct nfs4_flexfile_layout *ffl; + + ffl = kzalloc(sizeof(*ffl), gfp_flags); + if (ffl) { + INIT_LIST_HEAD(&ffl->error_list); + return &ffl->generic_hdr; + } else + return NULL; +} + +static void +ff_layout_free_layout_hdr(struct pnfs_layout_hdr *lo) +{ + struct nfs4_ff_layout_ds_err *err, *n; + + list_for_each_entry_safe(err, n, &FF_LAYOUT_FROM_HDR(lo)->error_list, + list) { + list_del(&err->list); + kfree(err); + } + kfree(FF_LAYOUT_FROM_HDR(lo)); +} + +static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, NFS4_STATEID_SIZE); + if (unlikely(p == NULL)) + return -ENOBUFS; + memcpy(stateid, p, NFS4_STATEID_SIZE); + dprintk("%s: stateid id= [%x%x%x%x]\n", __func__, + p[0], p[1], p[2], p[3]); + return 0; +} + +static int decode_deviceid(struct xdr_stream *xdr, struct nfs4_deviceid *devid) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, NFS4_DEVICEID4_SIZE); + if (unlikely(!p)) + return -ENOBUFS; + memcpy(devid, p, NFS4_DEVICEID4_SIZE); + nfs4_print_deviceid(devid); + return 0; +} + +static int decode_nfs_fh(struct xdr_stream *xdr, struct nfs_fh *fh) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + return -ENOBUFS; + fh->size = be32_to_cpup(p++); + if (fh->size > sizeof(struct nfs_fh)) { + printk(KERN_ERR "NFS flexfiles: Too big fh received %d\n", + fh->size); + return -EOVERFLOW; + } + /* fh.data */ + p = xdr_inline_decode(xdr, fh->size); + if (unlikely(!p)) + return -ENOBUFS; + memcpy(&fh->data, p, fh->size); + dprintk("%s: fh len %d\n", __func__, fh->size); + + return 0; +} + +/* + * Currently only stringified uids and gids are accepted. + * I.e., kerberos is not supported to the DSes, so no principals. + * + * That means that one common function will suffice, but when + * principals are added, this should be split to accommodate + * calls to both nfs_map_name_to_uid() and nfs_map_group_to_gid().
+ */ +static int +decode_name(struct xdr_stream *xdr, u32 *id) +{ + __be32 *p; + int len; + + /* opaque_length(4)*/ + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + return -ENOBUFS; + len = be32_to_cpup(p++); + if (len < 0) + return -EINVAL; + + dprintk("%s: len %u\n", __func__, len); + + /* opaque body */ + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + return -ENOBUFS; + + if (!nfs_map_string_to_numeric((char *)p, len, id)) + return -EINVAL; + + return 0; +} + +static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls) +{ + int i; + + if (fls->mirror_array) { + for (i = 0; i < fls->mirror_array_cnt; i++) { + /* normally mirror_ds is freed in + * .free_deviceid_node but we still do it here + * for .alloc_lseg error path */ + if (fls->mirror_array[i]) { + kfree(fls->mirror_array[i]->fh_versions); + nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds); + kfree(fls->mirror_array[i]); + } + } + kfree(fls->mirror_array); + fls->mirror_array = NULL; + } +} + +static int ff_layout_check_layout(struct nfs4_layoutget_res *lgr) +{ + int ret = 0; + + dprintk("--> %s\n", __func__); + + /* FIXME: remove this check when layout segment support is added */ + if (lgr->range.offset != 0 || + lgr->range.length != NFS4_MAX_UINT64) { + dprintk("%s Only whole file layouts supported. Use MDS i/o\n", + __func__); + ret = -EINVAL; + } + + dprintk("--> %s returns %d\n", __func__, ret); + return ret; +} + +static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls) +{ + if (fls) { + ff_layout_free_mirror_array(fls); + kfree(fls); + } +} + +static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls) +{ + struct nfs4_ff_layout_mirror *tmp; + int i, j; + + for (i = 0; i < fls->mirror_array_cnt - 1; i++) { + for (j = i + 1; j < fls->mirror_array_cnt; j++) + if (fls->mirror_array[i]->efficiency < + fls->mirror_array[j]->efficiency) { + tmp = fls->mirror_array[i]; + fls->mirror_array[i] = fls->mirror_array[j]; + fls->mirror_array[j] = tmp; + } + } +} + +static struct pnfs_layout_segment * +ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, + struct nfs4_layoutget_res *lgr, + gfp_t gfp_flags) +{ + struct pnfs_layout_segment *ret; + struct nfs4_ff_layout_segment *fls = NULL; + struct xdr_stream stream; + struct xdr_buf buf; + struct page *scratch; + u64 stripe_unit; + u32 mirror_array_cnt; + __be32 *p; + int i, rc; + + dprintk("--> %s\n", __func__); + scratch = alloc_page(gfp_flags); + if (!scratch) + return ERR_PTR(-ENOMEM); + + xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, + lgr->layoutp->len); + xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + + /* stripe unit and mirror_array_cnt */ + rc = -EIO; + p = xdr_inline_decode(&stream, 8 + 4); + if (!p) + goto out_err_free; + + p = xdr_decode_hyper(p, &stripe_unit); + mirror_array_cnt = be32_to_cpup(p++); + dprintk("%s: stripe_unit=%llu mirror_array_cnt=%u\n", __func__, + stripe_unit, mirror_array_cnt); + + if (mirror_array_cnt > NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT || + mirror_array_cnt == 0) + goto out_err_free; + + rc = -ENOMEM; + fls = kzalloc(sizeof(*fls), gfp_flags); + if (!fls) + goto out_err_free; + + fls->mirror_array_cnt = mirror_array_cnt; + fls->stripe_unit = stripe_unit; + fls->mirror_array = kcalloc(fls->mirror_array_cnt, + sizeof(fls->mirror_array[0]), gfp_flags); + if (fls->mirror_array == NULL) + goto out_err_free; + + for (i = 0; i < fls->mirror_array_cnt; i++) { + struct nfs4_deviceid devid; + struct nfs4_deviceid_node *idnode; + u32 ds_count; + u32 fh_count; + int j; + + 
rc = -EIO; + p = xdr_inline_decode(&stream, 4); + if (!p) + goto out_err_free; + ds_count = be32_to_cpup(p); + + /* FIXME: allow for striping? */ + if (ds_count != 1) + goto out_err_free; + + fls->mirror_array[i] = + kzalloc(sizeof(struct nfs4_ff_layout_mirror), + gfp_flags); + if (fls->mirror_array[i] == NULL) { + rc = -ENOMEM; + goto out_err_free; + } + + spin_lock_init(&fls->mirror_array[i]->lock); + fls->mirror_array[i]->ds_count = ds_count; + + /* deviceid */ + rc = decode_deviceid(&stream, &devid); + if (rc) + goto out_err_free; + + idnode = nfs4_find_get_deviceid(NFS_SERVER(lh->plh_inode), + &devid, lh->plh_lc_cred, + gfp_flags); + /* + * upon success, mirror_ds is allocated by previous + * getdeviceinfo, or newly by .alloc_deviceid_node + * nfs4_find_get_deviceid failure is indeed a getdeviceinfo failure + */ + if (idnode) + fls->mirror_array[i]->mirror_ds = + FF_LAYOUT_MIRROR_DS(idnode); + else + goto out_err_free; + + /* efficiency */ + rc = -EIO; + p = xdr_inline_decode(&stream, 4); + if (!p) + goto out_err_free; + fls->mirror_array[i]->efficiency = be32_to_cpup(p); + + /* stateid */ + rc = decode_stateid(&stream, &fls->mirror_array[i]->stateid); + if (rc) + goto out_err_free; + + /* fh */ + p = xdr_inline_decode(&stream, 4); + if (!p) + goto out_err_free; + fh_count = be32_to_cpup(p); + + fls->mirror_array[i]->fh_versions = + kzalloc(fh_count * sizeof(struct nfs_fh), + gfp_flags); + if (fls->mirror_array[i]->fh_versions == NULL) { + rc = -ENOMEM; + goto out_err_free; + } + + for (j = 0; j < fh_count; j++) { + rc = decode_nfs_fh(&stream, + &fls->mirror_array[i]->fh_versions[j]); + if (rc) + goto out_err_free; + } + + fls->mirror_array[i]->fh_versions_cnt = fh_count; + + /* user */ + rc = decode_name(&stream, &fls->mirror_array[i]->uid); + if (rc) + goto out_err_free; + + /* group */ + rc = decode_name(&stream, &fls->mirror_array[i]->gid); + if (rc) + goto out_err_free; + + dprintk("%s: uid %d gid %d\n", __func__, + fls->mirror_array[i]->uid, + fls->mirror_array[i]->gid); + } + + ff_layout_sort_mirrors(fls); + rc = ff_layout_check_layout(lgr); + if (rc) + goto out_err_free; + + ret = &fls->generic_hdr; + dprintk("<-- %s (success)\n", __func__); +out_free_page: + __free_page(scratch); + return ret; +out_err_free: + _ff_layout_free_lseg(fls); + ret = ERR_PTR(rc); + dprintk("<-- %s (%d)\n", __func__, rc); + goto out_free_page; +} + +static bool ff_layout_has_rw_segments(struct pnfs_layout_hdr *layout) +{ + struct pnfs_layout_segment *lseg; + + list_for_each_entry(lseg, &layout->plh_segs, pls_list) + if (lseg->pls_range.iomode == IOMODE_RW) + return true; + + return false; +} + +static void +ff_layout_free_lseg(struct pnfs_layout_segment *lseg) +{ + struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg); + int i; + + dprintk("--> %s\n", __func__); + + for (i = 0; i < fls->mirror_array_cnt; i++) { + if (fls->mirror_array[i]) { + nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds); + fls->mirror_array[i]->mirror_ds = NULL; + if (fls->mirror_array[i]->cred) { + put_rpccred(fls->mirror_array[i]->cred); + fls->mirror_array[i]->cred = NULL; + } + } + } + + if (lseg->pls_range.iomode == IOMODE_RW) { + struct nfs4_flexfile_layout *ffl; + struct inode *inode; + + ffl = FF_LAYOUT_FROM_HDR(lseg->pls_layout); + inode = ffl->generic_hdr.plh_inode; + spin_lock(&inode->i_lock); + if (!ff_layout_has_rw_segments(lseg->pls_layout)) { + ffl->commit_info.nbuckets = 0; + kfree(ffl->commit_info.buckets); + ffl->commit_info.buckets = NULL; + } + spin_unlock(&inode->i_lock); + } +
_ff_layout_free_lseg(fls); +} + +/* Return 1 until we have multiple lsegs support */ +static int +ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls) +{ + return 1; +} + +static int +ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo, + gfp_t gfp_flags) +{ + struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg); + struct pnfs_commit_bucket *buckets; + int size; + + if (cinfo->ds->nbuckets != 0) { + /* This assumes there is only one RW lseg per file. + * To support multiple lseg per file, we need to + * change struct pnfs_commit_bucket to allow dynamic + * increasing nbuckets. + */ + return 0; + } + + size = ff_layout_get_lseg_count(fls) * FF_LAYOUT_MIRROR_COUNT(lseg); + + buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket), + gfp_flags); + if (!buckets) + return -ENOMEM; + else { + int i; + + spin_lock(cinfo->lock); + if (cinfo->ds->nbuckets != 0) + kfree(buckets); + else { + cinfo->ds->buckets = buckets; + cinfo->ds->nbuckets = size; + for (i = 0; i < size; i++) { + INIT_LIST_HEAD(&buckets[i].written); + INIT_LIST_HEAD(&buckets[i].committing); + /* mark direct verifier as unset */ + buckets[i].direct_verf.committed = + NFS_INVALID_STABLE_HOW; + } + } + spin_unlock(cinfo->lock); + return 0; + } +} + +static struct nfs4_pnfs_ds * +ff_layout_choose_best_ds_for_read(struct nfs_pageio_descriptor *pgio, + int *best_idx) +{ + struct nfs4_ff_layout_segment *fls; + struct nfs4_pnfs_ds *ds; + int idx; + + fls = FF_LAYOUT_LSEG(pgio->pg_lseg); + /* mirrors are sorted by efficiency */ + for (idx = 0; idx < fls->mirror_array_cnt; idx++) { + ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, idx, false); + if (ds) { + *best_idx = idx; + return ds; + } + } + + return NULL; +} + +static void +ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req) +{ + struct nfs_pgio_mirror *pgm; + struct nfs4_ff_layout_mirror *mirror; + struct nfs4_pnfs_ds *ds; + int ds_idx; + + /* Use full layout for now */ + if (!pgio->pg_lseg) + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + req->wb_context, + 0, + NFS4_MAX_UINT64, + IOMODE_READ, + GFP_KERNEL); + /* If no lseg, fall back to read through mds */ + if (pgio->pg_lseg == NULL) + goto out_mds; + + ds = ff_layout_choose_best_ds_for_read(pgio, &ds_idx); + if (!ds) + goto out_mds; + mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx); + + pgio->pg_mirror_idx = ds_idx; + + /* read always uses only one mirror - idx 0 for pgio layer */ + pgm = &pgio->pg_mirrors[0]; + pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize; + + return; +out_mds: + pnfs_put_lseg(pgio->pg_lseg); + pgio->pg_lseg = NULL; + nfs_pageio_reset_read_mds(pgio); +} + +static void +ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req) +{ + struct nfs4_ff_layout_mirror *mirror; + struct nfs_pgio_mirror *pgm; + struct nfs_commit_info cinfo; + struct nfs4_pnfs_ds *ds; + int i; + int status; + + if (!pgio->pg_lseg) + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + req->wb_context, + 0, + NFS4_MAX_UINT64, + IOMODE_RW, + GFP_NOFS); + /* If no lseg, fall back to write through mds */ + if (pgio->pg_lseg == NULL) + goto out_mds; + + nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq); + status = ff_layout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS); + if (status < 0) + goto out_mds; + + /* Use a direct mapping of ds_idx to pgio mirror_idx */ + if (WARN_ON_ONCE(pgio->pg_mirror_count != + FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg))) + goto out_mds; + + for (i = 0; i < pgio->pg_mirror_count; i++) { 
+ ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, i, true); + if (!ds) + goto out_mds; + pgm = &pgio->pg_mirrors[i]; + mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i); + pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize; + } + + return; + +out_mds: + pnfs_put_lseg(pgio->pg_lseg); + pgio->pg_lseg = NULL; + nfs_pageio_reset_write_mds(pgio); +} + +static unsigned int +ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req) +{ + if (!pgio->pg_lseg) + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + req->wb_context, + 0, + NFS4_MAX_UINT64, + IOMODE_RW, + GFP_NOFS); + if (pgio->pg_lseg) + return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg); + + /* no lseg means that pnfs is not in use, so no mirroring here */ + pnfs_put_lseg(pgio->pg_lseg); + pgio->pg_lseg = NULL; + nfs_pageio_reset_write_mds(pgio); + return 1; +} + +static const struct nfs_pageio_ops ff_layout_pg_read_ops = { + .pg_init = ff_layout_pg_init_read, + .pg_test = pnfs_generic_pg_test, + .pg_doio = pnfs_generic_pg_readpages, + .pg_cleanup = pnfs_generic_pg_cleanup, +}; + +static const struct nfs_pageio_ops ff_layout_pg_write_ops = { + .pg_init = ff_layout_pg_init_write, + .pg_test = pnfs_generic_pg_test, + .pg_doio = pnfs_generic_pg_writepages, + .pg_get_mirror_count = ff_layout_pg_get_mirror_count_write, + .pg_cleanup = pnfs_generic_pg_cleanup, +}; + +static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs) +{ + struct rpc_task *task = &hdr->task; + + pnfs_layoutcommit_inode(hdr->inode, false); + + if (retry_pnfs) { + dprintk("%s Reset task %5u for i/o through pNFS " + "(req %s/%llu, %u bytes @ offset %llu)\n", __func__, + hdr->task.tk_pid, + hdr->inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(hdr->inode), + hdr->args.count, + (unsigned long long)hdr->args.offset); + + if (!hdr->dreq) { + struct nfs_open_context *ctx; + + ctx = nfs_list_entry(hdr->pages.next)->wb_context; + set_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags); + hdr->completion_ops->error_cleanup(&hdr->pages); + } else { + nfs_direct_set_resched_writes(hdr->dreq); + /* fake unstable write to let common nfs resend pages */ + hdr->verf.committed = NFS_UNSTABLE; + hdr->good_bytes = 0; + } + return; + } + + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { + dprintk("%s Reset task %5u for i/o through MDS " + "(req %s/%llu, %u bytes @ offset %llu)\n", __func__, + hdr->task.tk_pid, + hdr->inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(hdr->inode), + hdr->args.count, + (unsigned long long)hdr->args.offset); + + task->tk_status = pnfs_write_done_resend_to_mds(hdr); + } +} + +static void ff_layout_reset_read(struct nfs_pgio_header *hdr) +{ + struct rpc_task *task = &hdr->task; + + pnfs_layoutcommit_inode(hdr->inode, false); + + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { + dprintk("%s Reset task %5u for i/o through MDS " + "(req %s/%llu, %u bytes @ offset %llu)\n", __func__, + hdr->task.tk_pid, + hdr->inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(hdr->inode), + hdr->args.count, + (unsigned long long)hdr->args.offset); + + task->tk_status = pnfs_read_done_resend_to_mds(hdr); + } +} + +static int ff_layout_async_handle_error_v4(struct rpc_task *task, + struct nfs4_state *state, + struct nfs_client *clp, + struct pnfs_layout_segment *lseg, + int idx) +{ + struct pnfs_layout_hdr *lo = lseg->pls_layout; + struct inode *inode = lo->plh_inode; + struct nfs_server *mds_server = NFS_SERVER(inode); + + struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); + struct nfs_client *mds_client = 
mds_server->nfs_client; + struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table; + + if (task->tk_status >= 0) + return 0; + + switch (task->tk_status) { + /* MDS state errors */ + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: + if (state == NULL) + break; + nfs_remove_bad_delegation(state->inode); + case -NFS4ERR_OPENMODE: + if (state == NULL) + break; + if (nfs4_schedule_stateid_recovery(mds_server, state) < 0) + goto out_bad_stateid; + goto wait_on_recovery; + case -NFS4ERR_EXPIRED: + if (state != NULL) { + if (nfs4_schedule_stateid_recovery(mds_server, state) < 0) + goto out_bad_stateid; + } + nfs4_schedule_lease_recovery(mds_client); + goto wait_on_recovery; + /* DS session errors */ + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_DEADSESSION: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_SEQ_FALSE_RETRY: + case -NFS4ERR_SEQ_MISORDERED: + dprintk("%s ERROR %d, Reset session. Exchangeid " + "flags 0x%x\n", __func__, task->tk_status, + clp->cl_exchange_flags); + nfs4_schedule_session_recovery(clp->cl_session, task->tk_status); + break; + case -NFS4ERR_DELAY: + case -NFS4ERR_GRACE: + rpc_delay(task, FF_LAYOUT_POLL_RETRY_MAX); + break; + case -NFS4ERR_RETRY_UNCACHED_REP: + break; + /* Invalidate Layout errors */ + case -NFS4ERR_PNFS_NO_LAYOUT: + case -ESTALE: /* mapped NFS4ERR_STALE */ + case -EBADHANDLE: /* mapped NFS4ERR_BADHANDLE */ + case -EISDIR: /* mapped NFS4ERR_ISDIR */ + case -NFS4ERR_FHEXPIRED: + case -NFS4ERR_WRONG_TYPE: + dprintk("%s Invalid layout error %d\n", __func__, + task->tk_status); + /* + * Destroy layout so new i/o will get a new layout. + * Layout will not be destroyed until all current lseg + * references are put. Mark layout as invalid to resend failed + * i/o and all i/o waiting on the slot table to the MDS until + * layout is destroyed and a new valid layout is obtained. + */ + pnfs_destroy_layout(NFS_I(inode)); + rpc_wake_up(&tbl->slot_tbl_waitq); + goto reset; + /* RPC connection errors */ + case -ECONNREFUSED: + case -EHOSTDOWN: + case -EHOSTUNREACH: + case -ENETUNREACH: + case -EIO: + case -ETIMEDOUT: + case -EPIPE: + dprintk("%s DS connection error %d\n", __func__, + task->tk_status); + nfs4_mark_deviceid_unavailable(devid); + rpc_wake_up(&tbl->slot_tbl_waitq); + /* fall through */ + default: + if (ff_layout_has_available_ds(lseg)) + return -NFS4ERR_RESET_TO_PNFS; +reset: + dprintk("%s Retry through MDS. 
Error %d\n", __func__, + task->tk_status); + return -NFS4ERR_RESET_TO_MDS; + } +out: + task->tk_status = 0; + return -EAGAIN; +out_bad_stateid: + task->tk_status = -EIO; + return 0; +wait_on_recovery: + rpc_sleep_on(&mds_client->cl_rpcwaitq, task, NULL); + if (test_bit(NFS4CLNT_MANAGER_RUNNING, &mds_client->cl_state) == 0) + rpc_wake_up_queued_task(&mds_client->cl_rpcwaitq, task); + goto out; +} + +/* Retry all errors through either pNFS or MDS except for -EJUKEBOX */ +static int ff_layout_async_handle_error_v3(struct rpc_task *task, + struct pnfs_layout_segment *lseg, + int idx) +{ + struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); + + if (task->tk_status >= 0) + return 0; + + if (task->tk_status != -EJUKEBOX) { + dprintk("%s DS connection error %d\n", __func__, + task->tk_status); + nfs4_mark_deviceid_unavailable(devid); + if (ff_layout_has_available_ds(lseg)) + return -NFS4ERR_RESET_TO_PNFS; + else + return -NFS4ERR_RESET_TO_MDS; + } + + if (task->tk_status == -EJUKEBOX) + nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY); + task->tk_status = 0; + rpc_restart_call(task); + rpc_delay(task, NFS_JUKEBOX_RETRY_TIME); + return -EAGAIN; +} + +static int ff_layout_async_handle_error(struct rpc_task *task, + struct nfs4_state *state, + struct nfs_client *clp, + struct pnfs_layout_segment *lseg, + int idx) +{ + int vers = clp->cl_nfs_mod->rpc_vers->number; + + switch (vers) { + case 3: + return ff_layout_async_handle_error_v3(task, lseg, idx); + case 4: + return ff_layout_async_handle_error_v4(task, state, clp, + lseg, idx); + default: + /* should never happen */ + WARN_ON_ONCE(1); + return 0; + } +} + +static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, + int idx, u64 offset, u64 length, + u32 status, int opnum) +{ + struct nfs4_ff_layout_mirror *mirror; + int err; + + mirror = FF_LAYOUT_COMP(lseg, idx); + err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), + mirror, offset, length, status, opnum, + GFP_NOIO); + dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status); +} + +/* NFS_PROTO call done callback routines */ + +static int ff_layout_read_done_cb(struct rpc_task *task, + struct nfs_pgio_header *hdr) +{ + struct inode *inode; + int err; + + trace_nfs4_pnfs_read(hdr, task->tk_status); + if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status) + hdr->res.op_status = NFS4ERR_NXIO; + if (task->tk_status < 0 && hdr->res.op_status) + ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx, + hdr->args.offset, hdr->args.count, + hdr->res.op_status, OP_READ); + err = ff_layout_async_handle_error(task, hdr->args.context->state, + hdr->ds_clp, hdr->lseg, + hdr->pgio_mirror_idx); + + switch (err) { + case -NFS4ERR_RESET_TO_PNFS: + set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, + &hdr->lseg->pls_layout->plh_flags); + pnfs_read_resend_pnfs(hdr); + return task->tk_status; + case -NFS4ERR_RESET_TO_MDS: + inode = hdr->lseg->pls_layout->plh_inode; + pnfs_error_mark_layout_for_return(inode, hdr->lseg); + ff_layout_reset_read(hdr); + return task->tk_status; + case -EAGAIN: + rpc_restart_call_prepare(task); + return -EAGAIN; + } + + return 0; +} + +/* + * We reference the rpc_cred of the first WRITE that triggers the need for + * a LAYOUTCOMMIT, and use it to send the layoutcommit compound. + * rfc5661 is not clear about which credential should be used. 
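+ *
+ * In other words: if writers holding different credentials share a
+ * layout, whichever WRITE first marks the inode for layoutcommit
+ * donates its rpc_cred, and later WRITEs do not replace it (an
+ * illustrative note, relying on pnfs_set_layoutcommit() capturing
+ * the cred only once).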
+ * + * Flexlayout client should treat DS replied FILE_SYNC as DATA_SYNC, so + * to follow http://www.rfc-editor.org/errata_search.php?rfc=5661&eid=2751 + * we always send layoutcommit after DS writes. + */ +static void +ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr) +{ + pnfs_set_layoutcommit(hdr); + dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, + (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); +} + +static bool +ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx) +{ + /* No mirroring for now */ + struct nfs4_deviceid_node *node = FF_LAYOUT_DEVID_NODE(lseg, idx); + + return ff_layout_test_devid_unavailable(node); +} + +static int ff_layout_read_prepare_common(struct rpc_task *task, + struct nfs_pgio_header *hdr) +{ + if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) { + rpc_exit(task, -EIO); + return -EIO; + } + if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) { + dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid); + if (ff_layout_has_available_ds(hdr->lseg)) + pnfs_read_resend_pnfs(hdr); + else + ff_layout_reset_read(hdr); + rpc_exit(task, 0); + return -EAGAIN; + } + hdr->pgio_done_cb = ff_layout_read_done_cb; + + return 0; +} + +/* + * Call ops for the async read/write cases + * In the case of dense layouts, the offset needs to be reset to its + * original value. + */ +static void ff_layout_read_prepare_v3(struct rpc_task *task, void *data) +{ + struct nfs_pgio_header *hdr = data; + + if (ff_layout_read_prepare_common(task, hdr)) + return; + + rpc_call_start(task); +} + +static int ff_layout_setup_sequence(struct nfs_client *ds_clp, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + struct rpc_task *task) +{ + if (ds_clp->cl_session) + return nfs41_setup_sequence(ds_clp->cl_session, + args, + res, + task); + return nfs40_setup_sequence(ds_clp->cl_slot_tbl, + args, + res, + task); +} + +static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data) +{ + struct nfs_pgio_header *hdr = data; + + if (ff_layout_read_prepare_common(task, hdr)) + return; + + if (ff_layout_setup_sequence(hdr->ds_clp, + &hdr->args.seq_args, + &hdr->res.seq_res, + task)) + return; + + if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context, + hdr->args.lock_context, FMODE_READ) == -EIO) + rpc_exit(task, -EIO); /* lost lock, terminate I/O */ +} + +static void ff_layout_read_call_done(struct rpc_task *task, void *data) +{ + struct nfs_pgio_header *hdr = data; + + dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status); + + if (test_bit(NFS_IOHDR_REDO, &hdr->flags) && + task->tk_status == 0) { + nfs4_sequence_done(task, &hdr->res.seq_res); + return; + } + + /* Note this may cause RPC to be resent */ + hdr->mds_ops->rpc_call_done(task, hdr); +} + +static void ff_layout_read_count_stats(struct rpc_task *task, void *data) +{ + struct nfs_pgio_header *hdr = data; + + rpc_count_iostats_metrics(task, + &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_READ]); +} + +static int ff_layout_write_done_cb(struct rpc_task *task, + struct nfs_pgio_header *hdr) +{ + struct inode *inode; + int err; + + trace_nfs4_pnfs_write(hdr, task->tk_status); + if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status) + hdr->res.op_status = NFS4ERR_NXIO; + if (task->tk_status < 0 && hdr->res.op_status) + ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx, + hdr->args.offset, hdr->args.count, + hdr->res.op_status, OP_WRITE); + err = ff_layout_async_handle_error(task, hdr->args.context->state, + 
hdr->ds_clp, hdr->lseg, + hdr->pgio_mirror_idx); + + switch (err) { + case -NFS4ERR_RESET_TO_PNFS: + case -NFS4ERR_RESET_TO_MDS: + inode = hdr->lseg->pls_layout->plh_inode; + pnfs_error_mark_layout_for_return(inode, hdr->lseg); + if (err == -NFS4ERR_RESET_TO_PNFS) { + pnfs_set_retry_layoutget(hdr->lseg->pls_layout); + ff_layout_reset_write(hdr, true); + } else { + pnfs_clear_retry_layoutget(hdr->lseg->pls_layout); + ff_layout_reset_write(hdr, false); + } + return task->tk_status; + case -EAGAIN: + rpc_restart_call_prepare(task); + return -EAGAIN; + } + + if (hdr->res.verf->committed == NFS_FILE_SYNC || + hdr->res.verf->committed == NFS_DATA_SYNC) + ff_layout_set_layoutcommit(hdr); + + return 0; +} + +static int ff_layout_commit_done_cb(struct rpc_task *task, + struct nfs_commit_data *data) +{ + struct inode *inode; + int err; + + trace_nfs4_pnfs_commit_ds(data, task->tk_status); + if (task->tk_status == -ETIMEDOUT && !data->res.op_status) + data->res.op_status = NFS4ERR_NXIO; + if (task->tk_status < 0 && data->res.op_status) + ff_layout_io_track_ds_error(data->lseg, data->ds_commit_index, + data->args.offset, data->args.count, + data->res.op_status, OP_COMMIT); + err = ff_layout_async_handle_error(task, NULL, data->ds_clp, + data->lseg, data->ds_commit_index); + + switch (err) { + case -NFS4ERR_RESET_TO_PNFS: + case -NFS4ERR_RESET_TO_MDS: + inode = data->lseg->pls_layout->plh_inode; + pnfs_error_mark_layout_for_return(inode, data->lseg); + if (err == -NFS4ERR_RESET_TO_PNFS) + pnfs_set_retry_layoutget(data->lseg->pls_layout); + else + pnfs_clear_retry_layoutget(data->lseg->pls_layout); + pnfs_generic_prepare_to_resend_writes(data); + return -EAGAIN; + case -EAGAIN: + rpc_restart_call_prepare(task); + return -EAGAIN; + } + + if (data->verf.committed == NFS_UNSTABLE) + pnfs_commit_set_layoutcommit(data); + + return 0; +} + +static int ff_layout_write_prepare_common(struct rpc_task *task, + struct nfs_pgio_header *hdr) +{ + if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) { + rpc_exit(task, -EIO); + return -EIO; + } + + if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) { + bool retry_pnfs; + + retry_pnfs = ff_layout_has_available_ds(hdr->lseg); + dprintk("%s task %u reset io to %s\n", __func__, + task->tk_pid, retry_pnfs ? 
"pNFS" : "MDS"); + ff_layout_reset_write(hdr, retry_pnfs); + rpc_exit(task, 0); + return -EAGAIN; + } + + return 0; +} + +static void ff_layout_write_prepare_v3(struct rpc_task *task, void *data) +{ + struct nfs_pgio_header *hdr = data; + + if (ff_layout_write_prepare_common(task, hdr)) + return; + + rpc_call_start(task); +} + +static void ff_layout_write_prepare_v4(struct rpc_task *task, void *data) +{ + struct nfs_pgio_header *hdr = data; + + if (ff_layout_write_prepare_common(task, hdr)) + return; + + if (ff_layout_setup_sequence(hdr->ds_clp, + &hdr->args.seq_args, + &hdr->res.seq_res, + task)) + return; + + if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context, + hdr->args.lock_context, FMODE_WRITE) == -EIO) + rpc_exit(task, -EIO); /* lost lock, terminate I/O */ +} + +static void ff_layout_write_call_done(struct rpc_task *task, void *data) +{ + struct nfs_pgio_header *hdr = data; + + if (test_bit(NFS_IOHDR_REDO, &hdr->flags) && + task->tk_status == 0) { + nfs4_sequence_done(task, &hdr->res.seq_res); + return; + } + + /* Note this may cause RPC to be resent */ + hdr->mds_ops->rpc_call_done(task, hdr); +} + +static void ff_layout_write_count_stats(struct rpc_task *task, void *data) +{ + struct nfs_pgio_header *hdr = data; + + rpc_count_iostats_metrics(task, + &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_WRITE]); +} + +static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data) +{ + rpc_call_start(task); +} + +static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data) +{ + struct nfs_commit_data *wdata = data; + + ff_layout_setup_sequence(wdata->ds_clp, + &wdata->args.seq_args, + &wdata->res.seq_res, + task); +} + +static void ff_layout_commit_count_stats(struct rpc_task *task, void *data) +{ + struct nfs_commit_data *cdata = data; + + rpc_count_iostats_metrics(task, + &NFS_CLIENT(cdata->inode)->cl_metrics[NFSPROC4_CLNT_COMMIT]); +} + +static const struct rpc_call_ops ff_layout_read_call_ops_v3 = { + .rpc_call_prepare = ff_layout_read_prepare_v3, + .rpc_call_done = ff_layout_read_call_done, + .rpc_count_stats = ff_layout_read_count_stats, + .rpc_release = pnfs_generic_rw_release, +}; + +static const struct rpc_call_ops ff_layout_read_call_ops_v4 = { + .rpc_call_prepare = ff_layout_read_prepare_v4, + .rpc_call_done = ff_layout_read_call_done, + .rpc_count_stats = ff_layout_read_count_stats, + .rpc_release = pnfs_generic_rw_release, +}; + +static const struct rpc_call_ops ff_layout_write_call_ops_v3 = { + .rpc_call_prepare = ff_layout_write_prepare_v3, + .rpc_call_done = ff_layout_write_call_done, + .rpc_count_stats = ff_layout_write_count_stats, + .rpc_release = pnfs_generic_rw_release, +}; + +static const struct rpc_call_ops ff_layout_write_call_ops_v4 = { + .rpc_call_prepare = ff_layout_write_prepare_v4, + .rpc_call_done = ff_layout_write_call_done, + .rpc_count_stats = ff_layout_write_count_stats, + .rpc_release = pnfs_generic_rw_release, +}; + +static const struct rpc_call_ops ff_layout_commit_call_ops_v3 = { + .rpc_call_prepare = ff_layout_commit_prepare_v3, + .rpc_call_done = pnfs_generic_write_commit_done, + .rpc_count_stats = ff_layout_commit_count_stats, + .rpc_release = pnfs_generic_commit_release, +}; + +static const struct rpc_call_ops ff_layout_commit_call_ops_v4 = { + .rpc_call_prepare = ff_layout_commit_prepare_v4, + .rpc_call_done = pnfs_generic_write_commit_done, + .rpc_count_stats = ff_layout_commit_count_stats, + .rpc_release = pnfs_generic_commit_release, +}; + +static enum pnfs_try_status +ff_layout_read_pagelist(struct 
nfs_pgio_header *hdr) +{ + struct pnfs_layout_segment *lseg = hdr->lseg; + struct nfs4_pnfs_ds *ds; + struct rpc_clnt *ds_clnt; + struct rpc_cred *ds_cred; + loff_t offset = hdr->args.offset; + u32 idx = hdr->pgio_mirror_idx; + int vers; + struct nfs_fh *fh; + + dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n", + __func__, hdr->inode->i_ino, + hdr->args.pgbase, (size_t)hdr->args.count, offset); + + ds = nfs4_ff_layout_prepare_ds(lseg, idx, false); + if (!ds) + goto out_failed; + + ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp, + hdr->inode); + if (IS_ERR(ds_clnt)) + goto out_failed; + + ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred); + if (IS_ERR(ds_cred)) + goto out_failed; + + vers = nfs4_ff_layout_ds_version(lseg, idx); + + dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__, + ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count), vers); + + atomic_inc(&ds->ds_clp->cl_count); + hdr->ds_clp = ds->ds_clp; + fh = nfs4_ff_layout_select_ds_fh(lseg, idx); + if (fh) + hdr->args.fh = fh; + + /* + * Note that if we ever decide to split across DSes, + * then we may need to handle dense-like offsets. + */ + hdr->args.offset = offset; + hdr->mds_offset = offset; + + /* Perform an asynchronous read to ds */ + nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops, + vers == 3 ? &ff_layout_read_call_ops_v3 : + &ff_layout_read_call_ops_v4, + 0, RPC_TASK_SOFTCONN); + + return PNFS_ATTEMPTED; + +out_failed: + if (ff_layout_has_available_ds(lseg)) + return PNFS_TRY_AGAIN; + return PNFS_NOT_ATTEMPTED; +} + +/* Perform async writes. */ +static enum pnfs_try_status +ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) +{ + struct pnfs_layout_segment *lseg = hdr->lseg; + struct nfs4_pnfs_ds *ds; + struct rpc_clnt *ds_clnt; + struct rpc_cred *ds_cred; + loff_t offset = hdr->args.offset; + int vers; + struct nfs_fh *fh; + int idx = hdr->pgio_mirror_idx; + + ds = nfs4_ff_layout_prepare_ds(lseg, idx, true); + if (!ds) + return PNFS_NOT_ATTEMPTED; + + ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp, + hdr->inode); + if (IS_ERR(ds_clnt)) + return PNFS_NOT_ATTEMPTED; + + ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred); + if (IS_ERR(ds_cred)) + return PNFS_NOT_ATTEMPTED; + + vers = nfs4_ff_layout_ds_version(lseg, idx); + + dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s cl_count %d vers %d\n", + __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count, + offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count), + vers); + + hdr->pgio_done_cb = ff_layout_write_done_cb; + atomic_inc(&ds->ds_clp->cl_count); + hdr->ds_clp = ds->ds_clp; + hdr->ds_commit_idx = idx; + fh = nfs4_ff_layout_select_ds_fh(lseg, idx); + if (fh) + hdr->args.fh = fh; + + /* + * Note that if we ever decide to split across DSes, + * then we may need to handle dense-like offsets. + */ + hdr->args.offset = offset; + + /* Perform an asynchronous write */ + nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops, + vers == 3 ? &ff_layout_write_call_ops_v3 : + &ff_layout_write_call_ops_v4, + sync, RPC_TASK_SOFTCONN); + return PNFS_ATTEMPTED; +} + +static void +ff_layout_mark_request_commit(struct nfs_page *req, + struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo, + u32 ds_commit_idx) +{ + struct list_head *list; + struct pnfs_commit_bucket *buckets; + + spin_lock(cinfo->lock); + buckets = cinfo->ds->buckets; + list = &buckets[ds_commit_idx].written; + if (list_empty(list)) { + /* Non-empty buckets hold a reference on the lseg. 
That ref + * is normally transferred to the COMMIT call and released + * there. It could also be released if the last req is pulled + * off due to a rewrite, in which case it will be done in + * pnfs_common_clear_request_commit + */ + WARN_ON_ONCE(buckets[ds_commit_idx].wlseg != NULL); + buckets[ds_commit_idx].wlseg = pnfs_get_lseg(lseg); + } + set_bit(PG_COMMIT_TO_DS, &req->wb_flags); + cinfo->ds->nwritten++; + + /* nfs_request_add_commit_list(). We need to add req to list without + * dropping cinfo lock. + */ + set_bit(PG_CLEAN, &(req)->wb_flags); + nfs_list_add_request(req, list); + cinfo->mds->ncommit++; + spin_unlock(cinfo->lock); + if (!cinfo->dreq) { + inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); + inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info, + BDI_RECLAIMABLE); + __mark_inode_dirty(req->wb_context->dentry->d_inode, + I_DIRTY_DATASYNC); + } +} + +static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i) +{ + return i; +} + +static struct nfs_fh * +select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i) +{ + struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg); + + /* FIXME: Assume that there is only one NFS version available + * for the DS. + */ + return &flseg->mirror_array[i]->fh_versions[0]; +} + +static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) +{ + struct pnfs_layout_segment *lseg = data->lseg; + struct nfs4_pnfs_ds *ds; + struct rpc_clnt *ds_clnt; + struct rpc_cred *ds_cred; + u32 idx; + int vers; + struct nfs_fh *fh; + + idx = calc_ds_index_from_commit(lseg, data->ds_commit_index); + ds = nfs4_ff_layout_prepare_ds(lseg, idx, true); + if (!ds) + goto out_err; + + ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp, + data->inode); + if (IS_ERR(ds_clnt)) + goto out_err; + + ds_cred = ff_layout_get_ds_cred(lseg, idx, data->cred); + if (IS_ERR(ds_cred)) + goto out_err; + + vers = nfs4_ff_layout_ds_version(lseg, idx); + + dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__, + data->inode->i_ino, how, atomic_read(&ds->ds_clp->cl_count), + vers); + data->commit_done_cb = ff_layout_commit_done_cb; + data->cred = ds_cred; + atomic_inc(&ds->ds_clp->cl_count); + data->ds_clp = ds->ds_clp; + fh = select_ds_fh_from_commit(lseg, data->ds_commit_index); + if (fh) + data->args.fh = fh; + return nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops, + vers == 3 ? 
&ff_layout_commit_call_ops_v3 : + &ff_layout_commit_call_ops_v4, + how, RPC_TASK_SOFTCONN); +out_err: + pnfs_generic_prepare_to_resend_writes(data); + pnfs_generic_commit_release(data); + return -EAGAIN; +} + +static int +ff_layout_commit_pagelist(struct inode *inode, struct list_head *mds_pages, + int how, struct nfs_commit_info *cinfo) +{ + return pnfs_generic_commit_pagelist(inode, mds_pages, how, cinfo, + ff_layout_initiate_commit); +} + +static struct pnfs_ds_commit_info * +ff_layout_get_ds_info(struct inode *inode) +{ + struct pnfs_layout_hdr *layout = NFS_I(inode)->layout; + + if (layout == NULL) + return NULL; + + return &FF_LAYOUT_FROM_HDR(layout)->commit_info; +} + +static void +ff_layout_free_deviceid_node(struct nfs4_deviceid_node *d) +{ + nfs4_ff_layout_free_deviceid(container_of(d, struct nfs4_ff_layout_ds, + id_node)); +} + +static int ff_layout_encode_ioerr(struct nfs4_flexfile_layout *flo, + struct xdr_stream *xdr, + const struct nfs4_layoutreturn_args *args) +{ + struct pnfs_layout_hdr *hdr = &flo->generic_hdr; + __be32 *start; + int count = 0, ret = 0; + + start = xdr_reserve_space(xdr, 4); + if (unlikely(!start)) + return -E2BIG; + + /* This assumes we always return _ALL_ layouts */ + spin_lock(&hdr->plh_inode->i_lock); + ret = ff_layout_encode_ds_ioerr(flo, xdr, &count, &args->range); + spin_unlock(&hdr->plh_inode->i_lock); + + *start = cpu_to_be32(count); + + return ret; +} + +/* report nothing for now */ +static void ff_layout_encode_iostats(struct nfs4_flexfile_layout *flo, + struct xdr_stream *xdr, + const struct nfs4_layoutreturn_args *args) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4); + if (likely(p)) + *p = cpu_to_be32(0); +} + +static struct nfs4_deviceid_node * +ff_layout_alloc_deviceid_node(struct nfs_server *server, + struct pnfs_device *pdev, gfp_t gfp_flags) +{ + struct nfs4_ff_layout_ds *dsaddr; + + dsaddr = nfs4_ff_alloc_deviceid_node(server, pdev, gfp_flags); + if (!dsaddr) + return NULL; + return &dsaddr->id_node; +} + +static void +ff_layout_encode_layoutreturn(struct pnfs_layout_hdr *lo, + struct xdr_stream *xdr, + const struct nfs4_layoutreturn_args *args) +{ + struct nfs4_flexfile_layout *flo = FF_LAYOUT_FROM_HDR(lo); + __be32 *start; + + dprintk("%s: Begin\n", __func__); + start = xdr_reserve_space(xdr, 4); + BUG_ON(!start); + + if (ff_layout_encode_ioerr(flo, xdr, args)) + goto out; + + ff_layout_encode_iostats(flo, xdr, args); +out: + *start = cpu_to_be32((xdr->p - start - 1) * 4); + dprintk("%s: Return\n", __func__); +} + +static struct pnfs_layoutdriver_type flexfilelayout_type = { + .id = LAYOUT_FLEX_FILES, + .name = "LAYOUT_FLEX_FILES", + .owner = THIS_MODULE, + .alloc_layout_hdr = ff_layout_alloc_layout_hdr, + .free_layout_hdr = ff_layout_free_layout_hdr, + .alloc_lseg = ff_layout_alloc_lseg, + .free_lseg = ff_layout_free_lseg, + .pg_read_ops = &ff_layout_pg_read_ops, + .pg_write_ops = &ff_layout_pg_write_ops, + .get_ds_info = ff_layout_get_ds_info, + .free_deviceid_node = ff_layout_free_deviceid_node, + .mark_request_commit = ff_layout_mark_request_commit, + .clear_request_commit = pnfs_generic_clear_request_commit, + .scan_commit_lists = pnfs_generic_scan_commit_lists, + .recover_commit_reqs = pnfs_generic_recover_commit_reqs, + .commit_pagelist = ff_layout_commit_pagelist, + .read_pagelist = ff_layout_read_pagelist, + .write_pagelist = ff_layout_write_pagelist, + .alloc_deviceid_node = ff_layout_alloc_deviceid_node, + .encode_layoutreturn = ff_layout_encode_layoutreturn, +}; + +static int __init nfs4flexfilelayout_init(void) +{ +
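+	/* Registration makes this module the handler for layouts of type
+	 * LAYOUT_FLEX_FILES (4); the "nfs-layouttype4-4" alias below lets
+	 * the module be loaded on demand by layout type number.
+	 */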
printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Registering...\n", + __func__); + return pnfs_register_layoutdriver(&flexfilelayout_type); +} + +static void __exit nfs4flexfilelayout_exit(void) +{ + printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Unregistering...\n", + __func__); + pnfs_unregister_layoutdriver(&flexfilelayout_type); +} + +MODULE_ALIAS("nfs-layouttype4-4"); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("The NFSv4 flexfile layout driver"); + +module_init(nfs4flexfilelayout_init); +module_exit(nfs4flexfilelayout_exit); diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h new file mode 100644 index 000000000000..070f20445b2d --- /dev/null +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -0,0 +1,155 @@ +/* + * NFSv4 flexfile layout driver data structures. + * + * Copyright (c) 2014, Primary Data, Inc. All rights reserved. + * + * Tao Peng + */ + +#ifndef FS_NFS_NFS4FLEXFILELAYOUT_H +#define FS_NFS_NFS4FLEXFILELAYOUT_H + +#include "../pnfs.h" + +/* XXX: Let's filter out insanely large mirror count for now to avoid oom + * due to network error etc. */ +#define NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT 4096 + +struct nfs4_ff_ds_version { + u32 version; + u32 minor_version; + u32 rsize; + u32 wsize; + bool tightly_coupled; +}; + +/* chained in global deviceid hlist */ +struct nfs4_ff_layout_ds { + struct nfs4_deviceid_node id_node; + u32 ds_versions_cnt; + struct nfs4_ff_ds_version *ds_versions; + struct nfs4_pnfs_ds *ds; +}; + +struct nfs4_ff_layout_ds_err { + struct list_head list; /* linked in mirror error_list */ + u64 offset; + u64 length; + int status; + enum nfs_opnum4 opnum; + nfs4_stateid stateid; + struct nfs4_deviceid deviceid; +}; + +struct nfs4_ff_layout_mirror { + u32 ds_count; + u32 efficiency; + struct nfs4_ff_layout_ds *mirror_ds; + u32 fh_versions_cnt; + struct nfs_fh *fh_versions; + nfs4_stateid stateid; + struct nfs4_string user_name; + struct nfs4_string group_name; + u32 uid; + u32 gid; + struct rpc_cred *cred; + spinlock_t lock; +}; + +struct nfs4_ff_layout_segment { + struct pnfs_layout_segment generic_hdr; + u64 stripe_unit; + u32 mirror_array_cnt; + struct nfs4_ff_layout_mirror **mirror_array; +}; + +struct nfs4_flexfile_layout { + struct pnfs_layout_hdr generic_hdr; + struct pnfs_ds_commit_info commit_info; + struct list_head error_list; /* nfs4_ff_layout_ds_err */ +}; + +static inline struct nfs4_flexfile_layout * +FF_LAYOUT_FROM_HDR(struct pnfs_layout_hdr *lo) +{ + return container_of(lo, struct nfs4_flexfile_layout, generic_hdr); +} + +static inline struct nfs4_ff_layout_segment * +FF_LAYOUT_LSEG(struct pnfs_layout_segment *lseg) +{ + return container_of(lseg, + struct nfs4_ff_layout_segment, + generic_hdr); +} + +static inline struct nfs4_deviceid_node * +FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx) +{ + if (idx >= FF_LAYOUT_LSEG(lseg)->mirror_array_cnt || + FF_LAYOUT_LSEG(lseg)->mirror_array[idx] == NULL || + FF_LAYOUT_LSEG(lseg)->mirror_array[idx]->mirror_ds == NULL) + return NULL; + return &FF_LAYOUT_LSEG(lseg)->mirror_array[idx]->mirror_ds->id_node; +} + +static inline struct nfs4_ff_layout_ds * +FF_LAYOUT_MIRROR_DS(struct nfs4_deviceid_node *node) +{ + return container_of(node, struct nfs4_ff_layout_ds, id_node); +} + +static inline struct nfs4_ff_layout_mirror * +FF_LAYOUT_COMP(struct pnfs_layout_segment *lseg, u32 idx) +{ + if (idx >= FF_LAYOUT_LSEG(lseg)->mirror_array_cnt) + return NULL; + return FF_LAYOUT_LSEG(lseg)->mirror_array[idx]; +} + +static inline u32 
+FF_LAYOUT_MIRROR_COUNT(struct pnfs_layout_segment *lseg) +{ + return FF_LAYOUT_LSEG(lseg)->mirror_array_cnt; +} + +static inline bool +ff_layout_test_devid_unavailable(struct nfs4_deviceid_node *node) +{ + return nfs4_test_deviceid_unavailable(node); +} + +static inline int +nfs4_ff_layout_ds_version(struct pnfs_layout_segment *lseg, u32 ds_idx) +{ + return FF_LAYOUT_COMP(lseg, ds_idx)->mirror_ds->ds_versions[0].version; +} + +struct nfs4_ff_layout_ds * +nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, + gfp_t gfp_flags); +void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds); +void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds); +int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, + struct nfs4_ff_layout_mirror *mirror, u64 offset, + u64 length, int status, enum nfs_opnum4 opnum, + gfp_t gfp_flags); +int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo, + struct xdr_stream *xdr, int *count, + const struct pnfs_layout_range *range); +struct nfs_fh * +nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx); + +struct nfs4_pnfs_ds * +nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, + bool fail_return); + +struct rpc_clnt * +nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, + u32 ds_idx, + struct nfs_client *ds_clp, + struct inode *inode); +struct rpc_cred *ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, + u32 ds_idx, struct rpc_cred *mdscred); +bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg); +#endif /* FS_NFS_NFS4FLEXFILELAYOUT_H */ diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c new file mode 100644 index 000000000000..3bbb16b3066f --- /dev/null +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -0,0 +1,552 @@ +/* + * Device operations for the pnfs nfs4 file layout driver. + * + * Copyright (c) 2014, Primary Data, Inc. All rights reserved. 
+ * + * Tao Peng + */ + +#include +#include +#include +#include + +#include "../internal.h" +#include "../nfs4session.h" +#include "flexfilelayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO; +static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS; + +void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds) +{ + if (mirror_ds) + nfs4_put_deviceid_node(&mirror_ds->id_node); +} + +void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds) +{ + nfs4_print_deviceid(&mirror_ds->id_node.deviceid); + nfs4_pnfs_ds_put(mirror_ds->ds); + kfree(mirror_ds); +} + +/* Decode opaque device data and construct new_ds using it */ +struct nfs4_ff_layout_ds * +nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, + gfp_t gfp_flags) +{ + struct xdr_stream stream; + struct xdr_buf buf; + struct page *scratch; + struct list_head dsaddrs; + struct nfs4_pnfs_ds_addr *da; + struct nfs4_ff_layout_ds *new_ds = NULL; + struct nfs4_ff_ds_version *ds_versions = NULL; + u32 mp_count; + u32 version_count; + __be32 *p; + int i, ret = -ENOMEM; + + /* set up xdr stream */ + scratch = alloc_page(gfp_flags); + if (!scratch) + goto out_err; + + new_ds = kzalloc(sizeof(struct nfs4_ff_layout_ds), gfp_flags); + if (!new_ds) + goto out_scratch; + + nfs4_init_deviceid_node(&new_ds->id_node, + server, + &pdev->dev_id); + INIT_LIST_HEAD(&dsaddrs); + + xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen); + xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + + /* multipath count */ + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_drain_dsaddrs; + mp_count = be32_to_cpup(p); + dprintk("%s: multipath ds count %d\n", __func__, mp_count); + + for (i = 0; i < mp_count; i++) { + /* multipath ds */ + da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net, + &stream, gfp_flags); + if (da) + list_add_tail(&da->da_node, &dsaddrs); + } + if (list_empty(&dsaddrs)) { + dprintk("%s: no suitable DS addresses found\n", + __func__); + ret = -ENOMEDIUM; + goto out_err_drain_dsaddrs; + } + + /* version count */ + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_drain_dsaddrs; + version_count = be32_to_cpup(p); + dprintk("%s: version count %d\n", __func__, version_count); + + ds_versions = kzalloc(version_count * sizeof(struct nfs4_ff_ds_version), + gfp_flags); + if (!ds_versions) + goto out_scratch; + + for (i = 0; i < version_count; i++) { + /* 20 = version(4) + minor_version(4) + rsize(4) + wsize(4) + + * tightly_coupled(4) */ + p = xdr_inline_decode(&stream, 20); + if (unlikely(!p)) + goto out_err_drain_dsaddrs; + ds_versions[i].version = be32_to_cpup(p++); + ds_versions[i].minor_version = be32_to_cpup(p++); + ds_versions[i].rsize = nfs_block_size(be32_to_cpup(p++), NULL); + ds_versions[i].wsize = nfs_block_size(be32_to_cpup(p++), NULL); + ds_versions[i].tightly_coupled = be32_to_cpup(p); + + if (ds_versions[i].rsize > NFS_MAX_FILE_IO_SIZE) + ds_versions[i].rsize = NFS_MAX_FILE_IO_SIZE; + if (ds_versions[i].wsize > NFS_MAX_FILE_IO_SIZE) + ds_versions[i].wsize = NFS_MAX_FILE_IO_SIZE; + + if (ds_versions[i].version != 3 || ds_versions[i].minor_version != 0) { + dprintk("%s: [%d] unsupported ds version %d-%d\n", __func__, + i, ds_versions[i].version, + ds_versions[i].minor_version); + ret = -EPROTONOSUPPORT; + goto out_err_drain_dsaddrs; + } + + dprintk("%s: [%d] vers %u minor_ver %u rsize %u wsize %u coupled %d\n", + __func__, i, ds_versions[i].version, + 
ds_versions[i].minor_version, + ds_versions[i].rsize, + ds_versions[i].wsize, + ds_versions[i].tightly_coupled); + } + + new_ds->ds_versions = ds_versions; + new_ds->ds_versions_cnt = version_count; + + new_ds->ds = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags); + if (!new_ds->ds) + goto out_err_drain_dsaddrs; + + /* If DS was already in cache, free ds addrs */ + while (!list_empty(&dsaddrs)) { + da = list_first_entry(&dsaddrs, + struct nfs4_pnfs_ds_addr, + da_node); + list_del_init(&da->da_node); + kfree(da->da_remotestr); + kfree(da); + } + + __free_page(scratch); + return new_ds; + +out_err_drain_dsaddrs: + while (!list_empty(&dsaddrs)) { + da = list_first_entry(&dsaddrs, struct nfs4_pnfs_ds_addr, + da_node); + list_del_init(&da->da_node); + kfree(da->da_remotestr); + kfree(da); + } + + kfree(ds_versions); +out_scratch: + __free_page(scratch); +out_err: + kfree(new_ds); + + dprintk("%s ERROR: returning %d\n", __func__, ret); + return NULL; +} + +static u64 +end_offset(u64 start, u64 len) +{ + u64 end; + + end = start + len; + return end >= start ? end : NFS4_MAX_UINT64; +} + +static void extend_ds_error(struct nfs4_ff_layout_ds_err *err, + u64 offset, u64 length) +{ + u64 end; + + end = max_t(u64, end_offset(err->offset, err->length), + end_offset(offset, length)); + err->offset = min_t(u64, err->offset, offset); + err->length = end - err->offset; +} + +static bool ds_error_can_merge(struct nfs4_ff_layout_ds_err *err, u64 offset, + u64 length, int status, enum nfs_opnum4 opnum, + nfs4_stateid *stateid, + struct nfs4_deviceid *deviceid) +{ + return err->status == status && err->opnum == opnum && + nfs4_stateid_match(&err->stateid, stateid) && + !memcmp(&err->deviceid, deviceid, sizeof(*deviceid)) && + end_offset(err->offset, err->length) >= offset && + err->offset <= end_offset(offset, length); +} + +static bool merge_ds_error(struct nfs4_ff_layout_ds_err *old, + struct nfs4_ff_layout_ds_err *new) +{ + if (!ds_error_can_merge(old, new->offset, new->length, new->status, + new->opnum, &new->stateid, &new->deviceid)) + return false; + + extend_ds_error(old, new->offset, new->length); + return true; +} + +static bool +ff_layout_add_ds_error_locked(struct nfs4_flexfile_layout *flo, + struct nfs4_ff_layout_ds_err *dserr) +{ + struct nfs4_ff_layout_ds_err *err; + + list_for_each_entry(err, &flo->error_list, list) { + if (merge_ds_error(err, dserr)) { + return true; + } + } + + list_add(&dserr->list, &flo->error_list); + return false; +} + +static bool +ff_layout_update_ds_error(struct nfs4_flexfile_layout *flo, u64 offset, + u64 length, int status, enum nfs_opnum4 opnum, + nfs4_stateid *stateid, struct nfs4_deviceid *deviceid) +{ + bool found = false; + struct nfs4_ff_layout_ds_err *err; + + list_for_each_entry(err, &flo->error_list, list) { + if (ds_error_can_merge(err, offset, length, status, opnum, + stateid, deviceid)) { + found = true; + extend_ds_error(err, offset, length); + break; + } + } + + return found; +} + +int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, + struct nfs4_ff_layout_mirror *mirror, u64 offset, + u64 length, int status, enum nfs_opnum4 opnum, + gfp_t gfp_flags) +{ + struct nfs4_ff_layout_ds_err *dserr; + bool needfree; + + if (status == 0) + return 0; + + if (mirror->mirror_ds == NULL) + return -EINVAL; + + spin_lock(&flo->generic_hdr.plh_inode->i_lock); + if (ff_layout_update_ds_error(flo, offset, length, status, opnum, + &mirror->stateid, + &mirror->mirror_ds->id_node.deviceid)) { + spin_unlock(&flo->generic_hdr.plh_inode->i_lock); + return 0; + } + 
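+	/*
+	 * No existing record absorbed this error: drop the lock (the
+	 * allocation below may sleep, depending on gfp_flags), allocate
+	 * a new record, then retake the lock and retry the merge via
+	 * ff_layout_add_ds_error_locked(), which returns true when the
+	 * fresh record turned out to be redundant and must be freed.
+	 */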
spin_unlock(&flo->generic_hdr.plh_inode->i_lock); + dserr = kmalloc(sizeof(*dserr), gfp_flags); + if (!dserr) + return -ENOMEM; + + INIT_LIST_HEAD(&dserr->list); + dserr->offset = offset; + dserr->length = length; + dserr->status = status; + dserr->opnum = opnum; + nfs4_stateid_copy(&dserr->stateid, &mirror->stateid); + memcpy(&dserr->deviceid, &mirror->mirror_ds->id_node.deviceid, + NFS4_DEVICEID4_SIZE); + + spin_lock(&flo->generic_hdr.plh_inode->i_lock); + needfree = ff_layout_add_ds_error_locked(flo, dserr); + spin_unlock(&flo->generic_hdr.plh_inode->i_lock); + if (needfree) + kfree(dserr); + + return 0; +} + +/* currently we only support AUTH_NONE and AUTH_SYS */ +static rpc_authflavor_t +nfs4_ff_layout_choose_authflavor(struct nfs4_ff_layout_mirror *mirror) +{ + if (mirror->uid == (u32)-1) + return RPC_AUTH_NULL; + return RPC_AUTH_UNIX; +} + +/* fetch cred for NFSv3 DS */ +static int ff_layout_update_mirror_cred(struct nfs4_ff_layout_mirror *mirror, + struct nfs4_pnfs_ds *ds) +{ + if (ds->ds_clp && !mirror->cred && + mirror->mirror_ds->ds_versions[0].version == 3) { + struct rpc_auth *auth = ds->ds_clp->cl_rpcclient->cl_auth; + struct rpc_cred *cred; + struct auth_cred acred = { + .uid = make_kuid(&init_user_ns, mirror->uid), + .gid = make_kgid(&init_user_ns, mirror->gid), + }; + + /* AUTH_NULL ignores acred */ + cred = auth->au_ops->lookup_cred(auth, &acred, 0); + if (IS_ERR(cred)) { + dprintk("%s: lookup_cred failed with %ld\n", + __func__, PTR_ERR(cred)); + return PTR_ERR(cred); + } else { + mirror->cred = cred; + } + } + return 0; +} + +struct nfs_fh * +nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx) +{ + struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx); + struct nfs_fh *fh = NULL; + struct nfs4_deviceid_node *devid; + + if (mirror == NULL || mirror->mirror_ds == NULL || + mirror->mirror_ds->ds == NULL) { + printk(KERN_ERR "NFS: %s: No data server for mirror offset index %d\n", + __func__, mirror_idx); + if (mirror && mirror->mirror_ds) { + devid = &mirror->mirror_ds->id_node; + pnfs_generic_mark_devid_invalid(devid); + } + goto out; + } + + /* FIXME: For now assume there is only 1 version available for the DS */ + fh = &mirror->fh_versions[0]; +out: + return fh; +} + +/* Upon return, either ds is connected, or ds is NULL */ +struct nfs4_pnfs_ds * +nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, + bool fail_return) +{ + struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx); + struct nfs4_pnfs_ds *ds = NULL; + struct nfs4_deviceid_node *devid; + struct inode *ino = lseg->pls_layout->plh_inode; + struct nfs_server *s = NFS_SERVER(ino); + unsigned int max_payload; + rpc_authflavor_t flavor; + + if (mirror == NULL || mirror->mirror_ds == NULL || + mirror->mirror_ds->ds == NULL) { + printk(KERN_ERR "NFS: %s: No data server for offset index %d\n", + __func__, ds_idx); + if (mirror && mirror->mirror_ds) { + devid = &mirror->mirror_ds->id_node; + pnfs_generic_mark_devid_invalid(devid); + } + goto out; + } + + devid = &mirror->mirror_ds->id_node; + if (ff_layout_test_devid_unavailable(devid)) + goto out; + + ds = mirror->mirror_ds->ds; + /* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */ + smp_rmb(); + if (ds->ds_clp) + goto out; + + flavor = nfs4_ff_layout_choose_authflavor(mirror); + + /* FIXME: For now we assume the server sent only one version of NFS + * to use for the DS. 
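+ * Accordingly, ds_versions[0] supplies the connect parameters below
+ * and is the only entry nfs4_ff_layout_ds_version() reports.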
+ */ + nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo, + dataserver_retrans, + mirror->mirror_ds->ds_versions[0].version, + mirror->mirror_ds->ds_versions[0].minor_version, + flavor); + + /* connect success, check rsize/wsize limit */ + if (ds->ds_clp) { + max_payload = + nfs_block_size(rpc_max_payload(ds->ds_clp->cl_rpcclient), + NULL); + if (mirror->mirror_ds->ds_versions[0].rsize > max_payload) + mirror->mirror_ds->ds_versions[0].rsize = max_payload; + if (mirror->mirror_ds->ds_versions[0].wsize > max_payload) + mirror->mirror_ds->ds_versions[0].wsize = max_payload; + } else { + ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), + mirror, lseg->pls_range.offset, + lseg->pls_range.length, NFS4ERR_NXIO, + OP_ILLEGAL, GFP_NOIO); + if (fail_return) { + pnfs_error_mark_layout_for_return(ino, lseg); + if (ff_layout_has_available_ds(lseg)) + pnfs_set_retry_layoutget(lseg->pls_layout); + else + pnfs_clear_retry_layoutget(lseg->pls_layout); + + } else { + if (ff_layout_has_available_ds(lseg)) + set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, + &lseg->pls_layout->plh_flags); + else { + pnfs_error_mark_layout_for_return(ino, lseg); + pnfs_clear_retry_layoutget(lseg->pls_layout); + } + } + } + + if (ff_layout_update_mirror_cred(mirror, ds)) + ds = NULL; +out: + return ds; +} + +struct rpc_cred * +ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx, + struct rpc_cred *mdscred) +{ + struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx); + struct rpc_cred *cred = ERR_PTR(-EINVAL); + + if (!nfs4_ff_layout_prepare_ds(lseg, ds_idx, true)) + goto out; + + if (mirror && mirror->cred) + cred = mirror->cred; + else + cred = mdscred; +out: + return cred; +} + +/** +* Find or create a DS rpc client with the MDS server rpc client auth flavor +* in the nfs_client cl_ds_clients list.
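+*
+* For an NFSv3 DS the flavor was fixed when the DS connection was
+* created, so the existing cl_rpcclient is returned as-is; for NFSv4
+* a DS client matching the MDS auth flavor is obtained via
+* nfs4_find_or_create_ds_client().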
+*/ +struct rpc_clnt * +nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, u32 ds_idx, + struct nfs_client *ds_clp, struct inode *inode) +{ + struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx); + + switch (mirror->mirror_ds->ds_versions[0].version) { + case 3: + /* For NFSv3 DS, flavor is set when creating DS connections */ + return ds_clp->cl_rpcclient; + case 4: + return nfs4_find_or_create_ds_client(ds_clp, inode); + default: + BUG(); + } +} + +static bool is_range_intersecting(u64 offset1, u64 length1, + u64 offset2, u64 length2) +{ + u64 end1 = end_offset(offset1, length1); + u64 end2 = end_offset(offset2, length2); + + return (end1 == NFS4_MAX_UINT64 || end1 > offset2) && + (end2 == NFS4_MAX_UINT64 || end2 > offset1); +} + +/* called with inode i_lock held */ +int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo, + struct xdr_stream *xdr, int *count, + const struct pnfs_layout_range *range) +{ + struct nfs4_ff_layout_ds_err *err, *n; + __be32 *p; + + list_for_each_entry_safe(err, n, &flo->error_list, list) { + if (!is_range_intersecting(err->offset, err->length, + range->offset, range->length)) + continue; + /* offset(8) + length(8) + stateid(NFS4_STATEID_SIZE) + * + deviceid(NFS4_DEVICEID4_SIZE) + status(4) + opnum(4) + */ + p = xdr_reserve_space(xdr, + 24 + NFS4_STATEID_SIZE + NFS4_DEVICEID4_SIZE); + if (unlikely(!p)) + return -ENOBUFS; + p = xdr_encode_hyper(p, err->offset); + p = xdr_encode_hyper(p, err->length); + p = xdr_encode_opaque_fixed(p, &err->stateid, + NFS4_STATEID_SIZE); + p = xdr_encode_opaque_fixed(p, &err->deviceid, + NFS4_DEVICEID4_SIZE); + *p++ = cpu_to_be32(err->status); + *p++ = cpu_to_be32(err->opnum); + *count += 1; + list_del(&err->list); + kfree(err); + dprintk("%s: offset %llu length %llu status %d op %d count %d\n", + __func__, err->offset, err->length, err->status, + err->opnum, *count); + } + + return 0; +} + +bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg) +{ + struct nfs4_ff_layout_mirror *mirror; + struct nfs4_deviceid_node *devid; + int idx; + + for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) { + mirror = FF_LAYOUT_COMP(lseg, idx); + if (mirror && mirror->mirror_ds) { + devid = &mirror->mirror_ds->id_node; + if (!ff_layout_test_devid_unavailable(devid)) + return true; + } + } + + return false; +} + +module_param(dataserver_retrans, uint, 0644); +MODULE_PARM_DESC(dataserver_retrans, "The number of times the NFSv4.1 client " + "retries a request before it attempts further " + " recovery action."); +module_param(dataserver_timeo, uint, 0644); +MODULE_PARM_DESC(dataserver_timeo, "The time (in tenths of a second) the " + "NFSv4.1 client waits for a response from a " + " data server before it retries an NFS request."); diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index 2f5db844c172..857e2a99acc8 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -152,7 +152,7 @@ void nfs_fattr_map_and_free_names(struct nfs_server *server, struct nfs_fattr *f nfs_fattr_free_group_name(fattr); } -static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res) +int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res) { unsigned long val; char buf[16]; @@ -166,6 +166,7 @@ static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *re *res = val; return 1; } +EXPORT_SYMBOL_GPL(nfs_map_string_to_numeric); static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen) { diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 
44c600aac907..ca6dda0f68bb 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7796,9 +7796,7 @@ static void nfs4_layoutreturn_release(void *calldata) spin_lock(&lo->plh_inode->i_lock); if (lrp->res.lrs_present) pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); - clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags); - smp_mb__after_atomic(); - wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN); + pnfs_clear_layoutreturn_waitbit(lo); clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags); rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq); lo->plh_block_lgets--; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index c4c9fe606ae6..0fb0f1920a1f 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -910,7 +910,9 @@ send_layoutget(struct pnfs_layout_hdr *lo, pnfs_layout_io_set_failed(lo, range->iomode); } return NULL; - } + } else + pnfs_layout_clear_fail_bit(lo, + pnfs_iomode_to_fail_bit(range->iomode)); return lseg; } @@ -930,6 +932,13 @@ static void pnfs_clear_layoutcommit(struct inode *inode, } } +void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo) +{ + clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags); + smp_mb__after_atomic(); + wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN); +} + static int pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid, enum pnfs_iomode iomode, bool sync) @@ -943,6 +952,7 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid, status = -ENOMEM; spin_lock(&ino->i_lock); lo->plh_block_lgets--; + pnfs_clear_layoutreturn_waitbit(lo); rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq); spin_unlock(&ino->i_lock); pnfs_put_layout_hdr(lo); @@ -1418,6 +1428,15 @@ static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo) TASK_UNINTERRUPTIBLE); } +static void pnfs_clear_first_layoutget(struct pnfs_layout_hdr *lo) +{ + unsigned long *bitlock = &lo->plh_flags; + + clear_bit_unlock(NFS_LAYOUT_FIRST_LAYOUTGET, bitlock); + smp_mb__after_atomic(); + wake_up_bit(bitlock, NFS_LAYOUT_FIRST_LAYOUTGET); +} + /* * Layout segment is retrieved from the server if not cached. * The appropriate layout segment is referenced and returned to the caller.
@@ -1499,6 +1518,8 @@ lookup_again: spin_unlock(&ino->i_lock); dprintk("%s wait for layoutreturn\n", __func__); if (pnfs_prepare_to_retry_layoutget(lo)) { + if (first) + pnfs_clear_first_layoutget(lo); pnfs_put_layout_hdr(lo); dprintk("%s retrying\n", __func__); goto lookup_again; @@ -1533,13 +1554,8 @@ lookup_again: pnfs_clear_retry_layoutget(lo); atomic_dec(&lo->plh_outstanding); out_put_layout_hdr: - if (first) { - unsigned long *bitlock = &lo->plh_flags; - - clear_bit_unlock(NFS_LAYOUT_FIRST_LAYOUTGET, bitlock); - smp_mb__after_atomic(); - wake_up_bit(bitlock, NFS_LAYOUT_FIRST_LAYOUTGET); - } + if (first) + pnfs_clear_first_layoutget(lo); pnfs_put_layout_hdr(lo); out: dprintk("%s: inode %s/%llu pNFS layout segment %s for " diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 49a466708400..7642021484bf 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -278,6 +278,7 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, u64 count, enum pnfs_iomode iomode, gfp_t gfp_flags); +void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo); void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp); int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *); diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 022b761dbf0a..de7c91ca427e 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -516,6 +516,7 @@ enum pnfs_layouttype { LAYOUT_NFSV4_1_FILES = 1, LAYOUT_OSD2_OBJECTS = 2, LAYOUT_BLOCK_VOLUME = 3, + LAYOUT_FLEX_FILES = 4, }; /* used for both layout return and recall */ diff --git a/include/linux/nfs_idmap.h b/include/linux/nfs_idmap.h index 0f4b79da6584..333844e38f66 100644 --- a/include/linux/nfs_idmap.h +++ b/include/linux/nfs_idmap.h @@ -73,5 +73,7 @@ int nfs_map_group_to_gid(const struct nfs_server *, const char *, size_t, kgid_t int nfs_map_uid_to_name(const struct nfs_server *, kuid_t, char *, size_t); int nfs_map_gid_to_group(const struct nfs_server *, kgid_t, char *, size_t); +int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res); + extern unsigned int nfs_idmap_cache_timeout; #endif /* NFS_IDMAP_H */ diff --git a/include/linux/sunrpc/metrics.h b/include/linux/sunrpc/metrics.h index 89f2ca178873..7e61a17030a4 100644 --- a/include/linux/sunrpc/metrics.h +++ b/include/linux/sunrpc/metrics.h @@ -89,6 +89,8 @@ void rpc_free_iostats(struct rpc_iostats *); static inline struct rpc_iostats *rpc_alloc_iostats(struct rpc_clnt *clnt) { return NULL; } static inline void rpc_count_iostats(const struct rpc_task *task, struct rpc_iostats *stats) {} +static inline void rpc_count_iostats_metrics(const struct rpc_task *task, + struct rpc_iostats *stats) {} static inline void rpc_print_iostats(struct seq_file *seq, struct rpc_clnt *clnt) {} static inline void rpc_free_iostats(struct rpc_iostats *stats) {} -- cgit v1.2.3 From 344d635b9a41b19837ccf8083a99ea688027019c Mon Sep 17 00:00:00 2001 From: Brad Griffis Date: Tue, 3 Feb 2015 11:44:12 -0800 Subject: Input: ti_am335x_tsc - remove udelay in interrupt handler The TSC interrupt handler used a udelay to avoid reporting false pen-up interrupts to user space. This patch implements the workaround suggested in Advisory 1.0.31 of the silicon errata for am335x, thus eliminating the udelay and the touchscreen lag. This also improves touchscreen performance and eliminates the sudden cursor jump at touch release. The IDLECONFIG and CHARGECONFIG registers must be configured with the same values in order to eliminate false pen-up events.
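As a minimal sketch of the register programming this describes (the helper name is hypothetical; titsc_readl()/titsc_writel(), struct titsc and the REG_*/CHARGEDLY_* symbols are the driver's own, taken from the diff below):

	/* Hypothetical helper, for illustration only: program the charge
	 * step identically to the idle step and apply a long charge delay,
	 * per the Advisory 1.0.31 workaround. */
	static void titsc_apply_pen_up_workaround(struct titsc *ts_dev)
	{
		unsigned int config;

		config = titsc_readl(ts_dev, REG_IDLECONFIG);
		titsc_writel(ts_dev, REG_CHARGECONFIG, config);
		titsc_writel(ts_dev, REG_CHARGEDELAY, CHARGEDLY_OPENDLY);
	}

The diff below applies exactly this change inside titsc_step_config().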
This workaround may result in a false pen-down being detected, hence a considerable charge step delay needs to be added. The charge delay is set to 0xB000 (in terms of ADC clock cycles) by default. TSC steps are disabled at the end of every sampling cycle and the EOS bit is set. Once the EOS bit is set, the TSC steps need to be re-enabled to begin the next sampling cycle. Signed-off-by: Brad Griffis [vigneshr@ti.com: Ported the patch from v3.12 to v3.19rc1] Signed-off-by: Vignesh R Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/ti_am335x_tsc.c | 67 ++++++++++++++----------------- include/linux/mfd/ti_am335x_tscadc.h | 3 +- 2 files changed, 33 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/drivers/input/touchscreen/ti_am335x_tsc.c b/drivers/input/touchscreen/ti_am335x_tsc.c index dfbb9fe6a270..0625c102a1d0 100644 --- a/drivers/input/touchscreen/ti_am335x_tsc.c +++ b/drivers/input/touchscreen/ti_am335x_tsc.c @@ -173,11 +173,9 @@ static void titsc_step_config(struct titsc *ts_dev) titsc_writel(ts_dev, REG_STEPDELAY(i), STEPCONFIG_OPENDLY); } - /* Charge step configuration */ - config = ts_dev->bit_xp | ts_dev->bit_yn | - STEPCHARGE_RFP_XPUL | STEPCHARGE_RFM_XNUR | - STEPCHARGE_INM_AN1 | STEPCHARGE_INP(ts_dev->inp_yp); + /* Make CHARGECONFIG same as IDLECONFIG */ + config = titsc_readl(ts_dev, REG_IDLECONFIG); titsc_writel(ts_dev, REG_CHARGECONFIG, config); titsc_writel(ts_dev, REG_CHARGEDELAY, CHARGEDLY_OPENDLY); @@ -261,12 +259,34 @@ static irqreturn_t titsc_irq(int irq, void *dev) { struct titsc *ts_dev = dev; struct input_dev *input_dev = ts_dev->input; - unsigned int status, irqclr = 0; + unsigned int fsm, status, irqclr = 0; unsigned int x = 0, y = 0; unsigned int z1, z2, z; - unsigned int fsm; - status = titsc_readl(ts_dev, REG_IRQSTATUS); + status = titsc_readl(ts_dev, REG_RAWIRQSTATUS); + if (status & IRQENB_HW_PEN) { + ts_dev->pen_down = true; + titsc_writel(ts_dev, REG_IRQWAKEUP, 0x00); + titsc_writel(ts_dev, REG_IRQCLR, IRQENB_HW_PEN); + irqclr |= IRQENB_HW_PEN; + } + + if (status & IRQENB_PENUP) { + fsm = titsc_readl(ts_dev, REG_ADCFSM); + if (fsm == ADCFSM_STEPID) { + ts_dev->pen_down = false; + input_report_key(input_dev, BTN_TOUCH, 0); + input_report_abs(input_dev, ABS_PRESSURE, 0); + input_sync(input_dev); + } else { + ts_dev->pen_down = true; + } + irqclr |= IRQENB_PENUP; + } + + if (status & IRQENB_EOS) + irqclr |= IRQENB_EOS; + /* * ADC and touchscreen share the IRQ line. * FIFO1 interrupts are used by ADC. Handle FIFO0 IRQs here only @@ -297,37 +317,11 @@ static irqreturn_t titsc_irq(int irq, void *dev) } irqclr |= IRQENB_FIFO0THRES; } - - /* - * Time for sequencer to settle, to read - * correct state of the sequencer.
- */ - udelay(SEQ_SETTLE); - - status = titsc_readl(ts_dev, REG_RAWIRQSTATUS); - if (status & IRQENB_PENUP) { - /* Pen up event */ - fsm = titsc_readl(ts_dev, REG_ADCFSM); - if (fsm == ADCFSM_STEPID) { - ts_dev->pen_down = false; - input_report_key(input_dev, BTN_TOUCH, 0); - input_report_abs(input_dev, ABS_PRESSURE, 0); - input_sync(input_dev); - } else { - ts_dev->pen_down = true; - } - irqclr |= IRQENB_PENUP; - } - - if (status & IRQENB_HW_PEN) { - - titsc_writel(ts_dev, REG_IRQWAKEUP, 0x00); - titsc_writel(ts_dev, REG_IRQCLR, IRQENB_HW_PEN); - } - if (irqclr) { titsc_writel(ts_dev, REG_IRQSTATUS, irqclr); - am335x_tsc_se_set_cache(ts_dev->mfd_tscadc, ts_dev->step_mask); + if (status & IRQENB_EOS) + am335x_tsc_se_set_cache(ts_dev->mfd_tscadc, + ts_dev->step_mask); return IRQ_HANDLED; } return IRQ_NONE; @@ -417,6 +411,7 @@ static int titsc_probe(struct platform_device *pdev) } titsc_writel(ts_dev, REG_IRQENABLE, IRQENB_FIFO0THRES); + titsc_writel(ts_dev, REG_IRQENABLE, IRQENB_EOS); err = titsc_config_wires(ts_dev); if (err) { dev_err(&pdev->dev, "wrong i/p wire configuration\n"); diff --git a/include/linux/mfd/ti_am335x_tscadc.h b/include/linux/mfd/ti_am335x_tscadc.h index e2e70053470e..3f4e994ace2b 100644 --- a/include/linux/mfd/ti_am335x_tscadc.h +++ b/include/linux/mfd/ti_am335x_tscadc.h @@ -52,6 +52,7 @@ /* IRQ enable */ #define IRQENB_HW_PEN BIT(0) +#define IRQENB_EOS BIT(1) #define IRQENB_FIFO0THRES BIT(2) #define IRQENB_FIFO0OVRRUN BIT(3) #define IRQENB_FIFO0UNDRFLW BIT(4) @@ -107,7 +108,7 @@ /* Charge delay */ #define CHARGEDLY_OPEN_MASK (0x3FFFF << 0) #define CHARGEDLY_OPEN(val) ((val) << 0) -#define CHARGEDLY_OPENDLY CHARGEDLY_OPEN(1) +#define CHARGEDLY_OPENDLY CHARGEDLY_OPEN(0x400) /* Control register */ #define CNTRLREG_TSCSSENB BIT(0) -- cgit v1.2.3 From a49170b552423a3e85fc4f0d778c707402ee4863 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 2 Feb 2015 10:42:58 +0800 Subject: ACPI: Return translation offset when parsing ACPI address space resources Change function acpi_dev_resource_address_space() and acpi_dev_resource_ext_address_space() to return address space translation offset. It's based on a patch from Yinghai Lu . Signed-off-by: Jiang Liu Signed-off-by: Rafael J. Wysocki --- drivers/acpi/resource.c | 58 +++++++++++++++++++++++------------------- drivers/pnp/pnpacpi/rsparser.c | 29 +++++++++++---------- include/linux/acpi.h | 9 +++++-- 3 files changed, 54 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/resource.c b/drivers/acpi/resource.c index 57891a621b96..c902c8eece81 100644 --- a/drivers/acpi/resource.c +++ b/drivers/acpi/resource.c @@ -184,13 +184,14 @@ bool acpi_dev_resource_io(struct acpi_resource *ares, struct resource *res) } EXPORT_SYMBOL_GPL(acpi_dev_resource_io); -static bool acpi_decode_space(struct resource *res, +static bool acpi_decode_space(struct resource_win *win, struct acpi_resource_address *addr, struct acpi_address64_attribute *attr) { u8 iodec = attr->granularity == 0xfff ? 
ACPI_DECODE_10 : ACPI_DECODE_16; bool wp = addr->info.mem.write_protect; u64 len = attr->address_length; + struct resource *res = &win->res; /* * Filter out invalid descriptor according to ACPI Spec 5.0, section @@ -218,6 +219,8 @@ static bool acpi_decode_space(struct resource *res, return false; } + win->offset = attr->translation_offset; + if (addr->producer_consumer == ACPI_PRODUCER) res->flags |= IORESOURCE_WINDOW; @@ -230,27 +233,28 @@ static bool acpi_decode_space(struct resource *res, /** * acpi_dev_resource_address_space - Extract ACPI address space information. * @ares: Input ACPI resource object. - * @res: Output generic resource object. + * @win: Output generic resource object. * * Check if the given ACPI resource object represents an address space resource * and if that's the case, use the information in it to populate the generic - * resource object pointed to by @res. + * resource object pointed to by @win. * * Return: - * 1) false with res->flags setting to zero: not the expected resource type - * 2) false with IORESOURCE_DISABLED in res->flags: valid unassigned resource + * 1) false with win->res.flags setting to zero: not the expected resource type + * 2) false with IORESOURCE_DISABLED in win->res.flags: valid unassigned + * resource * 3) true: valid assigned resource */ bool acpi_dev_resource_address_space(struct acpi_resource *ares, - struct resource *res) + struct resource_win *win) { struct acpi_resource_address64 addr; - res->flags = 0; + win->res.flags = 0; if (ACPI_FAILURE(acpi_resource_to_address64(ares, &addr))) return false; - return acpi_decode_space(res, (struct acpi_resource_address *)&addr, + return acpi_decode_space(win, (struct acpi_resource_address *)&addr, &addr.address); } EXPORT_SYMBOL_GPL(acpi_dev_resource_address_space); @@ -258,29 +262,30 @@ EXPORT_SYMBOL_GPL(acpi_dev_resource_address_space); /** * acpi_dev_resource_ext_address_space - Extract ACPI address space information. * @ares: Input ACPI resource object. - * @res: Output generic resource object. + * @win: Output generic resource object. * * Check if the given ACPI resource object represents an extended address space * resource and if that's the case, use the information in it to populate the - * generic resource object pointed to by @res. + * generic resource object pointed to by @win. 
* * Return: - * 1) false with res->flags setting to zero: not the expected resource type - * 2) false with IORESOURCE_DISABLED in res->flags: valid unassigned resource + * 1) false with win->res.flags setting to zero: not the expected resource type + * 2) false with IORESOURCE_DISABLED in win->res.flags: valid unassigned + * resource * 3) true: valid assigned resource */ bool acpi_dev_resource_ext_address_space(struct acpi_resource *ares, - struct resource *res) + struct resource_win *win) { struct acpi_resource_extended_address64 *ext_addr; - res->flags = 0; + win->res.flags = 0; if (ares->type != ACPI_RESOURCE_TYPE_EXTENDED_ADDRESS64) return false; ext_addr = &ares->data.ext_address64; - return acpi_decode_space(res, (struct acpi_resource_address *)ext_addr, + return acpi_decode_space(win, (struct acpi_resource_address *)ext_addr, &ext_addr->address); } EXPORT_SYMBOL_GPL(acpi_dev_resource_ext_address_space); @@ -441,7 +446,7 @@ struct res_proc_context { int error; }; -static acpi_status acpi_dev_new_resource_entry(struct resource *r, +static acpi_status acpi_dev_new_resource_entry(struct resource_win *win, struct res_proc_context *c) { struct resource_list_entry *rentry; @@ -451,7 +456,7 @@ static acpi_status acpi_dev_new_resource_entry(struct resource *r, c->error = -ENOMEM; return AE_NO_MEMORY; } - rentry->res = *r; + rentry->res = win->res; list_add_tail(&rentry->node, c->list); c->count++; return AE_OK; @@ -461,7 +466,8 @@ static acpi_status acpi_dev_process_resource(struct acpi_resource *ares, void *context) { struct res_proc_context *c = context; - struct resource r; + struct resource_win win; + struct resource *res = &win.res; int i; if (c->preproc) { @@ -476,18 +482,18 @@ static acpi_status acpi_dev_process_resource(struct acpi_resource *ares, } } - memset(&r, 0, sizeof(r)); + memset(&win, 0, sizeof(win)); - if (acpi_dev_resource_memory(ares, &r) - || acpi_dev_resource_io(ares, &r) - || acpi_dev_resource_address_space(ares, &r) - || acpi_dev_resource_ext_address_space(ares, &r)) - return acpi_dev_new_resource_entry(&r, c); + if (acpi_dev_resource_memory(ares, res) + || acpi_dev_resource_io(ares, res) + || acpi_dev_resource_address_space(ares, &win) + || acpi_dev_resource_ext_address_space(ares, &win)) + return acpi_dev_new_resource_entry(&win, c); - for (i = 0; acpi_dev_resource_interrupt(ares, i, &r); i++) { + for (i = 0; acpi_dev_resource_interrupt(ares, i, res); i++) { acpi_status status; - status = acpi_dev_new_resource_entry(&r, c); + status = acpi_dev_new_resource_entry(&win, c); if (ACPI_FAILURE(status)) return status; } diff --git a/drivers/pnp/pnpacpi/rsparser.c b/drivers/pnp/pnpacpi/rsparser.c index 2d9bc789af0f..ff0356fb378f 100644 --- a/drivers/pnp/pnpacpi/rsparser.c +++ b/drivers/pnp/pnpacpi/rsparser.c @@ -180,20 +180,21 @@ static acpi_status pnpacpi_allocated_resource(struct acpi_resource *res, struct pnp_dev *dev = data; struct acpi_resource_dma *dma; struct acpi_resource_vendor_typed *vendor_typed; - struct resource r = {0}; + struct resource_win win = {{0}, 0}; + struct resource *r = &win.res; int i, flags; - if (acpi_dev_resource_address_space(res, &r) - || acpi_dev_resource_ext_address_space(res, &r)) { - pnp_add_resource(dev, &r); + if (acpi_dev_resource_address_space(res, &win) + || acpi_dev_resource_ext_address_space(res, &win)) { + pnp_add_resource(dev, &win.res); return AE_OK; } - r.flags = 0; - if (acpi_dev_resource_interrupt(res, 0, &r)) { - pnpacpi_add_irqresource(dev, &r); - for (i = 1; acpi_dev_resource_interrupt(res, i, &r); i++) - 
pnpacpi_add_irqresource(dev, &r); + r->flags = 0; + if (acpi_dev_resource_interrupt(res, 0, r)) { + pnpacpi_add_irqresource(dev, r); + for (i = 1; acpi_dev_resource_interrupt(res, i, r); i++) + pnpacpi_add_irqresource(dev, r); if (i > 1) { /* @@ -209,7 +210,7 @@ static acpi_status pnpacpi_allocated_resource(struct acpi_resource *res, } } return AE_OK; - } else if (r.flags & IORESOURCE_DISABLED) { + } else if (r->flags & IORESOURCE_DISABLED) { pnp_add_irq_resource(dev, 0, IORESOURCE_DISABLED); return AE_OK; } @@ -218,13 +219,13 @@ case ACPI_RESOURCE_TYPE_MEMORY24: case ACPI_RESOURCE_TYPE_MEMORY32: case ACPI_RESOURCE_TYPE_FIXED_MEMORY32: - if (acpi_dev_resource_memory(res, &r)) - pnp_add_resource(dev, &r); + if (acpi_dev_resource_memory(res, r)) + pnp_add_resource(dev, r); break; case ACPI_RESOURCE_TYPE_IO: case ACPI_RESOURCE_TYPE_FIXED_IO: - if (acpi_dev_resource_io(res, &r)) - pnp_add_resource(dev, &r); + if (acpi_dev_resource_io(res, r)) + pnp_add_resource(dev, r); break; case ACPI_RESOURCE_TYPE_DMA: dma = &res->data.dma; diff --git a/include/linux/acpi.h b/include/linux/acpi.h index d459cd17b477..be9eaee8f4ae 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -285,12 +285,17 @@ extern int pnpacpi_disabled; #define PXM_INVAL (-1) +struct resource_win { + struct resource res; + resource_size_t offset; +}; + bool acpi_dev_resource_memory(struct acpi_resource *ares, struct resource *res); bool acpi_dev_resource_io(struct acpi_resource *ares, struct resource *res); bool acpi_dev_resource_address_space(struct acpi_resource *ares, - struct resource *res); + struct resource_win *win); bool acpi_dev_resource_ext_address_space(struct acpi_resource *ares, - struct resource *res); + struct resource_win *win); unsigned long acpi_dev_irq_flags(u8 triggering, u8 polarity, u8 shareable); bool acpi_dev_resource_interrupt(struct acpi_resource *ares, int index, struct resource *res); -- cgit v1.2.3 From 93286f4798590e711aa395503401f8632fb74f9a Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 2 Feb 2015 10:43:00 +0800 Subject: ACPI: Add field offset to struct resource_list_entry Add an offset field to struct resource_list_entry to hold the address space translation offset, so that it can be used to represent bridge resources. Signed-off-by: Jiang Liu Signed-off-by: Rafael J.
Wysocki --- drivers/acpi/resource.c | 1 + include/linux/acpi.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/drivers/acpi/resource.c b/drivers/acpi/resource.c index 4dc8cfb2e94e..1c3abae6f2fa 100644 --- a/drivers/acpi/resource.c +++ b/drivers/acpi/resource.c @@ -472,6 +472,7 @@ static acpi_status acpi_dev_new_resource_entry(struct resource_win *win, return AE_NO_MEMORY; } rentry->res = win->res; + rentry->offset = win->offset; list_add_tail(&rentry->node, c->list); c->count++; return AE_OK; diff --git a/include/linux/acpi.h b/include/linux/acpi.h index be9eaee8f4ae..21dac3cb62d2 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -303,6 +303,7 @@ bool acpi_dev_resource_interrupt(struct acpi_resource *ares, int index, struct resource_list_entry { struct list_head node; struct resource res; + resource_size_t offset; }; void acpi_dev_free_resource_list(struct list_head *list); -- cgit v1.2.3 From 62d1141ff34e35de496ba06491c8e854b23b3f3e Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 2 Feb 2015 10:43:01 +0800 Subject: ACPI: Introduce helper function acpi_dev_filter_resource_type() Introduce helper function acpi_dev_filter_resource_type(), which may be used by acpi_dev_get_resources() to filter out resources based on resource type. Signed-off-by: Jiang Liu Signed-off-by: Rafael J. Wysocki --- drivers/acpi/resource.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/acpi.h | 8 +++++++ 2 files changed, 63 insertions(+) (limited to 'include/linux') diff --git a/drivers/acpi/resource.c b/drivers/acpi/resource.c index 1c3abae6f2fa..3ea0d17eb951 100644 --- a/drivers/acpi/resource.c +++ b/drivers/acpi/resource.c @@ -569,3 +569,58 @@ int acpi_dev_get_resources(struct acpi_device *adev, struct list_head *list, return c.count; } EXPORT_SYMBOL_GPL(acpi_dev_get_resources); + +/** + * acpi_dev_filter_resource_type - Filter ACPI resource according to resource + * types + * @ares: Input ACPI resource object. + * @types: Valid resource types of IORESOURCE_XXX + * + * This is a helper function to support acpi_dev_get_resources(), which filters + * ACPI resource objects according to resource types. + */ +int acpi_dev_filter_resource_type(struct acpi_resource *ares, + unsigned long types) +{ + unsigned long type = 0; + + switch (ares->type) { + case ACPI_RESOURCE_TYPE_MEMORY24: + case ACPI_RESOURCE_TYPE_MEMORY32: + case ACPI_RESOURCE_TYPE_FIXED_MEMORY32: + type = IORESOURCE_MEM; + break; + case ACPI_RESOURCE_TYPE_IO: + case ACPI_RESOURCE_TYPE_FIXED_IO: + type = IORESOURCE_IO; + break; + case ACPI_RESOURCE_TYPE_IRQ: + case ACPI_RESOURCE_TYPE_EXTENDED_IRQ: + type = IORESOURCE_IRQ; + break; + case ACPI_RESOURCE_TYPE_DMA: + case ACPI_RESOURCE_TYPE_FIXED_DMA: + type = IORESOURCE_DMA; + break; + case ACPI_RESOURCE_TYPE_GENERIC_REGISTER: + type = IORESOURCE_REG; + break; + case ACPI_RESOURCE_TYPE_ADDRESS16: + case ACPI_RESOURCE_TYPE_ADDRESS32: + case ACPI_RESOURCE_TYPE_ADDRESS64: + case ACPI_RESOURCE_TYPE_EXTENDED_ADDRESS64: + if (ares->data.address.resource_type == ACPI_MEMORY_RANGE) + type = IORESOURCE_MEM; + else if (ares->data.address.resource_type == ACPI_IO_RANGE) + type = IORESOURCE_IO; + else if (ares->data.address.resource_type == + ACPI_BUS_NUMBER_RANGE) + type = IORESOURCE_BUS; + break; + default: + break; + } + + return (type & types) ?
0 : 1; +} +EXPORT_SYMBOL_GPL(acpi_dev_filter_resource_type); diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 21dac3cb62d2..e818decb631f 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -310,6 +310,14 @@ void acpi_dev_free_resource_list(struct list_head *list); int acpi_dev_get_resources(struct acpi_device *adev, struct list_head *list, int (*preproc)(struct acpi_resource *, void *), void *preproc_data); +int acpi_dev_filter_resource_type(struct acpi_resource *ares, + unsigned long types); + +static inline int acpi_dev_filter_resource_type_cb(struct acpi_resource *ares, + void *arg) +{ + return acpi_dev_filter_resource_type(ares, (unsigned long)arg); +} int acpi_check_resource_conflict(const struct resource *res); -- cgit v1.2.3 From 03a9a42a1a7e5b3e7919ddfacc1d1cc81882a955 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 30 Jan 2015 18:12:28 -0500 Subject: SUNRPC: NULL utsname dereference on NFS umount during namespace cleanup Fix an Oopsable condition when nsm_mon_unmon is called as part of the namespace cleanup, which now apparently happens after the utsname has been freed. Link: http://lkml.kernel.org/r/20150125220604.090121ae@neptune.home Reported-by: Bruno Prémont Cc: stable@vger.kernel.org # 3.18 Signed-off-by: Trond Myklebust --- fs/lockd/mon.c | 13 +++++++++---- include/linux/sunrpc/clnt.h | 3 ++- net/sunrpc/clnt.c | 12 +++++++----- net/sunrpc/rpcb_clnt.c | 8 ++++++-- 4 files changed, 24 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 1cc6ec51e6b1..47a32b6d9b90 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -65,7 +65,7 @@ static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm) return (struct sockaddr *)&nsm->sm_addr; } -static struct rpc_clnt *nsm_create(struct net *net) +static struct rpc_clnt *nsm_create(struct net *net, const char *nodename) { struct sockaddr_in sin = { .sin_family = AF_INET, @@ -77,6 +77,7 @@ static struct rpc_clnt *nsm_create(struct net *net) .address = (struct sockaddr *)&sin, .addrsize = sizeof(sin), .servername = "rpc.statd", + .nodename = nodename, .program = &nsm_program, .version = NSM_VERSION, .authflavor = RPC_AUTH_NULL, @@ -102,7 +103,7 @@ out: return clnt; } -static struct rpc_clnt *nsm_client_get(struct net *net) +static struct rpc_clnt *nsm_client_get(struct net *net, const char *nodename) { struct rpc_clnt *clnt, *new; struct lockd_net *ln = net_generic(net, lockd_net_id); @@ -111,7 +112,7 @@ static struct rpc_clnt *nsm_client_get(struct net *net) if (clnt != NULL) goto out; - clnt = new = nsm_create(net); + clnt = new = nsm_create(net, nodename); if (IS_ERR(clnt)) goto out; @@ -190,19 +191,23 @@ int nsm_monitor(const struct nlm_host *host) struct nsm_res res; int status; struct rpc_clnt *clnt; + const char *nodename = NULL; dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name); if (nsm->sm_monitored) return 0; + if (host->h_rpcclnt) + nodename = host->h_rpcclnt->cl_nodename; + /* * Choose whether to record the caller_name or IP address of * this peer in the local rpc.statd's database. */ nsm->sm_mon_name = nsm_use_hostnames ?
nsm->sm_name : nsm->sm_addrbuf; - clnt = nsm_client_get(host->net); + clnt = nsm_client_get(host->net, nodename); if (IS_ERR(clnt)) { status = PTR_ERR(clnt); dprintk("lockd: failed to create NSM upcall transport, " diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index d86acc63b25f..598ba80ec30c 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -57,7 +57,7 @@ struct rpc_clnt { const struct rpc_timeout *cl_timeout; /* Timeout strategy */ int cl_nodelen; /* nodename length */ - char cl_nodename[UNX_MAXNODENAME]; + char cl_nodename[UNX_MAXNODENAME+1]; struct rpc_pipe_dir_head cl_pipedir_objects; struct rpc_clnt * cl_parent; /* Points to parent of clones */ struct rpc_rtt cl_rtt_default; @@ -112,6 +112,7 @@ struct rpc_create_args { struct sockaddr *saddress; const struct rpc_timeout *timeout; const char *servername; + const char *nodename; const struct rpc_program *program; u32 prognumber; /* overrides program->number */ u32 version; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 05da12a33945..3f5d4d48f0cb 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -286,10 +286,8 @@ static struct rpc_xprt *rpc_clnt_set_transport(struct rpc_clnt *clnt, static void rpc_clnt_set_nodename(struct rpc_clnt *clnt, const char *nodename) { - clnt->cl_nodelen = strlen(nodename); - if (clnt->cl_nodelen > UNX_MAXNODENAME) - clnt->cl_nodelen = UNX_MAXNODENAME; - memcpy(clnt->cl_nodename, nodename, clnt->cl_nodelen); + clnt->cl_nodelen = strlcpy(clnt->cl_nodename, + nodename, sizeof(clnt->cl_nodename)); } static int rpc_client_register(struct rpc_clnt *clnt, @@ -365,6 +363,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, const struct rpc_version *version; struct rpc_clnt *clnt = NULL; const struct rpc_timeout *timeout; + const char *nodename = args->nodename; int err; /* sanity check the name before trying to print it */ @@ -420,8 +419,10 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, atomic_set(&clnt->cl_count, 1); + if (nodename == NULL) + nodename = utsname()->nodename; /* save the nodename */ - rpc_clnt_set_nodename(clnt, utsname()->nodename); + rpc_clnt_set_nodename(clnt, nodename); err = rpc_client_register(clnt, args->authflavor, args->client_name); if (err) @@ -576,6 +577,7 @@ static struct rpc_clnt *__rpc_clone_client(struct rpc_create_args *args, if (xprt == NULL) goto out_err; args->servername = xprt->servername; + args->nodename = clnt->cl_nodename; new = rpc_new_client(args, xprt, clnt); if (IS_ERR(new)) { diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 05202012bcfc..cf5770d8f49a 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -355,7 +355,8 @@ out: return result; } -static struct rpc_clnt *rpcb_create(struct net *net, const char *hostname, +static struct rpc_clnt *rpcb_create(struct net *net, const char *nodename, + const char *hostname, struct sockaddr *srvaddr, size_t salen, int proto, u32 version) { @@ -365,6 +366,7 @@ static struct rpc_clnt *rpcb_create(struct net *net, const char *hostname, .address = srvaddr, .addrsize = salen, .servername = hostname, + .nodename = nodename, .program = &rpcb_program, .version = version, .authflavor = RPC_AUTH_UNIX, @@ -740,7 +742,9 @@ void rpcb_getport_async(struct rpc_task *task) dprintk("RPC: %5u %s: trying rpcbind version %u\n", task->tk_pid, __func__, bind_version); - rpcb_clnt = rpcb_create(xprt->xprt_net, xprt->servername, sap, salen, + rpcb_clnt = rpcb_create(xprt->xprt_net, + 
clnt->cl_nodename, + xprt->servername, sap, salen, xprt->prot, bind_version); if (IS_ERR(rpcb_clnt)) { status = PTR_ERR(rpcb_clnt); -- cgit v1.2.3 From c1dbe2fbb33ef425a81e1a7cffd17c113c87cdbc Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 27 Jan 2015 21:13:39 +0100 Subject: PM / Domains: Remove reference counting for the generic_pm_domain_data The reference counting was needed when genpd supported PM domain device callbacks. Since this option has been removed, let's also remove the reference counting of the struct generic_pm_domain_data. Signed-off-by: Ulf Hansson Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- drivers/base/power/domain.c | 10 ++-------- include/linux/pm_domain.h | 1 - 2 files changed, 2 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index f9e7df554b2f..351df5bbd9c9 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -1456,7 +1456,6 @@ int __pm_genpd_add_device(struct generic_pm_domain *genpd, struct device *dev, gpd_data = gpd_data_new; dev->power.subsys_data->domain_data = &gpd_data->base; } - gpd_data->refcount++; if (td) gpd_data->td = *td; @@ -1504,7 +1503,6 @@ int pm_genpd_remove_device(struct generic_pm_domain *genpd, { struct generic_pm_domain_data *gpd_data; struct pm_domain_data *pdd; - bool remove = false; int ret = 0; dev_dbg(dev, "%s()\n", __func__); @@ -1533,10 +1531,7 @@ int pm_genpd_remove_device(struct generic_pm_domain *genpd, pdd = dev->power.subsys_data->domain_data; list_del_init(&pdd->list_node); gpd_data = to_gpd_data(pdd); - if (--gpd_data->refcount == 0) { - dev->power.subsys_data->domain_data = NULL; - remove = true; - } + dev->power.subsys_data->domain_data = NULL; spin_unlock_irq(&dev->power.lock); @@ -1547,8 +1542,7 @@ int pm_genpd_remove_device(struct generic_pm_domain *genpd, genpd_release_lock(genpd); dev_pm_put_subsys_data(dev); - if (remove) - genpd_free_dev_data(dev, gpd_data); + genpd_free_dev_data(dev, gpd_data); return 0; diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index ed607760fc20..e160a0bba28d 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -114,7 +114,6 @@ struct generic_pm_domain_data { struct gpd_timing_data td; struct notifier_block nb; struct mutex lock; - unsigned int refcount; int need_restore; }; -- cgit v1.2.3 From c0356db7d1b66840882744cbd9d9c5960b2d88c7 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 27 Jan 2015 21:13:42 +0100 Subject: PM / Domains: Eliminate the mutex for the generic_pm_domain_data While adding devices to their PM domains, dev_pm_qos_add_notifier() was invoked while allocating the generic_pm_domain_data for the device. Since the generic_pm_domain_data's device pointer is assigned only after allocation, the ->genpd_dev_pm_qos_notifier() callback could be called prior to having a valid pointer to the device. A similar scenario existed while removing a device from a genpd. To cope with these scenarios, a mutex was used to protect the pointer to the device. By re-ordering the sequence in which dev_pm_qos_add|remove_notifier() are invoked, we make sure the ->genpd_dev_pm_qos_notifier() callback is always called with a valid device pointer available. In this way, we eliminate the need for protecting the pointer and thus can remove the mutex from struct generic_pm_domain_data. Signed-off-by: Ulf Hansson Signed-off-by: Rafael J.
Wysocki --- drivers/base/power/domain.c | 37 ++++++++++++++----------------------- include/linux/pm_domain.h | 1 - 2 files changed, 14 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 88198ba919d9..1f026c18bc5c 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -344,14 +344,7 @@ static int genpd_dev_pm_qos_notifier(struct notifier_block *nb, struct device *dev; gpd_data = container_of(nb, struct generic_pm_domain_data, nb); - - mutex_lock(&gpd_data->lock); dev = gpd_data->base.dev; - if (!dev) { - mutex_unlock(&gpd_data->lock); - return NOTIFY_DONE; - } - mutex_unlock(&gpd_data->lock); for (;;) { struct generic_pm_domain *genpd; @@ -1392,16 +1385,12 @@ static struct generic_pm_domain_data *genpd_alloc_dev_data(struct device *dev) if (!gpd_data) return NULL; - mutex_init(&gpd_data->lock); - gpd_data->nb.notifier_call = genpd_dev_pm_qos_notifier; - dev_pm_qos_add_notifier(dev, &gpd_data->nb); return gpd_data; } static void genpd_free_dev_data(struct device *dev, struct generic_pm_domain_data *gpd_data) { - dev_pm_qos_remove_notifier(dev, &gpd_data->nb); kfree(gpd_data); } @@ -1414,7 +1403,7 @@ static void genpd_free_dev_data(struct device *dev, int __pm_genpd_add_device(struct generic_pm_domain *genpd, struct device *dev, struct gpd_timing_data *td) { - struct generic_pm_domain_data *gpd_data_new, *gpd_data = NULL; + struct generic_pm_domain_data *gpd_data; int ret = 0; dev_dbg(dev, "%s()\n", __func__); @@ -1422,8 +1411,8 @@ int __pm_genpd_add_device(struct generic_pm_domain *genpd, struct device *dev, if (IS_ERR_OR_NULL(genpd) || IS_ERR_OR_NULL(dev)) return -EINVAL; - gpd_data_new = genpd_alloc_dev_data(dev); - if (!gpd_data_new) + gpd_data = genpd_alloc_dev_data(dev); + if (!gpd_data) return -ENOMEM; genpd_acquire_lock(genpd); @@ -1445,7 +1434,6 @@ int __pm_genpd_add_device(struct generic_pm_domain *genpd, struct device *dev, goto out; } - gpd_data = gpd_data_new; dev->power.subsys_data->domain_data = &gpd_data->base; if (td) @@ -1461,19 +1449,20 @@ int __pm_genpd_add_device(struct generic_pm_domain *genpd, struct device *dev, genpd->device_count++; genpd->max_off_time_changed = true; - mutex_lock(&gpd_data->lock); gpd_data->base.dev = dev; list_add_tail(&gpd_data->base.list_node, &genpd->dev_list); gpd_data->need_restore = -1; gpd_data->td.constraint_changed = true; gpd_data->td.effective_constraint_ns = -1; - mutex_unlock(&gpd_data->lock); + gpd_data->nb.notifier_call = genpd_dev_pm_qos_notifier; out: genpd_release_lock(genpd); - if (gpd_data != gpd_data_new) - genpd_free_dev_data(dev, gpd_data_new); + if (ret) + genpd_free_dev_data(dev, gpd_data); + else + dev_pm_qos_add_notifier(dev, &gpd_data->nb); return ret; } @@ -1509,6 +1498,11 @@ int pm_genpd_remove_device(struct generic_pm_domain *genpd, || pd_to_genpd(dev->pm_domain) != genpd) return -EINVAL; + /* The above validation also means we have existing domain_data. 
*/ + pdd = dev->power.subsys_data->domain_data; + gpd_data = to_gpd_data(pdd); + dev_pm_qos_remove_notifier(dev, &gpd_data->nb); + genpd_acquire_lock(genpd); if (genpd->prepared_count > 0) { @@ -1525,16 +1519,12 @@ int pm_genpd_remove_device(struct generic_pm_domain *genpd, spin_lock_irq(&dev->power.lock); dev->pm_domain = NULL; - pdd = dev->power.subsys_data->domain_data; list_del_init(&pdd->list_node); - gpd_data = to_gpd_data(pdd); dev->power.subsys_data->domain_data = NULL; spin_unlock_irq(&dev->power.lock); - mutex_lock(&gpd_data->lock); pdd->dev = NULL; - mutex_unlock(&gpd_data->lock); genpd_release_lock(genpd); @@ -1545,6 +1535,7 @@ int pm_genpd_remove_device(struct generic_pm_domain *genpd, out: genpd_release_lock(genpd); + dev_pm_qos_add_notifier(dev, &gpd_data->nb); return ret; } diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index e160a0bba28d..080e778118ba 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -113,7 +113,6 @@ struct generic_pm_domain_data { struct pm_domain_data base; struct gpd_timing_data td; struct notifier_block nb; - struct mutex lock; int need_restore; }; -- cgit v1.2.3 From 1e95e3b2da424db68d0a465273f1901a990c6277 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Thu, 29 Jan 2015 18:39:05 +0100 Subject: PM: Convert dev_pm_put_subsys_data() into a void function Clients using the dev_pm_put_subsys_data() API aren't interested in a return value. They care only about decreasing a reference to the device's pm_subsys_data. So, let's convert the API to a void function, which anyway seems like a reasonable thing to do. Signed-off-by: Ulf Hansson Acked-by: Geert Uytterhoeven Signed-off-by: Rafael J. Wysocki --- drivers/base/power/common.c | 14 ++++---------- include/linux/pm.h | 2 +- 2 files changed, 5 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/power/common.c b/drivers/base/power/common.c index a1ee51d43da1..f32b802b98f4 100644 --- a/drivers/base/power/common.c +++ b/drivers/base/power/common.c @@ -56,13 +56,11 @@ EXPORT_SYMBOL_GPL(dev_pm_get_subsys_data); * @dev: Device to handle. * * If the reference counter of power.subsys_data is zero after dropping the - * reference, power.subsys_data is removed. Return 1 if that happens or 0 - * otherwise. + * reference, power.subsys_data is removed.
*/ -int dev_pm_put_subsys_data(struct device *dev) +void dev_pm_put_subsys_data(struct device *dev) { struct pm_subsys_data *psd; - int ret = 1; spin_lock_irq(&dev->power.lock); @@ -70,18 +68,14 @@ int dev_pm_put_subsys_data(struct device *dev) if (!psd) goto out; - if (--psd->refcount == 0) { + if (--psd->refcount == 0) dev->power.subsys_data = NULL; - } else { + else psd = NULL; - ret = 0; - } out: spin_unlock_irq(&dev->power.lock); kfree(psd); - - return ret; } EXPORT_SYMBOL_GPL(dev_pm_put_subsys_data); diff --git a/include/linux/pm.h b/include/linux/pm.h index 8b5976364619..e2f1be6dd9dd 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -597,7 +597,7 @@ struct dev_pm_info { extern void update_pm_runtime_accounting(struct device *dev); extern int dev_pm_get_subsys_data(struct device *dev); -extern int dev_pm_put_subsys_data(struct device *dev); +extern void dev_pm_put_subsys_data(struct device *dev); /* * Power domains provide callbacks that are executed during system suspend, -- cgit v1.2.3 From 63e144c9d6ffa791c1402f4ee4551c1b9f5a336a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 15 Jan 2015 14:42:27 +0300 Subject: ti-st: clean up data types (fix harmless memory corruption) The big issue here is: of_property_read_u32(np, "flow_cntrl", (u32 *)&dt_pdata->flow_cntrl); "->flow_cntrl" is a char, so when we write a 32-bit number to it, it corrupts memory past the end of the char. It's probably hard to notice because the struct has padding, so the code works on little-endian systems. But on a big-endian system the code would fail, and on 64-bit big-endian systems "nshutdown_gpio" and "baud_rate" would be buggy as well. Signed-off-by: Dan Carpenter Signed-off-by: Greg Kroah-Hartman --- drivers/misc/ti-st/st_kim.c | 12 ++++++------ include/linux/ti_wilink_st.h | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/misc/ti-st/st_kim.c b/drivers/misc/ti-st/st_kim.c index 8fb116f8a152..18e7a03985d4 100644 --- a/drivers/misc/ti-st/st_kim.c +++ b/drivers/misc/ti-st/st_kim.c @@ -638,7 +638,7 @@ static ssize_t show_baud_rate(struct device *dev, struct device_attribute *attr, char *buf) { struct kim_data_s *kim_data = dev_get_drvdata(dev); - return sprintf(buf, "%ld\n", kim_data->baud_rate); + return sprintf(buf, "%d\n", kim_data->baud_rate); } static ssize_t show_flow_cntrl(struct device *dev, @@ -760,9 +760,9 @@ static struct ti_st_plat_data *get_platform_data(struct device *dev) if (dt_property) memcpy(&dt_pdata->dev_name, dt_property, len); of_property_read_u32(np, "nshutdown_gpio", - (u32 *)&dt_pdata->nshutdown_gpio); - of_property_read_u32(np, "flow_cntrl", (u32 *)&dt_pdata->flow_cntrl); - of_property_read_u32(np, "baud_rate", (u32 *)&dt_pdata->baud_rate); + &dt_pdata->nshutdown_gpio); + of_property_read_u32(np, "flow_cntrl", &dt_pdata->flow_cntrl); + of_property_read_u32(np, "baud_rate", &dt_pdata->baud_rate); return dt_pdata; } @@ -812,14 +812,14 @@ static int kim_probe(struct platform_device *pdev) kim_gdata->nshutdown = pdata->nshutdown_gpio; err = gpio_request(kim_gdata->nshutdown, "kim"); if (unlikely(err)) { - pr_err(" gpio %ld request failed ", kim_gdata->nshutdown); + pr_err(" gpio %d request failed ", kim_gdata->nshutdown); return err; } /* Configure nShutdown GPIO as output=0 */ err = gpio_direction_output(kim_gdata->nshutdown, 0); if (unlikely(err)) { - pr_err(" unable to configure gpio %ld", kim_gdata->nshutdown); + pr_err(" unable to configure gpio %d", kim_gdata->nshutdown); return err; } /* get
reference of pdev for request_firmware diff --git a/include/linux/ti_wilink_st.h b/include/linux/ti_wilink_st.h index 9072d9f95cff..c78dcfeaf25f 100644 --- a/include/linux/ti_wilink_st.h +++ b/include/linux/ti_wilink_st.h @@ -262,7 +262,7 @@ struct kim_data_s { struct completion kim_rcvd, ldisc_installed; char resp_buffer[30]; const struct firmware *fw_entry; - long nshutdown; + unsigned nshutdown; unsigned long rx_state; unsigned long rx_count; struct sk_buff *rx_skb; @@ -270,8 +270,8 @@ struct kim_data_s { struct chip_version version; unsigned char ldisc_install; unsigned char dev_name[UART_DEV_NAME_LEN + 1]; - unsigned char flow_cntrl; - unsigned long baud_rate; + unsigned flow_cntrl; + unsigned baud_rate; }; /** @@ -437,10 +437,10 @@ struct gps_event_hdr { * */ struct ti_st_plat_data { - long nshutdown_gpio; + u32 nshutdown_gpio; unsigned char dev_name[UART_DEV_NAME_LEN]; /* uart name */ - unsigned char flow_cntrl; /* flow control flag */ - unsigned long baud_rate; + u32 flow_cntrl; /* flow control flag */ + u32 baud_rate; int (*suspend)(struct platform_device *, pm_message_t); int (*resume)(struct platform_device *); int (*chip_enable) (struct kim_data_s *); -- cgit v1.2.3 From 2cbf7fe2d5d32a4747c1f8ad163e886dccad930c Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Tue, 3 Feb 2015 13:18:55 +0000 Subject: i2o: move to staging The I2O layer deals with a technology that, to say the least, didn't catch on in the market. The only relevant products are some of the AMI MegaRAID - which supported I2O and its native mode (the native mode is faster and runs on Linux), an obscure crypto ethernet card that's now so many years out of date nobody would use it, the old DPT controllers, which speak their own dialect and have their own driver - and ermm.. that's about it. We also know the code isn't in good shape, as recently a patch was proposed and queried as buggy, which in turn showed the existing code was already broken by a prior "clean up" and nobody had noticed that either. It's coding-style robot code, nothing more. Like some forgotten corridor cleaned relentlessly by a lost Roomba but where no user has trodden in years. Move it to staging and then to /dev/null. The headers remain as they are shared with dpt_i2o.
Signed-off-by: Alan Cox Signed-off-by: Greg Kroah-Hartman --- drivers/Kconfig | 2 - drivers/message/Makefile | 1 - drivers/message/i2o/Kconfig | 121 --- drivers/message/i2o/Makefile | 16 - drivers/message/i2o/README | 98 -- drivers/message/i2o/README.ioctl | 394 -------- drivers/message/i2o/bus-osm.c | 176 ---- drivers/message/i2o/config-osm.c | 90 -- drivers/message/i2o/core.h | 69 -- drivers/message/i2o/debug.c | 472 --------- drivers/message/i2o/device.c | 594 ----------- drivers/message/i2o/driver.c | 382 ------- drivers/message/i2o/exec-osm.c | 612 ------------ drivers/message/i2o/i2o_block.c | 1228 ----------------------- drivers/message/i2o/i2o_block.h | 103 -- drivers/message/i2o/i2o_config.c | 1163 ---------------------- drivers/message/i2o/i2o_proc.c | 2045 -------------------------------------- drivers/message/i2o/i2o_scsi.c | 814 --------------- drivers/message/i2o/iop.c | 1247 ----------------------- drivers/message/i2o/memory.c | 313 ------ drivers/message/i2o/pci.c | 497 --------- drivers/staging/Kconfig | 2 + drivers/staging/Makefile | 1 + drivers/staging/i2o/Kconfig | 120 +++ drivers/staging/i2o/Makefile | 16 + drivers/staging/i2o/README | 98 ++ drivers/staging/i2o/README.ioctl | 394 ++++++++ drivers/staging/i2o/bus-osm.c | 176 ++++ drivers/staging/i2o/config-osm.c | 90 ++ drivers/staging/i2o/core.h | 69 ++ drivers/staging/i2o/debug.c | 472 +++++++++ drivers/staging/i2o/device.c | 594 +++++++++++ drivers/staging/i2o/driver.c | 382 +++++++ drivers/staging/i2o/exec-osm.c | 612 ++++++++++++ drivers/staging/i2o/i2o.h | 988 ++++++++++++++++++ drivers/staging/i2o/i2o_block.c | 1228 +++++++++++++++++++++++ drivers/staging/i2o/i2o_block.h | 103 ++ drivers/staging/i2o/i2o_config.c | 1163 ++++++++++++++++++++++ drivers/staging/i2o/i2o_proc.c | 2045 ++++++++++++++++++++++++++++++++++++++ drivers/staging/i2o/i2o_scsi.c | 814 +++++++++++++++ drivers/staging/i2o/iop.c | 1247 +++++++++++++++++++++++ drivers/staging/i2o/memory.c | 313 ++++++ drivers/staging/i2o/pci.c | 497 +++++++++ include/linux/i2o.h | 988 ------------------ 44 files changed, 11424 insertions(+), 11425 deletions(-) delete mode 100644 drivers/message/i2o/Kconfig delete mode 100644 drivers/message/i2o/Makefile delete mode 100644 drivers/message/i2o/README delete mode 100644 drivers/message/i2o/README.ioctl delete mode 100644 drivers/message/i2o/bus-osm.c delete mode 100644 drivers/message/i2o/config-osm.c delete mode 100644 drivers/message/i2o/core.h delete mode 100644 drivers/message/i2o/debug.c delete mode 100644 drivers/message/i2o/device.c delete mode 100644 drivers/message/i2o/driver.c delete mode 100644 drivers/message/i2o/exec-osm.c delete mode 100644 drivers/message/i2o/i2o_block.c delete mode 100644 drivers/message/i2o/i2o_block.h delete mode 100644 drivers/message/i2o/i2o_config.c delete mode 100644 drivers/message/i2o/i2o_proc.c delete mode 100644 drivers/message/i2o/i2o_scsi.c delete mode 100644 drivers/message/i2o/iop.c delete mode 100644 drivers/message/i2o/memory.c delete mode 100644 drivers/message/i2o/pci.c create mode 100644 drivers/staging/i2o/Kconfig create mode 100644 drivers/staging/i2o/Makefile create mode 100644 drivers/staging/i2o/README create mode 100644 drivers/staging/i2o/README.ioctl create mode 100644 drivers/staging/i2o/bus-osm.c create mode 100644 drivers/staging/i2o/config-osm.c create mode 100644 drivers/staging/i2o/core.h create mode 100644 drivers/staging/i2o/debug.c create mode 100644 drivers/staging/i2o/device.c create mode 100644 drivers/staging/i2o/driver.c create mode 100644 
drivers/staging/i2o/exec-osm.c create mode 100644 drivers/staging/i2o/i2o.h create mode 100644 drivers/staging/i2o/i2o_block.c create mode 100644 drivers/staging/i2o/i2o_block.h create mode 100644 drivers/staging/i2o/i2o_config.c create mode 100644 drivers/staging/i2o/i2o_proc.c create mode 100644 drivers/staging/i2o/i2o_scsi.c create mode 100644 drivers/staging/i2o/iop.c create mode 100644 drivers/staging/i2o/memory.c create mode 100644 drivers/staging/i2o/pci.c delete mode 100644 include/linux/i2o.h (limited to 'include/linux') diff --git a/drivers/Kconfig b/drivers/Kconfig index c70d6e45dc10..c0cc96bab9e7 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -36,8 +36,6 @@ source "drivers/message/fusion/Kconfig" source "drivers/firewire/Kconfig" -source "drivers/message/i2o/Kconfig" - source "drivers/macintosh/Kconfig" source "drivers/net/Kconfig" diff --git a/drivers/message/Makefile b/drivers/message/Makefile index 97ef5a01ad11..755676ded67c 100644 --- a/drivers/message/Makefile +++ b/drivers/message/Makefile @@ -2,5 +2,4 @@ # Makefile for MPT based block devices # -obj-$(CONFIG_I2O) += i2o/ obj-$(CONFIG_FUSION) += fusion/ diff --git a/drivers/message/i2o/Kconfig b/drivers/message/i2o/Kconfig deleted file mode 100644 index 5afa0e393ecf..000000000000 --- a/drivers/message/i2o/Kconfig +++ /dev/null @@ -1,121 +0,0 @@ - -menuconfig I2O - tristate "I2O device support" - depends on PCI - ---help--- - The Intelligent Input/Output (I2O) architecture allows hardware - drivers to be split into two parts: an operating system specific - module called the OSM and an hardware specific module called the - HDM. The OSM can talk to a whole range of HDM's, and ideally the - HDM's are not OS dependent. This allows for the same HDM driver to - be used under different operating systems if the relevant OSM is in - place. In order for this to work, you need to have an I2O interface - adapter card in your computer. This card contains a special I/O - processor (IOP), thus allowing high speeds since the CPU does not - have to deal with I/O. - - If you say Y here, you will get a choice of interface adapter - drivers and OSM's with the following questions. - - To compile this support as a module, choose M here: the - modules will be called i2o_core. - - If unsure, say N. - -if I2O - -config I2O_LCT_NOTIFY_ON_CHANGES - bool "Enable LCT notification" - default y - ---help--- - Only say N here if you have a I2O controller from SUN. The SUN - firmware doesn't support LCT notification on changes. If this option - is enabled on such a controller the driver will hang up in a endless - loop. On all other controllers say Y. - - If unsure, say Y. - -config I2O_EXT_ADAPTEC - bool "Enable Adaptec extensions" - default y - ---help--- - Say Y for support of raidutils for Adaptec I2O controllers. You also - have to say Y to "I2O Configuration support", "I2O SCSI OSM" below - and to "SCSI generic support" under "SCSI device configuration". - -config I2O_EXT_ADAPTEC_DMA64 - bool "Enable 64-bit DMA" - depends on I2O_EXT_ADAPTEC && ( 64BIT || HIGHMEM64G ) - default y - ---help--- - Say Y for support of 64-bit DMA transfer mode on Adaptec I2O - controllers. - Note: You need at least firmware version 3709. - -config I2O_CONFIG - tristate "I2O Configuration support" - depends on VIRT_TO_BUS - ---help--- - Say Y for support of the configuration interface for the I2O adapters. - If you have a RAID controller from Adaptec and you want to use the - raidutils to manage your RAID array, you have to say Y here. 
- - To compile this support as a module, choose M here: the - module will be called i2o_config. - - Note: If you want to use the new API you have to download the - i2o_config patch from http://i2o.shadowconnect.com/ - -config I2O_CONFIG_OLD_IOCTL - bool "Enable ioctls (OBSOLETE)" - depends on I2O_CONFIG - default y - ---help--- - Enables old ioctls. - -config I2O_BUS - tristate "I2O Bus Adapter OSM" - ---help--- - Include support for the I2O Bus Adapter OSM. The Bus Adapter OSM - provides access to the busses on the I2O controller. The main purpose - is to rescan the bus to find new devices. - - To compile this support as a module, choose M here: the - module will be called i2o_bus. - -config I2O_BLOCK - tristate "I2O Block OSM" - depends on BLOCK - ---help--- - Include support for the I2O Block OSM. The Block OSM presents disk - and other structured block devices to the operating system. If you - are using an RAID controller, you could access the array only by - the Block OSM driver. But it is possible to access the single disks - by the SCSI OSM driver, for example to monitor the disks. - - To compile this support as a module, choose M here: the - module will be called i2o_block. - -config I2O_SCSI - tristate "I2O SCSI OSM" - depends on SCSI - ---help--- - Allows direct SCSI access to SCSI devices on a SCSI or FibreChannel - I2O controller. You can use both the SCSI and Block OSM together if - you wish. To access a RAID array, you must use the Block OSM driver. - But you could use the SCSI OSM driver to monitor the single disks. - - To compile this support as a module, choose M here: the - module will be called i2o_scsi. - -config I2O_PROC - tristate "I2O /proc support" - ---help--- - If you say Y here and to "/proc file system support", you will be - able to read I2O related information from the virtual directory - /proc/i2o. - - To compile this support as a module, choose M here: the - module will be called i2o_proc. - -endif # I2O diff --git a/drivers/message/i2o/Makefile b/drivers/message/i2o/Makefile deleted file mode 100644 index b0982dacfd0a..000000000000 --- a/drivers/message/i2o/Makefile +++ /dev/null @@ -1,16 +0,0 @@ -# -# Makefile for the kernel I2O OSM. -# -# Note : at this point, these files are compiled on all systems. -# In the future, some of these should be built conditionally. -# - -i2o_core-y += iop.o driver.o device.o debug.o pci.o exec-osm.o memory.o -i2o_bus-y += bus-osm.o -i2o_config-y += config-osm.o -obj-$(CONFIG_I2O) += i2o_core.o -obj-$(CONFIG_I2O_CONFIG)+= i2o_config.o -obj-$(CONFIG_I2O_BUS) += i2o_bus.o -obj-$(CONFIG_I2O_BLOCK) += i2o_block.o -obj-$(CONFIG_I2O_SCSI) += i2o_scsi.o -obj-$(CONFIG_I2O_PROC) += i2o_proc.o diff --git a/drivers/message/i2o/README b/drivers/message/i2o/README deleted file mode 100644 index f072a8eb3041..000000000000 --- a/drivers/message/i2o/README +++ /dev/null @@ -1,98 +0,0 @@ - - Linux I2O Support (c) Copyright 1999 Red Hat Software - and others. - - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License - as published by the Free Software Foundation; either version - 2 of the License, or (at your option) any later version. - -AUTHORS (so far) - -Alan Cox, Building Number Three Ltd. - Core code, SCSI and Block OSMs - -Steve Ralston, LSI Logic Corp. - Debugging SCSI and Block OSM - -Deepak Saxena, Intel Corp. 
- Various core/block extensions - /proc interface, bug fixes - Ioctl interfaces for control - Debugging LAN OSM - -Philip Rumpf - Fixed assorted dumb SMP locking bugs - -Juha Sievanen, University of Helsinki Finland - LAN OSM code - /proc interface to LAN class - Bug fixes - Core code extensions - -Auvo Häkkinen, University of Helsinki Finland - LAN OSM code - /Proc interface to LAN class - Bug fixes - Core code extensions - -Taneli Vähäkangas, University of Helsinki Finland - Fixes to i2o_config - -CREDITS - - This work was made possible by - -Red Hat Software - Funding for the Building #3 part of the project - -Symbios Logic (Now LSI) - Host adapters, hints, known to work platforms when I hit - compatibility problems - -BoxHill Corporation - Loan of initial FibreChannel disk array used for development work. - -European Commission - Funding the work done by the University of Helsinki - -SysKonnect - Loan of FDDI and Gigabit Ethernet cards - -ASUSTeK - Loan of I2O motherboard - -STATUS: - -o The core setup works within limits. -o The scsi layer seems to almost work. - I'm still chasing down the hang bug. -o The block OSM is mostly functional -o LAN OSM works with FDDI and Ethernet cards. - -TO DO: - -General: -o Provide hidden address space if asked -o Long term message flow control -o PCI IOP's without interrupts are not supported yet -o Push FAIL handling into the core -o DDM control interfaces for module load etc -o Add I2O 2.0 support (Deffered to 2.5 kernel) - -Block: -o Multiple major numbers -o Read ahead and cache handling stuff. Talk to Ingo and people -o Power management -o Finish Media changers - -SCSI: -o Find the right way to associate drives/luns/busses - -Lan: -o Performance tuning -o Test Fibre Channel code - -Tape: -o Anyone seen anything implementing this ? - (D.S: Will attempt to do so if spare cycles permit) diff --git a/drivers/message/i2o/README.ioctl b/drivers/message/i2o/README.ioctl deleted file mode 100644 index 4a7d2ebdfc97..000000000000 --- a/drivers/message/i2o/README.ioctl +++ /dev/null @@ -1,394 +0,0 @@ - -Linux I2O User Space Interface -rev 0.3 - 04/20/99 - -============================================================================= -Originally written by Deepak Saxena(deepak@plexity.net) -Currently maintained by Deepak Saxena(deepak@plexity.net) -============================================================================= - -I. Introduction - -The Linux I2O subsystem provides a set of ioctl() commands that can be -utilized by user space applications to communicate with IOPs and devices -on individual IOPs. This document defines the specific ioctl() commands -that are available to the user and provides examples of their uses. - -This document assumes the reader is familiar with or has access to the -I2O specification as no I2O message parameters are outlined. For information -on the specification, see http://www.i2osig.org - -This document and the I2O user space interface are currently maintained -by Deepak Saxena. Please send all comments, errata, and bug fixes to -deepak@csociety.purdue.edu - -II. IOP Access - -Access to the I2O subsystem is provided through the device file named -/dev/i2o/ctl. This file is a character file with major number 10 and minor -number 166. It can be created through the following command: - - mknod /dev/i2o/ctl c 10 166 - -III. Determining the IOP Count - - SYNOPSIS - - ioctl(fd, I2OGETIOPS, int *count); - - u8 count[MAX_I2O_CONTROLLERS]; - - DESCRIPTION - - This function returns the system's active IOP table. 
count should - point to a buffer containing MAX_I2O_CONTROLLERS entries. Upon - returning, each entry will contain a non-zero value if the given - IOP unit is active, and NULL if it is inactive or non-existent. - - RETURN VALUE. - - Returns 0 if no errors occur, and -1 otherwise. If an error occurs, - errno is set appropriately: - - EFAULT Invalid user space pointer was passed - -IV. Getting Hardware Resource Table - - SYNOPSIS - - ioctl(fd, I2OHRTGET, struct i2o_cmd_hrt *hrt); - - struct i2o_cmd_hrtlct - { - u32 iop; /* IOP unit number */ - void *resbuf; /* Buffer for result */ - u32 *reslen; /* Buffer length in bytes */ - }; - - DESCRIPTION - - This function returns the Hardware Resource Table of the IOP specified - by hrt->iop in the buffer pointed to by hrt->resbuf. The actual size of - the data is written into *(hrt->reslen). - - RETURNS - - This function returns 0 if no errors occur. If an error occurs, -1 - is returned and errno is set appropriately: - - EFAULT Invalid user space pointer was passed - ENXIO Invalid IOP number - ENOBUFS Buffer not large enough. If this occurs, the required - buffer length is written into *(hrt->reslen) - -V. Getting Logical Configuration Table - - SYNOPSIS - - ioctl(fd, I2OLCTGET, struct i2o_cmd_lct *lct); - - struct i2o_cmd_hrtlct - { - u32 iop; /* IOP unit number */ - void *resbuf; /* Buffer for result */ - u32 *reslen; /* Buffer length in bytes */ - }; - - DESCRIPTION - - This function returns the Logical Configuration Table of the IOP specified - by lct->iop in the buffer pointed to by lct->resbuf. The actual size of - the data is written into *(lct->reslen). - - RETURNS - - This function returns 0 if no errors occur. If an error occurs, -1 - is returned and errno is set appropriately: - - EFAULT Invalid user space pointer was passed - ENXIO Invalid IOP number - ENOBUFS Buffer not large enough. If this occurs, the required - buffer length is written into *(lct->reslen) - -VI. Setting Parameters - - SYNOPSIS - - ioctl(fd, I2OPARMSET, struct i2o_parm_setget *ops); - - struct i2o_cmd_psetget - { - u32 iop; /* IOP unit number */ - u32 tid; /* Target device TID */ - void *opbuf; /* Operation List buffer */ - u32 oplen; /* Operation List buffer length in bytes */ - void *resbuf; /* Result List buffer */ - u32 *reslen; /* Result List buffer length in bytes */ - }; - - DESCRIPTION - - This function posts a UtilParamsSet message to the device identified - by ops->iop and ops->tid. The operation list for the message is - sent through the ops->opbuf buffer, and the result list is written - into the buffer pointed to by ops->resbuf. The number of bytes - written is placed into *(ops->reslen). - - RETURNS - - The return value is the size in bytes of the data written into - ops->resbuf if no errors occur. If an error occurs, -1 is returned - and errno is set appropriately: - - EFAULT Invalid user space pointer was passed - ENXIO Invalid IOP number - ENOBUFS Buffer not large enough. If this occurs, the required - buffer length is written into *(ops->reslen) - ETIMEDOUT Timeout waiting for reply message - ENOMEM Kernel memory allocation error - - A return value of 0 does not mean that the value was actually - changed properly on the IOP. The user should check the result - list to determine the specific status of the transaction. - -VII. 
Getting Parameters - - SYNOPSIS - - ioctl(fd, I2OPARMGET, struct i2o_parm_setget *ops); - - struct i2o_parm_setget - { - u32 iop; /* IOP unit number */ - u32 tid; /* Target device TID */ - void *opbuf; /* Operation List buffer */ - u32 oplen; /* Operation List buffer length in bytes */ - void *resbuf; /* Result List buffer */ - u32 *reslen; /* Result List buffer length in bytes */ - }; - - DESCRIPTION - - This function posts a UtilParamsGet message to the device identified - by ops->iop and ops->tid. The operation list for the message is - sent through the ops->opbuf buffer, and the result list is written - into the buffer pointed to by ops->resbuf. The actual size of data - written is placed into *(ops->reslen). - - RETURNS - - EFAULT Invalid user space pointer was passed - ENXIO Invalid IOP number - ENOBUFS Buffer not large enough. If this occurs, the required - buffer length is written into *(ops->reslen) - ETIMEDOUT Timeout waiting for reply message - ENOMEM Kernel memory allocation error - - A return value of 0 does not mean that the value was actually - properly retrieved. The user should check the result list - to determine the specific status of the transaction. - -VIII. Downloading Software - - SYNOPSIS - - ioctl(fd, I2OSWDL, struct i2o_sw_xfer *sw); - - struct i2o_sw_xfer - { - u32 iop; /* IOP unit number */ - u8 flags; /* DownloadFlags field */ - u8 sw_type; /* Software type */ - u32 sw_id; /* Software ID */ - void *buf; /* Pointer to software buffer */ - u32 *swlen; /* Length of software buffer */ - u32 *maxfrag; /* Number of fragments */ - u32 *curfrag; /* Current fragment number */ - }; - - DESCRIPTION - - This function downloads a software fragment pointed by sw->buf - to the iop identified by sw->iop. The DownloadFlags, SwID, SwType - and SwSize fields of the ExecSwDownload message are filled in with - the values of sw->flags, sw->sw_id, sw->sw_type and *(sw->swlen). - - The fragments _must_ be sent in order and be 8K in size. The last - fragment _may_ be shorter, however. The kernel will compute its - size based on information in the sw->swlen field. - - Please note that SW transfers can take a long time. - - RETURNS - - This function returns 0 no errors occur. If an error occurs, -1 - is returned and errno is set appropriately: - - EFAULT Invalid user space pointer was passed - ENXIO Invalid IOP number - ETIMEDOUT Timeout waiting for reply message - ENOMEM Kernel memory allocation error - -IX. Uploading Software - - SYNOPSIS - - ioctl(fd, I2OSWUL, struct i2o_sw_xfer *sw); - - struct i2o_sw_xfer - { - u32 iop; /* IOP unit number */ - u8 flags; /* UploadFlags */ - u8 sw_type; /* Software type */ - u32 sw_id; /* Software ID */ - void *buf; /* Pointer to software buffer */ - u32 *swlen; /* Length of software buffer */ - u32 *maxfrag; /* Number of fragments */ - u32 *curfrag; /* Current fragment number */ - }; - - DESCRIPTION - - This function uploads a software fragment from the IOP identified - by sw->iop, sw->sw_type, sw->sw_id and optionally sw->swlen fields. - The UploadFlags, SwID, SwType and SwSize fields of the ExecSwUpload - message are filled in with the values of sw->flags, sw->sw_id, - sw->sw_type and *(sw->swlen). - - The fragments _must_ be requested in order and be 8K in size. The - user is responsible for allocating memory pointed by sw->buf. The - last fragment _may_ be shorter. - - Please note that SW transfers can take a long time. - - RETURNS - - This function returns 0 if no errors occur. 
If an error occurs, -1 - is returned and errno is set appropriately: - - EFAULT Invalid user space pointer was passed - ENXIO Invalid IOP number - ETIMEDOUT Timeout waiting for reply message - ENOMEM Kernel memory allocation error - -X. Removing Software - - SYNOPSIS - - ioctl(fd, I2OSWDEL, struct i2o_sw_xfer *sw); - - struct i2o_sw_xfer - { - u32 iop; /* IOP unit number */ - u8 flags; /* RemoveFlags */ - u8 sw_type; /* Software type */ - u32 sw_id; /* Software ID */ - void *buf; /* Unused */ - u32 *swlen; /* Length of the software data */ - u32 *maxfrag; /* Unused */ - u32 *curfrag; /* Unused */ - }; - - DESCRIPTION - - This function removes software from the IOP identified by sw->iop. - The RemoveFlags, SwID, SwType and SwSize fields of the ExecSwRemove message - are filled in with the values of sw->flags, sw->sw_id, sw->sw_type and - *(sw->swlen). Give zero in *(sw->len) if the value is unknown. IOP uses - *(sw->swlen) value to verify correct identication of the module to remove. - The actual size of the module is written into *(sw->swlen). - - RETURNS - - This function returns 0 if no errors occur. If an error occurs, -1 - is returned and errno is set appropriately: - - EFAULT Invalid user space pointer was passed - ENXIO Invalid IOP number - ETIMEDOUT Timeout waiting for reply message - ENOMEM Kernel memory allocation error - -X. Validating Configuration - - SYNOPSIS - - ioctl(fd, I2OVALIDATE, int *iop); - u32 iop; - - DESCRIPTION - - This function posts an ExecConfigValidate message to the controller - identified by iop. This message indicates that the current - configuration is accepted. The iop changes the status of suspect drivers - to valid and may delete old drivers from its store. - - RETURNS - - This function returns 0 if no erro occur. If an error occurs, -1 is - returned and errno is set appropriately: - - ETIMEDOUT Timeout waiting for reply message - ENXIO Invalid IOP number - -XI. Configuration Dialog - - SYNOPSIS - - ioctl(fd, I2OHTML, struct i2o_html *htquery); - struct i2o_html - { - u32 iop; /* IOP unit number */ - u32 tid; /* Target device ID */ - u32 page; /* HTML page */ - void *resbuf; /* Buffer for reply HTML page */ - u32 *reslen; /* Length in bytes of reply buffer */ - void *qbuf; /* Pointer to HTTP query string */ - u32 qlen; /* Length in bytes of query string buffer */ - }; - - DESCRIPTION - - This function posts an UtilConfigDialog message to the device identified - by htquery->iop and htquery->tid. The requested HTML page number is - provided by the htquery->page field, and the resultant data is stored - in the buffer pointed to by htquery->resbuf. If there is an HTTP query - string that is to be sent to the device, it should be sent in the buffer - pointed to by htquery->qbuf. If there is no query string, this field - should be set to NULL. The actual size of the reply received is written - into *(htquery->reslen). - - RETURNS - - This function returns 0 if no error occur. If an error occurs, -1 - is returned and errno is set appropriately: - - EFAULT Invalid user space pointer was passed - ENXIO Invalid IOP number - ENOBUFS Buffer not large enough. If this occurs, the required - buffer length is written into *(ops->reslen) - ETIMEDOUT Timeout waiting for reply message - ENOMEM Kernel memory allocation error - -XII. Events - - In the process of determining this. Current idea is to have use - the select() interface to allow user apps to periodically poll - the /dev/i2o/ctl device for events. 
-
-=============================================================================
-Revision History
-=============================================================================
-
-Rev 0.1 - 04/01/99
-- Initial revision
-
-Rev 0.2 - 04/06/99
-- Changed return values to match UNIX ioctl() standard. Only return values
-  are 0 and -1. All errors are reported through errno.
-- Added summary of proposed possible event interfaces
-
-Rev 0.3 - 04/20/99
-- Changed all ioctls() to use pointers to user data instead of actual data
-- Updated error values to match the code
diff --git a/drivers/message/i2o/bus-osm.c b/drivers/message/i2o/bus-osm.c
deleted file mode 100644
index c463dc2efc09..000000000000
--- a/drivers/message/i2o/bus-osm.c
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * Bus Adapter OSM
- *
- * Copyright (C) 2005 Markus Lidel <Markus.Lidel@shadowconnect.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2 of the License, or (at your
- * option) any later version.
- *
- * Fixes/additions:
- *	Markus Lidel <Markus.Lidel@shadowconnect.com>
- *		initial version.
- */
-
-#include <linux/module.h>
-#include <linux/i2o.h>
-
-#define OSM_NAME	"bus-osm"
-#define OSM_VERSION	"1.317"
-#define OSM_DESCRIPTION	"I2O Bus Adapter OSM"
-
-static struct i2o_driver i2o_bus_driver;
-
-/* Bus OSM class handling definition */
-static struct i2o_class_id i2o_bus_class_id[] = {
-	{I2O_CLASS_BUS_ADAPTER},
-	{I2O_CLASS_END}
-};
-
-/**
- * i2o_bus_scan - Scan the bus for new devices
- * @dev: I2O device of the bus, which should be scanned
- *
- * Scans the bus dev for new / removed devices. After the scan a new LCT
- * will be fetched automatically.
- *
- * Returns 0 on success or negative error code on failure.
- */
-static int i2o_bus_scan(struct i2o_device *dev)
-{
-	struct i2o_message *msg;
-
-	msg = i2o_msg_get_wait(dev->iop, I2O_TIMEOUT_MESSAGE_GET);
-	if (IS_ERR(msg))
-		return -ETIMEDOUT;
-
-	msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
-	msg->u.head[1] =
-	    cpu_to_le32(I2O_CMD_BUS_SCAN << 24 | HOST_TID << 12 | dev->lct_data.
-			tid);
-
-	return i2o_msg_post_wait(dev->iop, msg, 60);
-};
-
-/**
- * i2o_bus_store_scan - Scan the I2O Bus Adapter
- * @d: device which should be scanned
- * @attr: device_attribute
- * @buf: output buffer
- * @count: buffer size
- *
- * Returns count.
- */
-static ssize_t i2o_bus_store_scan(struct device *d,
-				  struct device_attribute *attr,
-				  const char *buf, size_t count)
-{
-	struct i2o_device *i2o_dev = to_i2o_device(d);
-	int rc;
-
-	if ((rc = i2o_bus_scan(i2o_dev)))
-		osm_warn("bus scan failed %d\n", rc);
-
-	return count;
-}
-
-/* Bus Adapter OSM device attributes */
-static DEVICE_ATTR(scan, S_IWUSR, NULL, i2o_bus_store_scan);
-
-/**
- * i2o_bus_probe - verify if dev is an I2O Bus Adapter device and install it
- * @dev: device to verify if it is an I2O Bus Adapter device
- *
- * Because we want all Bus Adapters, we always return 0.
- * Except when we fail. Then we are sad.
- *
- * Returns 0, except when we fail to excel.
- */ -static int i2o_bus_probe(struct device *dev) -{ - struct i2o_device *i2o_dev = to_i2o_device(get_device(dev)); - int rc; - - rc = device_create_file(dev, &dev_attr_scan); - if (rc) - goto err_out; - - osm_info("device added (TID: %03x)\n", i2o_dev->lct_data.tid); - - return 0; - -err_out: - put_device(dev); - return rc; -}; - -/** - * i2o_bus_remove - remove the I2O Bus Adapter device from the system again - * @dev: I2O Bus Adapter device which should be removed - * - * Always returns 0. - */ -static int i2o_bus_remove(struct device *dev) -{ - struct i2o_device *i2o_dev = to_i2o_device(dev); - - device_remove_file(dev, &dev_attr_scan); - - put_device(dev); - - osm_info("device removed (TID: %03x)\n", i2o_dev->lct_data.tid); - - return 0; -}; - -/* Bus Adapter OSM driver struct */ -static struct i2o_driver i2o_bus_driver = { - .name = OSM_NAME, - .classes = i2o_bus_class_id, - .driver = { - .probe = i2o_bus_probe, - .remove = i2o_bus_remove, - }, -}; - -/** - * i2o_bus_init - Bus Adapter OSM initialization function - * - * Only register the Bus Adapter OSM in the I2O core. - * - * Returns 0 on success or negative error code on failure. - */ -static int __init i2o_bus_init(void) -{ - int rc; - - printk(KERN_INFO OSM_DESCRIPTION " v" OSM_VERSION "\n"); - - /* Register Bus Adapter OSM into I2O core */ - rc = i2o_driver_register(&i2o_bus_driver); - if (rc) { - osm_err("Could not register Bus Adapter OSM\n"); - return rc; - } - - return 0; -}; - -/** - * i2o_bus_exit - Bus Adapter OSM exit function - * - * Unregisters Bus Adapter OSM from I2O core. - */ -static void __exit i2o_bus_exit(void) -{ - i2o_driver_unregister(&i2o_bus_driver); -}; - -MODULE_AUTHOR("Markus Lidel "); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION(OSM_DESCRIPTION); -MODULE_VERSION(OSM_VERSION); - -module_init(i2o_bus_init); -module_exit(i2o_bus_exit); diff --git a/drivers/message/i2o/config-osm.c b/drivers/message/i2o/config-osm.c deleted file mode 100644 index 3bba7aa82e58..000000000000 --- a/drivers/message/i2o/config-osm.c +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Configuration OSM - * - * Copyright (C) 2005 Markus Lidel - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * Fixes/additions: - * Markus Lidel - * initial version. - */ - -#include -#include -#include -#include -#include - -#include - -#define OSM_NAME "config-osm" -#define OSM_VERSION "1.323" -#define OSM_DESCRIPTION "I2O Configuration OSM" - -/* access mode user rw */ -#define S_IWRSR (S_IRUSR | S_IWUSR) - -static struct i2o_driver i2o_config_driver; - -/* Config OSM driver struct */ -static struct i2o_driver i2o_config_driver = { - .name = OSM_NAME, -}; - -#ifdef CONFIG_I2O_CONFIG_OLD_IOCTL -#include "i2o_config.c" -#endif - -/** - * i2o_config_init - Configuration OSM initialization function - * - * Registers Configuration OSM in the I2O core and if old ioctl's are - * compiled in initialize them. - * - * Returns 0 on success or negative error code on failure. 
- */ -static int __init i2o_config_init(void) -{ - printk(KERN_INFO OSM_DESCRIPTION " v" OSM_VERSION "\n"); - - if (i2o_driver_register(&i2o_config_driver)) { - osm_err("handler register failed.\n"); - return -EBUSY; - } -#ifdef CONFIG_I2O_CONFIG_OLD_IOCTL - if (i2o_config_old_init()) { - osm_err("old config handler initialization failed\n"); - i2o_driver_unregister(&i2o_config_driver); - return -EBUSY; - } -#endif - - return 0; -} - -/** - * i2o_config_exit - Configuration OSM exit function - * - * If old ioctl's are compiled in exit remove them and unregisters - * Configuration OSM from I2O core. - */ -static void i2o_config_exit(void) -{ -#ifdef CONFIG_I2O_CONFIG_OLD_IOCTL - i2o_config_old_exit(); -#endif - - i2o_driver_unregister(&i2o_config_driver); -} - -MODULE_AUTHOR("Markus Lidel "); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION(OSM_DESCRIPTION); -MODULE_VERSION(OSM_VERSION); - -module_init(i2o_config_init); -module_exit(i2o_config_exit); diff --git a/drivers/message/i2o/core.h b/drivers/message/i2o/core.h deleted file mode 100644 index 91614f11f89a..000000000000 --- a/drivers/message/i2o/core.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * I2O core internal declarations - * - * Copyright (C) 2005 Markus Lidel - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * Fixes/additions: - * Markus Lidel - * initial version. - */ - -/* Exec-OSM */ -extern struct i2o_driver i2o_exec_driver; -extern int i2o_exec_lct_get(struct i2o_controller *); - -extern int __init i2o_exec_init(void); -extern void i2o_exec_exit(void); - -/* driver */ -extern struct bus_type i2o_bus_type; - -extern int i2o_driver_dispatch(struct i2o_controller *, u32); - -extern int __init i2o_driver_init(void); -extern void i2o_driver_exit(void); - -/* PCI */ -extern int __init i2o_pci_init(void); -extern void __exit i2o_pci_exit(void); - -/* device */ -extern const struct attribute_group *i2o_device_groups[]; - -extern void i2o_device_remove(struct i2o_device *); -extern int i2o_device_parse_lct(struct i2o_controller *); - -int i2o_parm_issue(struct i2o_device *i2o_dev, int cmd, void *oplist, - int oplen, void *reslist, int reslen); - -/* IOP */ -extern struct i2o_controller *i2o_iop_alloc(void); - -/** - * i2o_iop_free - Free the i2o_controller struct - * @c: I2O controller to free - */ -static inline void i2o_iop_free(struct i2o_controller *c) -{ - i2o_pool_free(&c->in_msg); - kfree(c); -} - -extern int i2o_iop_add(struct i2o_controller *); -extern void i2o_iop_remove(struct i2o_controller *); - -/* control registers relative to c->base */ -#define I2O_IRQ_STATUS 0x30 -#define I2O_IRQ_MASK 0x34 -#define I2O_IN_PORT 0x40 -#define I2O_OUT_PORT 0x44 - -/* Motorola/Freescale specific register offset */ -#define I2O_MOTOROLA_PORT_OFFSET 0x10400 - -#define I2O_IRQ_OUTBOUND_POST 0x00000008 diff --git a/drivers/message/i2o/debug.c b/drivers/message/i2o/debug.c deleted file mode 100644 index ce62d8bfe1c8..000000000000 --- a/drivers/message/i2o/debug.c +++ /dev/null @@ -1,472 +0,0 @@ -#include -#include -#include -#include - -static void i2o_report_util_cmd(u8 cmd); -static void i2o_report_exec_cmd(u8 cmd); -static void i2o_report_fail_status(u8 req_status, u32 * msg); -static void i2o_report_common_status(u8 req_status); -static void i2o_report_common_dsc(u16 detailed_status); - -/* - * Used for error reporting/debugging 
purposes. - * Report Cmd name, Request status, Detailed Status. - */ -void i2o_report_status(const char *severity, const char *str, - struct i2o_message *m) -{ - u32 *msg = (u32 *) m; - u8 cmd = (msg[1] >> 24) & 0xFF; - u8 req_status = (msg[4] >> 24) & 0xFF; - u16 detailed_status = msg[4] & 0xFFFF; - - if (cmd == I2O_CMD_UTIL_EVT_REGISTER) - return; // No status in this reply - - printk("%s%s: ", severity, str); - - if (cmd < 0x1F) // Utility cmd - i2o_report_util_cmd(cmd); - - else if (cmd >= 0xA0 && cmd <= 0xEF) // Executive cmd - i2o_report_exec_cmd(cmd); - else - printk("Cmd = %0#2x, ", cmd); // Other cmds - - if (msg[0] & MSG_FAIL) { - i2o_report_fail_status(req_status, msg); - return; - } - - i2o_report_common_status(req_status); - - if (cmd < 0x1F || (cmd >= 0xA0 && cmd <= 0xEF)) - i2o_report_common_dsc(detailed_status); - else - printk(" / DetailedStatus = %0#4x.\n", - detailed_status); -} - -/* Used to dump a message to syslog during debugging */ -void i2o_dump_message(struct i2o_message *m) -{ -#ifdef DEBUG - u32 *msg = (u32 *) m; - int i; - printk(KERN_INFO "Dumping I2O message size %d @ %p\n", - msg[0] >> 16 & 0xffff, msg); - for (i = 0; i < ((msg[0] >> 16) & 0xffff); i++) - printk(KERN_INFO " msg[%d] = %0#10x\n", i, msg[i]); -#endif -} - -/* - * Used for error reporting/debugging purposes. - * Following fail status are common to all classes. - * The preserved message must be handled in the reply handler. - */ -static void i2o_report_fail_status(u8 req_status, u32 * msg) -{ - static char *FAIL_STATUS[] = { - "0x80", /* not used */ - "SERVICE_SUSPENDED", /* 0x81 */ - "SERVICE_TERMINATED", /* 0x82 */ - "CONGESTION", - "FAILURE", - "STATE_ERROR", - "TIME_OUT", - "ROUTING_FAILURE", - "INVALID_VERSION", - "INVALID_OFFSET", - "INVALID_MSG_FLAGS", - "FRAME_TOO_SMALL", - "FRAME_TOO_LARGE", - "INVALID_TARGET_ID", - "INVALID_INITIATOR_ID", - "INVALID_INITIATOR_CONTEX", /* 0x8F */ - "UNKNOWN_FAILURE" /* 0xFF */ - }; - - if (req_status == I2O_FSC_TRANSPORT_UNKNOWN_FAILURE) - printk("TRANSPORT_UNKNOWN_FAILURE (%0#2x).\n", - req_status); - else - printk("TRANSPORT_%s.\n", - FAIL_STATUS[req_status & 0x0F]); - - /* Dump some details */ - - printk(KERN_ERR " InitiatorId = %d, TargetId = %d\n", - (msg[1] >> 12) & 0xFFF, msg[1] & 0xFFF); - printk(KERN_ERR " LowestVersion = 0x%02X, HighestVersion = 0x%02X\n", - (msg[4] >> 8) & 0xFF, msg[4] & 0xFF); - printk(KERN_ERR " FailingHostUnit = 0x%04X, FailingIOP = 0x%03X\n", - msg[5] >> 16, msg[5] & 0xFFF); - - printk(KERN_ERR " Severity: 0x%02X\n", (msg[4] >> 16) & 0xFF); - if (msg[4] & (1 << 16)) - printk(KERN_DEBUG "(FormatError), " - "this msg can never be delivered/processed.\n"); - if (msg[4] & (1 << 17)) - printk(KERN_DEBUG "(PathError), " - "this msg can no longer be delivered/processed.\n"); - if (msg[4] & (1 << 18)) - printk(KERN_DEBUG "(PathState), " - "the system state does not allow delivery.\n"); - if (msg[4] & (1 << 19)) - printk(KERN_DEBUG - "(Congestion), resources temporarily not available;" - "do not retry immediately.\n"); -} - -/* - * Used for error reporting/debugging purposes. - * Following reply status are common to all classes. 
- */ -static void i2o_report_common_status(u8 req_status) -{ - static char *REPLY_STATUS[] = { - "SUCCESS", - "ABORT_DIRTY", - "ABORT_NO_DATA_TRANSFER", - "ABORT_PARTIAL_TRANSFER", - "ERROR_DIRTY", - "ERROR_NO_DATA_TRANSFER", - "ERROR_PARTIAL_TRANSFER", - "PROCESS_ABORT_DIRTY", - "PROCESS_ABORT_NO_DATA_TRANSFER", - "PROCESS_ABORT_PARTIAL_TRANSFER", - "TRANSACTION_ERROR", - "PROGRESS_REPORT" - }; - - if (req_status >= ARRAY_SIZE(REPLY_STATUS)) - printk("RequestStatus = %0#2x", req_status); - else - printk("%s", REPLY_STATUS[req_status]); -} - -/* - * Used for error reporting/debugging purposes. - * Following detailed status are valid for executive class, - * utility class, DDM class and for transaction error replies. - */ -static void i2o_report_common_dsc(u16 detailed_status) -{ - static char *COMMON_DSC[] = { - "SUCCESS", - "0x01", // not used - "BAD_KEY", - "TCL_ERROR", - "REPLY_BUFFER_FULL", - "NO_SUCH_PAGE", - "INSUFFICIENT_RESOURCE_SOFT", - "INSUFFICIENT_RESOURCE_HARD", - "0x08", // not used - "CHAIN_BUFFER_TOO_LARGE", - "UNSUPPORTED_FUNCTION", - "DEVICE_LOCKED", - "DEVICE_RESET", - "INAPPROPRIATE_FUNCTION", - "INVALID_INITIATOR_ADDRESS", - "INVALID_MESSAGE_FLAGS", - "INVALID_OFFSET", - "INVALID_PARAMETER", - "INVALID_REQUEST", - "INVALID_TARGET_ADDRESS", - "MESSAGE_TOO_LARGE", - "MESSAGE_TOO_SMALL", - "MISSING_PARAMETER", - "TIMEOUT", - "UNKNOWN_ERROR", - "UNKNOWN_FUNCTION", - "UNSUPPORTED_VERSION", - "DEVICE_BUSY", - "DEVICE_NOT_AVAILABLE" - }; - - if (detailed_status > I2O_DSC_DEVICE_NOT_AVAILABLE) - printk(" / DetailedStatus = %0#4x.\n", - detailed_status); - else - printk(" / %s.\n", COMMON_DSC[detailed_status]); -} - -/* - * Used for error reporting/debugging purposes - */ -static void i2o_report_util_cmd(u8 cmd) -{ - switch (cmd) { - case I2O_CMD_UTIL_NOP: - printk("UTIL_NOP, "); - break; - case I2O_CMD_UTIL_ABORT: - printk("UTIL_ABORT, "); - break; - case I2O_CMD_UTIL_CLAIM: - printk("UTIL_CLAIM, "); - break; - case I2O_CMD_UTIL_RELEASE: - printk("UTIL_CLAIM_RELEASE, "); - break; - case I2O_CMD_UTIL_CONFIG_DIALOG: - printk("UTIL_CONFIG_DIALOG, "); - break; - case I2O_CMD_UTIL_DEVICE_RESERVE: - printk("UTIL_DEVICE_RESERVE, "); - break; - case I2O_CMD_UTIL_DEVICE_RELEASE: - printk("UTIL_DEVICE_RELEASE, "); - break; - case I2O_CMD_UTIL_EVT_ACK: - printk("UTIL_EVENT_ACKNOWLEDGE, "); - break; - case I2O_CMD_UTIL_EVT_REGISTER: - printk("UTIL_EVENT_REGISTER, "); - break; - case I2O_CMD_UTIL_LOCK: - printk("UTIL_LOCK, "); - break; - case I2O_CMD_UTIL_LOCK_RELEASE: - printk("UTIL_LOCK_RELEASE, "); - break; - case I2O_CMD_UTIL_PARAMS_GET: - printk("UTIL_PARAMS_GET, "); - break; - case I2O_CMD_UTIL_PARAMS_SET: - printk("UTIL_PARAMS_SET, "); - break; - case I2O_CMD_UTIL_REPLY_FAULT_NOTIFY: - printk("UTIL_REPLY_FAULT_NOTIFY, "); - break; - default: - printk("Cmd = %0#2x, ", cmd); - } -} - -/* - * Used for error reporting/debugging purposes - */ -static void i2o_report_exec_cmd(u8 cmd) -{ - switch (cmd) { - case I2O_CMD_ADAPTER_ASSIGN: - printk("EXEC_ADAPTER_ASSIGN, "); - break; - case I2O_CMD_ADAPTER_READ: - printk("EXEC_ADAPTER_READ, "); - break; - case I2O_CMD_ADAPTER_RELEASE: - printk("EXEC_ADAPTER_RELEASE, "); - break; - case I2O_CMD_BIOS_INFO_SET: - printk("EXEC_BIOS_INFO_SET, "); - break; - case I2O_CMD_BOOT_DEVICE_SET: - printk("EXEC_BOOT_DEVICE_SET, "); - break; - case I2O_CMD_CONFIG_VALIDATE: - printk("EXEC_CONFIG_VALIDATE, "); - break; - case I2O_CMD_CONN_SETUP: - printk("EXEC_CONN_SETUP, "); - break; - case I2O_CMD_DDM_DESTROY: - printk("EXEC_DDM_DESTROY, "); - break; - case 
I2O_CMD_DDM_ENABLE: - printk("EXEC_DDM_ENABLE, "); - break; - case I2O_CMD_DDM_QUIESCE: - printk("EXEC_DDM_QUIESCE, "); - break; - case I2O_CMD_DDM_RESET: - printk("EXEC_DDM_RESET, "); - break; - case I2O_CMD_DDM_SUSPEND: - printk("EXEC_DDM_SUSPEND, "); - break; - case I2O_CMD_DEVICE_ASSIGN: - printk("EXEC_DEVICE_ASSIGN, "); - break; - case I2O_CMD_DEVICE_RELEASE: - printk("EXEC_DEVICE_RELEASE, "); - break; - case I2O_CMD_HRT_GET: - printk("EXEC_HRT_GET, "); - break; - case I2O_CMD_ADAPTER_CLEAR: - printk("EXEC_IOP_CLEAR, "); - break; - case I2O_CMD_ADAPTER_CONNECT: - printk("EXEC_IOP_CONNECT, "); - break; - case I2O_CMD_ADAPTER_RESET: - printk("EXEC_IOP_RESET, "); - break; - case I2O_CMD_LCT_NOTIFY: - printk("EXEC_LCT_NOTIFY, "); - break; - case I2O_CMD_OUTBOUND_INIT: - printk("EXEC_OUTBOUND_INIT, "); - break; - case I2O_CMD_PATH_ENABLE: - printk("EXEC_PATH_ENABLE, "); - break; - case I2O_CMD_PATH_QUIESCE: - printk("EXEC_PATH_QUIESCE, "); - break; - case I2O_CMD_PATH_RESET: - printk("EXEC_PATH_RESET, "); - break; - case I2O_CMD_STATIC_MF_CREATE: - printk("EXEC_STATIC_MF_CREATE, "); - break; - case I2O_CMD_STATIC_MF_RELEASE: - printk("EXEC_STATIC_MF_RELEASE, "); - break; - case I2O_CMD_STATUS_GET: - printk("EXEC_STATUS_GET, "); - break; - case I2O_CMD_SW_DOWNLOAD: - printk("EXEC_SW_DOWNLOAD, "); - break; - case I2O_CMD_SW_UPLOAD: - printk("EXEC_SW_UPLOAD, "); - break; - case I2O_CMD_SW_REMOVE: - printk("EXEC_SW_REMOVE, "); - break; - case I2O_CMD_SYS_ENABLE: - printk("EXEC_SYS_ENABLE, "); - break; - case I2O_CMD_SYS_MODIFY: - printk("EXEC_SYS_MODIFY, "); - break; - case I2O_CMD_SYS_QUIESCE: - printk("EXEC_SYS_QUIESCE, "); - break; - case I2O_CMD_SYS_TAB_SET: - printk("EXEC_SYS_TAB_SET, "); - break; - default: - printk("Cmd = %#02x, ", cmd); - } -} - -void i2o_debug_state(struct i2o_controller *c) -{ - printk(KERN_INFO "%s: State = ", c->name); - switch (((i2o_status_block *) c->status_block.virt)->iop_state) { - case 0x01: - printk("INIT\n"); - break; - case 0x02: - printk("RESET\n"); - break; - case 0x04: - printk("HOLD\n"); - break; - case 0x05: - printk("READY\n"); - break; - case 0x08: - printk("OPERATIONAL\n"); - break; - case 0x10: - printk("FAILED\n"); - break; - case 0x11: - printk("FAULTED\n"); - break; - default: - printk("%x (unknown !!)\n", - ((i2o_status_block *) c->status_block.virt)->iop_state); - } -}; - -void i2o_dump_hrt(struct i2o_controller *c) -{ - u32 *rows = (u32 *) c->hrt.virt; - u8 *p = (u8 *) c->hrt.virt; - u8 *d; - int count; - int length; - int i; - int state; - - if (p[3] != 0) { - printk(KERN_ERR - "%s: HRT table for controller is too new a version.\n", - c->name); - return; - } - - count = p[0] | (p[1] << 8); - length = p[2]; - - printk(KERN_INFO "%s: HRT has %d entries of %d bytes each.\n", - c->name, count, length << 2); - - rows += 2; - - for (i = 0; i < count; i++) { - printk(KERN_INFO "Adapter %08X: ", rows[0]); - p = (u8 *) (rows + 1); - d = (u8 *) (rows + 2); - state = p[1] << 8 | p[0]; - - printk("TID %04X:[", state & 0xFFF); - state >>= 12; - if (state & (1 << 0)) - printk("H"); /* Hidden */ - if (state & (1 << 2)) { - printk("P"); /* Present */ - if (state & (1 << 1)) - printk("C"); /* Controlled */ - } - if (state > 9) - printk("*"); /* Hard */ - - printk("]:"); - - switch (p[3] & 0xFFFF) { - case 0: - /* Adapter private bus - easy */ - printk("Local bus %d: I/O at 0x%04X Mem 0x%08X", p[2], - d[1] << 8 | d[0], *(u32 *) (d + 4)); - break; - case 1: - /* ISA bus */ - printk("ISA %d: CSN %d I/O at 0x%04X Mem 0x%08X", p[2], - d[2], d[1] << 8 | d[0], 
*(u32 *) (d + 4));
-			break;
-
-		case 2:	/* EISA bus */
-			printk("EISA %d: Slot %d I/O at 0x%04X Mem 0x%08X",
-			       p[2], d[3], d[1] << 8 | d[0], *(u32 *) (d + 4));
-			break;
-
-		case 3:	/* MCA bus */
-			printk("MCA %d: Slot %d I/O at 0x%04X Mem 0x%08X", p[2],
-			       d[3], d[1] << 8 | d[0], *(u32 *) (d + 4));
-			break;
-
-		case 4:	/* PCI bus */
-			printk("PCI %d: Bus %d Device %d Function %d", p[2],
-			       d[2], d[1], d[0]);
-			break;
-
-		case 0x80:	/* Other */
-		default:
-			printk("Unsupported bus type.");
-			break;
-		}
-		printk("\n");
-		rows += length;
-	}
-}
-
-EXPORT_SYMBOL(i2o_dump_message);
diff --git a/drivers/message/i2o/device.c b/drivers/message/i2o/device.c
deleted file mode 100644
index 98348f420b52..000000000000
--- a/drivers/message/i2o/device.c
+++ /dev/null
@@ -1,594 +0,0 @@
-/*
- * Functions to handle I2O devices
- *
- * Copyright (C) 2004 Markus Lidel <Markus.Lidel@shadowconnect.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2 of the License, or (at your
- * option) any later version.
- *
- * Fixes/additions:
- *	Markus Lidel <Markus.Lidel@shadowconnect.com>
- *		initial version.
- */
-
-#include <linux/module.h>
-#include <linux/delay.h>
-#include <linux/string.h>
-#include <linux/slab.h>
-#include <linux/i2o.h>
-#include "core.h"
-
-/**
- * i2o_device_issue_claim - claim or release a device
- * @dev: I2O device to claim or release
- * @cmd: claim or release command
- * @type: type of claim
- *
- * Issue I2O UTIL_CLAIM or UTIL_RELEASE messages. The message to be sent
- * is set by cmd. dev is the I2O device which should be claimed or
- * released and the type is the claim type (see the I2O spec).
- *
- * Returns 0 on success or negative error code on failure.
- */
-static inline int i2o_device_issue_claim(struct i2o_device *dev, u32 cmd,
-					 u32 type)
-{
-	struct i2o_message *msg;
-
-	msg = i2o_msg_get_wait(dev->iop, I2O_TIMEOUT_MESSAGE_GET);
-	if (IS_ERR(msg))
-		return PTR_ERR(msg);
-
-	msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
-	msg->u.head[1] =
-	    cpu_to_le32(cmd << 24 | HOST_TID << 12 | dev->lct_data.tid);
-	msg->body[0] = cpu_to_le32(type);
-
-	return i2o_msg_post_wait(dev->iop, msg, 60);
-}
-
-/**
- * i2o_device_claim - claim a device for use by an OSM
- * @dev: I2O device to claim
- *
- * Do the leg work to assign a device to a given OSM. If the claim succeeds,
- * the owner is the primary. If the attempt fails a negative errno code
- * is returned. On success zero is returned.
- */
-int i2o_device_claim(struct i2o_device *dev)
-{
-	int rc = 0;
-
-	mutex_lock(&dev->lock);
-
-	rc = i2o_device_issue_claim(dev, I2O_CMD_UTIL_CLAIM, I2O_CLAIM_PRIMARY);
-	if (!rc)
-		pr_debug("i2o: claim of device %d succeeded\n",
-			 dev->lct_data.tid);
-	else
-		pr_debug("i2o: claim of device %d failed %d\n",
-			 dev->lct_data.tid, rc);
-
-	mutex_unlock(&dev->lock);
-
-	return rc;
-}
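A typical consumer of the claim call above (and the release call below) is an OSM's probe/remove pair. The following is a hypothetical sketch, not part of this file: example_osm_probe() and example_osm_remove() are invented names, and osm_warn() assumes an OSM_NAME is defined, as in the OSM sources elsewhere in this patch.

	static int example_osm_probe(struct device *dev)
	{
		struct i2o_device *i2o_dev = to_i2o_device(dev);
		int rc;

		/* Become the primary user of the device before touching it. */
		rc = i2o_device_claim(i2o_dev);
		if (rc) {
			osm_warn("could not claim device %03x (%d)\n",
				 i2o_dev->lct_data.tid, rc);
			return rc;
		}

		/* ... per-device setup would go here ... */

		return 0;
	}

	static int example_osm_remove(struct device *dev)
	{
		/* Drop the claim again; this may poll for up to 10 seconds. */
		return i2o_device_claim_release(to_i2o_device(dev));
	}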
-/**
- * i2o_device_claim_release - release a device that the OSM is using
- * @dev: device to release
- *
- * Drop a claim by an OSM on a given I2O device.
- *
- * AC - some devices seem to want to refuse an unclaim until they have
- * finished internal processing. It makes sense since you don't want a
- * new device to go reconfiguring the entire system until you are done.
- * Thus we are prepared to wait briefly.
- *
- * Returns 0 on success or negative error code on failure.
- */
-int i2o_device_claim_release(struct i2o_device *dev)
-{
-	int tries;
-	int rc = 0;
-
-	mutex_lock(&dev->lock);
-
-	/*
-	 * If the controller takes a nonblocking approach to
-	 * releases we have to sleep/poll a few times.
-	 */
-	for (tries = 0; tries < 10; tries++) {
-		rc = i2o_device_issue_claim(dev, I2O_CMD_UTIL_RELEASE,
-					    I2O_CLAIM_PRIMARY);
-		if (!rc)
-			break;
-
-		ssleep(1);
-	}
-
-	if (!rc)
-		pr_debug("i2o: claim release of device %d succeeded\n",
-			 dev->lct_data.tid);
-	else
-		pr_debug("i2o: claim release of device %d failed %d\n",
-			 dev->lct_data.tid, rc);
-
-	mutex_unlock(&dev->lock);
-
-	return rc;
-}
-
-/**
- * i2o_device_release - release the memory for an I2O device
- * @dev: I2O device which should be released
- *
- * Release the allocated memory. This function is called automatically
- * when the refcount of the device reaches 0.
- */
-static void i2o_device_release(struct device *dev)
-{
-	struct i2o_device *i2o_dev = to_i2o_device(dev);
-
-	pr_debug("i2o: device %s released\n", dev_name(dev));
-
-	kfree(i2o_dev);
-}
-
-/**
- * class_id_show - Displays class id of I2O device
- * @dev: device of which the class id should be displayed
- * @attr: pointer to device attribute
- * @buf: buffer into which the class id should be printed
- *
- * Returns the number of bytes which are printed into the buffer.
- */
-static ssize_t class_id_show(struct device *dev, struct device_attribute *attr,
-			     char *buf)
-{
-	struct i2o_device *i2o_dev = to_i2o_device(dev);
-
-	sprintf(buf, "0x%03x\n", i2o_dev->lct_data.class_id);
-	return strlen(buf) + 1;
-}
-static DEVICE_ATTR_RO(class_id);
-
-/**
- * tid_show - Displays TID of I2O device
- * @dev: device of which the TID should be displayed
- * @attr: pointer to device attribute
- * @buf: buffer into which the TID should be printed
- *
- * Returns the number of bytes which are printed into the buffer.
- */
-static ssize_t tid_show(struct device *dev, struct device_attribute *attr,
-			char *buf)
-{
-	struct i2o_device *i2o_dev = to_i2o_device(dev);
-
-	sprintf(buf, "0x%03x\n", i2o_dev->lct_data.tid);
-	return strlen(buf) + 1;
-}
-static DEVICE_ATTR_RO(tid);
-
-/* I2O device attributes */
-static struct attribute *i2o_device_attrs[] = {
-	&dev_attr_class_id.attr,
-	&dev_attr_tid.attr,
-	NULL,
-};
-
-static const struct attribute_group i2o_device_group = {
-	.attrs = i2o_device_attrs,
-};
-
-const struct attribute_group *i2o_device_groups[] = {
-	&i2o_device_group,
-	NULL,
-};
-
-/**
- * i2o_device_alloc - Allocate an I2O device and initialize it
- *
- * Allocate the memory for an I2O device and initialize its locks and
- * lists.
- *
- * Returns the allocated I2O device or a negative error code if the device
- * could not be allocated.
- */
-static struct i2o_device *i2o_device_alloc(void)
-{
-	struct i2o_device *dev;
-
-	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
-	if (!dev)
-		return ERR_PTR(-ENOMEM);
-
-	INIT_LIST_HEAD(&dev->list);
-	mutex_init(&dev->lock);
-
-	dev->device.bus = &i2o_bus_type;
-	dev->device.release = &i2o_device_release;
-
-	return dev;
-}
-
-/**
- * i2o_device_add - allocate a new I2O device and add it to the IOP
- * @c: I2O controller that the device is on
- * @entry: LCT entry of the I2O device
- *
- * Allocate a new I2O device and initialize it with the LCT entry. The
- * device is appended to the device list of the controller.
- *
- * Returns zero on success, or a negative errno.
- */ -static int i2o_device_add(struct i2o_controller *c, i2o_lct_entry *entry) -{ - struct i2o_device *i2o_dev, *tmp; - int rc; - - i2o_dev = i2o_device_alloc(); - if (IS_ERR(i2o_dev)) { - printk(KERN_ERR "i2o: unable to allocate i2o device\n"); - return PTR_ERR(i2o_dev); - } - - i2o_dev->lct_data = *entry; - - dev_set_name(&i2o_dev->device, "%d:%03x", c->unit, - i2o_dev->lct_data.tid); - - i2o_dev->iop = c; - i2o_dev->device.parent = &c->device; - - rc = device_register(&i2o_dev->device); - if (rc) - goto err; - - list_add_tail(&i2o_dev->list, &c->devices); - - /* create user entries for this device */ - tmp = i2o_iop_find_device(i2o_dev->iop, i2o_dev->lct_data.user_tid); - if (tmp && (tmp != i2o_dev)) { - rc = sysfs_create_link(&i2o_dev->device.kobj, - &tmp->device.kobj, "user"); - if (rc) - goto unreg_dev; - } - - /* create user entries referring to this device */ - list_for_each_entry(tmp, &c->devices, list) - if ((tmp->lct_data.user_tid == i2o_dev->lct_data.tid) - && (tmp != i2o_dev)) { - rc = sysfs_create_link(&tmp->device.kobj, - &i2o_dev->device.kobj, "user"); - if (rc) - goto rmlink1; - } - - /* create parent entries for this device */ - tmp = i2o_iop_find_device(i2o_dev->iop, i2o_dev->lct_data.parent_tid); - if (tmp && (tmp != i2o_dev)) { - rc = sysfs_create_link(&i2o_dev->device.kobj, - &tmp->device.kobj, "parent"); - if (rc) - goto rmlink1; - } - - /* create parent entries referring to this device */ - list_for_each_entry(tmp, &c->devices, list) - if ((tmp->lct_data.parent_tid == i2o_dev->lct_data.tid) - && (tmp != i2o_dev)) { - rc = sysfs_create_link(&tmp->device.kobj, - &i2o_dev->device.kobj, "parent"); - if (rc) - goto rmlink2; - } - - i2o_driver_notify_device_add_all(i2o_dev); - - pr_debug("i2o: device %s added\n", dev_name(&i2o_dev->device)); - - return 0; - -rmlink2: - /* If link creating failed halfway, we loop whole list to cleanup. - * And we don't care wrong removing of link, because sysfs_remove_link - * will take care of it. - */ - list_for_each_entry(tmp, &c->devices, list) { - if (tmp->lct_data.parent_tid == i2o_dev->lct_data.tid) - sysfs_remove_link(&tmp->device.kobj, "parent"); - } - sysfs_remove_link(&i2o_dev->device.kobj, "parent"); -rmlink1: - list_for_each_entry(tmp, &c->devices, list) - if (tmp->lct_data.user_tid == i2o_dev->lct_data.tid) - sysfs_remove_link(&tmp->device.kobj, "user"); - sysfs_remove_link(&i2o_dev->device.kobj, "user"); -unreg_dev: - list_del(&i2o_dev->list); - device_unregister(&i2o_dev->device); -err: - kfree(i2o_dev); - return rc; -} - -/** - * i2o_device_remove - remove an I2O device from the I2O core - * @i2o_dev: I2O device which should be released - * - * Is used on I2O controller removal or LCT modification, when the device - * is removed from the system. Note that the device could still hang - * around until the refcount reaches 0. 
- */ -void i2o_device_remove(struct i2o_device *i2o_dev) -{ - struct i2o_device *tmp; - struct i2o_controller *c = i2o_dev->iop; - - i2o_driver_notify_device_remove_all(i2o_dev); - - sysfs_remove_link(&i2o_dev->device.kobj, "parent"); - sysfs_remove_link(&i2o_dev->device.kobj, "user"); - - list_for_each_entry(tmp, &c->devices, list) { - if (tmp->lct_data.parent_tid == i2o_dev->lct_data.tid) - sysfs_remove_link(&tmp->device.kobj, "parent"); - if (tmp->lct_data.user_tid == i2o_dev->lct_data.tid) - sysfs_remove_link(&tmp->device.kobj, "user"); - } - list_del(&i2o_dev->list); - - device_unregister(&i2o_dev->device); -} - -/** - * i2o_device_parse_lct - Parse a previously fetched LCT and create devices - * @c: I2O controller from which the LCT should be parsed. - * - * The Logical Configuration Table tells us what we can talk to on the - * board. For every entry we create an I2O device, which is registered in - * the I2O core. - * - * Returns 0 on success or negative error code on failure. - */ -int i2o_device_parse_lct(struct i2o_controller *c) -{ - struct i2o_device *dev, *tmp; - i2o_lct *lct; - u32 *dlct = c->dlct.virt; - int max = 0, i = 0; - u16 table_size; - u32 buf; - - mutex_lock(&c->lct_lock); - - kfree(c->lct); - - buf = le32_to_cpu(*dlct++); - table_size = buf & 0xffff; - - lct = c->lct = kmalloc(table_size * 4, GFP_KERNEL); - if (!lct) { - mutex_unlock(&c->lct_lock); - return -ENOMEM; - } - - lct->lct_ver = buf >> 28; - lct->boot_tid = buf >> 16 & 0xfff; - lct->table_size = table_size; - lct->change_ind = le32_to_cpu(*dlct++); - lct->iop_flags = le32_to_cpu(*dlct++); - - table_size -= 3; - - pr_debug("%s: LCT has %d entries (LCT size: %d)\n", c->name, max, - lct->table_size); - - while (table_size > 0) { - i2o_lct_entry *entry = &lct->lct_entry[max]; - int found = 0; - - buf = le32_to_cpu(*dlct++); - entry->entry_size = buf & 0xffff; - entry->tid = buf >> 16 & 0xfff; - - entry->change_ind = le32_to_cpu(*dlct++); - entry->device_flags = le32_to_cpu(*dlct++); - - buf = le32_to_cpu(*dlct++); - entry->class_id = buf & 0xfff; - entry->version = buf >> 12 & 0xf; - entry->vendor_id = buf >> 16; - - entry->sub_class = le32_to_cpu(*dlct++); - - buf = le32_to_cpu(*dlct++); - entry->user_tid = buf & 0xfff; - entry->parent_tid = buf >> 12 & 0xfff; - entry->bios_info = buf >> 24; - - memcpy(&entry->identity_tag, dlct, 8); - dlct += 2; - - entry->event_capabilities = le32_to_cpu(*dlct++); - - /* add new devices, which are new in the LCT */ - list_for_each_entry_safe(dev, tmp, &c->devices, list) { - if (entry->tid == dev->lct_data.tid) { - found = 1; - break; - } - } - - if (!found) - i2o_device_add(c, entry); - - table_size -= 9; - max++; - } - - /* remove devices, which are not in the LCT anymore */ - list_for_each_entry_safe(dev, tmp, &c->devices, list) { - int found = 0; - - for (i = 0; i < max; i++) { - if (lct->lct_entry[i].tid == dev->lct_data.tid) { - found = 1; - break; - } - } - - if (!found) - i2o_device_remove(dev); - } - - mutex_unlock(&c->lct_lock); - - return 0; -} - -/* - * Run time support routines - */ - -/* Issue UTIL_PARAMS_GET or UTIL_PARAMS_SET - * - * This function can be used for all UtilParamsGet/Set operations. - * The OperationList is given in oplist-buffer, - * and results are returned in reslist-buffer. - * Note that the minimum sized reslist is 8 bytes and contains - * ResultCount, ErrorInfoSize, BlockStatus and BlockSize. 
- */ -int i2o_parm_issue(struct i2o_device *i2o_dev, int cmd, void *oplist, - int oplen, void *reslist, int reslen) -{ - struct i2o_message *msg; - int i = 0; - int rc; - struct i2o_dma res; - struct i2o_controller *c = i2o_dev->iop; - struct device *dev = &c->pdev->dev; - - res.virt = NULL; - - if (i2o_dma_alloc(dev, &res, reslen)) - return -ENOMEM; - - msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET); - if (IS_ERR(msg)) { - i2o_dma_free(dev, &res); - return PTR_ERR(msg); - } - - i = 0; - msg->u.head[1] = - cpu_to_le32(cmd << 24 | HOST_TID << 12 | i2o_dev->lct_data.tid); - msg->body[i++] = cpu_to_le32(0x00000000); - msg->body[i++] = cpu_to_le32(0x4C000000 | oplen); /* OperationList */ - memcpy(&msg->body[i], oplist, oplen); - i += (oplen / 4 + (oplen % 4 ? 1 : 0)); - msg->body[i++] = cpu_to_le32(0xD0000000 | res.len); /* ResultList */ - msg->body[i++] = cpu_to_le32(res.phys); - - msg->u.head[0] = - cpu_to_le32(I2O_MESSAGE_SIZE(i + sizeof(struct i2o_message) / 4) | - SGL_OFFSET_5); - - rc = i2o_msg_post_wait_mem(c, msg, 10, &res); - - /* This only looks like a memory leak - don't "fix" it. */ - if (rc == -ETIMEDOUT) - return rc; - - memcpy(reslist, res.virt, res.len); - i2o_dma_free(dev, &res); - - return rc; -} - -/* - * Query one field group value or a whole scalar group. - */ -int i2o_parm_field_get(struct i2o_device *i2o_dev, int group, int field, - void *buf, int buflen) -{ - u32 opblk[] = { cpu_to_le32(0x00000001), - cpu_to_le32((u16) group << 16 | I2O_PARAMS_FIELD_GET), - cpu_to_le32((s16) field << 16 | 0x00000001) - }; - u8 *resblk; /* 8 bytes for header */ - int rc; - - resblk = kmalloc(buflen + 8, GFP_KERNEL); - if (!resblk) - return -ENOMEM; - - rc = i2o_parm_issue(i2o_dev, I2O_CMD_UTIL_PARAMS_GET, opblk, - sizeof(opblk), resblk, buflen + 8); - - memcpy(buf, resblk + 8, buflen); /* cut off header */ - - kfree(resblk); - - return rc; -} - -/* - * if oper == I2O_PARAMS_TABLE_GET, get from all rows - * if fieldcount == -1 return all fields - * ibuf and ibuflen are unused (use NULL, 0) - * else return specific fields - * ibuf contains fieldindexes - * - * if oper == I2O_PARAMS_LIST_GET, get from specific rows - * if fieldcount == -1 return all fields - * ibuf contains rowcount, keyvalues - * else return specific fields - * fieldcount is # of fieldindexes - * ibuf contains fieldindexes, rowcount, keyvalues - * - * You could also use directly function i2o_issue_params(). 
- */ -int i2o_parm_table_get(struct i2o_device *dev, int oper, int group, - int fieldcount, void *ibuf, int ibuflen, void *resblk, - int reslen) -{ - u16 *opblk; - int size; - - size = 10 + ibuflen; - if (size % 4) - size += 4 - size % 4; - - opblk = kmalloc(size, GFP_KERNEL); - if (opblk == NULL) { - printk(KERN_ERR "i2o: no memory for query buffer.\n"); - return -ENOMEM; - } - - opblk[0] = 1; /* operation count */ - opblk[1] = 0; /* pad */ - opblk[2] = oper; - opblk[3] = group; - opblk[4] = fieldcount; - memcpy(opblk + 5, ibuf, ibuflen); /* other params */ - - size = i2o_parm_issue(dev, I2O_CMD_UTIL_PARAMS_GET, opblk, - size, resblk, reslen); - - kfree(opblk); - if (size > reslen) - return reslen; - - return size; -} - -EXPORT_SYMBOL(i2o_device_claim); -EXPORT_SYMBOL(i2o_device_claim_release); -EXPORT_SYMBOL(i2o_parm_field_get); -EXPORT_SYMBOL(i2o_parm_table_get); -EXPORT_SYMBOL(i2o_parm_issue); diff --git a/drivers/message/i2o/driver.c b/drivers/message/i2o/driver.c deleted file mode 100644 index 1b18a0d1d05b..000000000000 --- a/drivers/message/i2o/driver.c +++ /dev/null @@ -1,382 +0,0 @@ -/* - * Functions to handle I2O drivers (OSMs) and I2O bus type for sysfs - * - * Copyright (C) 2004 Markus Lidel - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * Fixes/additions: - * Markus Lidel - * initial version. - */ - -#include -#include -#include -#include -#include -#include -#include -#include "core.h" - -#define OSM_NAME "i2o" - -/* max_drivers - Maximum I2O drivers (OSMs) which could be registered */ -static unsigned int i2o_max_drivers = I2O_MAX_DRIVERS; -module_param_named(max_drivers, i2o_max_drivers, uint, 0); -MODULE_PARM_DESC(max_drivers, "maximum number of OSM's to support"); - -/* I2O drivers lock and array */ -static spinlock_t i2o_drivers_lock; -static struct i2o_driver **i2o_drivers; - -/** - * i2o_bus_match - Tell if I2O device class id matches the class ids of the I2O driver (OSM) - * @dev: device which should be verified - * @drv: the driver to match against - * - * Used by the bus to check if the driver wants to handle the device. - * - * Returns 1 if the class ids of the driver match the class id of the - * device, otherwise 0. - */ -static int i2o_bus_match(struct device *dev, struct device_driver *drv) -{ - struct i2o_device *i2o_dev = to_i2o_device(dev); - struct i2o_driver *i2o_drv = to_i2o_driver(drv); - struct i2o_class_id *ids = i2o_drv->classes; - - if (ids) - while (ids->class_id != I2O_CLASS_END) { - if (ids->class_id == i2o_dev->lct_data.class_id) - return 1; - ids++; - } - return 0; -}; - -/* I2O bus type */ -struct bus_type i2o_bus_type = { - .name = "i2o", - .match = i2o_bus_match, - .dev_groups = i2o_device_groups, -}; - -/** - * i2o_driver_register - Register a I2O driver (OSM) in the I2O core - * @drv: I2O driver which should be registered - * - * Registers the OSM drv in the I2O core and creates an event queues if - * necessary. - * - * Returns 0 on success or negative error code on failure. 
- */ -int i2o_driver_register(struct i2o_driver *drv) -{ - struct i2o_controller *c; - int i; - int rc = 0; - unsigned long flags; - - osm_debug("Register driver %s\n", drv->name); - - if (drv->event) { - drv->event_queue = alloc_workqueue("%s", WQ_MEM_RECLAIM, 1, - drv->name); - if (!drv->event_queue) { - osm_err("Could not initialize event queue for driver " - "%s\n", drv->name); - return -EFAULT; - } - osm_debug("Event queue initialized for driver %s\n", drv->name); - } else - drv->event_queue = NULL; - - drv->driver.name = drv->name; - drv->driver.bus = &i2o_bus_type; - - spin_lock_irqsave(&i2o_drivers_lock, flags); - - for (i = 0; i2o_drivers[i]; i++) - if (i >= i2o_max_drivers) { - osm_err("too many drivers registered, increase " - "max_drivers\n"); - spin_unlock_irqrestore(&i2o_drivers_lock, flags); - rc = -EFAULT; - goto out; - } - - drv->context = i; - i2o_drivers[i] = drv; - - spin_unlock_irqrestore(&i2o_drivers_lock, flags); - - osm_debug("driver %s gets context id %d\n", drv->name, drv->context); - - list_for_each_entry(c, &i2o_controllers, list) { - struct i2o_device *i2o_dev; - - i2o_driver_notify_controller_add(drv, c); - list_for_each_entry(i2o_dev, &c->devices, list) - i2o_driver_notify_device_add(drv, i2o_dev); - } - - rc = driver_register(&drv->driver); - if (rc) - goto out; - - return 0; -out: - if (drv->event_queue) { - destroy_workqueue(drv->event_queue); - drv->event_queue = NULL; - } - - return rc; -}; - -/** - * i2o_driver_unregister - Unregister a I2O driver (OSM) from the I2O core - * @drv: I2O driver which should be unregistered - * - * Unregisters the OSM drv from the I2O core and cleanup event queues if - * necessary. - */ -void i2o_driver_unregister(struct i2o_driver *drv) -{ - struct i2o_controller *c; - unsigned long flags; - - osm_debug("unregister driver %s\n", drv->name); - - driver_unregister(&drv->driver); - - list_for_each_entry(c, &i2o_controllers, list) { - struct i2o_device *i2o_dev; - - list_for_each_entry(i2o_dev, &c->devices, list) - i2o_driver_notify_device_remove(drv, i2o_dev); - - i2o_driver_notify_controller_remove(drv, c); - } - - spin_lock_irqsave(&i2o_drivers_lock, flags); - i2o_drivers[drv->context] = NULL; - spin_unlock_irqrestore(&i2o_drivers_lock, flags); - - if (drv->event_queue) { - destroy_workqueue(drv->event_queue); - drv->event_queue = NULL; - osm_debug("event queue removed for %s\n", drv->name); - } -}; - -/** - * i2o_driver_dispatch - dispatch an I2O reply message - * @c: I2O controller of the message - * @m: I2O message number - * - * The reply is delivered to the driver from which the original message - * was. This function is only called from interrupt context. - * - * Returns 0 on success and the message should not be flushed. Returns > 0 - * on success and if the message should be flushed afterwords. Returns - * negative error code on failure (the message will be flushed too). 
- */ -int i2o_driver_dispatch(struct i2o_controller *c, u32 m) -{ - struct i2o_driver *drv; - struct i2o_message *msg = i2o_msg_out_to_virt(c, m); - u32 context = le32_to_cpu(msg->u.s.icntxt); - unsigned long flags; - - if (unlikely(context >= i2o_max_drivers)) { - osm_warn("%s: Spurious reply to unknown driver %d\n", c->name, - context); - return -EIO; - } - - spin_lock_irqsave(&i2o_drivers_lock, flags); - drv = i2o_drivers[context]; - spin_unlock_irqrestore(&i2o_drivers_lock, flags); - - if (unlikely(!drv)) { - osm_warn("%s: Spurious reply to unknown driver %d\n", c->name, - context); - return -EIO; - } - - if ((le32_to_cpu(msg->u.head[1]) >> 24) == I2O_CMD_UTIL_EVT_REGISTER) { - struct i2o_device *dev, *tmp; - struct i2o_event *evt; - u16 size; - u16 tid = le32_to_cpu(msg->u.head[1]) & 0xfff; - - osm_debug("event received from device %d\n", tid); - - if (!drv->event) - return -EIO; - - /* cut of header from message size (in 32-bit words) */ - size = (le32_to_cpu(msg->u.head[0]) >> 16) - 5; - - evt = kzalloc(size * 4 + sizeof(*evt), GFP_ATOMIC); - if (!evt) - return -ENOMEM; - - evt->size = size; - evt->tcntxt = le32_to_cpu(msg->u.s.tcntxt); - evt->event_indicator = le32_to_cpu(msg->body[0]); - memcpy(&evt->data, &msg->body[1], size * 4); - - list_for_each_entry_safe(dev, tmp, &c->devices, list) - if (dev->lct_data.tid == tid) { - evt->i2o_dev = dev; - break; - } - - INIT_WORK(&evt->work, drv->event); - queue_work(drv->event_queue, &evt->work); - return 1; - } - - if (unlikely(!drv->reply)) { - osm_debug("%s: Reply to driver %s, but no reply function" - " defined!\n", c->name, drv->name); - return -EIO; - } - - return drv->reply(c, m, msg); -} - -/** - * i2o_driver_notify_controller_add_all - Send notify of added controller - * @c: newly added controller - * - * Send notifications to all registered drivers that a new controller was - * added. - */ -void i2o_driver_notify_controller_add_all(struct i2o_controller *c) -{ - int i; - struct i2o_driver *drv; - - for (i = 0; i < i2o_max_drivers; i++) { - drv = i2o_drivers[i]; - - if (drv) - i2o_driver_notify_controller_add(drv, c); - } -} - -/** - * i2o_driver_notify_controller_remove_all - Send notify of removed controller - * @c: controller that is being removed - * - * Send notifications to all registered drivers that a controller was - * removed. - */ -void i2o_driver_notify_controller_remove_all(struct i2o_controller *c) -{ - int i; - struct i2o_driver *drv; - - for (i = 0; i < i2o_max_drivers; i++) { - drv = i2o_drivers[i]; - - if (drv) - i2o_driver_notify_controller_remove(drv, c); - } -} - -/** - * i2o_driver_notify_device_add_all - Send notify of added device - * @i2o_dev: newly added I2O device - * - * Send notifications to all registered drivers that a device was added. - */ -void i2o_driver_notify_device_add_all(struct i2o_device *i2o_dev) -{ - int i; - struct i2o_driver *drv; - - for (i = 0; i < i2o_max_drivers; i++) { - drv = i2o_drivers[i]; - - if (drv) - i2o_driver_notify_device_add(drv, i2o_dev); - } -} - -/** - * i2o_driver_notify_device_remove_all - Send notify of removed device - * @i2o_dev: device that is being removed - * - * Send notifications to all registered drivers that a device was removed. 
- */ -void i2o_driver_notify_device_remove_all(struct i2o_device *i2o_dev) -{ - int i; - struct i2o_driver *drv; - - for (i = 0; i < i2o_max_drivers; i++) { - drv = i2o_drivers[i]; - - if (drv) - i2o_driver_notify_device_remove(drv, i2o_dev); - } -} - -/** - * i2o_driver_init - initialize I2O drivers (OSMs) - * - * Registers the I2O bus and allocate memory for the array of OSMs. - * - * Returns 0 on success or negative error code on failure. - */ -int __init i2o_driver_init(void) -{ - int rc = 0; - - spin_lock_init(&i2o_drivers_lock); - - if ((i2o_max_drivers < 2) || (i2o_max_drivers > 64)) { - osm_warn("max_drivers set to %d, but must be >=2 and <= 64\n", - i2o_max_drivers); - i2o_max_drivers = I2O_MAX_DRIVERS; - } - osm_info("max drivers = %d\n", i2o_max_drivers); - - i2o_drivers = - kcalloc(i2o_max_drivers, sizeof(*i2o_drivers), GFP_KERNEL); - if (!i2o_drivers) - return -ENOMEM; - - rc = bus_register(&i2o_bus_type); - - if (rc < 0) - kfree(i2o_drivers); - - return rc; -}; - -/** - * i2o_driver_exit - clean up I2O drivers (OSMs) - * - * Unregisters the I2O bus and frees driver array. - */ -void i2o_driver_exit(void) -{ - bus_unregister(&i2o_bus_type); - kfree(i2o_drivers); -}; - -EXPORT_SYMBOL(i2o_driver_register); -EXPORT_SYMBOL(i2o_driver_unregister); -EXPORT_SYMBOL(i2o_driver_notify_controller_add_all); -EXPORT_SYMBOL(i2o_driver_notify_controller_remove_all); -EXPORT_SYMBOL(i2o_driver_notify_device_add_all); -EXPORT_SYMBOL(i2o_driver_notify_device_remove_all); diff --git a/drivers/message/i2o/exec-osm.c b/drivers/message/i2o/exec-osm.c deleted file mode 100644 index a3970e56ae53..000000000000 --- a/drivers/message/i2o/exec-osm.c +++ /dev/null @@ -1,612 +0,0 @@ -/* - * Executive OSM - * - * Copyright (C) 1999-2002 Red Hat Software - * - * Written by Alan Cox, Building Number Three Ltd - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * A lot of the I2O message side code from this is taken from the Red - * Creek RCPCI45 adapter driver by Red Creek Communications - * - * Fixes/additions: - * Philipp Rumpf - * Juha Sievänen - * Auvo Häkkinen - * Deepak Saxena - * Boji T Kannanthanam - * Alan Cox : - * Ported to Linux 2.5. - * Markus Lidel : - * Minor fixes for 2.6. - * Markus Lidel : - * Support for sysfs included. 
- */
-
-#include <linux/module.h>
-#include <linux/i2o.h>
-#include <linux/delay.h>
-#include <linux/workqueue.h>
-#include <linux/string.h>
-#include <linux/slab.h>
-#include <linux/sched.h>	/* wait_event_interruptible_timeout() needs this */
-#include <asm/param.h>		/* HZ */
-#include "core.h"
-
-#define OSM_NAME "exec-osm"
-
-struct i2o_driver i2o_exec_driver;
-
-/* global wait list for POST WAIT */
-static LIST_HEAD(i2o_exec_wait_list);
-
-/* Wait struct needed for POST WAIT */
-struct i2o_exec_wait {
-	wait_queue_head_t *wq;		/* Pointer to Wait queue */
-	struct i2o_dma dma;		/* DMA buffers to free on failure */
-	u32 tcntxt;			/* transaction context from reply */
-	int complete;			/* 1 if reply received otherwise 0 */
-	u32 m;				/* message id */
-	struct i2o_message *msg;	/* pointer to the reply message */
-	struct list_head list;		/* node in global wait list */
-	spinlock_t lock;		/* lock before modifying */
-};
-
-/* Work struct needed to handle LCT NOTIFY replies */
-struct i2o_exec_lct_notify_work {
-	struct work_struct work;	/* work struct */
-	struct i2o_controller *c;	/* controller on which the LCT NOTIFY
-					   was received */
-};
-
-/* Exec OSM class handling definition */
-static struct i2o_class_id i2o_exec_class_id[] = {
-	{I2O_CLASS_EXECUTIVE},
-	{I2O_CLASS_END}
-};
-
-/**
- * i2o_exec_wait_alloc - Allocate an i2o_exec_wait struct and initialize it
- *
- * Allocate the i2o_exec_wait struct and initialize the wait.
- *
- * Returns a pointer to the i2o_exec_wait struct on success or NULL on
- * failure.
- */
-static struct i2o_exec_wait *i2o_exec_wait_alloc(void)
-{
-	struct i2o_exec_wait *wait;
-
-	wait = kzalloc(sizeof(*wait), GFP_KERNEL);
-	if (!wait)
-		return NULL;
-
-	INIT_LIST_HEAD(&wait->list);
-	spin_lock_init(&wait->lock);
-
-	return wait;
-};
-
-/**
- * i2o_exec_wait_free - Free an i2o_exec_wait struct
- * @wait: I2O wait data which should be cleaned up
- */
-static void i2o_exec_wait_free(struct i2o_exec_wait *wait)
-{
-	kfree(wait);
-};
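The usual entry point into the machinery below is i2o_msg_post_wait(), the i2o.h wrapper that calls i2o_msg_post_wait_mem() with no DMA buffer attached. As a sketch of a caller (example_post_wait() is a hypothetical name; the message layout mirrors i2o_device_issue_claim() from device.c earlier in this patch):

	static int example_post_wait(struct i2o_device *dev)
	{
		struct i2o_controller *c = dev->iop;
		struct i2o_message *msg;

		msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
		if (IS_ERR(msg))
			return PTR_ERR(msg);

		msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
		msg->u.head[1] = cpu_to_le32(I2O_CMD_UTIL_CLAIM << 24 |
					     HOST_TID << 12 | dev->lct_data.tid);
		msg->body[0] = cpu_to_le32(I2O_CLAIM_PRIMARY);

		/* Sleeps up to 60 seconds; a positive return value is the
		 * request status byte from the reply (body[0] >> 24 below). */
		return i2o_msg_post_wait(c, msg, 60);
	}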
-/**
- * i2o_msg_post_wait_mem - Post and wait a message with DMA buffers
- * @c: controller
- * @msg: message to post
- * @timeout: time in seconds to wait
- * @dma: i2o_dma struct of the DMA buffer to free on failure
- *
- * This API allows an OSM to post a message and then be told whether or
- * not the system received a successful reply. If the message times out
- * then the value '-ETIMEDOUT' is returned. This is a special case. In
- * this situation the message may (should) complete at an indefinite time
- * in the future. When it completes it will use the memory buffer
- * attached to the request. If -ETIMEDOUT is returned then the memory
- * buffer must not be freed. Instead the event completion will free them
- * for you. In all other cases the buffers are your problem.
- *
- * Returns 0 on success, negative error code on timeout or positive error
- * code from reply.
- */
-int i2o_msg_post_wait_mem(struct i2o_controller *c, struct i2o_message *msg,
-			  unsigned long timeout, struct i2o_dma *dma)
-{
-	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
-	struct i2o_exec_wait *wait;
-	static u32 tcntxt = 0x80000000;
-	unsigned long flags;
-	int rc = 0;
-
-	wait = i2o_exec_wait_alloc();
-	if (!wait) {
-		i2o_msg_nop(c, msg);
-		return -ENOMEM;
-	}
-
-	if (tcntxt == 0xffffffff)
-		tcntxt = 0x80000000;
-
-	if (dma)
-		wait->dma = *dma;
-
-	/*
-	 * Fill in the message initiator context and transaction context.
-	 * We will only use transaction contexts >= 0x80000000 for POST WAIT,
-	 * so we can find a POST WAIT reply more easily in the reply handler.
-	 */
-	msg->u.s.icntxt = cpu_to_le32(i2o_exec_driver.context);
-	wait->tcntxt = tcntxt++;
-	msg->u.s.tcntxt = cpu_to_le32(wait->tcntxt);
-
-	wait->wq = &wq;
-	/*
-	 * We add elements to the head, because if an entry in the list is
-	 * never removed, we would otherwise have to iterate over it every
-	 * time.
-	 */
-	list_add(&wait->list, &i2o_exec_wait_list);
-
-	/*
-	 * Post the message to the controller. At some point later it will
-	 * return. If we time out before it returns then complete will be zero.
-	 */
-	i2o_msg_post(c, msg);
-
-	wait_event_interruptible_timeout(wq, wait->complete, timeout * HZ);
-
-	spin_lock_irqsave(&wait->lock, flags);
-
-	wait->wq = NULL;
-
-	if (wait->complete)
-		rc = le32_to_cpu(wait->msg->body[0]) >> 24;
-	else {
-		/*
-		 * We cannot remove it now. This is important. When it does
-		 * terminate (which it must do if the controller has not
-		 * died...) then it will otherwise scribble on stuff.
-		 *
-		 * FIXME: try abort message
-		 */
-		if (dma)
-			dma->virt = NULL;
-
-		rc = -ETIMEDOUT;
-	}
-
-	spin_unlock_irqrestore(&wait->lock, flags);
-
-	if (rc != -ETIMEDOUT) {
-		i2o_flush_reply(c, wait->m);
-		i2o_exec_wait_free(wait);
-	}
-
-	return rc;
-};
-
-/**
- * i2o_msg_post_wait_complete - Reply to an i2o_msg_post request from IOP
- * @c: I2O controller which answers
- * @m: message id
- * @msg: pointer to the I2O reply message
- * @context: transaction context of request
- *
- * This function is called in interrupt context only. If the reply reached
- * us before the timeout, the i2o_exec_wait struct is filled with the
- * message and the task will be woken up. The task is now responsible for
- * returning the message m back to the controller! If the message reaches
- * us after the timeout, clean up the i2o_exec_wait struct (including the
- * allocated DMA buffer).
- *
- * Return 0 on success and if the message m should not be given back to the
- * I2O controller, or >0 on success and if the message should be given back
- * afterwards. Returns negative error code on failure. In this case the
- * message must also be given back to the controller.
- */
-static int i2o_msg_post_wait_complete(struct i2o_controller *c, u32 m,
-				      struct i2o_message *msg, u32 context)
-{
-	struct i2o_exec_wait *wait, *tmp;
-	unsigned long flags;
-	int rc = 1;
-
-	/*
-	 * We need to search through the i2o_exec_wait_list to see if the given
-	 * message is still outstanding. If not, it means that the IOP took
-	 * longer to respond to the message than we had allowed and the timer
-	 * has already expired. Not much we can do about that except log it for
-	 * debug purposes, increase the timeout, and recompile.
- */ - list_for_each_entry_safe(wait, tmp, &i2o_exec_wait_list, list) { - if (wait->tcntxt == context) { - spin_lock_irqsave(&wait->lock, flags); - - list_del(&wait->list); - - wait->m = m; - wait->msg = msg; - wait->complete = 1; - - if (wait->wq) - rc = 0; - else - rc = -1; - - spin_unlock_irqrestore(&wait->lock, flags); - - if (rc) { - struct device *dev; - - dev = &c->pdev->dev; - - pr_debug("%s: timedout reply received!\n", - c->name); - i2o_dma_free(dev, &wait->dma); - i2o_exec_wait_free(wait); - } else - wake_up_interruptible(wait->wq); - - return rc; - } - } - - osm_warn("%s: Bogus reply in POST WAIT (tr-context: %08x)!\n", c->name, - context); - - return -1; -}; - -/** - * i2o_exec_show_vendor_id - Displays Vendor ID of controller - * @d: device of which the Vendor ID should be displayed - * @attr: device_attribute to display - * @buf: buffer into which the Vendor ID should be printed - * - * Returns number of bytes printed into buffer. - */ -static ssize_t i2o_exec_show_vendor_id(struct device *d, - struct device_attribute *attr, char *buf) -{ - struct i2o_device *dev = to_i2o_device(d); - u16 id; - - if (!i2o_parm_field_get(dev, 0x0000, 0, &id, 2)) { - sprintf(buf, "0x%04x", le16_to_cpu(id)); - return strlen(buf) + 1; - } - - return 0; -}; - -/** - * i2o_exec_show_product_id - Displays Product ID of controller - * @d: device of which the Product ID should be displayed - * @attr: device_attribute to display - * @buf: buffer into which the Product ID should be printed - * - * Returns number of bytes printed into buffer. - */ -static ssize_t i2o_exec_show_product_id(struct device *d, - struct device_attribute *attr, - char *buf) -{ - struct i2o_device *dev = to_i2o_device(d); - u16 id; - - if (!i2o_parm_field_get(dev, 0x0000, 1, &id, 2)) { - sprintf(buf, "0x%04x", le16_to_cpu(id)); - return strlen(buf) + 1; - } - - return 0; -}; - -/* Exec-OSM device attributes */ -static DEVICE_ATTR(vendor_id, S_IRUGO, i2o_exec_show_vendor_id, NULL); -static DEVICE_ATTR(product_id, S_IRUGO, i2o_exec_show_product_id, NULL); - -/** - * i2o_exec_probe - Called if a new I2O device (executive class) appears - * @dev: I2O device which should be probed - * - * Registers event notification for every event from Executive device. The - * return is always 0, because we want all devices of class Executive. - * - * Returns 0 on success. - */ -static int i2o_exec_probe(struct device *dev) -{ - struct i2o_device *i2o_dev = to_i2o_device(dev); - int rc; - - rc = i2o_event_register(i2o_dev, &i2o_exec_driver, 0, 0xffffffff); - if (rc) goto err_out; - - rc = device_create_file(dev, &dev_attr_vendor_id); - if (rc) goto err_evtreg; - rc = device_create_file(dev, &dev_attr_product_id); - if (rc) goto err_vid; - - i2o_dev->iop->exec = i2o_dev; - - return 0; - -err_vid: - device_remove_file(dev, &dev_attr_vendor_id); -err_evtreg: - i2o_event_register(to_i2o_device(dev), &i2o_exec_driver, 0, 0); -err_out: - return rc; -}; - -/** - * i2o_exec_remove - Called on I2O device removal - * @dev: I2O device which was removed - * - * Unregisters event notification from Executive I2O device. - * - * Returns 0 on success. 
- */
-static int i2o_exec_remove(struct device *dev)
-{
-	device_remove_file(dev, &dev_attr_product_id);
-	device_remove_file(dev, &dev_attr_vendor_id);
-
-	i2o_event_register(to_i2o_device(dev), &i2o_exec_driver, 0, 0);
-
-	return 0;
-};
-
-#ifdef CONFIG_I2O_LCT_NOTIFY_ON_CHANGES
-/**
- * i2o_exec_lct_notify - Send an asynchronous LCT NOTIFY request
- * @c: I2O controller to which the request should be sent
- * @change_ind: change indicator
- *
- * This function sends an LCT NOTIFY request to the I2O controller with
- * the change indicator change_ind. If change_ind == 0 the controller
- * replies immediately after the request. If change_ind > 0 the reply is
- * sent after the change indicator of the LCT is > change_ind.
- */
-static int i2o_exec_lct_notify(struct i2o_controller *c, u32 change_ind)
-{
-	i2o_status_block *sb = c->status_block.virt;
-	struct device *dev;
-	struct i2o_message *msg;
-
-	mutex_lock(&c->lct_lock);
-
-	dev = &c->pdev->dev;
-
-	if (i2o_dma_realloc(dev, &c->dlct,
-			    le32_to_cpu(sb->expected_lct_size))) {
-		mutex_unlock(&c->lct_lock);
-		return -ENOMEM;
-	}
-
-	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
-	if (IS_ERR(msg)) {
-		mutex_unlock(&c->lct_lock);
-		return PTR_ERR(msg);
-	}
-
-	msg->u.head[0] = cpu_to_le32(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_6);
-	msg->u.head[1] = cpu_to_le32(I2O_CMD_LCT_NOTIFY << 24 | HOST_TID << 12 |
-				     ADAPTER_TID);
-	msg->u.s.icntxt = cpu_to_le32(i2o_exec_driver.context);
-	msg->u.s.tcntxt = cpu_to_le32(0x00000000);
-	msg->body[0] = cpu_to_le32(0xffffffff);
-	msg->body[1] = cpu_to_le32(change_ind);
-	msg->body[2] = cpu_to_le32(0xd0000000 | c->dlct.len);
-	msg->body[3] = cpu_to_le32(c->dlct.phys);
-
-	i2o_msg_post(c, msg);
-
-	mutex_unlock(&c->lct_lock);
-
-	return 0;
-}
-#endif
-
-/**
- * i2o_exec_lct_modified - Called on LCT NOTIFY reply
- * @_work: work struct for a specific controller
- *
- * This function handles asynchronous LCT NOTIFY replies. It parses the
- * new LCT and, if the buffer for the LCT was too small, sends an LCT
- * NOTIFY again; otherwise it sends an LCT NOTIFY to get informed on the
- * next LCT change.
- */
-static void i2o_exec_lct_modified(struct work_struct *_work)
-{
-	struct i2o_exec_lct_notify_work *work =
-	    container_of(_work, struct i2o_exec_lct_notify_work, work);
-	u32 change_ind = 0;
-	struct i2o_controller *c = work->c;
-
-	kfree(work);
-
-	if (i2o_device_parse_lct(c) != -EAGAIN)
-		change_ind = c->lct->change_ind + 1;
-
-#ifdef CONFIG_I2O_LCT_NOTIFY_ON_CHANGES
-	i2o_exec_lct_notify(c, change_ind);
-#endif
-};
-
-/**
- * i2o_exec_reply - I2O Executive reply handler
- * @c: I2O controller from which the reply comes
- * @m: message id
- * @msg: pointer to the I2O reply message
- *
- * This function is always called from interrupt context. If a POST WAIT
- * reply was received, pass it to the complete function. If an LCT NOTIFY
- * reply was received, a new event is created to handle the update.
- *
- * Returns 0 on success and if the reply should not be flushed, or > 0
- * on success and if the reply should be flushed. Returns negative error
- * code on failure and if the reply should be flushed.
- */
-static int i2o_exec_reply(struct i2o_controller *c, u32 m,
-			  struct i2o_message *msg)
-{
-	u32 context;
-
-	if (le32_to_cpu(msg->u.head[0]) & MSG_FAIL) {
-		struct i2o_message __iomem *pmsg;
-		u32 pm;
-
-		/*
-		 * If the Fail bit is set we must take the transaction context
-		 * of the preserved message to find the right request again.
- */ - - pm = le32_to_cpu(msg->body[3]); - pmsg = i2o_msg_in_to_virt(c, pm); - context = readl(&pmsg->u.s.tcntxt); - - i2o_report_status(KERN_INFO, "i2o_core", msg); - - /* Release the preserved msg */ - i2o_msg_nop_mfa(c, pm); - } else - context = le32_to_cpu(msg->u.s.tcntxt); - - if (context & 0x80000000) - return i2o_msg_post_wait_complete(c, m, msg, context); - - if ((le32_to_cpu(msg->u.head[1]) >> 24) == I2O_CMD_LCT_NOTIFY) { - struct i2o_exec_lct_notify_work *work; - - pr_debug("%s: LCT notify received\n", c->name); - - work = kmalloc(sizeof(*work), GFP_ATOMIC); - if (!work) - return -ENOMEM; - - work->c = c; - - INIT_WORK(&work->work, i2o_exec_lct_modified); - queue_work(i2o_exec_driver.event_queue, &work->work); - return 1; - } - - /* - * If this happens, we want to dump the message to the syslog so - * it can be sent back to the card manufacturer by the end user - * to aid in debugging. - * - */ - printk(KERN_WARNING "%s: Unsolicited message reply sent to core!" - "Message dumped to syslog\n", c->name); - i2o_dump_message(msg); - - return -EFAULT; -} - -/** - * i2o_exec_event - Event handling function - * @work: Work item in occurring event - * - * Handles events send by the Executive device. At the moment does not do - * anything useful. - */ -static void i2o_exec_event(struct work_struct *work) -{ - struct i2o_event *evt = container_of(work, struct i2o_event, work); - - if (likely(evt->i2o_dev)) - osm_debug("Event received from device: %d\n", - evt->i2o_dev->lct_data.tid); - kfree(evt); -}; - -/** - * i2o_exec_lct_get - Get the IOP's Logical Configuration Table - * @c: I2O controller from which the LCT should be fetched - * - * Send a LCT NOTIFY request to the controller, and wait - * I2O_TIMEOUT_LCT_GET seconds until arrival of response. If the LCT is - * to large, retry it. - * - * Returns 0 on success or negative error code on failure. - */ -int i2o_exec_lct_get(struct i2o_controller *c) -{ - struct i2o_message *msg; - int i = 0; - int rc = -EAGAIN; - - for (i = 1; i <= I2O_LCT_GET_TRIES; i++) { - msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET); - if (IS_ERR(msg)) - return PTR_ERR(msg); - - msg->u.head[0] = - cpu_to_le32(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_6); - msg->u.head[1] = - cpu_to_le32(I2O_CMD_LCT_NOTIFY << 24 | HOST_TID << 12 | - ADAPTER_TID); - msg->body[0] = cpu_to_le32(0xffffffff); - msg->body[1] = cpu_to_le32(0x00000000); - msg->body[2] = cpu_to_le32(0xd0000000 | c->dlct.len); - msg->body[3] = cpu_to_le32(c->dlct.phys); - - rc = i2o_msg_post_wait(c, msg, I2O_TIMEOUT_LCT_GET); - if (rc < 0) - break; - - rc = i2o_device_parse_lct(c); - if (rc != -EAGAIN) - break; - } - - return rc; -} - -/* Exec OSM driver struct */ -struct i2o_driver i2o_exec_driver = { - .name = OSM_NAME, - .reply = i2o_exec_reply, - .event = i2o_exec_event, - .classes = i2o_exec_class_id, - .driver = { - .probe = i2o_exec_probe, - .remove = i2o_exec_remove, - }, -}; - -/** - * i2o_exec_init - Registers the Exec OSM - * - * Registers the Exec OSM in the I2O core. - * - * Returns 0 on success or negative error code on failure. - */ -int __init i2o_exec_init(void) -{ - return i2o_driver_register(&i2o_exec_driver); -}; - -/** - * i2o_exec_exit - Removes the Exec OSM - * - * Unregisters the Exec OSM from the I2O core. 
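
i2o_exec_reply() dispatches purely on bit 31 of the transaction context: POST WAIT requests are issued with the top bit set, and everything else falls through to the event/LCT path. A compile-and-run illustration of that routing, assuming only the bit convention visible in the code above:

        #include <stdint.h>
        #include <stdio.h>

        /* POST WAIT contexts carry bit 31; event/LCT contexts do not. */
        static const char *route_reply(uint32_t tcntxt)
        {
                if (tcntxt & 0x80000000)
                        return "post-wait completion";
                return "event / LCT notify path";
        }

        int main(void)
        {
                printf("%08x -> %s\n", 0x80000001u, route_reply(0x80000001u));
                printf("%08x -> %s\n", 0x00000000u, route_reply(0x00000000u));
                return 0;
        }
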
- */ -void i2o_exec_exit(void) -{ - i2o_driver_unregister(&i2o_exec_driver); -}; - -EXPORT_SYMBOL(i2o_msg_post_wait_mem); -EXPORT_SYMBOL(i2o_exec_lct_get); diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c deleted file mode 100644 index 6fc3866965df..000000000000 --- a/drivers/message/i2o/i2o_block.c +++ /dev/null @@ -1,1228 +0,0 @@ -/* - * Block OSM - * - * Copyright (C) 1999-2002 Red Hat Software - * - * Written by Alan Cox, Building Number Three Ltd - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * For the purpose of avoiding doubt the preferred form of the work - * for making modifications shall be a standards compliant form such - * gzipped tar and not one requiring a proprietary or patent encumbered - * tool to unpack. - * - * Fixes/additions: - * Steve Ralston: - * Multiple device handling error fixes, - * Added a queue depth. - * Alan Cox: - * FC920 has an rmw bug. Dont or in the end marker. - * Removed queue walk, fixed for 64bitness. - * Rewrote much of the code over time - * Added indirect block lists - * Handle 64K limits on many controllers - * Don't use indirects on the Promise (breaks) - * Heavily chop down the queue depths - * Deepak Saxena: - * Independent queues per IOP - * Support for dynamic device creation/deletion - * Code cleanup - * Support for larger I/Os through merge* functions - * (taken from DAC960 driver) - * Boji T Kannanthanam: - * Set the I2O Block devices to be detected in increasing - * order of TIDs during boot. - * Search and set the I2O block device that we boot off - * from as the first device to be claimed (as /dev/i2o/hda) - * Properly attach/detach I2O gendisk structure from the - * system gendisk list. The I2O block devices now appear in - * /proc/partitions. - * Markus Lidel : - * Minor bugfixes for 2.6. - */ - -#include -#include -#include -#include - -#include - -#include -#include -#include - -#include - -#include "i2o_block.h" - -#define OSM_NAME "block-osm" -#define OSM_VERSION "1.325" -#define OSM_DESCRIPTION "I2O Block Device OSM" - -static DEFINE_MUTEX(i2o_block_mutex); -static struct i2o_driver i2o_block_driver; - -/* global Block OSM request mempool */ -static struct i2o_block_mempool i2o_blk_req_pool; - -/* Block OSM class handling definition */ -static struct i2o_class_id i2o_block_class_id[] = { - {I2O_CLASS_RANDOM_BLOCK_STORAGE}, - {I2O_CLASS_END} -}; - -/** - * i2o_block_device_free - free the memory of the I2O Block device - * @dev: I2O Block device, which should be cleaned up - * - * Frees the request queue, gendisk and the i2o_block_device structure. - */ -static void i2o_block_device_free(struct i2o_block_device *dev) -{ - blk_cleanup_queue(dev->gd->queue); - - put_disk(dev->gd); - - kfree(dev); -}; - -/** - * i2o_block_remove - remove the I2O Block device from the system again - * @dev: I2O Block device which should be removed - * - * Remove gendisk from system and free all allocated memory. - * - * Always returns 0. 
- */ -static int i2o_block_remove(struct device *dev) -{ - struct i2o_device *i2o_dev = to_i2o_device(dev); - struct i2o_block_device *i2o_blk_dev = dev_get_drvdata(dev); - - osm_info("device removed (TID: %03x): %s\n", i2o_dev->lct_data.tid, - i2o_blk_dev->gd->disk_name); - - i2o_event_register(i2o_dev, &i2o_block_driver, 0, 0); - - del_gendisk(i2o_blk_dev->gd); - - dev_set_drvdata(dev, NULL); - - i2o_device_claim_release(i2o_dev); - - i2o_block_device_free(i2o_blk_dev); - - return 0; -}; - -/** - * i2o_block_device flush - Flush all dirty data of I2O device dev - * @dev: I2O device which should be flushed - * - * Flushes all dirty data on device dev. - * - * Returns 0 on success or negative error code on failure. - */ -static int i2o_block_device_flush(struct i2o_device *dev) -{ - struct i2o_message *msg; - - msg = i2o_msg_get_wait(dev->iop, I2O_TIMEOUT_MESSAGE_GET); - if (IS_ERR(msg)) - return PTR_ERR(msg); - - msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0); - msg->u.head[1] = - cpu_to_le32(I2O_CMD_BLOCK_CFLUSH << 24 | HOST_TID << 12 | dev-> - lct_data.tid); - msg->body[0] = cpu_to_le32(60 << 16); - osm_debug("Flushing...\n"); - - return i2o_msg_post_wait(dev->iop, msg, 60); -}; - -/** - * i2o_block_device_mount - Mount (load) the media of device dev - * @dev: I2O device which should receive the mount request - * @media_id: Media Identifier - * - * Load a media into drive. Identifier should be set to -1, because the - * spec does not support any other value. - * - * Returns 0 on success or negative error code on failure. - */ -static int i2o_block_device_mount(struct i2o_device *dev, u32 media_id) -{ - struct i2o_message *msg; - - msg = i2o_msg_get_wait(dev->iop, I2O_TIMEOUT_MESSAGE_GET); - if (IS_ERR(msg)) - return PTR_ERR(msg); - - msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0); - msg->u.head[1] = - cpu_to_le32(I2O_CMD_BLOCK_MMOUNT << 24 | HOST_TID << 12 | dev-> - lct_data.tid); - msg->body[0] = cpu_to_le32(-1); - msg->body[1] = cpu_to_le32(0x00000000); - osm_debug("Mounting...\n"); - - return i2o_msg_post_wait(dev->iop, msg, 2); -}; - -/** - * i2o_block_device_lock - Locks the media of device dev - * @dev: I2O device which should receive the lock request - * @media_id: Media Identifier - * - * Lock media of device dev to prevent removal. The media identifier - * should be set to -1, because the spec does not support any other value. - * - * Returns 0 on success or negative error code on failure. - */ -static int i2o_block_device_lock(struct i2o_device *dev, u32 media_id) -{ - struct i2o_message *msg; - - msg = i2o_msg_get_wait(dev->iop, I2O_TIMEOUT_MESSAGE_GET); - if (IS_ERR(msg)) - return PTR_ERR(msg); - - msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0); - msg->u.head[1] = - cpu_to_le32(I2O_CMD_BLOCK_MLOCK << 24 | HOST_TID << 12 | dev-> - lct_data.tid); - msg->body[0] = cpu_to_le32(-1); - osm_debug("Locking...\n"); - - return i2o_msg_post_wait(dev->iop, msg, 2); -}; - -/** - * i2o_block_device_unlock - Unlocks the media of device dev - * @dev: I2O device which should receive the unlocked request - * @media_id: Media Identifier - * - * Unlocks the media in device dev. The media identifier should be set to - * -1, because the spec does not support any other value. - * - * Returns 0 on success or negative error code on failure. 
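
All the message builders in this file pack header word 1 the same way: function code in the top byte, initiator TID in bits 23..12, target TID in the low 12 bits. A standalone sketch of that packing and unpacking; the function-code value below is a placeholder, not the real I2O_CMD_BLOCK_MMOUNT encoding, and HOST_TID = 1 is an assumption taken from the usual i2o headers:

        #include <stdint.h>
        #include <stdio.h>

        #define HOST_TID 1u     /* assumed host TID, as in the i2o headers */

        /* bits 31..24 = function, 23..12 = initiator TID, 11..0 = target TID */
        static uint32_t pack_head1(uint32_t fn, uint32_t target_tid)
        {
                return fn << 24 | HOST_TID << 12 | (target_tid & 0xfff);
        }

        int main(void)
        {
                /* 0x41 is a made-up function code for illustration only */
                uint32_t w = pack_head1(0x41, 0x023);

                printf("head[1] = %08x\n", (unsigned)w);
                printf("fn = %02x, host tid = %03x, target tid = %03x\n",
                       (unsigned)(w >> 24), (unsigned)((w >> 12) & 0xfff),
                       (unsigned)(w & 0xfff));
                return 0;
        }
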
- */ -static int i2o_block_device_unlock(struct i2o_device *dev, u32 media_id) -{ - struct i2o_message *msg; - - msg = i2o_msg_get_wait(dev->iop, I2O_TIMEOUT_MESSAGE_GET); - if (IS_ERR(msg)) - return PTR_ERR(msg); - - msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0); - msg->u.head[1] = - cpu_to_le32(I2O_CMD_BLOCK_MUNLOCK << 24 | HOST_TID << 12 | dev-> - lct_data.tid); - msg->body[0] = cpu_to_le32(media_id); - osm_debug("Unlocking...\n"); - - return i2o_msg_post_wait(dev->iop, msg, 2); -}; - -/** - * i2o_block_device_power - Power management for device dev - * @dev: I2O device which should receive the power management request - * @op: Operation to send - * - * Send a power management request to the device dev. - * - * Returns 0 on success or negative error code on failure. - */ -static int i2o_block_device_power(struct i2o_block_device *dev, u8 op) -{ - struct i2o_device *i2o_dev = dev->i2o_dev; - struct i2o_controller *c = i2o_dev->iop; - struct i2o_message *msg; - int rc; - - msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET); - if (IS_ERR(msg)) - return PTR_ERR(msg); - - msg->u.head[0] = cpu_to_le32(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0); - msg->u.head[1] = - cpu_to_le32(I2O_CMD_BLOCK_POWER << 24 | HOST_TID << 12 | i2o_dev-> - lct_data.tid); - msg->body[0] = cpu_to_le32(op << 24); - osm_debug("Power...\n"); - - rc = i2o_msg_post_wait(c, msg, 60); - if (!rc) - dev->power = op; - - return rc; -}; - -/** - * i2o_block_request_alloc - Allocate an I2O block request struct - * - * Allocates an I2O block request struct and initialize the list. - * - * Returns a i2o_block_request pointer on success or negative error code - * on failure. - */ -static inline struct i2o_block_request *i2o_block_request_alloc(void) -{ - struct i2o_block_request *ireq; - - ireq = mempool_alloc(i2o_blk_req_pool.pool, GFP_ATOMIC); - if (!ireq) - return ERR_PTR(-ENOMEM); - - INIT_LIST_HEAD(&ireq->queue); - sg_init_table(ireq->sg_table, I2O_MAX_PHYS_SEGMENTS); - - return ireq; -}; - -/** - * i2o_block_request_free - Frees a I2O block request - * @ireq: I2O block request which should be freed - * - * Frees the allocated memory (give it back to the request mempool). - */ -static inline void i2o_block_request_free(struct i2o_block_request *ireq) -{ - mempool_free(ireq, i2o_blk_req_pool.pool); -}; - -/** - * i2o_block_sglist_alloc - Allocate the SG list and map it - * @c: I2O controller to which the request belongs - * @ireq: I2O block request - * @mptr: message body pointer - * - * Builds the SG list and map it to be accessible by the controller. - * - * Returns 0 on failure or 1 on success. - */ -static inline int i2o_block_sglist_alloc(struct i2o_controller *c, - struct i2o_block_request *ireq, - u32 ** mptr) -{ - int nents; - enum dma_data_direction direction; - - ireq->dev = &c->pdev->dev; - nents = blk_rq_map_sg(ireq->req->q, ireq->req, ireq->sg_table); - - if (rq_data_dir(ireq->req) == READ) - direction = PCI_DMA_FROMDEVICE; - else - direction = PCI_DMA_TODEVICE; - - ireq->sg_nents = nents; - - return i2o_dma_map_sg(c, ireq->sg_table, nents, direction, mptr); -}; - -/** - * i2o_block_sglist_free - Frees the SG list - * @ireq: I2O block request from which the SG should be freed - * - * Frees the SG list from the I2O block request. 
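
i2o_block_request_alloc()/i2o_block_request_free() sit on a mempool so requests can be obtained from atomic context. A toy userspace freelist with the same alloc/free shape; unlike a kernel mempool, this sketch has no slab fallback and simply returns NULL when drained:

        #include <stdio.h>
        #include <stdlib.h>

        struct obj { struct obj *next; char payload[64]; };
        struct pool { struct obj *free; };

        static int pool_init(struct pool *p, int n)
        {
                p->free = NULL;
                while (n--) {
                        struct obj *o = malloc(sizeof(*o));
                        if (!o)
                                return -1;
                        o->next = p->free;
                        p->free = o;
                }
                return 0;
        }

        static struct obj *pool_alloc(struct pool *p)
        {
                struct obj *o = p->free;
                if (o)
                        p->free = o->next;      /* pop from the freelist */
                return o;
        }

        static void pool_free(struct pool *p, struct obj *o)
        {
                o->next = p->free;              /* push back onto the freelist */
                p->free = o;
        }

        int main(void)
        {
                struct pool p;
                struct obj *a, *b;

                if (pool_init(&p, 2))
                        return 1;
                a = pool_alloc(&p);
                b = pool_alloc(&p);
                printf("third alloc: %p\n", (void *)pool_alloc(&p)); /* (nil) */
                pool_free(&p, a);
                pool_free(&p, b);
                return 0;
        }
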
- */ -static inline void i2o_block_sglist_free(struct i2o_block_request *ireq) -{ - enum dma_data_direction direction; - - if (rq_data_dir(ireq->req) == READ) - direction = PCI_DMA_FROMDEVICE; - else - direction = PCI_DMA_TODEVICE; - - dma_unmap_sg(ireq->dev, ireq->sg_table, ireq->sg_nents, direction); -}; - -/** - * i2o_block_prep_req_fn - Allocates I2O block device specific struct - * @q: request queue for the request - * @req: the request to prepare - * - * Allocate the necessary i2o_block_request struct and connect it to - * the request. This is needed that we not lose the SG list later on. - * - * Returns BLKPREP_OK on success or BLKPREP_DEFER on failure. - */ -static int i2o_block_prep_req_fn(struct request_queue *q, struct request *req) -{ - struct i2o_block_device *i2o_blk_dev = q->queuedata; - struct i2o_block_request *ireq; - - if (unlikely(!i2o_blk_dev)) { - osm_err("block device already removed\n"); - return BLKPREP_KILL; - } - - /* connect the i2o_block_request to the request */ - if (!req->special) { - ireq = i2o_block_request_alloc(); - if (IS_ERR(ireq)) { - osm_debug("unable to allocate i2o_block_request!\n"); - return BLKPREP_DEFER; - } - - ireq->i2o_blk_dev = i2o_blk_dev; - req->special = ireq; - ireq->req = req; - } - /* do not come back here */ - req->cmd_flags |= REQ_DONTPREP; - - return BLKPREP_OK; -}; - -/** - * i2o_block_delayed_request_fn - delayed request queue function - * @work: the delayed request with the queue to start - * - * If the request queue is stopped for a disk, and there is no open - * request, a new event is created, which calls this function to start - * the queue after I2O_BLOCK_REQUEST_TIME. Otherwise the queue will never - * be started again. - */ -static void i2o_block_delayed_request_fn(struct work_struct *work) -{ - struct i2o_block_delayed_request *dreq = - container_of(work, struct i2o_block_delayed_request, - work.work); - struct request_queue *q = dreq->queue; - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - blk_start_queue(q); - spin_unlock_irqrestore(q->queue_lock, flags); - kfree(dreq); -}; - -/** - * i2o_block_end_request - Post-processing of completed commands - * @req: request which should be completed - * @error: 0 for success, < 0 for error - * @nr_bytes: number of bytes to complete - * - * Mark the request as complete. The lock must not be held when entering. - * - */ -static void i2o_block_end_request(struct request *req, int error, - int nr_bytes) -{ - struct i2o_block_request *ireq = req->special; - struct i2o_block_device *dev = ireq->i2o_blk_dev; - struct request_queue *q = req->q; - unsigned long flags; - - if (blk_end_request(req, error, nr_bytes)) - if (error) - blk_end_request_all(req, -EIO); - - spin_lock_irqsave(q->queue_lock, flags); - - if (likely(dev)) { - dev->open_queue_depth--; - list_del(&ireq->queue); - } - - blk_start_queue(q); - - spin_unlock_irqrestore(q->queue_lock, flags); - - i2o_block_sglist_free(ireq); - i2o_block_request_free(ireq); -}; - -/** - * i2o_block_reply - Block OSM reply handler. - * @c: I2O controller from which the message arrives - * @m: message id of reply - * @msg: the actual I2O message reply - * - * This function gets all the message replies. - * - */ -static int i2o_block_reply(struct i2o_controller *c, u32 m, - struct i2o_message *msg) -{ - struct request *req; - int error = 0; - - req = i2o_cntxt_list_get(c, le32_to_cpu(msg->u.s.tcntxt)); - if (unlikely(!req)) { - osm_err("NULL reply received!\n"); - return -1; - } - - /* - * Lets see what is cooking. 
We stuffed the
- *      request in the context.
- */
-
-        if ((le32_to_cpu(msg->body[0]) >> 24) != 0) {
-                u32 status = le32_to_cpu(msg->body[0]);
-                /*
-                 * Device not ready means two things. One is that the
-                 * thing went offline (but not removable media).
-                 *
-                 * The second is that you have a SuperTrak 100 and the
-                 * firmware got constipated. Unlike standard i2o card
-                 * setups the supertrak returns an error rather than
-                 * blocking for the timeout in these cases.
-                 *
-                 * Don't stick a supertrak100 into cache aggressive modes.
-                 */
-
-                osm_err("TID %03x error status: 0x%02x, detailed status: "
-                        "0x%04x\n", (le32_to_cpu(msg->u.head[1]) >> 12 & 0xfff),
-                        status >> 24, status & 0xffff);
-
-                req->errors++;
-
-                error = -EIO;
-        }
-
-        i2o_block_end_request(req, error, le32_to_cpu(msg->body[1]));
-
-        return 1;
-};
-
-static void i2o_block_event(struct work_struct *work)
-{
-        struct i2o_event *evt = container_of(work, struct i2o_event, work);
-        osm_debug("event received\n");
-        kfree(evt);
-};
-
-/*
- * SCSI-CAM for ioctl geometry mapping.
- * Duplicated with SCSI - this should be moved into somewhere common,
- * perhaps genhd?
- *
- * LBA -> CHS mapping table taken from:
- *
- * "Incorporating the I2O Architecture into BIOS for Intel Architecture
- *  Platforms"
- *
- * This is an I2O document that is only available to I2O members,
- * not developers.
- *
- * From my understanding, this is how all the I2O cards do this:
- *
- * Disk Size      | Sectors | Heads | Cylinders
- * ---------------+---------+-------+-------------------
- * 1 < X <= 528M  | 63      | 16    | X/(63 * 16 * 512)
- * 528M < X <= 1G | 63      | 32    | X/(63 * 32 * 512)
- * 1G < X <= 21G  | 63      | 64    | X/(63 * 64 * 512)
- * 21G < X <= 42G | 63      | 128   | X/(63 * 128 * 512)
- * 42G < X        | 63      | 255   | X/(63 * 255 * 512)
- */
-#define BLOCK_SIZE_528M         1081344
-#define BLOCK_SIZE_1G           2097152
-#define BLOCK_SIZE_21G          4403200
-#define BLOCK_SIZE_42G          8806400
-#define BLOCK_SIZE_84G          17612800
-
-static void i2o_block_biosparam(unsigned long capacity, unsigned short *cyls,
-                                unsigned char *hds, unsigned char *secs)
-{
-        unsigned long heads, sectors, cylinders;
-
-        sectors = 63L;                  /* Maximize sectors per track */
-        if (capacity <= BLOCK_SIZE_528M)
-                heads = 16;
-        else if (capacity <= BLOCK_SIZE_1G)
-                heads = 32;
-        else if (capacity <= BLOCK_SIZE_21G)
-                heads = 64;
-        else if (capacity <= BLOCK_SIZE_42G)
-                heads = 128;
-        else
-                heads = 255;
-
-        cylinders = (unsigned long)capacity / (heads * sectors);
-
-        *cyls = (unsigned short)cylinders;      /* Stuff return values */
-        *secs = (unsigned char)sectors;
-        *hds = (unsigned char)heads;
-}
-
-/**
- * i2o_block_open - Open the block device
- * @bdev: block device being opened
- * @mode: file open mode
- *
- * Power up the device, mount and lock the media. This function is called
- * if the block device is opened for access.
- *
- * Returns 0 on success or negative error code on failure.
- */
-static int i2o_block_open(struct block_device *bdev, fmode_t mode)
-{
-        struct i2o_block_device *dev = bdev->bd_disk->private_data;
-
-        if (!dev->i2o_dev)
-                return -ENODEV;
-
-        mutex_lock(&i2o_block_mutex);
-        if (dev->power > 0x1f)
-                i2o_block_device_power(dev, 0x02);
-
-        i2o_block_device_mount(dev->i2o_dev, -1);
-
-        i2o_block_device_lock(dev->i2o_dev, -1);
-
-        osm_debug("Ready.\n");
-        mutex_unlock(&i2o_block_mutex);
-
-        return 0;
-};
-
-/**
- * i2o_block_release - Release the I2O block device
- * @disk: gendisk device being released
- * @mode: file open mode
- *
- * Unlock and unmount the media, and power down the device. Gets called
- * if the block device is closed.
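
The biosparam heuristic above is easy to verify in isolation. A standalone version with the thresholds inlined; capacity is in 512-byte sectors, as in the driver:

        #include <stdio.h>

        /* Fixed 63 sectors/track; heads step up with capacity; cylinders
         * fall out by division. Threshold constants copied from above. */
        static void biosparam(unsigned long capacity, unsigned short *cyls,
                              unsigned char *hds, unsigned char *secs)
        {
                unsigned long heads, sectors = 63;

                if (capacity <= 1081344)        /* <= 528M */
                        heads = 16;
                else if (capacity <= 2097152)   /* <= 1G */
                        heads = 32;
                else if (capacity <= 4403200)
                        heads = 64;
                else if (capacity <= 8806400)
                        heads = 128;
                else
                        heads = 255;

                *cyls = (unsigned short)(capacity / (heads * sectors));
                *hds = (unsigned char)heads;
                *secs = (unsigned char)sectors;
        }

        int main(void)
        {
                unsigned short c;
                unsigned char h, s;

                biosparam(2097152, &c, &h, &s); /* a 1 GiB disk */
                printf("C/H/S = %u/%u/%u\n", c, h, s);
                return 0;
        }
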
- */ -static void i2o_block_release(struct gendisk *disk, fmode_t mode) -{ - struct i2o_block_device *dev = disk->private_data; - u8 operation; - - /* - * This is to deal with the case of an application - * opening a device and then the device disappears while - * it's in use, and then the application tries to release - * it. ex: Unmounting a deleted RAID volume at reboot. - * If we send messages, it will just cause FAILs since - * the TID no longer exists. - */ - if (!dev->i2o_dev) - return; - - mutex_lock(&i2o_block_mutex); - i2o_block_device_flush(dev->i2o_dev); - - i2o_block_device_unlock(dev->i2o_dev, -1); - - if (dev->flags & (1 << 3 | 1 << 4)) /* Removable */ - operation = 0x21; - else - operation = 0x24; - - i2o_block_device_power(dev, operation); - mutex_unlock(&i2o_block_mutex); -} - -static int i2o_block_getgeo(struct block_device *bdev, struct hd_geometry *geo) -{ - i2o_block_biosparam(get_capacity(bdev->bd_disk), - &geo->cylinders, &geo->heads, &geo->sectors); - return 0; -} - -/** - * i2o_block_ioctl - Issue device specific ioctl calls. - * @bdev: block device being opened - * @mode: file open mode - * @cmd: ioctl command - * @arg: arg - * - * Handles ioctl request for the block device. - * - * Return 0 on success or negative error on failure. - */ -static int i2o_block_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) -{ - struct gendisk *disk = bdev->bd_disk; - struct i2o_block_device *dev = disk->private_data; - int ret = -ENOTTY; - - /* Anyone capable of this syscall can do *real bad* things */ - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - mutex_lock(&i2o_block_mutex); - switch (cmd) { - case BLKI2OGRSTRAT: - ret = put_user(dev->rcache, (int __user *)arg); - break; - case BLKI2OGWSTRAT: - ret = put_user(dev->wcache, (int __user *)arg); - break; - case BLKI2OSRSTRAT: - ret = -EINVAL; - if (arg < 0 || arg > CACHE_SMARTFETCH) - break; - dev->rcache = arg; - ret = 0; - break; - case BLKI2OSWSTRAT: - ret = -EINVAL; - if (arg != 0 - && (arg < CACHE_WRITETHROUGH || arg > CACHE_SMARTBACK)) - break; - dev->wcache = arg; - ret = 0; - break; - } - mutex_unlock(&i2o_block_mutex); - - return ret; -}; - -/** - * i2o_block_check_events - Have we seen a media change? - * @disk: gendisk which should be verified - * @clearing: events being cleared - * - * Verifies if the media has changed. - * - * Returns 1 if the media was changed or 0 otherwise. - */ -static unsigned int i2o_block_check_events(struct gendisk *disk, - unsigned int clearing) -{ - struct i2o_block_device *p = disk->private_data; - - if (p->media_change_flag) { - p->media_change_flag = 0; - return DISK_EVENT_MEDIA_CHANGE; - } - return 0; -} - -/** - * i2o_block_transfer - Transfer a request to/from the I2O controller - * @req: the request which should be transferred - * - * This function converts the request into a I2O message. The necessary - * DMA buffers are allocated and after everything is setup post the message - * to the I2O controller. No cleanup is done by this function. It is done - * on the interrupt side when the reply arrives. - * - * Return 0 on success or negative error code on failure. 
- */ -static int i2o_block_transfer(struct request *req) -{ - struct i2o_block_device *dev = req->rq_disk->private_data; - struct i2o_controller *c; - u32 tid; - struct i2o_message *msg; - u32 *mptr; - struct i2o_block_request *ireq = req->special; - u32 tcntxt; - u32 sgl_offset = SGL_OFFSET_8; - u32 ctl_flags = 0x00000000; - int rc; - u32 cmd; - - if (unlikely(!dev->i2o_dev)) { - osm_err("transfer to removed drive\n"); - rc = -ENODEV; - goto exit; - } - - tid = dev->i2o_dev->lct_data.tid; - c = dev->i2o_dev->iop; - - msg = i2o_msg_get(c); - if (IS_ERR(msg)) { - rc = PTR_ERR(msg); - goto exit; - } - - tcntxt = i2o_cntxt_list_add(c, req); - if (!tcntxt) { - rc = -ENOMEM; - goto nop_msg; - } - - msg->u.s.icntxt = cpu_to_le32(i2o_block_driver.context); - msg->u.s.tcntxt = cpu_to_le32(tcntxt); - - mptr = &msg->body[0]; - - if (rq_data_dir(req) == READ) { - cmd = I2O_CMD_BLOCK_READ << 24; - - switch (dev->rcache) { - case CACHE_PREFETCH: - ctl_flags = 0x201F0008; - break; - - case CACHE_SMARTFETCH: - if (blk_rq_sectors(req) > 16) - ctl_flags = 0x201F0008; - else - ctl_flags = 0x001F0000; - break; - - default: - break; - } - } else { - cmd = I2O_CMD_BLOCK_WRITE << 24; - - switch (dev->wcache) { - case CACHE_WRITETHROUGH: - ctl_flags = 0x001F0008; - break; - case CACHE_WRITEBACK: - ctl_flags = 0x001F0010; - break; - case CACHE_SMARTBACK: - if (blk_rq_sectors(req) > 16) - ctl_flags = 0x001F0004; - else - ctl_flags = 0x001F0010; - break; - case CACHE_SMARTTHROUGH: - if (blk_rq_sectors(req) > 16) - ctl_flags = 0x001F0004; - else - ctl_flags = 0x001F0010; - default: - break; - } - } - -#ifdef CONFIG_I2O_EXT_ADAPTEC - if (c->adaptec) { - u8 cmd[10]; - u32 scsi_flags; - u16 hwsec; - - hwsec = queue_logical_block_size(req->q) >> KERNEL_SECTOR_SHIFT; - memset(cmd, 0, 10); - - sgl_offset = SGL_OFFSET_12; - - msg->u.head[1] = - cpu_to_le32(I2O_CMD_PRIVATE << 24 | HOST_TID << 12 | tid); - - *mptr++ = cpu_to_le32(I2O_VENDOR_DPT << 16 | I2O_CMD_SCSI_EXEC); - *mptr++ = cpu_to_le32(tid); - - /* - * ENABLE_DISCONNECT - * SIMPLE_TAG - * RETURN_SENSE_DATA_IN_REPLY_MESSAGE_FRAME - */ - if (rq_data_dir(req) == READ) { - cmd[0] = READ_10; - scsi_flags = 0x60a0000a; - } else { - cmd[0] = WRITE_10; - scsi_flags = 0xa0a0000a; - } - - *mptr++ = cpu_to_le32(scsi_flags); - - *((u32 *) & cmd[2]) = cpu_to_be32(blk_rq_pos(req) * hwsec); - *((u16 *) & cmd[7]) = cpu_to_be16(blk_rq_sectors(req) * hwsec); - - memcpy(mptr, cmd, 10); - mptr += 4; - *mptr++ = cpu_to_le32(blk_rq_bytes(req)); - } else -#endif - { - msg->u.head[1] = cpu_to_le32(cmd | HOST_TID << 12 | tid); - *mptr++ = cpu_to_le32(ctl_flags); - *mptr++ = cpu_to_le32(blk_rq_bytes(req)); - *mptr++ = - cpu_to_le32((u32) (blk_rq_pos(req) << KERNEL_SECTOR_SHIFT)); - *mptr++ = - cpu_to_le32(blk_rq_pos(req) >> (32 - KERNEL_SECTOR_SHIFT)); - } - - if (!i2o_block_sglist_alloc(c, ireq, &mptr)) { - rc = -ENOMEM; - goto context_remove; - } - - msg->u.head[0] = - cpu_to_le32(I2O_MESSAGE_SIZE(mptr - &msg->u.head[0]) | sgl_offset); - - list_add_tail(&ireq->queue, &dev->open_queue); - dev->open_queue_depth++; - - i2o_msg_post(c, msg); - - return 0; - - context_remove: - i2o_cntxt_list_remove(c, req); - - nop_msg: - i2o_msg_nop(c, msg); - - exit: - return rc; -}; - -/** - * i2o_block_request_fn - request queue handling function - * @q: request queue from which the request could be fetched - * - * Takes the next request from the queue, transfers it and if no error - * occurs dequeue it from the queue. On arrival of the reply the message - * will be processed further. 
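
One detail worth checking in the non-Adaptec branch of the transfer path above: the 64-bit byte offset goes out as two 32-bit words, low word first. A quick demonstration that the (u32)(pos << 9) / pos >> (32 - 9) split is lossless:

        #include <stdint.h>
        #include <stdio.h>

        #define KERNEL_SECTOR_SHIFT 9

        int main(void)
        {
                uint64_t sector = 0x12345678ULL;        /* an arbitrary LBA */
                uint64_t byte_off = sector << KERNEL_SECTOR_SHIFT;

                /* exactly the two words the driver posts */
                uint32_t lo = (uint32_t)(sector << KERNEL_SECTOR_SHIFT);
                uint32_t hi = (uint32_t)(sector >> (32 - KERNEL_SECTOR_SHIFT));

                /* recombine to confirm nothing was lost */
                uint64_t back = ((uint64_t)hi << 32) | lo;

                printf("offset  = %016llx\n", (unsigned long long)byte_off);
                printf("lo/hi   = %08x %08x\n", lo, hi);
                printf("rebuilt = %016llx\n", (unsigned long long)back);
                return 0;
        }
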
If an error occurs requeue the request. - */ -static void i2o_block_request_fn(struct request_queue *q) -{ - struct request *req; - - while ((req = blk_peek_request(q)) != NULL) { - if (req->cmd_type == REQ_TYPE_FS) { - struct i2o_block_delayed_request *dreq; - struct i2o_block_request *ireq = req->special; - unsigned int queue_depth; - - queue_depth = ireq->i2o_blk_dev->open_queue_depth; - - if (queue_depth < I2O_BLOCK_MAX_OPEN_REQUESTS) { - if (!i2o_block_transfer(req)) { - blk_start_request(req); - continue; - } else - osm_info("transfer error\n"); - } - - if (queue_depth) - break; - - /* stop the queue and retry later */ - dreq = kmalloc(sizeof(*dreq), GFP_ATOMIC); - if (!dreq) - continue; - - dreq->queue = q; - INIT_DELAYED_WORK(&dreq->work, - i2o_block_delayed_request_fn); - - if (!queue_delayed_work(i2o_block_driver.event_queue, - &dreq->work, - I2O_BLOCK_RETRY_TIME)) - kfree(dreq); - else { - blk_stop_queue(q); - break; - } - } else { - blk_start_request(req); - __blk_end_request_all(req, -EIO); - } - } -}; - -/* I2O Block device operations definition */ -static const struct block_device_operations i2o_block_fops = { - .owner = THIS_MODULE, - .open = i2o_block_open, - .release = i2o_block_release, - .ioctl = i2o_block_ioctl, - .compat_ioctl = i2o_block_ioctl, - .getgeo = i2o_block_getgeo, - .check_events = i2o_block_check_events, -}; - -/** - * i2o_block_device_alloc - Allocate memory for a I2O Block device - * - * Allocate memory for the i2o_block_device struct, gendisk and request - * queue and initialize them as far as no additional information is needed. - * - * Returns a pointer to the allocated I2O Block device on success or a - * negative error code on failure. - */ -static struct i2o_block_device *i2o_block_device_alloc(void) -{ - struct i2o_block_device *dev; - struct gendisk *gd; - struct request_queue *queue; - int rc; - - dev = kzalloc(sizeof(*dev), GFP_KERNEL); - if (!dev) { - osm_err("Insufficient memory to allocate I2O Block disk.\n"); - rc = -ENOMEM; - goto exit; - } - - INIT_LIST_HEAD(&dev->open_queue); - spin_lock_init(&dev->lock); - dev->rcache = CACHE_PREFETCH; - dev->wcache = CACHE_WRITEBACK; - - /* allocate a gendisk with 16 partitions */ - gd = alloc_disk(16); - if (!gd) { - osm_err("Insufficient memory to allocate gendisk.\n"); - rc = -ENOMEM; - goto cleanup_dev; - } - - /* initialize the request queue */ - queue = blk_init_queue(i2o_block_request_fn, &dev->lock); - if (!queue) { - osm_err("Insufficient memory to allocate request queue.\n"); - rc = -ENOMEM; - goto cleanup_queue; - } - - blk_queue_prep_rq(queue, i2o_block_prep_req_fn); - - gd->major = I2O_MAJOR; - gd->queue = queue; - gd->fops = &i2o_block_fops; - gd->private_data = dev; - - dev->gd = gd; - - return dev; - - cleanup_queue: - put_disk(gd); - - cleanup_dev: - kfree(dev); - - exit: - return ERR_PTR(rc); -}; - -/** - * i2o_block_probe - verify if dev is a I2O Block device and install it - * @dev: device to verify if it is a I2O Block device - * - * We only verify if the user_tid of the device is 0xfff and then install - * the device. Otherwise it is used by some other device (e. g. RAID). - * - * Returns 0 on success or negative error code on failure. 
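
The gendisk above is allocated with 16 minors per disk, and the probe routine in the next hunk derives first_minor and the device name from a static unit counter; the arithmetic in isolation:

        #include <stdio.h>

        int main(void)
        {
                /* unit N starts at minor N << 4 and is named i2o/hd<letter>,
                 * matching alloc_disk(16) and the sprintf in probe */
                for (int unit = 0; unit < 4; unit++)
                        printf("i2o/hd%c  first_minor=%d\n",
                               'a' + unit, unit << 4);
                return 0;
        }
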
- */ -static int i2o_block_probe(struct device *dev) -{ - struct i2o_device *i2o_dev = to_i2o_device(dev); - struct i2o_controller *c = i2o_dev->iop; - struct i2o_block_device *i2o_blk_dev; - struct gendisk *gd; - struct request_queue *queue; - static int unit = 0; - int rc; - u64 size; - u32 blocksize; - u16 body_size = 4; - u16 power; - unsigned short max_sectors; - -#ifdef CONFIG_I2O_EXT_ADAPTEC - if (c->adaptec) - body_size = 8; -#endif - - if (c->limit_sectors) - max_sectors = I2O_MAX_SECTORS_LIMITED; - else - max_sectors = I2O_MAX_SECTORS; - - /* skip devices which are used by IOP */ - if (i2o_dev->lct_data.user_tid != 0xfff) { - osm_debug("skipping used device %03x\n", i2o_dev->lct_data.tid); - return -ENODEV; - } - - if (i2o_device_claim(i2o_dev)) { - osm_warn("Unable to claim device. Installation aborted\n"); - rc = -EFAULT; - goto exit; - } - - i2o_blk_dev = i2o_block_device_alloc(); - if (IS_ERR(i2o_blk_dev)) { - osm_err("could not alloc a new I2O block device"); - rc = PTR_ERR(i2o_blk_dev); - goto claim_release; - } - - i2o_blk_dev->i2o_dev = i2o_dev; - dev_set_drvdata(dev, i2o_blk_dev); - - /* setup gendisk */ - gd = i2o_blk_dev->gd; - gd->first_minor = unit << 4; - sprintf(gd->disk_name, "i2o/hd%c", 'a' + unit); - gd->driverfs_dev = &i2o_dev->device; - - /* setup request queue */ - queue = gd->queue; - queue->queuedata = i2o_blk_dev; - - blk_queue_max_hw_sectors(queue, max_sectors); - blk_queue_max_segments(queue, i2o_sg_tablesize(c, body_size)); - - osm_debug("max sectors = %d\n", queue->max_sectors); - osm_debug("phys segments = %d\n", queue->max_phys_segments); - osm_debug("max hw segments = %d\n", queue->max_hw_segments); - - /* - * Ask for the current media data. If that isn't supported - * then we ask for the device capacity data - */ - if (!i2o_parm_field_get(i2o_dev, 0x0004, 1, &blocksize, 4) || - !i2o_parm_field_get(i2o_dev, 0x0000, 3, &blocksize, 4)) { - blk_queue_logical_block_size(queue, le32_to_cpu(blocksize)); - } else - osm_warn("unable to get blocksize of %s\n", gd->disk_name); - - if (!i2o_parm_field_get(i2o_dev, 0x0004, 0, &size, 8) || - !i2o_parm_field_get(i2o_dev, 0x0000, 4, &size, 8)) { - set_capacity(gd, le64_to_cpu(size) >> KERNEL_SECTOR_SHIFT); - } else - osm_warn("could not get size of %s\n", gd->disk_name); - - if (!i2o_parm_field_get(i2o_dev, 0x0000, 2, &power, 2)) - i2o_blk_dev->power = power; - - i2o_event_register(i2o_dev, &i2o_block_driver, 0, 0xffffffff); - - add_disk(gd); - - unit++; - - osm_info("device added (TID: %03x): %s\n", i2o_dev->lct_data.tid, - i2o_blk_dev->gd->disk_name); - - return 0; - - claim_release: - i2o_device_claim_release(i2o_dev); - - exit: - return rc; -}; - -/* Block OSM driver struct */ -static struct i2o_driver i2o_block_driver = { - .name = OSM_NAME, - .event = i2o_block_event, - .reply = i2o_block_reply, - .classes = i2o_block_class_id, - .driver = { - .probe = i2o_block_probe, - .remove = i2o_block_remove, - }, -}; - -/** - * i2o_block_init - Block OSM initialization function - * - * Allocate the slab and mempool for request structs, registers i2o_block - * block device and finally register the Block OSM in the I2O core. - * - * Returns 0 on success or negative error code on failure. 
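
The probe path above (and the module init that follows) unwinds failures with gotos, each label releasing exactly what was acquired before the failing step, in reverse order. The same shape in a userspace miniature; the resource names are illustrative:

        #include <stdio.h>
        #include <stdlib.h>

        static int demo_init(void)
        {
                int rc;
                void *slab = malloc(32), *pool = NULL;

                if (!slab) {
                        rc = -1;
                        goto exit;
                }

                pool = malloc(64);
                if (!pool) {
                        rc = -1;
                        goto free_slab;
                }

                if (0 /* pretend a registration step failed here */) {
                        rc = -1;
                        goto free_pool;
                }

                printf("initialized\n");
                return 0;

        free_pool:
                free(pool);     /* undo step 2 */
        free_slab:
                free(slab);     /* undo step 1 */
        exit:
                return rc;
        }

        int main(void)
        {
                return demo_init() ? 1 : 0;
        }
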
- */ -static int __init i2o_block_init(void) -{ - int rc; - int size; - - printk(KERN_INFO OSM_DESCRIPTION " v" OSM_VERSION "\n"); - - /* Allocate request mempool and slab */ - size = sizeof(struct i2o_block_request); - i2o_blk_req_pool.slab = kmem_cache_create("i2o_block_req", size, 0, - SLAB_HWCACHE_ALIGN, NULL); - if (!i2o_blk_req_pool.slab) { - osm_err("can't init request slab\n"); - rc = -ENOMEM; - goto exit; - } - - i2o_blk_req_pool.pool = - mempool_create_slab_pool(I2O_BLOCK_REQ_MEMPOOL_SIZE, - i2o_blk_req_pool.slab); - if (!i2o_blk_req_pool.pool) { - osm_err("can't init request mempool\n"); - rc = -ENOMEM; - goto free_slab; - } - - /* Register the block device interfaces */ - rc = register_blkdev(I2O_MAJOR, "i2o_block"); - if (rc) { - osm_err("unable to register block device\n"); - goto free_mempool; - } -#ifdef MODULE - osm_info("registered device at major %d\n", I2O_MAJOR); -#endif - - /* Register Block OSM into I2O core */ - rc = i2o_driver_register(&i2o_block_driver); - if (rc) { - osm_err("Could not register Block driver\n"); - goto unregister_blkdev; - } - - return 0; - - unregister_blkdev: - unregister_blkdev(I2O_MAJOR, "i2o_block"); - - free_mempool: - mempool_destroy(i2o_blk_req_pool.pool); - - free_slab: - kmem_cache_destroy(i2o_blk_req_pool.slab); - - exit: - return rc; -}; - -/** - * i2o_block_exit - Block OSM exit function - * - * Unregisters Block OSM from I2O core, unregisters i2o_block block device - * and frees the mempool and slab. - */ -static void __exit i2o_block_exit(void) -{ - /* Unregister I2O Block OSM from I2O core */ - i2o_driver_unregister(&i2o_block_driver); - - /* Unregister block device */ - unregister_blkdev(I2O_MAJOR, "i2o_block"); - - /* Free request mempool and slab */ - mempool_destroy(i2o_blk_req_pool.pool); - kmem_cache_destroy(i2o_blk_req_pool.slab); -}; - -MODULE_AUTHOR("Red Hat"); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION(OSM_DESCRIPTION); -MODULE_VERSION(OSM_VERSION); - -module_init(i2o_block_init); -module_exit(i2o_block_exit); diff --git a/drivers/message/i2o/i2o_block.h b/drivers/message/i2o/i2o_block.h deleted file mode 100644 index cf8873cbca3f..000000000000 --- a/drivers/message/i2o/i2o_block.h +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Block OSM structures/API - * - * Copyright (C) 1999-2002 Red Hat Software - * - * Written by Alan Cox, Building Number Three Ltd - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * For the purpose of avoiding doubt the preferred form of the work - * for making modifications shall be a standards compliant form such - * gzipped tar and not one requiring a proprietary or patent encumbered - * tool to unpack. - * - * Fixes/additions: - * Steve Ralston: - * Multiple device handling error fixes, - * Added a queue depth. - * Alan Cox: - * FC920 has an rmw bug. Dont or in the end marker. - * Removed queue walk, fixed for 64bitness. 
- * Rewrote much of the code over time - * Added indirect block lists - * Handle 64K limits on many controllers - * Don't use indirects on the Promise (breaks) - * Heavily chop down the queue depths - * Deepak Saxena: - * Independent queues per IOP - * Support for dynamic device creation/deletion - * Code cleanup - * Support for larger I/Os through merge* functions - * (taken from DAC960 driver) - * Boji T Kannanthanam: - * Set the I2O Block devices to be detected in increasing - * order of TIDs during boot. - * Search and set the I2O block device that we boot off - * from as the first device to be claimed (as /dev/i2o/hda) - * Properly attach/detach I2O gendisk structure from the - * system gendisk list. The I2O block devices now appear in - * /proc/partitions. - * Markus Lidel : - * Minor bugfixes for 2.6. - */ - -#ifndef I2O_BLOCK_OSM_H -#define I2O_BLOCK_OSM_H - -#define I2O_BLOCK_RETRY_TIME HZ/4 -#define I2O_BLOCK_MAX_OPEN_REQUESTS 50 - -/* request queue sizes */ -#define I2O_BLOCK_REQ_MEMPOOL_SIZE 32 - -#define KERNEL_SECTOR_SHIFT 9 -#define KERNEL_SECTOR_SIZE (1 << KERNEL_SECTOR_SHIFT) - -/* I2O Block OSM mempool struct */ -struct i2o_block_mempool { - struct kmem_cache *slab; - mempool_t *pool; -}; - -/* I2O Block device descriptor */ -struct i2o_block_device { - struct i2o_device *i2o_dev; /* pointer to I2O device */ - struct gendisk *gd; - spinlock_t lock; /* queue lock */ - struct list_head open_queue; /* list of transferred, but unfinished - requests */ - unsigned int open_queue_depth; /* number of requests in the queue */ - - int rcache; /* read cache flags */ - int wcache; /* write cache flags */ - int flags; - u16 power; /* power state */ - int media_change_flag; /* media changed flag */ -}; - -/* I2O Block device request */ -struct i2o_block_request { - struct list_head queue; - struct request *req; /* corresponding request */ - struct i2o_block_device *i2o_blk_dev; /* I2O block device */ - struct device *dev; /* device used for DMA */ - int sg_nents; /* number of SG elements */ - struct scatterlist sg_table[I2O_MAX_PHYS_SEGMENTS]; /* SG table */ -}; - -/* I2O Block device delayed request */ -struct i2o_block_delayed_request { - struct delayed_work work; - struct request_queue *queue; -}; - -#endif diff --git a/drivers/message/i2o/i2o_config.c b/drivers/message/i2o/i2o_config.c deleted file mode 100644 index 04bd3b6de401..000000000000 --- a/drivers/message/i2o/i2o_config.c +++ /dev/null @@ -1,1163 +0,0 @@ -/* - * I2O Configuration Interface Driver - * - * (C) Copyright 1999-2002 Red Hat - * - * Written by Alan Cox, Building Number Three Ltd - * - * Fixes/additions: - * Deepak Saxena (04/20/1999): - * Added basic ioctl() support - * Deepak Saxena (06/07/1999): - * Added software download ioctl (still testing) - * Auvo Häkkinen (09/10/1999): - * Changes to i2o_cfg_reply(), ioctl_parms() - * Added ioct_validate() - * Taneli Vähäkangas (09/30/1999): - * Fixed ioctl_swdl() - * Taneli Vähäkangas (10/04/1999): - * Changed ioctl_swdl(), implemented ioctl_swul() and ioctl_swdel() - * Deepak Saxena (11/18/1999): - * Added event managmenet support - * Alan Cox : - * 2.4 rewrite ported to 2.5 - * Markus Lidel : - * Added pass-thru support for Adaptec's raidutils - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- */ - -#include -#include -#include -#include - -#include - -#include "core.h" - -#define SG_TABLESIZE 30 - -static DEFINE_MUTEX(i2o_cfg_mutex); -static long i2o_cfg_ioctl(struct file *, unsigned int, unsigned long); - -static spinlock_t i2o_config_lock; - -#define MODINC(x,y) ((x) = ((x) + 1) % (y)) - -struct sg_simple_element { - u32 flag_count; - u32 addr_bus; -}; - -struct i2o_cfg_info { - struct file *fp; - struct fasync_struct *fasync; - struct i2o_evt_info event_q[I2O_EVT_Q_LEN]; - u16 q_in; // Queue head index - u16 q_out; // Queue tail index - u16 q_len; // Queue length - u16 q_lost; // Number of lost events - ulong q_id; // Event queue ID...used as tx_context - struct i2o_cfg_info *next; -}; -static struct i2o_cfg_info *open_files = NULL; -static ulong i2o_cfg_info_id = 0; - -static int i2o_cfg_getiops(unsigned long arg) -{ - struct i2o_controller *c; - u8 __user *user_iop_table = (void __user *)arg; - u8 tmp[MAX_I2O_CONTROLLERS]; - int ret = 0; - - memset(tmp, 0, MAX_I2O_CONTROLLERS); - - list_for_each_entry(c, &i2o_controllers, list) - tmp[c->unit] = 1; - - if (copy_to_user(user_iop_table, tmp, MAX_I2O_CONTROLLERS)) - ret = -EFAULT; - - return ret; -}; - -static int i2o_cfg_gethrt(unsigned long arg) -{ - struct i2o_controller *c; - struct i2o_cmd_hrtlct __user *cmd = (struct i2o_cmd_hrtlct __user *)arg; - struct i2o_cmd_hrtlct kcmd; - i2o_hrt *hrt; - int len; - u32 reslen; - int ret = 0; - - if (copy_from_user(&kcmd, cmd, sizeof(struct i2o_cmd_hrtlct))) - return -EFAULT; - - if (get_user(reslen, kcmd.reslen) < 0) - return -EFAULT; - - if (kcmd.resbuf == NULL) - return -EFAULT; - - c = i2o_find_iop(kcmd.iop); - if (!c) - return -ENXIO; - - hrt = (i2o_hrt *) c->hrt.virt; - - len = 8 + ((hrt->entry_len * hrt->num_entries) << 2); - - if (put_user(len, kcmd.reslen)) - ret = -EFAULT; - else if (len > reslen) - ret = -ENOBUFS; - else if (copy_to_user(kcmd.resbuf, (void *)hrt, len)) - ret = -EFAULT; - - return ret; -}; - -static int i2o_cfg_getlct(unsigned long arg) -{ - struct i2o_controller *c; - struct i2o_cmd_hrtlct __user *cmd = (struct i2o_cmd_hrtlct __user *)arg; - struct i2o_cmd_hrtlct kcmd; - i2o_lct *lct; - int len; - int ret = 0; - u32 reslen; - - if (copy_from_user(&kcmd, cmd, sizeof(struct i2o_cmd_hrtlct))) - return -EFAULT; - - if (get_user(reslen, kcmd.reslen) < 0) - return -EFAULT; - - if (kcmd.resbuf == NULL) - return -EFAULT; - - c = i2o_find_iop(kcmd.iop); - if (!c) - return -ENXIO; - - lct = (i2o_lct *) c->lct; - - len = (unsigned int)lct->table_size << 2; - if (put_user(len, kcmd.reslen)) - ret = -EFAULT; - else if (len > reslen) - ret = -ENOBUFS; - else if (copy_to_user(kcmd.resbuf, lct, len)) - ret = -EFAULT; - - return ret; -}; - -static int i2o_cfg_parms(unsigned long arg, unsigned int type) -{ - int ret = 0; - struct i2o_controller *c; - struct i2o_device *dev; - struct i2o_cmd_psetget __user *cmd = - (struct i2o_cmd_psetget __user *)arg; - struct i2o_cmd_psetget kcmd; - u32 reslen; - u8 *ops; - u8 *res; - int len = 0; - - u32 i2o_cmd = (type == I2OPARMGET ? - I2O_CMD_UTIL_PARAMS_GET : I2O_CMD_UTIL_PARAMS_SET); - - if (copy_from_user(&kcmd, cmd, sizeof(struct i2o_cmd_psetget))) - return -EFAULT; - - if (get_user(reslen, kcmd.reslen)) - return -EFAULT; - - c = i2o_find_iop(kcmd.iop); - if (!c) - return -ENXIO; - - dev = i2o_iop_find_device(c, kcmd.tid); - if (!dev) - return -ENXIO; - - /* - * Stop users being able to try and allocate arbitrary amounts - * of DMA space. 64K is way more than sufficient for this. 
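
The per-open-file event queue above is a fixed ring driven by MODINC; q_lost presumably counts events dropped on overflow (the producer side lives in the reply handler, outside this hunk). A self-contained model of such a ring:

        #include <stdio.h>

        #define QLEN 4
        #define MODINC(x, y) ((x) = ((x) + 1) % (y))

        struct evtq {
                int q[QLEN];
                unsigned int in, out, len, lost;
        };

        static void push(struct evtq *e, int v)
        {
                if (e->len == QLEN) {   /* full: drop and count, like q_lost */
                        e->lost++;
                        return;
                }
                e->q[e->in] = v;
                MODINC(e->in, QLEN);
                e->len++;
        }

        static int pop(struct evtq *e, int *v)
        {
                if (!e->len)
                        return -1;
                *v = e->q[e->out];
                MODINC(e->out, QLEN);
                e->len--;
                return 0;
        }

        int main(void)
        {
                struct evtq e = { .len = 0 };
                int v;

                for (int i = 0; i < 6; i++)     /* two more than fit */
                        push(&e, i);

                while (!pop(&e, &v))
                        printf("event %d\n", v);
                printf("lost %u\n", e.lost);
                return 0;
        }
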
- */ - if (kcmd.oplen > 65536) - return -EMSGSIZE; - - ops = memdup_user(kcmd.opbuf, kcmd.oplen); - if (IS_ERR(ops)) - return PTR_ERR(ops); - - /* - * It's possible to have a _very_ large table - * and that the user asks for all of it at once... - */ - res = kmalloc(65536, GFP_KERNEL); - if (!res) { - kfree(ops); - return -ENOMEM; - } - - len = i2o_parm_issue(dev, i2o_cmd, ops, kcmd.oplen, res, 65536); - kfree(ops); - - if (len < 0) { - kfree(res); - return -EAGAIN; - } - - if (put_user(len, kcmd.reslen)) - ret = -EFAULT; - else if (len > reslen) - ret = -ENOBUFS; - else if (copy_to_user(kcmd.resbuf, res, len)) - ret = -EFAULT; - - kfree(res); - - return ret; -}; - -static int i2o_cfg_swdl(unsigned long arg) -{ - struct i2o_sw_xfer kxfer; - struct i2o_sw_xfer __user *pxfer = (struct i2o_sw_xfer __user *)arg; - unsigned char maxfrag = 0, curfrag = 1; - struct i2o_dma buffer; - struct i2o_message *msg; - unsigned int status = 0, swlen = 0, fragsize = 8192; - struct i2o_controller *c; - - if (copy_from_user(&kxfer, pxfer, sizeof(struct i2o_sw_xfer))) - return -EFAULT; - - if (get_user(swlen, kxfer.swlen) < 0) - return -EFAULT; - - if (get_user(maxfrag, kxfer.maxfrag) < 0) - return -EFAULT; - - if (get_user(curfrag, kxfer.curfrag) < 0) - return -EFAULT; - - if (curfrag == maxfrag) - fragsize = swlen - (maxfrag - 1) * 8192; - - if (!kxfer.buf || !access_ok(VERIFY_READ, kxfer.buf, fragsize)) - return -EFAULT; - - c = i2o_find_iop(kxfer.iop); - if (!c) - return -ENXIO; - - msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET); - if (IS_ERR(msg)) - return PTR_ERR(msg); - - if (i2o_dma_alloc(&c->pdev->dev, &buffer, fragsize)) { - i2o_msg_nop(c, msg); - return -ENOMEM; - } - - if (__copy_from_user(buffer.virt, kxfer.buf, fragsize)) { - i2o_msg_nop(c, msg); - i2o_dma_free(&c->pdev->dev, &buffer); - return -EFAULT; - } - - msg->u.head[0] = cpu_to_le32(NINE_WORD_MSG_SIZE | SGL_OFFSET_7); - msg->u.head[1] = - cpu_to_le32(I2O_CMD_SW_DOWNLOAD << 24 | HOST_TID << 12 | - ADAPTER_TID); - msg->u.head[2] = cpu_to_le32(i2o_config_driver.context); - msg->u.head[3] = cpu_to_le32(0); - msg->body[0] = - cpu_to_le32((((u32) kxfer.flags) << 24) | (((u32) kxfer. 
- sw_type) << 16) | - (((u32) maxfrag) << 8) | (((u32) curfrag))); - msg->body[1] = cpu_to_le32(swlen); - msg->body[2] = cpu_to_le32(kxfer.sw_id); - msg->body[3] = cpu_to_le32(0xD0000000 | fragsize); - msg->body[4] = cpu_to_le32(buffer.phys); - - osm_debug("swdl frag %d/%d (size %d)\n", curfrag, maxfrag, fragsize); - status = i2o_msg_post_wait_mem(c, msg, 60, &buffer); - - if (status != -ETIMEDOUT) - i2o_dma_free(&c->pdev->dev, &buffer); - - if (status != I2O_POST_WAIT_OK) { - // it fails if you try and send frags out of order - // and for some yet unknown reasons too - osm_info("swdl failed, DetailedStatus = %d\n", status); - return status; - } - - return 0; -}; - -static int i2o_cfg_swul(unsigned long arg) -{ - struct i2o_sw_xfer kxfer; - struct i2o_sw_xfer __user *pxfer = (struct i2o_sw_xfer __user *)arg; - unsigned char maxfrag = 0, curfrag = 1; - struct i2o_dma buffer; - struct i2o_message *msg; - unsigned int status = 0, swlen = 0, fragsize = 8192; - struct i2o_controller *c; - int ret = 0; - - if (copy_from_user(&kxfer, pxfer, sizeof(struct i2o_sw_xfer))) - return -EFAULT; - - if (get_user(swlen, kxfer.swlen) < 0) - return -EFAULT; - - if (get_user(maxfrag, kxfer.maxfrag) < 0) - return -EFAULT; - - if (get_user(curfrag, kxfer.curfrag) < 0) - return -EFAULT; - - if (curfrag == maxfrag) - fragsize = swlen - (maxfrag - 1) * 8192; - - if (!kxfer.buf) - return -EFAULT; - - c = i2o_find_iop(kxfer.iop); - if (!c) - return -ENXIO; - - msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET); - if (IS_ERR(msg)) - return PTR_ERR(msg); - - if (i2o_dma_alloc(&c->pdev->dev, &buffer, fragsize)) { - i2o_msg_nop(c, msg); - return -ENOMEM; - } - - msg->u.head[0] = cpu_to_le32(NINE_WORD_MSG_SIZE | SGL_OFFSET_7); - msg->u.head[1] = - cpu_to_le32(I2O_CMD_SW_UPLOAD << 24 | HOST_TID << 12 | ADAPTER_TID); - msg->u.head[2] = cpu_to_le32(i2o_config_driver.context); - msg->u.head[3] = cpu_to_le32(0); - msg->body[0] = - cpu_to_le32((u32) kxfer.flags << 24 | (u32) kxfer. 
- sw_type << 16 | (u32) maxfrag << 8 | (u32) curfrag); - msg->body[1] = cpu_to_le32(swlen); - msg->body[2] = cpu_to_le32(kxfer.sw_id); - msg->body[3] = cpu_to_le32(0xD0000000 | fragsize); - msg->body[4] = cpu_to_le32(buffer.phys); - - osm_debug("swul frag %d/%d (size %d)\n", curfrag, maxfrag, fragsize); - status = i2o_msg_post_wait_mem(c, msg, 60, &buffer); - - if (status != I2O_POST_WAIT_OK) { - if (status != -ETIMEDOUT) - i2o_dma_free(&c->pdev->dev, &buffer); - - osm_info("swul failed, DetailedStatus = %d\n", status); - return status; - } - - if (copy_to_user(kxfer.buf, buffer.virt, fragsize)) - ret = -EFAULT; - - i2o_dma_free(&c->pdev->dev, &buffer); - - return ret; -} - -static int i2o_cfg_swdel(unsigned long arg) -{ - struct i2o_controller *c; - struct i2o_sw_xfer kxfer; - struct i2o_sw_xfer __user *pxfer = (struct i2o_sw_xfer __user *)arg; - struct i2o_message *msg; - unsigned int swlen; - int token; - - if (copy_from_user(&kxfer, pxfer, sizeof(struct i2o_sw_xfer))) - return -EFAULT; - - if (get_user(swlen, kxfer.swlen) < 0) - return -EFAULT; - - c = i2o_find_iop(kxfer.iop); - if (!c) - return -ENXIO; - - msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET); - if (IS_ERR(msg)) - return PTR_ERR(msg); - - msg->u.head[0] = cpu_to_le32(SEVEN_WORD_MSG_SIZE | SGL_OFFSET_0); - msg->u.head[1] = - cpu_to_le32(I2O_CMD_SW_REMOVE << 24 | HOST_TID << 12 | ADAPTER_TID); - msg->u.head[2] = cpu_to_le32(i2o_config_driver.context); - msg->u.head[3] = cpu_to_le32(0); - msg->body[0] = - cpu_to_le32((u32) kxfer.flags << 24 | (u32) kxfer.sw_type << 16); - msg->body[1] = cpu_to_le32(swlen); - msg->body[2] = cpu_to_le32(kxfer.sw_id); - - token = i2o_msg_post_wait(c, msg, 10); - - if (token != I2O_POST_WAIT_OK) { - osm_info("swdel failed, DetailedStatus = %d\n", token); - return -ETIMEDOUT; - } - - return 0; -}; - -static int i2o_cfg_validate(unsigned long arg) -{ - int token; - int iop = (int)arg; - struct i2o_message *msg; - struct i2o_controller *c; - - c = i2o_find_iop(iop); - if (!c) - return -ENXIO; - - msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET); - if (IS_ERR(msg)) - return PTR_ERR(msg); - - msg->u.head[0] = cpu_to_le32(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0); - msg->u.head[1] = - cpu_to_le32(I2O_CMD_CONFIG_VALIDATE << 24 | HOST_TID << 12 | iop); - msg->u.head[2] = cpu_to_le32(i2o_config_driver.context); - msg->u.head[3] = cpu_to_le32(0); - - token = i2o_msg_post_wait(c, msg, 10); - - if (token != I2O_POST_WAIT_OK) { - osm_info("Can't validate configuration, ErrorStatus = %d\n", - token); - return -ETIMEDOUT; - } - - return 0; -}; - -static int i2o_cfg_evt_reg(unsigned long arg, struct file *fp) -{ - struct i2o_message *msg; - struct i2o_evt_id __user *pdesc = (struct i2o_evt_id __user *)arg; - struct i2o_evt_id kdesc; - struct i2o_controller *c; - struct i2o_device *d; - - if (copy_from_user(&kdesc, pdesc, sizeof(struct i2o_evt_id))) - return -EFAULT; - - /* IOP exists? */ - c = i2o_find_iop(kdesc.iop); - if (!c) - return -ENXIO; - - /* Device exists? 
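
The software download/upload paths above move images in fixed 8 KiB fragments, and only the last fragment (curfrag == maxfrag) may be short; that is all the fragsize computation expresses. The arithmetic in isolation:

        #include <stdio.h>

        #define FRAG 8192u

        /* mirrors: if (curfrag == maxfrag)
         *                  fragsize = swlen - (maxfrag - 1) * 8192; */
        static unsigned int frag_size(unsigned int swlen, unsigned int maxfrag,
                                      unsigned int curfrag)
        {
                if (curfrag == maxfrag)
                        return swlen - (maxfrag - 1) * FRAG;
                return FRAG;
        }

        int main(void)
        {
                unsigned int swlen = 20000;     /* 8192 + 8192 + 3616 */
                unsigned int maxfrag = (swlen + FRAG - 1) / FRAG;

                for (unsigned int f = 1; f <= maxfrag; f++)
                        printf("frag %u/%u: %u bytes\n",
                               f, maxfrag, frag_size(swlen, maxfrag, f));
                return 0;
        }
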
*/ - d = i2o_iop_find_device(c, kdesc.tid); - if (!d) - return -ENODEV; - - msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET); - if (IS_ERR(msg)) - return PTR_ERR(msg); - - msg->u.head[0] = cpu_to_le32(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0); - msg->u.head[1] = - cpu_to_le32(I2O_CMD_UTIL_EVT_REGISTER << 24 | HOST_TID << 12 | - kdesc.tid); - msg->u.head[2] = cpu_to_le32(i2o_config_driver.context); - msg->u.head[3] = cpu_to_le32(i2o_cntxt_list_add(c, fp->private_data)); - msg->body[0] = cpu_to_le32(kdesc.evt_mask); - - i2o_msg_post(c, msg); - - return 0; -} - -static int i2o_cfg_evt_get(unsigned long arg, struct file *fp) -{ - struct i2o_cfg_info *p = NULL; - struct i2o_evt_get __user *uget = (struct i2o_evt_get __user *)arg; - struct i2o_evt_get kget; - unsigned long flags; - - for (p = open_files; p; p = p->next) - if (p->q_id == (ulong) fp->private_data) - break; - - if (!p->q_len) - return -ENOENT; - - memcpy(&kget.info, &p->event_q[p->q_out], sizeof(struct i2o_evt_info)); - MODINC(p->q_out, I2O_EVT_Q_LEN); - spin_lock_irqsave(&i2o_config_lock, flags); - p->q_len--; - kget.pending = p->q_len; - kget.lost = p->q_lost; - spin_unlock_irqrestore(&i2o_config_lock, flags); - - if (copy_to_user(uget, &kget, sizeof(struct i2o_evt_get))) - return -EFAULT; - return 0; -} - -#ifdef CONFIG_COMPAT -static int i2o_cfg_passthru32(struct file *file, unsigned cmnd, - unsigned long arg) -{ - struct i2o_cmd_passthru32 __user *cmd; - struct i2o_controller *c; - u32 __user *user_msg; - u32 *reply = NULL; - u32 __user *user_reply = NULL; - u32 size = 0; - u32 reply_size = 0; - u32 rcode = 0; - struct i2o_dma sg_list[SG_TABLESIZE]; - u32 sg_offset = 0; - u32 sg_count = 0; - u32 i = 0; - u32 sg_index = 0; - i2o_status_block *sb; - struct i2o_message *msg; - unsigned int iop; - - cmd = (struct i2o_cmd_passthru32 __user *)arg; - - if (get_user(iop, &cmd->iop) || get_user(i, &cmd->msg)) - return -EFAULT; - - user_msg = compat_ptr(i); - - c = i2o_find_iop(iop); - if (!c) { - osm_debug("controller %d not found\n", iop); - return -ENXIO; - } - - sb = c->status_block.virt; - - if (get_user(size, &user_msg[0])) { - osm_warn("unable to get size!\n"); - return -EFAULT; - } - size = size >> 16; - - if (size > sb->inbound_frame_size) { - osm_warn("size of message > inbound_frame_size"); - return -EFAULT; - } - - user_reply = &user_msg[size]; - - size <<= 2; // Convert to bytes - - msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET); - if (IS_ERR(msg)) - return PTR_ERR(msg); - - rcode = -EFAULT; - /* Copy in the user's I2O command */ - if (copy_from_user(msg, user_msg, size)) { - osm_warn("unable to copy user message\n"); - goto out; - } - i2o_dump_message(msg); - - if (get_user(reply_size, &user_reply[0]) < 0) - goto out; - - reply_size >>= 16; - reply_size <<= 2; - - rcode = -ENOMEM; - reply = kzalloc(reply_size, GFP_KERNEL); - if (!reply) { - printk(KERN_WARNING "%s: Could not allocate reply buffer\n", - c->name); - goto out; - } - - sg_offset = (msg->u.head[0] >> 4) & 0x0f; - - memset(sg_list, 0, sizeof(sg_list[0]) * SG_TABLESIZE); - if (sg_offset) { - struct sg_simple_element *sg; - - if (sg_offset * 4 >= size) { - rcode = -EFAULT; - goto cleanup; - } - // TODO 64bit fix - sg = (struct sg_simple_element *)((&msg->u.head[0]) + - sg_offset); - sg_count = - (size - sg_offset * 4) / sizeof(struct sg_simple_element); - if (sg_count > SG_TABLESIZE) { - printk(KERN_DEBUG "%s:IOCTL SG List too large (%u)\n", - c->name, sg_count); - rcode = -EINVAL; - goto cleanup; - } - - for (i = 0; i < sg_count; i++) { - int sg_size; - struct 
i2o_dma *p; - - if (!(sg[i].flag_count & 0x10000000 - /*I2O_SGL_FLAGS_SIMPLE_ADDRESS_ELEMENT */ )) { - printk(KERN_DEBUG - "%s:Bad SG element %d - not simple (%x)\n", - c->name, i, sg[i].flag_count); - rcode = -EINVAL; - goto cleanup; - } - sg_size = sg[i].flag_count & 0xffffff; - p = &(sg_list[sg_index]); - /* Allocate memory for the transfer */ - if (i2o_dma_alloc(&c->pdev->dev, p, sg_size)) { - printk(KERN_DEBUG - "%s: Could not allocate SG buffer - size = %d buffer number %d of %d\n", - c->name, sg_size, i, sg_count); - rcode = -ENOMEM; - goto sg_list_cleanup; - } - sg_index++; - /* Copy in the user's SG buffer if necessary */ - if (sg[i]. - flag_count & 0x04000000 /*I2O_SGL_FLAGS_DIR */ ) { - // TODO 64bit fix - if (copy_from_user - (p->virt, - (void __user *)(unsigned long)sg[i]. - addr_bus, sg_size)) { - printk(KERN_DEBUG - "%s: Could not copy SG buf %d FROM user\n", - c->name, i); - rcode = -EFAULT; - goto sg_list_cleanup; - } - } - //TODO 64bit fix - sg[i].addr_bus = (u32) p->phys; - } - } - - rcode = i2o_msg_post_wait(c, msg, 60); - msg = NULL; - if (rcode) { - reply[4] = ((u32) rcode) << 24; - goto sg_list_cleanup; - } - - if (sg_offset) { - u32 rmsg[I2O_OUTBOUND_MSG_FRAME_SIZE]; - /* Copy back the Scatter Gather buffers back to user space */ - u32 j; - // TODO 64bit fix - struct sg_simple_element *sg; - int sg_size; - - // re-acquire the original message to handle correctly the sg copy operation - memset(&rmsg, 0, I2O_OUTBOUND_MSG_FRAME_SIZE * 4); - // get user msg size in u32s - if (get_user(size, &user_msg[0])) { - rcode = -EFAULT; - goto sg_list_cleanup; - } - size = size >> 16; - size *= 4; - if (size > sizeof(rmsg)) { - rcode = -EINVAL; - goto sg_list_cleanup; - } - - /* Copy in the user's I2O command */ - if (copy_from_user(rmsg, user_msg, size)) { - rcode = -EFAULT; - goto sg_list_cleanup; - } - sg_count = - (size - sg_offset * 4) / sizeof(struct sg_simple_element); - - // TODO 64bit fix - sg = (struct sg_simple_element *)(rmsg + sg_offset); - for (j = 0; j < sg_count; j++) { - /* Copy out the SG list to user's buffer if necessary */ - if (! - (sg[j]. 
- flag_count & 0x4000000 /*I2O_SGL_FLAGS_DIR */ )) { - sg_size = sg[j].flag_count & 0xffffff; - // TODO 64bit fix - if (copy_to_user - ((void __user *)(u64) sg[j].addr_bus, - sg_list[j].virt, sg_size)) { - printk(KERN_WARNING - "%s: Could not copy %p TO user %x\n", - c->name, sg_list[j].virt, - sg[j].addr_bus); - rcode = -EFAULT; - goto sg_list_cleanup; - } - } - } - } - -sg_list_cleanup: - /* Copy back the reply to user space */ - if (reply_size) { - // we wrote our own values for context - now restore the user supplied ones - if (copy_from_user(reply + 2, user_msg + 2, sizeof(u32) * 2)) { - printk(KERN_WARNING - "%s: Could not copy message context FROM user\n", - c->name); - rcode = -EFAULT; - } - if (copy_to_user(user_reply, reply, reply_size)) { - printk(KERN_WARNING - "%s: Could not copy reply TO user\n", c->name); - rcode = -EFAULT; - } - } - for (i = 0; i < sg_index; i++) - i2o_dma_free(&c->pdev->dev, &sg_list[i]); - -cleanup: - kfree(reply); -out: - if (msg) - i2o_msg_nop(c, msg); - return rcode; -} - -static long i2o_cfg_compat_ioctl(struct file *file, unsigned cmd, - unsigned long arg) -{ - int ret; - switch (cmd) { - case I2OGETIOPS: - ret = i2o_cfg_ioctl(file, cmd, arg); - break; - case I2OPASSTHRU32: - mutex_lock(&i2o_cfg_mutex); - ret = i2o_cfg_passthru32(file, cmd, arg); - mutex_unlock(&i2o_cfg_mutex); - break; - default: - ret = -ENOIOCTLCMD; - break; - } - return ret; -} - -#endif - -#ifdef CONFIG_I2O_EXT_ADAPTEC -static int i2o_cfg_passthru(unsigned long arg) -{ - struct i2o_cmd_passthru __user *cmd = - (struct i2o_cmd_passthru __user *)arg; - struct i2o_controller *c; - u32 __user *user_msg; - u32 *reply = NULL; - u32 __user *user_reply = NULL; - u32 size = 0; - u32 reply_size = 0; - u32 rcode = 0; - struct i2o_dma sg_list[SG_TABLESIZE]; - u32 sg_offset = 0; - u32 sg_count = 0; - int sg_index = 0; - u32 i = 0; - i2o_status_block *sb; - struct i2o_message *msg; - unsigned int iop; - - if (get_user(iop, &cmd->iop) || get_user(user_msg, &cmd->msg)) - return -EFAULT; - - c = i2o_find_iop(iop); - if (!c) { - osm_warn("controller %d not found\n", iop); - return -ENXIO; - } - - sb = c->status_block.virt; - - if (get_user(size, &user_msg[0])) - return -EFAULT; - size = size >> 16; - - if (size > sb->inbound_frame_size) { - osm_warn("size of message > inbound_frame_size"); - return -EFAULT; - } - - user_reply = &user_msg[size]; - - size <<= 2; // Convert to bytes - - msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET); - if (IS_ERR(msg)) - return PTR_ERR(msg); - - rcode = -EFAULT; - /* Copy in the user's I2O command */ - if (copy_from_user(msg, user_msg, size)) - goto out; - - if (get_user(reply_size, &user_reply[0]) < 0) - goto out; - - reply_size >>= 16; - reply_size <<= 2; - - reply = kzalloc(reply_size, GFP_KERNEL); - if (!reply) { - printk(KERN_WARNING "%s: Could not allocate reply buffer\n", - c->name); - rcode = -ENOMEM; - goto out; - } - - sg_offset = (msg->u.head[0] >> 4) & 0x0f; - - memset(sg_list, 0, sizeof(sg_list[0]) * SG_TABLESIZE); - if (sg_offset) { - struct sg_simple_element *sg; - struct i2o_dma *p; - - if (sg_offset * 4 >= size) { - rcode = -EFAULT; - goto cleanup; - } - // TODO 64bit fix - sg = (struct sg_simple_element *)((&msg->u.head[0]) + - sg_offset); - sg_count = - (size - sg_offset * 4) / sizeof(struct sg_simple_element); - if (sg_count > SG_TABLESIZE) { - printk(KERN_DEBUG "%s:IOCTL SG List too large (%u)\n", - c->name, sg_count); - rcode = -EINVAL; - goto cleanup; - } - - for (i = 0; i < sg_count; i++) { - int sg_size; - - if (!(sg[i].flag_count & 
0x10000000 - /*I2O_SGL_FLAGS_SIMPLE_ADDRESS_ELEMENT */ )) { - printk(KERN_DEBUG - "%s:Bad SG element %d - not simple (%x)\n", - c->name, i, sg[i].flag_count); - rcode = -EINVAL; - goto sg_list_cleanup; - } - sg_size = sg[i].flag_count & 0xffffff; - p = &(sg_list[sg_index]); - if (i2o_dma_alloc(&c->pdev->dev, p, sg_size)) { - /* Allocate memory for the transfer */ - printk(KERN_DEBUG - "%s: Could not allocate SG buffer - size = %d buffer number %d of %d\n", - c->name, sg_size, i, sg_count); - rcode = -ENOMEM; - goto sg_list_cleanup; - } - sg_index++; - /* Copy in the user's SG buffer if necessary */ - if (sg[i]. - flag_count & 0x04000000 /*I2O_SGL_FLAGS_DIR */ ) { - // TODO 64bit fix - if (copy_from_user - (p->virt, (void __user *)sg[i].addr_bus, - sg_size)) { - printk(KERN_DEBUG - "%s: Could not copy SG buf %d FROM user\n", - c->name, i); - rcode = -EFAULT; - goto sg_list_cleanup; - } - } - sg[i].addr_bus = p->phys; - } - } - - rcode = i2o_msg_post_wait(c, msg, 60); - msg = NULL; - if (rcode) { - reply[4] = ((u32) rcode) << 24; - goto sg_list_cleanup; - } - - if (sg_offset) { - u32 rmsg[I2O_OUTBOUND_MSG_FRAME_SIZE]; - /* Copy back the Scatter Gather buffers back to user space */ - u32 j; - // TODO 64bit fix - struct sg_simple_element *sg; - int sg_size; - - // re-acquire the original message to handle correctly the sg copy operation - memset(&rmsg, 0, I2O_OUTBOUND_MSG_FRAME_SIZE * 4); - // get user msg size in u32s - if (get_user(size, &user_msg[0])) { - rcode = -EFAULT; - goto sg_list_cleanup; - } - size = size >> 16; - size *= 4; - if (size > sizeof(rmsg)) { - rcode = -EFAULT; - goto sg_list_cleanup; - } - - /* Copy in the user's I2O command */ - if (copy_from_user(rmsg, user_msg, size)) { - rcode = -EFAULT; - goto sg_list_cleanup; - } - sg_count = - (size - sg_offset * 4) / sizeof(struct sg_simple_element); - - // TODO 64bit fix - sg = (struct sg_simple_element *)(rmsg + sg_offset); - for (j = 0; j < sg_count; j++) { - /* Copy out the SG list to user's buffer if necessary */ - if (! - (sg[j]. 
- flag_count & 0x4000000 /*I2O_SGL_FLAGS_DIR */ )) { - sg_size = sg[j].flag_count & 0xffffff; - // TODO 64bit fix - if (copy_to_user - ((void __user *)sg[j].addr_bus, sg_list[j].virt, - sg_size)) { - printk(KERN_WARNING - "%s: Could not copy %p TO user %x\n", - c->name, sg_list[j].virt, - sg[j].addr_bus); - rcode = -EFAULT; - goto sg_list_cleanup; - } - } - } - } - -sg_list_cleanup: - /* Copy back the reply to user space */ - if (reply_size) { - // we wrote our own values for context - now restore the user supplied ones - if (copy_from_user(reply + 2, user_msg + 2, sizeof(u32) * 2)) { - printk(KERN_WARNING - "%s: Could not copy message context FROM user\n", - c->name); - rcode = -EFAULT; - } - if (copy_to_user(user_reply, reply, reply_size)) { - printk(KERN_WARNING - "%s: Could not copy reply TO user\n", c->name); - rcode = -EFAULT; - } - } - - for (i = 0; i < sg_index; i++) - i2o_dma_free(&c->pdev->dev, &sg_list[i]); - -cleanup: - kfree(reply); -out: - if (msg) - i2o_msg_nop(c, msg); - return rcode; -} -#endif - -/* - * IOCTL Handler - */ -static long i2o_cfg_ioctl(struct file *fp, unsigned int cmd, unsigned long arg) -{ - int ret; - - mutex_lock(&i2o_cfg_mutex); - switch (cmd) { - case I2OGETIOPS: - ret = i2o_cfg_getiops(arg); - break; - - case I2OHRTGET: - ret = i2o_cfg_gethrt(arg); - break; - - case I2OLCTGET: - ret = i2o_cfg_getlct(arg); - break; - - case I2OPARMSET: - ret = i2o_cfg_parms(arg, I2OPARMSET); - break; - - case I2OPARMGET: - ret = i2o_cfg_parms(arg, I2OPARMGET); - break; - - case I2OSWDL: - ret = i2o_cfg_swdl(arg); - break; - - case I2OSWUL: - ret = i2o_cfg_swul(arg); - break; - - case I2OSWDEL: - ret = i2o_cfg_swdel(arg); - break; - - case I2OVALIDATE: - ret = i2o_cfg_validate(arg); - break; - - case I2OEVTREG: - ret = i2o_cfg_evt_reg(arg, fp); - break; - - case I2OEVTGET: - ret = i2o_cfg_evt_get(arg, fp); - break; - -#ifdef CONFIG_I2O_EXT_ADAPTEC - case I2OPASSTHRU: - ret = i2o_cfg_passthru(arg); - break; -#endif - - default: - osm_debug("unknown ioctl called!\n"); - ret = -EINVAL; - } - mutex_unlock(&i2o_cfg_mutex); - return ret; -} - -static int cfg_open(struct inode *inode, struct file *file) -{ - struct i2o_cfg_info *tmp = kmalloc(sizeof(struct i2o_cfg_info), - GFP_KERNEL); - unsigned long flags; - - if (!tmp) - return -ENOMEM; - - mutex_lock(&i2o_cfg_mutex); - file->private_data = (void *)(i2o_cfg_info_id++); - tmp->fp = file; - tmp->fasync = NULL; - tmp->q_id = (ulong) file->private_data; - tmp->q_len = 0; - tmp->q_in = 0; - tmp->q_out = 0; - tmp->q_lost = 0; - tmp->next = open_files; - - spin_lock_irqsave(&i2o_config_lock, flags); - open_files = tmp; - spin_unlock_irqrestore(&i2o_config_lock, flags); - mutex_unlock(&i2o_cfg_mutex); - - return 0; -} - -static int cfg_fasync(int fd, struct file *fp, int on) -{ - ulong id = (ulong) fp->private_data; - struct i2o_cfg_info *p; - int ret = -EBADF; - - mutex_lock(&i2o_cfg_mutex); - for (p = open_files; p; p = p->next) - if (p->q_id == id) - break; - - if (p) - ret = fasync_helper(fd, fp, on, &p->fasync); - mutex_unlock(&i2o_cfg_mutex); - return ret; -} - -static int cfg_release(struct inode *inode, struct file *file) -{ - ulong id = (ulong) file->private_data; - struct i2o_cfg_info *p, **q; - unsigned long flags; - - mutex_lock(&i2o_cfg_mutex); - spin_lock_irqsave(&i2o_config_lock, flags); - for (q = &open_files; (p = *q) != NULL; q = &p->next) { - if (p->q_id == id) { - *q = p->next; - kfree(p); - break; - } - } - spin_unlock_irqrestore(&i2o_config_lock, flags); - mutex_unlock(&i2o_cfg_mutex); - - return 0; -} - 
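[For reference, the character-device interface being deleted here was driven from user space through ioctls on /dev/i2octl. Below is a minimal sketch of an I2OPASSTHRU call, not code from this patch: the I2OPASSTHRU request number and the struct i2o_cmd_passthru layout are assumed to match the historical <linux/i2o-dev.h> (the handler itself was only built with CONFIG_I2O_EXT_ADAPTEC), and the message words are placeholders rather than a valid I2O command.]

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

/* Assumed to match the historical <linux/i2o-dev.h> definitions. */
struct i2o_cmd_passthru {
	unsigned int iop;	/* IOP unit number, looked up via i2o_find_iop() */
	void *msg;		/* message frame, followed by the reply buffer */
};
#define I2O_MAGIC_NUMBER	'i'
#define I2OPASSTHRU		_IOR(I2O_MAGIC_NUMBER, 12, struct i2o_cmd_passthru)

int main(void)
{
	/*
	 * One 8-u32 (32-byte) message frame with room for a 12-u32 reply
	 * behind it.  i2o_cfg_passthru() takes the frame size, in u32s,
	 * from the top 16 bits of word 0, and reads the reply size, again
	 * in u32s, from the word just past the frame.  The frame contents
	 * here are placeholders, not a real I2O command.
	 */
	unsigned int msg[8 + 12];
	struct i2o_cmd_passthru cmd = { .iop = 0, .msg = msg };
	int fd, rc;

	memset(msg, 0, sizeof(msg));
	msg[0] = 8 << 16;	/* frame size: 8 u32s */
	msg[8] = 12 << 16;	/* reply buffer size: 12 u32s */

	fd = open("/dev/i2octl", O_RDWR);	/* misc device registered just below */
	if (fd < 0) {
		perror("open");
		return 1;
	}

	rc = ioctl(fd, I2OPASSTHRU, &cmd);
	printf("I2OPASSTHRU returned %d\n", rc);
	close(fd);
	return 0;
}

[A 32-bit process on a 64-bit kernel reached the same handler through the I2OPASSTHRU32 compat path shown earlier in i2o_cfg_compat_ioctl().]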
-static const struct file_operations config_fops = { - .owner = THIS_MODULE, - .llseek = no_llseek, - .unlocked_ioctl = i2o_cfg_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = i2o_cfg_compat_ioctl, -#endif - .open = cfg_open, - .release = cfg_release, - .fasync = cfg_fasync, -}; - -static struct miscdevice i2o_miscdev = { - I2O_MINOR, - "i2octl", - &config_fops -}; - -static int __init i2o_config_old_init(void) -{ - spin_lock_init(&i2o_config_lock); - - if (misc_register(&i2o_miscdev) < 0) { - osm_err("can't register device.\n"); - return -EBUSY; - } - - return 0; -} - -static void i2o_config_old_exit(void) -{ - misc_deregister(&i2o_miscdev); -} - -MODULE_AUTHOR("Red Hat Software"); diff --git a/drivers/message/i2o/i2o_proc.c b/drivers/message/i2o/i2o_proc.c deleted file mode 100644 index b7d87cd227a9..000000000000 --- a/drivers/message/i2o/i2o_proc.c +++ /dev/null @@ -1,2045 +0,0 @@ -/* - * procfs handler for Linux I2O subsystem - * - * (c) Copyright 1999 Deepak Saxena - * - * Originally written by Deepak Saxena(deepak@plexity.net) - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * This is an initial test release. The code is based on the design of the - * ide procfs system (drivers/block/ide-proc.c). Some code taken from - * i2o-core module by Alan Cox. - * - * DISCLAIMER: This code is still under development/test and may cause - * your system to behave unpredictably. Use at your own discretion. - * - * - * Fixes/additions: - * Juha Sievänen (Juha.Sievanen@cs.Helsinki.FI), - * Auvo Häkkinen (Auvo.Hakkinen@cs.Helsinki.FI) - * University of Helsinki, Department of Computer Science - * LAN entries - * Markus Lidel - * Changes for new I2O API - */ - -#define OSM_NAME "proc-osm" -#define OSM_VERSION "1.316" -#define OSM_DESCRIPTION "I2O ProcFS OSM" - -#define I2O_MAX_MODULES 4 -// FIXME! -#define FMT_U64_HEX "0x%08x%08x" -#define U64_VAL(pu64) *((u32*)(pu64)+1), *((u32*)(pu64)) - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -/* Structure used to define /proc entries */ -typedef struct _i2o_proc_entry_t { - char *name; /* entry name */ - umode_t mode; /* mode */ - const struct file_operations *fops; /* open function */ -} i2o_proc_entry; - -/* global I2O /proc/i2o entry */ -static struct proc_dir_entry *i2o_proc_dir_root; - -/* proc OSM driver struct */ -static struct i2o_driver i2o_proc_driver = { - .name = OSM_NAME, -}; - -static int print_serial_number(struct seq_file *seq, u8 * serialno, int max_len) -{ - int i; - - /* 19990419 -sralston - * The I2O v1.5 (and v2.0 so far) "official specification" - * got serial numbers WRONG! - * Apparently, and despite what Section 3.4.4 says and - * Figure 3-35 shows (pg 3-39 in the pdf doc), - * the convention / consensus seems to be: - * + First byte is SNFormat - * + Second byte is SNLen (but only if SNFormat==7 (?)) - * + (v2.0) SCSI+BS may use IEEE Registered (64 or 128 bit) format - */ - switch (serialno[0]) { - case I2O_SNFORMAT_BINARY: /* Binary */ - seq_printf(seq, "0x"); - for (i = 0; i < serialno[1]; i++) { - seq_printf(seq, "%02X", serialno[2 + i]); - } - break; - - case I2O_SNFORMAT_ASCII: /* ASCII */ - if (serialno[1] < ' ') { /* printable or SNLen? */ - /* sanity */ - max_len = - (max_len < serialno[1]) ? 
max_len : serialno[1]; - serialno[1 + max_len] = '\0'; - - /* just print it */ - seq_printf(seq, "%s", &serialno[2]); - } else { - /* print chars for specified length */ - for (i = 0; i < serialno[1]; i++) { - seq_printf(seq, "%c", serialno[2 + i]); - } - } - break; - - case I2O_SNFORMAT_UNICODE: /* UNICODE */ - seq_printf(seq, "UNICODE Format. Can't Display\n"); - break; - - case I2O_SNFORMAT_LAN48_MAC: /* LAN-48 MAC Address */ - seq_printf(seq, "LAN-48 MAC address @ %pM", &serialno[2]); - break; - - case I2O_SNFORMAT_WAN: /* WAN MAC Address */ - /* FIXME: Figure out what a WAN access address looks like?? */ - seq_printf(seq, "WAN Access Address"); - break; - -/* plus new in v2.0 */ - case I2O_SNFORMAT_LAN64_MAC: /* LAN-64 MAC Address */ - /* FIXME: Figure out what a LAN-64 address really looks like?? */ - seq_printf(seq, - "LAN-64 MAC address @ [?:%02X:%02X:?] %pM", - serialno[8], serialno[9], &serialno[2]); - break; - - case I2O_SNFORMAT_DDM: /* I2O DDM */ - seq_printf(seq, - "DDM: Tid=%03Xh, Rsvd=%04Xh, OrgId=%04Xh", - *(u16 *) & serialno[2], - *(u16 *) & serialno[4], *(u16 *) & serialno[6]); - break; - - case I2O_SNFORMAT_IEEE_REG64: /* IEEE Registered (64-bit) */ - case I2O_SNFORMAT_IEEE_REG128: /* IEEE Registered (128-bit) */ - /* FIXME: Figure if this is even close?? */ - seq_printf(seq, - "IEEE NodeName(hi,lo)=(%08Xh:%08Xh), PortName(hi,lo)=(%08Xh:%08Xh)\n", - *(u32 *) & serialno[2], - *(u32 *) & serialno[6], - *(u32 *) & serialno[10], *(u32 *) & serialno[14]); - break; - - case I2O_SNFORMAT_UNKNOWN: /* Unknown 0 */ - case I2O_SNFORMAT_UNKNOWN2: /* Unknown 0xff */ - default: - seq_printf(seq, "Unknown data format (0x%02x)", serialno[0]); - break; - } - - return 0; -} - -/** - * i2o_get_class_name - do i2o class name lookup - * @class: class number - * - * Return a descriptive string for an i2o class. 
- */ -static const char *i2o_get_class_name(int class) -{ - int idx = 16; - static char *i2o_class_name[] = { - "Executive", - "Device Driver Module", - "Block Device", - "Tape Device", - "LAN Interface", - "WAN Interface", - "Fibre Channel Port", - "Fibre Channel Device", - "SCSI Device", - "ATE Port", - "ATE Device", - "Floppy Controller", - "Floppy Device", - "Secondary Bus Port", - "Peer Transport Agent", - "Peer Transport", - "Unknown" - }; - - switch (class & 0xfff) { - case I2O_CLASS_EXECUTIVE: - idx = 0; - break; - case I2O_CLASS_DDM: - idx = 1; - break; - case I2O_CLASS_RANDOM_BLOCK_STORAGE: - idx = 2; - break; - case I2O_CLASS_SEQUENTIAL_STORAGE: - idx = 3; - break; - case I2O_CLASS_LAN: - idx = 4; - break; - case I2O_CLASS_WAN: - idx = 5; - break; - case I2O_CLASS_FIBRE_CHANNEL_PORT: - idx = 6; - break; - case I2O_CLASS_FIBRE_CHANNEL_PERIPHERAL: - idx = 7; - break; - case I2O_CLASS_SCSI_PERIPHERAL: - idx = 8; - break; - case I2O_CLASS_ATE_PORT: - idx = 9; - break; - case I2O_CLASS_ATE_PERIPHERAL: - idx = 10; - break; - case I2O_CLASS_FLOPPY_CONTROLLER: - idx = 11; - break; - case I2O_CLASS_FLOPPY_DEVICE: - idx = 12; - break; - case I2O_CLASS_BUS_ADAPTER: - idx = 13; - break; - case I2O_CLASS_PEER_TRANSPORT_AGENT: - idx = 14; - break; - case I2O_CLASS_PEER_TRANSPORT: - idx = 15; - break; - } - - return i2o_class_name[idx]; -} - -#define SCSI_TABLE_SIZE 13 -static char *scsi_devices[] = { - "Direct-Access Read/Write", - "Sequential-Access Storage", - "Printer", - "Processor", - "WORM Device", - "CD-ROM Device", - "Scanner Device", - "Optical Memory Device", - "Medium Changer Device", - "Communications Device", - "Graphics Art Pre-Press Device", - "Graphics Art Pre-Press Device", - "Array Controller Device" -}; - -static char *chtostr(char *tmp, u8 *chars, int n) -{ - tmp[0] = 0; - return strncat(tmp, (char *)chars, n); -} - -static int i2o_report_query_status(struct seq_file *seq, int block_status, - char *group) -{ - switch (block_status) { - case -ETIMEDOUT: - return seq_printf(seq, "Timeout reading group %s.\n", group); - case -ENOMEM: - return seq_printf(seq, "No free memory to read the table.\n"); - case -I2O_PARAMS_STATUS_INVALID_GROUP_ID: - return seq_printf(seq, "Group %s not supported.\n", group); - default: - return seq_printf(seq, - "Error reading group %s. BlockStatus 0x%02X\n", - group, -block_status); - } -} - -static char *bus_strings[] = { - "Local Bus", - "ISA", - "EISA", - "PCI", - "PCMCIA", - "NUBUS", - "CARDBUS" -}; - -static int i2o_seq_show_hrt(struct seq_file *seq, void *v) -{ - struct i2o_controller *c = (struct i2o_controller *)seq->private; - i2o_hrt *hrt = (i2o_hrt *) c->hrt.virt; - u32 bus; - int i; - - if (hrt->hrt_version) { - seq_printf(seq, - "HRT table for controller is too new a version.\n"); - return 0; - } - - seq_printf(seq, "HRT has %d entries of %d bytes each.\n", - hrt->num_entries, hrt->entry_len << 2); - - for (i = 0; i < hrt->num_entries; i++) { - seq_printf(seq, "Entry %d:\n", i); - seq_printf(seq, " Adapter ID: %0#10x\n", - hrt->hrt_entry[i].adapter_id); - seq_printf(seq, " Controlling tid: %0#6x\n", - hrt->hrt_entry[i].parent_tid); - - if (hrt->hrt_entry[i].bus_type != 0x80) { - bus = hrt->hrt_entry[i].bus_type; - seq_printf(seq, " %s Information\n", - bus_strings[bus]); - - switch (bus) { - case I2O_BUS_LOCAL: - seq_printf(seq, " IOBase: %0#6x,", - hrt->hrt_entry[i].bus.local_bus. - LbBaseIOPort); - seq_printf(seq, " MemoryBase: %0#10x\n", - hrt->hrt_entry[i].bus.local_bus. 
- LbBaseMemoryAddress); - break; - - case I2O_BUS_ISA: - seq_printf(seq, " IOBase: %0#6x,", - hrt->hrt_entry[i].bus.isa_bus. - IsaBaseIOPort); - seq_printf(seq, " MemoryBase: %0#10x,", - hrt->hrt_entry[i].bus.isa_bus. - IsaBaseMemoryAddress); - seq_printf(seq, " CSN: %0#4x,", - hrt->hrt_entry[i].bus.isa_bus.CSN); - break; - - case I2O_BUS_EISA: - seq_printf(seq, " IOBase: %0#6x,", - hrt->hrt_entry[i].bus.eisa_bus. - EisaBaseIOPort); - seq_printf(seq, " MemoryBase: %0#10x,", - hrt->hrt_entry[i].bus.eisa_bus. - EisaBaseMemoryAddress); - seq_printf(seq, " Slot: %0#4x,", - hrt->hrt_entry[i].bus.eisa_bus. - EisaSlotNumber); - break; - - case I2O_BUS_PCI: - seq_printf(seq, " Bus: %0#4x", - hrt->hrt_entry[i].bus.pci_bus. - PciBusNumber); - seq_printf(seq, " Dev: %0#4x", - hrt->hrt_entry[i].bus.pci_bus. - PciDeviceNumber); - seq_printf(seq, " Func: %0#4x", - hrt->hrt_entry[i].bus.pci_bus. - PciFunctionNumber); - seq_printf(seq, " Vendor: %0#6x", - hrt->hrt_entry[i].bus.pci_bus. - PciVendorID); - seq_printf(seq, " Device: %0#6x\n", - hrt->hrt_entry[i].bus.pci_bus. - PciDeviceID); - break; - - default: - seq_printf(seq, " Unsupported Bus Type\n"); - } - } else - seq_printf(seq, " Unknown Bus Type\n"); - } - - return 0; -} - -static int i2o_seq_show_lct(struct seq_file *seq, void *v) -{ - struct i2o_controller *c = (struct i2o_controller *)seq->private; - i2o_lct *lct = (i2o_lct *) c->lct; - int entries; - int i; - -#define BUS_TABLE_SIZE 3 - static char *bus_ports[] = { - "Generic Bus", - "SCSI Bus", - "Fibre Channel Bus" - }; - - entries = (lct->table_size - 3) / 9; - - seq_printf(seq, "LCT contains %d %s\n", entries, - entries == 1 ? "entry" : "entries"); - if (lct->boot_tid) - seq_printf(seq, "Boot Device @ ID %d\n", lct->boot_tid); - - seq_printf(seq, "Current Change Indicator: %#10x\n", lct->change_ind); - - for (i = 0; i < entries; i++) { - seq_printf(seq, "Entry %d\n", i); - seq_printf(seq, " Class, SubClass : %s", - i2o_get_class_name(lct->lct_entry[i].class_id)); - - /* - * Classes which we'll print subclass info for - */ - switch (lct->lct_entry[i].class_id & 0xFFF) { - case I2O_CLASS_RANDOM_BLOCK_STORAGE: - switch (lct->lct_entry[i].sub_class) { - case 0x00: - seq_printf(seq, ", Direct-Access Read/Write"); - break; - - case 0x04: - seq_printf(seq, ", WORM Drive"); - break; - - case 0x05: - seq_printf(seq, ", CD-ROM Drive"); - break; - - case 0x07: - seq_printf(seq, ", Optical Memory Device"); - break; - - default: - seq_printf(seq, ", Unknown (0x%02x)", - lct->lct_entry[i].sub_class); - break; - } - break; - - case I2O_CLASS_LAN: - switch (lct->lct_entry[i].sub_class & 0xFF) { - case 0x30: - seq_printf(seq, ", Ethernet"); - break; - - case 0x40: - seq_printf(seq, ", 100base VG"); - break; - - case 0x50: - seq_printf(seq, ", IEEE 802.5/Token-Ring"); - break; - - case 0x60: - seq_printf(seq, ", ANSI X3T9.5 FDDI"); - break; - - case 0x70: - seq_printf(seq, ", Fibre Channel"); - break; - - default: - seq_printf(seq, ", Unknown Sub-Class (0x%02x)", - lct->lct_entry[i].sub_class & 0xFF); - break; - } - break; - - case I2O_CLASS_SCSI_PERIPHERAL: - if (lct->lct_entry[i].sub_class < SCSI_TABLE_SIZE) - seq_printf(seq, ", %s", - scsi_devices[lct->lct_entry[i]. - sub_class]); - else - seq_printf(seq, ", Unknown Device Type"); - break; - - case I2O_CLASS_BUS_ADAPTER: - if (lct->lct_entry[i].sub_class < BUS_TABLE_SIZE) - seq_printf(seq, ", %s", - bus_ports[lct->lct_entry[i]. 
- sub_class]); - else - seq_printf(seq, ", Unknown Bus Type"); - break; - } - seq_printf(seq, "\n"); - - seq_printf(seq, " Local TID : 0x%03x\n", - lct->lct_entry[i].tid); - seq_printf(seq, " User TID : 0x%03x\n", - lct->lct_entry[i].user_tid); - seq_printf(seq, " Parent TID : 0x%03x\n", - lct->lct_entry[i].parent_tid); - seq_printf(seq, " Identity Tag : 0x%x%x%x%x%x%x%x%x\n", - lct->lct_entry[i].identity_tag[0], - lct->lct_entry[i].identity_tag[1], - lct->lct_entry[i].identity_tag[2], - lct->lct_entry[i].identity_tag[3], - lct->lct_entry[i].identity_tag[4], - lct->lct_entry[i].identity_tag[5], - lct->lct_entry[i].identity_tag[6], - lct->lct_entry[i].identity_tag[7]); - seq_printf(seq, " Change Indicator : %0#10x\n", - lct->lct_entry[i].change_ind); - seq_printf(seq, " Event Capab Mask : %0#10x\n", - lct->lct_entry[i].device_flags); - } - - return 0; -} - -static int i2o_seq_show_status(struct seq_file *seq, void *v) -{ - struct i2o_controller *c = (struct i2o_controller *)seq->private; - char prodstr[25]; - int version; - i2o_status_block *sb = c->status_block.virt; - - i2o_status_get(c); // reread the status block - - seq_printf(seq, "Organization ID : %0#6x\n", sb->org_id); - - version = sb->i2o_version; - -/* FIXME for Spec 2.0 - if (version == 0x02) { - seq_printf(seq, "Lowest I2O version supported: "); - switch(workspace[2]) { - case 0x00: - seq_printf(seq, "1.0\n"); - break; - case 0x01: - seq_printf(seq, "1.5\n"); - break; - case 0x02: - seq_printf(seq, "2.0\n"); - break; - } - - seq_printf(seq, "Highest I2O version supported: "); - switch(workspace[3]) { - case 0x00: - seq_printf(seq, "1.0\n"); - break; - case 0x01: - seq_printf(seq, "1.5\n"); - break; - case 0x02: - seq_printf(seq, "2.0\n"); - break; - } - } -*/ - seq_printf(seq, "IOP ID : %0#5x\n", sb->iop_id); - seq_printf(seq, "Host Unit ID : %0#6x\n", sb->host_unit_id); - seq_printf(seq, "Segment Number : %0#5x\n", sb->segment_number); - - seq_printf(seq, "I2O version : "); - switch (version) { - case 0x00: - seq_printf(seq, "1.0\n"); - break; - case 0x01: - seq_printf(seq, "1.5\n"); - break; - case 0x02: - seq_printf(seq, "2.0\n"); - break; - default: - seq_printf(seq, "Unknown version\n"); - } - - seq_printf(seq, "IOP State : "); - switch (sb->iop_state) { - case 0x01: - seq_printf(seq, "INIT\n"); - break; - - case 0x02: - seq_printf(seq, "RESET\n"); - break; - - case 0x04: - seq_printf(seq, "HOLD\n"); - break; - - case 0x05: - seq_printf(seq, "READY\n"); - break; - - case 0x08: - seq_printf(seq, "OPERATIONAL\n"); - break; - - case 0x10: - seq_printf(seq, "FAILED\n"); - break; - - case 0x11: - seq_printf(seq, "FAULTED\n"); - break; - - default: - seq_printf(seq, "Unknown\n"); - break; - } - - seq_printf(seq, "Messenger Type : "); - switch (sb->msg_type) { - case 0x00: - seq_printf(seq, "Memory mapped\n"); - break; - case 0x01: - seq_printf(seq, "Memory mapped only\n"); - break; - case 0x02: - seq_printf(seq, "Remote only\n"); - break; - case 0x03: - seq_printf(seq, "Memory mapped and remote\n"); - break; - default: - seq_printf(seq, "Unknown\n"); - } - - seq_printf(seq, "Inbound Frame Size : %d bytes\n", - sb->inbound_frame_size << 2); - seq_printf(seq, "Max Inbound Frames : %d\n", - sb->max_inbound_frames); - seq_printf(seq, "Current Inbound Frames : %d\n", - sb->cur_inbound_frames); - seq_printf(seq, "Max Outbound Frames : %d\n", - sb->max_outbound_frames); - - /* Spec doesn't say if NULL terminated or not... 
*/ - memcpy(prodstr, sb->product_id, 24); - prodstr[24] = '\0'; - seq_printf(seq, "Product ID : %s\n", prodstr); - seq_printf(seq, "Expected LCT Size : %d bytes\n", - sb->expected_lct_size); - - seq_printf(seq, "IOP Capabilities\n"); - seq_printf(seq, " Context Field Size Support : "); - switch (sb->iop_capabilities & 0x0000003) { - case 0: - seq_printf(seq, "Supports only 32-bit context fields\n"); - break; - case 1: - seq_printf(seq, "Supports only 64-bit context fields\n"); - break; - case 2: - seq_printf(seq, "Supports 32-bit and 64-bit context fields, " - "but not concurrently\n"); - break; - case 3: - seq_printf(seq, "Supports 32-bit and 64-bit context fields " - "concurrently\n"); - break; - default: - seq_printf(seq, "0x%08x\n", sb->iop_capabilities); - } - seq_printf(seq, " Current Context Field Size : "); - switch (sb->iop_capabilities & 0x0000000C) { - case 0: - seq_printf(seq, "not configured\n"); - break; - case 4: - seq_printf(seq, "Supports only 32-bit context fields\n"); - break; - case 8: - seq_printf(seq, "Supports only 64-bit context fields\n"); - break; - case 12: - seq_printf(seq, "Supports both 32-bit or 64-bit context fields " - "concurrently\n"); - break; - default: - seq_printf(seq, "\n"); - } - seq_printf(seq, " Inbound Peer Support : %s\n", - (sb-> - iop_capabilities & 0x00000010) ? "Supported" : - "Not supported"); - seq_printf(seq, " Outbound Peer Support : %s\n", - (sb-> - iop_capabilities & 0x00000020) ? "Supported" : - "Not supported"); - seq_printf(seq, " Peer to Peer Support : %s\n", - (sb-> - iop_capabilities & 0x00000040) ? "Supported" : - "Not supported"); - - seq_printf(seq, "Desired private memory size : %d kB\n", - sb->desired_mem_size >> 10); - seq_printf(seq, "Allocated private memory size : %d kB\n", - sb->current_mem_size >> 10); - seq_printf(seq, "Private memory base address : %0#10x\n", - sb->current_mem_base); - seq_printf(seq, "Desired private I/O size : %d kB\n", - sb->desired_io_size >> 10); - seq_printf(seq, "Allocated private I/O size : %d kB\n", - sb->current_io_size >> 10); - seq_printf(seq, "Private I/O base address : %0#10x\n", - sb->current_io_base); - - return 0; -} - -static int i2o_seq_show_hw(struct seq_file *seq, void *v) -{ - struct i2o_controller *c = (struct i2o_controller *)seq->private; - static u32 work32[5]; - static u8 *work8 = (u8 *) work32; - static u16 *work16 = (u16 *) work32; - int token; - u32 hwcap; - - static char *cpu_table[] = { - "Intel 80960 series", - "AMD2900 series", - "Motorola 68000 series", - "ARM series", - "MIPS series", - "Sparc series", - "PowerPC series", - "Intel x86 series" - }; - - token = - i2o_parm_field_get(c->exec, 0x0000, -1, &work32, sizeof(work32)); - - if (token < 0) { - i2o_report_query_status(seq, token, "0x0000 IOP Hardware"); - return 0; - } - - seq_printf(seq, "I2O Vendor ID : %0#6x\n", work16[0]); - seq_printf(seq, "Product ID : %0#6x\n", work16[1]); - seq_printf(seq, "CPU : "); - if (work8[16] > 8) - seq_printf(seq, "Unknown\n"); - else - seq_printf(seq, "%s\n", cpu_table[work8[16]]); - /* Anyone using ProcessorVersion? */ - - seq_printf(seq, "RAM : %dkB\n", work32[1] >> 10); - seq_printf(seq, "Non-Volatile Mem : %dkB\n", work32[2] >> 10); - - hwcap = work32[3]; - seq_printf(seq, "Capabilities : 0x%08x\n", hwcap); - seq_printf(seq, " [%s] Self booting\n", - (hwcap & 0x00000001) ? "+" : "-"); - seq_printf(seq, " [%s] Upgradable IRTOS\n", - (hwcap & 0x00000002) ? "+" : "-"); - seq_printf(seq, " [%s] Supports downloading DDMs\n", - (hwcap & 0x00000004) ? 
"+" : "-"); - seq_printf(seq, " [%s] Supports installing DDMs\n", - (hwcap & 0x00000008) ? "+" : "-"); - seq_printf(seq, " [%s] Battery-backed RAM\n", - (hwcap & 0x00000010) ? "+" : "-"); - - return 0; -} - -/* Executive group 0003h - Executing DDM List (table) */ -static int i2o_seq_show_ddm_table(struct seq_file *seq, void *v) -{ - struct i2o_controller *c = (struct i2o_controller *)seq->private; - int token; - int i; - - typedef struct _i2o_exec_execute_ddm_table { - u16 ddm_tid; - u8 module_type; - u8 reserved; - u16 i2o_vendor_id; - u16 module_id; - u8 module_name_version[28]; - u32 data_size; - u32 code_size; - } i2o_exec_execute_ddm_table; - - struct { - u16 result_count; - u16 pad; - u16 block_size; - u8 block_status; - u8 error_info_size; - u16 row_count; - u16 more_flag; - i2o_exec_execute_ddm_table ddm_table[I2O_MAX_MODULES]; - } *result; - - i2o_exec_execute_ddm_table ddm_table; - char tmp[28 + 1]; - - result = kmalloc(sizeof(*result), GFP_KERNEL); - if (!result) - return -ENOMEM; - - token = i2o_parm_table_get(c->exec, I2O_PARAMS_TABLE_GET, 0x0003, -1, - NULL, 0, result, sizeof(*result)); - - if (token < 0) { - i2o_report_query_status(seq, token, - "0x0003 Executing DDM List"); - goto out; - } - - seq_printf(seq, - "Tid Module_type Vendor Mod_id Module_name Vrs Data_size Code_size\n"); - ddm_table = result->ddm_table[0]; - - for (i = 0; i < result->row_count; ddm_table = result->ddm_table[++i]) { - seq_printf(seq, "0x%03x ", ddm_table.ddm_tid & 0xFFF); - - switch (ddm_table.module_type) { - case 0x01: - seq_printf(seq, "Downloaded DDM "); - break; - case 0x22: - seq_printf(seq, "Embedded DDM "); - break; - default: - seq_printf(seq, " "); - } - - seq_printf(seq, "%-#7x", ddm_table.i2o_vendor_id); - seq_printf(seq, "%-#8x", ddm_table.module_id); - seq_printf(seq, "%-29s", - chtostr(tmp, ddm_table.module_name_version, 28)); - seq_printf(seq, "%9d ", ddm_table.data_size); - seq_printf(seq, "%8d", ddm_table.code_size); - - seq_printf(seq, "\n"); - } - out: - kfree(result); - return 0; -} - -/* Executive group 0004h - Driver Store (scalar) */ -static int i2o_seq_show_driver_store(struct seq_file *seq, void *v) -{ - struct i2o_controller *c = (struct i2o_controller *)seq->private; - u32 work32[8]; - int token; - - token = - i2o_parm_field_get(c->exec, 0x0004, -1, &work32, sizeof(work32)); - if (token < 0) { - i2o_report_query_status(seq, token, "0x0004 Driver Store"); - return 0; - } - - seq_printf(seq, "Module limit : %d\n" - "Module count : %d\n" - "Current space : %d kB\n" - "Free space : %d kB\n", - work32[0], work32[1], work32[2] >> 10, work32[3] >> 10); - - return 0; -} - -/* Executive group 0005h - Driver Store Table (table) */ -static int i2o_seq_show_drivers_stored(struct seq_file *seq, void *v) -{ - typedef struct _i2o_driver_store { - u16 stored_ddm_index; - u8 module_type; - u8 reserved; - u16 i2o_vendor_id; - u16 module_id; - u8 module_name_version[28]; - u8 date[8]; - u32 module_size; - u32 mpb_size; - u32 module_flags; - } i2o_driver_store_table; - - struct i2o_controller *c = (struct i2o_controller *)seq->private; - int token; - int i; - - typedef struct { - u16 result_count; - u16 pad; - u16 block_size; - u8 block_status; - u8 error_info_size; - u16 row_count; - u16 more_flag; - i2o_driver_store_table dst[I2O_MAX_MODULES]; - } i2o_driver_result_table; - - i2o_driver_result_table *result; - i2o_driver_store_table *dst; - char tmp[28 + 1]; - - result = kmalloc(sizeof(i2o_driver_result_table), GFP_KERNEL); - if (result == NULL) - return -ENOMEM; - - token = 
i2o_parm_table_get(c->exec, I2O_PARAMS_TABLE_GET, 0x0005, -1, - NULL, 0, result, sizeof(*result)); - - if (token < 0) { - i2o_report_query_status(seq, token, - "0x0005 DRIVER STORE TABLE"); - kfree(result); - return 0; - } - - seq_printf(seq, - "# Module_type Vendor Mod_id Module_name Vrs" - "Date Mod_size Par_size Flags\n"); - for (i = 0, dst = &result->dst[0]; i < result->row_count; - dst = &result->dst[++i]) { - seq_printf(seq, "%-3d", dst->stored_ddm_index); - switch (dst->module_type) { - case 0x01: - seq_printf(seq, "Downloaded DDM "); - break; - case 0x22: - seq_printf(seq, "Embedded DDM "); - break; - default: - seq_printf(seq, " "); - } - - seq_printf(seq, "%-#7x", dst->i2o_vendor_id); - seq_printf(seq, "%-#8x", dst->module_id); - seq_printf(seq, "%-29s", - chtostr(tmp, dst->module_name_version, 28)); - seq_printf(seq, "%-9s", chtostr(tmp, dst->date, 8)); - seq_printf(seq, "%8d ", dst->module_size); - seq_printf(seq, "%8d ", dst->mpb_size); - seq_printf(seq, "0x%04x", dst->module_flags); - seq_printf(seq, "\n"); - } - - kfree(result); - return 0; -} - -/* Generic group F000h - Params Descriptor (table) */ -static int i2o_seq_show_groups(struct seq_file *seq, void *v) -{ - struct i2o_device *d = (struct i2o_device *)seq->private; - int token; - int i; - u8 properties; - - typedef struct _i2o_group_info { - u16 group_number; - u16 field_count; - u16 row_count; - u8 properties; - u8 reserved; - } i2o_group_info; - - struct { - u16 result_count; - u16 pad; - u16 block_size; - u8 block_status; - u8 error_info_size; - u16 row_count; - u16 more_flag; - i2o_group_info group[256]; - } *result; - - result = kmalloc(sizeof(*result), GFP_KERNEL); - if (!result) - return -ENOMEM; - - token = i2o_parm_table_get(d, I2O_PARAMS_TABLE_GET, 0xF000, -1, NULL, 0, - result, sizeof(*result)); - - if (token < 0) { - i2o_report_query_status(seq, token, "0xF000 Params Descriptor"); - goto out; - } - - seq_printf(seq, - "# Group FieldCount RowCount Type Add Del Clear\n"); - - for (i = 0; i < result->row_count; i++) { - seq_printf(seq, "%-3d", i); - seq_printf(seq, "0x%04X ", result->group[i].group_number); - seq_printf(seq, "%10d ", result->group[i].field_count); - seq_printf(seq, "%8d ", result->group[i].row_count); - - properties = result->group[i].properties; - if (properties & 0x1) - seq_printf(seq, "Table "); - else - seq_printf(seq, "Scalar "); - if (properties & 0x2) - seq_printf(seq, " + "); - else - seq_printf(seq, " - "); - if (properties & 0x4) - seq_printf(seq, " + "); - else - seq_printf(seq, " - "); - if (properties & 0x8) - seq_printf(seq, " + "); - else - seq_printf(seq, " - "); - - seq_printf(seq, "\n"); - } - - if (result->more_flag) - seq_printf(seq, "There is more...\n"); - out: - kfree(result); - return 0; -} - -/* Generic group F001h - Physical Device Table (table) */ -static int i2o_seq_show_phys_device(struct seq_file *seq, void *v) -{ - struct i2o_device *d = (struct i2o_device *)seq->private; - int token; - int i; - - struct { - u16 result_count; - u16 pad; - u16 block_size; - u8 block_status; - u8 error_info_size; - u16 row_count; - u16 more_flag; - u32 adapter_id[64]; - } result; - - token = i2o_parm_table_get(d, I2O_PARAMS_TABLE_GET, 0xF001, -1, NULL, 0, - &result, sizeof(result)); - - if (token < 0) { - i2o_report_query_status(seq, token, - "0xF001 Physical Device Table"); - return 0; - } - - if (result.row_count) - seq_printf(seq, "# AdapterId\n"); - - for (i = 0; i < result.row_count; i++) { - seq_printf(seq, "%-2d", i); - seq_printf(seq, "%#7x\n", result.adapter_id[i]); - } 
- - if (result.more_flag) - seq_printf(seq, "There is more...\n"); - - return 0; -} - -/* Generic group F002h - Claimed Table (table) */ -static int i2o_seq_show_claimed(struct seq_file *seq, void *v) -{ - struct i2o_device *d = (struct i2o_device *)seq->private; - int token; - int i; - - struct { - u16 result_count; - u16 pad; - u16 block_size; - u8 block_status; - u8 error_info_size; - u16 row_count; - u16 more_flag; - u16 claimed_tid[64]; - } result; - - token = i2o_parm_table_get(d, I2O_PARAMS_TABLE_GET, 0xF002, -1, NULL, 0, - &result, sizeof(result)); - - if (token < 0) { - i2o_report_query_status(seq, token, "0xF002 Claimed Table"); - return 0; - } - - if (result.row_count) - seq_printf(seq, "# ClaimedTid\n"); - - for (i = 0; i < result.row_count; i++) { - seq_printf(seq, "%-2d", i); - seq_printf(seq, "%#7x\n", result.claimed_tid[i]); - } - - if (result.more_flag) - seq_printf(seq, "There is more...\n"); - - return 0; -} - -/* Generic group F003h - User Table (table) */ -static int i2o_seq_show_users(struct seq_file *seq, void *v) -{ - struct i2o_device *d = (struct i2o_device *)seq->private; - int token; - int i; - - typedef struct _i2o_user_table { - u16 instance; - u16 user_tid; - u8 claim_type; - u8 reserved1; - u16 reserved2; - } i2o_user_table; - - struct { - u16 result_count; - u16 pad; - u16 block_size; - u8 block_status; - u8 error_info_size; - u16 row_count; - u16 more_flag; - i2o_user_table user[64]; - } *result; - - result = kmalloc(sizeof(*result), GFP_KERNEL); - if (!result) - return -ENOMEM; - - token = i2o_parm_table_get(d, I2O_PARAMS_TABLE_GET, 0xF003, -1, NULL, 0, - result, sizeof(*result)); - - if (token < 0) { - i2o_report_query_status(seq, token, "0xF003 User Table"); - goto out; - } - - seq_printf(seq, "# Instance UserTid ClaimType\n"); - - for (i = 0; i < result->row_count; i++) { - seq_printf(seq, "%-3d", i); - seq_printf(seq, "%#8x ", result->user[i].instance); - seq_printf(seq, "%#7x ", result->user[i].user_tid); - seq_printf(seq, "%#9x\n", result->user[i].claim_type); - } - - if (result->more_flag) - seq_printf(seq, "There is more...\n"); - out: - kfree(result); - return 0; -} - -/* Generic group F005h - Private message extensions (table) (optional) */ -static int i2o_seq_show_priv_msgs(struct seq_file *seq, void *v) -{ - struct i2o_device *d = (struct i2o_device *)seq->private; - int token; - int i; - - typedef struct _i2o_private { - u16 ext_instance; - u16 organization_id; - u16 x_function_code; - } i2o_private; - - struct { - u16 result_count; - u16 pad; - u16 block_size; - u8 block_status; - u8 error_info_size; - u16 row_count; - u16 more_flag; - i2o_private extension[64]; - } result; - - token = i2o_parm_table_get(d, I2O_PARAMS_TABLE_GET, 0xF000, -1, NULL, 0, - &result, sizeof(result)); - - if (token < 0) { - i2o_report_query_status(seq, token, - "0xF005 Private Message Extensions (optional)"); - return 0; - } - - seq_printf(seq, "Instance# OrgId FunctionCode\n"); - - for (i = 0; i < result.row_count; i++) { - seq_printf(seq, "%0#9x ", result.extension[i].ext_instance); - seq_printf(seq, "%0#6x ", result.extension[i].organization_id); - seq_printf(seq, "%0#6x", result.extension[i].x_function_code); - - seq_printf(seq, "\n"); - } - - if (result.more_flag) - seq_printf(seq, "There is more...\n"); - - return 0; -} - -/* Generic group F006h - Authorized User Table (table) */ -static int i2o_seq_show_authorized_users(struct seq_file *seq, void *v) -{ - struct i2o_device *d = (struct i2o_device *)seq->private; - int token; - int i; - - struct { - u16 
result_count; - u16 pad; - u16 block_size; - u8 block_status; - u8 error_info_size; - u16 row_count; - u16 more_flag; - u32 alternate_tid[64]; - } result; - - token = i2o_parm_table_get(d, I2O_PARAMS_TABLE_GET, 0xF006, -1, NULL, 0, - &result, sizeof(result)); - - if (token < 0) { - i2o_report_query_status(seq, token, - "0xF006 Autohorized User Table"); - return 0; - } - - if (result.row_count) - seq_printf(seq, "# AlternateTid\n"); - - for (i = 0; i < result.row_count; i++) { - seq_printf(seq, "%-2d", i); - seq_printf(seq, "%#7x ", result.alternate_tid[i]); - } - - if (result.more_flag) - seq_printf(seq, "There is more...\n"); - - return 0; -} - -/* Generic group F100h - Device Identity (scalar) */ -static int i2o_seq_show_dev_identity(struct seq_file *seq, void *v) -{ - struct i2o_device *d = (struct i2o_device *)seq->private; - static u32 work32[128]; // allow for "stuff" + up to 256 byte (max) serial number - // == (allow) 512d bytes (max) - static u16 *work16 = (u16 *) work32; - int token; - char tmp[16 + 1]; - - token = i2o_parm_field_get(d, 0xF100, -1, &work32, sizeof(work32)); - - if (token < 0) { - i2o_report_query_status(seq, token, "0xF100 Device Identity"); - return 0; - } - - seq_printf(seq, "Device Class : %s\n", i2o_get_class_name(work16[0])); - seq_printf(seq, "Owner TID : %0#5x\n", work16[2]); - seq_printf(seq, "Parent TID : %0#5x\n", work16[3]); - seq_printf(seq, "Vendor info : %s\n", - chtostr(tmp, (u8 *) (work32 + 2), 16)); - seq_printf(seq, "Product info : %s\n", - chtostr(tmp, (u8 *) (work32 + 6), 16)); - seq_printf(seq, "Description : %s\n", - chtostr(tmp, (u8 *) (work32 + 10), 16)); - seq_printf(seq, "Product rev. : %s\n", - chtostr(tmp, (u8 *) (work32 + 14), 8)); - - seq_printf(seq, "Serial number : "); - print_serial_number(seq, (u8 *) (work32 + 16), - /* allow for SNLen plus - * possible trailing '\0' - */ - sizeof(work32) - (16 * sizeof(u32)) - 2); - seq_printf(seq, "\n"); - - return 0; -} - -static int i2o_seq_show_dev_name(struct seq_file *seq, void *v) -{ - struct i2o_device *d = (struct i2o_device *)seq->private; - - seq_printf(seq, "%s\n", dev_name(&d->device)); - - return 0; -} - -/* Generic group F101h - DDM Identity (scalar) */ -static int i2o_seq_show_ddm_identity(struct seq_file *seq, void *v) -{ - struct i2o_device *d = (struct i2o_device *)seq->private; - int token; - - struct { - u16 ddm_tid; - u8 module_name[24]; - u8 module_rev[8]; - u8 sn_format; - u8 serial_number[12]; - u8 pad[256]; // allow up to 256 byte (max) serial number - } result; - - char tmp[24 + 1]; - - token = i2o_parm_field_get(d, 0xF101, -1, &result, sizeof(result)); - - if (token < 0) { - i2o_report_query_status(seq, token, "0xF101 DDM Identity"); - return 0; - } - - seq_printf(seq, "Registering DDM TID : 0x%03x\n", result.ddm_tid); - seq_printf(seq, "Module name : %s\n", - chtostr(tmp, result.module_name, 24)); - seq_printf(seq, "Module revision : %s\n", - chtostr(tmp, result.module_rev, 8)); - - seq_printf(seq, "Serial number : "); - print_serial_number(seq, result.serial_number, sizeof(result) - 36); - /* allow for SNLen plus possible trailing '\0' */ - - seq_printf(seq, "\n"); - - return 0; -} - -/* Generic group F102h - User Information (scalar) */ -static int i2o_seq_show_uinfo(struct seq_file *seq, void *v) -{ - struct i2o_device *d = (struct i2o_device *)seq->private; - int token; - - struct { - u8 device_name[64]; - u8 service_name[64]; - u8 physical_location[64]; - u8 instance_number[4]; - } result; - - char tmp[64 + 1]; - - token = i2o_parm_field_get(d, 0xF102, -1, 
&result, sizeof(result)); - - if (token < 0) { - i2o_report_query_status(seq, token, "0xF102 User Information"); - return 0; - } - - seq_printf(seq, "Device name : %s\n", - chtostr(tmp, result.device_name, 64)); - seq_printf(seq, "Service name : %s\n", - chtostr(tmp, result.service_name, 64)); - seq_printf(seq, "Physical name : %s\n", - chtostr(tmp, result.physical_location, 64)); - seq_printf(seq, "Instance number : %s\n", - chtostr(tmp, result.instance_number, 4)); - - return 0; -} - -/* Generic group F103h - SGL Operating Limits (scalar) */ -static int i2o_seq_show_sgl_limits(struct seq_file *seq, void *v) -{ - struct i2o_device *d = (struct i2o_device *)seq->private; - static u32 work32[12]; - static u16 *work16 = (u16 *) work32; - static u8 *work8 = (u8 *) work32; - int token; - - token = i2o_parm_field_get(d, 0xF103, -1, &work32, sizeof(work32)); - - if (token < 0) { - i2o_report_query_status(seq, token, - "0xF103 SGL Operating Limits"); - return 0; - } - - seq_printf(seq, "SGL chain size : %d\n", work32[0]); - seq_printf(seq, "Max SGL chain size : %d\n", work32[1]); - seq_printf(seq, "SGL chain size target : %d\n", work32[2]); - seq_printf(seq, "SGL frag count : %d\n", work16[6]); - seq_printf(seq, "Max SGL frag count : %d\n", work16[7]); - seq_printf(seq, "SGL frag count target : %d\n", work16[8]); - -/* FIXME - if (d->i2oversion == 0x02) - { -*/ - seq_printf(seq, "SGL data alignment : %d\n", work16[8]); - seq_printf(seq, "SGL addr limit : %d\n", work8[20]); - seq_printf(seq, "SGL addr sizes supported : "); - if (work8[21] & 0x01) - seq_printf(seq, "32 bit "); - if (work8[21] & 0x02) - seq_printf(seq, "64 bit "); - if (work8[21] & 0x04) - seq_printf(seq, "96 bit "); - if (work8[21] & 0x08) - seq_printf(seq, "128 bit "); - seq_printf(seq, "\n"); -/* - } -*/ - - return 0; -} - -/* Generic group F200h - Sensors (scalar) */ -static int i2o_seq_show_sensors(struct seq_file *seq, void *v) -{ - struct i2o_device *d = (struct i2o_device *)seq->private; - int token; - - struct { - u16 sensor_instance; - u8 component; - u16 component_instance; - u8 sensor_class; - u8 sensor_type; - u8 scaling_exponent; - u32 actual_reading; - u32 minimum_reading; - u32 low2lowcat_treshold; - u32 lowcat2low_treshold; - u32 lowwarn2low_treshold; - u32 low2lowwarn_treshold; - u32 norm2lowwarn_treshold; - u32 lowwarn2norm_treshold; - u32 nominal_reading; - u32 hiwarn2norm_treshold; - u32 norm2hiwarn_treshold; - u32 high2hiwarn_treshold; - u32 hiwarn2high_treshold; - u32 hicat2high_treshold; - u32 hi2hicat_treshold; - u32 maximum_reading; - u8 sensor_state; - u16 event_enable; - } result; - - token = i2o_parm_field_get(d, 0xF200, -1, &result, sizeof(result)); - - if (token < 0) { - i2o_report_query_status(seq, token, - "0xF200 Sensors (optional)"); - return 0; - } - - seq_printf(seq, "Sensor instance : %d\n", result.sensor_instance); - - seq_printf(seq, "Component : %d = ", result.component); - switch (result.component) { - case 0: - seq_printf(seq, "Other"); - break; - case 1: - seq_printf(seq, "Planar logic Board"); - break; - case 2: - seq_printf(seq, "CPU"); - break; - case 3: - seq_printf(seq, "Chassis"); - break; - case 4: - seq_printf(seq, "Power Supply"); - break; - case 5: - seq_printf(seq, "Storage"); - break; - case 6: - seq_printf(seq, "External"); - break; - } - seq_printf(seq, "\n"); - - seq_printf(seq, "Component instance : %d\n", - result.component_instance); - seq_printf(seq, "Sensor class : %s\n", - result.sensor_class ? 
"Analog" : "Digital"); - - seq_printf(seq, "Sensor type : %d = ", result.sensor_type); - switch (result.sensor_type) { - case 0: - seq_printf(seq, "Other\n"); - break; - case 1: - seq_printf(seq, "Thermal\n"); - break; - case 2: - seq_printf(seq, "DC voltage (DC volts)\n"); - break; - case 3: - seq_printf(seq, "AC voltage (AC volts)\n"); - break; - case 4: - seq_printf(seq, "DC current (DC amps)\n"); - break; - case 5: - seq_printf(seq, "AC current (AC volts)\n"); - break; - case 6: - seq_printf(seq, "Door open\n"); - break; - case 7: - seq_printf(seq, "Fan operational\n"); - break; - } - - seq_printf(seq, "Scaling exponent : %d\n", - result.scaling_exponent); - seq_printf(seq, "Actual reading : %d\n", result.actual_reading); - seq_printf(seq, "Minimum reading : %d\n", result.minimum_reading); - seq_printf(seq, "Low2LowCat treshold : %d\n", - result.low2lowcat_treshold); - seq_printf(seq, "LowCat2Low treshold : %d\n", - result.lowcat2low_treshold); - seq_printf(seq, "LowWarn2Low treshold : %d\n", - result.lowwarn2low_treshold); - seq_printf(seq, "Low2LowWarn treshold : %d\n", - result.low2lowwarn_treshold); - seq_printf(seq, "Norm2LowWarn treshold : %d\n", - result.norm2lowwarn_treshold); - seq_printf(seq, "LowWarn2Norm treshold : %d\n", - result.lowwarn2norm_treshold); - seq_printf(seq, "Nominal reading : %d\n", result.nominal_reading); - seq_printf(seq, "HiWarn2Norm treshold : %d\n", - result.hiwarn2norm_treshold); - seq_printf(seq, "Norm2HiWarn treshold : %d\n", - result.norm2hiwarn_treshold); - seq_printf(seq, "High2HiWarn treshold : %d\n", - result.high2hiwarn_treshold); - seq_printf(seq, "HiWarn2High treshold : %d\n", - result.hiwarn2high_treshold); - seq_printf(seq, "HiCat2High treshold : %d\n", - result.hicat2high_treshold); - seq_printf(seq, "High2HiCat treshold : %d\n", - result.hi2hicat_treshold); - seq_printf(seq, "Maximum reading : %d\n", result.maximum_reading); - - seq_printf(seq, "Sensor state : %d = ", result.sensor_state); - switch (result.sensor_state) { - case 0: - seq_printf(seq, "Normal\n"); - break; - case 1: - seq_printf(seq, "Abnormal\n"); - break; - case 2: - seq_printf(seq, "Unknown\n"); - break; - case 3: - seq_printf(seq, "Low Catastrophic (LoCat)\n"); - break; - case 4: - seq_printf(seq, "Low (Low)\n"); - break; - case 5: - seq_printf(seq, "Low Warning (LoWarn)\n"); - break; - case 6: - seq_printf(seq, "High Warning (HiWarn)\n"); - break; - case 7: - seq_printf(seq, "High (High)\n"); - break; - case 8: - seq_printf(seq, "High Catastrophic (HiCat)\n"); - break; - } - - seq_printf(seq, "Event_enable : 0x%02X\n", result.event_enable); - seq_printf(seq, " [%s] Operational state change. \n", - (result.event_enable & 0x01) ? "+" : "-"); - seq_printf(seq, " [%s] Low catastrophic. \n", - (result.event_enable & 0x02) ? "+" : "-"); - seq_printf(seq, " [%s] Low reading. \n", - (result.event_enable & 0x04) ? "+" : "-"); - seq_printf(seq, " [%s] Low warning. \n", - (result.event_enable & 0x08) ? "+" : "-"); - seq_printf(seq, - " [%s] Change back to normal from out of range state. \n", - (result.event_enable & 0x10) ? "+" : "-"); - seq_printf(seq, " [%s] High warning. \n", - (result.event_enable & 0x20) ? "+" : "-"); - seq_printf(seq, " [%s] High reading. \n", - (result.event_enable & 0x40) ? "+" : "-"); - seq_printf(seq, " [%s] High catastrophic. \n", - (result.event_enable & 0x80) ? 
"+" : "-"); - - return 0; -} - -static int i2o_seq_open_hrt(struct inode *inode, struct file *file) -{ - return single_open(file, i2o_seq_show_hrt, PDE_DATA(inode)); -}; - -static int i2o_seq_open_lct(struct inode *inode, struct file *file) -{ - return single_open(file, i2o_seq_show_lct, PDE_DATA(inode)); -}; - -static int i2o_seq_open_status(struct inode *inode, struct file *file) -{ - return single_open(file, i2o_seq_show_status, PDE_DATA(inode)); -}; - -static int i2o_seq_open_hw(struct inode *inode, struct file *file) -{ - return single_open(file, i2o_seq_show_hw, PDE_DATA(inode)); -}; - -static int i2o_seq_open_ddm_table(struct inode *inode, struct file *file) -{ - return single_open(file, i2o_seq_show_ddm_table, PDE_DATA(inode)); -}; - -static int i2o_seq_open_driver_store(struct inode *inode, struct file *file) -{ - return single_open(file, i2o_seq_show_driver_store, PDE_DATA(inode)); -}; - -static int i2o_seq_open_drivers_stored(struct inode *inode, struct file *file) -{ - return single_open(file, i2o_seq_show_drivers_stored, PDE_DATA(inode)); -}; - -static int i2o_seq_open_groups(struct inode *inode, struct file *file) -{ - return single_open(file, i2o_seq_show_groups, PDE_DATA(inode)); -}; - -static int i2o_seq_open_phys_device(struct inode *inode, struct file *file) -{ - return single_open(file, i2o_seq_show_phys_device, PDE_DATA(inode)); -}; - -static int i2o_seq_open_claimed(struct inode *inode, struct file *file) -{ - return single_open(file, i2o_seq_show_claimed, PDE_DATA(inode)); -}; - -static int i2o_seq_open_users(struct inode *inode, struct file *file) -{ - return single_open(file, i2o_seq_show_users, PDE_DATA(inode)); -}; - -static int i2o_seq_open_priv_msgs(struct inode *inode, struct file *file) -{ - return single_open(file, i2o_seq_show_priv_msgs, PDE_DATA(inode)); -}; - -static int i2o_seq_open_authorized_users(struct inode *inode, struct file *file) -{ - return single_open(file, i2o_seq_show_authorized_users, - PDE_DATA(inode)); -}; - -static int i2o_seq_open_dev_identity(struct inode *inode, struct file *file) -{ - return single_open(file, i2o_seq_show_dev_identity, PDE_DATA(inode)); -}; - -static int i2o_seq_open_ddm_identity(struct inode *inode, struct file *file) -{ - return single_open(file, i2o_seq_show_ddm_identity, PDE_DATA(inode)); -}; - -static int i2o_seq_open_uinfo(struct inode *inode, struct file *file) -{ - return single_open(file, i2o_seq_show_uinfo, PDE_DATA(inode)); -}; - -static int i2o_seq_open_sgl_limits(struct inode *inode, struct file *file) -{ - return single_open(file, i2o_seq_show_sgl_limits, PDE_DATA(inode)); -}; - -static int i2o_seq_open_sensors(struct inode *inode, struct file *file) -{ - return single_open(file, i2o_seq_show_sensors, PDE_DATA(inode)); -}; - -static int i2o_seq_open_dev_name(struct inode *inode, struct file *file) -{ - return single_open(file, i2o_seq_show_dev_name, PDE_DATA(inode)); -}; - -static const struct file_operations i2o_seq_fops_lct = { - .open = i2o_seq_open_lct, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static const struct file_operations i2o_seq_fops_hrt = { - .open = i2o_seq_open_hrt, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static const struct file_operations i2o_seq_fops_status = { - .open = i2o_seq_open_status, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static const struct file_operations i2o_seq_fops_hw = { - .open = i2o_seq_open_hw, - .read = seq_read, - .llseek = seq_lseek, - .release = 
single_release, -}; - -static const struct file_operations i2o_seq_fops_ddm_table = { - .open = i2o_seq_open_ddm_table, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static const struct file_operations i2o_seq_fops_driver_store = { - .open = i2o_seq_open_driver_store, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static const struct file_operations i2o_seq_fops_drivers_stored = { - .open = i2o_seq_open_drivers_stored, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static const struct file_operations i2o_seq_fops_groups = { - .open = i2o_seq_open_groups, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static const struct file_operations i2o_seq_fops_phys_device = { - .open = i2o_seq_open_phys_device, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static const struct file_operations i2o_seq_fops_claimed = { - .open = i2o_seq_open_claimed, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static const struct file_operations i2o_seq_fops_users = { - .open = i2o_seq_open_users, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static const struct file_operations i2o_seq_fops_priv_msgs = { - .open = i2o_seq_open_priv_msgs, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static const struct file_operations i2o_seq_fops_authorized_users = { - .open = i2o_seq_open_authorized_users, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static const struct file_operations i2o_seq_fops_dev_name = { - .open = i2o_seq_open_dev_name, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static const struct file_operations i2o_seq_fops_dev_identity = { - .open = i2o_seq_open_dev_identity, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static const struct file_operations i2o_seq_fops_ddm_identity = { - .open = i2o_seq_open_ddm_identity, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static const struct file_operations i2o_seq_fops_uinfo = { - .open = i2o_seq_open_uinfo, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static const struct file_operations i2o_seq_fops_sgl_limits = { - .open = i2o_seq_open_sgl_limits, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static const struct file_operations i2o_seq_fops_sensors = { - .open = i2o_seq_open_sensors, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -/* - * IOP specific entries...write field just in case someone - * ever wants one. 
- */ -static i2o_proc_entry i2o_proc_generic_iop_entries[] = { - {"hrt", S_IFREG | S_IRUGO, &i2o_seq_fops_hrt}, - {"lct", S_IFREG | S_IRUGO, &i2o_seq_fops_lct}, - {"status", S_IFREG | S_IRUGO, &i2o_seq_fops_status}, - {"hw", S_IFREG | S_IRUGO, &i2o_seq_fops_hw}, - {"ddm_table", S_IFREG | S_IRUGO, &i2o_seq_fops_ddm_table}, - {"driver_store", S_IFREG | S_IRUGO, &i2o_seq_fops_driver_store}, - {"drivers_stored", S_IFREG | S_IRUGO, &i2o_seq_fops_drivers_stored}, - {NULL, 0, NULL} -}; - -/* - * Device specific entries - */ -static i2o_proc_entry generic_dev_entries[] = { - {"groups", S_IFREG | S_IRUGO, &i2o_seq_fops_groups}, - {"phys_dev", S_IFREG | S_IRUGO, &i2o_seq_fops_phys_device}, - {"claimed", S_IFREG | S_IRUGO, &i2o_seq_fops_claimed}, - {"users", S_IFREG | S_IRUGO, &i2o_seq_fops_users}, - {"priv_msgs", S_IFREG | S_IRUGO, &i2o_seq_fops_priv_msgs}, - {"authorized_users", S_IFREG | S_IRUGO, &i2o_seq_fops_authorized_users}, - {"dev_identity", S_IFREG | S_IRUGO, &i2o_seq_fops_dev_identity}, - {"ddm_identity", S_IFREG | S_IRUGO, &i2o_seq_fops_ddm_identity}, - {"user_info", S_IFREG | S_IRUGO, &i2o_seq_fops_uinfo}, - {"sgl_limits", S_IFREG | S_IRUGO, &i2o_seq_fops_sgl_limits}, - {"sensors", S_IFREG | S_IRUGO, &i2o_seq_fops_sensors}, - {NULL, 0, NULL} -}; - -/* - * Storage unit specific entries (SCSI Periph, BS) with device names - */ -static i2o_proc_entry rbs_dev_entries[] = { - {"dev_name", S_IFREG | S_IRUGO, &i2o_seq_fops_dev_name}, - {NULL, 0, NULL} -}; - -/** - * i2o_proc_create_entries - Creates proc dir entries - * @dir: proc dir entry under which the entries should be placed - * @i2o_pe: pointer to the entries which should be added - * @data: pointer to I2O controller or device - * - * Create proc dir entries for a I2O controller or I2O device. - * - * Returns 0 on success or negative error code on failure. - */ -static int i2o_proc_create_entries(struct proc_dir_entry *dir, - i2o_proc_entry * i2o_pe, void *data) -{ - struct proc_dir_entry *tmp; - - while (i2o_pe->name) { - tmp = proc_create_data(i2o_pe->name, i2o_pe->mode, dir, - i2o_pe->fops, data); - if (!tmp) - return -1; - - i2o_pe++; - } - - return 0; -} - -/** - * i2o_proc_device_add - Add an I2O device to the proc dir - * @dir: proc dir entry to which the device should be added - * @dev: I2O device which should be added - * - * Add an I2O device to the proc dir entry dir and create the entries for - * the device depending on the class of the I2O device. - */ -static void i2o_proc_device_add(struct proc_dir_entry *dir, - struct i2o_device *dev) -{ - char buff[10]; - struct proc_dir_entry *devdir; - i2o_proc_entry *i2o_pe = NULL; - - sprintf(buff, "%03x", dev->lct_data.tid); - - osm_debug("adding device /proc/i2o/%s/%s\n", dev->iop->name, buff); - - devdir = proc_mkdir_data(buff, 0, dir, dev); - if (!devdir) { - osm_warn("Could not allocate procdir!\n"); - return; - } - - i2o_proc_create_entries(devdir, generic_dev_entries, dev); - - /* Inform core that we want updates about this device's status */ - switch (dev->lct_data.class_id) { - case I2O_CLASS_SCSI_PERIPHERAL: - case I2O_CLASS_RANDOM_BLOCK_STORAGE: - i2o_pe = rbs_dev_entries; - break; - default: - break; - } - if (i2o_pe) - i2o_proc_create_entries(devdir, i2o_pe, dev); -} - -/** - * i2o_proc_iop_add - Add an I2O controller to the i2o proc tree - * @dir: parent proc dir entry - * @c: I2O controller which should be added - * - * Add the entries to the parent proc dir entry. Also each device is added - * to the controllers proc dir entry. 
- * - * Returns 0 on success or negative error code on failure. - */ -static int i2o_proc_iop_add(struct proc_dir_entry *dir, - struct i2o_controller *c) -{ - struct proc_dir_entry *iopdir; - struct i2o_device *dev; - - osm_debug("adding IOP /proc/i2o/%s\n", c->name); - - iopdir = proc_mkdir_data(c->name, 0, dir, c); - if (!iopdir) - return -1; - - i2o_proc_create_entries(iopdir, i2o_proc_generic_iop_entries, c); - - list_for_each_entry(dev, &c->devices, list) - i2o_proc_device_add(iopdir, dev); - - return 0; -} - -/** - * i2o_proc_fs_create - Create the i2o proc fs. - * - * Iterate over each I2O controller and create the entries for it. - * - * Returns 0 on success or negative error code on failure. - */ -static int __init i2o_proc_fs_create(void) -{ - struct i2o_controller *c; - - i2o_proc_dir_root = proc_mkdir("i2o", NULL); - if (!i2o_proc_dir_root) - return -1; - - list_for_each_entry(c, &i2o_controllers, list) - i2o_proc_iop_add(i2o_proc_dir_root, c); - - return 0; -}; - -/** - * i2o_proc_fs_destroy - Cleanup the all i2o proc entries - * - * Iterate over each I2O controller and remove the entries for it. - * - * Returns 0 on success or negative error code on failure. - */ -static int __exit i2o_proc_fs_destroy(void) -{ - remove_proc_subtree("i2o", NULL); - - return 0; -}; - -/** - * i2o_proc_init - Init function for procfs - * - * Registers Proc OSM and creates procfs entries. - * - * Returns 0 on success or negative error code on failure. - */ -static int __init i2o_proc_init(void) -{ - int rc; - - printk(KERN_INFO OSM_DESCRIPTION " v" OSM_VERSION "\n"); - - rc = i2o_driver_register(&i2o_proc_driver); - if (rc) - return rc; - - rc = i2o_proc_fs_create(); - if (rc) { - i2o_driver_unregister(&i2o_proc_driver); - return rc; - } - - return 0; -}; - -/** - * i2o_proc_exit - Exit function for procfs - * - * Unregisters Proc OSM and removes procfs entries. - */ -static void __exit i2o_proc_exit(void) -{ - i2o_driver_unregister(&i2o_proc_driver); - i2o_proc_fs_destroy(); -}; - -MODULE_AUTHOR("Deepak Saxena"); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION(OSM_DESCRIPTION); -MODULE_VERSION(OSM_VERSION); - -module_init(i2o_proc_init); -module_exit(i2o_proc_exit); diff --git a/drivers/message/i2o/i2o_scsi.c b/drivers/message/i2o/i2o_scsi.c deleted file mode 100644 index 8152e9fa9d95..000000000000 --- a/drivers/message/i2o/i2o_scsi.c +++ /dev/null @@ -1,814 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2, or (at your option) any - * later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * For the avoidance of doubt the "preferred form" of this code is one which - * is in an open non patent encumbered format. Where cryptographic key signing - * forms part of the process of creating an executable the information - * including keys needed to generate an equivalently functional executable - * are deemed to be part of the source code. - * - * Complications for I2O scsi - * - * o Each (bus,lun) is a logical device in I2O. We keep a map - * table. We spoof failed selection for unmapped units - * o Request sense buffers can come back for free. - * o Scatter gather is a bit dynamic. We have to investigate at - * setup time. 
- * o Some of our resources are dynamically shared. The i2o core - * needs a message reservation protocol to avoid swap v net - * deadlocking. We need to back off queue requests. - * - * In general the firmware wants to help. Where its help isn't performance - * useful we just ignore the aid. Its not worth the code in truth. - * - * Fixes/additions: - * Steve Ralston: - * Scatter gather now works - * Markus Lidel : - * Minor fixes for 2.6. - * - * To Do: - * 64bit cleanups - * Fix the resource management problems. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include - -#define OSM_NAME "scsi-osm" -#define OSM_VERSION "1.316" -#define OSM_DESCRIPTION "I2O SCSI Peripheral OSM" - -static struct i2o_driver i2o_scsi_driver; - -static unsigned int i2o_scsi_max_id = 16; -static unsigned int i2o_scsi_max_lun = 255; - -struct i2o_scsi_host { - struct Scsi_Host *scsi_host; /* pointer to the SCSI host */ - struct i2o_controller *iop; /* pointer to the I2O controller */ - u64 lun; /* lun's used for block devices */ - struct i2o_device *channel[0]; /* channel->i2o_dev mapping table */ -}; - -static struct scsi_host_template i2o_scsi_host_template; - -#define I2O_SCSI_CAN_QUEUE 4 - -/* SCSI OSM class handling definition */ -static struct i2o_class_id i2o_scsi_class_id[] = { - {I2O_CLASS_SCSI_PERIPHERAL}, - {I2O_CLASS_END} -}; - -static struct i2o_scsi_host *i2o_scsi_host_alloc(struct i2o_controller *c) -{ - struct i2o_scsi_host *i2o_shost; - struct i2o_device *i2o_dev; - struct Scsi_Host *scsi_host; - int max_channel = 0; - u8 type; - int i; - size_t size; - u16 body_size = 6; - -#ifdef CONFIG_I2O_EXT_ADAPTEC - if (c->adaptec) - body_size = 8; -#endif - - list_for_each_entry(i2o_dev, &c->devices, list) - if (i2o_dev->lct_data.class_id == I2O_CLASS_BUS_ADAPTER) { - if (!i2o_parm_field_get(i2o_dev, 0x0000, 0, &type, 1) - && (type == 0x01)) /* SCSI bus */ - max_channel++; - } - - if (!max_channel) { - osm_warn("no channels found on %s\n", c->name); - return ERR_PTR(-EFAULT); - } - - size = max_channel * sizeof(struct i2o_device *) - + sizeof(struct i2o_scsi_host); - - scsi_host = scsi_host_alloc(&i2o_scsi_host_template, size); - if (!scsi_host) { - osm_warn("Could not allocate SCSI host\n"); - return ERR_PTR(-ENOMEM); - } - - scsi_host->max_channel = max_channel - 1; - scsi_host->max_id = i2o_scsi_max_id; - scsi_host->max_lun = i2o_scsi_max_lun; - scsi_host->this_id = c->unit; - scsi_host->sg_tablesize = i2o_sg_tablesize(c, body_size); - - i2o_shost = (struct i2o_scsi_host *)scsi_host->hostdata; - i2o_shost->scsi_host = scsi_host; - i2o_shost->iop = c; - i2o_shost->lun = 1; - - i = 0; - list_for_each_entry(i2o_dev, &c->devices, list) - if (i2o_dev->lct_data.class_id == I2O_CLASS_BUS_ADAPTER) { - if (!i2o_parm_field_get(i2o_dev, 0x0000, 0, &type, 1) - && (type == 0x01)) /* only SCSI bus */ - i2o_shost->channel[i++] = i2o_dev; - - if (i >= max_channel) - break; - } - - return i2o_shost; -}; - -/** - * i2o_scsi_get_host - Get an I2O SCSI host - * @c: I2O controller to for which to get the SCSI host - * - * If the I2O controller already exists as SCSI host, the SCSI host - * is returned, otherwise the I2O controller is added to the SCSI - * core. - * - * Returns pointer to the I2O SCSI host on success or NULL on failure. 
- */ -static struct i2o_scsi_host *i2o_scsi_get_host(struct i2o_controller *c) -{ - return c->driver_data[i2o_scsi_driver.context]; -}; - -/** - * i2o_scsi_remove - Remove I2O device from SCSI core - * @dev: device which should be removed - * - * Removes the I2O device from the SCSI core again. - * - * Returns 0 on success. - */ -static int i2o_scsi_remove(struct device *dev) -{ - struct i2o_device *i2o_dev = to_i2o_device(dev); - struct i2o_controller *c = i2o_dev->iop; - struct i2o_scsi_host *i2o_shost; - struct scsi_device *scsi_dev; - - osm_info("device removed (TID: %03x)\n", i2o_dev->lct_data.tid); - - i2o_shost = i2o_scsi_get_host(c); - - shost_for_each_device(scsi_dev, i2o_shost->scsi_host) - if (scsi_dev->hostdata == i2o_dev) { - sysfs_remove_link(&i2o_dev->device.kobj, "scsi"); - scsi_remove_device(scsi_dev); - scsi_device_put(scsi_dev); - break; - } - - return 0; -}; - -/** - * i2o_scsi_probe - verify if dev is a I2O SCSI device and install it - * @dev: device to verify if it is a I2O SCSI device - * - * Retrieve channel, id and lun for I2O device. If everything goes well - * register the I2O device as SCSI device on the I2O SCSI controller. - * - * Returns 0 on success or negative error code on failure. - */ -static int i2o_scsi_probe(struct device *dev) -{ - struct i2o_device *i2o_dev = to_i2o_device(dev); - struct i2o_controller *c = i2o_dev->iop; - struct i2o_scsi_host *i2o_shost; - struct Scsi_Host *scsi_host; - struct i2o_device *parent; - struct scsi_device *scsi_dev; - u32 id = -1; - u64 lun = -1; - int channel = -1; - int i, rc; - - i2o_shost = i2o_scsi_get_host(c); - if (!i2o_shost) - return -EFAULT; - - scsi_host = i2o_shost->scsi_host; - - switch (i2o_dev->lct_data.class_id) { - case I2O_CLASS_RANDOM_BLOCK_STORAGE: - case I2O_CLASS_EXECUTIVE: -#ifdef CONFIG_I2O_EXT_ADAPTEC - if (c->adaptec) { - u8 type; - struct i2o_device *d = i2o_shost->channel[0]; - - if (!i2o_parm_field_get(d, 0x0000, 0, &type, 1) - && (type == 0x01)) /* SCSI bus */ - if (!i2o_parm_field_get(d, 0x0200, 4, &id, 4)) { - channel = 0; - if (i2o_dev->lct_data.class_id == - I2O_CLASS_RANDOM_BLOCK_STORAGE) - lun = - cpu_to_le64(i2o_shost-> - lun++); - else - lun = 0; - } - } -#endif - break; - - case I2O_CLASS_SCSI_PERIPHERAL: - if (i2o_parm_field_get(i2o_dev, 0x0000, 3, &id, 4)) - return -EFAULT; - - if (i2o_parm_field_get(i2o_dev, 0x0000, 4, &lun, 8)) - return -EFAULT; - - parent = i2o_iop_find_device(c, i2o_dev->lct_data.parent_tid); - if (!parent) { - osm_warn("can not find parent of device %03x\n", - i2o_dev->lct_data.tid); - return -EFAULT; - } - - for (i = 0; i <= i2o_shost->scsi_host->max_channel; i++) - if (i2o_shost->channel[i] == parent) - channel = i; - break; - - default: - return -EFAULT; - } - - if (channel == -1) { - osm_warn("can not find channel of device %03x\n", - i2o_dev->lct_data.tid); - return -EFAULT; - } - - if (le32_to_cpu(id) >= scsi_host->max_id) { - osm_warn("SCSI device id (%d) >= max_id of I2O host (%d)", - le32_to_cpu(id), scsi_host->max_id); - return -EFAULT; - } - - if (le64_to_cpu(lun) >= scsi_host->max_lun) { - osm_warn("SCSI device lun (%llu) >= max_lun of I2O host (%llu)", - le64_to_cpu(lun), scsi_host->max_lun); - return -EFAULT; - } - - scsi_dev = - __scsi_add_device(i2o_shost->scsi_host, channel, le32_to_cpu(id), - le64_to_cpu(lun), i2o_dev); - - if (IS_ERR(scsi_dev)) { - osm_warn("can not add SCSI device %03x\n", - i2o_dev->lct_data.tid); - return PTR_ERR(scsi_dev); - } - - rc = sysfs_create_link(&i2o_dev->device.kobj, - &scsi_dev->sdev_gendev.kobj, "scsi"); - 
if (rc)
-		goto err;
-
-	osm_info("device added (TID: %03x) channel: %d, id: %d, lun: %llu\n",
-		 i2o_dev->lct_data.tid, channel, le32_to_cpu(id),
-		 le64_to_cpu(lun));
-
-	return 0;
-
-err:
-	scsi_remove_device(scsi_dev);
-	return rc;
-};
-
-static const char *i2o_scsi_info(struct Scsi_Host *SChost)
-{
-	struct i2o_scsi_host *hostdata;
-	hostdata = (struct i2o_scsi_host *)SChost->hostdata;
-	return hostdata->iop->name;
-}
-
-/**
- * i2o_scsi_reply - SCSI OSM message reply handler
- * @c: controller issuing the reply
- * @m: message id for flushing
- * @msg: the message from the controller
- *
- * Process reply messages (interrupts, in conventional SCSI controller
- * terms). We can get a variety of messages to process. The normal path
- * is scsi command completions. We must also deal with IOP failures,
- * the reply to a bus reset and the reply to a LUN query.
- *
- * Returns 0 on success and if the reply should not be flushed or > 0
- * on success and if the reply should be flushed. Returns negative error
- * code on failure and if the reply should be flushed.
- */
-static int i2o_scsi_reply(struct i2o_controller *c, u32 m,
-			  struct i2o_message *msg)
-{
-	struct scsi_cmnd *cmd;
-	u32 error;
-	struct device *dev;
-
-	cmd = i2o_cntxt_list_get(c, le32_to_cpu(msg->u.s.tcntxt));
-	if (unlikely(!cmd)) {
-		osm_err("NULL reply received!\n");
-		return -1;
-	}
-
-	/*
-	 * Low byte is device status, next is adapter status,
-	 * (then one byte reserved), then request status.
-	 */
-	error = le32_to_cpu(msg->body[0]);
-
-	osm_debug("Completed %p\n", cmd);
-
-	cmd->result = error & 0xff;
-	/*
-	 * if DeviceStatus is not SCSI_SUCCESS copy over the sense data and let
-	 * the SCSI layer handle the error
-	 */
-	if (cmd->result)
-		memcpy(cmd->sense_buffer, &msg->body[3],
-		       min(SCSI_SENSE_BUFFERSIZE, 40));
-
-	/* only output error code if AdapterStatus is not HBA_SUCCESS */
-	if ((error >> 8) & 0xff)
-		osm_err("SCSI error %08x\n", error);
-
-	dev = &c->pdev->dev;
-
-	scsi_dma_unmap(cmd);
-
-	cmd->scsi_done(cmd);
-
-	return 1;
-};
-
-/**
- * i2o_scsi_notify_device_add - Retrieve notifications of added devices
- * @i2o_dev: the I2O device which was added
- *
- * If an I2O device is added we catch the notification, because I2O classes
- * other than SCSI peripheral will not be received through
- * i2o_scsi_probe().
- */
-static void i2o_scsi_notify_device_add(struct i2o_device *i2o_dev)
-{
-	switch (i2o_dev->lct_data.class_id) {
-	case I2O_CLASS_EXECUTIVE:
-	case I2O_CLASS_RANDOM_BLOCK_STORAGE:
-		i2o_scsi_probe(&i2o_dev->device);
-		break;
-
-	default:
-		break;
-	}
-};
-
-/**
- * i2o_scsi_notify_device_remove - Retrieve notifications of removed devices
- * @i2o_dev: the I2O device which was removed
- *
- * If an I2O device is removed, we catch the notification to remove the
- * corresponding SCSI device.
- */
-static void i2o_scsi_notify_device_remove(struct i2o_device *i2o_dev)
-{
-	switch (i2o_dev->lct_data.class_id) {
-	case I2O_CLASS_EXECUTIVE:
-	case I2O_CLASS_RANDOM_BLOCK_STORAGE:
-		i2o_scsi_remove(&i2o_dev->device);
-		break;
-
-	default:
-		break;
-	}
-};
-
-/**
- * i2o_scsi_notify_controller_add - Retrieve notifications of added controllers
- * @c: the controller which was added
- *
- * If an I2O controller is added, we catch the notification to add a
- * corresponding Scsi_Host.
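
i2o_scsi_reply() above unpacks the 32-bit status word exactly as its comment describes: device status in the low byte, adapter status in the next, one reserved byte, then request status. A small hedged sketch of that decoding; the struct and field names are illustrative, not types from the I2O headers:

#include <stdint.h>

struct demo_scsi_status {		/* hypothetical helper type */
	uint8_t device_status;		/* bits 0-7: SCSI device status */
	uint8_t adapter_status;		/* bits 8-15: HBA status */
	uint8_t request_status;		/* bits 24-31; bits 16-23 reserved */
};

static struct demo_scsi_status demo_decode(uint32_t error)
{
	struct demo_scsi_status s = {
		.device_status  = error & 0xff,
		.adapter_status = (error >> 8) & 0xff,
		.request_status = (error >> 24) & 0xff,
	};
	return s;
}
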
- */ -static void i2o_scsi_notify_controller_add(struct i2o_controller *c) -{ - struct i2o_scsi_host *i2o_shost; - int rc; - - i2o_shost = i2o_scsi_host_alloc(c); - if (IS_ERR(i2o_shost)) { - osm_err("Could not initialize SCSI host\n"); - return; - } - - rc = scsi_add_host(i2o_shost->scsi_host, &c->device); - if (rc) { - osm_err("Could not add SCSI host\n"); - scsi_host_put(i2o_shost->scsi_host); - return; - } - - c->driver_data[i2o_scsi_driver.context] = i2o_shost; - - osm_debug("new I2O SCSI host added\n"); -}; - -/** - * i2o_scsi_notify_controller_remove - Retrieve notifications of removed controllers - * @c: the controller which was removed - * - * If a I2O controller is removed, we catch the notification to remove the - * corresponding Scsi_Host. - */ -static void i2o_scsi_notify_controller_remove(struct i2o_controller *c) -{ - struct i2o_scsi_host *i2o_shost; - i2o_shost = i2o_scsi_get_host(c); - if (!i2o_shost) - return; - - c->driver_data[i2o_scsi_driver.context] = NULL; - - scsi_remove_host(i2o_shost->scsi_host); - scsi_host_put(i2o_shost->scsi_host); - osm_debug("I2O SCSI host removed\n"); -}; - -/* SCSI OSM driver struct */ -static struct i2o_driver i2o_scsi_driver = { - .name = OSM_NAME, - .reply = i2o_scsi_reply, - .classes = i2o_scsi_class_id, - .notify_device_add = i2o_scsi_notify_device_add, - .notify_device_remove = i2o_scsi_notify_device_remove, - .notify_controller_add = i2o_scsi_notify_controller_add, - .notify_controller_remove = i2o_scsi_notify_controller_remove, - .driver = { - .probe = i2o_scsi_probe, - .remove = i2o_scsi_remove, - }, -}; - -/** - * i2o_scsi_queuecommand - queue a SCSI command - * @SCpnt: scsi command pointer - * @done: callback for completion - * - * Issue a scsi command asynchronously. Return 0 on success or 1 if - * we hit an error (normally message queue congestion). The only - * minor complication here is that I2O deals with the device addressing - * so we have to map the bus/dev/lun back to an I2O handle as well - * as faking absent devices ourself. - * - * Locks: takes the controller lock on error path only - */ - -static int i2o_scsi_queuecommand_lck(struct scsi_cmnd *SCpnt, - void (*done) (struct scsi_cmnd *)) -{ - struct i2o_controller *c; - struct i2o_device *i2o_dev; - int tid; - struct i2o_message *msg; - /* - * ENABLE_DISCONNECT - * SIMPLE_TAG - * RETURN_SENSE_DATA_IN_REPLY_MESSAGE_FRAME - */ - u32 scsi_flags = 0x20a00000; - u32 sgl_offset; - u32 *mptr; - u32 cmd = I2O_CMD_SCSI_EXEC << 24; - int rc = 0; - - /* - * Do the incoming paperwork - */ - i2o_dev = SCpnt->device->hostdata; - - SCpnt->scsi_done = done; - - if (unlikely(!i2o_dev)) { - osm_warn("no I2O device in request\n"); - SCpnt->result = DID_NO_CONNECT << 16; - done(SCpnt); - goto exit; - } - c = i2o_dev->iop; - tid = i2o_dev->lct_data.tid; - - osm_debug("qcmd: Tid = %03x\n", tid); - osm_debug("Real scsi messages.\n"); - - /* - * Put together a scsi execscb message - */ - switch (SCpnt->sc_data_direction) { - case PCI_DMA_NONE: - /* DATA NO XFER */ - sgl_offset = SGL_OFFSET_0; - break; - - case PCI_DMA_TODEVICE: - /* DATA OUT (iop-->dev) */ - scsi_flags |= 0x80000000; - sgl_offset = SGL_OFFSET_10; - break; - - case PCI_DMA_FROMDEVICE: - /* DATA IN (iop<--dev) */ - scsi_flags |= 0x40000000; - sgl_offset = SGL_OFFSET_10; - break; - - default: - /* Unknown - kill the command */ - SCpnt->result = DID_NO_CONNECT << 16; - done(SCpnt); - goto exit; - } - - /* - * Obtain an I2O message. 
If there are none free then
-	 * throw it back to the scsi layer
-	 */
-
-	msg = i2o_msg_get(c);
-	if (IS_ERR(msg)) {
-		rc = SCSI_MLQUEUE_HOST_BUSY;
-		goto exit;
-	}
-
-	mptr = &msg->body[0];
-
-#if 0 /* this code can't work */
-#ifdef CONFIG_I2O_EXT_ADAPTEC
-	if (c->adaptec) {
-		u32 adpt_flags = 0;
-
-		if (SCpnt->sc_request && SCpnt->sc_request->upper_private_data) {
-			i2o_sg_io_hdr_t __user *usr_ptr =
-			    ((Sg_request *) (SCpnt->sc_request->
-					     upper_private_data))->header.
-			    usr_ptr;
-
-			if (usr_ptr)
-				get_user(adpt_flags, &usr_ptr->flags);
-		}
-
-		switch (i2o_dev->lct_data.class_id) {
-		case I2O_CLASS_EXECUTIVE:
-		case I2O_CLASS_RANDOM_BLOCK_STORAGE:
-			/* interpret flag has to be set for executive */
-			adpt_flags ^= I2O_DPT_SG_FLAG_INTERPRET;
-			break;
-
-		default:
-			break;
-		}
-
-		/*
-		 * for Adaptec controllers we use the PRIVATE command, because
-		 * the normal SCSI EXEC doesn't support all SCSI commands on
-		 * all controllers (for example READ CAPACITY).
-		 */
-		if (sgl_offset == SGL_OFFSET_10)
-			sgl_offset = SGL_OFFSET_12;
-		cmd = I2O_CMD_PRIVATE << 24;
-		*mptr++ = cpu_to_le32(I2O_VENDOR_DPT << 16 | I2O_CMD_SCSI_EXEC);
-		*mptr++ = cpu_to_le32(adpt_flags | tid);
-	}
-#endif
-#endif
-
-	msg->u.head[1] = cpu_to_le32(cmd | HOST_TID << 12 | tid);
-	msg->u.s.icntxt = cpu_to_le32(i2o_scsi_driver.context);
-
-	/* We want the SCSI control block back */
-	msg->u.s.tcntxt = cpu_to_le32(i2o_cntxt_list_add(c, SCpnt));
-
-	/* LSI_920_PCI_QUIRK
-	 *
-	 *	Intermittent observations of msg frame word data corruption
-	 *	observed on msg[4] after:
-	 *	WRITE, READ-MODIFY-WRITE
-	 *	operations.  19990606 -sralston
-	 *
-	 *	(Hence we build this word via tag. It's good practice anyway;
-	 *	 we don't want fetches over PCI needlessly.)
-	 */
-
-	/* Attach tags to the devices */
-	/* FIXME: implement
-	   if(SCpnt->device->tagged_supported) {
-		   if(SCpnt->tag == HEAD_OF_QUEUE_TAG)
-			   scsi_flags |= 0x01000000;
-		   else if(SCpnt->tag == ORDERED_QUEUE_TAG)
-			   scsi_flags |= 0x01800000;
-	   }
-	 */
-
-	*mptr++ = cpu_to_le32(scsi_flags | SCpnt->cmd_len);
-
-	/* Write SCSI command into the message - always 16 byte block */
-	memcpy(mptr, SCpnt->cmnd, 16);
-	mptr += 4;
-
-	if (sgl_offset != SGL_OFFSET_0) {
-		/* write size of data addressed by SGL */
-		*mptr++ = cpu_to_le32(scsi_bufflen(SCpnt));
-
-		/* Now fill in the SGList and command */
-
-		if (scsi_sg_count(SCpnt)) {
-			if (!i2o_dma_map_sg(c, scsi_sglist(SCpnt),
-					    scsi_sg_count(SCpnt),
-					    SCpnt->sc_data_direction, &mptr))
-				goto nomem;
-		}
-	}
-
-	/* Stick the headers on */
-	msg->u.head[0] =
-	    cpu_to_le32(I2O_MESSAGE_SIZE(mptr - &msg->u.head[0]) | sgl_offset);
-
-	/* Queue the message */
-	i2o_msg_post(c, msg);
-
-	osm_debug("Issued %p\n", SCpnt);
-
-	return 0;
-
-      nomem:
-	rc = -ENOMEM;
-	i2o_msg_nop(c, msg);
-
-      exit:
-	return rc;
-}
-
-static DEF_SCSI_QCMD(i2o_scsi_queuecommand)
-
-/**
- * i2o_scsi_abort - abort a running command
- * @SCpnt: command to abort
- *
- * Ask the I2O controller to abort a command. This is an asynchronous
- * process and our callback handler will see the command complete with an
- * aborted message if it succeeds.
- *
- * Returns 0 if the command is successfully aborted or negative error code
- * on failure.
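
The queuecommand path above starts from a base flag word of 0x20a00000 (ENABLE_DISCONNECT, SIMPLE_TAG, RETURN_SENSE_DATA_IN_REPLY_MESSAGE_FRAME), ORs in a direction bit and picks the matching SGL offset. The mapping, condensed into one self-contained function with made-up names standing in for the PCI_DMA_* constants:

#include <stdint.h>

enum demo_dir { DEMO_DIR_NONE, DEMO_DIR_TO_DEVICE, DEMO_DIR_FROM_DEVICE };

static uint32_t demo_scsi_flags(enum demo_dir dir, uint32_t *sgl_offset)
{
	uint32_t flags = 0x20a00000;	/* disconnect, simple tag, sense in reply */

	switch (dir) {
	case DEMO_DIR_TO_DEVICE:
		flags |= 0x80000000;	/* DATA OUT (iop-->dev) */
		*sgl_offset = 10;
		break;
	case DEMO_DIR_FROM_DEVICE:
		flags |= 0x40000000;	/* DATA IN (iop<--dev) */
		*sgl_offset = 10;
		break;
	default:
		*sgl_offset = 0;	/* no data transfer */
		break;
	}
	return flags;
}
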
- */ -static int i2o_scsi_abort(struct scsi_cmnd *SCpnt) -{ - struct i2o_device *i2o_dev; - struct i2o_controller *c; - struct i2o_message *msg; - int tid; - int status = FAILED; - - osm_warn("Aborting command block.\n"); - - i2o_dev = SCpnt->device->hostdata; - c = i2o_dev->iop; - tid = i2o_dev->lct_data.tid; - - msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET); - if (IS_ERR(msg)) - return SCSI_MLQUEUE_HOST_BUSY; - - msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0); - msg->u.head[1] = - cpu_to_le32(I2O_CMD_SCSI_ABORT << 24 | HOST_TID << 12 | tid); - msg->body[0] = cpu_to_le32(i2o_cntxt_list_get_ptr(c, SCpnt)); - - if (!i2o_msg_post_wait(c, msg, I2O_TIMEOUT_SCSI_SCB_ABORT)) - status = SUCCESS; - - return status; -} - -/** - * i2o_scsi_bios_param - Invent disk geometry - * @sdev: scsi device - * @dev: block layer device - * @capacity: size in sectors - * @ip: geometry array - * - * This is anyone's guess quite frankly. We use the same rules everyone - * else appears to and hope. It seems to work. - */ - -static int i2o_scsi_bios_param(struct scsi_device *sdev, - struct block_device *dev, sector_t capacity, - int *ip) -{ - int size; - - size = capacity; - ip[0] = 64; /* heads */ - ip[1] = 32; /* sectors */ - if ((ip[2] = size >> 11) > 1024) { /* cylinders, test for big disk */ - ip[0] = 255; /* heads */ - ip[1] = 63; /* sectors */ - ip[2] = size / (255 * 63); /* cylinders */ - } - return 0; -} - -static struct scsi_host_template i2o_scsi_host_template = { - .proc_name = OSM_NAME, - .name = OSM_DESCRIPTION, - .info = i2o_scsi_info, - .queuecommand = i2o_scsi_queuecommand, - .eh_abort_handler = i2o_scsi_abort, - .bios_param = i2o_scsi_bios_param, - .can_queue = I2O_SCSI_CAN_QUEUE, - .sg_tablesize = 8, - .cmd_per_lun = 6, - .use_clustering = ENABLE_CLUSTERING, -}; - -/** - * i2o_scsi_init - SCSI OSM initialization function - * - * Register SCSI OSM into I2O core. - * - * Returns 0 on success or negative error code on failure. - */ -static int __init i2o_scsi_init(void) -{ - int rc; - - printk(KERN_INFO OSM_DESCRIPTION " v" OSM_VERSION "\n"); - - /* Register SCSI OSM into I2O core */ - rc = i2o_driver_register(&i2o_scsi_driver); - if (rc) { - osm_err("Could not register SCSI driver\n"); - return rc; - } - - return 0; -}; - -/** - * i2o_scsi_exit - SCSI OSM exit function - * - * Unregisters SCSI OSM from I2O core. - */ -static void __exit i2o_scsi_exit(void) -{ - /* Unregister I2O SCSI OSM from I2O core */ - i2o_driver_unregister(&i2o_scsi_driver); -}; - -MODULE_AUTHOR("Red Hat Software"); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION(OSM_DESCRIPTION); -MODULE_VERSION(OSM_VERSION); - -module_init(i2o_scsi_init); -module_exit(i2o_scsi_exit); diff --git a/drivers/message/i2o/iop.c b/drivers/message/i2o/iop.c deleted file mode 100644 index 92752fb5b2d3..000000000000 --- a/drivers/message/i2o/iop.c +++ /dev/null @@ -1,1247 +0,0 @@ -/* - * Functions to handle I2O controllers and I2O message handling - * - * Copyright (C) 1999-2002 Red Hat Software - * - * Written by Alan Cox, Building Number Three Ltd - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. 
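
i2o_scsi_bios_param() above invents a CHS geometry the same way most drivers of that era did: assume 64 heads and 32 sectors, and switch to 255/63 once the cylinder count would exceed 1024. The same heuristic as a self-contained function (sector count in, geometry out):

#include <stdint.h>

static void demo_bios_geometry(uint64_t capacity, int *heads, int *sectors,
			       int *cylinders)
{
	*heads = 64;
	*sectors = 32;
	*cylinders = capacity >> 11;	/* capacity / (64 * 32) */
	if (*cylinders > 1024) {	/* "big disk": use translated geometry */
		*heads = 255;
		*sectors = 63;
		*cylinders = capacity / (255 * 63);
	}
}
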
- *
- * A lot of the I2O message side code from this is taken from the
- * Red Creek RCPCI45 adapter driver by Red Creek Communications
- *
- * Fixes/additions:
- *	Philipp Rumpf
- *	Juha Sievänen
- *	Auvo Häkkinen
- *	Deepak Saxena
- *	Boji T Kannanthanam
- *	Alan Cox :
- *		Ported to Linux 2.5.
- *	Markus Lidel :
- *		Minor fixes for 2.6.
- */
-
-#include
-#include
-#include
-#include
-#include
-#include "core.h"
-
-#define OSM_NAME "i2o"
-#define OSM_VERSION "1.325"
-#define OSM_DESCRIPTION "I2O subsystem"
-
-/* global I2O controller list */
-LIST_HEAD(i2o_controllers);
-
-/*
- * global I2O System Table. Contains information about all the IOPs in the
- * system. Used to inform IOPs about each other's existence.
- */
-static struct i2o_dma i2o_systab;
-
-static int i2o_hrt_get(struct i2o_controller *c);
-
-/**
- * i2o_msg_get_wait - obtain an I2O message from the IOP
- * @c: I2O controller
- * @wait: how long to wait until timeout
- *
- * This function waits up to wait seconds for a message slot to be
- * available.
- *
- * On success the message is returned. The returned message is the
- * physical page frame offset address from the read port (see the i2o
- * spec). If no message becomes available before the timeout expires,
- * ERR_PTR(-ETIMEDOUT) is returned.
- */
-struct i2o_message *i2o_msg_get_wait(struct i2o_controller *c, int wait)
-{
-	unsigned long timeout = jiffies + wait * HZ;
-	struct i2o_message *msg;
-
-	while (IS_ERR(msg = i2o_msg_get(c))) {
-		if (time_after(jiffies, timeout)) {
-			osm_debug("%s: Timeout waiting for message frame.\n",
-				  c->name);
-			return ERR_PTR(-ETIMEDOUT);
-		}
-		schedule_timeout_uninterruptible(1);
-	}
-
-	return msg;
-};
-
-#if BITS_PER_LONG == 64
-/**
- * i2o_cntxt_list_add - Append a pointer to context list and return an id
- * @c: controller to which the context list belongs
- * @ptr: pointer to add to the context list
- *
- * Because the context field in I2O is only 32 bits wide, on 64-bit the
- * pointer is too large to fit in the context field. The i2o_cntxt_list
- * functions therefore map pointers to context fields.
- *
- * Returns context id > 0 on success or 0 on failure.
- */
-u32 i2o_cntxt_list_add(struct i2o_controller * c, void *ptr)
-{
-	struct i2o_context_list_element *entry;
-	unsigned long flags;
-
-	if (!ptr)
-		osm_err("%s: couldn't add NULL pointer to context list!\n",
-			c->name);
-
-	entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
-	if (!entry) {
-		osm_err("%s: Could not allocate memory for context list element"
-			"\n", c->name);
-		return 0;
-	}
-
-	entry->ptr = ptr;
-	entry->timestamp = jiffies;
-	INIT_LIST_HEAD(&entry->list);
-
-	spin_lock_irqsave(&c->context_list_lock, flags);
-
-	if (unlikely(atomic_inc_and_test(&c->context_list_counter)))
-		atomic_inc(&c->context_list_counter);
-
-	entry->context = atomic_read(&c->context_list_counter);
-
-	list_add(&entry->list, &c->context_list);
-
-	spin_unlock_irqrestore(&c->context_list_lock, flags);
-
-	osm_debug("%s: Add context to list %p -> %d\n", c->name, ptr,
-		  entry->context);
-
-	return entry->context;
-};
-
-/**
- * i2o_cntxt_list_remove - Remove a pointer from the context list
- * @c: controller to which the context list belongs
- * @ptr: pointer which should be removed from the context list
- *
- * Removes a previously added pointer from the context list and returns
- * the matching context id.
- *
- * Returns context id on success or 0 on failure.
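
On 64-bit kernels a pointer no longer fits into the 32-bit transaction context, so i2o_cntxt_list_add() above hands out monotonically increasing u32 ids, skipping 0 (which doubles as the failure value), and records each pairing on a list. A minimal userspace sketch of the same idea, without the spinlock and timestamping:

#include <stdint.h>
#include <stdlib.h>

struct demo_ctx {			/* illustrative list element */
	struct demo_ctx *next;
	uint32_t context;
	void *ptr;
};

static struct demo_ctx *demo_list;
static uint32_t demo_counter;

static uint32_t demo_ctx_add(void *ptr)
{
	struct demo_ctx *e = malloc(sizeof(*e));

	if (!e)
		return 0;		/* 0 is the reserved failure id */
	if (++demo_counter == 0)	/* counter wrapped: skip id 0 */
		++demo_counter;
	e->context = demo_counter;
	e->ptr = ptr;
	e->next = demo_list;
	demo_list = e;
	return e->context;
}
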
- */ -u32 i2o_cntxt_list_remove(struct i2o_controller * c, void *ptr) -{ - struct i2o_context_list_element *entry; - u32 context = 0; - unsigned long flags; - - spin_lock_irqsave(&c->context_list_lock, flags); - list_for_each_entry(entry, &c->context_list, list) - if (entry->ptr == ptr) { - list_del(&entry->list); - context = entry->context; - kfree(entry); - break; - } - spin_unlock_irqrestore(&c->context_list_lock, flags); - - if (!context) - osm_warn("%s: Could not remove nonexistent ptr %p\n", c->name, - ptr); - - osm_debug("%s: remove ptr from context list %d -> %p\n", c->name, - context, ptr); - - return context; -}; - -/** - * i2o_cntxt_list_get - Get a pointer from the context list and remove it - * @c: controller to which the context list belong - * @context: context id to which the pointer belong - * - * Returns pointer to the matching context id on success or NULL on - * failure. - */ -void *i2o_cntxt_list_get(struct i2o_controller *c, u32 context) -{ - struct i2o_context_list_element *entry; - unsigned long flags; - void *ptr = NULL; - - spin_lock_irqsave(&c->context_list_lock, flags); - list_for_each_entry(entry, &c->context_list, list) - if (entry->context == context) { - list_del(&entry->list); - ptr = entry->ptr; - kfree(entry); - break; - } - spin_unlock_irqrestore(&c->context_list_lock, flags); - - if (!ptr) - osm_warn("%s: context id %d not found\n", c->name, context); - - osm_debug("%s: get ptr from context list %d -> %p\n", c->name, context, - ptr); - - return ptr; -}; - -/** - * i2o_cntxt_list_get_ptr - Get a context id from the context list - * @c: controller to which the context list belong - * @ptr: pointer to which the context id should be fetched - * - * Returns context id which matches to the pointer on success or 0 on - * failure. - */ -u32 i2o_cntxt_list_get_ptr(struct i2o_controller * c, void *ptr) -{ - struct i2o_context_list_element *entry; - u32 context = 0; - unsigned long flags; - - spin_lock_irqsave(&c->context_list_lock, flags); - list_for_each_entry(entry, &c->context_list, list) - if (entry->ptr == ptr) { - context = entry->context; - break; - } - spin_unlock_irqrestore(&c->context_list_lock, flags); - - if (!context) - osm_warn("%s: Could not find nonexistent ptr %p\n", c->name, - ptr); - - osm_debug("%s: get context id from context list %p -> %d\n", c->name, - ptr, context); - - return context; -}; -#endif - -/** - * i2o_iop_find - Find an I2O controller by id - * @unit: unit number of the I2O controller to search for - * - * Lookup the I2O controller on the controller list. - * - * Returns pointer to the I2O controller on success or NULL if not found. - */ -struct i2o_controller *i2o_find_iop(int unit) -{ - struct i2o_controller *c; - - list_for_each_entry(c, &i2o_controllers, list) { - if (c->unit == unit) - return c; - } - - return NULL; -}; - -/** - * i2o_iop_find_device - Find a I2O device on an I2O controller - * @c: I2O controller where the I2O device hangs on - * @tid: TID of the I2O device to search for - * - * Searches the devices of the I2O controller for a device with TID tid and - * returns it. - * - * Returns a pointer to the I2O device if found, otherwise NULL. - */ -struct i2o_device *i2o_iop_find_device(struct i2o_controller *c, u16 tid) -{ - struct i2o_device *dev; - - list_for_each_entry(dev, &c->devices, list) - if (dev->lct_data.tid == tid) - return dev; - - return NULL; -}; - -/** - * i2o_quiesce_controller - quiesce controller - * @c: controller - * - * Quiesce an IOP. 
Causes IOP to make external operation quiescent - * (i2o 'READY' state). Internal operation of the IOP continues normally. - * - * Returns 0 on success or negative error code on failure. - */ -static int i2o_iop_quiesce(struct i2o_controller *c) -{ - struct i2o_message *msg; - i2o_status_block *sb = c->status_block.virt; - int rc; - - i2o_status_get(c); - - /* SysQuiesce discarded if IOP not in READY or OPERATIONAL state */ - if ((sb->iop_state != ADAPTER_STATE_READY) && - (sb->iop_state != ADAPTER_STATE_OPERATIONAL)) - return 0; - - msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET); - if (IS_ERR(msg)) - return PTR_ERR(msg); - - msg->u.head[0] = cpu_to_le32(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0); - msg->u.head[1] = - cpu_to_le32(I2O_CMD_SYS_QUIESCE << 24 | HOST_TID << 12 | - ADAPTER_TID); - - /* Long timeout needed for quiesce if lots of devices */ - if ((rc = i2o_msg_post_wait(c, msg, 240))) - osm_info("%s: Unable to quiesce (status=%#x).\n", c->name, -rc); - else - osm_debug("%s: Quiesced.\n", c->name); - - i2o_status_get(c); // Entered READY state - - return rc; -}; - -/** - * i2o_iop_enable - move controller from ready to OPERATIONAL - * @c: I2O controller - * - * Enable IOP. This allows the IOP to resume external operations and - * reverses the effect of a quiesce. Returns zero or an error code if - * an error occurs. - */ -static int i2o_iop_enable(struct i2o_controller *c) -{ - struct i2o_message *msg; - i2o_status_block *sb = c->status_block.virt; - int rc; - - i2o_status_get(c); - - /* Enable only allowed on READY state */ - if (sb->iop_state != ADAPTER_STATE_READY) - return -EINVAL; - - msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET); - if (IS_ERR(msg)) - return PTR_ERR(msg); - - msg->u.head[0] = cpu_to_le32(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0); - msg->u.head[1] = - cpu_to_le32(I2O_CMD_SYS_ENABLE << 24 | HOST_TID << 12 | - ADAPTER_TID); - - /* How long of a timeout do we need? */ - if ((rc = i2o_msg_post_wait(c, msg, 240))) - osm_err("%s: Could not enable (status=%#x).\n", c->name, -rc); - else - osm_debug("%s: Enabled.\n", c->name); - - i2o_status_get(c); // entered OPERATIONAL state - - return rc; -}; - -/** - * i2o_iop_quiesce_all - Quiesce all I2O controllers on the system - * - * Quiesce all I2O controllers which are connected to the system. - */ -static inline void i2o_iop_quiesce_all(void) -{ - struct i2o_controller *c, *tmp; - - list_for_each_entry_safe(c, tmp, &i2o_controllers, list) { - if (!c->no_quiesce) - i2o_iop_quiesce(c); - } -}; - -/** - * i2o_iop_enable_all - Enables all controllers on the system - * - * Enables all I2O controllers which are connected to the system. - */ -static inline void i2o_iop_enable_all(void) -{ - struct i2o_controller *c, *tmp; - - list_for_each_entry_safe(c, tmp, &i2o_controllers, list) - i2o_iop_enable(c); -}; - -/** - * i2o_clear_controller - Bring I2O controller into HOLD state - * @c: controller - * - * Clear an IOP to HOLD state, ie. terminate external operations, clear all - * input queues and prepare for a system restart. IOP's internal operation - * continues normally and the outbound queue is alive. The IOP is not - * expected to rebuild its LCT. - * - * Returns 0 on success or negative error code on failure. 
- */ -static int i2o_iop_clear(struct i2o_controller *c) -{ - struct i2o_message *msg; - int rc; - - msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET); - if (IS_ERR(msg)) - return PTR_ERR(msg); - - /* Quiesce all IOPs first */ - i2o_iop_quiesce_all(); - - msg->u.head[0] = cpu_to_le32(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0); - msg->u.head[1] = - cpu_to_le32(I2O_CMD_ADAPTER_CLEAR << 24 | HOST_TID << 12 | - ADAPTER_TID); - - if ((rc = i2o_msg_post_wait(c, msg, 30))) - osm_info("%s: Unable to clear (status=%#x).\n", c->name, -rc); - else - osm_debug("%s: Cleared.\n", c->name); - - /* Enable all IOPs */ - i2o_iop_enable_all(); - - return rc; -} - -/** - * i2o_iop_init_outbound_queue - setup the outbound message queue - * @c: I2O controller - * - * Clear and (re)initialize IOP's outbound queue and post the message - * frames to the IOP. - * - * Returns 0 on success or negative error code on failure. - */ -static int i2o_iop_init_outbound_queue(struct i2o_controller *c) -{ - u32 m; - volatile u8 *status = c->status.virt; - struct i2o_message *msg; - ulong timeout; - int i; - - osm_debug("%s: Initializing Outbound Queue...\n", c->name); - - memset(c->status.virt, 0, 4); - - msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET); - if (IS_ERR(msg)) - return PTR_ERR(msg); - - msg->u.head[0] = cpu_to_le32(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_6); - msg->u.head[1] = - cpu_to_le32(I2O_CMD_OUTBOUND_INIT << 24 | HOST_TID << 12 | - ADAPTER_TID); - msg->u.s.icntxt = cpu_to_le32(i2o_exec_driver.context); - msg->u.s.tcntxt = cpu_to_le32(0x00000000); - msg->body[0] = cpu_to_le32(PAGE_SIZE); - /* Outbound msg frame size in words and Initcode */ - msg->body[1] = cpu_to_le32(I2O_OUTBOUND_MSG_FRAME_SIZE << 16 | 0x80); - msg->body[2] = cpu_to_le32(0xd0000004); - msg->body[3] = cpu_to_le32(i2o_dma_low(c->status.phys)); - msg->body[4] = cpu_to_le32(i2o_dma_high(c->status.phys)); - - i2o_msg_post(c, msg); - - timeout = jiffies + I2O_TIMEOUT_INIT_OUTBOUND_QUEUE * HZ; - while (*status <= I2O_CMD_IN_PROGRESS) { - if (time_after(jiffies, timeout)) { - osm_warn("%s: Timeout Initializing\n", c->name); - return -ETIMEDOUT; - } - schedule_timeout_uninterruptible(1); - } - - m = c->out_queue.phys; - - /* Post frames */ - for (i = 0; i < I2O_MAX_OUTBOUND_MSG_FRAMES; i++) { - i2o_flush_reply(c, m); - udelay(1); /* Promise */ - m += I2O_OUTBOUND_MSG_FRAME_SIZE * sizeof(u32); - } - - return 0; -} - -/** - * i2o_iop_reset - reset an I2O controller - * @c: controller to reset - * - * Reset the IOP into INIT state and wait until IOP gets into RESET state. - * Terminate all external operations, clear IOP's inbound and outbound - * queues, terminate all DDMs, and reload the IOP's operating environment - * and all local DDMs. The IOP rebuilds its LCT. 
- */ -static int i2o_iop_reset(struct i2o_controller *c) -{ - volatile u8 *status = c->status.virt; - struct i2o_message *msg; - unsigned long timeout; - i2o_status_block *sb = c->status_block.virt; - int rc = 0; - - osm_debug("%s: Resetting controller\n", c->name); - - msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET); - if (IS_ERR(msg)) - return PTR_ERR(msg); - - memset(c->status_block.virt, 0, 8); - - /* Quiesce all IOPs first */ - i2o_iop_quiesce_all(); - - msg->u.head[0] = cpu_to_le32(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_0); - msg->u.head[1] = - cpu_to_le32(I2O_CMD_ADAPTER_RESET << 24 | HOST_TID << 12 | - ADAPTER_TID); - msg->u.s.icntxt = cpu_to_le32(i2o_exec_driver.context); - msg->u.s.tcntxt = cpu_to_le32(0x00000000); - msg->body[0] = cpu_to_le32(0x00000000); - msg->body[1] = cpu_to_le32(0x00000000); - msg->body[2] = cpu_to_le32(i2o_dma_low(c->status.phys)); - msg->body[3] = cpu_to_le32(i2o_dma_high(c->status.phys)); - - i2o_msg_post(c, msg); - - /* Wait for a reply */ - timeout = jiffies + I2O_TIMEOUT_RESET * HZ; - while (!*status) { - if (time_after(jiffies, timeout)) - break; - - schedule_timeout_uninterruptible(1); - } - - switch (*status) { - case I2O_CMD_REJECTED: - osm_warn("%s: IOP reset rejected\n", c->name); - rc = -EPERM; - break; - - case I2O_CMD_IN_PROGRESS: - /* - * Once the reset is sent, the IOP goes into the INIT state - * which is indeterminate. We need to wait until the IOP has - * rebooted before we can let the system talk to it. We read - * the inbound Free_List until a message is available. If we - * can't read one in the given amount of time, we assume the - * IOP could not reboot properly. - */ - osm_debug("%s: Reset in progress, waiting for reboot...\n", - c->name); - - while (IS_ERR(msg = i2o_msg_get_wait(c, I2O_TIMEOUT_RESET))) { - if (time_after(jiffies, timeout)) { - osm_err("%s: IOP reset timeout.\n", c->name); - rc = PTR_ERR(msg); - goto exit; - } - schedule_timeout_uninterruptible(1); - } - i2o_msg_nop(c, msg); - - /* from here all quiesce commands are safe */ - c->no_quiesce = 0; - - /* verify if controller is in state RESET */ - i2o_status_get(c); - - if (!c->promise && (sb->iop_state != ADAPTER_STATE_RESET)) - osm_warn("%s: reset completed, but adapter not in RESET" - " state.\n", c->name); - else - osm_debug("%s: reset completed.\n", c->name); - - break; - - default: - osm_err("%s: IOP reset timeout.\n", c->name); - rc = -ETIMEDOUT; - break; - } - - exit: - /* Enable all IOPs */ - i2o_iop_enable_all(); - - return rc; -}; - -/** - * i2o_iop_activate - Bring controller up to HOLD - * @c: controller - * - * This function brings an I2O controller into HOLD state. The adapter - * is reset if necessary and then the queues and resource table are read. - * - * Returns 0 on success or negative error code on failure. - */ -static int i2o_iop_activate(struct i2o_controller *c) -{ - i2o_status_block *sb = c->status_block.virt; - int rc; - int state; - - /* In INIT state, Wait Inbound Q to initialize (in i2o_status_get) */ - /* In READY state, Get status */ - - rc = i2o_status_get(c); - if (rc) { - osm_info("%s: Unable to obtain status, attempting a reset.\n", - c->name); - rc = i2o_iop_reset(c); - if (rc) - return rc; - } - - if (sb->i2o_version > I2OVER15) { - osm_err("%s: Not running version 1.5 of the I2O Specification." 
- "\n", c->name); - return -ENODEV; - } - - switch (sb->iop_state) { - case ADAPTER_STATE_FAULTED: - osm_err("%s: hardware fault\n", c->name); - return -EFAULT; - - case ADAPTER_STATE_READY: - case ADAPTER_STATE_OPERATIONAL: - case ADAPTER_STATE_HOLD: - case ADAPTER_STATE_FAILED: - osm_debug("%s: already running, trying to reset...\n", c->name); - rc = i2o_iop_reset(c); - if (rc) - return rc; - } - - /* preserve state */ - state = sb->iop_state; - - rc = i2o_iop_init_outbound_queue(c); - if (rc) - return rc; - - /* if adapter was not in RESET state clear now */ - if (state != ADAPTER_STATE_RESET) - i2o_iop_clear(c); - - i2o_status_get(c); - - if (sb->iop_state != ADAPTER_STATE_HOLD) { - osm_err("%s: failed to bring IOP into HOLD state\n", c->name); - return -EIO; - } - - return i2o_hrt_get(c); -}; - -static void i2o_res_alloc(struct i2o_controller *c, unsigned long flags) -{ - i2o_status_block *sb = c->status_block.virt; - struct resource *res = &c->mem_resource; - resource_size_t size, align; - int err; - - res->name = c->pdev->bus->name; - res->flags = flags; - res->start = 0; - res->end = 0; - osm_info("%s: requires private memory resources.\n", c->name); - - if (flags & IORESOURCE_MEM) { - size = sb->desired_mem_size; - align = 1 << 20; /* unspecified, use 1Mb and play safe */ - } else { - size = sb->desired_io_size; - align = 1 << 12; /* unspecified, use 4Kb and play safe */ - } - - err = pci_bus_alloc_resource(c->pdev->bus, res, size, align, 0, 0, - NULL, NULL); - if (err < 0) - return; - - if (flags & IORESOURCE_MEM) { - c->mem_alloc = 1; - sb->current_mem_size = resource_size(res); - sb->current_mem_base = res->start; - } else if (flags & IORESOURCE_IO) { - c->io_alloc = 1; - sb->current_io_size = resource_size(res); - sb->current_io_base = res->start; - } - osm_info("%s: allocated PCI space %pR\n", c->name, res); -} - -/** - * i2o_iop_systab_set - Set the I2O System Table of the specified IOP - * @c: I2O controller to which the system table should be send - * - * Before the systab could be set i2o_systab_build() must be called. - * - * Returns 0 on success or negative error code on failure. 
- */
-static int i2o_iop_systab_set(struct i2o_controller *c)
-{
-	struct i2o_message *msg;
-	i2o_status_block *sb = c->status_block.virt;
-	struct device *dev = &c->pdev->dev;
-	int rc;
-
-	if (sb->current_mem_size < sb->desired_mem_size)
-		i2o_res_alloc(c, IORESOURCE_MEM);
-
-	if (sb->current_io_size < sb->desired_io_size)
-		i2o_res_alloc(c, IORESOURCE_IO);
-
-	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
-	if (IS_ERR(msg))
-		return PTR_ERR(msg);
-
-	i2o_systab.phys = dma_map_single(dev, i2o_systab.virt, i2o_systab.len,
-					 PCI_DMA_TODEVICE);
-	if (!i2o_systab.phys) {
-		i2o_msg_nop(c, msg);
-		return -ENOMEM;
-	}
-
-	msg->u.head[0] = cpu_to_le32(I2O_MESSAGE_SIZE(12) | SGL_OFFSET_6);
-	msg->u.head[1] =
-	    cpu_to_le32(I2O_CMD_SYS_TAB_SET << 24 | HOST_TID << 12 |
-			ADAPTER_TID);
-
-	/*
-	 * Provide three SGL-elements:
-	 * System table (SysTab), Private memory space declaration and
-	 * Private i/o space declaration
-	 */
-
-	msg->body[0] = cpu_to_le32(c->unit + 2);
-	msg->body[1] = cpu_to_le32(0x00000000);
-	msg->body[2] = cpu_to_le32(0x54000000 | i2o_systab.len);
-	msg->body[3] = cpu_to_le32(i2o_systab.phys);
-	msg->body[4] = cpu_to_le32(0x54000000 | sb->current_mem_size);
-	msg->body[5] = cpu_to_le32(sb->current_mem_base);
-	msg->body[6] = cpu_to_le32(0xd4000000 | sb->current_io_size);
-	msg->body[7] = cpu_to_le32(sb->current_io_base);
-
-	rc = i2o_msg_post_wait(c, msg, 120);
-
-	dma_unmap_single(dev, i2o_systab.phys, i2o_systab.len,
-			 PCI_DMA_TODEVICE);
-
-	if (rc < 0)
-		osm_err("%s: Unable to set SysTab (status=%#x).\n", c->name,
-			-rc);
-	else
-		osm_debug("%s: SysTab set.\n", c->name);
-
-	return rc;
-}
-
-/**
- * i2o_iop_online - Bring a controller online into OPERATIONAL state.
- * @c: I2O controller
- *
- * Send the system table and enable the I2O controller.
- *
- * Returns 0 on success or negative error code on failure.
- */
-static int i2o_iop_online(struct i2o_controller *c)
-{
-	int rc;
-
-	rc = i2o_iop_systab_set(c);
-	if (rc)
-		return rc;
-
-	/* In READY state */
-	osm_debug("%s: Attempting to enable...\n", c->name);
-	rc = i2o_iop_enable(c);
-	if (rc)
-		return rc;
-
-	return 0;
-};
-
-/**
- * i2o_iop_remove - Remove the I2O controller from the I2O core
- * @c: I2O controller
- *
- * Remove the I2O controller from the I2O core. If devices are attached to
- * the controller remove these also and finally reset the controller.
- */
-void i2o_iop_remove(struct i2o_controller *c)
-{
-	struct i2o_device *dev, *tmp;
-
-	osm_debug("%s: deleting controller\n", c->name);
-
-	i2o_driver_notify_controller_remove_all(c);
-
-	list_del(&c->list);
-
-	list_for_each_entry_safe(dev, tmp, &c->devices, list)
-		i2o_device_remove(dev);
-
-	device_del(&c->device);
-
-	/* Ask the IOP to switch to RESET state */
-	i2o_iop_reset(c);
-}
-
-/**
- * i2o_systab_build - Build system table
- *
- * The system table contains information about all the IOPs in the system
- * (duh) and is used by the Executives on the IOPs to establish peer2peer
- * connections. We're not supporting peer2peer at the moment, but this
- * will be needed down the road for things like lan2lan forwarding.
- *
- * Returns 0 on success or negative error code on failure.
- */
-static int i2o_systab_build(void)
-{
-	struct i2o_controller *c, *tmp;
-	int num_controllers = 0;
-	u32 change_ind = 0;
-	int count = 0;
-	struct i2o_sys_tbl *systab = i2o_systab.virt;
-
-	list_for_each_entry_safe(c, tmp, &i2o_controllers, list)
-		num_controllers++;
-
-	if (systab) {
-		change_ind = systab->change_ind;
-		kfree(i2o_systab.virt);
-	}
-
-	/* Header + IOPs */
-	i2o_systab.len = sizeof(struct i2o_sys_tbl) + num_controllers *
-	    sizeof(struct i2o_sys_tbl_entry);
-
-	systab = i2o_systab.virt = kzalloc(i2o_systab.len, GFP_KERNEL);
-	if (!systab) {
-		osm_err("unable to allocate memory for System Table\n");
-		return -ENOMEM;
-	}
-
-	systab->version = I2OVERSION;
-	systab->change_ind = change_ind + 1;
-
-	list_for_each_entry_safe(c, tmp, &i2o_controllers, list) {
-		i2o_status_block *sb;
-
-		if (count >= num_controllers) {
-			osm_err("controller added while building system table"
-				"\n");
-			break;
-		}
-
-		sb = c->status_block.virt;
-
-		/*
-		 * Get updated IOP state so we have the latest information
-		 *
-		 * We should delete the controller at this point if it
-		 * doesn't respond since if it's not on the system table
-		 * it is technically not part of the I2O subsystem...
-		 */
-		if (unlikely(i2o_status_get(c))) {
-			osm_err("%s: Deleting b/c could not get status while "
-				"attempting to build system table\n", c->name);
-			i2o_iop_remove(c);
-			continue;	// try the next one
-		}
-
-		systab->iops[count].org_id = sb->org_id;
-		systab->iops[count].iop_id = c->unit + 2;
-		systab->iops[count].seg_num = 0;
-		systab->iops[count].i2o_version = sb->i2o_version;
-		systab->iops[count].iop_state = sb->iop_state;
-		systab->iops[count].msg_type = sb->msg_type;
-		systab->iops[count].frame_size = sb->inbound_frame_size;
-		systab->iops[count].last_changed = change_ind;
-		systab->iops[count].iop_capabilities = sb->iop_capabilities;
-		systab->iops[count].inbound_low =
-		    i2o_dma_low(c->base.phys + I2O_IN_PORT);
-		systab->iops[count].inbound_high =
-		    i2o_dma_high(c->base.phys + I2O_IN_PORT);
-
-		count++;
-	}
-
-	systab->num_entries = count;
-
-	return 0;
-};
-
-/**
- * i2o_parse_hrt - Parse the hardware resource table.
- * @c: I2O controller
- *
- * We don't do anything with it except dumping it (in debug mode).
- *
- * Returns 0.
- */
-static int i2o_parse_hrt(struct i2o_controller *c)
-{
-	i2o_dump_hrt(c);
-	return 0;
-};
-
-/**
- * i2o_status_get - Get the status block from the I2O controller
- * @c: I2O controller
- *
- * Issue a status query on the controller. This updates the attached
- * status block. The status block can then be accessed through
- * c->status_block.
- *
- * Returns 0 on success or negative error code on failure.
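
i2o_systab_build() above sizes the table as one header plus one fixed-size entry per controller and then fills an entry from each controller's status block. The sizing arithmetic in isolation, with illustrative struct layouts standing in for i2o_sys_tbl and i2o_sys_tbl_entry:

#include <stddef.h>
#include <stdlib.h>

struct demo_tbl_hdr { unsigned num_entries, change_ind; };	/* illustrative */
struct demo_tbl_entry { unsigned iop_id, iop_state; };		/* illustrative */

static void *demo_systab_alloc(int num_controllers, size_t *len)
{
	/* header + one entry per IOP, zero-initialized like kzalloc() */
	*len = sizeof(struct demo_tbl_hdr) +
	       num_controllers * sizeof(struct demo_tbl_entry);
	return calloc(1, *len);
}
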
- */ -int i2o_status_get(struct i2o_controller *c) -{ - struct i2o_message *msg; - volatile u8 *status_block; - unsigned long timeout; - - status_block = (u8 *) c->status_block.virt; - memset(c->status_block.virt, 0, sizeof(i2o_status_block)); - - msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET); - if (IS_ERR(msg)) - return PTR_ERR(msg); - - msg->u.head[0] = cpu_to_le32(NINE_WORD_MSG_SIZE | SGL_OFFSET_0); - msg->u.head[1] = - cpu_to_le32(I2O_CMD_STATUS_GET << 24 | HOST_TID << 12 | - ADAPTER_TID); - msg->u.s.icntxt = cpu_to_le32(i2o_exec_driver.context); - msg->u.s.tcntxt = cpu_to_le32(0x00000000); - msg->body[0] = cpu_to_le32(0x00000000); - msg->body[1] = cpu_to_le32(0x00000000); - msg->body[2] = cpu_to_le32(i2o_dma_low(c->status_block.phys)); - msg->body[3] = cpu_to_le32(i2o_dma_high(c->status_block.phys)); - msg->body[4] = cpu_to_le32(sizeof(i2o_status_block)); /* always 88 bytes */ - - i2o_msg_post(c, msg); - - /* Wait for a reply */ - timeout = jiffies + I2O_TIMEOUT_STATUS_GET * HZ; - while (status_block[87] != 0xFF) { - if (time_after(jiffies, timeout)) { - osm_err("%s: Get status timeout.\n", c->name); - return -ETIMEDOUT; - } - - schedule_timeout_uninterruptible(1); - } - -#ifdef DEBUG - i2o_debug_state(c); -#endif - - return 0; -} - -/* - * i2o_hrt_get - Get the Hardware Resource Table from the I2O controller - * @c: I2O controller from which the HRT should be fetched - * - * The HRT contains information about possible hidden devices but is - * mostly useless to us. - * - * Returns 0 on success or negative error code on failure. - */ -static int i2o_hrt_get(struct i2o_controller *c) -{ - int rc; - int i; - i2o_hrt *hrt = c->hrt.virt; - u32 size = sizeof(i2o_hrt); - struct device *dev = &c->pdev->dev; - - for (i = 0; i < I2O_HRT_GET_TRIES; i++) { - struct i2o_message *msg; - - msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET); - if (IS_ERR(msg)) - return PTR_ERR(msg); - - msg->u.head[0] = cpu_to_le32(SIX_WORD_MSG_SIZE | SGL_OFFSET_4); - msg->u.head[1] = - cpu_to_le32(I2O_CMD_HRT_GET << 24 | HOST_TID << 12 | - ADAPTER_TID); - msg->body[0] = cpu_to_le32(0xd0000000 | c->hrt.len); - msg->body[1] = cpu_to_le32(c->hrt.phys); - - rc = i2o_msg_post_wait_mem(c, msg, 20, &c->hrt); - - if (rc < 0) { - osm_err("%s: Unable to get HRT (status=%#x)\n", c->name, - -rc); - return rc; - } - - size = hrt->num_entries * hrt->entry_len << 2; - if (size > c->hrt.len) { - if (i2o_dma_realloc(dev, &c->hrt, size)) - return -ENOMEM; - else - hrt = c->hrt.virt; - } else - return i2o_parse_hrt(c); - } - - osm_err("%s: Unable to get HRT after %d tries, giving up\n", c->name, - I2O_HRT_GET_TRIES); - - return -EBUSY; -} - -/** - * i2o_iop_release - release the memory for a I2O controller - * @dev: I2O controller which should be released - * - * Release the allocated memory. This function is called if refcount of - * device reaches 0 automatically. - */ -static void i2o_iop_release(struct device *dev) -{ - struct i2o_controller *c = to_i2o_controller(dev); - - i2o_iop_free(c); -}; - -/** - * i2o_iop_alloc - Allocate and initialize a i2o_controller struct - * - * Allocate the necessary memory for a i2o_controller struct and - * initialize the lists and message mempool. - * - * Returns a pointer to the I2O controller or a negative error code on - * failure. 
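
i2o_status_get() above relies on the IOP DMA-writing the 88-byte status block and polls the final byte until it flips to 0xFF or the deadline passes. The loop shape, reduced to a hedged userspace sketch with time() standing in for jiffies:

#include <stdint.h>
#include <time.h>

/* Illustrative only: wait until *flag reads 0xFF or `seconds` elapse. */
static int demo_wait_ready(volatile uint8_t *flag, int seconds)
{
	time_t deadline = time(NULL) + seconds;

	while (*flag != 0xFF) {
		if (time(NULL) > deadline)
			return -1;	/* -ETIMEDOUT in the kernel version */
		/* the kernel sleeps for a jiffy here instead of spinning */
	}
	return 0;
}
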
- */ -struct i2o_controller *i2o_iop_alloc(void) -{ - static int unit = 0; /* 0 and 1 are NULL IOP and Local Host */ - struct i2o_controller *c; - char poolname[32]; - - c = kzalloc(sizeof(*c), GFP_KERNEL); - if (!c) { - osm_err("i2o: Insufficient memory to allocate a I2O controller." - "\n"); - return ERR_PTR(-ENOMEM); - } - - c->unit = unit++; - sprintf(c->name, "iop%d", c->unit); - - snprintf(poolname, sizeof(poolname), "i2o_%s_msg_inpool", c->name); - if (i2o_pool_alloc - (&c->in_msg, poolname, I2O_INBOUND_MSG_FRAME_SIZE * 4 + sizeof(u32), - I2O_MSG_INPOOL_MIN)) { - kfree(c); - return ERR_PTR(-ENOMEM); - }; - - INIT_LIST_HEAD(&c->devices); - spin_lock_init(&c->lock); - mutex_init(&c->lct_lock); - - device_initialize(&c->device); - - c->device.release = &i2o_iop_release; - - dev_set_name(&c->device, "iop%d", c->unit); - -#if BITS_PER_LONG == 64 - spin_lock_init(&c->context_list_lock); - atomic_set(&c->context_list_counter, 0); - INIT_LIST_HEAD(&c->context_list); -#endif - - return c; -}; - -/** - * i2o_iop_add - Initialize the I2O controller and add him to the I2O core - * @c: controller - * - * Initialize the I2O controller and if no error occurs add him to the I2O - * core. - * - * Returns 0 on success or negative error code on failure. - */ -int i2o_iop_add(struct i2o_controller *c) -{ - int rc; - - if ((rc = device_add(&c->device))) { - osm_err("%s: could not add controller\n", c->name); - goto iop_reset; - } - - osm_info("%s: Activating I2O controller...\n", c->name); - osm_info("%s: This may take a few minutes if there are many devices\n", - c->name); - - if ((rc = i2o_iop_activate(c))) { - osm_err("%s: could not activate controller\n", c->name); - goto device_del; - } - - osm_debug("%s: building sys table...\n", c->name); - - if ((rc = i2o_systab_build())) - goto device_del; - - osm_debug("%s: online controller...\n", c->name); - - if ((rc = i2o_iop_online(c))) - goto device_del; - - osm_debug("%s: getting LCT...\n", c->name); - - if ((rc = i2o_exec_lct_get(c))) - goto device_del; - - list_add(&c->list, &i2o_controllers); - - i2o_driver_notify_controller_add_all(c); - - osm_info("%s: Controller added\n", c->name); - - return 0; - - device_del: - device_del(&c->device); - - iop_reset: - i2o_iop_reset(c); - - return rc; -}; - -/** - * i2o_event_register - Turn on/off event notification for a I2O device - * @dev: I2O device which should receive the event registration request - * @drv: driver which want to get notified - * @tcntxt: transaction context to use with this notifier - * @evt_mask: mask of events - * - * Create and posts an event registration message to the task. No reply - * is waited for, or expected. If you do not want further notifications, - * call the i2o_event_register again with a evt_mask of 0. - * - * Returns 0 on success or negative error code on failure. 
- */ -int i2o_event_register(struct i2o_device *dev, struct i2o_driver *drv, - int tcntxt, u32 evt_mask) -{ - struct i2o_controller *c = dev->iop; - struct i2o_message *msg; - - msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET); - if (IS_ERR(msg)) - return PTR_ERR(msg); - - msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0); - msg->u.head[1] = - cpu_to_le32(I2O_CMD_UTIL_EVT_REGISTER << 24 | HOST_TID << 12 | dev-> - lct_data.tid); - msg->u.s.icntxt = cpu_to_le32(drv->context); - msg->u.s.tcntxt = cpu_to_le32(tcntxt); - msg->body[0] = cpu_to_le32(evt_mask); - - i2o_msg_post(c, msg); - - return 0; -}; - -/** - * i2o_iop_init - I2O main initialization function - * - * Initialize the I2O drivers (OSM) functions, register the Executive OSM, - * initialize the I2O PCI part and finally initialize I2O device stuff. - * - * Returns 0 on success or negative error code on failure. - */ -static int __init i2o_iop_init(void) -{ - int rc = 0; - - printk(KERN_INFO OSM_DESCRIPTION " v" OSM_VERSION "\n"); - - if ((rc = i2o_driver_init())) - goto exit; - - if ((rc = i2o_exec_init())) - goto driver_exit; - - if ((rc = i2o_pci_init())) - goto exec_exit; - - return 0; - - exec_exit: - i2o_exec_exit(); - - driver_exit: - i2o_driver_exit(); - - exit: - return rc; -} - -/** - * i2o_iop_exit - I2O main exit function - * - * Removes I2O controllers from PCI subsystem and shut down OSMs. - */ -static void __exit i2o_iop_exit(void) -{ - i2o_pci_exit(); - i2o_exec_exit(); - i2o_driver_exit(); -}; - -module_init(i2o_iop_init); -module_exit(i2o_iop_exit); - -MODULE_AUTHOR("Red Hat Software"); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION(OSM_DESCRIPTION); -MODULE_VERSION(OSM_VERSION); - -#if BITS_PER_LONG == 64 -EXPORT_SYMBOL(i2o_cntxt_list_add); -EXPORT_SYMBOL(i2o_cntxt_list_get); -EXPORT_SYMBOL(i2o_cntxt_list_remove); -EXPORT_SYMBOL(i2o_cntxt_list_get_ptr); -#endif -EXPORT_SYMBOL(i2o_msg_get_wait); -EXPORT_SYMBOL(i2o_find_iop); -EXPORT_SYMBOL(i2o_iop_find_device); -EXPORT_SYMBOL(i2o_event_register); -EXPORT_SYMBOL(i2o_status_get); -EXPORT_SYMBOL(i2o_controllers); diff --git a/drivers/message/i2o/memory.c b/drivers/message/i2o/memory.c deleted file mode 100644 index 292b41e49fbd..000000000000 --- a/drivers/message/i2o/memory.c +++ /dev/null @@ -1,313 +0,0 @@ -/* - * Functions to handle I2O memory - * - * Pulled from the inlines in i2o headers and uninlined - * - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - */ - -#include -#include -#include -#include -#include -#include "core.h" - -/* Protects our 32/64bit mask switching */ -static DEFINE_MUTEX(mem_lock); - -/** - * i2o_sg_tablesize - Calculate the maximum number of elements in a SGL - * @c: I2O controller for which the calculation should be done - * @body_size: maximum body size used for message in 32-bit words. - * - * Return the maximum number of SG elements in a SG list. - */ -u16 i2o_sg_tablesize(struct i2o_controller *c, u16 body_size) -{ - i2o_status_block *sb = c->status_block.virt; - u16 sg_count = - (sb->inbound_frame_size - sizeof(struct i2o_message) / 4) - - body_size; - - if (c->pae_support) { - /* - * for 64-bit a SG attribute element must be added and each - * SG element needs 12 bytes instead of 8. 
- */ - sg_count -= 2; - sg_count /= 3; - } else - sg_count /= 2; - - if (c->short_req && (sg_count > 8)) - sg_count = 8; - - return sg_count; -} -EXPORT_SYMBOL_GPL(i2o_sg_tablesize); - - -/** - * i2o_dma_map_single - Map pointer to controller and fill in I2O message. - * @c: I2O controller - * @ptr: pointer to the data which should be mapped - * @size: size of data in bytes - * @direction: DMA_TO_DEVICE / DMA_FROM_DEVICE - * @sg_ptr: pointer to the SG list inside the I2O message - * - * This function does all necessary DMA handling and also writes the I2O - * SGL elements into the I2O message. For details on DMA handling see also - * dma_map_single(). The pointer sg_ptr will only be set to the end of the - * SG list if the allocation was successful. - * - * Returns DMA address which must be checked for failures using - * dma_mapping_error(). - */ -dma_addr_t i2o_dma_map_single(struct i2o_controller *c, void *ptr, - size_t size, - enum dma_data_direction direction, - u32 ** sg_ptr) -{ - u32 sg_flags; - u32 *mptr = *sg_ptr; - dma_addr_t dma_addr; - - switch (direction) { - case DMA_TO_DEVICE: - sg_flags = 0xd4000000; - break; - case DMA_FROM_DEVICE: - sg_flags = 0xd0000000; - break; - default: - return 0; - } - - dma_addr = dma_map_single(&c->pdev->dev, ptr, size, direction); - if (!dma_mapping_error(&c->pdev->dev, dma_addr)) { -#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64 - if ((sizeof(dma_addr_t) > 4) && c->pae_support) { - *mptr++ = cpu_to_le32(0x7C020002); - *mptr++ = cpu_to_le32(PAGE_SIZE); - } -#endif - - *mptr++ = cpu_to_le32(sg_flags | size); - *mptr++ = cpu_to_le32(i2o_dma_low(dma_addr)); -#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64 - if ((sizeof(dma_addr_t) > 4) && c->pae_support) - *mptr++ = cpu_to_le32(i2o_dma_high(dma_addr)); -#endif - *sg_ptr = mptr; - } - return dma_addr; -} -EXPORT_SYMBOL_GPL(i2o_dma_map_single); - -/** - * i2o_dma_map_sg - Map a SG List to controller and fill in I2O message. - * @c: I2O controller - * @sg: SG list to be mapped - * @sg_count: number of elements in the SG list - * @direction: DMA_TO_DEVICE / DMA_FROM_DEVICE - * @sg_ptr: pointer to the SG list inside the I2O message - * - * This function does all necessary DMA handling and also writes the I2O - * SGL elements into the I2O message. For details on DMA handling see also - * dma_map_sg(). The pointer sg_ptr will only be set to the end of the SG - * list if the allocation was successful. - * - * Returns 0 on failure or 1 on success. 
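
i2o_dma_map_sg(), whose body follows, emits two 32-bit words per element (the flags ORed with the length, then the low address word), adds a high address word only when 64-bit PAE addressing is enabled, and marks the final element by ORing 0xC0000000 into its flags. A simplified sketch of that packing for the 32-bit case, with hypothetical length/address arrays in place of a real scatterlist:

#include <stdint.h>

static uint32_t *demo_pack_sgl(uint32_t *mptr, const uint32_t *lens,
			       const uint32_t *addrs, int n, uint32_t flags)
{
	int i;

	for (i = 0; i < n; i++) {
		uint32_t f = flags;

		if (i == n - 1)
			f |= 0xC0000000;	/* end-of-list marker */
		*mptr++ = f | lens[i];		/* flags share a word with length */
		*mptr++ = addrs[i];		/* 32-bit DMA address */
	}
	return mptr;	/* caller continues building the message here */
}
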
- */ -int i2o_dma_map_sg(struct i2o_controller *c, struct scatterlist *sg, - int sg_count, enum dma_data_direction direction, u32 ** sg_ptr) -{ - u32 sg_flags; - u32 *mptr = *sg_ptr; - - switch (direction) { - case DMA_TO_DEVICE: - sg_flags = 0x14000000; - break; - case DMA_FROM_DEVICE: - sg_flags = 0x10000000; - break; - default: - return 0; - } - - sg_count = dma_map_sg(&c->pdev->dev, sg, sg_count, direction); - if (!sg_count) - return 0; - -#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64 - if ((sizeof(dma_addr_t) > 4) && c->pae_support) { - *mptr++ = cpu_to_le32(0x7C020002); - *mptr++ = cpu_to_le32(PAGE_SIZE); - } -#endif - - while (sg_count-- > 0) { - if (!sg_count) - sg_flags |= 0xC0000000; - *mptr++ = cpu_to_le32(sg_flags | sg_dma_len(sg)); - *mptr++ = cpu_to_le32(i2o_dma_low(sg_dma_address(sg))); -#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64 - if ((sizeof(dma_addr_t) > 4) && c->pae_support) - *mptr++ = cpu_to_le32(i2o_dma_high(sg_dma_address(sg))); -#endif - sg = sg_next(sg); - } - *sg_ptr = mptr; - - return 1; -} -EXPORT_SYMBOL_GPL(i2o_dma_map_sg); - -/** - * i2o_dma_alloc - Allocate DMA memory - * @dev: struct device pointer to the PCI device of the I2O controller - * @addr: i2o_dma struct which should get the DMA buffer - * @len: length of the new DMA memory - * - * Allocate a coherent DMA memory and write the pointers into addr. - * - * Returns 0 on success or -ENOMEM on failure. - */ -int i2o_dma_alloc(struct device *dev, struct i2o_dma *addr, size_t len) -{ - struct pci_dev *pdev = to_pci_dev(dev); - int dma_64 = 0; - - mutex_lock(&mem_lock); - if ((sizeof(dma_addr_t) > 4) && (pdev->dma_mask == DMA_BIT_MASK(64))) { - dma_64 = 1; - if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) { - mutex_unlock(&mem_lock); - return -ENOMEM; - } - } - - addr->virt = dma_alloc_coherent(dev, len, &addr->phys, GFP_KERNEL); - - if ((sizeof(dma_addr_t) > 4) && dma_64) - if (pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) - printk(KERN_WARNING "i2o: unable to set 64-bit DMA"); - mutex_unlock(&mem_lock); - - if (!addr->virt) - return -ENOMEM; - - memset(addr->virt, 0, len); - addr->len = len; - - return 0; -} -EXPORT_SYMBOL_GPL(i2o_dma_alloc); - - -/** - * i2o_dma_free - Free DMA memory - * @dev: struct device pointer to the PCI device of the I2O controller - * @addr: i2o_dma struct which contains the DMA buffer - * - * Free a coherent DMA memory and set virtual address of addr to NULL. - */ -void i2o_dma_free(struct device *dev, struct i2o_dma *addr) -{ - if (addr->virt) { - if (addr->phys) - dma_free_coherent(dev, addr->len, addr->virt, - addr->phys); - else - kfree(addr->virt); - addr->virt = NULL; - } -} -EXPORT_SYMBOL_GPL(i2o_dma_free); - - -/** - * i2o_dma_realloc - Realloc DMA memory - * @dev: struct device pointer to the PCI device of the I2O controller - * @addr: pointer to a i2o_dma struct DMA buffer - * @len: new length of memory - * - * If there was something allocated in the addr, free it first. If len > 0 - * than try to allocate it and write the addresses back to the addr - * structure. If len == 0 set the virtual address to NULL. - * - * Returns the 0 on success or negative error code on failure. - */ -int i2o_dma_realloc(struct device *dev, struct i2o_dma *addr, size_t len) -{ - i2o_dma_free(dev, addr); - - if (len) - return i2o_dma_alloc(dev, addr, len); - - return 0; -} -EXPORT_SYMBOL_GPL(i2o_dma_realloc); - -/* - * i2o_pool_alloc - Allocate an slab cache and mempool - * @mempool: pointer to struct i2o_pool to write data into. 
- * @name: name which is used to identify cache - * @size: size of each object - * @min_nr: minimum number of objects - * - * First allocates a slab cache with name and size. Then allocates a - * mempool which uses the slab cache for allocation and freeing. - * - * Returns 0 on success or negative error code on failure. - */ -int i2o_pool_alloc(struct i2o_pool *pool, const char *name, - size_t size, int min_nr) -{ - pool->name = kmalloc(strlen(name) + 1, GFP_KERNEL); - if (!pool->name) - goto exit; - strcpy(pool->name, name); - - pool->slab = - kmem_cache_create(pool->name, size, 0, SLAB_HWCACHE_ALIGN, NULL); - if (!pool->slab) - goto free_name; - - pool->mempool = mempool_create_slab_pool(min_nr, pool->slab); - if (!pool->mempool) - goto free_slab; - - return 0; - -free_slab: - kmem_cache_destroy(pool->slab); - -free_name: - kfree(pool->name); - -exit: - return -ENOMEM; -} -EXPORT_SYMBOL_GPL(i2o_pool_alloc); - -/* - * i2o_pool_free - Free slab cache and mempool again - * @mempool: pointer to struct i2o_pool which should be freed - * - * Note that you have to return all objects to the mempool again before - * calling i2o_pool_free(). - */ -void i2o_pool_free(struct i2o_pool *pool) -{ - mempool_destroy(pool->mempool); - kmem_cache_destroy(pool->slab); - kfree(pool->name); -}; -EXPORT_SYMBOL_GPL(i2o_pool_free); diff --git a/drivers/message/i2o/pci.c b/drivers/message/i2o/pci.c deleted file mode 100644 index 0f9f3e1a2b6b..000000000000 --- a/drivers/message/i2o/pci.c +++ /dev/null @@ -1,497 +0,0 @@ -/* - * PCI handling of I2O controller - * - * Copyright (C) 1999-2002 Red Hat Software - * - * Written by Alan Cox, Building Number Three Ltd - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * A lot of the I2O message side code from this is taken from the Red - * Creek RCPCI45 adapter driver by Red Creek Communications - * - * Fixes/additions: - * Philipp Rumpf - * Juha Sievänen - * Auvo Häkkinen - * Deepak Saxena - * Boji T Kannanthanam - * Alan Cox : - * Ported to Linux 2.5. - * Markus Lidel : - * Minor fixes for 2.6. - * Markus Lidel : - * Support for sysfs included. - */ - -#include -#include -#include -#include -#include -#include "core.h" - -#define OSM_DESCRIPTION "I2O-subsystem" - -/* PCI device id table for all I2O controllers */ -static struct pci_device_id i2o_pci_ids[] = { - {PCI_DEVICE_CLASS(PCI_CLASS_INTELLIGENT_I2O << 8, 0xffff00)}, - {PCI_DEVICE(PCI_VENDOR_ID_DPT, 0xa511)}, - {.vendor = PCI_VENDOR_ID_INTEL,.device = 0x1962, - .subvendor = PCI_VENDOR_ID_PROMISE,.subdevice = PCI_ANY_ID}, - {0} -}; - -/** - * i2o_pci_free - Frees the DMA memory for the I2O controller - * @c: I2O controller to free - * - * Remove all allocated DMA memory and unmap memory IO regions. If MTRR - * is enabled, also remove it again. 
- */ -static void i2o_pci_free(struct i2o_controller *c) -{ - struct device *dev; - - dev = &c->pdev->dev; - - i2o_dma_free(dev, &c->out_queue); - i2o_dma_free(dev, &c->status_block); - kfree(c->lct); - i2o_dma_free(dev, &c->dlct); - i2o_dma_free(dev, &c->hrt); - i2o_dma_free(dev, &c->status); - - if (c->raptor && c->in_queue.virt) - iounmap(c->in_queue.virt); - - if (c->base.virt) - iounmap(c->base.virt); - - pci_release_regions(c->pdev); -} - -/** - * i2o_pci_alloc - Allocate DMA memory, map IO memory for I2O controller - * @c: I2O controller - * - * Allocate DMA memory for a PCI (or in theory AGP) I2O controller. All - * IO mappings are also done here. If MTRR is enabled, also do add memory - * regions here. - * - * Returns 0 on success or negative error code on failure. - */ -static int i2o_pci_alloc(struct i2o_controller *c) -{ - struct pci_dev *pdev = c->pdev; - struct device *dev = &pdev->dev; - int i; - - if (pci_request_regions(pdev, OSM_DESCRIPTION)) { - printk(KERN_ERR "%s: device already claimed\n", c->name); - return -ENODEV; - } - - for (i = 0; i < 6; i++) { - /* Skip I/O spaces */ - if (!(pci_resource_flags(pdev, i) & IORESOURCE_IO)) { - if (!c->base.phys) { - c->base.phys = pci_resource_start(pdev, i); - c->base.len = pci_resource_len(pdev, i); - - /* - * If we know what card it is, set the size - * correctly. Code is taken from dpt_i2o.c - */ - if (pdev->device == 0xa501) { - if (pdev->subsystem_device >= 0xc032 && - pdev->subsystem_device <= 0xc03b) { - if (c->base.len > 0x400000) - c->base.len = 0x400000; - } else { - if (c->base.len > 0x100000) - c->base.len = 0x100000; - } - } - if (!c->raptor) - break; - } else { - c->in_queue.phys = pci_resource_start(pdev, i); - c->in_queue.len = pci_resource_len(pdev, i); - break; - } - } - } - - if (i == 6) { - printk(KERN_ERR "%s: I2O controller has no memory regions" - " defined.\n", c->name); - i2o_pci_free(c); - return -EINVAL; - } - - /* Map the I2O controller */ - if (c->raptor) { - printk(KERN_INFO "%s: PCI I2O controller\n", c->name); - printk(KERN_INFO " BAR0 at 0x%08lX size=%ld\n", - (unsigned long)c->base.phys, (unsigned long)c->base.len); - printk(KERN_INFO " BAR1 at 0x%08lX size=%ld\n", - (unsigned long)c->in_queue.phys, - (unsigned long)c->in_queue.len); - } else - printk(KERN_INFO "%s: PCI I2O controller at %08lX size=%ld\n", - c->name, (unsigned long)c->base.phys, - (unsigned long)c->base.len); - - c->base.virt = ioremap_nocache(c->base.phys, c->base.len); - if (!c->base.virt) { - printk(KERN_ERR "%s: Unable to map controller.\n", c->name); - i2o_pci_free(c); - return -ENOMEM; - } - - if (c->raptor) { - c->in_queue.virt = - ioremap_nocache(c->in_queue.phys, c->in_queue.len); - if (!c->in_queue.virt) { - printk(KERN_ERR "%s: Unable to map controller.\n", - c->name); - i2o_pci_free(c); - return -ENOMEM; - } - } else - c->in_queue = c->base; - - c->irq_status = c->base.virt + I2O_IRQ_STATUS; - c->irq_mask = c->base.virt + I2O_IRQ_MASK; - c->in_port = c->base.virt + I2O_IN_PORT; - c->out_port = c->base.virt + I2O_OUT_PORT; - - /* Motorola/Freescale chip does not follow spec */ - if (pdev->vendor == PCI_VENDOR_ID_MOTOROLA && pdev->device == 0x18c0) { - /* Check if CPU is enabled */ - if (be32_to_cpu(readl(c->base.virt + 0x10000)) & 0x10000000) { - printk(KERN_INFO "%s: MPC82XX needs CPU running to " - "service I2O.\n", c->name); - i2o_pci_free(c); - return -ENODEV; - } else { - c->irq_status += I2O_MOTOROLA_PORT_OFFSET; - c->irq_mask += I2O_MOTOROLA_PORT_OFFSET; - c->in_port += I2O_MOTOROLA_PORT_OFFSET; - c->out_port += 
I2O_MOTOROLA_PORT_OFFSET; - printk(KERN_INFO "%s: MPC82XX workarounds activated.\n", - c->name); - } - } - - if (i2o_dma_alloc(dev, &c->status, 8)) { - i2o_pci_free(c); - return -ENOMEM; - } - - if (i2o_dma_alloc(dev, &c->hrt, sizeof(i2o_hrt))) { - i2o_pci_free(c); - return -ENOMEM; - } - - if (i2o_dma_alloc(dev, &c->dlct, 8192)) { - i2o_pci_free(c); - return -ENOMEM; - } - - if (i2o_dma_alloc(dev, &c->status_block, sizeof(i2o_status_block))) { - i2o_pci_free(c); - return -ENOMEM; - } - - if (i2o_dma_alloc(dev, &c->out_queue, - I2O_MAX_OUTBOUND_MSG_FRAMES * I2O_OUTBOUND_MSG_FRAME_SIZE * - sizeof(u32))) { - i2o_pci_free(c); - return -ENOMEM; - } - - pci_set_drvdata(pdev, c); - - return 0; -} - -/** - * i2o_pci_interrupt - Interrupt handler for I2O controller - * @irq: interrupt line - * @dev_id: pointer to the I2O controller - * - * Handle an interrupt from a PCI based I2O controller. This turns out - * to be rather simple. We keep the controller pointer in the cookie. - */ -static irqreturn_t i2o_pci_interrupt(int irq, void *dev_id) -{ - struct i2o_controller *c = dev_id; - u32 m; - irqreturn_t rc = IRQ_NONE; - - while (readl(c->irq_status) & I2O_IRQ_OUTBOUND_POST) { - m = readl(c->out_port); - if (m == I2O_QUEUE_EMPTY) { - /* - * Old 960 steppings had a bug in the I2O unit that - * caused the queue to appear empty when it wasn't. - */ - m = readl(c->out_port); - if (unlikely(m == I2O_QUEUE_EMPTY)) - break; - } - - /* dispatch it */ - if (i2o_driver_dispatch(c, m)) - /* flush it if result != 0 */ - i2o_flush_reply(c, m); - - rc = IRQ_HANDLED; - } - - return rc; -} - -/** - * i2o_pci_irq_enable - Allocate interrupt for I2O controller - * @c: i2o_controller that the request is for - * - * Allocate an interrupt for the I2O controller, and activate interrupts - * on the I2O controller. - * - * Returns 0 on success or negative error code on failure. - */ -static int i2o_pci_irq_enable(struct i2o_controller *c) -{ - struct pci_dev *pdev = c->pdev; - int rc; - - writel(0xffffffff, c->irq_mask); - - if (pdev->irq) { - rc = request_irq(pdev->irq, i2o_pci_interrupt, IRQF_SHARED, - c->name, c); - if (rc < 0) { - printk(KERN_ERR "%s: unable to allocate interrupt %d." - "\n", c->name, pdev->irq); - return rc; - } - } - - writel(0x00000000, c->irq_mask); - - printk(KERN_INFO "%s: Installed at IRQ %d\n", c->name, pdev->irq); - - return 0; -} - -/** - * i2o_pci_irq_disable - Free interrupt for I2O controller - * @c: I2O controller - * - * Disable interrupts in I2O controller and then free interrupt. - */ -static void i2o_pci_irq_disable(struct i2o_controller *c) -{ - writel(0xffffffff, c->irq_mask); - - if (c->pdev->irq > 0) - free_irq(c->pdev->irq, c); -} - -/** - * i2o_pci_probe - Probe the PCI device for an I2O controller - * @pdev: PCI device to test - * @id: id which matched with the PCI device id table - * - * Probe the PCI device for any device which is a memory of the - * Intelligent, I2O class or an Adaptec Zero Channel Controller. We - * attempt to set up each such device and register it with the core. - * - * Returns 0 on success or negative error code on failure. 
- */ -static int i2o_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) -{ - struct i2o_controller *c; - int rc; - struct pci_dev *i960 = NULL; - - printk(KERN_INFO "i2o: Checking for PCI I2O controllers...\n"); - - if ((pdev->class & 0xff) > 1) { - printk(KERN_WARNING "i2o: %s does not support I2O 1.5 " - "(skipping).\n", pci_name(pdev)); - return -ENODEV; - } - - if ((rc = pci_enable_device(pdev))) { - printk(KERN_WARNING "i2o: couldn't enable device %s\n", - pci_name(pdev)); - return rc; - } - - if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) { - printk(KERN_WARNING "i2o: no suitable DMA found for %s\n", - pci_name(pdev)); - rc = -ENODEV; - goto disable; - } - - pci_set_master(pdev); - - c = i2o_iop_alloc(); - if (IS_ERR(c)) { - printk(KERN_ERR "i2o: couldn't allocate memory for %s\n", - pci_name(pdev)); - rc = PTR_ERR(c); - goto disable; - } else - printk(KERN_INFO "%s: controller found (%s)\n", c->name, - pci_name(pdev)); - - c->pdev = pdev; - c->device.parent = &pdev->dev; - - /* Cards that fall apart if you hit them with large I/O loads... */ - if (pdev->vendor == PCI_VENDOR_ID_NCR && pdev->device == 0x0630) { - c->short_req = 1; - printk(KERN_INFO "%s: Symbios FC920 workarounds activated.\n", - c->name); - } - - if (pdev->subsystem_vendor == PCI_VENDOR_ID_PROMISE) { - /* - * Expose the ship behind i960 for initialization, or it will - * failed - */ - i960 = pci_get_slot(c->pdev->bus, - PCI_DEVFN(PCI_SLOT(c->pdev->devfn), 0)); - - if (i960) { - pci_write_config_word(i960, 0x42, 0); - pci_dev_put(i960); - } - - c->promise = 1; - c->limit_sectors = 1; - } - - if (pdev->subsystem_vendor == PCI_VENDOR_ID_DPT) - c->adaptec = 1; - - /* Cards that go bananas if you quiesce them before you reset them. */ - if (pdev->vendor == PCI_VENDOR_ID_DPT) { - c->no_quiesce = 1; - if (pdev->device == 0xa511) - c->raptor = 1; - - if (pdev->subsystem_device == 0xc05a) { - c->limit_sectors = 1; - printk(KERN_INFO - "%s: limit sectors per request to %d\n", c->name, - I2O_MAX_SECTORS_LIMITED); - } -#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64 - if (sizeof(dma_addr_t) > 4) { - if (pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) - printk(KERN_INFO "%s: 64-bit DMA unavailable\n", - c->name); - else { - c->pae_support = 1; - printk(KERN_INFO "%s: using 64-bit DMA\n", - c->name); - } - } -#endif - } - - if ((rc = i2o_pci_alloc(c))) { - printk(KERN_ERR "%s: DMA / IO allocation for I2O controller " - "failed\n", c->name); - goto free_controller; - } - - if (i2o_pci_irq_enable(c)) { - printk(KERN_ERR "%s: unable to enable interrupts for I2O " - "controller\n", c->name); - goto free_pci; - } - - if ((rc = i2o_iop_add(c))) - goto uninstall; - - if (i960) - pci_write_config_word(i960, 0x42, 0x03ff); - - return 0; - - uninstall: - i2o_pci_irq_disable(c); - - free_pci: - i2o_pci_free(c); - - free_controller: - i2o_iop_free(c); - - disable: - pci_disable_device(pdev); - - return rc; -} - -/** - * i2o_pci_remove - Removes a I2O controller from the system - * @pdev: I2O controller which should be removed - * - * Reset the I2O controller, disable interrupts and remove all allocated - * resources. 
- */
-static void i2o_pci_remove(struct pci_dev *pdev)
-{
-	struct i2o_controller *c;
-	c = pci_get_drvdata(pdev);
-
-	i2o_iop_remove(c);
-	i2o_pci_irq_disable(c);
-	i2o_pci_free(c);
-
-	pci_disable_device(pdev);
-
-	printk(KERN_INFO "%s: Controller removed.\n", c->name);
-
-	put_device(&c->device);
-};
-
-/* PCI driver for I2O controller */
-static struct pci_driver i2o_pci_driver = {
-	.name = "PCI_I2O",
-	.id_table = i2o_pci_ids,
-	.probe = i2o_pci_probe,
-	.remove = i2o_pci_remove,
-};
-
-/**
- * i2o_pci_init - registers I2O PCI driver in PCI subsystem
- *
- * Returns 0 on success or negative error code on failure.
- */
-int __init i2o_pci_init(void)
-{
-	return pci_register_driver(&i2o_pci_driver);
-};
-
-/**
- * i2o_pci_exit - unregisters I2O PCI driver from PCI subsystem
- */
-void __exit i2o_pci_exit(void)
-{
-	pci_unregister_driver(&i2o_pci_driver);
-};
-
-MODULE_DEVICE_TABLE(pci, i2o_pci_ids);
diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig
index 071ac116818f..9e52bcd5356d 100644
--- a/drivers/staging/Kconfig
+++ b/drivers/staging/Kconfig
@@ -110,4 +110,6 @@ source "drivers/staging/clocking-wizard/Kconfig"
 
 source "drivers/staging/fbtft/Kconfig"
 
+source "drivers/staging/i2o/Kconfig"
+
 endif # STAGING
diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile
index b65ca376d957..6e0ac524c84d 100644
--- a/drivers/staging/Makefile
+++ b/drivers/staging/Makefile
@@ -47,3 +47,4 @@ obj-$(CONFIG_CRYPTO_SKEIN)	+= skein/
 obj-$(CONFIG_UNISYSSPAR)	+= unisys/
 obj-$(CONFIG_COMMON_CLK_XLNX_CLKWZRD)	+= clocking-wizard/
 obj-$(CONFIG_FB_TFT)		+= fbtft/
+obj-$(CONFIG_I2O)		+= i2o/
diff --git a/drivers/staging/i2o/Kconfig b/drivers/staging/i2o/Kconfig
new file mode 100644
index 000000000000..286c53f4b13d
--- /dev/null
+++ b/drivers/staging/i2o/Kconfig
@@ -0,0 +1,120 @@
+menuconfig I2O
+	tristate "I2O device support"
+	depends on PCI
+	---help---
+	  The Intelligent Input/Output (I2O) architecture allows hardware
+	  drivers to be split into two parts: an operating system specific
+	  module called the OSM and a hardware specific module called the
+	  HDM. The OSM can talk to a whole range of HDM's, and ideally the
+	  HDM's are not OS dependent. This allows for the same HDM driver to
+	  be used under different operating systems if the relevant OSM is in
+	  place. In order for this to work, you need to have an I2O interface
+	  adapter card in your computer. This card contains a special I/O
+	  processor (IOP), thus allowing high speeds since the CPU does not
+	  have to deal with I/O.
+
+	  If you say Y here, you will get a choice of interface adapter
+	  drivers and OSM's with the following questions.
+
+	  To compile this support as a module, choose M here: the
+	  module will be called i2o_core.
+
+	  If unsure, say N.
+
+if I2O
+
+config I2O_LCT_NOTIFY_ON_CHANGES
+	bool "Enable LCT notification"
+	default y
+	---help---
+	  Only say N here if you have an I2O controller from SUN. The SUN
+	  firmware doesn't support LCT notification on changes. If this option
+	  is enabled on such a controller the driver will hang in an endless
+	  loop. On all other controllers say Y.
+
+	  If unsure, say Y.
+
+config I2O_EXT_ADAPTEC
+	bool "Enable Adaptec extensions"
+	default y
+	---help---
+	  Say Y for support of raidutils for Adaptec I2O controllers. You also
+	  have to say Y to "I2O Configuration support", "I2O SCSI OSM" below
+	  and to "SCSI generic support" under "SCSI device configuration".
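+
+# A minimal .config sketch (illustrative, not part of the original file):
+# the symbols below are the ones defined in this file, selecting the core
+# plus the Adaptec-oriented pieces as modules:
+#
+#	CONFIG_I2O=m
+#	CONFIG_I2O_EXT_ADAPTEC=y
+#	CONFIG_I2O_CONFIG=m
+#	CONFIG_I2O_SCSI=m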
+
+config I2O_EXT_ADAPTEC_DMA64
+	bool "Enable 64-bit DMA"
+	depends on I2O_EXT_ADAPTEC && ( 64BIT || HIGHMEM64G )
+	default y
+	---help---
+	  Say Y for support of 64-bit DMA transfer mode on Adaptec I2O
+	  controllers.
+	  Note: You need at least firmware version 3709.
+
+config I2O_CONFIG
+	tristate "I2O Configuration support"
+	depends on VIRT_TO_BUS
+	---help---
+	  Say Y for support of the configuration interface for the I2O adapters.
+	  If you have a RAID controller from Adaptec and you want to use the
+	  raidutils to manage your RAID array, you have to say Y here.
+
+	  To compile this support as a module, choose M here: the
+	  module will be called i2o_config.
+
+	  Note: If you want to use the new API you have to download the
+	  i2o_config patch from http://i2o.shadowconnect.com/
+
+config I2O_CONFIG_OLD_IOCTL
+	bool "Enable ioctls (OBSOLETE)"
+	depends on I2O_CONFIG
+	default y
+	---help---
+	  Enables old ioctls.
+
+config I2O_BUS
+	tristate "I2O Bus Adapter OSM"
+	---help---
+	  Include support for the I2O Bus Adapter OSM. The Bus Adapter OSM
+	  provides access to the busses on the I2O controller. The main purpose
+	  is to rescan the bus to find new devices.
+
+	  To compile this support as a module, choose M here: the
+	  module will be called i2o_bus.
+
+config I2O_BLOCK
+	tristate "I2O Block OSM"
+	depends on BLOCK
+	---help---
+	  Include support for the I2O Block OSM. The Block OSM presents disk
+	  and other structured block devices to the operating system. If you
+	  are using a RAID controller, you can access the array only through
+	  the Block OSM driver. But it is possible to access the individual
+	  disks via the SCSI OSM driver, for example to monitor the disks.
+
+	  To compile this support as a module, choose M here: the
+	  module will be called i2o_block.
+
+config I2O_SCSI
+	tristate "I2O SCSI OSM"
+	depends on SCSI
+	---help---
+	  Allows direct SCSI access to SCSI devices on a SCSI or FibreChannel
+	  I2O controller. You can use both the SCSI and Block OSM together if
+	  you wish. To access a RAID array, you must use the Block OSM driver.
+	  But you could use the SCSI OSM driver to monitor the individual
+	  disks.
+
+	  To compile this support as a module, choose M here: the
+	  module will be called i2o_scsi.
+
+config I2O_PROC
+	tristate "I2O /proc support"
+	---help---
+	  If you say Y here and to "/proc file system support", you will be
+	  able to read I2O related information from the virtual directory
+	  /proc/i2o.
+
+	  To compile this support as a module, choose M here: the
+	  module will be called i2o_proc.
+
+endif # I2O
diff --git a/drivers/staging/i2o/Makefile b/drivers/staging/i2o/Makefile
new file mode 100644
index 000000000000..b0982dacfd0a
--- /dev/null
+++ b/drivers/staging/i2o/Makefile
@@ -0,0 +1,16 @@
+#
+# Makefile for the kernel I2O OSM.
+#
+# Note : at this point, these files are compiled on all systems.
+# In the future, some of these should be built conditionally.
+#
+
+i2o_core-y	+= iop.o driver.o device.o debug.o pci.o exec-osm.o memory.o
+i2o_bus-y	+= bus-osm.o
+i2o_config-y	+= config-osm.o
+obj-$(CONFIG_I2O)	+= i2o_core.o
+obj-$(CONFIG_I2O_CONFIG)+= i2o_config.o
+obj-$(CONFIG_I2O_BUS)	+= i2o_bus.o
+obj-$(CONFIG_I2O_BLOCK)	+= i2o_block.o
+obj-$(CONFIG_I2O_SCSI)	+= i2o_scsi.o
+obj-$(CONFIG_I2O_PROC)	+= i2o_proc.o
diff --git a/drivers/staging/i2o/README b/drivers/staging/i2o/README
new file mode 100644
index 000000000000..f072a8eb3041
--- /dev/null
+++ b/drivers/staging/i2o/README
@@ -0,0 +1,98 @@
+
+	Linux I2O Support	(c) Copyright 1999 Red Hat Software
+					and others.
+ + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version + 2 of the License, or (at your option) any later version. + +AUTHORS (so far) + +Alan Cox, Building Number Three Ltd. + Core code, SCSI and Block OSMs + +Steve Ralston, LSI Logic Corp. + Debugging SCSI and Block OSM + +Deepak Saxena, Intel Corp. + Various core/block extensions + /proc interface, bug fixes + Ioctl interfaces for control + Debugging LAN OSM + +Philip Rumpf + Fixed assorted dumb SMP locking bugs + +Juha Sievanen, University of Helsinki Finland + LAN OSM code + /proc interface to LAN class + Bug fixes + Core code extensions + +Auvo Häkkinen, University of Helsinki Finland + LAN OSM code + /Proc interface to LAN class + Bug fixes + Core code extensions + +Taneli Vähäkangas, University of Helsinki Finland + Fixes to i2o_config + +CREDITS + + This work was made possible by + +Red Hat Software + Funding for the Building #3 part of the project + +Symbios Logic (Now LSI) + Host adapters, hints, known to work platforms when I hit + compatibility problems + +BoxHill Corporation + Loan of initial FibreChannel disk array used for development work. + +European Commission + Funding the work done by the University of Helsinki + +SysKonnect + Loan of FDDI and Gigabit Ethernet cards + +ASUSTeK + Loan of I2O motherboard + +STATUS: + +o The core setup works within limits. +o The scsi layer seems to almost work. + I'm still chasing down the hang bug. +o The block OSM is mostly functional +o LAN OSM works with FDDI and Ethernet cards. + +TO DO: + +General: +o Provide hidden address space if asked +o Long term message flow control +o PCI IOP's without interrupts are not supported yet +o Push FAIL handling into the core +o DDM control interfaces for module load etc +o Add I2O 2.0 support (Deffered to 2.5 kernel) + +Block: +o Multiple major numbers +o Read ahead and cache handling stuff. Talk to Ingo and people +o Power management +o Finish Media changers + +SCSI: +o Find the right way to associate drives/luns/busses + +Lan: +o Performance tuning +o Test Fibre Channel code + +Tape: +o Anyone seen anything implementing this ? + (D.S: Will attempt to do so if spare cycles permit) diff --git a/drivers/staging/i2o/README.ioctl b/drivers/staging/i2o/README.ioctl new file mode 100644 index 000000000000..4a7d2ebdfc97 --- /dev/null +++ b/drivers/staging/i2o/README.ioctl @@ -0,0 +1,394 @@ + +Linux I2O User Space Interface +rev 0.3 - 04/20/99 + +============================================================================= +Originally written by Deepak Saxena(deepak@plexity.net) +Currently maintained by Deepak Saxena(deepak@plexity.net) +============================================================================= + +I. Introduction + +The Linux I2O subsystem provides a set of ioctl() commands that can be +utilized by user space applications to communicate with IOPs and devices +on individual IOPs. This document defines the specific ioctl() commands +that are available to the user and provides examples of their uses. + +This document assumes the reader is familiar with or has access to the +I2O specification as no I2O message parameters are outlined. For information +on the specification, see http://www.i2osig.org + +This document and the I2O user space interface are currently maintained +by Deepak Saxena. Please send all comments, errata, and bug fixes to +deepak@csociety.purdue.edu + +II. 
IOP Access + +Access to the I2O subsystem is provided through the device file named +/dev/i2o/ctl. This file is a character file with major number 10 and minor +number 166. It can be created through the following command: + + mknod /dev/i2o/ctl c 10 166 + +III. Determining the IOP Count + + SYNOPSIS + + ioctl(fd, I2OGETIOPS, int *count); + + u8 count[MAX_I2O_CONTROLLERS]; + + DESCRIPTION + + This function returns the system's active IOP table. count should + point to a buffer containing MAX_I2O_CONTROLLERS entries. Upon + returning, each entry will contain a non-zero value if the given + IOP unit is active, and NULL if it is inactive or non-existent. + + RETURN VALUE. + + Returns 0 if no errors occur, and -1 otherwise. If an error occurs, + errno is set appropriately: + + EFAULT Invalid user space pointer was passed + +IV. Getting Hardware Resource Table + + SYNOPSIS + + ioctl(fd, I2OHRTGET, struct i2o_cmd_hrt *hrt); + + struct i2o_cmd_hrtlct + { + u32 iop; /* IOP unit number */ + void *resbuf; /* Buffer for result */ + u32 *reslen; /* Buffer length in bytes */ + }; + + DESCRIPTION + + This function returns the Hardware Resource Table of the IOP specified + by hrt->iop in the buffer pointed to by hrt->resbuf. The actual size of + the data is written into *(hrt->reslen). + + RETURNS + + This function returns 0 if no errors occur. If an error occurs, -1 + is returned and errno is set appropriately: + + EFAULT Invalid user space pointer was passed + ENXIO Invalid IOP number + ENOBUFS Buffer not large enough. If this occurs, the required + buffer length is written into *(hrt->reslen) + +V. Getting Logical Configuration Table + + SYNOPSIS + + ioctl(fd, I2OLCTGET, struct i2o_cmd_lct *lct); + + struct i2o_cmd_hrtlct + { + u32 iop; /* IOP unit number */ + void *resbuf; /* Buffer for result */ + u32 *reslen; /* Buffer length in bytes */ + }; + + DESCRIPTION + + This function returns the Logical Configuration Table of the IOP specified + by lct->iop in the buffer pointed to by lct->resbuf. The actual size of + the data is written into *(lct->reslen). + + RETURNS + + This function returns 0 if no errors occur. If an error occurs, -1 + is returned and errno is set appropriately: + + EFAULT Invalid user space pointer was passed + ENXIO Invalid IOP number + ENOBUFS Buffer not large enough. If this occurs, the required + buffer length is written into *(lct->reslen) + +VI. Setting Parameters + + SYNOPSIS + + ioctl(fd, I2OPARMSET, struct i2o_parm_setget *ops); + + struct i2o_cmd_psetget + { + u32 iop; /* IOP unit number */ + u32 tid; /* Target device TID */ + void *opbuf; /* Operation List buffer */ + u32 oplen; /* Operation List buffer length in bytes */ + void *resbuf; /* Result List buffer */ + u32 *reslen; /* Result List buffer length in bytes */ + }; + + DESCRIPTION + + This function posts a UtilParamsSet message to the device identified + by ops->iop and ops->tid. The operation list for the message is + sent through the ops->opbuf buffer, and the result list is written + into the buffer pointed to by ops->resbuf. The number of bytes + written is placed into *(ops->reslen). + + RETURNS + + The return value is the size in bytes of the data written into + ops->resbuf if no errors occur. If an error occurs, -1 is returned + and errno is set appropriately: + + EFAULT Invalid user space pointer was passed + ENXIO Invalid IOP number + ENOBUFS Buffer not large enough. 
If this occurs, the required
+			buffer length is written into *(ops->reslen)
+	ETIMEDOUT	Timeout waiting for reply message
+	ENOMEM		Kernel memory allocation error
+
+	A return value of 0 does not mean that the value was actually
+	changed properly on the IOP. The user should check the result
+	list to determine the specific status of the transaction.
+
+VII. Getting Parameters
+
+	SYNOPSIS
+
+	ioctl(fd, I2OPARMGET, struct i2o_parm_setget *ops);
+
+	struct i2o_parm_setget
+	{
+		u32	iop;	/* IOP unit number */
+		u32	tid;	/* Target device TID */
+		void	*opbuf;	/* Operation List buffer */
+		u32	oplen;	/* Operation List buffer length in bytes */
+		void	*resbuf;	/* Result List buffer */
+		u32	*reslen;	/* Result List buffer length in bytes */
+	};
+
+	DESCRIPTION
+
+	This function posts a UtilParamsGet message to the device identified
+	by ops->iop and ops->tid. The operation list for the message is
+	sent through the ops->opbuf buffer, and the result list is written
+	into the buffer pointed to by ops->resbuf. The actual size of data
+	written is placed into *(ops->reslen).
+
+	RETURNS
+
+	EFAULT		Invalid user space pointer was passed
+	ENXIO		Invalid IOP number
+	ENOBUFS		Buffer not large enough. If this occurs, the required
+			buffer length is written into *(ops->reslen)
+	ETIMEDOUT	Timeout waiting for reply message
+	ENOMEM		Kernel memory allocation error
+
+	A return value of 0 does not mean that the value was actually
+	properly retrieved. The user should check the result list
+	to determine the specific status of the transaction.
+
+VIII. Downloading Software
+
+	SYNOPSIS
+
+	ioctl(fd, I2OSWDL, struct i2o_sw_xfer *sw);
+
+	struct i2o_sw_xfer
+	{
+		u32	iop;	/* IOP unit number */
+		u8	flags;	/* DownloadFlags field */
+		u8	sw_type;	/* Software type */
+		u32	sw_id;	/* Software ID */
+		void	*buf;	/* Pointer to software buffer */
+		u32	*swlen;	/* Length of software buffer */
+		u32	*maxfrag;	/* Number of fragments */
+		u32	*curfrag;	/* Current fragment number */
+	};
+
+	DESCRIPTION
+
+	This function downloads a software fragment pointed to by sw->buf
+	to the iop identified by sw->iop. The DownloadFlags, SwID, SwType
+	and SwSize fields of the ExecSwDownload message are filled in with
+	the values of sw->flags, sw->sw_id, sw->sw_type and *(sw->swlen).
+
+	The fragments _must_ be sent in order and be 8K in size. The last
+	fragment _may_ be shorter, however. The kernel will compute its
+	size based on information in the sw->swlen field.
+
+	Please note that SW transfers can take a long time.
+
+	RETURNS
+
+	This function returns 0 if no errors occur. If an error occurs, -1
+	is returned and errno is set appropriately:
+
+	EFAULT		Invalid user space pointer was passed
+	ENXIO		Invalid IOP number
+	ETIMEDOUT	Timeout waiting for reply message
+	ENOMEM		Kernel memory allocation error
+
+IX. Uploading Software
+
+	SYNOPSIS
+
+	ioctl(fd, I2OSWUL, struct i2o_sw_xfer *sw);
+
+	struct i2o_sw_xfer
+	{
+		u32	iop;	/* IOP unit number */
+		u8	flags;	/* UploadFlags */
+		u8	sw_type;	/* Software type */
+		u32	sw_id;	/* Software ID */
+		void	*buf;	/* Pointer to software buffer */
+		u32	*swlen;	/* Length of software buffer */
+		u32	*maxfrag;	/* Number of fragments */
+		u32	*curfrag;	/* Current fragment number */
+	};
+
+	DESCRIPTION
+
+	This function uploads a software fragment from the IOP identified
+	by sw->iop, sw->sw_type, sw->sw_id and optionally sw->swlen fields.
+	The UploadFlags, SwID, SwType and SwSize fields of the ExecSwUpload
+	message are filled in with the values of sw->flags, sw->sw_id,
+	sw->sw_type and *(sw->swlen).
+
+	The fragments _must_ be requested in order and be 8K in size. The
+	user is responsible for allocating memory pointed to by sw->buf. The
+	last fragment _may_ be shorter.
+
+	Please note that SW transfers can take a long time.
+
+	RETURNS
+
+	This function returns 0 if no errors occur. If an error occurs, -1
+	is returned and errno is set appropriately:
+
+	EFAULT		Invalid user space pointer was passed
+	ENXIO		Invalid IOP number
+	ETIMEDOUT	Timeout waiting for reply message
+	ENOMEM		Kernel memory allocation error
+
+X. Removing Software
+
+	SYNOPSIS
+
+	ioctl(fd, I2OSWDEL, struct i2o_sw_xfer *sw);
+
+	struct i2o_sw_xfer
+	{
+		u32	iop;	/* IOP unit number */
+		u8	flags;	/* RemoveFlags */
+		u8	sw_type;	/* Software type */
+		u32	sw_id;	/* Software ID */
+		void	*buf;	/* Unused */
+		u32	*swlen;	/* Length of the software data */
+		u32	*maxfrag;	/* Unused */
+		u32	*curfrag;	/* Unused */
+	};
+
+	DESCRIPTION
+
+	This function removes software from the IOP identified by sw->iop.
+	The RemoveFlags, SwID, SwType and SwSize fields of the ExecSwRemove message
+	are filled in with the values of sw->flags, sw->sw_id, sw->sw_type and
+	*(sw->swlen). Give zero in *(sw->swlen) if the value is unknown. IOP uses
+	*(sw->swlen) value to verify correct identification of the module to remove.
+	The actual size of the module is written into *(sw->swlen).
+
+	RETURNS
+
+	This function returns 0 if no errors occur. If an error occurs, -1
+	is returned and errno is set appropriately:
+
+	EFAULT		Invalid user space pointer was passed
+	ENXIO		Invalid IOP number
+	ETIMEDOUT	Timeout waiting for reply message
+	ENOMEM		Kernel memory allocation error
+
+XI. Validating Configuration
+
+	SYNOPSIS
+
+	ioctl(fd, I2OVALIDATE, int *iop);
+		u32 iop;
+
+	DESCRIPTION
+
+	This function posts an ExecConfigValidate message to the controller
+	identified by iop. This message indicates that the current
+	configuration is accepted. The iop changes the status of suspect drivers
+	to valid and may delete old drivers from its store.
+
+	RETURNS
+
+	This function returns 0 if no errors occur. If an error occurs, -1 is
+	returned and errno is set appropriately:
+
+	ETIMEDOUT	Timeout waiting for reply message
+	ENXIO		Invalid IOP number
+
+XII. Configuration Dialog
+
+	SYNOPSIS
+
+	ioctl(fd, I2OHTML, struct i2o_html *htquery);
+	struct i2o_html
+	{
+		u32	iop;	/* IOP unit number */
+		u32	tid;	/* Target device ID */
+		u32	page;	/* HTML page */
+		void	*resbuf;	/* Buffer for reply HTML page */
+		u32	*reslen;	/* Length in bytes of reply buffer */
+		void	*qbuf;	/* Pointer to HTTP query string */
+		u32	qlen;	/* Length in bytes of query string buffer */
+	};
+
+	DESCRIPTION
+
+	This function posts an UtilConfigDialog message to the device identified
+	by htquery->iop and htquery->tid. The requested HTML page number is
+	provided by the htquery->page field, and the resultant data is stored
+	in the buffer pointed to by htquery->resbuf. If there is an HTTP query
+	string that is to be sent to the device, it should be sent in the buffer
+	pointed to by htquery->qbuf. If there is no query string, this field
+	should be set to NULL. The actual size of the reply received is written
+	into *(htquery->reslen).
+
+	RETURNS
+
+	This function returns 0 if no errors occur.
If an error occurs, -1
+	is returned and errno is set appropriately:
+
+	EFAULT		Invalid user space pointer was passed
+	ENXIO		Invalid IOP number
+	ENOBUFS		Buffer not large enough. If this occurs, the required
+			buffer length is written into *(htquery->reslen)
+	ETIMEDOUT	Timeout waiting for reply message
+	ENOMEM		Kernel memory allocation error
+
+XIII. Events
+
+	In the process of determining this. Current idea is to use
+	the select() interface to allow user apps to periodically poll
+	the /dev/i2o/ctl device for events. When select() notifies the user
+	that an event is available, the user would call read() to retrieve
+	a list of all the events that are pending for the specific device.
+
+=============================================================================
+Revision History
+=============================================================================
+
+Rev 0.1 - 04/01/99
+- Initial revision
+
+Rev 0.2 - 04/06/99
+- Changed return values to match UNIX ioctl() standard. Only return values
+  are 0 and -1. All errors are reported through errno.
+- Added summary of proposed possible event interfaces
+
+Rev 0.3 - 04/20/99
+- Changed all ioctls() to use pointers to user data instead of actual data
+- Updated error values to match the code
diff --git a/drivers/staging/i2o/bus-osm.c b/drivers/staging/i2o/bus-osm.c
new file mode 100644
index 000000000000..7aa0339aea05
--- /dev/null
+++ b/drivers/staging/i2o/bus-osm.c
@@ -0,0 +1,176 @@
+/*
+ * Bus Adapter OSM
+ *
+ * Copyright (C) 2005	Markus Lidel <Markus.Lidel@shadowconnect.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * Fixes/additions:
+ *	Markus Lidel <Markus.Lidel@shadowconnect.com>
+ *		initial version.
+ */
+
+#include <linux/module.h>
+#include "i2o.h"
+
+#define OSM_NAME	"bus-osm"
+#define OSM_VERSION	"1.317"
+#define OSM_DESCRIPTION	"I2O Bus Adapter OSM"
+
+static struct i2o_driver i2o_bus_driver;
+
+/* Bus OSM class handling definition */
+static struct i2o_class_id i2o_bus_class_id[] = {
+	{I2O_CLASS_BUS_ADAPTER},
+	{I2O_CLASS_END}
+};
+
+/**
+ * i2o_bus_scan - Scan the bus for new devices
+ * @dev: I2O device of the bus, which should be scanned
+ *
+ * Scans the bus dev for new / removed devices. After the scan a new LCT
+ * will be fetched automatically.
+ *
+ * Returns 0 on success or negative error code on failure.
+ */
+static int i2o_bus_scan(struct i2o_device *dev)
+{
+	struct i2o_message *msg;
+
+	msg = i2o_msg_get_wait(dev->iop, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return -ETIMEDOUT;
+
+	msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_BUS_SCAN << 24 | HOST_TID << 12 |
+			dev->lct_data.tid);
+
+	return i2o_msg_post_wait(dev->iop, msg, 60);
+};
+
+/**
+ * i2o_bus_store_scan - Scan the I2O Bus Adapter
+ * @d: device which should be scanned
+ * @attr: device_attribute
+ * @buf: output buffer
+ * @count: buffer size
+ *
+ * Returns count.
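+ *
+ * A write of any value triggers a rescan from user space, e.g.
+ * (the device name follows the "unit:tid" scheme used by the core;
+ * the exact sysfs path is illustrative):
+ *
+ *	echo 1 > /sys/bus/i2o/devices/0:009/scan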
+ */ +static ssize_t i2o_bus_store_scan(struct device *d, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct i2o_device *i2o_dev = to_i2o_device(d); + int rc; + + if ((rc = i2o_bus_scan(i2o_dev))) + osm_warn("bus scan failed %d\n", rc); + + return count; +} + +/* Bus Adapter OSM device attributes */ +static DEVICE_ATTR(scan, S_IWUSR, NULL, i2o_bus_store_scan); + +/** + * i2o_bus_probe - verify if dev is a I2O Bus Adapter device and install it + * @dev: device to verify if it is a I2O Bus Adapter device + * + * Because we want all Bus Adapters always return 0. + * Except when we fail. Then we are sad. + * + * Returns 0, except when we fail to excel. + */ +static int i2o_bus_probe(struct device *dev) +{ + struct i2o_device *i2o_dev = to_i2o_device(get_device(dev)); + int rc; + + rc = device_create_file(dev, &dev_attr_scan); + if (rc) + goto err_out; + + osm_info("device added (TID: %03x)\n", i2o_dev->lct_data.tid); + + return 0; + +err_out: + put_device(dev); + return rc; +}; + +/** + * i2o_bus_remove - remove the I2O Bus Adapter device from the system again + * @dev: I2O Bus Adapter device which should be removed + * + * Always returns 0. + */ +static int i2o_bus_remove(struct device *dev) +{ + struct i2o_device *i2o_dev = to_i2o_device(dev); + + device_remove_file(dev, &dev_attr_scan); + + put_device(dev); + + osm_info("device removed (TID: %03x)\n", i2o_dev->lct_data.tid); + + return 0; +}; + +/* Bus Adapter OSM driver struct */ +static struct i2o_driver i2o_bus_driver = { + .name = OSM_NAME, + .classes = i2o_bus_class_id, + .driver = { + .probe = i2o_bus_probe, + .remove = i2o_bus_remove, + }, +}; + +/** + * i2o_bus_init - Bus Adapter OSM initialization function + * + * Only register the Bus Adapter OSM in the I2O core. + * + * Returns 0 on success or negative error code on failure. + */ +static int __init i2o_bus_init(void) +{ + int rc; + + printk(KERN_INFO OSM_DESCRIPTION " v" OSM_VERSION "\n"); + + /* Register Bus Adapter OSM into I2O core */ + rc = i2o_driver_register(&i2o_bus_driver); + if (rc) { + osm_err("Could not register Bus Adapter OSM\n"); + return rc; + } + + return 0; +}; + +/** + * i2o_bus_exit - Bus Adapter OSM exit function + * + * Unregisters Bus Adapter OSM from I2O core. + */ +static void __exit i2o_bus_exit(void) +{ + i2o_driver_unregister(&i2o_bus_driver); +}; + +MODULE_AUTHOR("Markus Lidel "); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION(OSM_DESCRIPTION); +MODULE_VERSION(OSM_VERSION); + +module_init(i2o_bus_init); +module_exit(i2o_bus_exit); diff --git a/drivers/staging/i2o/config-osm.c b/drivers/staging/i2o/config-osm.c new file mode 100644 index 000000000000..519f52f9f688 --- /dev/null +++ b/drivers/staging/i2o/config-osm.c @@ -0,0 +1,90 @@ +/* + * Configuration OSM + * + * Copyright (C) 2005 Markus Lidel + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * Fixes/additions: + * Markus Lidel + * initial version. 
+ */ + +#include +#include "i2o.h" +#include +#include +#include + +#include + +#define OSM_NAME "config-osm" +#define OSM_VERSION "1.323" +#define OSM_DESCRIPTION "I2O Configuration OSM" + +/* access mode user rw */ +#define S_IWRSR (S_IRUSR | S_IWUSR) + +static struct i2o_driver i2o_config_driver; + +/* Config OSM driver struct */ +static struct i2o_driver i2o_config_driver = { + .name = OSM_NAME, +}; + +#ifdef CONFIG_I2O_CONFIG_OLD_IOCTL +#include "i2o_config.c" +#endif + +/** + * i2o_config_init - Configuration OSM initialization function + * + * Registers Configuration OSM in the I2O core and if old ioctl's are + * compiled in initialize them. + * + * Returns 0 on success or negative error code on failure. + */ +static int __init i2o_config_init(void) +{ + printk(KERN_INFO OSM_DESCRIPTION " v" OSM_VERSION "\n"); + + if (i2o_driver_register(&i2o_config_driver)) { + osm_err("handler register failed.\n"); + return -EBUSY; + } +#ifdef CONFIG_I2O_CONFIG_OLD_IOCTL + if (i2o_config_old_init()) { + osm_err("old config handler initialization failed\n"); + i2o_driver_unregister(&i2o_config_driver); + return -EBUSY; + } +#endif + + return 0; +} + +/** + * i2o_config_exit - Configuration OSM exit function + * + * If old ioctl's are compiled in exit remove them and unregisters + * Configuration OSM from I2O core. + */ +static void i2o_config_exit(void) +{ +#ifdef CONFIG_I2O_CONFIG_OLD_IOCTL + i2o_config_old_exit(); +#endif + + i2o_driver_unregister(&i2o_config_driver); +} + +MODULE_AUTHOR("Markus Lidel "); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION(OSM_DESCRIPTION); +MODULE_VERSION(OSM_VERSION); + +module_init(i2o_config_init); +module_exit(i2o_config_exit); diff --git a/drivers/staging/i2o/core.h b/drivers/staging/i2o/core.h new file mode 100644 index 000000000000..91614f11f89a --- /dev/null +++ b/drivers/staging/i2o/core.h @@ -0,0 +1,69 @@ +/* + * I2O core internal declarations + * + * Copyright (C) 2005 Markus Lidel + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * Fixes/additions: + * Markus Lidel + * initial version. 
+ */ + +/* Exec-OSM */ +extern struct i2o_driver i2o_exec_driver; +extern int i2o_exec_lct_get(struct i2o_controller *); + +extern int __init i2o_exec_init(void); +extern void i2o_exec_exit(void); + +/* driver */ +extern struct bus_type i2o_bus_type; + +extern int i2o_driver_dispatch(struct i2o_controller *, u32); + +extern int __init i2o_driver_init(void); +extern void i2o_driver_exit(void); + +/* PCI */ +extern int __init i2o_pci_init(void); +extern void __exit i2o_pci_exit(void); + +/* device */ +extern const struct attribute_group *i2o_device_groups[]; + +extern void i2o_device_remove(struct i2o_device *); +extern int i2o_device_parse_lct(struct i2o_controller *); + +int i2o_parm_issue(struct i2o_device *i2o_dev, int cmd, void *oplist, + int oplen, void *reslist, int reslen); + +/* IOP */ +extern struct i2o_controller *i2o_iop_alloc(void); + +/** + * i2o_iop_free - Free the i2o_controller struct + * @c: I2O controller to free + */ +static inline void i2o_iop_free(struct i2o_controller *c) +{ + i2o_pool_free(&c->in_msg); + kfree(c); +} + +extern int i2o_iop_add(struct i2o_controller *); +extern void i2o_iop_remove(struct i2o_controller *); + +/* control registers relative to c->base */ +#define I2O_IRQ_STATUS 0x30 +#define I2O_IRQ_MASK 0x34 +#define I2O_IN_PORT 0x40 +#define I2O_OUT_PORT 0x44 + +/* Motorola/Freescale specific register offset */ +#define I2O_MOTOROLA_PORT_OFFSET 0x10400 + +#define I2O_IRQ_OUTBOUND_POST 0x00000008 diff --git a/drivers/staging/i2o/debug.c b/drivers/staging/i2o/debug.c new file mode 100644 index 000000000000..7a16114ed8ea --- /dev/null +++ b/drivers/staging/i2o/debug.c @@ -0,0 +1,472 @@ +#include +#include +#include +#include "i2o.h" + +static void i2o_report_util_cmd(u8 cmd); +static void i2o_report_exec_cmd(u8 cmd); +static void i2o_report_fail_status(u8 req_status, u32 * msg); +static void i2o_report_common_status(u8 req_status); +static void i2o_report_common_dsc(u16 detailed_status); + +/* + * Used for error reporting/debugging purposes. + * Report Cmd name, Request status, Detailed Status. + */ +void i2o_report_status(const char *severity, const char *str, + struct i2o_message *m) +{ + u32 *msg = (u32 *) m; + u8 cmd = (msg[1] >> 24) & 0xFF; + u8 req_status = (msg[4] >> 24) & 0xFF; + u16 detailed_status = msg[4] & 0xFFFF; + + if (cmd == I2O_CMD_UTIL_EVT_REGISTER) + return; // No status in this reply + + printk("%s%s: ", severity, str); + + if (cmd < 0x1F) // Utility cmd + i2o_report_util_cmd(cmd); + + else if (cmd >= 0xA0 && cmd <= 0xEF) // Executive cmd + i2o_report_exec_cmd(cmd); + else + printk("Cmd = %0#2x, ", cmd); // Other cmds + + if (msg[0] & MSG_FAIL) { + i2o_report_fail_status(req_status, msg); + return; + } + + i2o_report_common_status(req_status); + + if (cmd < 0x1F || (cmd >= 0xA0 && cmd <= 0xEF)) + i2o_report_common_dsc(detailed_status); + else + printk(" / DetailedStatus = %0#4x.\n", + detailed_status); +} + +/* Used to dump a message to syslog during debugging */ +void i2o_dump_message(struct i2o_message *m) +{ +#ifdef DEBUG + u32 *msg = (u32 *) m; + int i; + printk(KERN_INFO "Dumping I2O message size %d @ %p\n", + msg[0] >> 16 & 0xffff, msg); + for (i = 0; i < ((msg[0] >> 16) & 0xffff); i++) + printk(KERN_INFO " msg[%d] = %0#10x\n", i, msg[i]); +#endif +} + +/* + * Used for error reporting/debugging purposes. + * Following fail status are common to all classes. + * The preserved message must be handled in the reply handler. 
+ */ +static void i2o_report_fail_status(u8 req_status, u32 * msg) +{ + static char *FAIL_STATUS[] = { + "0x80", /* not used */ + "SERVICE_SUSPENDED", /* 0x81 */ + "SERVICE_TERMINATED", /* 0x82 */ + "CONGESTION", + "FAILURE", + "STATE_ERROR", + "TIME_OUT", + "ROUTING_FAILURE", + "INVALID_VERSION", + "INVALID_OFFSET", + "INVALID_MSG_FLAGS", + "FRAME_TOO_SMALL", + "FRAME_TOO_LARGE", + "INVALID_TARGET_ID", + "INVALID_INITIATOR_ID", + "INVALID_INITIATOR_CONTEX", /* 0x8F */ + "UNKNOWN_FAILURE" /* 0xFF */ + }; + + if (req_status == I2O_FSC_TRANSPORT_UNKNOWN_FAILURE) + printk("TRANSPORT_UNKNOWN_FAILURE (%0#2x).\n", + req_status); + else + printk("TRANSPORT_%s.\n", + FAIL_STATUS[req_status & 0x0F]); + + /* Dump some details */ + + printk(KERN_ERR " InitiatorId = %d, TargetId = %d\n", + (msg[1] >> 12) & 0xFFF, msg[1] & 0xFFF); + printk(KERN_ERR " LowestVersion = 0x%02X, HighestVersion = 0x%02X\n", + (msg[4] >> 8) & 0xFF, msg[4] & 0xFF); + printk(KERN_ERR " FailingHostUnit = 0x%04X, FailingIOP = 0x%03X\n", + msg[5] >> 16, msg[5] & 0xFFF); + + printk(KERN_ERR " Severity: 0x%02X\n", (msg[4] >> 16) & 0xFF); + if (msg[4] & (1 << 16)) + printk(KERN_DEBUG "(FormatError), " + "this msg can never be delivered/processed.\n"); + if (msg[4] & (1 << 17)) + printk(KERN_DEBUG "(PathError), " + "this msg can no longer be delivered/processed.\n"); + if (msg[4] & (1 << 18)) + printk(KERN_DEBUG "(PathState), " + "the system state does not allow delivery.\n"); + if (msg[4] & (1 << 19)) + printk(KERN_DEBUG + "(Congestion), resources temporarily not available;" + "do not retry immediately.\n"); +} + +/* + * Used for error reporting/debugging purposes. + * Following reply status are common to all classes. + */ +static void i2o_report_common_status(u8 req_status) +{ + static char *REPLY_STATUS[] = { + "SUCCESS", + "ABORT_DIRTY", + "ABORT_NO_DATA_TRANSFER", + "ABORT_PARTIAL_TRANSFER", + "ERROR_DIRTY", + "ERROR_NO_DATA_TRANSFER", + "ERROR_PARTIAL_TRANSFER", + "PROCESS_ABORT_DIRTY", + "PROCESS_ABORT_NO_DATA_TRANSFER", + "PROCESS_ABORT_PARTIAL_TRANSFER", + "TRANSACTION_ERROR", + "PROGRESS_REPORT" + }; + + if (req_status >= ARRAY_SIZE(REPLY_STATUS)) + printk("RequestStatus = %0#2x", req_status); + else + printk("%s", REPLY_STATUS[req_status]); +} + +/* + * Used for error reporting/debugging purposes. + * Following detailed status are valid for executive class, + * utility class, DDM class and for transaction error replies. 
+ */ +static void i2o_report_common_dsc(u16 detailed_status) +{ + static char *COMMON_DSC[] = { + "SUCCESS", + "0x01", // not used + "BAD_KEY", + "TCL_ERROR", + "REPLY_BUFFER_FULL", + "NO_SUCH_PAGE", + "INSUFFICIENT_RESOURCE_SOFT", + "INSUFFICIENT_RESOURCE_HARD", + "0x08", // not used + "CHAIN_BUFFER_TOO_LARGE", + "UNSUPPORTED_FUNCTION", + "DEVICE_LOCKED", + "DEVICE_RESET", + "INAPPROPRIATE_FUNCTION", + "INVALID_INITIATOR_ADDRESS", + "INVALID_MESSAGE_FLAGS", + "INVALID_OFFSET", + "INVALID_PARAMETER", + "INVALID_REQUEST", + "INVALID_TARGET_ADDRESS", + "MESSAGE_TOO_LARGE", + "MESSAGE_TOO_SMALL", + "MISSING_PARAMETER", + "TIMEOUT", + "UNKNOWN_ERROR", + "UNKNOWN_FUNCTION", + "UNSUPPORTED_VERSION", + "DEVICE_BUSY", + "DEVICE_NOT_AVAILABLE" + }; + + if (detailed_status > I2O_DSC_DEVICE_NOT_AVAILABLE) + printk(" / DetailedStatus = %0#4x.\n", + detailed_status); + else + printk(" / %s.\n", COMMON_DSC[detailed_status]); +} + +/* + * Used for error reporting/debugging purposes + */ +static void i2o_report_util_cmd(u8 cmd) +{ + switch (cmd) { + case I2O_CMD_UTIL_NOP: + printk("UTIL_NOP, "); + break; + case I2O_CMD_UTIL_ABORT: + printk("UTIL_ABORT, "); + break; + case I2O_CMD_UTIL_CLAIM: + printk("UTIL_CLAIM, "); + break; + case I2O_CMD_UTIL_RELEASE: + printk("UTIL_CLAIM_RELEASE, "); + break; + case I2O_CMD_UTIL_CONFIG_DIALOG: + printk("UTIL_CONFIG_DIALOG, "); + break; + case I2O_CMD_UTIL_DEVICE_RESERVE: + printk("UTIL_DEVICE_RESERVE, "); + break; + case I2O_CMD_UTIL_DEVICE_RELEASE: + printk("UTIL_DEVICE_RELEASE, "); + break; + case I2O_CMD_UTIL_EVT_ACK: + printk("UTIL_EVENT_ACKNOWLEDGE, "); + break; + case I2O_CMD_UTIL_EVT_REGISTER: + printk("UTIL_EVENT_REGISTER, "); + break; + case I2O_CMD_UTIL_LOCK: + printk("UTIL_LOCK, "); + break; + case I2O_CMD_UTIL_LOCK_RELEASE: + printk("UTIL_LOCK_RELEASE, "); + break; + case I2O_CMD_UTIL_PARAMS_GET: + printk("UTIL_PARAMS_GET, "); + break; + case I2O_CMD_UTIL_PARAMS_SET: + printk("UTIL_PARAMS_SET, "); + break; + case I2O_CMD_UTIL_REPLY_FAULT_NOTIFY: + printk("UTIL_REPLY_FAULT_NOTIFY, "); + break; + default: + printk("Cmd = %0#2x, ", cmd); + } +} + +/* + * Used for error reporting/debugging purposes + */ +static void i2o_report_exec_cmd(u8 cmd) +{ + switch (cmd) { + case I2O_CMD_ADAPTER_ASSIGN: + printk("EXEC_ADAPTER_ASSIGN, "); + break; + case I2O_CMD_ADAPTER_READ: + printk("EXEC_ADAPTER_READ, "); + break; + case I2O_CMD_ADAPTER_RELEASE: + printk("EXEC_ADAPTER_RELEASE, "); + break; + case I2O_CMD_BIOS_INFO_SET: + printk("EXEC_BIOS_INFO_SET, "); + break; + case I2O_CMD_BOOT_DEVICE_SET: + printk("EXEC_BOOT_DEVICE_SET, "); + break; + case I2O_CMD_CONFIG_VALIDATE: + printk("EXEC_CONFIG_VALIDATE, "); + break; + case I2O_CMD_CONN_SETUP: + printk("EXEC_CONN_SETUP, "); + break; + case I2O_CMD_DDM_DESTROY: + printk("EXEC_DDM_DESTROY, "); + break; + case I2O_CMD_DDM_ENABLE: + printk("EXEC_DDM_ENABLE, "); + break; + case I2O_CMD_DDM_QUIESCE: + printk("EXEC_DDM_QUIESCE, "); + break; + case I2O_CMD_DDM_RESET: + printk("EXEC_DDM_RESET, "); + break; + case I2O_CMD_DDM_SUSPEND: + printk("EXEC_DDM_SUSPEND, "); + break; + case I2O_CMD_DEVICE_ASSIGN: + printk("EXEC_DEVICE_ASSIGN, "); + break; + case I2O_CMD_DEVICE_RELEASE: + printk("EXEC_DEVICE_RELEASE, "); + break; + case I2O_CMD_HRT_GET: + printk("EXEC_HRT_GET, "); + break; + case I2O_CMD_ADAPTER_CLEAR: + printk("EXEC_IOP_CLEAR, "); + break; + case I2O_CMD_ADAPTER_CONNECT: + printk("EXEC_IOP_CONNECT, "); + break; + case I2O_CMD_ADAPTER_RESET: + printk("EXEC_IOP_RESET, "); + break; + case I2O_CMD_LCT_NOTIFY: + 
printk("EXEC_LCT_NOTIFY, "); + break; + case I2O_CMD_OUTBOUND_INIT: + printk("EXEC_OUTBOUND_INIT, "); + break; + case I2O_CMD_PATH_ENABLE: + printk("EXEC_PATH_ENABLE, "); + break; + case I2O_CMD_PATH_QUIESCE: + printk("EXEC_PATH_QUIESCE, "); + break; + case I2O_CMD_PATH_RESET: + printk("EXEC_PATH_RESET, "); + break; + case I2O_CMD_STATIC_MF_CREATE: + printk("EXEC_STATIC_MF_CREATE, "); + break; + case I2O_CMD_STATIC_MF_RELEASE: + printk("EXEC_STATIC_MF_RELEASE, "); + break; + case I2O_CMD_STATUS_GET: + printk("EXEC_STATUS_GET, "); + break; + case I2O_CMD_SW_DOWNLOAD: + printk("EXEC_SW_DOWNLOAD, "); + break; + case I2O_CMD_SW_UPLOAD: + printk("EXEC_SW_UPLOAD, "); + break; + case I2O_CMD_SW_REMOVE: + printk("EXEC_SW_REMOVE, "); + break; + case I2O_CMD_SYS_ENABLE: + printk("EXEC_SYS_ENABLE, "); + break; + case I2O_CMD_SYS_MODIFY: + printk("EXEC_SYS_MODIFY, "); + break; + case I2O_CMD_SYS_QUIESCE: + printk("EXEC_SYS_QUIESCE, "); + break; + case I2O_CMD_SYS_TAB_SET: + printk("EXEC_SYS_TAB_SET, "); + break; + default: + printk("Cmd = %#02x, ", cmd); + } +} + +void i2o_debug_state(struct i2o_controller *c) +{ + printk(KERN_INFO "%s: State = ", c->name); + switch (((i2o_status_block *) c->status_block.virt)->iop_state) { + case 0x01: + printk("INIT\n"); + break; + case 0x02: + printk("RESET\n"); + break; + case 0x04: + printk("HOLD\n"); + break; + case 0x05: + printk("READY\n"); + break; + case 0x08: + printk("OPERATIONAL\n"); + break; + case 0x10: + printk("FAILED\n"); + break; + case 0x11: + printk("FAULTED\n"); + break; + default: + printk("%x (unknown !!)\n", + ((i2o_status_block *) c->status_block.virt)->iop_state); + } +}; + +void i2o_dump_hrt(struct i2o_controller *c) +{ + u32 *rows = (u32 *) c->hrt.virt; + u8 *p = (u8 *) c->hrt.virt; + u8 *d; + int count; + int length; + int i; + int state; + + if (p[3] != 0) { + printk(KERN_ERR + "%s: HRT table for controller is too new a version.\n", + c->name); + return; + } + + count = p[0] | (p[1] << 8); + length = p[2]; + + printk(KERN_INFO "%s: HRT has %d entries of %d bytes each.\n", + c->name, count, length << 2); + + rows += 2; + + for (i = 0; i < count; i++) { + printk(KERN_INFO "Adapter %08X: ", rows[0]); + p = (u8 *) (rows + 1); + d = (u8 *) (rows + 2); + state = p[1] << 8 | p[0]; + + printk("TID %04X:[", state & 0xFFF); + state >>= 12; + if (state & (1 << 0)) + printk("H"); /* Hidden */ + if (state & (1 << 2)) { + printk("P"); /* Present */ + if (state & (1 << 1)) + printk("C"); /* Controlled */ + } + if (state > 9) + printk("*"); /* Hard */ + + printk("]:"); + + switch (p[3] & 0xFFFF) { + case 0: + /* Adapter private bus - easy */ + printk("Local bus %d: I/O at 0x%04X Mem 0x%08X", p[2], + d[1] << 8 | d[0], *(u32 *) (d + 4)); + break; + case 1: + /* ISA bus */ + printk("ISA %d: CSN %d I/O at 0x%04X Mem 0x%08X", p[2], + d[2], d[1] << 8 | d[0], *(u32 *) (d + 4)); + break; + + case 2: /* EISA bus */ + printk("EISA %d: Slot %d I/O at 0x%04X Mem 0x%08X", + p[2], d[3], d[1] << 8 | d[0], *(u32 *) (d + 4)); + break; + + case 3: /* MCA bus */ + printk("MCA %d: Slot %d I/O at 0x%04X Mem 0x%08X", p[2], + d[3], d[1] << 8 | d[0], *(u32 *) (d + 4)); + break; + + case 4: /* PCI bus */ + printk("PCI %d: Bus %d Device %d Function %d", p[2], + d[2], d[1], d[0]); + break; + + case 0x80: /* Other */ + default: + printk("Unsupported bus type."); + break; + } + printk("\n"); + rows += length; + } +} + +EXPORT_SYMBOL(i2o_dump_message); diff --git a/drivers/staging/i2o/device.c b/drivers/staging/i2o/device.c new file mode 100644 index 000000000000..2af22553dd4e --- 
/dev/null
+++ b/drivers/staging/i2o/device.c
@@ -0,0 +1,594 @@
+/*
+ * Functions to handle I2O devices
+ *
+ * Copyright (C) 2004	Markus Lidel <Markus.Lidel@shadowconnect.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * Fixes/additions:
+ *	Markus Lidel <Markus.Lidel@shadowconnect.com>
+ *		initial version.
+ */
+
+#include <linux/module.h>
+#include "i2o.h"
+#include <linux/delay.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include "core.h"
+
+/**
+ * i2o_device_issue_claim - claim or release a device
+ * @dev: I2O device to claim or release
+ * @cmd: claim or release command
+ * @type: type of claim
+ *
+ * Issue I2O UTIL_CLAIM or UTIL_RELEASE messages. The message to be sent
+ * is set by cmd. dev is the I2O device which should be claimed or
+ * released and the type is the claim type (see the I2O spec).
+ *
+ * Returns 0 on success or negative error code on failure.
+ */
+static inline int i2o_device_issue_claim(struct i2o_device *dev, u32 cmd,
+					 u32 type)
+{
+	struct i2o_message *msg;
+
+	msg = i2o_msg_get_wait(dev->iop, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+
+	msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(cmd << 24 | HOST_TID << 12 | dev->lct_data.tid);
+	msg->body[0] = cpu_to_le32(type);
+
+	return i2o_msg_post_wait(dev->iop, msg, 60);
+}
+
+/**
+ * i2o_device_claim - claim a device for use by an OSM
+ * @dev: I2O device to claim
+ *
+ * Do the leg work to assign a device to a given OSM. If the claim succeeds,
+ * the owner is the primary. If the attempt fails a negative errno code
+ * is returned. On success zero is returned.
+ */
+int i2o_device_claim(struct i2o_device *dev)
+{
+	int rc = 0;
+
+	mutex_lock(&dev->lock);
+
+	rc = i2o_device_issue_claim(dev, I2O_CMD_UTIL_CLAIM, I2O_CLAIM_PRIMARY);
+	if (!rc)
+		pr_debug("i2o: claim of device %d succeeded\n",
+			 dev->lct_data.tid);
+	else
+		pr_debug("i2o: claim of device %d failed %d\n",
+			 dev->lct_data.tid, rc);
+
+	mutex_unlock(&dev->lock);
+
+	return rc;
+}
+
+/**
+ * i2o_device_claim_release - release a device that the OSM is using
+ * @dev: device to release
+ *
+ * Drop a claim by an OSM on a given I2O device.
+ *
+ * AC - some devices seem to want to refuse an unclaim until they have
+ * finished internal processing. It makes sense since you don't want a
+ * new device to go reconfiguring the entire system until you are done.
+ * Thus we are prepared to wait briefly.
+ *
+ * Returns 0 on success or negative error code on failure.
+ */
+int i2o_device_claim_release(struct i2o_device *dev)
+{
+	int tries;
+	int rc = 0;
+
+	mutex_lock(&dev->lock);
+
+	/*
+	 * If the controller takes a nonblocking approach to
+	 * releases we have to sleep/poll for a few times.
+	 */
+	for (tries = 0; tries < 10; tries++) {
+		rc = i2o_device_issue_claim(dev, I2O_CMD_UTIL_RELEASE,
+					    I2O_CLAIM_PRIMARY);
+		if (!rc)
+			break;
+
+		ssleep(1);
+	}
+
+	if (!rc)
+		pr_debug("i2o: claim release of device %d succeeded\n",
+			 dev->lct_data.tid);
+	else
+		pr_debug("i2o: claim release of device %d failed %d\n",
+			 dev->lct_data.tid, rc);
+
+	mutex_unlock(&dev->lock);
+
+	return rc;
+}
+
+/**
+ * i2o_device_release - release the memory for an I2O device
+ * @dev: I2O device which should be released
+ *
+ * Release the allocated memory. This function is called if refcount of
+ * device reaches 0 automatically.
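+ * (It is wired up as the struct device ->release callback in
+ * i2o_device_alloc() below, so the driver core invokes it once the last
+ * put_device() drops the reference.)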
+ */ +static void i2o_device_release(struct device *dev) +{ + struct i2o_device *i2o_dev = to_i2o_device(dev); + + pr_debug("i2o: device %s released\n", dev_name(dev)); + + kfree(i2o_dev); +} + +/** + * class_id_show - Displays class id of I2O device + * @dev: device of which the class id should be displayed + * @attr: pointer to device attribute + * @buf: buffer into which the class id should be printed + * + * Returns the number of bytes which are printed into the buffer. + */ +static ssize_t class_id_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct i2o_device *i2o_dev = to_i2o_device(dev); + + sprintf(buf, "0x%03x\n", i2o_dev->lct_data.class_id); + return strlen(buf) + 1; +} +static DEVICE_ATTR_RO(class_id); + +/** + * tid_show - Displays TID of I2O device + * @dev: device of which the TID should be displayed + * @attr: pointer to device attribute + * @buf: buffer into which the TID should be printed + * + * Returns the number of bytes which are printed into the buffer. + */ +static ssize_t tid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct i2o_device *i2o_dev = to_i2o_device(dev); + + sprintf(buf, "0x%03x\n", i2o_dev->lct_data.tid); + return strlen(buf) + 1; +} +static DEVICE_ATTR_RO(tid); + +/* I2O device attributes */ +static struct attribute *i2o_device_attrs[] = { + &dev_attr_class_id.attr, + &dev_attr_tid.attr, + NULL, +}; + +static const struct attribute_group i2o_device_group = { + .attrs = i2o_device_attrs, +}; + +const struct attribute_group *i2o_device_groups[] = { + &i2o_device_group, + NULL, +}; + +/** + * i2o_device_alloc - Allocate an I2O device and initialize it + * + * Allocate the memory for an I2O device and initialize locks and lists + * + * Returns the allocated I2O device or an ERR_PTR-encoded error code if + * the device could not be allocated. + */ +static struct i2o_device *i2o_device_alloc(void) +{ + struct i2o_device *dev; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&dev->list); + mutex_init(&dev->lock); + + dev->device.bus = &i2o_bus_type; + dev->device.release = &i2o_device_release; + + return dev; +} + +/** + * i2o_device_add - allocate a new I2O device and add it to the IOP + * @c: I2O controller that the device is on + * @entry: LCT entry of the I2O device + * + * Allocate a new I2O device and initialize it with the LCT entry. The + * device is appended to the device list of the controller. + * + * Returns zero on success, or a -ve errno.
+ */ +static int i2o_device_add(struct i2o_controller *c, i2o_lct_entry *entry) +{ + struct i2o_device *i2o_dev, *tmp; + int rc; + + i2o_dev = i2o_device_alloc(); + if (IS_ERR(i2o_dev)) { + printk(KERN_ERR "i2o: unable to allocate i2o device\n"); + return PTR_ERR(i2o_dev); + } + + i2o_dev->lct_data = *entry; + + dev_set_name(&i2o_dev->device, "%d:%03x", c->unit, + i2o_dev->lct_data.tid); + + i2o_dev->iop = c; + i2o_dev->device.parent = &c->device; + + rc = device_register(&i2o_dev->device); + if (rc) + goto err; + + list_add_tail(&i2o_dev->list, &c->devices); + + /* create user entries for this device */ + tmp = i2o_iop_find_device(i2o_dev->iop, i2o_dev->lct_data.user_tid); + if (tmp && (tmp != i2o_dev)) { + rc = sysfs_create_link(&i2o_dev->device.kobj, + &tmp->device.kobj, "user"); + if (rc) + goto unreg_dev; + } + + /* create user entries referring to this device */ + list_for_each_entry(tmp, &c->devices, list) + if ((tmp->lct_data.user_tid == i2o_dev->lct_data.tid) + && (tmp != i2o_dev)) { + rc = sysfs_create_link(&tmp->device.kobj, + &i2o_dev->device.kobj, "user"); + if (rc) + goto rmlink1; + } + + /* create parent entries for this device */ + tmp = i2o_iop_find_device(i2o_dev->iop, i2o_dev->lct_data.parent_tid); + if (tmp && (tmp != i2o_dev)) { + rc = sysfs_create_link(&i2o_dev->device.kobj, + &tmp->device.kobj, "parent"); + if (rc) + goto rmlink1; + } + + /* create parent entries referring to this device */ + list_for_each_entry(tmp, &c->devices, list) + if ((tmp->lct_data.parent_tid == i2o_dev->lct_data.tid) + && (tmp != i2o_dev)) { + rc = sysfs_create_link(&tmp->device.kobj, + &i2o_dev->device.kobj, "parent"); + if (rc) + goto rmlink2; + } + + i2o_driver_notify_device_add_all(i2o_dev); + + pr_debug("i2o: device %s added\n", dev_name(&i2o_dev->device)); + + return 0; + +rmlink2: + /* If link creation failed halfway, we walk the whole list to clean + * up. Removing a link that was never created is harmless, because + * sysfs_remove_link() handles that case. + */ + list_for_each_entry(tmp, &c->devices, list) { + if (tmp->lct_data.parent_tid == i2o_dev->lct_data.tid) + sysfs_remove_link(&tmp->device.kobj, "parent"); + } + sysfs_remove_link(&i2o_dev->device.kobj, "parent"); +rmlink1: + list_for_each_entry(tmp, &c->devices, list) + if (tmp->lct_data.user_tid == i2o_dev->lct_data.tid) + sysfs_remove_link(&tmp->device.kobj, "user"); + sysfs_remove_link(&i2o_dev->device.kobj, "user"); +unreg_dev: + list_del(&i2o_dev->list); + /* device_unregister() drops the final reference, which frees i2o_dev + * through i2o_device_release(); freeing it again below would be a + * double free. + */ + device_unregister(&i2o_dev->device); + return rc; +err: + /* device_register() failed: drop the reference it took, which frees + * i2o_dev through the release callback. + */ + put_device(&i2o_dev->device); + return rc; +} + +/** + * i2o_device_remove - remove an I2O device from the I2O core + * @i2o_dev: I2O device which should be released + * + * Used on I2O controller removal or LCT modification, when the device + * is removed from the system. Note that the device could still hang + * around until its refcount reaches 0.
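+ * + * The last reference is dropped by device_unregister(); the memory + * itself is then freed by the i2o_device_release() callback once the + * refcount hits 0.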
+ */ +void i2o_device_remove(struct i2o_device *i2o_dev) +{ + struct i2o_device *tmp; + struct i2o_controller *c = i2o_dev->iop; + + i2o_driver_notify_device_remove_all(i2o_dev); + + sysfs_remove_link(&i2o_dev->device.kobj, "parent"); + sysfs_remove_link(&i2o_dev->device.kobj, "user"); + + list_for_each_entry(tmp, &c->devices, list) { + if (tmp->lct_data.parent_tid == i2o_dev->lct_data.tid) + sysfs_remove_link(&tmp->device.kobj, "parent"); + if (tmp->lct_data.user_tid == i2o_dev->lct_data.tid) + sysfs_remove_link(&tmp->device.kobj, "user"); + } + list_del(&i2o_dev->list); + + device_unregister(&i2o_dev->device); +} + +/** + * i2o_device_parse_lct - Parse a previously fetched LCT and create devices + * @c: I2O controller from which the LCT should be parsed. + * + * The Logical Configuration Table tells us what we can talk to on the + * board. For every entry we create an I2O device, which is registered in + * the I2O core. + * + * Returns 0 on success or negative error code on failure. + */ +int i2o_device_parse_lct(struct i2o_controller *c) +{ + struct i2o_device *dev, *tmp; + i2o_lct *lct; + u32 *dlct = c->dlct.virt; + int max = 0, i = 0; + u16 table_size; + u32 buf; + + mutex_lock(&c->lct_lock); + + kfree(c->lct); + + buf = le32_to_cpu(*dlct++); + table_size = buf & 0xffff; + + lct = c->lct = kmalloc(table_size * 4, GFP_KERNEL); + if (!lct) { + mutex_unlock(&c->lct_lock); + return -ENOMEM; + } + + lct->lct_ver = buf >> 28; + lct->boot_tid = buf >> 16 & 0xfff; + lct->table_size = table_size; + lct->change_ind = le32_to_cpu(*dlct++); + lct->iop_flags = le32_to_cpu(*dlct++); + + table_size -= 3; + + /* each LCT entry is 9 32-bit words long */ + pr_debug("%s: LCT has %d entries (LCT size: %d)\n", c->name, + table_size / 9, lct->table_size); + + while (table_size > 0) { + i2o_lct_entry *entry = &lct->lct_entry[max]; + int found = 0; + + buf = le32_to_cpu(*dlct++); + entry->entry_size = buf & 0xffff; + entry->tid = buf >> 16 & 0xfff; + + entry->change_ind = le32_to_cpu(*dlct++); + entry->device_flags = le32_to_cpu(*dlct++); + + buf = le32_to_cpu(*dlct++); + entry->class_id = buf & 0xfff; + entry->version = buf >> 12 & 0xf; + entry->vendor_id = buf >> 16; + + entry->sub_class = le32_to_cpu(*dlct++); + + buf = le32_to_cpu(*dlct++); + entry->user_tid = buf & 0xfff; + entry->parent_tid = buf >> 12 & 0xfff; + entry->bios_info = buf >> 24; + + memcpy(&entry->identity_tag, dlct, 8); + dlct += 2; + + entry->event_capabilities = le32_to_cpu(*dlct++); + + /* add devices which are new in the LCT */ + list_for_each_entry_safe(dev, tmp, &c->devices, list) { + if (entry->tid == dev->lct_data.tid) { + found = 1; + break; + } + } + + if (!found) + i2o_device_add(c, entry); + + table_size -= 9; + max++; + } + + /* remove devices, which are not in the LCT anymore */ + list_for_each_entry_safe(dev, tmp, &c->devices, list) { + int found = 0; + + for (i = 0; i < max; i++) { + if (lct->lct_entry[i].tid == dev->lct_data.tid) { + found = 1; + break; + } + } + + if (!found) + i2o_device_remove(dev); + } + + mutex_unlock(&c->lct_lock); + + return 0; +} + +/* + * Run time support routines + */ + +/* Issue UTIL_PARAMS_GET or UTIL_PARAMS_SET + * + * This function can be used for all UtilParamsGet/Set operations. + * The OperationList is given in oplist-buffer, + * and results are returned in reslist-buffer. + * Note that the minimum sized reslist is 8 bytes and contains + * ResultCount, ErrorInfoSize, BlockStatus and BlockSize.
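+ * + * For example, i2o_parm_field_get() below simply skips these 8 header + * bytes when copying a result back to its caller: + * + *	rc = i2o_parm_issue(i2o_dev, I2O_CMD_UTIL_PARAMS_GET, opblk, + *	                    sizeof(opblk), resblk, buflen + 8); + *	memcpy(buf, resblk + 8, buflen);	- skip the 8-byte header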
+ */ +int i2o_parm_issue(struct i2o_device *i2o_dev, int cmd, void *oplist, + int oplen, void *reslist, int reslen) +{ + struct i2o_message *msg; + int i = 0; + int rc; + struct i2o_dma res; + struct i2o_controller *c = i2o_dev->iop; + struct device *dev = &c->pdev->dev; + + res.virt = NULL; + + if (i2o_dma_alloc(dev, &res, reslen)) + return -ENOMEM; + + msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET); + if (IS_ERR(msg)) { + i2o_dma_free(dev, &res); + return PTR_ERR(msg); + } + + i = 0; + msg->u.head[1] = + cpu_to_le32(cmd << 24 | HOST_TID << 12 | i2o_dev->lct_data.tid); + msg->body[i++] = cpu_to_le32(0x00000000); + msg->body[i++] = cpu_to_le32(0x4C000000 | oplen); /* OperationList */ + memcpy(&msg->body[i], oplist, oplen); + i += (oplen / 4 + (oplen % 4 ? 1 : 0)); + msg->body[i++] = cpu_to_le32(0xD0000000 | res.len); /* ResultList */ + msg->body[i++] = cpu_to_le32(res.phys); + + msg->u.head[0] = + cpu_to_le32(I2O_MESSAGE_SIZE(i + sizeof(struct i2o_message) / 4) | + SGL_OFFSET_5); + + rc = i2o_msg_post_wait_mem(c, msg, 10, &res); + + /* This only looks like a memory leak - don't "fix" it. */ + if (rc == -ETIMEDOUT) + return rc; + + memcpy(reslist, res.virt, res.len); + i2o_dma_free(dev, &res); + + return rc; +} + +/* + * Query one field value or a whole scalar group. + */ +int i2o_parm_field_get(struct i2o_device *i2o_dev, int group, int field, + void *buf, int buflen) +{ + u32 opblk[] = { cpu_to_le32(0x00000001), + cpu_to_le32((u16) group << 16 | I2O_PARAMS_FIELD_GET), + cpu_to_le32((s16) field << 16 | 0x00000001) + }; + u8 *resblk; /* 8 bytes for header */ + int rc; + + resblk = kmalloc(buflen + 8, GFP_KERNEL); + if (!resblk) + return -ENOMEM; + + rc = i2o_parm_issue(i2o_dev, I2O_CMD_UTIL_PARAMS_GET, opblk, + sizeof(opblk), resblk, buflen + 8); + + memcpy(buf, resblk + 8, buflen); /* cut off header */ + + kfree(resblk); + + return rc; +} + +/* + * if oper == I2O_PARAMS_TABLE_GET, get from all rows + * if fieldcount == -1 return all fields + * ibuf and ibuflen are unused (use NULL, 0) + * else return specific fields + * ibuf contains fieldindexes + * + * if oper == I2O_PARAMS_LIST_GET, get from specific rows + * if fieldcount == -1 return all fields + * ibuf contains rowcount, keyvalues + * else return specific fields + * fieldcount is # of fieldindexes + * ibuf contains fieldindexes, rowcount, keyvalues + * + * You could also call i2o_parm_issue() directly.
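+ * + * A minimal usage sketch (illustrative only; the group number and the + * buffer size are arbitrary): fetch all fields of all rows of group + * 0x0001 with + * + *	u8 resblk[256]; + *	rc = i2o_parm_table_get(dev, I2O_PARAMS_TABLE_GET, 0x0001, -1, + *	                        NULL, 0, resblk, sizeof(resblk));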
+ */ +int i2o_parm_table_get(struct i2o_device *dev, int oper, int group, + int fieldcount, void *ibuf, int ibuflen, void *resblk, + int reslen) +{ + u16 *opblk; + int size; + + size = 10 + ibuflen; + if (size % 4) + size += 4 - size % 4; + + opblk = kmalloc(size, GFP_KERNEL); + if (opblk == NULL) { + printk(KERN_ERR "i2o: no memory for query buffer.\n"); + return -ENOMEM; + } + + opblk[0] = 1; /* operation count */ + opblk[1] = 0; /* pad */ + opblk[2] = oper; + opblk[3] = group; + opblk[4] = fieldcount; + memcpy(opblk + 5, ibuf, ibuflen); /* other params */ + + size = i2o_parm_issue(dev, I2O_CMD_UTIL_PARAMS_GET, opblk, + size, resblk, reslen); + + kfree(opblk); + if (size > reslen) + return reslen; + + return size; +} + +EXPORT_SYMBOL(i2o_device_claim); +EXPORT_SYMBOL(i2o_device_claim_release); +EXPORT_SYMBOL(i2o_parm_field_get); +EXPORT_SYMBOL(i2o_parm_table_get); +EXPORT_SYMBOL(i2o_parm_issue); diff --git a/drivers/staging/i2o/driver.c b/drivers/staging/i2o/driver.c new file mode 100644 index 000000000000..111c3edde035 --- /dev/null +++ b/drivers/staging/i2o/driver.c @@ -0,0 +1,382 @@ +/* + * Functions to handle I2O drivers (OSMs) and I2O bus type for sysfs + * + * Copyright (C) 2004 Markus Lidel + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * Fixes/additions: + * Markus Lidel + * initial version. + */ + +#include <linux/device.h> +#include <linux/module.h> +#include <linux/rwsem.h> +#include "i2o.h" +#include <linux/workqueue.h> +#include <linux/string.h> +#include <linux/slab.h> +#include "core.h" + +#define OSM_NAME "i2o" + +/* max_drivers - Maximum I2O drivers (OSMs) which can be registered */ +static unsigned int i2o_max_drivers = I2O_MAX_DRIVERS; +module_param_named(max_drivers, i2o_max_drivers, uint, 0); +MODULE_PARM_DESC(max_drivers, "maximum number of OSM's to support"); + +/* I2O drivers lock and array */ +static spinlock_t i2o_drivers_lock; +static struct i2o_driver **i2o_drivers; + +/** + * i2o_bus_match - Tell if I2O device class id matches the class ids of the I2O driver (OSM) + * @dev: device which should be verified + * @drv: the driver to match against + * + * Used by the bus to check if the driver wants to handle the device. + * + * Returns 1 if the class ids of the driver match the class id of the + * device, otherwise 0. + */ +static int i2o_bus_match(struct device *dev, struct device_driver *drv) +{ + struct i2o_device *i2o_dev = to_i2o_device(dev); + struct i2o_driver *i2o_drv = to_i2o_driver(drv); + struct i2o_class_id *ids = i2o_drv->classes; + + if (ids) + while (ids->class_id != I2O_CLASS_END) { + if (ids->class_id == i2o_dev->lct_data.class_id) + return 1; + ids++; + } + return 0; +}; + +/* I2O bus type */ +struct bus_type i2o_bus_type = { + .name = "i2o", + .match = i2o_bus_match, + .dev_groups = i2o_device_groups, +}; + +/** + * i2o_driver_register - Register an I2O driver (OSM) in the I2O core + * @drv: I2O driver which should be registered + * + * Registers the OSM drv in the I2O core and creates an event queue if + * necessary. + * + * Returns 0 on success or negative error code on failure.
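+ * + * A minimal registration might look like this (illustrative only; the + * reply handler my_reply_fn is hypothetical): + * + *	static struct i2o_class_id my_classes[] = { + *		{I2O_CLASS_EXECUTIVE}, + *		{I2O_CLASS_END} + *	}; + *	static struct i2o_driver my_osm = { + *		.name	 = "my-osm", + *		.classes = my_classes, + *		.reply	 = my_reply_fn, + *	}; + *	rc = i2o_driver_register(&my_osm);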
+ */ +int i2o_driver_register(struct i2o_driver *drv) +{ + struct i2o_controller *c; + int i; + int rc = 0; + unsigned long flags; + + osm_debug("Register driver %s\n", drv->name); + + if (drv->event) { + drv->event_queue = alloc_workqueue("%s", WQ_MEM_RECLAIM, 1, + drv->name); + if (!drv->event_queue) { + osm_err("Could not initialize event queue for driver " + "%s\n", drv->name); + return -ENOMEM; + } + osm_debug("Event queue initialized for driver %s\n", drv->name); + } else + drv->event_queue = NULL; + + drv->driver.name = drv->name; + drv->driver.bus = &i2o_bus_type; + + spin_lock_irqsave(&i2o_drivers_lock, flags); + + /* find a free slot without indexing past the end of the array */ + for (i = 0; i < i2o_max_drivers && i2o_drivers[i]; i++) + ; + + if (i >= i2o_max_drivers) { + osm_err("too many drivers registered, increase max_drivers\n"); + spin_unlock_irqrestore(&i2o_drivers_lock, flags); + rc = -EFAULT; + goto out; + } + + drv->context = i; + i2o_drivers[i] = drv; + + spin_unlock_irqrestore(&i2o_drivers_lock, flags); + + osm_debug("driver %s gets context id %d\n", drv->name, drv->context); + + list_for_each_entry(c, &i2o_controllers, list) { + struct i2o_device *i2o_dev; + + i2o_driver_notify_controller_add(drv, c); + list_for_each_entry(i2o_dev, &c->devices, list) + i2o_driver_notify_device_add(drv, i2o_dev); + } + + rc = driver_register(&drv->driver); + if (rc) + goto out_unreg; + + return 0; + +out_unreg: + /* free the context slot claimed above so it can be reused */ + spin_lock_irqsave(&i2o_drivers_lock, flags); + i2o_drivers[drv->context] = NULL; + spin_unlock_irqrestore(&i2o_drivers_lock, flags); +out: + if (drv->event_queue) { + destroy_workqueue(drv->event_queue); + drv->event_queue = NULL; + } + + return rc; +}; + +/** + * i2o_driver_unregister - Unregister an I2O driver (OSM) from the I2O core + * @drv: I2O driver which should be unregistered + * + * Unregisters the OSM drv from the I2O core and cleans up the event queue + * if necessary. + */ +void i2o_driver_unregister(struct i2o_driver *drv) +{ + struct i2o_controller *c; + unsigned long flags; + + osm_debug("unregister driver %s\n", drv->name); + + driver_unregister(&drv->driver); + + list_for_each_entry(c, &i2o_controllers, list) { + struct i2o_device *i2o_dev; + + list_for_each_entry(i2o_dev, &c->devices, list) + i2o_driver_notify_device_remove(drv, i2o_dev); + + i2o_driver_notify_controller_remove(drv, c); + } + + spin_lock_irqsave(&i2o_drivers_lock, flags); + i2o_drivers[drv->context] = NULL; + spin_unlock_irqrestore(&i2o_drivers_lock, flags); + + if (drv->event_queue) { + destroy_workqueue(drv->event_queue); + drv->event_queue = NULL; + osm_debug("event queue removed for %s\n", drv->name); + } +}; + +/** + * i2o_driver_dispatch - dispatch an I2O reply message + * @c: I2O controller of the message + * @m: I2O message number + * + * The reply is delivered to the driver from which the original message + * originated. This function is only called from interrupt context. + * + * Returns 0 on success and the message should not be flushed. Returns > 0 + * on success and if the message should be flushed afterwards. Returns + * negative error code on failure (the message will be flushed too).
+ */ +int i2o_driver_dispatch(struct i2o_controller *c, u32 m) +{ + struct i2o_driver *drv; + struct i2o_message *msg = i2o_msg_out_to_virt(c, m); + u32 context = le32_to_cpu(msg->u.s.icntxt); + unsigned long flags; + + if (unlikely(context >= i2o_max_drivers)) { + osm_warn("%s: Spurious reply to unknown driver %d\n", c->name, + context); + return -EIO; + } + + spin_lock_irqsave(&i2o_drivers_lock, flags); + drv = i2o_drivers[context]; + spin_unlock_irqrestore(&i2o_drivers_lock, flags); + + if (unlikely(!drv)) { + osm_warn("%s: Spurious reply to unknown driver %d\n", c->name, + context); + return -EIO; + } + + if ((le32_to_cpu(msg->u.head[1]) >> 24) == I2O_CMD_UTIL_EVT_REGISTER) { + struct i2o_device *dev, *tmp; + struct i2o_event *evt; + u16 size; + u16 tid = le32_to_cpu(msg->u.head[1]) & 0xfff; + + osm_debug("event received from device %d\n", tid); + + if (!drv->event) + return -EIO; + + /* cut off the header from the message size (in 32-bit words) */ + size = (le32_to_cpu(msg->u.head[0]) >> 16) - 5; + + evt = kzalloc(size * 4 + sizeof(*evt), GFP_ATOMIC); + if (!evt) + return -ENOMEM; + + evt->size = size; + evt->tcntxt = le32_to_cpu(msg->u.s.tcntxt); + evt->event_indicator = le32_to_cpu(msg->body[0]); + memcpy(&evt->data, &msg->body[1], size * 4); + + list_for_each_entry_safe(dev, tmp, &c->devices, list) + if (dev->lct_data.tid == tid) { + evt->i2o_dev = dev; + break; + } + + INIT_WORK(&evt->work, drv->event); + queue_work(drv->event_queue, &evt->work); + return 1; + } + + if (unlikely(!drv->reply)) { + osm_debug("%s: Reply to driver %s, but no reply function" + " defined!\n", c->name, drv->name); + return -EIO; + } + + return drv->reply(c, m, msg); +} + +/** + * i2o_driver_notify_controller_add_all - Send notify of added controller + * @c: newly added controller + * + * Send notifications to all registered drivers that a new controller was + * added. + */ +void i2o_driver_notify_controller_add_all(struct i2o_controller *c) +{ + int i; + struct i2o_driver *drv; + + for (i = 0; i < i2o_max_drivers; i++) { + drv = i2o_drivers[i]; + + if (drv) + i2o_driver_notify_controller_add(drv, c); + } +} + +/** + * i2o_driver_notify_controller_remove_all - Send notify of removed controller + * @c: controller that is being removed + * + * Send notifications to all registered drivers that a controller was + * removed. + */ +void i2o_driver_notify_controller_remove_all(struct i2o_controller *c) +{ + int i; + struct i2o_driver *drv; + + for (i = 0; i < i2o_max_drivers; i++) { + drv = i2o_drivers[i]; + + if (drv) + i2o_driver_notify_controller_remove(drv, c); + } +} + +/** + * i2o_driver_notify_device_add_all - Send notify of added device + * @i2o_dev: newly added I2O device + * + * Send notifications to all registered drivers that a device was added. + */ +void i2o_driver_notify_device_add_all(struct i2o_device *i2o_dev) +{ + int i; + struct i2o_driver *drv; + + for (i = 0; i < i2o_max_drivers; i++) { + drv = i2o_drivers[i]; + + if (drv) + i2o_driver_notify_device_add(drv, i2o_dev); + } +} + +/** + * i2o_driver_notify_device_remove_all - Send notify of removed device + * @i2o_dev: device that is being removed + * + * Send notifications to all registered drivers that a device was removed.
+ */ +void i2o_driver_notify_device_remove_all(struct i2o_device *i2o_dev) +{ + int i; + struct i2o_driver *drv; + + for (i = 0; i < i2o_max_drivers; i++) { + drv = i2o_drivers[i]; + + if (drv) + i2o_driver_notify_device_remove(drv, i2o_dev); + } +} + +/** + * i2o_driver_init - initialize I2O drivers (OSMs) + * + * Registers the I2O bus and allocates memory for the array of OSMs. + * + * Returns 0 on success or negative error code on failure. + */ +int __init i2o_driver_init(void) +{ + int rc = 0; + + spin_lock_init(&i2o_drivers_lock); + + if ((i2o_max_drivers < 2) || (i2o_max_drivers > 64)) { + osm_warn("max_drivers set to %d, but must be >=2 and <= 64\n", + i2o_max_drivers); + i2o_max_drivers = I2O_MAX_DRIVERS; + } + osm_info("max drivers = %d\n", i2o_max_drivers); + + i2o_drivers = + kcalloc(i2o_max_drivers, sizeof(*i2o_drivers), GFP_KERNEL); + if (!i2o_drivers) + return -ENOMEM; + + rc = bus_register(&i2o_bus_type); + + if (rc < 0) + kfree(i2o_drivers); + + return rc; +}; + +/** + * i2o_driver_exit - clean up I2O drivers (OSMs) + * + * Unregisters the I2O bus and frees the driver array. + */ +void i2o_driver_exit(void) +{ + bus_unregister(&i2o_bus_type); + kfree(i2o_drivers); +}; + +EXPORT_SYMBOL(i2o_driver_register); +EXPORT_SYMBOL(i2o_driver_unregister); +EXPORT_SYMBOL(i2o_driver_notify_controller_add_all); +EXPORT_SYMBOL(i2o_driver_notify_controller_remove_all); +EXPORT_SYMBOL(i2o_driver_notify_device_add_all); +EXPORT_SYMBOL(i2o_driver_notify_device_remove_all); diff --git a/drivers/staging/i2o/exec-osm.c b/drivers/staging/i2o/exec-osm.c new file mode 100644 index 000000000000..16d857d5e655 --- /dev/null +++ b/drivers/staging/i2o/exec-osm.c @@ -0,0 +1,612 @@ +/* + * Executive OSM + * + * Copyright (C) 1999-2002 Red Hat Software + * + * Written by Alan Cox, Building Number Three Ltd + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * A lot of the I2O message side code from this is taken from the Red + * Creek RCPCI45 adapter driver by Red Creek Communications + * + * Fixes/additions: + * Philipp Rumpf + * Juha Sievänen + * Auvo Häkkinen + * Deepak Saxena + * Boji T Kannanthanam + * Alan Cox : + * Ported to Linux 2.5. + * Markus Lidel : + * Minor fixes for 2.6. + * Markus Lidel : + * Support for sysfs included.
+ */ + +#include <linux/module.h> +#include "i2o.h" +#include <linux/delay.h> +#include <linux/workqueue.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/sched.h> /* wait_event_interruptible_timeout() needs this */ +#include <asm/param.h> /* HZ */ +#include "core.h" + +#define OSM_NAME "exec-osm" + +struct i2o_driver i2o_exec_driver; + +/* global wait list for POST WAIT */ +static LIST_HEAD(i2o_exec_wait_list); + +/* Wait struct needed for POST WAIT */ +struct i2o_exec_wait { + wait_queue_head_t *wq; /* Pointer to Wait queue */ + struct i2o_dma dma; /* DMA buffers to free on failure */ + u32 tcntxt; /* transaction context from reply */ + int complete; /* 1 if reply received otherwise 0 */ + u32 m; /* message id */ + struct i2o_message *msg; /* pointer to the reply message */ + struct list_head list; /* node in global wait list */ + spinlock_t lock; /* lock before modifying */ +}; + +/* Work struct needed to handle LCT NOTIFY replies */ +struct i2o_exec_lct_notify_work { + struct work_struct work; /* work struct */ + struct i2o_controller *c; /* controller on which the LCT NOTIFY + was received */ +}; + +/* Exec OSM class handling definition */ +static struct i2o_class_id i2o_exec_class_id[] = { + {I2O_CLASS_EXECUTIVE}, + {I2O_CLASS_END} +}; + +/** + * i2o_exec_wait_alloc - Allocate an i2o_exec_wait struct and initialize it + * + * Allocate the i2o_exec_wait struct and initialize the wait. + * + * Returns an i2o_exec_wait pointer on success or NULL on failure. + */ +static struct i2o_exec_wait *i2o_exec_wait_alloc(void) +{ + struct i2o_exec_wait *wait; + + wait = kzalloc(sizeof(*wait), GFP_KERNEL); + if (!wait) + return NULL; + + INIT_LIST_HEAD(&wait->list); + spin_lock_init(&wait->lock); + + return wait; +}; + +/** + * i2o_exec_wait_free - Free an i2o_exec_wait struct + * @wait: I2O wait data which should be cleaned up + */ +static void i2o_exec_wait_free(struct i2o_exec_wait *wait) +{ + kfree(wait); +}; + +/** + * i2o_msg_post_wait_mem - Post and wait a message with DMA buffers + * @c: controller + * @msg: message to post + * @timeout: time in seconds to wait + * @dma: i2o_dma struct of the DMA buffer to free on failure + * + * This API allows an OSM to post a message and then be told whether or + * not the system received a successful reply. If the message times out + * then the value '-ETIMEDOUT' is returned. This is a special case. In + * this situation the message may (should) complete at an indefinite time + * in the future. When it completes it will use the memory buffer + * attached to the request. If -ETIMEDOUT is returned then the memory + * buffer must not be freed; the event completion will free it for you. + * In all other cases the buffer is your problem. + * + * Returns 0 on success, negative error code on timeout or positive error + * code from reply. + */ +int i2o_msg_post_wait_mem(struct i2o_controller *c, struct i2o_message *msg, + unsigned long timeout, struct i2o_dma *dma) +{ + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + struct i2o_exec_wait *wait; + static u32 tcntxt = 0x80000000; + unsigned long flags; + int rc = 0; + + wait = i2o_exec_wait_alloc(); + if (!wait) { + i2o_msg_nop(c, msg); + return -ENOMEM; + } + + if (tcntxt == 0xffffffff) + tcntxt = 0x80000000; + + if (dma) + wait->dma = *dma; + + /* + * Fill in the message initiator context and transaction context. + * We will only use transaction contexts >= 0x80000000 for POST WAIT, + * so we can find a POST WAIT reply more easily in the reply handler.
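+ * For example, the first POST WAIT request gets tcntxt 0x80000000, + * the next one 0x80000001 and so on; i2o_exec_reply() treats any + * context with the top bit set as a POST WAIT completion.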
+ */ + msg->u.s.icntxt = cpu_to_le32(i2o_exec_driver.context); + wait->tcntxt = tcntxt++; + msg->u.s.tcntxt = cpu_to_le32(wait->tcntxt); + + wait->wq = &wq; + /* + * We add elements to the head: if an entry in the list is never + * removed it ends up at the tail, so we do not have to iterate + * over it every time. + */ + list_add(&wait->list, &i2o_exec_wait_list); + + /* + * Post the message to the controller. At some point later it will + * return. If we time out before it returns then complete will be zero. + */ + i2o_msg_post(c, msg); + + wait_event_interruptible_timeout(wq, wait->complete, timeout * HZ); + + spin_lock_irqsave(&wait->lock, flags); + + wait->wq = NULL; + + if (wait->complete) + rc = le32_to_cpu(wait->msg->body[0]) >> 24; + else { + /* + * We cannot remove it now. This is important. When it does + * terminate (which it must do if the controller has not + * died...) then it will otherwise scribble on stuff. + * + * FIXME: try abort message + */ + if (dma) + dma->virt = NULL; + + rc = -ETIMEDOUT; + } + + spin_unlock_irqrestore(&wait->lock, flags); + + if (rc != -ETIMEDOUT) { + i2o_flush_reply(c, wait->m); + i2o_exec_wait_free(wait); + } + + return rc; +}; + +/** + * i2o_msg_post_wait_complete - Reply to an i2o_msg_post request from IOP + * @c: I2O controller which answers + * @m: message id + * @msg: pointer to the I2O reply message + * @context: transaction context of request + * + * This function is called in interrupt context only. If the reply arrives + * before the timeout, the i2o_exec_wait struct is filled with the message + * and the task is woken up. The task is then responsible for returning + * the message m back to the controller! If the message reaches us after + * the timeout, clean up the i2o_exec_wait struct (including the allocated + * DMA buffer). + * + * Returns 0 on success and if the message m should not be given back to the + * I2O controller, or >0 on success and if the message should be given back + * afterwards. Returns negative error code on failure. In this case the + * message must also be given back to the controller. + */ +static int i2o_msg_post_wait_complete(struct i2o_controller *c, u32 m, + struct i2o_message *msg, u32 context) +{ + struct i2o_exec_wait *wait, *tmp; + unsigned long flags; + int rc = 1; + + /* + * We need to search through the i2o_exec_wait_list to see if the given + * message is still outstanding. If not, it means that the IOP took + * longer to respond to the message than we had allowed and the timer + * has already expired. Not much we can do about that except log it for + * debug purposes, increase the timeout, and recompile.
+ */ + list_for_each_entry_safe(wait, tmp, &i2o_exec_wait_list, list) { + if (wait->tcntxt == context) { + spin_lock_irqsave(&wait->lock, flags); + + list_del(&wait->list); + + wait->m = m; + wait->msg = msg; + wait->complete = 1; + + if (wait->wq) + rc = 0; + else + rc = -1; + + spin_unlock_irqrestore(&wait->lock, flags); + + if (rc) { + struct device *dev; + + dev = &c->pdev->dev; + + pr_debug("%s: timed out reply received!\n", + c->name); + i2o_dma_free(dev, &wait->dma); + i2o_exec_wait_free(wait); + } else + wake_up_interruptible(wait->wq); + + return rc; + } + } + + osm_warn("%s: Bogus reply in POST WAIT (tr-context: %08x)!\n", c->name, + context); + + return -1; +}; + +/** + * i2o_exec_show_vendor_id - Displays Vendor ID of controller + * @d: device of which the Vendor ID should be displayed + * @attr: device_attribute to display + * @buf: buffer into which the Vendor ID should be printed + * + * Returns number of bytes printed into buffer. + */ +static ssize_t i2o_exec_show_vendor_id(struct device *d, + struct device_attribute *attr, char *buf) +{ + struct i2o_device *dev = to_i2o_device(d); + u16 id; + + if (!i2o_parm_field_get(dev, 0x0000, 0, &id, 2)) { + sprintf(buf, "0x%04x", le16_to_cpu(id)); + return strlen(buf) + 1; + } + + return 0; +}; + +/** + * i2o_exec_show_product_id - Displays Product ID of controller + * @d: device of which the Product ID should be displayed + * @attr: device_attribute to display + * @buf: buffer into which the Product ID should be printed + * + * Returns number of bytes printed into buffer. + */ +static ssize_t i2o_exec_show_product_id(struct device *d, + struct device_attribute *attr, + char *buf) +{ + struct i2o_device *dev = to_i2o_device(d); + u16 id; + + if (!i2o_parm_field_get(dev, 0x0000, 1, &id, 2)) { + sprintf(buf, "0x%04x", le16_to_cpu(id)); + return strlen(buf) + 1; + } + + return 0; +}; + +/* Exec-OSM device attributes */ +static DEVICE_ATTR(vendor_id, S_IRUGO, i2o_exec_show_vendor_id, NULL); +static DEVICE_ATTR(product_id, S_IRUGO, i2o_exec_show_product_id, NULL); + +/** + * i2o_exec_probe - Called if a new I2O device (executive class) appears + * @dev: I2O device which should be probed + * + * Registers event notification for every event from the Executive device. + * We want to handle all devices of class Executive, so the probe itself + * performs no further filtering. + * + * Returns 0 on success or negative error code on failure. + */ +static int i2o_exec_probe(struct device *dev) +{ + struct i2o_device *i2o_dev = to_i2o_device(dev); + int rc; + + rc = i2o_event_register(i2o_dev, &i2o_exec_driver, 0, 0xffffffff); + if (rc) + goto err_out; + + rc = device_create_file(dev, &dev_attr_vendor_id); + if (rc) + goto err_evtreg; + rc = device_create_file(dev, &dev_attr_product_id); + if (rc) + goto err_vid; + + i2o_dev->iop->exec = i2o_dev; + + return 0; + +err_vid: + device_remove_file(dev, &dev_attr_vendor_id); +err_evtreg: + i2o_event_register(to_i2o_device(dev), &i2o_exec_driver, 0, 0); +err_out: + return rc; +}; + +/** + * i2o_exec_remove - Called on I2O device removal + * @dev: I2O device which was removed + * + * Unregisters event notification from the Executive I2O device. + * + * Returns 0 on success.
+ */ +static int i2o_exec_remove(struct device *dev) +{ + device_remove_file(dev, &dev_attr_product_id); + device_remove_file(dev, &dev_attr_vendor_id); + + i2o_event_register(to_i2o_device(dev), &i2o_exec_driver, 0, 0); + + return 0; +}; + +#ifdef CONFIG_I2O_LCT_NOTIFY_ON_CHANGES +/** + * i2o_exec_lct_notify - Send an asynchronous LCT NOTIFY request + * @c: I2O controller to which the request should be sent + * @change_ind: change indicator + * + * This function sends an LCT NOTIFY request to the I2O controller with + * the change indicator change_ind. If change_ind == 0 the controller + * replies immediately after the request. If change_ind > 0 the reply is + * sent once the change indicator of the LCT exceeds change_ind. + */ +static int i2o_exec_lct_notify(struct i2o_controller *c, u32 change_ind) +{ + i2o_status_block *sb = c->status_block.virt; + struct device *dev; + struct i2o_message *msg; + + mutex_lock(&c->lct_lock); + + dev = &c->pdev->dev; + + if (i2o_dma_realloc(dev, &c->dlct, + le32_to_cpu(sb->expected_lct_size))) { + mutex_unlock(&c->lct_lock); + return -ENOMEM; + } + + msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET); + if (IS_ERR(msg)) { + mutex_unlock(&c->lct_lock); + return PTR_ERR(msg); + } + + msg->u.head[0] = cpu_to_le32(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_6); + msg->u.head[1] = cpu_to_le32(I2O_CMD_LCT_NOTIFY << 24 | HOST_TID << 12 | + ADAPTER_TID); + msg->u.s.icntxt = cpu_to_le32(i2o_exec_driver.context); + msg->u.s.tcntxt = cpu_to_le32(0x00000000); + msg->body[0] = cpu_to_le32(0xffffffff); + msg->body[1] = cpu_to_le32(change_ind); + msg->body[2] = cpu_to_le32(0xd0000000 | c->dlct.len); + msg->body[3] = cpu_to_le32(c->dlct.phys); + + i2o_msg_post(c, msg); + + mutex_unlock(&c->lct_lock); + + return 0; +} +#endif + +/** + * i2o_exec_lct_modified - Called on LCT NOTIFY reply + * @_work: work struct for a specific controller + * + * This function handles asynchronous LCT NOTIFY replies. It parses the + * new LCT and, if the buffer for the LCT was too small, sends an LCT + * NOTIFY again; otherwise it sends an LCT NOTIFY to get informed of the + * next LCT change. + */ +static void i2o_exec_lct_modified(struct work_struct *_work) +{ + struct i2o_exec_lct_notify_work *work = + container_of(_work, struct i2o_exec_lct_notify_work, work); + u32 change_ind = 0; + struct i2o_controller *c = work->c; + + kfree(work); + + if (i2o_device_parse_lct(c) != -EAGAIN) + change_ind = c->lct->change_ind + 1; + +#ifdef CONFIG_I2O_LCT_NOTIFY_ON_CHANGES + i2o_exec_lct_notify(c, change_ind); +#endif +}; + +/** + * i2o_exec_reply - I2O Executive reply handler + * @c: I2O controller from which the reply comes + * @m: message id + * @msg: pointer to the I2O reply message + * + * This function is always called from interrupt context. If a POST WAIT + * reply was received, pass it to the complete function. If an LCT NOTIFY + * reply was received, a work item is queued to handle the update. + * + * Returns 0 on success and if the reply should not be flushed or > 0 + * on success and if the reply should be flushed. Returns negative error + * code on failure and if the reply should be flushed. + */ +static int i2o_exec_reply(struct i2o_controller *c, u32 m, + struct i2o_message *msg) +{ + u32 context; + + if (le32_to_cpu(msg->u.head[0]) & MSG_FAIL) { + struct i2o_message __iomem *pmsg; + u32 pm; + + /* + * If the Fail bit is set we must take the transaction context + * of the preserved message to find the right request again.
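+ * body[3] of the failure reply carries the MFA of the preserved + * message; it is translated back to a virtual address right below.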
+ */ + + pm = le32_to_cpu(msg->body[3]); + pmsg = i2o_msg_in_to_virt(c, pm); + context = readl(&pmsg->u.s.tcntxt); + + i2o_report_status(KERN_INFO, "i2o_core", msg); + + /* Release the preserved msg */ + i2o_msg_nop_mfa(c, pm); + } else + context = le32_to_cpu(msg->u.s.tcntxt); + + if (context & 0x80000000) + return i2o_msg_post_wait_complete(c, m, msg, context); + + if ((le32_to_cpu(msg->u.head[1]) >> 24) == I2O_CMD_LCT_NOTIFY) { + struct i2o_exec_lct_notify_work *work; + + pr_debug("%s: LCT notify received\n", c->name); + + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (!work) + return -ENOMEM; + + work->c = c; + + INIT_WORK(&work->work, i2o_exec_lct_modified); + queue_work(i2o_exec_driver.event_queue, &work->work); + return 1; + } + + /* + * If this happens, we want to dump the message to the syslog so + * it can be sent back to the card manufacturer by the end user + * to aid in debugging. + * + */ + printk(KERN_WARNING "%s: Unsolicited message reply sent to core! " + "Message dumped to syslog\n", c->name); + i2o_dump_message(msg); + + return -EFAULT; +} + +/** + * i2o_exec_event - Event handling function + * @work: work item of the occurring event + * + * Handles events sent by the Executive device. At the moment it does not + * do anything useful. + */ +static void i2o_exec_event(struct work_struct *work) +{ + struct i2o_event *evt = container_of(work, struct i2o_event, work); + + if (likely(evt->i2o_dev)) + osm_debug("Event received from device: %d\n", + evt->i2o_dev->lct_data.tid); + kfree(evt); +}; + +/** + * i2o_exec_lct_get - Get the IOP's Logical Configuration Table + * @c: I2O controller from which the LCT should be fetched + * + * Send an LCT NOTIFY request to the controller, and wait up to + * I2O_TIMEOUT_LCT_GET seconds for the response to arrive. If the LCT is + * too large, retry. + * + * Returns 0 on success or negative error code on failure. + */ +int i2o_exec_lct_get(struct i2o_controller *c) +{ + struct i2o_message *msg; + int i = 0; + int rc = -EAGAIN; + + for (i = 1; i <= I2O_LCT_GET_TRIES; i++) { + msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET); + if (IS_ERR(msg)) + return PTR_ERR(msg); + + msg->u.head[0] = + cpu_to_le32(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_6); + msg->u.head[1] = + cpu_to_le32(I2O_CMD_LCT_NOTIFY << 24 | HOST_TID << 12 | + ADAPTER_TID); + msg->body[0] = cpu_to_le32(0xffffffff); + msg->body[1] = cpu_to_le32(0x00000000); + msg->body[2] = cpu_to_le32(0xd0000000 | c->dlct.len); + msg->body[3] = cpu_to_le32(c->dlct.phys); + + rc = i2o_msg_post_wait(c, msg, I2O_TIMEOUT_LCT_GET); + if (rc < 0) + break; + + rc = i2o_device_parse_lct(c); + if (rc != -EAGAIN) + break; + } + + return rc; +} + +/* Exec OSM driver struct */ +struct i2o_driver i2o_exec_driver = { + .name = OSM_NAME, + .reply = i2o_exec_reply, + .event = i2o_exec_event, + .classes = i2o_exec_class_id, + .driver = { + .probe = i2o_exec_probe, + .remove = i2o_exec_remove, + }, +}; + +/** + * i2o_exec_init - Registers the Exec OSM + * + * Registers the Exec OSM in the I2O core. + * + * Returns 0 on success or negative error code on failure. + */ +int __init i2o_exec_init(void) +{ + return i2o_driver_register(&i2o_exec_driver); +}; + +/** + * i2o_exec_exit - Removes the Exec OSM + * + * Unregisters the Exec OSM from the I2O core.
+ */ +void i2o_exec_exit(void) +{ + i2o_driver_unregister(&i2o_exec_driver); +}; + +EXPORT_SYMBOL(i2o_msg_post_wait_mem); +EXPORT_SYMBOL(i2o_exec_lct_get); diff --git a/drivers/staging/i2o/i2o.h b/drivers/staging/i2o/i2o.h new file mode 100644 index 000000000000..d23c3c20b201 --- /dev/null +++ b/drivers/staging/i2o/i2o.h @@ -0,0 +1,988 @@ +/* + * I2O kernel space accessible structures/APIs + * + * (c) Copyright 1999, 2000 Red Hat Software + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + ************************************************************************* + * + * This header file defines the I2O APIs/structures for use by + * the I2O kernel modules. + * + */ + +#ifndef _I2O_H +#define _I2O_H + +#include <linux/i2o-dev.h> + +/* How many different OSM's are we allowing */ +#define I2O_MAX_DRIVERS 8 + +#include <linux/pci.h> +#include <linux/bug.h> +#include <linux/dma-mapping.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/workqueue.h> /* work_struct */ +#include <linux/mempool.h> +#include <linux/mutex.h> +#include <linux/scatterlist.h> +#include <linux/semaphore.h> /* Needed for MUTEX init macros */ + +#include <asm/io.h> + +/* message queue empty */ +#define I2O_QUEUE_EMPTY 0xffffffff + +/* + * Cache strategies + */ + +/* The NULL strategy leaves everything up to the controller. This tends to be a + * pessimal but functional choice. + */ +#define CACHE_NULL 0 +/* Prefetch data when reading. We continually attempt to load the next 32 sectors + * into the controller cache. + */ +#define CACHE_PREFETCH 1 +/* Prefetch data when reading. We sometimes attempt to load the next 32 sectors + * into the controller cache. When an I/O is <= 8K we assume it's probably + * not sequential and don't prefetch (default) + */ +#define CACHE_SMARTFETCH 2 +/* Data is written to the cache and then out on to the disk. The I/O must be + * physically on the medium before the write is acknowledged (default without + * NVRAM) + */ +#define CACHE_WRITETHROUGH 17 +/* Data is written to the cache and then out on to the disk. The controller + * is permitted to write back the cache any way it wants. (default if battery + * backed NVRAM is present). It can be useful to set this for swap regardless of + * battery state. + */ +#define CACHE_WRITEBACK 18 +/* Optimise for under powered controllers, especially on RAID1 and RAID0. We + * write large I/O's directly to disk bypassing the cache to avoid the extra + * memory copy hits. Small writes are writeback cached + */ +#define CACHE_SMARTBACK 19 +/* Optimise for under powered controllers, especially on RAID1 and RAID0. We + * write large I/O's directly to disk bypassing the cache to avoid the extra + * memory copy hits. Small writes are writethrough cached.
Suitable for devices + * lacking battery backup + */ +#define CACHE_SMARTTHROUGH 20 + +/* + * Ioctl structures + */ + +#define BLKI2OGRSTRAT _IOR('2', 1, int) +#define BLKI2OGWSTRAT _IOR('2', 2, int) +#define BLKI2OSRSTRAT _IOW('2', 3, int) +#define BLKI2OSWSTRAT _IOW('2', 4, int) + +/* + * I2O Function codes + */ + +/* + * Executive Class + */ +#define I2O_CMD_ADAPTER_ASSIGN 0xB3 +#define I2O_CMD_ADAPTER_READ 0xB2 +#define I2O_CMD_ADAPTER_RELEASE 0xB5 +#define I2O_CMD_BIOS_INFO_SET 0xA5 +#define I2O_CMD_BOOT_DEVICE_SET 0xA7 +#define I2O_CMD_CONFIG_VALIDATE 0xBB +#define I2O_CMD_CONN_SETUP 0xCA +#define I2O_CMD_DDM_DESTROY 0xB1 +#define I2O_CMD_DDM_ENABLE 0xD5 +#define I2O_CMD_DDM_QUIESCE 0xC7 +#define I2O_CMD_DDM_RESET 0xD9 +#define I2O_CMD_DDM_SUSPEND 0xAF +#define I2O_CMD_DEVICE_ASSIGN 0xB7 +#define I2O_CMD_DEVICE_RELEASE 0xB9 +#define I2O_CMD_HRT_GET 0xA8 +#define I2O_CMD_ADAPTER_CLEAR 0xBE +#define I2O_CMD_ADAPTER_CONNECT 0xC9 +#define I2O_CMD_ADAPTER_RESET 0xBD +#define I2O_CMD_LCT_NOTIFY 0xA2 +#define I2O_CMD_OUTBOUND_INIT 0xA1 +#define I2O_CMD_PATH_ENABLE 0xD3 +#define I2O_CMD_PATH_QUIESCE 0xC5 +#define I2O_CMD_PATH_RESET 0xD7 +#define I2O_CMD_STATIC_MF_CREATE 0xDD +#define I2O_CMD_STATIC_MF_RELEASE 0xDF +#define I2O_CMD_STATUS_GET 0xA0 +#define I2O_CMD_SW_DOWNLOAD 0xA9 +#define I2O_CMD_SW_UPLOAD 0xAB +#define I2O_CMD_SW_REMOVE 0xAD +#define I2O_CMD_SYS_ENABLE 0xD1 +#define I2O_CMD_SYS_MODIFY 0xC1 +#define I2O_CMD_SYS_QUIESCE 0xC3 +#define I2O_CMD_SYS_TAB_SET 0xA3 + +/* + * Utility Class + */ +#define I2O_CMD_UTIL_NOP 0x00 +#define I2O_CMD_UTIL_ABORT 0x01 +#define I2O_CMD_UTIL_CLAIM 0x09 +#define I2O_CMD_UTIL_RELEASE 0x0B +#define I2O_CMD_UTIL_PARAMS_GET 0x06 +#define I2O_CMD_UTIL_PARAMS_SET 0x05 +#define I2O_CMD_UTIL_EVT_REGISTER 0x13 +#define I2O_CMD_UTIL_EVT_ACK 0x14 +#define I2O_CMD_UTIL_CONFIG_DIALOG 0x10 +#define I2O_CMD_UTIL_DEVICE_RESERVE 0x0D +#define I2O_CMD_UTIL_DEVICE_RELEASE 0x0F +#define I2O_CMD_UTIL_LOCK 0x17 +#define I2O_CMD_UTIL_LOCK_RELEASE 0x19 +#define I2O_CMD_UTIL_REPLY_FAULT_NOTIFY 0x15 + +/* + * SCSI Host Bus Adapter Class + */ +#define I2O_CMD_SCSI_EXEC 0x81 +#define I2O_CMD_SCSI_ABORT 0x83 +#define I2O_CMD_SCSI_BUSRESET 0x27 + +/* + * Bus Adapter Class + */ +#define I2O_CMD_BUS_ADAPTER_RESET 0x85 +#define I2O_CMD_BUS_RESET 0x87 +#define I2O_CMD_BUS_SCAN 0x89 +#define I2O_CMD_BUS_QUIESCE 0x8b + +/* + * Random Block Storage Class + */ +#define I2O_CMD_BLOCK_READ 0x30 +#define I2O_CMD_BLOCK_WRITE 0x31 +#define I2O_CMD_BLOCK_CFLUSH 0x37 +#define I2O_CMD_BLOCK_MLOCK 0x49 +#define I2O_CMD_BLOCK_MUNLOCK 0x4B +#define I2O_CMD_BLOCK_MMOUNT 0x41 +#define I2O_CMD_BLOCK_MEJECT 0x43 +#define I2O_CMD_BLOCK_POWER 0x70 + +#define I2O_CMD_PRIVATE 0xFF + +/* Command status values */ + +#define I2O_CMD_IN_PROGRESS 0x01 +#define I2O_CMD_REJECTED 0x02 +#define I2O_CMD_FAILED 0x03 +#define I2O_CMD_COMPLETED 0x04 + +/* I2O API function return values */ + +#define I2O_RTN_NO_ERROR 0 +#define I2O_RTN_NOT_INIT 1 +#define I2O_RTN_FREE_Q_EMPTY 2 +#define I2O_RTN_TCB_ERROR 3 +#define I2O_RTN_TRANSACTION_ERROR 4 +#define I2O_RTN_ADAPTER_ALREADY_INIT 5 +#define I2O_RTN_MALLOC_ERROR 6 +#define I2O_RTN_ADPTR_NOT_REGISTERED 7 +#define I2O_RTN_MSG_REPLY_TIMEOUT 8 +#define I2O_RTN_NO_STATUS 9 +#define I2O_RTN_NO_FIRM_VER 10 +#define I2O_RTN_NO_LINK_SPEED 11 + +/* Reply message status defines for all messages */ + +#define I2O_REPLY_STATUS_SUCCESS 0x00 +#define I2O_REPLY_STATUS_ABORT_DIRTY 0x01 +#define I2O_REPLY_STATUS_ABORT_NO_DATA_TRANSFER 0x02 +#define 
I2O_REPLY_STATUS_ABORT_PARTIAL_TRANSFER 0x03 +#define I2O_REPLY_STATUS_ERROR_DIRTY 0x04 +#define I2O_REPLY_STATUS_ERROR_NO_DATA_TRANSFER 0x05 +#define I2O_REPLY_STATUS_ERROR_PARTIAL_TRANSFER 0x06 +#define I2O_REPLY_STATUS_PROCESS_ABORT_DIRTY 0x08 +#define I2O_REPLY_STATUS_PROCESS_ABORT_NO_DATA_TRANSFER 0x09 +#define I2O_REPLY_STATUS_PROCESS_ABORT_PARTIAL_TRANSFER 0x0A +#define I2O_REPLY_STATUS_TRANSACTION_ERROR 0x0B +#define I2O_REPLY_STATUS_PROGRESS_REPORT 0x80 + +/* Status codes and Error Information for Parameter functions */ + +#define I2O_PARAMS_STATUS_SUCCESS 0x00 +#define I2O_PARAMS_STATUS_BAD_KEY_ABORT 0x01 +#define I2O_PARAMS_STATUS_BAD_KEY_CONTINUE 0x02 +#define I2O_PARAMS_STATUS_BUFFER_FULL 0x03 +#define I2O_PARAMS_STATUS_BUFFER_TOO_SMALL 0x04 +#define I2O_PARAMS_STATUS_FIELD_UNREADABLE 0x05 +#define I2O_PARAMS_STATUS_FIELD_UNWRITEABLE 0x06 +#define I2O_PARAMS_STATUS_INSUFFICIENT_FIELDS 0x07 +#define I2O_PARAMS_STATUS_INVALID_GROUP_ID 0x08 +#define I2O_PARAMS_STATUS_INVALID_OPERATION 0x09 +#define I2O_PARAMS_STATUS_NO_KEY_FIELD 0x0A +#define I2O_PARAMS_STATUS_NO_SUCH_FIELD 0x0B +#define I2O_PARAMS_STATUS_NON_DYNAMIC_GROUP 0x0C +#define I2O_PARAMS_STATUS_OPERATION_ERROR 0x0D +#define I2O_PARAMS_STATUS_SCALAR_ERROR 0x0E +#define I2O_PARAMS_STATUS_TABLE_ERROR 0x0F +#define I2O_PARAMS_STATUS_WRONG_GROUP_TYPE 0x10 + +/* DetailedStatusCode defines for Executive, DDM, Util and Transaction error + * messages: Table 3-2 Detailed Status Codes.*/ + +#define I2O_DSC_SUCCESS 0x0000 +#define I2O_DSC_BAD_KEY 0x0002 +#define I2O_DSC_TCL_ERROR 0x0003 +#define I2O_DSC_REPLY_BUFFER_FULL 0x0004 +#define I2O_DSC_NO_SUCH_PAGE 0x0005 +#define I2O_DSC_INSUFFICIENT_RESOURCE_SOFT 0x0006 +#define I2O_DSC_INSUFFICIENT_RESOURCE_HARD 0x0007 +#define I2O_DSC_CHAIN_BUFFER_TOO_LARGE 0x0009 +#define I2O_DSC_UNSUPPORTED_FUNCTION 0x000A +#define I2O_DSC_DEVICE_LOCKED 0x000B +#define I2O_DSC_DEVICE_RESET 0x000C +#define I2O_DSC_INAPPROPRIATE_FUNCTION 0x000D +#define I2O_DSC_INVALID_INITIATOR_ADDRESS 0x000E +#define I2O_DSC_INVALID_MESSAGE_FLAGS 0x000F +#define I2O_DSC_INVALID_OFFSET 0x0010 +#define I2O_DSC_INVALID_PARAMETER 0x0011 +#define I2O_DSC_INVALID_REQUEST 0x0012 +#define I2O_DSC_INVALID_TARGET_ADDRESS 0x0013 +#define I2O_DSC_MESSAGE_TOO_LARGE 0x0014 +#define I2O_DSC_MESSAGE_TOO_SMALL 0x0015 +#define I2O_DSC_MISSING_PARAMETER 0x0016 +#define I2O_DSC_TIMEOUT 0x0017 +#define I2O_DSC_UNKNOWN_ERROR 0x0018 +#define I2O_DSC_UNKNOWN_FUNCTION 0x0019 +#define I2O_DSC_UNSUPPORTED_VERSION 0x001A +#define I2O_DSC_DEVICE_BUSY 0x001B +#define I2O_DSC_DEVICE_NOT_AVAILABLE 0x001C + +/* DetailedStatusCode defines for Block Storage Operation: Table 6-7 Detailed + Status Codes.*/ + +#define I2O_BSA_DSC_SUCCESS 0x0000 +#define I2O_BSA_DSC_MEDIA_ERROR 0x0001 +#define I2O_BSA_DSC_ACCESS_ERROR 0x0002 +#define I2O_BSA_DSC_DEVICE_FAILURE 0x0003 +#define I2O_BSA_DSC_DEVICE_NOT_READY 0x0004 +#define I2O_BSA_DSC_MEDIA_NOT_PRESENT 0x0005 +#define I2O_BSA_DSC_MEDIA_LOCKED 0x0006 +#define I2O_BSA_DSC_MEDIA_FAILURE 0x0007 +#define I2O_BSA_DSC_PROTOCOL_FAILURE 0x0008 +#define I2O_BSA_DSC_BUS_FAILURE 0x0009 +#define I2O_BSA_DSC_ACCESS_VIOLATION 0x000A +#define I2O_BSA_DSC_WRITE_PROTECTED 0x000B +#define I2O_BSA_DSC_DEVICE_RESET 0x000C +#define I2O_BSA_DSC_VOLUME_CHANGED 0x000D +#define I2O_BSA_DSC_TIMEOUT 0x000E + +/* FailureStatusCodes, Table 3-3 Message Failure Codes */ + +#define I2O_FSC_TRANSPORT_SERVICE_SUSPENDED 0x81 +#define I2O_FSC_TRANSPORT_SERVICE_TERMINATED 0x82 +#define I2O_FSC_TRANSPORT_CONGESTION 0x83 +#define 
I2O_FSC_TRANSPORT_FAILURE 0x84 +#define I2O_FSC_TRANSPORT_STATE_ERROR 0x85 +#define I2O_FSC_TRANSPORT_TIME_OUT 0x86 +#define I2O_FSC_TRANSPORT_ROUTING_FAILURE 0x87 +#define I2O_FSC_TRANSPORT_INVALID_VERSION 0x88 +#define I2O_FSC_TRANSPORT_INVALID_OFFSET 0x89 +#define I2O_FSC_TRANSPORT_INVALID_MSG_FLAGS 0x8A +#define I2O_FSC_TRANSPORT_FRAME_TOO_SMALL 0x8B +#define I2O_FSC_TRANSPORT_FRAME_TOO_LARGE 0x8C +#define I2O_FSC_TRANSPORT_INVALID_TARGET_ID 0x8D +#define I2O_FSC_TRANSPORT_INVALID_INITIATOR_ID 0x8E +#define I2O_FSC_TRANSPORT_INVALID_INITIATOR_CONTEXT 0x8F +#define I2O_FSC_TRANSPORT_UNKNOWN_FAILURE 0xFF + +/* Device Claim Types */ +#define I2O_CLAIM_PRIMARY 0x01000000 +#define I2O_CLAIM_MANAGEMENT 0x02000000 +#define I2O_CLAIM_AUTHORIZED 0x03000000 +#define I2O_CLAIM_SECONDARY 0x04000000 + +/* Message header defines for VersionOffset */ +#define I2OVER15 0x0001 +#define I2OVER20 0x0002 + +/* Default is 1.5 */ +#define I2OVERSION I2OVER15 + +#define SGL_OFFSET_0 I2OVERSION +#define SGL_OFFSET_4 (0x0040 | I2OVERSION) +#define SGL_OFFSET_5 (0x0050 | I2OVERSION) +#define SGL_OFFSET_6 (0x0060 | I2OVERSION) +#define SGL_OFFSET_7 (0x0070 | I2OVERSION) +#define SGL_OFFSET_8 (0x0080 | I2OVERSION) +#define SGL_OFFSET_9 (0x0090 | I2OVERSION) +#define SGL_OFFSET_10 (0x00A0 | I2OVERSION) +#define SGL_OFFSET_11 (0x00B0 | I2OVERSION) +#define SGL_OFFSET_12 (0x00C0 | I2OVERSION) +#define SGL_OFFSET(x) (((x)<<4) | I2OVERSION) + +/* Transaction Reply Lists (TRL) Control Word structure */ +#define TRL_SINGLE_FIXED_LENGTH 0x00 +#define TRL_SINGLE_VARIABLE_LENGTH 0x40 +#define TRL_MULTIPLE_FIXED_LENGTH 0x80 + + /* msg header defines for MsgFlags */ +#define MSG_STATIC 0x0100 +#define MSG_64BIT_CNTXT 0x0200 +#define MSG_MULTI_TRANS 0x1000 +#define MSG_FAIL 0x2000 +#define MSG_FINAL 0x4000 +#define MSG_REPLY 0x8000 + + /* minimum size msg */ +#define THREE_WORD_MSG_SIZE 0x00030000 +#define FOUR_WORD_MSG_SIZE 0x00040000 +#define FIVE_WORD_MSG_SIZE 0x00050000 +#define SIX_WORD_MSG_SIZE 0x00060000 +#define SEVEN_WORD_MSG_SIZE 0x00070000 +#define EIGHT_WORD_MSG_SIZE 0x00080000 +#define NINE_WORD_MSG_SIZE 0x00090000 +#define TEN_WORD_MSG_SIZE 0x000A0000 +#define ELEVEN_WORD_MSG_SIZE 0x000B0000 +#define I2O_MESSAGE_SIZE(x) ((x)<<16) + +/* special TID assignments */ +#define ADAPTER_TID 0 +#define HOST_TID 1 + +/* outbound queue defines */ +#define I2O_MAX_OUTBOUND_MSG_FRAMES 128 +#define I2O_OUTBOUND_MSG_FRAME_SIZE 128 /* in 32-bit words */ + +/* inbound queue definitions */ +#define I2O_MSG_INPOOL_MIN 32 +#define I2O_INBOUND_MSG_FRAME_SIZE 128 /* in 32-bit words */ + +#define I2O_POST_WAIT_OK 0 +#define I2O_POST_WAIT_TIMEOUT -ETIMEDOUT + +#define I2O_CONTEXT_LIST_MIN_LENGTH 15 +#define I2O_CONTEXT_LIST_USED 0x01 +#define I2O_CONTEXT_LIST_DELETED 0x02 + +/* timeouts */ +#define I2O_TIMEOUT_INIT_OUTBOUND_QUEUE 15 +#define I2O_TIMEOUT_MESSAGE_GET 5 +#define I2O_TIMEOUT_RESET 30 +#define I2O_TIMEOUT_STATUS_GET 5 +#define I2O_TIMEOUT_LCT_GET 360 +#define I2O_TIMEOUT_SCSI_SCB_ABORT 240 + +/* retries */ +#define I2O_HRT_GET_TRIES 3 +#define I2O_LCT_GET_TRIES 3 + +/* defines for max_sectors and max_phys_segments */ +#define I2O_MAX_SECTORS 1024 +#define I2O_MAX_SECTORS_LIMITED 128 +#define I2O_MAX_PHYS_SEGMENTS BLK_MAX_SEGMENTS + +/* + * Message structures + */ +struct i2o_message { + union { + struct { + u8 version_offset; + u8 flags; + u16 size; + u32 target_tid:12; + u32 init_tid:12; + u32 function:8; + u32 icntxt; /* initiator context */ + u32 tcntxt; /* transaction context */ + } s; + u32 head[4]; + } u; + /* List 
follows */ + u32 body[0]; +}; + +/* MFA and I2O message used by mempool */ +struct i2o_msg_mfa { + u32 mfa; /* MFA returned by the controller */ + struct i2o_message msg; /* I2O message */ +}; + +/* + * Each I2O device entity has one of these. There is one per device. + */ +struct i2o_device { + i2o_lct_entry lct_data; /* Device LCT information */ + + struct i2o_controller *iop; /* Controlling IOP */ + struct list_head list; /* node in IOP devices list */ + + struct device device; + + struct mutex lock; /* device lock */ +}; + +/* + * Event structure provided to the event handling function + */ +struct i2o_event { + struct work_struct work; + struct i2o_device *i2o_dev; /* I2O device pointer from which the + event reply was initiated */ + u16 size; /* Size of data in 32-bit words */ + u32 tcntxt; /* Transaction context used at + registration */ + u32 event_indicator; /* Event indicator from reply */ + u32 data[0]; /* Event data from reply */ +}; + +/* + * I2O classes which could be handled by the OSM + */ +struct i2o_class_id { + u16 class_id:12; +}; + +/* + * I2O driver structure for OSMs + */ +struct i2o_driver { + char *name; /* OSM name */ + int context; /* Low 8 bits of the transaction info */ + struct i2o_class_id *classes; /* I2O classes that this OSM handles */ + + /* Message reply handler */ + int (*reply) (struct i2o_controller *, u32, struct i2o_message *); + + /* Event handler */ + work_func_t event; + + struct workqueue_struct *event_queue; /* Event queue */ + + struct device_driver driver; + + /* notification of changes */ + void (*notify_controller_add) (struct i2o_controller *); + void (*notify_controller_remove) (struct i2o_controller *); + void (*notify_device_add) (struct i2o_device *); + void (*notify_device_remove) (struct i2o_device *); + + struct semaphore lock; +}; + +/* + * Contains DMA mapped address information + */ +struct i2o_dma { + void *virt; + dma_addr_t phys; + size_t len; +}; + +/* + * Contains slab cache and mempool information + */ +struct i2o_pool { + char *name; + struct kmem_cache *slab; + mempool_t *mempool; +}; + +/* + * Contains IO mapped address information + */ +struct i2o_io { + void __iomem *virt; + unsigned long phys; + unsigned long len; +}; + +/* + * Context queue entry, used for 32-bit context on 64-bit systems + */ +struct i2o_context_list_element { + struct list_head list; + u32 context; + void *ptr; + unsigned long timestamp; +}; + +/* + * Each I2O controller has one of these objects + */ +struct i2o_controller { + char name[16]; + int unit; + int type; + + struct pci_dev *pdev; /* PCI device */ + + unsigned int promise:1; /* Promise controller */ + unsigned int adaptec:1; /* DPT / Adaptec controller */ + unsigned int raptor:1; /* split bar */ + unsigned int no_quiesce:1; /* don't quiesce before reset */ + unsigned int short_req:1; /* use small block sizes */ + unsigned int limit_sectors:1; /* limit number of sectors / request */ + unsigned int pae_support:1; /* controller has 64-bit SGL support */ + + struct list_head devices; /* list of I2O devices */ + struct list_head list; /* Controller list */ + + void __iomem *in_port; /* Inbound port address */ + void __iomem *out_port; /* Outbound port address */ + void __iomem *irq_status; /* Interrupt status register address */ + void __iomem *irq_mask; /* Interrupt mask register address */ + + struct i2o_dma status; /* IOP status block */ + + struct i2o_dma hrt; /* HW Resource Table */ + i2o_lct *lct; /* Logical Config Table */ + struct i2o_dma dlct; /* Temp LCT */ + struct mutex lct_lock; /* 
Lock for LCT updates */ + struct i2o_dma status_block; /* IOP status block */ + + struct i2o_io base; /* controller messaging unit */ + struct i2o_io in_queue; /* inbound message queue Host->IOP */ + struct i2o_dma out_queue; /* outbound message queue IOP->Host */ + + struct i2o_pool in_msg; /* mempool for inbound messages */ + + unsigned int battery:1; /* Has a battery backup */ + unsigned int io_alloc:1; /* An I/O resource was allocated */ + unsigned int mem_alloc:1; /* A memory resource was allocated */ + + struct resource io_resource; /* I/O resource allocated to the IOP */ + struct resource mem_resource; /* Mem resource allocated to the IOP */ + + struct device device; + struct i2o_device *exec; /* Executive */ +#if BITS_PER_LONG == 64 + spinlock_t context_list_lock; /* lock for context_list */ + atomic_t context_list_counter; /* needed for unique contexts */ + struct list_head context_list; /* list of context id's + and pointers */ +#endif + spinlock_t lock; /* lock for controller + configuration */ + void *driver_data[I2O_MAX_DRIVERS]; /* storage for drivers */ +}; + +/* + * I2O System table entry + * + * The system table contains information about all the IOPs in the + * system. It is sent to all IOPs so that they can create peer2peer + * connections between them. + */ +struct i2o_sys_tbl_entry { + u16 org_id; + u16 reserved1; + u32 iop_id:12; + u32 reserved2:20; + u16 seg_num:12; + u16 i2o_version:4; + u8 iop_state; + u8 msg_type; + u16 frame_size; + u16 reserved3; + u32 last_changed; + u32 iop_capabilities; + u32 inbound_low; + u32 inbound_high; +}; + +struct i2o_sys_tbl { + u8 num_entries; + u8 version; + u16 reserved1; + u32 change_ind; + u32 reserved2; + u32 reserved3; + struct i2o_sys_tbl_entry iops[0]; +}; + +extern struct list_head i2o_controllers; + +/* Message functions */ +extern struct i2o_message *i2o_msg_get_wait(struct i2o_controller *, int); +extern int i2o_msg_post_wait_mem(struct i2o_controller *, struct i2o_message *, + unsigned long, struct i2o_dma *); + +/* IOP functions */ +extern int i2o_status_get(struct i2o_controller *); + +extern int i2o_event_register(struct i2o_device *, struct i2o_driver *, int, + u32); +extern struct i2o_device *i2o_iop_find_device(struct i2o_controller *, u16); +extern struct i2o_controller *i2o_find_iop(int); + +/* Functions needed for handling 64-bit pointers in 32-bit context */ +#if BITS_PER_LONG == 64 +extern u32 i2o_cntxt_list_add(struct i2o_controller *, void *); +extern void *i2o_cntxt_list_get(struct i2o_controller *, u32); +extern u32 i2o_cntxt_list_remove(struct i2o_controller *, void *); +extern u32 i2o_cntxt_list_get_ptr(struct i2o_controller *, void *); + +static inline u32 i2o_ptr_low(void *ptr) +{ + return (u32) (u64) ptr; +}; + +static inline u32 i2o_ptr_high(void *ptr) +{ + return (u32) ((u64) ptr >> 32); +}; + +static inline u32 i2o_dma_low(dma_addr_t dma_addr) +{ + return (u32) (u64) dma_addr; +}; + +static inline u32 i2o_dma_high(dma_addr_t dma_addr) +{ + return (u32) ((u64) dma_addr >> 32); +}; +#else +static inline u32 i2o_cntxt_list_add(struct i2o_controller *c, void *ptr) +{ + return (u32) ptr; +}; + +static inline void *i2o_cntxt_list_get(struct i2o_controller *c, u32 context) +{ + return (void *)context; +}; + +static inline u32 i2o_cntxt_list_remove(struct i2o_controller *c, void *ptr) +{ + return (u32) ptr; +}; + +static inline u32 i2o_cntxt_list_get_ptr(struct i2o_controller *c, void *ptr) +{ + return (u32) ptr; +}; + +static inline u32 i2o_ptr_low(void *ptr) +{ + return (u32) ptr; +}; + +static inline 
u32 i2o_ptr_high(void *ptr)
+{
+ return 0;
+};
+
+static inline u32 i2o_dma_low(dma_addr_t dma_addr)
+{
+ return (u32) dma_addr;
+};
+
+static inline u32 i2o_dma_high(dma_addr_t dma_addr)
+{
+ return 0;
+};
+#endif
+
+extern u16 i2o_sg_tablesize(struct i2o_controller *c, u16 body_size);
+extern dma_addr_t i2o_dma_map_single(struct i2o_controller *c, void *ptr,
+ size_t size,
+ enum dma_data_direction direction,
+ u32 ** sg_ptr);
+extern int i2o_dma_map_sg(struct i2o_controller *c,
+ struct scatterlist *sg, int sg_count,
+ enum dma_data_direction direction,
+ u32 ** sg_ptr);
+extern int i2o_dma_alloc(struct device *dev, struct i2o_dma *addr, size_t len);
+extern void i2o_dma_free(struct device *dev, struct i2o_dma *addr);
+extern int i2o_dma_realloc(struct device *dev, struct i2o_dma *addr,
+ size_t len);
+extern int i2o_pool_alloc(struct i2o_pool *pool, const char *name,
+ size_t size, int min_nr);
+extern void i2o_pool_free(struct i2o_pool *pool);
+/* I2O driver (OSM) functions */
+extern int i2o_driver_register(struct i2o_driver *);
+extern void i2o_driver_unregister(struct i2o_driver *);
+
+/**
+ * i2o_driver_notify_controller_add - Send notification of added controller
+ * @drv: I2O driver
+ * @c: I2O controller
+ *
+ * Send notification of added controller to a single registered driver.
+ */
+static inline void i2o_driver_notify_controller_add(struct i2o_driver *drv,
+ struct i2o_controller *c)
+{
+ if (drv->notify_controller_add)
+ drv->notify_controller_add(c);
+};
+
+/**
+ * i2o_driver_notify_controller_remove - Send notification of removed controller
+ * @drv: I2O driver
+ * @c: I2O controller
+ *
+ * Send notification of removed controller to a single registered driver.
+ */
+static inline void i2o_driver_notify_controller_remove(struct i2o_driver *drv,
+ struct i2o_controller *c)
+{
+ if (drv->notify_controller_remove)
+ drv->notify_controller_remove(c);
+};
+
+/**
+ * i2o_driver_notify_device_add - Send notification of added device
+ * @drv: I2O driver
+ * @i2o_dev: the added i2o_device
+ *
+ * Send notification of added device to a single registered driver.
+ */
+static inline void i2o_driver_notify_device_add(struct i2o_driver *drv,
+ struct i2o_device *i2o_dev)
+{
+ if (drv->notify_device_add)
+ drv->notify_device_add(i2o_dev);
+};
+
+/**
+ * i2o_driver_notify_device_remove - Send notification of removed device
+ * @drv: I2O driver
+ * @i2o_dev: the removed i2o_device
+ *
+ * Send notification of removed device to a single registered driver.
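+ *
+ * As an illustration (an editorial sketch, not part of the original
+ * source; all "example_*" names are hypothetical), an OSM wires these
+ * notifiers up in its struct i2o_driver:
+ *
+ *    static struct i2o_driver example_osm_driver = {
+ *            .name                 = "example-osm",
+ *            .notify_device_add    = example_osm_device_add,
+ *            .notify_device_remove = example_osm_device_remove,
+ *    };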
+ */
+static inline void i2o_driver_notify_device_remove(struct i2o_driver *drv,
+ struct i2o_device *i2o_dev)
+{
+ if (drv->notify_device_remove)
+ drv->notify_device_remove(i2o_dev);
+};
+
+extern void i2o_driver_notify_controller_add_all(struct i2o_controller *);
+extern void i2o_driver_notify_controller_remove_all(struct i2o_controller *);
+extern void i2o_driver_notify_device_add_all(struct i2o_device *);
+extern void i2o_driver_notify_device_remove_all(struct i2o_device *);
+
+/* I2O device functions */
+extern int i2o_device_claim(struct i2o_device *);
+extern int i2o_device_claim_release(struct i2o_device *);
+
+/* Exec OSM functions */
+extern int i2o_exec_lct_get(struct i2o_controller *);
+
+/* device / driver / kobject conversion functions */
+#define to_i2o_driver(drv) container_of(drv, struct i2o_driver, driver)
+#define to_i2o_device(dev) container_of(dev, struct i2o_device, device)
+#define to_i2o_controller(dev) container_of(dev, struct i2o_controller, device)
+
+/**
+ * i2o_msg_out_to_virt - Turn an I2O message into a virtual address
+ * @c: controller
+ * @m: message engine value
+ *
+ * Turn a receive message from an I2O controller bus address into
+ * a Linux virtual address. The shared page frame is a linear block
+ * so we simply have to shift the offset. This function does not
+ * work for sender side messages as they are ioremap objects
+ * provided by the I2O controller.
+ */
+static inline struct i2o_message *i2o_msg_out_to_virt(struct i2o_controller *c,
+ u32 m)
+{
+ BUG_ON(m < c->out_queue.phys
+ || m >= c->out_queue.phys + c->out_queue.len);
+
+ return c->out_queue.virt + (m - c->out_queue.phys);
+};
+
+/**
+ * i2o_msg_in_to_virt - Turn an I2O message into a virtual address
+ * @c: controller
+ * @m: message engine value
+ *
+ * Turn a send message from an I2O controller bus address into
+ * a Linux virtual address. The shared page frame is a linear block
+ * so we simply have to shift the offset. This function does not
+ * work for receive side messages as they are kmalloc objects
+ * in a different pool.
+ */
+static inline struct i2o_message __iomem *i2o_msg_in_to_virt(struct
+ i2o_controller *c,
+ u32 m)
+{
+ return c->in_queue.virt + m;
+};
+
+/**
+ * i2o_msg_get - obtain an I2O message from the IOP
+ * @c: I2O controller
+ *
+ * This function tries to get a message frame. If no message frame is
+ * available, it does not wait until one becomes available (see
+ * i2o_msg_get_wait() for the waiting variant).
+ * The returned pointer to the message frame is not in I/O memory, it is
+ * allocated from a mempool. But because an MFA is allocated from the
+ * controller too, it is guaranteed that i2o_msg_post() will never fail.
+ *
+ * On success a pointer to the message frame is returned. If the message
+ * queue is empty -EBUSY is returned and if no memory is available -ENOMEM
+ * is returned.
+ */
+static inline struct i2o_message *i2o_msg_get(struct i2o_controller *c)
+{
+ struct i2o_msg_mfa *mmsg = mempool_alloc(c->in_msg.mempool, GFP_ATOMIC);
+ if (!mmsg)
+ return ERR_PTR(-ENOMEM);
+
+ mmsg->mfa = readl(c->in_port);
+ if (unlikely(mmsg->mfa >= c->in_queue.len)) {
+ u32 mfa = mmsg->mfa;
+
+ mempool_free(mmsg, c->in_msg.mempool);
+
+ if (mfa == I2O_QUEUE_EMPTY)
+ return ERR_PTR(-EBUSY);
+ return ERR_PTR(-EFAULT);
+ }
+
+ return &mmsg->msg;
+};
+
+/**
+ * i2o_msg_post - Post I2O message to I2O controller
+ * @c: I2O controller to which the message should be sent
+ * @msg: message returned by i2o_msg_get()
+ *
+ * Post the message to the I2O controller and return immediately.
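+ *
+ * A minimal usage sketch (illustrative only, not from the original
+ * source; "tid" stands for any valid target ID):
+ *
+ *    msg = i2o_msg_get(c);
+ *    if (IS_ERR(msg))
+ *            return PTR_ERR(msg);
+ *    msg->u.head[0] = cpu_to_le32(THREE_WORD_MSG_SIZE | SGL_OFFSET_0);
+ *    msg->u.head[1] = cpu_to_le32(I2O_CMD_UTIL_NOP << 24 |
+ *                                 HOST_TID << 12 | tid);
+ *    i2o_msg_post(c, msg);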
+ */
+static inline void i2o_msg_post(struct i2o_controller *c,
+ struct i2o_message *msg)
+{
+ struct i2o_msg_mfa *mmsg;
+
+ mmsg = container_of(msg, struct i2o_msg_mfa, msg);
+ memcpy_toio(i2o_msg_in_to_virt(c, mmsg->mfa), msg,
+ (le32_to_cpu(msg->u.head[0]) >> 16) << 2);
+ writel(mmsg->mfa, c->in_port);
+ mempool_free(mmsg, c->in_msg.mempool);
+};
+
+/**
+ * i2o_msg_post_wait - Post a message and wait until the reply arrives
+ * @c: controller
+ * @msg: message to post
+ * @timeout: time in seconds to wait
+ *
+ * This API allows an OSM to post a message and then be told whether or
+ * not the system received a successful reply. If the message times out
+ * then the value '-ETIMEDOUT' is returned.
+ *
+ * Returns 0 on success or negative error code on failure.
+ */
+static inline int i2o_msg_post_wait(struct i2o_controller *c,
+ struct i2o_message *msg,
+ unsigned long timeout)
+{
+ return i2o_msg_post_wait_mem(c, msg, timeout, NULL);
+};
+
+/**
+ * i2o_msg_nop_mfa - Returns a fetched MFA back to the controller
+ * @c: I2O controller from which the MFA was fetched
+ * @mfa: MFA which should be returned
+ *
+ * This function must be used for preserved messages, because i2o_msg_nop()
+ * also returns the allocated memory back to the msg_pool mempool.
+ */
+static inline void i2o_msg_nop_mfa(struct i2o_controller *c, u32 mfa)
+{
+ struct i2o_message __iomem *msg;
+ u32 nop[3] = {
+ THREE_WORD_MSG_SIZE | SGL_OFFSET_0,
+ I2O_CMD_UTIL_NOP << 24 | HOST_TID << 12 | ADAPTER_TID,
+ 0x00000000
+ };
+
+ msg = i2o_msg_in_to_virt(c, mfa);
+ memcpy_toio(msg, nop, sizeof(nop));
+ writel(mfa, c->in_port);
+};
+
+/**
+ * i2o_msg_nop - Returns a message which is not used
+ * @c: I2O controller from which the message was created
+ * @msg: message which should be returned
+ *
+ * If you fetch a message via i2o_msg_get() and can't use it, you must
+ * return the message with this function. Otherwise the MFA is lost as well
+ * as the allocated memory from the mempool.
+ */
+static inline void i2o_msg_nop(struct i2o_controller *c,
+ struct i2o_message *msg)
+{
+ struct i2o_msg_mfa *mmsg;
+ mmsg = container_of(msg, struct i2o_msg_mfa, msg);
+
+ i2o_msg_nop_mfa(c, mmsg->mfa);
+ mempool_free(mmsg, c->in_msg.mempool);
+};
+
+/**
+ * i2o_flush_reply - Flush reply from I2O controller
+ * @c: I2O controller
+ * @m: the message identifier
+ *
+ * The I2O controller must be informed that the reply message is not needed
+ * anymore. If you forget to flush the reply, the message frame can't be
+ * used by the controller anymore and is therefore lost.
+ */
+static inline void i2o_flush_reply(struct i2o_controller *c, u32 m)
+{
+ writel(m, c->out_port);
+};
+
+/*
+ * Endian handling wrapped into the macro - keeps the core code
+ * cleaner.
+ */
+
+#define i2o_raw_writel(val, mem) __raw_writel(cpu_to_le32(val), mem)
+
+extern int i2o_parm_field_get(struct i2o_device *, int, int, void *, int);
+extern int i2o_parm_table_get(struct i2o_device *, int, int, int, void *, int,
+ void *, int);
+
+/* debugging and troubleshooting/diagnostic helpers. */
+#define osm_printk(level, format, arg...) \
+ printk(level "%s: " format, OSM_NAME , ## arg)
+
+#ifdef DEBUG
+#define osm_debug(format, arg...) \
+ osm_printk(KERN_DEBUG, format , ## arg)
+#else
+#define osm_debug(format, arg...) \
+ do { } while (0)
+#endif
+
+#define osm_err(format, arg...) \
+ osm_printk(KERN_ERR, format , ## arg)
+#define osm_info(format, arg...) \
+ osm_printk(KERN_INFO, format , ## arg)
+#define osm_warn(format, arg...) 
\
+ osm_printk(KERN_WARNING, format , ## arg)
+
+/* debugging functions */
+extern void i2o_report_status(const char *, const char *, struct i2o_message *);
+extern void i2o_dump_message(struct i2o_message *);
+extern void i2o_dump_hrt(struct i2o_controller *c);
+extern void i2o_debug_state(struct i2o_controller *c);
+
+#endif /* _I2O_H */
diff --git a/drivers/staging/i2o/i2o_block.c b/drivers/staging/i2o/i2o_block.c
new file mode 100644
index 000000000000..0a13c64ce000
--- /dev/null
+++ b/drivers/staging/i2o/i2o_block.c
@@ -0,0 +1,1228 @@
+/*
+ * Block OSM
+ *
+ * Copyright (C) 1999-2002 Red Hat Software
+ *
+ * Written by Alan Cox, Building Number Three Ltd
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * For the purpose of avoiding doubt the preferred form of the work
+ * for making modifications shall be a standards compliant form such
+ * gzipped tar and not one requiring a proprietary or patent encumbered
+ * tool to unpack.
+ *
+ * Fixes/additions:
+ * Steve Ralston:
+ * Multiple device handling error fixes,
+ * Added a queue depth.
+ * Alan Cox:
+ * FC920 has an rmw bug. Dont or in the end marker.
+ * Removed queue walk, fixed for 64bitness.
+ * Rewrote much of the code over time
+ * Added indirect block lists
+ * Handle 64K limits on many controllers
+ * Don't use indirects on the Promise (breaks)
+ * Heavily chop down the queue depths
+ * Deepak Saxena:
+ * Independent queues per IOP
+ * Support for dynamic device creation/deletion
+ * Code cleanup
+ * Support for larger I/Os through merge* functions
+ * (taken from DAC960 driver)
+ * Boji T Kannanthanam:
+ * Set the I2O Block devices to be detected in increasing
+ * order of TIDs during boot.
+ * Search and set the I2O block device that we boot off
+ * from as the first device to be claimed (as /dev/i2o/hda)
+ * Properly attach/detach I2O gendisk structure from the
+ * system gendisk list. The I2O block devices now appear in
+ * /proc/partitions.
+ * Markus Lidel :
+ * Minor bugfixes for 2.6.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include "i2o.h"
+#include <linux/mutex.h>
+
+#include <linux/mempool.h>
+
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+#include <linux/hdreg.h>
+
+#include <scsi/scsi.h>
+
+#include "i2o_block.h"
+
+#define OSM_NAME "block-osm"
+#define OSM_VERSION "1.325"
+#define OSM_DESCRIPTION "I2O Block Device OSM"
+
+static DEFINE_MUTEX(i2o_block_mutex);
+static struct i2o_driver i2o_block_driver;
+
+/* global Block OSM request mempool */
+static struct i2o_block_mempool i2o_blk_req_pool;
+
+/* Block OSM class handling definition */
+static struct i2o_class_id i2o_block_class_id[] = {
+ {I2O_CLASS_RANDOM_BLOCK_STORAGE},
+ {I2O_CLASS_END}
+};
+
+/**
+ * i2o_block_device_free - free the memory of the I2O Block device
+ * @dev: I2O Block device, which should be cleaned up
+ *
+ * Frees the request queue, gendisk and the i2o_block_device structure.
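+ *
+ * Note that the request queue is cleaned up before the last gendisk
+ * reference is dropped, which is the order used below.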
+ */
+static void i2o_block_device_free(struct i2o_block_device *dev)
+{
+ blk_cleanup_queue(dev->gd->queue);
+
+ put_disk(dev->gd);
+
+ kfree(dev);
+};
+
+/**
+ * i2o_block_remove - remove the I2O Block device from the system
+ * @dev: I2O Block device which should be removed
+ *
+ * Remove gendisk from system and free all allocated memory.
+ *
+ * Always returns 0.
+ */
+static int i2o_block_remove(struct device *dev)
+{
+ struct i2o_device *i2o_dev = to_i2o_device(dev);
+ struct i2o_block_device *i2o_blk_dev = dev_get_drvdata(dev);
+
+ osm_info("device removed (TID: %03x): %s\n", i2o_dev->lct_data.tid,
+ i2o_blk_dev->gd->disk_name);
+
+ i2o_event_register(i2o_dev, &i2o_block_driver, 0, 0);
+
+ del_gendisk(i2o_blk_dev->gd);
+
+ dev_set_drvdata(dev, NULL);
+
+ i2o_device_claim_release(i2o_dev);
+
+ i2o_block_device_free(i2o_blk_dev);
+
+ return 0;
+};
+
+/**
+ * i2o_block_device_flush - Flush all dirty data of I2O device dev
+ * @dev: I2O device which should be flushed
+ *
+ * Flushes all dirty data on device dev.
+ *
+ * Returns 0 on success or negative error code on failure.
+ */
+static int i2o_block_device_flush(struct i2o_device *dev)
+{
+ struct i2o_message *msg;
+
+ msg = i2o_msg_get_wait(dev->iop, I2O_TIMEOUT_MESSAGE_GET);
+ if (IS_ERR(msg))
+ return PTR_ERR(msg);
+
+ msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
+ msg->u.head[1] =
+ cpu_to_le32(I2O_CMD_BLOCK_CFLUSH << 24 | HOST_TID << 12 | dev->
+ lct_data.tid);
+ msg->body[0] = cpu_to_le32(60 << 16);
+ osm_debug("Flushing...\n");
+
+ return i2o_msg_post_wait(dev->iop, msg, 60);
+};
+
+/**
+ * i2o_block_device_mount - Mount (load) the media of device dev
+ * @dev: I2O device which should receive the mount request
+ * @media_id: Media Identifier
+ *
+ * Load the media into the drive. The identifier should be set to -1,
+ * because the spec does not support any other value.
+ *
+ * Returns 0 on success or negative error code on failure.
+ */
+static int i2o_block_device_mount(struct i2o_device *dev, u32 media_id)
+{
+ struct i2o_message *msg;
+
+ msg = i2o_msg_get_wait(dev->iop, I2O_TIMEOUT_MESSAGE_GET);
+ if (IS_ERR(msg))
+ return PTR_ERR(msg);
+
+ msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
+ msg->u.head[1] =
+ cpu_to_le32(I2O_CMD_BLOCK_MMOUNT << 24 | HOST_TID << 12 | dev->
+ lct_data.tid);
+ msg->body[0] = cpu_to_le32(-1);
+ msg->body[1] = cpu_to_le32(0x00000000);
+ osm_debug("Mounting...\n");
+
+ return i2o_msg_post_wait(dev->iop, msg, 2);
+};
+
+/**
+ * i2o_block_device_lock - Locks the media of device dev
+ * @dev: I2O device which should receive the lock request
+ * @media_id: Media Identifier
+ *
+ * Lock media of device dev to prevent removal. The media identifier
+ * should be set to -1, because the spec does not support any other value.
+ *
+ * Returns 0 on success or negative error code on failure.
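+ *
+ * As with the mount request above, the message is posted with a short
+ * two second timeout, since media operations are expected to complete
+ * quickly.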
+ */
+static int i2o_block_device_lock(struct i2o_device *dev, u32 media_id)
+{
+ struct i2o_message *msg;
+
+ msg = i2o_msg_get_wait(dev->iop, I2O_TIMEOUT_MESSAGE_GET);
+ if (IS_ERR(msg))
+ return PTR_ERR(msg);
+
+ msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
+ msg->u.head[1] =
+ cpu_to_le32(I2O_CMD_BLOCK_MLOCK << 24 | HOST_TID << 12 | dev->
+ lct_data.tid);
+ msg->body[0] = cpu_to_le32(-1);
+ osm_debug("Locking...\n");
+
+ return i2o_msg_post_wait(dev->iop, msg, 2);
+};
+
+/**
+ * i2o_block_device_unlock - Unlocks the media of device dev
+ * @dev: I2O device which should receive the unlock request
+ * @media_id: Media Identifier
+ *
+ * Unlocks the media in device dev. The media identifier should be set to
+ * -1, because the spec does not support any other value.
+ *
+ * Returns 0 on success or negative error code on failure.
+ */
+static int i2o_block_device_unlock(struct i2o_device *dev, u32 media_id)
+{
+ struct i2o_message *msg;
+
+ msg = i2o_msg_get_wait(dev->iop, I2O_TIMEOUT_MESSAGE_GET);
+ if (IS_ERR(msg))
+ return PTR_ERR(msg);
+
+ msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
+ msg->u.head[1] =
+ cpu_to_le32(I2O_CMD_BLOCK_MUNLOCK << 24 | HOST_TID << 12 | dev->
+ lct_data.tid);
+ msg->body[0] = cpu_to_le32(media_id);
+ osm_debug("Unlocking...\n");
+
+ return i2o_msg_post_wait(dev->iop, msg, 2);
+};
+
+/**
+ * i2o_block_device_power - Power management for device dev
+ * @dev: I2O device which should receive the power management request
+ * @op: Operation to send
+ *
+ * Send a power management request to the device dev.
+ *
+ * Returns 0 on success or negative error code on failure.
+ */
+static int i2o_block_device_power(struct i2o_block_device *dev, u8 op)
+{
+ struct i2o_device *i2o_dev = dev->i2o_dev;
+ struct i2o_controller *c = i2o_dev->iop;
+ struct i2o_message *msg;
+ int rc;
+
+ msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+ if (IS_ERR(msg))
+ return PTR_ERR(msg);
+
+ msg->u.head[0] = cpu_to_le32(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0);
+ msg->u.head[1] =
+ cpu_to_le32(I2O_CMD_BLOCK_POWER << 24 | HOST_TID << 12 | i2o_dev->
+ lct_data.tid);
+ msg->body[0] = cpu_to_le32(op << 24);
+ osm_debug("Power...\n");
+
+ rc = i2o_msg_post_wait(c, msg, 60);
+ if (!rc)
+ dev->power = op;
+
+ return rc;
+};
+
+/**
+ * i2o_block_request_alloc - Allocate an I2O block request struct
+ *
+ * Allocates an I2O block request struct and initializes the list.
+ *
+ * Returns an i2o_block_request pointer on success or negative error code
+ * on failure.
+ */
+static inline struct i2o_block_request *i2o_block_request_alloc(void)
+{
+ struct i2o_block_request *ireq;
+
+ ireq = mempool_alloc(i2o_blk_req_pool.pool, GFP_ATOMIC);
+ if (!ireq)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&ireq->queue);
+ sg_init_table(ireq->sg_table, I2O_MAX_PHYS_SEGMENTS);
+
+ return ireq;
+};
+
+/**
+ * i2o_block_request_free - Frees an I2O block request
+ * @ireq: I2O block request which should be freed
+ *
+ * Frees the allocated memory (give it back to the request mempool).
+ */
+static inline void i2o_block_request_free(struct i2o_block_request *ireq)
+{
+ mempool_free(ireq, i2o_blk_req_pool.pool);
+};
+
+/**
+ * i2o_block_sglist_alloc - Allocate the SG list and map it
+ * @c: I2O controller to which the request belongs
+ * @ireq: I2O block request
+ * @mptr: message body pointer
+ *
+ * Builds the SG list and maps it so it is accessible by the controller.
+ *
+ * Returns 0 on failure or 1 on success.
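+ *
+ * The DMA direction is derived from the data direction of the block
+ * request, as done in the helper below.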
+ */
+static inline int i2o_block_sglist_alloc(struct i2o_controller *c,
+ struct i2o_block_request *ireq,
+ u32 ** mptr)
+{
+ int nents;
+ enum dma_data_direction direction;
+
+ ireq->dev = &c->pdev->dev;
+ nents = blk_rq_map_sg(ireq->req->q, ireq->req, ireq->sg_table);
+
+ if (rq_data_dir(ireq->req) == READ)
+ direction = PCI_DMA_FROMDEVICE;
+ else
+ direction = PCI_DMA_TODEVICE;
+
+ ireq->sg_nents = nents;
+
+ return i2o_dma_map_sg(c, ireq->sg_table, nents, direction, mptr);
+};
+
+/**
+ * i2o_block_sglist_free - Frees the SG list
+ * @ireq: I2O block request from which the SG should be freed
+ *
+ * Frees the SG list from the I2O block request.
+ */
+static inline void i2o_block_sglist_free(struct i2o_block_request *ireq)
+{
+ enum dma_data_direction direction;
+
+ if (rq_data_dir(ireq->req) == READ)
+ direction = PCI_DMA_FROMDEVICE;
+ else
+ direction = PCI_DMA_TODEVICE;
+
+ dma_unmap_sg(ireq->dev, ireq->sg_table, ireq->sg_nents, direction);
+};
+
+/**
+ * i2o_block_prep_req_fn - Allocates I2O block device specific struct
+ * @q: request queue for the request
+ * @req: the request to prepare
+ *
+ * Allocate the necessary i2o_block_request struct and connect it to
+ * the request. This is needed so that we do not lose the SG list later on.
+ *
+ * Returns BLKPREP_OK on success, BLKPREP_DEFER if no memory is available,
+ * or BLKPREP_KILL if the device has already been removed.
+ */
+static int i2o_block_prep_req_fn(struct request_queue *q, struct request *req)
+{
+ struct i2o_block_device *i2o_blk_dev = q->queuedata;
+ struct i2o_block_request *ireq;
+
+ if (unlikely(!i2o_blk_dev)) {
+ osm_err("block device already removed\n");
+ return BLKPREP_KILL;
+ }
+
+ /* connect the i2o_block_request to the request */
+ if (!req->special) {
+ ireq = i2o_block_request_alloc();
+ if (IS_ERR(ireq)) {
+ osm_debug("unable to allocate i2o_block_request!\n");
+ return BLKPREP_DEFER;
+ }
+
+ ireq->i2o_blk_dev = i2o_blk_dev;
+ req->special = ireq;
+ ireq->req = req;
+ }
+ /* do not come back here */
+ req->cmd_flags |= REQ_DONTPREP;
+
+ return BLKPREP_OK;
+};
+
+/**
+ * i2o_block_delayed_request_fn - delayed request queue function
+ * @work: the delayed request with the queue to start
+ *
+ * If the request queue is stopped for a disk, and there is no open
+ * request, a new event is created, which calls this function to start
+ * the queue after I2O_BLOCK_RETRY_TIME. Otherwise the queue will never
+ * be started again.
+ */
+static void i2o_block_delayed_request_fn(struct work_struct *work)
+{
+ struct i2o_block_delayed_request *dreq =
+ container_of(work, struct i2o_block_delayed_request,
+ work.work);
+ struct request_queue *q = dreq->queue;
+ unsigned long flags;
+
+ spin_lock_irqsave(q->queue_lock, flags);
+ blk_start_queue(q);
+ spin_unlock_irqrestore(q->queue_lock, flags);
+ kfree(dreq);
+};
+
+/**
+ * i2o_block_end_request - Post-processing of completed commands
+ * @req: request which should be completed
+ * @error: 0 for success, < 0 for error
+ * @nr_bytes: number of bytes to complete
+ *
+ * Mark the request as complete. The lock must not be held when entering.
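+ * On completion the queue is restarted, so requests that were deferred
+ * while this one was outstanding get another chance to run.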
+ *
+ */
+static void i2o_block_end_request(struct request *req, int error,
+ int nr_bytes)
+{
+ struct i2o_block_request *ireq = req->special;
+ struct i2o_block_device *dev = ireq->i2o_blk_dev;
+ struct request_queue *q = req->q;
+ unsigned long flags;
+
+ if (blk_end_request(req, error, nr_bytes))
+ if (error)
+ blk_end_request_all(req, -EIO);
+
+ spin_lock_irqsave(q->queue_lock, flags);
+
+ if (likely(dev)) {
+ dev->open_queue_depth--;
+ list_del(&ireq->queue);
+ }
+
+ blk_start_queue(q);
+
+ spin_unlock_irqrestore(q->queue_lock, flags);
+
+ i2o_block_sglist_free(ireq);
+ i2o_block_request_free(ireq);
+};
+
+/**
+ * i2o_block_reply - Block OSM reply handler.
+ * @c: I2O controller from which the message arrives
+ * @m: message id of reply
+ * @msg: the actual I2O message reply
+ *
+ * This function gets all the message replies.
+ *
+ */
+static int i2o_block_reply(struct i2o_controller *c, u32 m,
+ struct i2o_message *msg)
+{
+ struct request *req;
+ int error = 0;
+
+ req = i2o_cntxt_list_get(c, le32_to_cpu(msg->u.s.tcntxt));
+ if (unlikely(!req)) {
+ osm_err("NULL reply received!\n");
+ return -1;
+ }
+
+ /*
+ * Let's see what is cooking. We stuffed the
+ * request in the context.
+ */
+
+ if ((le32_to_cpu(msg->body[0]) >> 24) != 0) {
+ u32 status = le32_to_cpu(msg->body[0]);
+ /*
+ * Device not ready means two things. One is that the
+ * thing went offline (but not a media removal)
+ *
+ * The second is that you have a SuperTrak 100 and the
+ * firmware got constipated. Unlike standard i2o card
+ * setups the supertrak returns an error rather than
+ * blocking for the timeout in these cases.
+ *
+ * Don't stick a supertrak100 into cache aggressive modes
+ */
+
+ osm_err("TID %03x error status: 0x%02x, detailed status: "
+ "0x%04x\n", (le32_to_cpu(msg->u.head[1]) >> 12 & 0xfff),
+ status >> 24, status & 0xffff);
+
+ req->errors++;
+
+ error = -EIO;
+ }
+
+ i2o_block_end_request(req, error, le32_to_cpu(msg->body[1]));
+
+ return 1;
+};
+
+static void i2o_block_event(struct work_struct *work)
+{
+ struct i2o_event *evt = container_of(work, struct i2o_event, work);
+ osm_debug("event received\n");
+ kfree(evt);
+};
+
+/*
+ * SCSI-CAM for ioctl geometry mapping
+ * Duplicated with SCSI - this should be moved into somewhere common
+ * perhaps genhd ?
+ *
+ * LBA -> CHS mapping table taken from:
+ *
+ * "Incorporating the I2O Architecture into BIOS for Intel Architecture
+ * Platforms"
+ *
+ * This is an I2O document that is only available to I2O members,
+ * not developers.
+ *
+ * From my understanding, this is how all the I2O cards do this
+ *
+ * Disk Size       | Sectors | Heads | Cylinders
+ * ----------------+---------+-------+--------------------
+ *   1 < X <= 528M |   63    |  16   | X/(63 * 16 * 512)
+ * 528M < X <= 1G  |   63    |  32   | X/(63 * 32 * 512)
+ *   1G < X <= 21G |   63    |  64   | X/(63 * 64 * 512)
+ *  21G < X <= 42G |   63    | 128   | X/(63 * 128 * 512)
+ *  42G < X        |   63    | 255   | X/(63 * 255 * 512)
+ *
+ */
+#define BLOCK_SIZE_528M 1081344
+#define BLOCK_SIZE_1G 2097152
+#define BLOCK_SIZE_21G 4403200
+#define BLOCK_SIZE_42G 8806400
+#define BLOCK_SIZE_84G 17612800
+
+static void i2o_block_biosparam(unsigned long capacity, unsigned short *cyls,
+ unsigned char *hds, unsigned char *secs)
+{
+ unsigned long heads, sectors, cylinders;
+
+ sectors = 63L; /* Maximize sectors per track */
+ if (capacity <= BLOCK_SIZE_528M)
+ heads = 16;
+ else if (capacity <= BLOCK_SIZE_1G)
+ heads = 32;
+ else if (capacity <= BLOCK_SIZE_21G)
+ heads = 64;
+ else if (capacity <= BLOCK_SIZE_42G)
+ heads = 128;
+ else
+ heads = 255;
+
+ cylinders = (unsigned long)capacity / (heads * sectors);
+
+ *cyls = (unsigned short)cylinders; /* Stuff return values */
+ *secs = (unsigned char)sectors;
+ *hds = (unsigned char)heads;
+}
+
+/**
+ * i2o_block_open - Open the block device
+ * @bdev: block device being opened
+ * @mode: file open mode
+ *
+ * Power up the device, mount and lock the media. This function is called
+ * if the block device is opened for access.
+ *
+ * Returns 0 on success or negative error code on failure.
+ */
+static int i2o_block_open(struct block_device *bdev, fmode_t mode)
+{
+ struct i2o_block_device *dev = bdev->bd_disk->private_data;
+
+ if (!dev->i2o_dev)
+ return -ENODEV;
+
+ mutex_lock(&i2o_block_mutex);
+ if (dev->power > 0x1f)
+ i2o_block_device_power(dev, 0x02);
+
+ i2o_block_device_mount(dev->i2o_dev, -1);
+
+ i2o_block_device_lock(dev->i2o_dev, -1);
+
+ osm_debug("Ready.\n");
+ mutex_unlock(&i2o_block_mutex);
+
+ return 0;
+};
+
+/**
+ * i2o_block_release - Release the I2O block device
+ * @disk: gendisk device being released
+ * @mode: file open mode
+ *
+ * Unlock and unmount the media, and power down the device. Gets called if
+ * the block device is closed.
+ */
+static void i2o_block_release(struct gendisk *disk, fmode_t mode)
+{
+ struct i2o_block_device *dev = disk->private_data;
+ u8 operation;
+
+ /*
+ * This is to deal with the case of an application
+ * opening a device and then the device disappears while
+ * it's in use, and then the application tries to release
+ * it. ex: Unmounting a deleted RAID volume at reboot.
+ * If we send messages, it will just cause FAILs since
+ * the TID no longer exists.
+ */
+ if (!dev->i2o_dev)
+ return;
+
+ mutex_lock(&i2o_block_mutex);
+ i2o_block_device_flush(dev->i2o_dev);
+
+ i2o_block_device_unlock(dev->i2o_dev, -1);
+
+ if (dev->flags & (1 << 3 | 1 << 4)) /* Removable */
+ operation = 0x21;
+ else
+ operation = 0x24;
+
+ i2o_block_device_power(dev, operation);
+ mutex_unlock(&i2o_block_mutex);
+}
+
+static int i2o_block_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+ i2o_block_biosparam(get_capacity(bdev->bd_disk),
+ &geo->cylinders, &geo->heads, &geo->sectors);
+ return 0;
+}
+
+/**
+ * i2o_block_ioctl - Issue device specific ioctl calls.
+ * @bdev: block device being opened
+ * @mode: file open mode
+ * @cmd: ioctl command
+ * @arg: ioctl argument
+ *
+ * Handles ioctl request for the block device.
+ *
+ * Return 0 on success or negative error on failure.
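+ *
+ * A hypothetical userspace caller (illustrative only) first queries and
+ * then sets the read cache strategy; note that the set variant passes
+ * the strategy as the ioctl argument value itself, not via a pointer:
+ *
+ *    int cur;
+ *    ioctl(fd, BLKI2OGRSTRAT, &cur);
+ *    ioctl(fd, BLKI2OSRSTRAT, CACHE_SMARTFETCH);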
+ */
+static int i2o_block_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ struct gendisk *disk = bdev->bd_disk;
+ struct i2o_block_device *dev = disk->private_data;
+ int ret = -ENOTTY;
+
+ /* Anyone capable of this syscall can do *real bad* things */
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ mutex_lock(&i2o_block_mutex);
+ switch (cmd) {
+ case BLKI2OGRSTRAT:
+ ret = put_user(dev->rcache, (int __user *)arg);
+ break;
+ case BLKI2OGWSTRAT:
+ ret = put_user(dev->wcache, (int __user *)arg);
+ break;
+ case BLKI2OSRSTRAT:
+ ret = -EINVAL;
+ if (arg < 0 || arg > CACHE_SMARTFETCH)
+ break;
+ dev->rcache = arg;
+ ret = 0;
+ break;
+ case BLKI2OSWSTRAT:
+ ret = -EINVAL;
+ if (arg != 0
+ && (arg < CACHE_WRITETHROUGH || arg > CACHE_SMARTBACK))
+ break;
+ dev->wcache = arg;
+ ret = 0;
+ break;
+ }
+ mutex_unlock(&i2o_block_mutex);
+
+ return ret;
+};
+
+/**
+ * i2o_block_check_events - Have we seen a media change?
+ * @disk: gendisk which should be verified
+ * @clearing: events being cleared
+ *
+ * Verifies if the media has changed.
+ *
+ * Returns DISK_EVENT_MEDIA_CHANGE if the media was changed, or 0
+ * otherwise.
+ */
+static unsigned int i2o_block_check_events(struct gendisk *disk,
+ unsigned int clearing)
+{
+ struct i2o_block_device *p = disk->private_data;
+
+ if (p->media_change_flag) {
+ p->media_change_flag = 0;
+ return DISK_EVENT_MEDIA_CHANGE;
+ }
+ return 0;
+}
+
+/**
+ * i2o_block_transfer - Transfer a request to/from the I2O controller
+ * @req: the request which should be transferred
+ *
+ * This function converts the request into an I2O message. The necessary
+ * DMA buffers are allocated, and once everything is set up the message
+ * is posted to the I2O controller. No cleanup is done by this function.
+ * It is done on the interrupt side when the reply arrives.
+ *
+ * Return 0 on success or negative error code on failure.
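+ *
+ * For DPT/Adaptec controllers the request is wrapped into a private
+ * SCSI pass-through message instead of a plain block read/write; see
+ * the CONFIG_I2O_EXT_ADAPTEC branch below.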
+ */ +static int i2o_block_transfer(struct request *req) +{ + struct i2o_block_device *dev = req->rq_disk->private_data; + struct i2o_controller *c; + u32 tid; + struct i2o_message *msg; + u32 *mptr; + struct i2o_block_request *ireq = req->special; + u32 tcntxt; + u32 sgl_offset = SGL_OFFSET_8; + u32 ctl_flags = 0x00000000; + int rc; + u32 cmd; + + if (unlikely(!dev->i2o_dev)) { + osm_err("transfer to removed drive\n"); + rc = -ENODEV; + goto exit; + } + + tid = dev->i2o_dev->lct_data.tid; + c = dev->i2o_dev->iop; + + msg = i2o_msg_get(c); + if (IS_ERR(msg)) { + rc = PTR_ERR(msg); + goto exit; + } + + tcntxt = i2o_cntxt_list_add(c, req); + if (!tcntxt) { + rc = -ENOMEM; + goto nop_msg; + } + + msg->u.s.icntxt = cpu_to_le32(i2o_block_driver.context); + msg->u.s.tcntxt = cpu_to_le32(tcntxt); + + mptr = &msg->body[0]; + + if (rq_data_dir(req) == READ) { + cmd = I2O_CMD_BLOCK_READ << 24; + + switch (dev->rcache) { + case CACHE_PREFETCH: + ctl_flags = 0x201F0008; + break; + + case CACHE_SMARTFETCH: + if (blk_rq_sectors(req) > 16) + ctl_flags = 0x201F0008; + else + ctl_flags = 0x001F0000; + break; + + default: + break; + } + } else { + cmd = I2O_CMD_BLOCK_WRITE << 24; + + switch (dev->wcache) { + case CACHE_WRITETHROUGH: + ctl_flags = 0x001F0008; + break; + case CACHE_WRITEBACK: + ctl_flags = 0x001F0010; + break; + case CACHE_SMARTBACK: + if (blk_rq_sectors(req) > 16) + ctl_flags = 0x001F0004; + else + ctl_flags = 0x001F0010; + break; + case CACHE_SMARTTHROUGH: + if (blk_rq_sectors(req) > 16) + ctl_flags = 0x001F0004; + else + ctl_flags = 0x001F0010; + default: + break; + } + } + +#ifdef CONFIG_I2O_EXT_ADAPTEC + if (c->adaptec) { + u8 cmd[10]; + u32 scsi_flags; + u16 hwsec; + + hwsec = queue_logical_block_size(req->q) >> KERNEL_SECTOR_SHIFT; + memset(cmd, 0, 10); + + sgl_offset = SGL_OFFSET_12; + + msg->u.head[1] = + cpu_to_le32(I2O_CMD_PRIVATE << 24 | HOST_TID << 12 | tid); + + *mptr++ = cpu_to_le32(I2O_VENDOR_DPT << 16 | I2O_CMD_SCSI_EXEC); + *mptr++ = cpu_to_le32(tid); + + /* + * ENABLE_DISCONNECT + * SIMPLE_TAG + * RETURN_SENSE_DATA_IN_REPLY_MESSAGE_FRAME + */ + if (rq_data_dir(req) == READ) { + cmd[0] = READ_10; + scsi_flags = 0x60a0000a; + } else { + cmd[0] = WRITE_10; + scsi_flags = 0xa0a0000a; + } + + *mptr++ = cpu_to_le32(scsi_flags); + + *((u32 *) & cmd[2]) = cpu_to_be32(blk_rq_pos(req) * hwsec); + *((u16 *) & cmd[7]) = cpu_to_be16(blk_rq_sectors(req) * hwsec); + + memcpy(mptr, cmd, 10); + mptr += 4; + *mptr++ = cpu_to_le32(blk_rq_bytes(req)); + } else +#endif + { + msg->u.head[1] = cpu_to_le32(cmd | HOST_TID << 12 | tid); + *mptr++ = cpu_to_le32(ctl_flags); + *mptr++ = cpu_to_le32(blk_rq_bytes(req)); + *mptr++ = + cpu_to_le32((u32) (blk_rq_pos(req) << KERNEL_SECTOR_SHIFT)); + *mptr++ = + cpu_to_le32(blk_rq_pos(req) >> (32 - KERNEL_SECTOR_SHIFT)); + } + + if (!i2o_block_sglist_alloc(c, ireq, &mptr)) { + rc = -ENOMEM; + goto context_remove; + } + + msg->u.head[0] = + cpu_to_le32(I2O_MESSAGE_SIZE(mptr - &msg->u.head[0]) | sgl_offset); + + list_add_tail(&ireq->queue, &dev->open_queue); + dev->open_queue_depth++; + + i2o_msg_post(c, msg); + + return 0; + + context_remove: + i2o_cntxt_list_remove(c, req); + + nop_msg: + i2o_msg_nop(c, msg); + + exit: + return rc; +}; + +/** + * i2o_block_request_fn - request queue handling function + * @q: request queue from which the request could be fetched + * + * Takes the next request from the queue, transfers it and if no error + * occurs dequeue it from the queue. On arrival of the reply the message + * will be processed further. 
If an error occurs, the request is requeued.
+ */
+static void i2o_block_request_fn(struct request_queue *q)
+{
+ struct request *req;
+
+ while ((req = blk_peek_request(q)) != NULL) {
+ if (req->cmd_type == REQ_TYPE_FS) {
+ struct i2o_block_delayed_request *dreq;
+ struct i2o_block_request *ireq = req->special;
+ unsigned int queue_depth;
+
+ queue_depth = ireq->i2o_blk_dev->open_queue_depth;
+
+ if (queue_depth < I2O_BLOCK_MAX_OPEN_REQUESTS) {
+ if (!i2o_block_transfer(req)) {
+ blk_start_request(req);
+ continue;
+ } else
+ osm_info("transfer error\n");
+ }
+
+ if (queue_depth)
+ break;
+
+ /* stop the queue and retry later */
+ dreq = kmalloc(sizeof(*dreq), GFP_ATOMIC);
+ if (!dreq)
+ continue;
+
+ dreq->queue = q;
+ INIT_DELAYED_WORK(&dreq->work,
+ i2o_block_delayed_request_fn);
+
+ if (!queue_delayed_work(i2o_block_driver.event_queue,
+ &dreq->work,
+ I2O_BLOCK_RETRY_TIME))
+ kfree(dreq);
+ else {
+ blk_stop_queue(q);
+ break;
+ }
+ } else {
+ blk_start_request(req);
+ __blk_end_request_all(req, -EIO);
+ }
+ }
+};
+
+/* I2O Block device operations definition */
+static const struct block_device_operations i2o_block_fops = {
+ .owner = THIS_MODULE,
+ .open = i2o_block_open,
+ .release = i2o_block_release,
+ .ioctl = i2o_block_ioctl,
+ .compat_ioctl = i2o_block_ioctl,
+ .getgeo = i2o_block_getgeo,
+ .check_events = i2o_block_check_events,
+};
+
+/**
+ * i2o_block_device_alloc - Allocate memory for an I2O Block device
+ *
+ * Allocate memory for the i2o_block_device struct, gendisk and request
+ * queue and initialize them as far as possible without additional
+ * device information.
+ *
+ * Returns a pointer to the allocated I2O Block device on success or a
+ * negative error code on failure.
+ */
+static struct i2o_block_device *i2o_block_device_alloc(void)
+{
+ struct i2o_block_device *dev;
+ struct gendisk *gd;
+ struct request_queue *queue;
+ int rc;
+
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+ if (!dev) {
+ osm_err("Insufficient memory to allocate I2O Block disk.\n");
+ rc = -ENOMEM;
+ goto exit;
+ }
+
+ INIT_LIST_HEAD(&dev->open_queue);
+ spin_lock_init(&dev->lock);
+ dev->rcache = CACHE_PREFETCH;
+ dev->wcache = CACHE_WRITEBACK;
+
+ /* allocate a gendisk with 16 partitions */
+ gd = alloc_disk(16);
+ if (!gd) {
+ osm_err("Insufficient memory to allocate gendisk.\n");
+ rc = -ENOMEM;
+ goto cleanup_dev;
+ }
+
+ /* initialize the request queue */
+ queue = blk_init_queue(i2o_block_request_fn, &dev->lock);
+ if (!queue) {
+ osm_err("Insufficient memory to allocate request queue.\n");
+ rc = -ENOMEM;
+ goto cleanup_queue;
+ }
+
+ blk_queue_prep_rq(queue, i2o_block_prep_req_fn);
+
+ gd->major = I2O_MAJOR;
+ gd->queue = queue;
+ gd->fops = &i2o_block_fops;
+ gd->private_data = dev;
+
+ dev->gd = gd;
+
+ return dev;
+
+ cleanup_queue:
+ put_disk(gd);
+
+ cleanup_dev:
+ kfree(dev);
+
+ exit:
+ return ERR_PTR(rc);
+};
+
+/**
+ * i2o_block_probe - verify if dev is an I2O Block device and install it
+ * @dev: device to verify if it is an I2O Block device
+ *
+ * We only verify if the user_tid of the device is 0xfff and then install
+ * the device. Otherwise it is used by some other device (e.g. RAID).
+ *
+ * Returns 0 on success or negative error code on failure.
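+ *
+ * Installed disks are named i2o/hda, i2o/hdb, ... with 16 minors
+ * reserved per disk by the alloc_disk(16) call above.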
+ */ +static int i2o_block_probe(struct device *dev) +{ + struct i2o_device *i2o_dev = to_i2o_device(dev); + struct i2o_controller *c = i2o_dev->iop; + struct i2o_block_device *i2o_blk_dev; + struct gendisk *gd; + struct request_queue *queue; + static int unit = 0; + int rc; + u64 size; + u32 blocksize; + u16 body_size = 4; + u16 power; + unsigned short max_sectors; + +#ifdef CONFIG_I2O_EXT_ADAPTEC + if (c->adaptec) + body_size = 8; +#endif + + if (c->limit_sectors) + max_sectors = I2O_MAX_SECTORS_LIMITED; + else + max_sectors = I2O_MAX_SECTORS; + + /* skip devices which are used by IOP */ + if (i2o_dev->lct_data.user_tid != 0xfff) { + osm_debug("skipping used device %03x\n", i2o_dev->lct_data.tid); + return -ENODEV; + } + + if (i2o_device_claim(i2o_dev)) { + osm_warn("Unable to claim device. Installation aborted\n"); + rc = -EFAULT; + goto exit; + } + + i2o_blk_dev = i2o_block_device_alloc(); + if (IS_ERR(i2o_blk_dev)) { + osm_err("could not alloc a new I2O block device"); + rc = PTR_ERR(i2o_blk_dev); + goto claim_release; + } + + i2o_blk_dev->i2o_dev = i2o_dev; + dev_set_drvdata(dev, i2o_blk_dev); + + /* setup gendisk */ + gd = i2o_blk_dev->gd; + gd->first_minor = unit << 4; + sprintf(gd->disk_name, "i2o/hd%c", 'a' + unit); + gd->driverfs_dev = &i2o_dev->device; + + /* setup request queue */ + queue = gd->queue; + queue->queuedata = i2o_blk_dev; + + blk_queue_max_hw_sectors(queue, max_sectors); + blk_queue_max_segments(queue, i2o_sg_tablesize(c, body_size)); + + osm_debug("max sectors = %d\n", queue->max_sectors); + osm_debug("phys segments = %d\n", queue->max_phys_segments); + osm_debug("max hw segments = %d\n", queue->max_hw_segments); + + /* + * Ask for the current media data. If that isn't supported + * then we ask for the device capacity data + */ + if (!i2o_parm_field_get(i2o_dev, 0x0004, 1, &blocksize, 4) || + !i2o_parm_field_get(i2o_dev, 0x0000, 3, &blocksize, 4)) { + blk_queue_logical_block_size(queue, le32_to_cpu(blocksize)); + } else + osm_warn("unable to get blocksize of %s\n", gd->disk_name); + + if (!i2o_parm_field_get(i2o_dev, 0x0004, 0, &size, 8) || + !i2o_parm_field_get(i2o_dev, 0x0000, 4, &size, 8)) { + set_capacity(gd, le64_to_cpu(size) >> KERNEL_SECTOR_SHIFT); + } else + osm_warn("could not get size of %s\n", gd->disk_name); + + if (!i2o_parm_field_get(i2o_dev, 0x0000, 2, &power, 2)) + i2o_blk_dev->power = power; + + i2o_event_register(i2o_dev, &i2o_block_driver, 0, 0xffffffff); + + add_disk(gd); + + unit++; + + osm_info("device added (TID: %03x): %s\n", i2o_dev->lct_data.tid, + i2o_blk_dev->gd->disk_name); + + return 0; + + claim_release: + i2o_device_claim_release(i2o_dev); + + exit: + return rc; +}; + +/* Block OSM driver struct */ +static struct i2o_driver i2o_block_driver = { + .name = OSM_NAME, + .event = i2o_block_event, + .reply = i2o_block_reply, + .classes = i2o_block_class_id, + .driver = { + .probe = i2o_block_probe, + .remove = i2o_block_remove, + }, +}; + +/** + * i2o_block_init - Block OSM initialization function + * + * Allocate the slab and mempool for request structs, registers i2o_block + * block device and finally register the Block OSM in the I2O core. + * + * Returns 0 on success or negative error code on failure. 
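+ *
+ * i2o_block_exit() below undoes these steps in reverse order.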
+ */ +static int __init i2o_block_init(void) +{ + int rc; + int size; + + printk(KERN_INFO OSM_DESCRIPTION " v" OSM_VERSION "\n"); + + /* Allocate request mempool and slab */ + size = sizeof(struct i2o_block_request); + i2o_blk_req_pool.slab = kmem_cache_create("i2o_block_req", size, 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!i2o_blk_req_pool.slab) { + osm_err("can't init request slab\n"); + rc = -ENOMEM; + goto exit; + } + + i2o_blk_req_pool.pool = + mempool_create_slab_pool(I2O_BLOCK_REQ_MEMPOOL_SIZE, + i2o_blk_req_pool.slab); + if (!i2o_blk_req_pool.pool) { + osm_err("can't init request mempool\n"); + rc = -ENOMEM; + goto free_slab; + } + + /* Register the block device interfaces */ + rc = register_blkdev(I2O_MAJOR, "i2o_block"); + if (rc) { + osm_err("unable to register block device\n"); + goto free_mempool; + } +#ifdef MODULE + osm_info("registered device at major %d\n", I2O_MAJOR); +#endif + + /* Register Block OSM into I2O core */ + rc = i2o_driver_register(&i2o_block_driver); + if (rc) { + osm_err("Could not register Block driver\n"); + goto unregister_blkdev; + } + + return 0; + + unregister_blkdev: + unregister_blkdev(I2O_MAJOR, "i2o_block"); + + free_mempool: + mempool_destroy(i2o_blk_req_pool.pool); + + free_slab: + kmem_cache_destroy(i2o_blk_req_pool.slab); + + exit: + return rc; +}; + +/** + * i2o_block_exit - Block OSM exit function + * + * Unregisters Block OSM from I2O core, unregisters i2o_block block device + * and frees the mempool and slab. + */ +static void __exit i2o_block_exit(void) +{ + /* Unregister I2O Block OSM from I2O core */ + i2o_driver_unregister(&i2o_block_driver); + + /* Unregister block device */ + unregister_blkdev(I2O_MAJOR, "i2o_block"); + + /* Free request mempool and slab */ + mempool_destroy(i2o_blk_req_pool.pool); + kmem_cache_destroy(i2o_blk_req_pool.slab); +}; + +MODULE_AUTHOR("Red Hat"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION(OSM_DESCRIPTION); +MODULE_VERSION(OSM_VERSION); + +module_init(i2o_block_init); +module_exit(i2o_block_exit); diff --git a/drivers/staging/i2o/i2o_block.h b/drivers/staging/i2o/i2o_block.h new file mode 100644 index 000000000000..cf8873cbca3f --- /dev/null +++ b/drivers/staging/i2o/i2o_block.h @@ -0,0 +1,103 @@ +/* + * Block OSM structures/API + * + * Copyright (C) 1999-2002 Red Hat Software + * + * Written by Alan Cox, Building Number Three Ltd + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * For the purpose of avoiding doubt the preferred form of the work + * for making modifications shall be a standards compliant form such + * gzipped tar and not one requiring a proprietary or patent encumbered + * tool to unpack. + * + * Fixes/additions: + * Steve Ralston: + * Multiple device handling error fixes, + * Added a queue depth. + * Alan Cox: + * FC920 has an rmw bug. Dont or in the end marker. + * Removed queue walk, fixed for 64bitness. 
+ * Rewrote much of the code over time + * Added indirect block lists + * Handle 64K limits on many controllers + * Don't use indirects on the Promise (breaks) + * Heavily chop down the queue depths + * Deepak Saxena: + * Independent queues per IOP + * Support for dynamic device creation/deletion + * Code cleanup + * Support for larger I/Os through merge* functions + * (taken from DAC960 driver) + * Boji T Kannanthanam: + * Set the I2O Block devices to be detected in increasing + * order of TIDs during boot. + * Search and set the I2O block device that we boot off + * from as the first device to be claimed (as /dev/i2o/hda) + * Properly attach/detach I2O gendisk structure from the + * system gendisk list. The I2O block devices now appear in + * /proc/partitions. + * Markus Lidel : + * Minor bugfixes for 2.6. + */ + +#ifndef I2O_BLOCK_OSM_H +#define I2O_BLOCK_OSM_H + +#define I2O_BLOCK_RETRY_TIME HZ/4 +#define I2O_BLOCK_MAX_OPEN_REQUESTS 50 + +/* request queue sizes */ +#define I2O_BLOCK_REQ_MEMPOOL_SIZE 32 + +#define KERNEL_SECTOR_SHIFT 9 +#define KERNEL_SECTOR_SIZE (1 << KERNEL_SECTOR_SHIFT) + +/* I2O Block OSM mempool struct */ +struct i2o_block_mempool { + struct kmem_cache *slab; + mempool_t *pool; +}; + +/* I2O Block device descriptor */ +struct i2o_block_device { + struct i2o_device *i2o_dev; /* pointer to I2O device */ + struct gendisk *gd; + spinlock_t lock; /* queue lock */ + struct list_head open_queue; /* list of transferred, but unfinished + requests */ + unsigned int open_queue_depth; /* number of requests in the queue */ + + int rcache; /* read cache flags */ + int wcache; /* write cache flags */ + int flags; + u16 power; /* power state */ + int media_change_flag; /* media changed flag */ +}; + +/* I2O Block device request */ +struct i2o_block_request { + struct list_head queue; + struct request *req; /* corresponding request */ + struct i2o_block_device *i2o_blk_dev; /* I2O block device */ + struct device *dev; /* device used for DMA */ + int sg_nents; /* number of SG elements */ + struct scatterlist sg_table[I2O_MAX_PHYS_SEGMENTS]; /* SG table */ +}; + +/* I2O Block device delayed request */ +struct i2o_block_delayed_request { + struct delayed_work work; + struct request_queue *queue; +}; + +#endif diff --git a/drivers/staging/i2o/i2o_config.c b/drivers/staging/i2o/i2o_config.c new file mode 100644 index 000000000000..04bd3b6de401 --- /dev/null +++ b/drivers/staging/i2o/i2o_config.c @@ -0,0 +1,1163 @@ +/* + * I2O Configuration Interface Driver + * + * (C) Copyright 1999-2002 Red Hat + * + * Written by Alan Cox, Building Number Three Ltd + * + * Fixes/additions: + * Deepak Saxena (04/20/1999): + * Added basic ioctl() support + * Deepak Saxena (06/07/1999): + * Added software download ioctl (still testing) + * Auvo Häkkinen (09/10/1999): + * Changes to i2o_cfg_reply(), ioctl_parms() + * Added ioct_validate() + * Taneli Vähäkangas (09/30/1999): + * Fixed ioctl_swdl() + * Taneli Vähäkangas (10/04/1999): + * Changed ioctl_swdl(), implemented ioctl_swul() and ioctl_swdel() + * Deepak Saxena (11/18/1999): + * Added event managmenet support + * Alan Cox : + * 2.4 rewrite ported to 2.5 + * Markus Lidel : + * Added pass-thru support for Adaptec's raidutils + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */
+
+#include <linux/miscdevice.h>
+#include <linux/mutex.h>
+#include <linux/compat.h>
+#include <linux/slab.h>
+
+#include <asm/uaccess.h>
+
+#include "core.h"
+
+#define SG_TABLESIZE 30
+
+static DEFINE_MUTEX(i2o_cfg_mutex);
+static long i2o_cfg_ioctl(struct file *, unsigned int, unsigned long);
+
+static spinlock_t i2o_config_lock;
+
+#define MODINC(x,y) ((x) = ((x) + 1) % (y))
+
+struct sg_simple_element {
+ u32 flag_count;
+ u32 addr_bus;
+};
+
+struct i2o_cfg_info {
+ struct file *fp;
+ struct fasync_struct *fasync;
+ struct i2o_evt_info event_q[I2O_EVT_Q_LEN];
+ u16 q_in; // Queue head index
+ u16 q_out; // Queue tail index
+ u16 q_len; // Queue length
+ u16 q_lost; // Number of lost events
+ ulong q_id; // Event queue ID...used as tx_context
+ struct i2o_cfg_info *next;
+};
+static struct i2o_cfg_info *open_files = NULL;
+static ulong i2o_cfg_info_id = 0;
+
+static int i2o_cfg_getiops(unsigned long arg)
+{
+ struct i2o_controller *c;
+ u8 __user *user_iop_table = (void __user *)arg;
+ u8 tmp[MAX_I2O_CONTROLLERS];
+ int ret = 0;
+
+ memset(tmp, 0, MAX_I2O_CONTROLLERS);
+
+ list_for_each_entry(c, &i2o_controllers, list)
+ tmp[c->unit] = 1;
+
+ if (copy_to_user(user_iop_table, tmp, MAX_I2O_CONTROLLERS))
+ ret = -EFAULT;
+
+ return ret;
+};
+
+static int i2o_cfg_gethrt(unsigned long arg)
+{
+ struct i2o_controller *c;
+ struct i2o_cmd_hrtlct __user *cmd = (struct i2o_cmd_hrtlct __user *)arg;
+ struct i2o_cmd_hrtlct kcmd;
+ i2o_hrt *hrt;
+ int len;
+ u32 reslen;
+ int ret = 0;
+
+ if (copy_from_user(&kcmd, cmd, sizeof(struct i2o_cmd_hrtlct)))
+ return -EFAULT;
+
+ if (get_user(reslen, kcmd.reslen) < 0)
+ return -EFAULT;
+
+ if (kcmd.resbuf == NULL)
+ return -EFAULT;
+
+ c = i2o_find_iop(kcmd.iop);
+ if (!c)
+ return -ENXIO;
+
+ hrt = (i2o_hrt *) c->hrt.virt;
+
+ len = 8 + ((hrt->entry_len * hrt->num_entries) << 2);
+
+ if (put_user(len, kcmd.reslen))
+ ret = -EFAULT;
+ else if (len > reslen)
+ ret = -ENOBUFS;
+ else if (copy_to_user(kcmd.resbuf, (void *)hrt, len))
+ ret = -EFAULT;
+
+ return ret;
+};
+
+static int i2o_cfg_getlct(unsigned long arg)
+{
+ struct i2o_controller *c;
+ struct i2o_cmd_hrtlct __user *cmd = (struct i2o_cmd_hrtlct __user *)arg;
+ struct i2o_cmd_hrtlct kcmd;
+ i2o_lct *lct;
+ int len;
+ int ret = 0;
+ u32 reslen;
+
+ if (copy_from_user(&kcmd, cmd, sizeof(struct i2o_cmd_hrtlct)))
+ return -EFAULT;
+
+ if (get_user(reslen, kcmd.reslen) < 0)
+ return -EFAULT;
+
+ if (kcmd.resbuf == NULL)
+ return -EFAULT;
+
+ c = i2o_find_iop(kcmd.iop);
+ if (!c)
+ return -ENXIO;
+
+ lct = (i2o_lct *) c->lct;
+
+ len = (unsigned int)lct->table_size << 2;
+ if (put_user(len, kcmd.reslen))
+ ret = -EFAULT;
+ else if (len > reslen)
+ ret = -ENOBUFS;
+ else if (copy_to_user(kcmd.resbuf, lct, len))
+ ret = -EFAULT;
+
+ return ret;
+};
+
+static int i2o_cfg_parms(unsigned long arg, unsigned int type)
+{
+ int ret = 0;
+ struct i2o_controller *c;
+ struct i2o_device *dev;
+ struct i2o_cmd_psetget __user *cmd =
+ (struct i2o_cmd_psetget __user *)arg;
+ struct i2o_cmd_psetget kcmd;
+ u32 reslen;
+ u8 *ops;
+ u8 *res;
+ int len = 0;
+
+ u32 i2o_cmd = (type == I2OPARMGET ?
+ I2O_CMD_UTIL_PARAMS_GET : I2O_CMD_UTIL_PARAMS_SET);
+
+ if (copy_from_user(&kcmd, cmd, sizeof(struct i2o_cmd_psetget)))
+ return -EFAULT;
+
+ if (get_user(reslen, kcmd.reslen))
+ return -EFAULT;
+
+ c = i2o_find_iop(kcmd.iop);
+ if (!c)
+ return -ENXIO;
+
+ dev = i2o_iop_find_device(c, kcmd.tid);
+ if (!dev)
+ return -ENXIO;
+
+ /*
+ * Stop users being able to try and allocate arbitrary amounts
+ * of DMA space. 64K is way more than sufficient for this.
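+ * The same 64K cap is applied to the result buffer allocated further
+ * down in this function.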
+ */ + if (kcmd.oplen > 65536) + return -EMSGSIZE; + + ops = memdup_user(kcmd.opbuf, kcmd.oplen); + if (IS_ERR(ops)) + return PTR_ERR(ops); + + /* + * It's possible to have a _very_ large table + * and that the user asks for all of it at once... + */ + res = kmalloc(65536, GFP_KERNEL); + if (!res) { + kfree(ops); + return -ENOMEM; + } + + len = i2o_parm_issue(dev, i2o_cmd, ops, kcmd.oplen, res, 65536); + kfree(ops); + + if (len < 0) { + kfree(res); + return -EAGAIN; + } + + if (put_user(len, kcmd.reslen)) + ret = -EFAULT; + else if (len > reslen) + ret = -ENOBUFS; + else if (copy_to_user(kcmd.resbuf, res, len)) + ret = -EFAULT; + + kfree(res); + + return ret; +}; + +static int i2o_cfg_swdl(unsigned long arg) +{ + struct i2o_sw_xfer kxfer; + struct i2o_sw_xfer __user *pxfer = (struct i2o_sw_xfer __user *)arg; + unsigned char maxfrag = 0, curfrag = 1; + struct i2o_dma buffer; + struct i2o_message *msg; + unsigned int status = 0, swlen = 0, fragsize = 8192; + struct i2o_controller *c; + + if (copy_from_user(&kxfer, pxfer, sizeof(struct i2o_sw_xfer))) + return -EFAULT; + + if (get_user(swlen, kxfer.swlen) < 0) + return -EFAULT; + + if (get_user(maxfrag, kxfer.maxfrag) < 0) + return -EFAULT; + + if (get_user(curfrag, kxfer.curfrag) < 0) + return -EFAULT; + + if (curfrag == maxfrag) + fragsize = swlen - (maxfrag - 1) * 8192; + + if (!kxfer.buf || !access_ok(VERIFY_READ, kxfer.buf, fragsize)) + return -EFAULT; + + c = i2o_find_iop(kxfer.iop); + if (!c) + return -ENXIO; + + msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET); + if (IS_ERR(msg)) + return PTR_ERR(msg); + + if (i2o_dma_alloc(&c->pdev->dev, &buffer, fragsize)) { + i2o_msg_nop(c, msg); + return -ENOMEM; + } + + if (__copy_from_user(buffer.virt, kxfer.buf, fragsize)) { + i2o_msg_nop(c, msg); + i2o_dma_free(&c->pdev->dev, &buffer); + return -EFAULT; + } + + msg->u.head[0] = cpu_to_le32(NINE_WORD_MSG_SIZE | SGL_OFFSET_7); + msg->u.head[1] = + cpu_to_le32(I2O_CMD_SW_DOWNLOAD << 24 | HOST_TID << 12 | + ADAPTER_TID); + msg->u.head[2] = cpu_to_le32(i2o_config_driver.context); + msg->u.head[3] = cpu_to_le32(0); + msg->body[0] = + cpu_to_le32((((u32) kxfer.flags) << 24) | (((u32) kxfer. 
+ sw_type) << 16) | + (((u32) maxfrag) << 8) | (((u32) curfrag))); + msg->body[1] = cpu_to_le32(swlen); + msg->body[2] = cpu_to_le32(kxfer.sw_id); + msg->body[3] = cpu_to_le32(0xD0000000 | fragsize); + msg->body[4] = cpu_to_le32(buffer.phys); + + osm_debug("swdl frag %d/%d (size %d)\n", curfrag, maxfrag, fragsize); + status = i2o_msg_post_wait_mem(c, msg, 60, &buffer); + + if (status != -ETIMEDOUT) + i2o_dma_free(&c->pdev->dev, &buffer); + + if (status != I2O_POST_WAIT_OK) { + // it fails if you try and send frags out of order + // and for some yet unknown reasons too + osm_info("swdl failed, DetailedStatus = %d\n", status); + return status; + } + + return 0; +}; + +static int i2o_cfg_swul(unsigned long arg) +{ + struct i2o_sw_xfer kxfer; + struct i2o_sw_xfer __user *pxfer = (struct i2o_sw_xfer __user *)arg; + unsigned char maxfrag = 0, curfrag = 1; + struct i2o_dma buffer; + struct i2o_message *msg; + unsigned int status = 0, swlen = 0, fragsize = 8192; + struct i2o_controller *c; + int ret = 0; + + if (copy_from_user(&kxfer, pxfer, sizeof(struct i2o_sw_xfer))) + return -EFAULT; + + if (get_user(swlen, kxfer.swlen) < 0) + return -EFAULT; + + if (get_user(maxfrag, kxfer.maxfrag) < 0) + return -EFAULT; + + if (get_user(curfrag, kxfer.curfrag) < 0) + return -EFAULT; + + if (curfrag == maxfrag) + fragsize = swlen - (maxfrag - 1) * 8192; + + if (!kxfer.buf) + return -EFAULT; + + c = i2o_find_iop(kxfer.iop); + if (!c) + return -ENXIO; + + msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET); + if (IS_ERR(msg)) + return PTR_ERR(msg); + + if (i2o_dma_alloc(&c->pdev->dev, &buffer, fragsize)) { + i2o_msg_nop(c, msg); + return -ENOMEM; + } + + msg->u.head[0] = cpu_to_le32(NINE_WORD_MSG_SIZE | SGL_OFFSET_7); + msg->u.head[1] = + cpu_to_le32(I2O_CMD_SW_UPLOAD << 24 | HOST_TID << 12 | ADAPTER_TID); + msg->u.head[2] = cpu_to_le32(i2o_config_driver.context); + msg->u.head[3] = cpu_to_le32(0); + msg->body[0] = + cpu_to_le32((u32) kxfer.flags << 24 | (u32) kxfer. 
+ sw_type << 16 | (u32) maxfrag << 8 | (u32) curfrag); + msg->body[1] = cpu_to_le32(swlen); + msg->body[2] = cpu_to_le32(kxfer.sw_id); + msg->body[3] = cpu_to_le32(0xD0000000 | fragsize); + msg->body[4] = cpu_to_le32(buffer.phys); + + osm_debug("swul frag %d/%d (size %d)\n", curfrag, maxfrag, fragsize); + status = i2o_msg_post_wait_mem(c, msg, 60, &buffer); + + if (status != I2O_POST_WAIT_OK) { + if (status != -ETIMEDOUT) + i2o_dma_free(&c->pdev->dev, &buffer); + + osm_info("swul failed, DetailedStatus = %d\n", status); + return status; + } + + if (copy_to_user(kxfer.buf, buffer.virt, fragsize)) + ret = -EFAULT; + + i2o_dma_free(&c->pdev->dev, &buffer); + + return ret; +} + +static int i2o_cfg_swdel(unsigned long arg) +{ + struct i2o_controller *c; + struct i2o_sw_xfer kxfer; + struct i2o_sw_xfer __user *pxfer = (struct i2o_sw_xfer __user *)arg; + struct i2o_message *msg; + unsigned int swlen; + int token; + + if (copy_from_user(&kxfer, pxfer, sizeof(struct i2o_sw_xfer))) + return -EFAULT; + + if (get_user(swlen, kxfer.swlen) < 0) + return -EFAULT; + + c = i2o_find_iop(kxfer.iop); + if (!c) + return -ENXIO; + + msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET); + if (IS_ERR(msg)) + return PTR_ERR(msg); + + msg->u.head[0] = cpu_to_le32(SEVEN_WORD_MSG_SIZE | SGL_OFFSET_0); + msg->u.head[1] = + cpu_to_le32(I2O_CMD_SW_REMOVE << 24 | HOST_TID << 12 | ADAPTER_TID); + msg->u.head[2] = cpu_to_le32(i2o_config_driver.context); + msg->u.head[3] = cpu_to_le32(0); + msg->body[0] = + cpu_to_le32((u32) kxfer.flags << 24 | (u32) kxfer.sw_type << 16); + msg->body[1] = cpu_to_le32(swlen); + msg->body[2] = cpu_to_le32(kxfer.sw_id); + + token = i2o_msg_post_wait(c, msg, 10); + + if (token != I2O_POST_WAIT_OK) { + osm_info("swdel failed, DetailedStatus = %d\n", token); + return -ETIMEDOUT; + } + + return 0; +}; + +static int i2o_cfg_validate(unsigned long arg) +{ + int token; + int iop = (int)arg; + struct i2o_message *msg; + struct i2o_controller *c; + + c = i2o_find_iop(iop); + if (!c) + return -ENXIO; + + msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET); + if (IS_ERR(msg)) + return PTR_ERR(msg); + + msg->u.head[0] = cpu_to_le32(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0); + msg->u.head[1] = + cpu_to_le32(I2O_CMD_CONFIG_VALIDATE << 24 | HOST_TID << 12 | iop); + msg->u.head[2] = cpu_to_le32(i2o_config_driver.context); + msg->u.head[3] = cpu_to_le32(0); + + token = i2o_msg_post_wait(c, msg, 10); + + if (token != I2O_POST_WAIT_OK) { + osm_info("Can't validate configuration, ErrorStatus = %d\n", + token); + return -ETIMEDOUT; + } + + return 0; +}; + +static int i2o_cfg_evt_reg(unsigned long arg, struct file *fp) +{ + struct i2o_message *msg; + struct i2o_evt_id __user *pdesc = (struct i2o_evt_id __user *)arg; + struct i2o_evt_id kdesc; + struct i2o_controller *c; + struct i2o_device *d; + + if (copy_from_user(&kdesc, pdesc, sizeof(struct i2o_evt_id))) + return -EFAULT; + + /* IOP exists? */ + c = i2o_find_iop(kdesc.iop); + if (!c) + return -ENXIO; + + /* Device exists? 
*/ + d = i2o_iop_find_device(c, kdesc.tid); + if (!d) + return -ENODEV; + + msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET); + if (IS_ERR(msg)) + return PTR_ERR(msg); + + msg->u.head[0] = cpu_to_le32(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0); + msg->u.head[1] = + cpu_to_le32(I2O_CMD_UTIL_EVT_REGISTER << 24 | HOST_TID << 12 | + kdesc.tid); + msg->u.head[2] = cpu_to_le32(i2o_config_driver.context); + msg->u.head[3] = cpu_to_le32(i2o_cntxt_list_add(c, fp->private_data)); + msg->body[0] = cpu_to_le32(kdesc.evt_mask); + + i2o_msg_post(c, msg); + + return 0; +} + +static int i2o_cfg_evt_get(unsigned long arg, struct file *fp) +{ + struct i2o_cfg_info *p = NULL; + struct i2o_evt_get __user *uget = (struct i2o_evt_get __user *)arg; + struct i2o_evt_get kget; + unsigned long flags; + + for (p = open_files; p; p = p->next) + if (p->q_id == (ulong) fp->private_data) + break; + + if (!p->q_len) + return -ENOENT; + + memcpy(&kget.info, &p->event_q[p->q_out], sizeof(struct i2o_evt_info)); + MODINC(p->q_out, I2O_EVT_Q_LEN); + spin_lock_irqsave(&i2o_config_lock, flags); + p->q_len--; + kget.pending = p->q_len; + kget.lost = p->q_lost; + spin_unlock_irqrestore(&i2o_config_lock, flags); + + if (copy_to_user(uget, &kget, sizeof(struct i2o_evt_get))) + return -EFAULT; + return 0; +} + +#ifdef CONFIG_COMPAT +static int i2o_cfg_passthru32(struct file *file, unsigned cmnd, + unsigned long arg) +{ + struct i2o_cmd_passthru32 __user *cmd; + struct i2o_controller *c; + u32 __user *user_msg; + u32 *reply = NULL; + u32 __user *user_reply = NULL; + u32 size = 0; + u32 reply_size = 0; + u32 rcode = 0; + struct i2o_dma sg_list[SG_TABLESIZE]; + u32 sg_offset = 0; + u32 sg_count = 0; + u32 i = 0; + u32 sg_index = 0; + i2o_status_block *sb; + struct i2o_message *msg; + unsigned int iop; + + cmd = (struct i2o_cmd_passthru32 __user *)arg; + + if (get_user(iop, &cmd->iop) || get_user(i, &cmd->msg)) + return -EFAULT; + + user_msg = compat_ptr(i); + + c = i2o_find_iop(iop); + if (!c) { + osm_debug("controller %d not found\n", iop); + return -ENXIO; + } + + sb = c->status_block.virt; + + if (get_user(size, &user_msg[0])) { + osm_warn("unable to get size!\n"); + return -EFAULT; + } + size = size >> 16; + + if (size > sb->inbound_frame_size) { + osm_warn("size of message > inbound_frame_size"); + return -EFAULT; + } + + user_reply = &user_msg[size]; + + size <<= 2; // Convert to bytes + + msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET); + if (IS_ERR(msg)) + return PTR_ERR(msg); + + rcode = -EFAULT; + /* Copy in the user's I2O command */ + if (copy_from_user(msg, user_msg, size)) { + osm_warn("unable to copy user message\n"); + goto out; + } + i2o_dump_message(msg); + + if (get_user(reply_size, &user_reply[0]) < 0) + goto out; + + reply_size >>= 16; + reply_size <<= 2; + + rcode = -ENOMEM; + reply = kzalloc(reply_size, GFP_KERNEL); + if (!reply) { + printk(KERN_WARNING "%s: Could not allocate reply buffer\n", + c->name); + goto out; + } + + sg_offset = (msg->u.head[0] >> 4) & 0x0f; + + memset(sg_list, 0, sizeof(sg_list[0]) * SG_TABLESIZE); + if (sg_offset) { + struct sg_simple_element *sg; + + if (sg_offset * 4 >= size) { + rcode = -EFAULT; + goto cleanup; + } + // TODO 64bit fix + sg = (struct sg_simple_element *)((&msg->u.head[0]) + + sg_offset); + sg_count = + (size - sg_offset * 4) / sizeof(struct sg_simple_element); + if (sg_count > SG_TABLESIZE) { + printk(KERN_DEBUG "%s:IOCTL SG List too large (%u)\n", + c->name, sg_count); + rcode = -EINVAL; + goto cleanup; + } + + for (i = 0; i < sg_count; i++) { + int sg_size; + struct 
i2o_dma *p; + + if (!(sg[i].flag_count & 0x10000000 + /*I2O_SGL_FLAGS_SIMPLE_ADDRESS_ELEMENT */ )) { + printk(KERN_DEBUG + "%s:Bad SG element %d - not simple (%x)\n", + c->name, i, sg[i].flag_count); + rcode = -EINVAL; + goto cleanup; + } + sg_size = sg[i].flag_count & 0xffffff; + p = &(sg_list[sg_index]); + /* Allocate memory for the transfer */ + if (i2o_dma_alloc(&c->pdev->dev, p, sg_size)) { + printk(KERN_DEBUG + "%s: Could not allocate SG buffer - size = %d buffer number %d of %d\n", + c->name, sg_size, i, sg_count); + rcode = -ENOMEM; + goto sg_list_cleanup; + } + sg_index++; + /* Copy in the user's SG buffer if necessary */ + if (sg[i]. + flag_count & 0x04000000 /*I2O_SGL_FLAGS_DIR */ ) { + // TODO 64bit fix + if (copy_from_user + (p->virt, + (void __user *)(unsigned long)sg[i]. + addr_bus, sg_size)) { + printk(KERN_DEBUG + "%s: Could not copy SG buf %d FROM user\n", + c->name, i); + rcode = -EFAULT; + goto sg_list_cleanup; + } + } + //TODO 64bit fix + sg[i].addr_bus = (u32) p->phys; + } + } + + rcode = i2o_msg_post_wait(c, msg, 60); + msg = NULL; + if (rcode) { + reply[4] = ((u32) rcode) << 24; + goto sg_list_cleanup; + } + + if (sg_offset) { + u32 rmsg[I2O_OUTBOUND_MSG_FRAME_SIZE]; + /* Copy back the Scatter Gather buffers back to user space */ + u32 j; + // TODO 64bit fix + struct sg_simple_element *sg; + int sg_size; + + // re-acquire the original message to handle correctly the sg copy operation + memset(&rmsg, 0, I2O_OUTBOUND_MSG_FRAME_SIZE * 4); + // get user msg size in u32s + if (get_user(size, &user_msg[0])) { + rcode = -EFAULT; + goto sg_list_cleanup; + } + size = size >> 16; + size *= 4; + if (size > sizeof(rmsg)) { + rcode = -EINVAL; + goto sg_list_cleanup; + } + + /* Copy in the user's I2O command */ + if (copy_from_user(rmsg, user_msg, size)) { + rcode = -EFAULT; + goto sg_list_cleanup; + } + sg_count = + (size - sg_offset * 4) / sizeof(struct sg_simple_element); + + // TODO 64bit fix + sg = (struct sg_simple_element *)(rmsg + sg_offset); + for (j = 0; j < sg_count; j++) { + /* Copy out the SG list to user's buffer if necessary */ + if (! + (sg[j]. 
+ flag_count & 0x4000000 /*I2O_SGL_FLAGS_DIR */ )) { + sg_size = sg[j].flag_count & 0xffffff; + // TODO 64bit fix + if (copy_to_user + ((void __user *)(u64) sg[j].addr_bus, + sg_list[j].virt, sg_size)) { + printk(KERN_WARNING + "%s: Could not copy %p TO user %x\n", + c->name, sg_list[j].virt, + sg[j].addr_bus); + rcode = -EFAULT; + goto sg_list_cleanup; + } + } + } + } + +sg_list_cleanup: + /* Copy back the reply to user space */ + if (reply_size) { + // we wrote our own values for context - now restore the user supplied ones + if (copy_from_user(reply + 2, user_msg + 2, sizeof(u32) * 2)) { + printk(KERN_WARNING + "%s: Could not copy message context FROM user\n", + c->name); + rcode = -EFAULT; + } + if (copy_to_user(user_reply, reply, reply_size)) { + printk(KERN_WARNING + "%s: Could not copy reply TO user\n", c->name); + rcode = -EFAULT; + } + } + for (i = 0; i < sg_index; i++) + i2o_dma_free(&c->pdev->dev, &sg_list[i]); + +cleanup: + kfree(reply); +out: + if (msg) + i2o_msg_nop(c, msg); + return rcode; +} + +static long i2o_cfg_compat_ioctl(struct file *file, unsigned cmd, + unsigned long arg) +{ + int ret; + switch (cmd) { + case I2OGETIOPS: + ret = i2o_cfg_ioctl(file, cmd, arg); + break; + case I2OPASSTHRU32: + mutex_lock(&i2o_cfg_mutex); + ret = i2o_cfg_passthru32(file, cmd, arg); + mutex_unlock(&i2o_cfg_mutex); + break; + default: + ret = -ENOIOCTLCMD; + break; + } + return ret; +} + +#endif + +#ifdef CONFIG_I2O_EXT_ADAPTEC +static int i2o_cfg_passthru(unsigned long arg) +{ + struct i2o_cmd_passthru __user *cmd = + (struct i2o_cmd_passthru __user *)arg; + struct i2o_controller *c; + u32 __user *user_msg; + u32 *reply = NULL; + u32 __user *user_reply = NULL; + u32 size = 0; + u32 reply_size = 0; + u32 rcode = 0; + struct i2o_dma sg_list[SG_TABLESIZE]; + u32 sg_offset = 0; + u32 sg_count = 0; + int sg_index = 0; + u32 i = 0; + i2o_status_block *sb; + struct i2o_message *msg; + unsigned int iop; + + if (get_user(iop, &cmd->iop) || get_user(user_msg, &cmd->msg)) + return -EFAULT; + + c = i2o_find_iop(iop); + if (!c) { + osm_warn("controller %d not found\n", iop); + return -ENXIO; + } + + sb = c->status_block.virt; + + if (get_user(size, &user_msg[0])) + return -EFAULT; + size = size >> 16; + + if (size > sb->inbound_frame_size) { + osm_warn("size of message > inbound_frame_size"); + return -EFAULT; + } + + user_reply = &user_msg[size]; + + size <<= 2; // Convert to bytes + + msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET); + if (IS_ERR(msg)) + return PTR_ERR(msg); + + rcode = -EFAULT; + /* Copy in the user's I2O command */ + if (copy_from_user(msg, user_msg, size)) + goto out; + + if (get_user(reply_size, &user_reply[0]) < 0) + goto out; + + reply_size >>= 16; + reply_size <<= 2; + + reply = kzalloc(reply_size, GFP_KERNEL); + if (!reply) { + printk(KERN_WARNING "%s: Could not allocate reply buffer\n", + c->name); + rcode = -ENOMEM; + goto out; + } + + sg_offset = (msg->u.head[0] >> 4) & 0x0f; + + memset(sg_list, 0, sizeof(sg_list[0]) * SG_TABLESIZE); + if (sg_offset) { + struct sg_simple_element *sg; + struct i2o_dma *p; + + if (sg_offset * 4 >= size) { + rcode = -EFAULT; + goto cleanup; + } + // TODO 64bit fix + sg = (struct sg_simple_element *)((&msg->u.head[0]) + + sg_offset); + sg_count = + (size - sg_offset * 4) / sizeof(struct sg_simple_element); + if (sg_count > SG_TABLESIZE) { + printk(KERN_DEBUG "%s:IOCTL SG List too large (%u)\n", + c->name, sg_count); + rcode = -EINVAL; + goto cleanup; + } + + for (i = 0; i < sg_count; i++) { + int sg_size; + + if (!(sg[i].flag_count & 
0x10000000 + /*I2O_SGL_FLAGS_SIMPLE_ADDRESS_ELEMENT */ )) { + printk(KERN_DEBUG + "%s:Bad SG element %d - not simple (%x)\n", + c->name, i, sg[i].flag_count); + rcode = -EINVAL; + goto sg_list_cleanup; + } + sg_size = sg[i].flag_count & 0xffffff; + p = &(sg_list[sg_index]); + if (i2o_dma_alloc(&c->pdev->dev, p, sg_size)) { + /* Allocate memory for the transfer */ + printk(KERN_DEBUG + "%s: Could not allocate SG buffer - size = %d buffer number %d of %d\n", + c->name, sg_size, i, sg_count); + rcode = -ENOMEM; + goto sg_list_cleanup; + } + sg_index++; + /* Copy in the user's SG buffer if necessary */ + if (sg[i]. + flag_count & 0x04000000 /*I2O_SGL_FLAGS_DIR */ ) { + // TODO 64bit fix + if (copy_from_user + (p->virt, (void __user *)sg[i].addr_bus, + sg_size)) { + printk(KERN_DEBUG + "%s: Could not copy SG buf %d FROM user\n", + c->name, i); + rcode = -EFAULT; + goto sg_list_cleanup; + } + } + sg[i].addr_bus = p->phys; + } + } + + rcode = i2o_msg_post_wait(c, msg, 60); + msg = NULL; + if (rcode) { + reply[4] = ((u32) rcode) << 24; + goto sg_list_cleanup; + } + + if (sg_offset) { + u32 rmsg[I2O_OUTBOUND_MSG_FRAME_SIZE]; + /* Copy back the Scatter Gather buffers back to user space */ + u32 j; + // TODO 64bit fix + struct sg_simple_element *sg; + int sg_size; + + // re-acquire the original message to handle correctly the sg copy operation + memset(&rmsg, 0, I2O_OUTBOUND_MSG_FRAME_SIZE * 4); + // get user msg size in u32s + if (get_user(size, &user_msg[0])) { + rcode = -EFAULT; + goto sg_list_cleanup; + } + size = size >> 16; + size *= 4; + if (size > sizeof(rmsg)) { + rcode = -EFAULT; + goto sg_list_cleanup; + } + + /* Copy in the user's I2O command */ + if (copy_from_user(rmsg, user_msg, size)) { + rcode = -EFAULT; + goto sg_list_cleanup; + } + sg_count = + (size - sg_offset * 4) / sizeof(struct sg_simple_element); + + // TODO 64bit fix + sg = (struct sg_simple_element *)(rmsg + sg_offset); + for (j = 0; j < sg_count; j++) { + /* Copy out the SG list to user's buffer if necessary */ + if (! + (sg[j]. 
+ flag_count & 0x4000000 /*I2O_SGL_FLAGS_DIR */ )) { + sg_size = sg[j].flag_count & 0xffffff; + // TODO 64bit fix + if (copy_to_user + ((void __user *)sg[j].addr_bus, sg_list[j].virt, + sg_size)) { + printk(KERN_WARNING + "%s: Could not copy %p TO user %x\n", + c->name, sg_list[j].virt, + sg[j].addr_bus); + rcode = -EFAULT; + goto sg_list_cleanup; + } + } + } + } + +sg_list_cleanup: + /* Copy back the reply to user space */ + if (reply_size) { + // we wrote our own values for context - now restore the user supplied ones + if (copy_from_user(reply + 2, user_msg + 2, sizeof(u32) * 2)) { + printk(KERN_WARNING + "%s: Could not copy message context FROM user\n", + c->name); + rcode = -EFAULT; + } + if (copy_to_user(user_reply, reply, reply_size)) { + printk(KERN_WARNING + "%s: Could not copy reply TO user\n", c->name); + rcode = -EFAULT; + } + } + + for (i = 0; i < sg_index; i++) + i2o_dma_free(&c->pdev->dev, &sg_list[i]); + +cleanup: + kfree(reply); +out: + if (msg) + i2o_msg_nop(c, msg); + return rcode; +} +#endif + +/* + * IOCTL Handler + */ +static long i2o_cfg_ioctl(struct file *fp, unsigned int cmd, unsigned long arg) +{ + int ret; + + mutex_lock(&i2o_cfg_mutex); + switch (cmd) { + case I2OGETIOPS: + ret = i2o_cfg_getiops(arg); + break; + + case I2OHRTGET: + ret = i2o_cfg_gethrt(arg); + break; + + case I2OLCTGET: + ret = i2o_cfg_getlct(arg); + break; + + case I2OPARMSET: + ret = i2o_cfg_parms(arg, I2OPARMSET); + break; + + case I2OPARMGET: + ret = i2o_cfg_parms(arg, I2OPARMGET); + break; + + case I2OSWDL: + ret = i2o_cfg_swdl(arg); + break; + + case I2OSWUL: + ret = i2o_cfg_swul(arg); + break; + + case I2OSWDEL: + ret = i2o_cfg_swdel(arg); + break; + + case I2OVALIDATE: + ret = i2o_cfg_validate(arg); + break; + + case I2OEVTREG: + ret = i2o_cfg_evt_reg(arg, fp); + break; + + case I2OEVTGET: + ret = i2o_cfg_evt_get(arg, fp); + break; + +#ifdef CONFIG_I2O_EXT_ADAPTEC + case I2OPASSTHRU: + ret = i2o_cfg_passthru(arg); + break; +#endif + + default: + osm_debug("unknown ioctl called!\n"); + ret = -EINVAL; + } + mutex_unlock(&i2o_cfg_mutex); + return ret; +} + +static int cfg_open(struct inode *inode, struct file *file) +{ + struct i2o_cfg_info *tmp = kmalloc(sizeof(struct i2o_cfg_info), + GFP_KERNEL); + unsigned long flags; + + if (!tmp) + return -ENOMEM; + + mutex_lock(&i2o_cfg_mutex); + file->private_data = (void *)(i2o_cfg_info_id++); + tmp->fp = file; + tmp->fasync = NULL; + tmp->q_id = (ulong) file->private_data; + tmp->q_len = 0; + tmp->q_in = 0; + tmp->q_out = 0; + tmp->q_lost = 0; + tmp->next = open_files; + + spin_lock_irqsave(&i2o_config_lock, flags); + open_files = tmp; + spin_unlock_irqrestore(&i2o_config_lock, flags); + mutex_unlock(&i2o_cfg_mutex); + + return 0; +} + +static int cfg_fasync(int fd, struct file *fp, int on) +{ + ulong id = (ulong) fp->private_data; + struct i2o_cfg_info *p; + int ret = -EBADF; + + mutex_lock(&i2o_cfg_mutex); + for (p = open_files; p; p = p->next) + if (p->q_id == id) + break; + + if (p) + ret = fasync_helper(fd, fp, on, &p->fasync); + mutex_unlock(&i2o_cfg_mutex); + return ret; +} + +static int cfg_release(struct inode *inode, struct file *file) +{ + ulong id = (ulong) file->private_data; + struct i2o_cfg_info *p, **q; + unsigned long flags; + + mutex_lock(&i2o_cfg_mutex); + spin_lock_irqsave(&i2o_config_lock, flags); + for (q = &open_files; (p = *q) != NULL; q = &p->next) { + if (p->q_id == id) { + *q = p->next; + kfree(p); + break; + } + } + spin_unlock_irqrestore(&i2o_config_lock, flags); + mutex_unlock(&i2o_cfg_mutex); + + return 0; +} + 
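The per-open i2o_cfg_info state initialized in cfg_open() above (q_in, q_out, q_len, q_lost) implements a fixed-size ring buffer: the event side appends at q_in, i2o_cfg_evt_get() consumes at q_out, and q_lost counts events dropped when the queue is full. The following is a minimal stand-alone sketch of that queue discipline, not part of the patch; it assumes MODINC is the usual modular-increment macro defined elsewhere in i2o_config.c, and uses a hypothetical Q_LEN in place of I2O_EVT_Q_LEN.

/*
 * Illustrative sketch only: the shape of the fixed-size event queue
 * that i2o_cfg_evt_get() drains. MODINC() is assumed to match the
 * modular increment used elsewhere in i2o_config.c.
 */
#include <stdio.h>

#define Q_LEN 16
#define MODINC(x, y) ((x) = ((x) + 1) % (y))

struct evt_queue {
	int q[Q_LEN];
	unsigned int q_in, q_out, q_len, q_lost;
};

/* Producer side: store an event, or count it as lost if the queue is full. */
static void evt_push(struct evt_queue *p, int evt)
{
	if (p->q_len == Q_LEN) {
		p->q_lost++;	/* queue full: drop the new event, remember it */
		return;
	}
	p->q[p->q_in] = evt;
	MODINC(p->q_in, Q_LEN);
	p->q_len++;
}

/* Consumer side: mirrors the q_out/q_len bookkeeping in i2o_cfg_evt_get(). */
static int evt_pop(struct evt_queue *p, int *evt)
{
	if (!p->q_len)
		return -1;	/* the driver returns -ENOENT here */
	*evt = p->q[p->q_out];
	MODINC(p->q_out, Q_LEN);
	p->q_len--;
	return 0;
}

int main(void)
{
	struct evt_queue q = { .q_in = 0, .q_out = 0, .q_len = 0, .q_lost = 0 };
	int e;

	evt_push(&q, 42);
	if (!evt_pop(&q, &e))
		printf("event %d, pending %u, lost %u\n", e, q.q_len, q.q_lost);
	return 0;
}

Note that i2o_cfg_evt_get() above copies the queue entry and advances q_out before taking i2o_config_lock; only the q_len and q_lost counters are read and updated under the lock.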
+static const struct file_operations config_fops = {
+ .owner = THIS_MODULE,
+ .llseek = no_llseek,
+ .unlocked_ioctl = i2o_cfg_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = i2o_cfg_compat_ioctl,
+#endif
+ .open = cfg_open,
+ .release = cfg_release,
+ .fasync = cfg_fasync,
+};
+
+static struct miscdevice i2o_miscdev = {
+ I2O_MINOR,
+ "i2octl",
+ &config_fops
+};
+
+static int __init i2o_config_old_init(void)
+{
+ spin_lock_init(&i2o_config_lock);
+
+ if (misc_register(&i2o_miscdev) < 0) {
+ osm_err("can't register device.\n");
+ return -EBUSY;
+ }
+
+ return 0;
+}
+
+static void i2o_config_old_exit(void)
+{
+ misc_deregister(&i2o_miscdev);
+}
+
+MODULE_AUTHOR("Red Hat Software");
diff --git a/drivers/staging/i2o/i2o_proc.c b/drivers/staging/i2o/i2o_proc.c
new file mode 100644
index 000000000000..ad84f3304f3c
--- /dev/null
+++ b/drivers/staging/i2o/i2o_proc.c
@@ -0,0 +1,2045 @@
+/*
+ * procfs handler for Linux I2O subsystem
+ *
+ * (c) Copyright 1999 Deepak Saxena
+ *
+ * Originally written by Deepak Saxena (deepak@plexity.net)
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This is an initial test release. The code is based on the design of the
+ * ide procfs system (drivers/block/ide-proc.c). Some code taken from
+ * i2o-core module by Alan Cox.
+ *
+ * DISCLAIMER: This code is still under development/test and may cause
+ * your system to behave unpredictably. Use at your own discretion.
+ *
+ *
+ * Fixes/additions:
+ * Juha Sievänen (Juha.Sievanen@cs.Helsinki.FI),
+ * Auvo Häkkinen (Auvo.Hakkinen@cs.Helsinki.FI)
+ * University of Helsinki, Department of Computer Science
+ * LAN entries
+ * Markus Lidel <Markus.Lidel@shadowconnect.com>
+ * Changes for new I2O API
+ */
+
+#define OSM_NAME "proc-osm"
+#define OSM_VERSION "1.316"
+#define OSM_DESCRIPTION "I2O ProcFS OSM"
+
+#define I2O_MAX_MODULES 4
+// FIXME!
+#define FMT_U64_HEX "0x%08x%08x"
+#define U64_VAL(pu64) *((u32*)(pu64)+1), *((u32*)(pu64))
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include "i2o.h"
+#include <linux/slab.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+
+#include <asm/io.h>
+#include <asm/uaccess.h>
+#include <asm/byteorder.h>
+
+/* Structure used to define /proc entries */
+typedef struct _i2o_proc_entry_t {
+ char *name; /* entry name */
+ umode_t mode; /* mode */
+ const struct file_operations *fops; /* open function */
+} i2o_proc_entry;
+
+/* global I2O /proc/i2o entry */
+static struct proc_dir_entry *i2o_proc_dir_root;
+
+/* proc OSM driver struct */
+static struct i2o_driver i2o_proc_driver = {
+ .name = OSM_NAME,
+};
+
+static int print_serial_number(struct seq_file *seq, u8 * serialno, int max_len)
+{
+ int i;
+
+ /* 19990419 -sralston
+ * The I2O v1.5 (and v2.0 so far) "official specification"
+ * got serial numbers WRONG!
+ * Apparently, and despite what Section 3.4.4 says and
+ * Figure 3-35 shows (pg 3-39 in the pdf doc),
+ * the convention / consensus seems to be:
+ * + First byte is SNFormat
+ * + Second byte is SNLen (but only if SNFormat==7 (?))
+ * + (v2.0) SCSI+BS may use IEEE Registered (64 or 128 bit) format
+ */
+ switch (serialno[0]) {
+ case I2O_SNFORMAT_BINARY: /* Binary */
+ seq_printf(seq, "0x");
+ for (i = 0; i < serialno[1]; i++) {
+ seq_printf(seq, "%02X", serialno[2 + i]);
+ }
+ break;
+
+ case I2O_SNFORMAT_ASCII: /* ASCII */
+ if (serialno[1] < ' ') { /* printable or SNLen? */
+ /* sanity */
+ max_len =
+ (max_len < serialno[1]) ?
max_len : serialno[1]; + serialno[1 + max_len] = '\0'; + + /* just print it */ + seq_printf(seq, "%s", &serialno[2]); + } else { + /* print chars for specified length */ + for (i = 0; i < serialno[1]; i++) { + seq_printf(seq, "%c", serialno[2 + i]); + } + } + break; + + case I2O_SNFORMAT_UNICODE: /* UNICODE */ + seq_printf(seq, "UNICODE Format. Can't Display\n"); + break; + + case I2O_SNFORMAT_LAN48_MAC: /* LAN-48 MAC Address */ + seq_printf(seq, "LAN-48 MAC address @ %pM", &serialno[2]); + break; + + case I2O_SNFORMAT_WAN: /* WAN MAC Address */ + /* FIXME: Figure out what a WAN access address looks like?? */ + seq_printf(seq, "WAN Access Address"); + break; + +/* plus new in v2.0 */ + case I2O_SNFORMAT_LAN64_MAC: /* LAN-64 MAC Address */ + /* FIXME: Figure out what a LAN-64 address really looks like?? */ + seq_printf(seq, + "LAN-64 MAC address @ [?:%02X:%02X:?] %pM", + serialno[8], serialno[9], &serialno[2]); + break; + + case I2O_SNFORMAT_DDM: /* I2O DDM */ + seq_printf(seq, + "DDM: Tid=%03Xh, Rsvd=%04Xh, OrgId=%04Xh", + *(u16 *) & serialno[2], + *(u16 *) & serialno[4], *(u16 *) & serialno[6]); + break; + + case I2O_SNFORMAT_IEEE_REG64: /* IEEE Registered (64-bit) */ + case I2O_SNFORMAT_IEEE_REG128: /* IEEE Registered (128-bit) */ + /* FIXME: Figure if this is even close?? */ + seq_printf(seq, + "IEEE NodeName(hi,lo)=(%08Xh:%08Xh), PortName(hi,lo)=(%08Xh:%08Xh)\n", + *(u32 *) & serialno[2], + *(u32 *) & serialno[6], + *(u32 *) & serialno[10], *(u32 *) & serialno[14]); + break; + + case I2O_SNFORMAT_UNKNOWN: /* Unknown 0 */ + case I2O_SNFORMAT_UNKNOWN2: /* Unknown 0xff */ + default: + seq_printf(seq, "Unknown data format (0x%02x)", serialno[0]); + break; + } + + return 0; +} + +/** + * i2o_get_class_name - do i2o class name lookup + * @class: class number + * + * Return a descriptive string for an i2o class. 
+ */ +static const char *i2o_get_class_name(int class) +{ + int idx = 16; + static char *i2o_class_name[] = { + "Executive", + "Device Driver Module", + "Block Device", + "Tape Device", + "LAN Interface", + "WAN Interface", + "Fibre Channel Port", + "Fibre Channel Device", + "SCSI Device", + "ATE Port", + "ATE Device", + "Floppy Controller", + "Floppy Device", + "Secondary Bus Port", + "Peer Transport Agent", + "Peer Transport", + "Unknown" + }; + + switch (class & 0xfff) { + case I2O_CLASS_EXECUTIVE: + idx = 0; + break; + case I2O_CLASS_DDM: + idx = 1; + break; + case I2O_CLASS_RANDOM_BLOCK_STORAGE: + idx = 2; + break; + case I2O_CLASS_SEQUENTIAL_STORAGE: + idx = 3; + break; + case I2O_CLASS_LAN: + idx = 4; + break; + case I2O_CLASS_WAN: + idx = 5; + break; + case I2O_CLASS_FIBRE_CHANNEL_PORT: + idx = 6; + break; + case I2O_CLASS_FIBRE_CHANNEL_PERIPHERAL: + idx = 7; + break; + case I2O_CLASS_SCSI_PERIPHERAL: + idx = 8; + break; + case I2O_CLASS_ATE_PORT: + idx = 9; + break; + case I2O_CLASS_ATE_PERIPHERAL: + idx = 10; + break; + case I2O_CLASS_FLOPPY_CONTROLLER: + idx = 11; + break; + case I2O_CLASS_FLOPPY_DEVICE: + idx = 12; + break; + case I2O_CLASS_BUS_ADAPTER: + idx = 13; + break; + case I2O_CLASS_PEER_TRANSPORT_AGENT: + idx = 14; + break; + case I2O_CLASS_PEER_TRANSPORT: + idx = 15; + break; + } + + return i2o_class_name[idx]; +} + +#define SCSI_TABLE_SIZE 13 +static char *scsi_devices[] = { + "Direct-Access Read/Write", + "Sequential-Access Storage", + "Printer", + "Processor", + "WORM Device", + "CD-ROM Device", + "Scanner Device", + "Optical Memory Device", + "Medium Changer Device", + "Communications Device", + "Graphics Art Pre-Press Device", + "Graphics Art Pre-Press Device", + "Array Controller Device" +}; + +static char *chtostr(char *tmp, u8 *chars, int n) +{ + tmp[0] = 0; + return strncat(tmp, (char *)chars, n); +} + +static int i2o_report_query_status(struct seq_file *seq, int block_status, + char *group) +{ + switch (block_status) { + case -ETIMEDOUT: + return seq_printf(seq, "Timeout reading group %s.\n", group); + case -ENOMEM: + return seq_printf(seq, "No free memory to read the table.\n"); + case -I2O_PARAMS_STATUS_INVALID_GROUP_ID: + return seq_printf(seq, "Group %s not supported.\n", group); + default: + return seq_printf(seq, + "Error reading group %s. BlockStatus 0x%02X\n", + group, -block_status); + } +} + +static char *bus_strings[] = { + "Local Bus", + "ISA", + "EISA", + "PCI", + "PCMCIA", + "NUBUS", + "CARDBUS" +}; + +static int i2o_seq_show_hrt(struct seq_file *seq, void *v) +{ + struct i2o_controller *c = (struct i2o_controller *)seq->private; + i2o_hrt *hrt = (i2o_hrt *) c->hrt.virt; + u32 bus; + int i; + + if (hrt->hrt_version) { + seq_printf(seq, + "HRT table for controller is too new a version.\n"); + return 0; + } + + seq_printf(seq, "HRT has %d entries of %d bytes each.\n", + hrt->num_entries, hrt->entry_len << 2); + + for (i = 0; i < hrt->num_entries; i++) { + seq_printf(seq, "Entry %d:\n", i); + seq_printf(seq, " Adapter ID: %0#10x\n", + hrt->hrt_entry[i].adapter_id); + seq_printf(seq, " Controlling tid: %0#6x\n", + hrt->hrt_entry[i].parent_tid); + + if (hrt->hrt_entry[i].bus_type != 0x80) { + bus = hrt->hrt_entry[i].bus_type; + seq_printf(seq, " %s Information\n", + bus_strings[bus]); + + switch (bus) { + case I2O_BUS_LOCAL: + seq_printf(seq, " IOBase: %0#6x,", + hrt->hrt_entry[i].bus.local_bus. + LbBaseIOPort); + seq_printf(seq, " MemoryBase: %0#10x\n", + hrt->hrt_entry[i].bus.local_bus. 
+ LbBaseMemoryAddress); + break; + + case I2O_BUS_ISA: + seq_printf(seq, " IOBase: %0#6x,", + hrt->hrt_entry[i].bus.isa_bus. + IsaBaseIOPort); + seq_printf(seq, " MemoryBase: %0#10x,", + hrt->hrt_entry[i].bus.isa_bus. + IsaBaseMemoryAddress); + seq_printf(seq, " CSN: %0#4x,", + hrt->hrt_entry[i].bus.isa_bus.CSN); + break; + + case I2O_BUS_EISA: + seq_printf(seq, " IOBase: %0#6x,", + hrt->hrt_entry[i].bus.eisa_bus. + EisaBaseIOPort); + seq_printf(seq, " MemoryBase: %0#10x,", + hrt->hrt_entry[i].bus.eisa_bus. + EisaBaseMemoryAddress); + seq_printf(seq, " Slot: %0#4x,", + hrt->hrt_entry[i].bus.eisa_bus. + EisaSlotNumber); + break; + + case I2O_BUS_PCI: + seq_printf(seq, " Bus: %0#4x", + hrt->hrt_entry[i].bus.pci_bus. + PciBusNumber); + seq_printf(seq, " Dev: %0#4x", + hrt->hrt_entry[i].bus.pci_bus. + PciDeviceNumber); + seq_printf(seq, " Func: %0#4x", + hrt->hrt_entry[i].bus.pci_bus. + PciFunctionNumber); + seq_printf(seq, " Vendor: %0#6x", + hrt->hrt_entry[i].bus.pci_bus. + PciVendorID); + seq_printf(seq, " Device: %0#6x\n", + hrt->hrt_entry[i].bus.pci_bus. + PciDeviceID); + break; + + default: + seq_printf(seq, " Unsupported Bus Type\n"); + } + } else + seq_printf(seq, " Unknown Bus Type\n"); + } + + return 0; +} + +static int i2o_seq_show_lct(struct seq_file *seq, void *v) +{ + struct i2o_controller *c = (struct i2o_controller *)seq->private; + i2o_lct *lct = (i2o_lct *) c->lct; + int entries; + int i; + +#define BUS_TABLE_SIZE 3 + static char *bus_ports[] = { + "Generic Bus", + "SCSI Bus", + "Fibre Channel Bus" + }; + + entries = (lct->table_size - 3) / 9; + + seq_printf(seq, "LCT contains %d %s\n", entries, + entries == 1 ? "entry" : "entries"); + if (lct->boot_tid) + seq_printf(seq, "Boot Device @ ID %d\n", lct->boot_tid); + + seq_printf(seq, "Current Change Indicator: %#10x\n", lct->change_ind); + + for (i = 0; i < entries; i++) { + seq_printf(seq, "Entry %d\n", i); + seq_printf(seq, " Class, SubClass : %s", + i2o_get_class_name(lct->lct_entry[i].class_id)); + + /* + * Classes which we'll print subclass info for + */ + switch (lct->lct_entry[i].class_id & 0xFFF) { + case I2O_CLASS_RANDOM_BLOCK_STORAGE: + switch (lct->lct_entry[i].sub_class) { + case 0x00: + seq_printf(seq, ", Direct-Access Read/Write"); + break; + + case 0x04: + seq_printf(seq, ", WORM Drive"); + break; + + case 0x05: + seq_printf(seq, ", CD-ROM Drive"); + break; + + case 0x07: + seq_printf(seq, ", Optical Memory Device"); + break; + + default: + seq_printf(seq, ", Unknown (0x%02x)", + lct->lct_entry[i].sub_class); + break; + } + break; + + case I2O_CLASS_LAN: + switch (lct->lct_entry[i].sub_class & 0xFF) { + case 0x30: + seq_printf(seq, ", Ethernet"); + break; + + case 0x40: + seq_printf(seq, ", 100base VG"); + break; + + case 0x50: + seq_printf(seq, ", IEEE 802.5/Token-Ring"); + break; + + case 0x60: + seq_printf(seq, ", ANSI X3T9.5 FDDI"); + break; + + case 0x70: + seq_printf(seq, ", Fibre Channel"); + break; + + default: + seq_printf(seq, ", Unknown Sub-Class (0x%02x)", + lct->lct_entry[i].sub_class & 0xFF); + break; + } + break; + + case I2O_CLASS_SCSI_PERIPHERAL: + if (lct->lct_entry[i].sub_class < SCSI_TABLE_SIZE) + seq_printf(seq, ", %s", + scsi_devices[lct->lct_entry[i]. + sub_class]); + else + seq_printf(seq, ", Unknown Device Type"); + break; + + case I2O_CLASS_BUS_ADAPTER: + if (lct->lct_entry[i].sub_class < BUS_TABLE_SIZE) + seq_printf(seq, ", %s", + bus_ports[lct->lct_entry[i]. 
+ sub_class]); + else + seq_printf(seq, ", Unknown Bus Type"); + break; + } + seq_printf(seq, "\n"); + + seq_printf(seq, " Local TID : 0x%03x\n", + lct->lct_entry[i].tid); + seq_printf(seq, " User TID : 0x%03x\n", + lct->lct_entry[i].user_tid); + seq_printf(seq, " Parent TID : 0x%03x\n", + lct->lct_entry[i].parent_tid); + seq_printf(seq, " Identity Tag : 0x%x%x%x%x%x%x%x%x\n", + lct->lct_entry[i].identity_tag[0], + lct->lct_entry[i].identity_tag[1], + lct->lct_entry[i].identity_tag[2], + lct->lct_entry[i].identity_tag[3], + lct->lct_entry[i].identity_tag[4], + lct->lct_entry[i].identity_tag[5], + lct->lct_entry[i].identity_tag[6], + lct->lct_entry[i].identity_tag[7]); + seq_printf(seq, " Change Indicator : %0#10x\n", + lct->lct_entry[i].change_ind); + seq_printf(seq, " Event Capab Mask : %0#10x\n", + lct->lct_entry[i].device_flags); + } + + return 0; +} + +static int i2o_seq_show_status(struct seq_file *seq, void *v) +{ + struct i2o_controller *c = (struct i2o_controller *)seq->private; + char prodstr[25]; + int version; + i2o_status_block *sb = c->status_block.virt; + + i2o_status_get(c); // reread the status block + + seq_printf(seq, "Organization ID : %0#6x\n", sb->org_id); + + version = sb->i2o_version; + +/* FIXME for Spec 2.0 + if (version == 0x02) { + seq_printf(seq, "Lowest I2O version supported: "); + switch(workspace[2]) { + case 0x00: + seq_printf(seq, "1.0\n"); + break; + case 0x01: + seq_printf(seq, "1.5\n"); + break; + case 0x02: + seq_printf(seq, "2.0\n"); + break; + } + + seq_printf(seq, "Highest I2O version supported: "); + switch(workspace[3]) { + case 0x00: + seq_printf(seq, "1.0\n"); + break; + case 0x01: + seq_printf(seq, "1.5\n"); + break; + case 0x02: + seq_printf(seq, "2.0\n"); + break; + } + } +*/ + seq_printf(seq, "IOP ID : %0#5x\n", sb->iop_id); + seq_printf(seq, "Host Unit ID : %0#6x\n", sb->host_unit_id); + seq_printf(seq, "Segment Number : %0#5x\n", sb->segment_number); + + seq_printf(seq, "I2O version : "); + switch (version) { + case 0x00: + seq_printf(seq, "1.0\n"); + break; + case 0x01: + seq_printf(seq, "1.5\n"); + break; + case 0x02: + seq_printf(seq, "2.0\n"); + break; + default: + seq_printf(seq, "Unknown version\n"); + } + + seq_printf(seq, "IOP State : "); + switch (sb->iop_state) { + case 0x01: + seq_printf(seq, "INIT\n"); + break; + + case 0x02: + seq_printf(seq, "RESET\n"); + break; + + case 0x04: + seq_printf(seq, "HOLD\n"); + break; + + case 0x05: + seq_printf(seq, "READY\n"); + break; + + case 0x08: + seq_printf(seq, "OPERATIONAL\n"); + break; + + case 0x10: + seq_printf(seq, "FAILED\n"); + break; + + case 0x11: + seq_printf(seq, "FAULTED\n"); + break; + + default: + seq_printf(seq, "Unknown\n"); + break; + } + + seq_printf(seq, "Messenger Type : "); + switch (sb->msg_type) { + case 0x00: + seq_printf(seq, "Memory mapped\n"); + break; + case 0x01: + seq_printf(seq, "Memory mapped only\n"); + break; + case 0x02: + seq_printf(seq, "Remote only\n"); + break; + case 0x03: + seq_printf(seq, "Memory mapped and remote\n"); + break; + default: + seq_printf(seq, "Unknown\n"); + } + + seq_printf(seq, "Inbound Frame Size : %d bytes\n", + sb->inbound_frame_size << 2); + seq_printf(seq, "Max Inbound Frames : %d\n", + sb->max_inbound_frames); + seq_printf(seq, "Current Inbound Frames : %d\n", + sb->cur_inbound_frames); + seq_printf(seq, "Max Outbound Frames : %d\n", + sb->max_outbound_frames); + + /* Spec doesn't say if NULL terminated or not... 
*/ + memcpy(prodstr, sb->product_id, 24); + prodstr[24] = '\0'; + seq_printf(seq, "Product ID : %s\n", prodstr); + seq_printf(seq, "Expected LCT Size : %d bytes\n", + sb->expected_lct_size); + + seq_printf(seq, "IOP Capabilities\n"); + seq_printf(seq, " Context Field Size Support : "); + switch (sb->iop_capabilities & 0x0000003) { + case 0: + seq_printf(seq, "Supports only 32-bit context fields\n"); + break; + case 1: + seq_printf(seq, "Supports only 64-bit context fields\n"); + break; + case 2: + seq_printf(seq, "Supports 32-bit and 64-bit context fields, " + "but not concurrently\n"); + break; + case 3: + seq_printf(seq, "Supports 32-bit and 64-bit context fields " + "concurrently\n"); + break; + default: + seq_printf(seq, "0x%08x\n", sb->iop_capabilities); + } + seq_printf(seq, " Current Context Field Size : "); + switch (sb->iop_capabilities & 0x0000000C) { + case 0: + seq_printf(seq, "not configured\n"); + break; + case 4: + seq_printf(seq, "Supports only 32-bit context fields\n"); + break; + case 8: + seq_printf(seq, "Supports only 64-bit context fields\n"); + break; + case 12: + seq_printf(seq, "Supports both 32-bit or 64-bit context fields " + "concurrently\n"); + break; + default: + seq_printf(seq, "\n"); + } + seq_printf(seq, " Inbound Peer Support : %s\n", + (sb-> + iop_capabilities & 0x00000010) ? "Supported" : + "Not supported"); + seq_printf(seq, " Outbound Peer Support : %s\n", + (sb-> + iop_capabilities & 0x00000020) ? "Supported" : + "Not supported"); + seq_printf(seq, " Peer to Peer Support : %s\n", + (sb-> + iop_capabilities & 0x00000040) ? "Supported" : + "Not supported"); + + seq_printf(seq, "Desired private memory size : %d kB\n", + sb->desired_mem_size >> 10); + seq_printf(seq, "Allocated private memory size : %d kB\n", + sb->current_mem_size >> 10); + seq_printf(seq, "Private memory base address : %0#10x\n", + sb->current_mem_base); + seq_printf(seq, "Desired private I/O size : %d kB\n", + sb->desired_io_size >> 10); + seq_printf(seq, "Allocated private I/O size : %d kB\n", + sb->current_io_size >> 10); + seq_printf(seq, "Private I/O base address : %0#10x\n", + sb->current_io_base); + + return 0; +} + +static int i2o_seq_show_hw(struct seq_file *seq, void *v) +{ + struct i2o_controller *c = (struct i2o_controller *)seq->private; + static u32 work32[5]; + static u8 *work8 = (u8 *) work32; + static u16 *work16 = (u16 *) work32; + int token; + u32 hwcap; + + static char *cpu_table[] = { + "Intel 80960 series", + "AMD2900 series", + "Motorola 68000 series", + "ARM series", + "MIPS series", + "Sparc series", + "PowerPC series", + "Intel x86 series" + }; + + token = + i2o_parm_field_get(c->exec, 0x0000, -1, &work32, sizeof(work32)); + + if (token < 0) { + i2o_report_query_status(seq, token, "0x0000 IOP Hardware"); + return 0; + } + + seq_printf(seq, "I2O Vendor ID : %0#6x\n", work16[0]); + seq_printf(seq, "Product ID : %0#6x\n", work16[1]); + seq_printf(seq, "CPU : "); + if (work8[16] > 8) + seq_printf(seq, "Unknown\n"); + else + seq_printf(seq, "%s\n", cpu_table[work8[16]]); + /* Anyone using ProcessorVersion? */ + + seq_printf(seq, "RAM : %dkB\n", work32[1] >> 10); + seq_printf(seq, "Non-Volatile Mem : %dkB\n", work32[2] >> 10); + + hwcap = work32[3]; + seq_printf(seq, "Capabilities : 0x%08x\n", hwcap); + seq_printf(seq, " [%s] Self booting\n", + (hwcap & 0x00000001) ? "+" : "-"); + seq_printf(seq, " [%s] Upgradable IRTOS\n", + (hwcap & 0x00000002) ? "+" : "-"); + seq_printf(seq, " [%s] Supports downloading DDMs\n", + (hwcap & 0x00000004) ? 
"+" : "-"); + seq_printf(seq, " [%s] Supports installing DDMs\n", + (hwcap & 0x00000008) ? "+" : "-"); + seq_printf(seq, " [%s] Battery-backed RAM\n", + (hwcap & 0x00000010) ? "+" : "-"); + + return 0; +} + +/* Executive group 0003h - Executing DDM List (table) */ +static int i2o_seq_show_ddm_table(struct seq_file *seq, void *v) +{ + struct i2o_controller *c = (struct i2o_controller *)seq->private; + int token; + int i; + + typedef struct _i2o_exec_execute_ddm_table { + u16 ddm_tid; + u8 module_type; + u8 reserved; + u16 i2o_vendor_id; + u16 module_id; + u8 module_name_version[28]; + u32 data_size; + u32 code_size; + } i2o_exec_execute_ddm_table; + + struct { + u16 result_count; + u16 pad; + u16 block_size; + u8 block_status; + u8 error_info_size; + u16 row_count; + u16 more_flag; + i2o_exec_execute_ddm_table ddm_table[I2O_MAX_MODULES]; + } *result; + + i2o_exec_execute_ddm_table ddm_table; + char tmp[28 + 1]; + + result = kmalloc(sizeof(*result), GFP_KERNEL); + if (!result) + return -ENOMEM; + + token = i2o_parm_table_get(c->exec, I2O_PARAMS_TABLE_GET, 0x0003, -1, + NULL, 0, result, sizeof(*result)); + + if (token < 0) { + i2o_report_query_status(seq, token, + "0x0003 Executing DDM List"); + goto out; + } + + seq_printf(seq, + "Tid Module_type Vendor Mod_id Module_name Vrs Data_size Code_size\n"); + ddm_table = result->ddm_table[0]; + + for (i = 0; i < result->row_count; ddm_table = result->ddm_table[++i]) { + seq_printf(seq, "0x%03x ", ddm_table.ddm_tid & 0xFFF); + + switch (ddm_table.module_type) { + case 0x01: + seq_printf(seq, "Downloaded DDM "); + break; + case 0x22: + seq_printf(seq, "Embedded DDM "); + break; + default: + seq_printf(seq, " "); + } + + seq_printf(seq, "%-#7x", ddm_table.i2o_vendor_id); + seq_printf(seq, "%-#8x", ddm_table.module_id); + seq_printf(seq, "%-29s", + chtostr(tmp, ddm_table.module_name_version, 28)); + seq_printf(seq, "%9d ", ddm_table.data_size); + seq_printf(seq, "%8d", ddm_table.code_size); + + seq_printf(seq, "\n"); + } + out: + kfree(result); + return 0; +} + +/* Executive group 0004h - Driver Store (scalar) */ +static int i2o_seq_show_driver_store(struct seq_file *seq, void *v) +{ + struct i2o_controller *c = (struct i2o_controller *)seq->private; + u32 work32[8]; + int token; + + token = + i2o_parm_field_get(c->exec, 0x0004, -1, &work32, sizeof(work32)); + if (token < 0) { + i2o_report_query_status(seq, token, "0x0004 Driver Store"); + return 0; + } + + seq_printf(seq, "Module limit : %d\n" + "Module count : %d\n" + "Current space : %d kB\n" + "Free space : %d kB\n", + work32[0], work32[1], work32[2] >> 10, work32[3] >> 10); + + return 0; +} + +/* Executive group 0005h - Driver Store Table (table) */ +static int i2o_seq_show_drivers_stored(struct seq_file *seq, void *v) +{ + typedef struct _i2o_driver_store { + u16 stored_ddm_index; + u8 module_type; + u8 reserved; + u16 i2o_vendor_id; + u16 module_id; + u8 module_name_version[28]; + u8 date[8]; + u32 module_size; + u32 mpb_size; + u32 module_flags; + } i2o_driver_store_table; + + struct i2o_controller *c = (struct i2o_controller *)seq->private; + int token; + int i; + + typedef struct { + u16 result_count; + u16 pad; + u16 block_size; + u8 block_status; + u8 error_info_size; + u16 row_count; + u16 more_flag; + i2o_driver_store_table dst[I2O_MAX_MODULES]; + } i2o_driver_result_table; + + i2o_driver_result_table *result; + i2o_driver_store_table *dst; + char tmp[28 + 1]; + + result = kmalloc(sizeof(i2o_driver_result_table), GFP_KERNEL); + if (result == NULL) + return -ENOMEM; + + token = 
i2o_parm_table_get(c->exec, I2O_PARAMS_TABLE_GET, 0x0005, -1, + NULL, 0, result, sizeof(*result)); + + if (token < 0) { + i2o_report_query_status(seq, token, + "0x0005 DRIVER STORE TABLE"); + kfree(result); + return 0; + } + + seq_printf(seq, + "# Module_type Vendor Mod_id Module_name Vrs" + "Date Mod_size Par_size Flags\n"); + for (i = 0, dst = &result->dst[0]; i < result->row_count; + dst = &result->dst[++i]) { + seq_printf(seq, "%-3d", dst->stored_ddm_index); + switch (dst->module_type) { + case 0x01: + seq_printf(seq, "Downloaded DDM "); + break; + case 0x22: + seq_printf(seq, "Embedded DDM "); + break; + default: + seq_printf(seq, " "); + } + + seq_printf(seq, "%-#7x", dst->i2o_vendor_id); + seq_printf(seq, "%-#8x", dst->module_id); + seq_printf(seq, "%-29s", + chtostr(tmp, dst->module_name_version, 28)); + seq_printf(seq, "%-9s", chtostr(tmp, dst->date, 8)); + seq_printf(seq, "%8d ", dst->module_size); + seq_printf(seq, "%8d ", dst->mpb_size); + seq_printf(seq, "0x%04x", dst->module_flags); + seq_printf(seq, "\n"); + } + + kfree(result); + return 0; +} + +/* Generic group F000h - Params Descriptor (table) */ +static int i2o_seq_show_groups(struct seq_file *seq, void *v) +{ + struct i2o_device *d = (struct i2o_device *)seq->private; + int token; + int i; + u8 properties; + + typedef struct _i2o_group_info { + u16 group_number; + u16 field_count; + u16 row_count; + u8 properties; + u8 reserved; + } i2o_group_info; + + struct { + u16 result_count; + u16 pad; + u16 block_size; + u8 block_status; + u8 error_info_size; + u16 row_count; + u16 more_flag; + i2o_group_info group[256]; + } *result; + + result = kmalloc(sizeof(*result), GFP_KERNEL); + if (!result) + return -ENOMEM; + + token = i2o_parm_table_get(d, I2O_PARAMS_TABLE_GET, 0xF000, -1, NULL, 0, + result, sizeof(*result)); + + if (token < 0) { + i2o_report_query_status(seq, token, "0xF000 Params Descriptor"); + goto out; + } + + seq_printf(seq, + "# Group FieldCount RowCount Type Add Del Clear\n"); + + for (i = 0; i < result->row_count; i++) { + seq_printf(seq, "%-3d", i); + seq_printf(seq, "0x%04X ", result->group[i].group_number); + seq_printf(seq, "%10d ", result->group[i].field_count); + seq_printf(seq, "%8d ", result->group[i].row_count); + + properties = result->group[i].properties; + if (properties & 0x1) + seq_printf(seq, "Table "); + else + seq_printf(seq, "Scalar "); + if (properties & 0x2) + seq_printf(seq, " + "); + else + seq_printf(seq, " - "); + if (properties & 0x4) + seq_printf(seq, " + "); + else + seq_printf(seq, " - "); + if (properties & 0x8) + seq_printf(seq, " + "); + else + seq_printf(seq, " - "); + + seq_printf(seq, "\n"); + } + + if (result->more_flag) + seq_printf(seq, "There is more...\n"); + out: + kfree(result); + return 0; +} + +/* Generic group F001h - Physical Device Table (table) */ +static int i2o_seq_show_phys_device(struct seq_file *seq, void *v) +{ + struct i2o_device *d = (struct i2o_device *)seq->private; + int token; + int i; + + struct { + u16 result_count; + u16 pad; + u16 block_size; + u8 block_status; + u8 error_info_size; + u16 row_count; + u16 more_flag; + u32 adapter_id[64]; + } result; + + token = i2o_parm_table_get(d, I2O_PARAMS_TABLE_GET, 0xF001, -1, NULL, 0, + &result, sizeof(result)); + + if (token < 0) { + i2o_report_query_status(seq, token, + "0xF001 Physical Device Table"); + return 0; + } + + if (result.row_count) + seq_printf(seq, "# AdapterId\n"); + + for (i = 0; i < result.row_count; i++) { + seq_printf(seq, "%-2d", i); + seq_printf(seq, "%#7x\n", result.adapter_id[i]); + } 
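+ /* A set more_flag below means the IOP returned only part of the
+ * table; rows beyond this reply's buffer are not fetched here. */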
+ + if (result.more_flag) + seq_printf(seq, "There is more...\n"); + + return 0; +} + +/* Generic group F002h - Claimed Table (table) */ +static int i2o_seq_show_claimed(struct seq_file *seq, void *v) +{ + struct i2o_device *d = (struct i2o_device *)seq->private; + int token; + int i; + + struct { + u16 result_count; + u16 pad; + u16 block_size; + u8 block_status; + u8 error_info_size; + u16 row_count; + u16 more_flag; + u16 claimed_tid[64]; + } result; + + token = i2o_parm_table_get(d, I2O_PARAMS_TABLE_GET, 0xF002, -1, NULL, 0, + &result, sizeof(result)); + + if (token < 0) { + i2o_report_query_status(seq, token, "0xF002 Claimed Table"); + return 0; + } + + if (result.row_count) + seq_printf(seq, "# ClaimedTid\n"); + + for (i = 0; i < result.row_count; i++) { + seq_printf(seq, "%-2d", i); + seq_printf(seq, "%#7x\n", result.claimed_tid[i]); + } + + if (result.more_flag) + seq_printf(seq, "There is more...\n"); + + return 0; +} + +/* Generic group F003h - User Table (table) */ +static int i2o_seq_show_users(struct seq_file *seq, void *v) +{ + struct i2o_device *d = (struct i2o_device *)seq->private; + int token; + int i; + + typedef struct _i2o_user_table { + u16 instance; + u16 user_tid; + u8 claim_type; + u8 reserved1; + u16 reserved2; + } i2o_user_table; + + struct { + u16 result_count; + u16 pad; + u16 block_size; + u8 block_status; + u8 error_info_size; + u16 row_count; + u16 more_flag; + i2o_user_table user[64]; + } *result; + + result = kmalloc(sizeof(*result), GFP_KERNEL); + if (!result) + return -ENOMEM; + + token = i2o_parm_table_get(d, I2O_PARAMS_TABLE_GET, 0xF003, -1, NULL, 0, + result, sizeof(*result)); + + if (token < 0) { + i2o_report_query_status(seq, token, "0xF003 User Table"); + goto out; + } + + seq_printf(seq, "# Instance UserTid ClaimType\n"); + + for (i = 0; i < result->row_count; i++) { + seq_printf(seq, "%-3d", i); + seq_printf(seq, "%#8x ", result->user[i].instance); + seq_printf(seq, "%#7x ", result->user[i].user_tid); + seq_printf(seq, "%#9x\n", result->user[i].claim_type); + } + + if (result->more_flag) + seq_printf(seq, "There is more...\n"); + out: + kfree(result); + return 0; +} + +/* Generic group F005h - Private message extensions (table) (optional) */ +static int i2o_seq_show_priv_msgs(struct seq_file *seq, void *v) +{ + struct i2o_device *d = (struct i2o_device *)seq->private; + int token; + int i; + + typedef struct _i2o_private { + u16 ext_instance; + u16 organization_id; + u16 x_function_code; + } i2o_private; + + struct { + u16 result_count; + u16 pad; + u16 block_size; + u8 block_status; + u8 error_info_size; + u16 row_count; + u16 more_flag; + i2o_private extension[64]; + } result; + + token = i2o_parm_table_get(d, I2O_PARAMS_TABLE_GET, 0xF000, -1, NULL, 0, + &result, sizeof(result)); + + if (token < 0) { + i2o_report_query_status(seq, token, + "0xF005 Private Message Extensions (optional)"); + return 0; + } + + seq_printf(seq, "Instance# OrgId FunctionCode\n"); + + for (i = 0; i < result.row_count; i++) { + seq_printf(seq, "%0#9x ", result.extension[i].ext_instance); + seq_printf(seq, "%0#6x ", result.extension[i].organization_id); + seq_printf(seq, "%0#6x", result.extension[i].x_function_code); + + seq_printf(seq, "\n"); + } + + if (result.more_flag) + seq_printf(seq, "There is more...\n"); + + return 0; +} + +/* Generic group F006h - Authorized User Table (table) */ +static int i2o_seq_show_authorized_users(struct seq_file *seq, void *v) +{ + struct i2o_device *d = (struct i2o_device *)seq->private; + int token; + int i; + + struct { + u16 
result_count;
+ u16 pad;
+ u16 block_size;
+ u8 block_status;
+ u8 error_info_size;
+ u16 row_count;
+ u16 more_flag;
+ u32 alternate_tid[64];
+ } result;
+
+ token = i2o_parm_table_get(d, I2O_PARAMS_TABLE_GET, 0xF006, -1, NULL, 0,
+ &result, sizeof(result));
+
+ if (token < 0) {
+ i2o_report_query_status(seq, token,
+ "0xF006 Authorized User Table");
+ return 0;
+ }
+
+ if (result.row_count)
+ seq_printf(seq, "# AlternateTid\n");
+
+ for (i = 0; i < result.row_count; i++) {
+ seq_printf(seq, "%-2d", i);
+ seq_printf(seq, "%#7x ", result.alternate_tid[i]);
+ }
+
+ if (result.more_flag)
+ seq_printf(seq, "There is more...\n");
+
+ return 0;
+}
+
+/* Generic group F100h - Device Identity (scalar) */
+static int i2o_seq_show_dev_identity(struct seq_file *seq, void *v)
+{
+ struct i2o_device *d = (struct i2o_device *)seq->private;
+ static u32 work32[128]; // allow for "stuff" + up to 256 byte (max) serial number
+ // == (allow) 512d bytes (max)
+ static u16 *work16 = (u16 *) work32;
+ int token;
+ char tmp[16 + 1];
+
+ token = i2o_parm_field_get(d, 0xF100, -1, &work32, sizeof(work32));
+
+ if (token < 0) {
+ i2o_report_query_status(seq, token, "0xF100 Device Identity");
+ return 0;
+ }
+
+ seq_printf(seq, "Device Class : %s\n", i2o_get_class_name(work16[0]));
+ seq_printf(seq, "Owner TID : %0#5x\n", work16[2]);
+ seq_printf(seq, "Parent TID : %0#5x\n", work16[3]);
+ seq_printf(seq, "Vendor info : %s\n",
+ chtostr(tmp, (u8 *) (work32 + 2), 16));
+ seq_printf(seq, "Product info : %s\n",
+ chtostr(tmp, (u8 *) (work32 + 6), 16));
+ seq_printf(seq, "Description : %s\n",
+ chtostr(tmp, (u8 *) (work32 + 10), 16));
+ seq_printf(seq, "Product rev. : %s\n",
+ chtostr(tmp, (u8 *) (work32 + 14), 8));
+
+ seq_printf(seq, "Serial number : ");
+ print_serial_number(seq, (u8 *) (work32 + 16),
+ /* allow for SNLen plus
+ * possible trailing '\0'
+ */
+ sizeof(work32) - (16 * sizeof(u32)) - 2);
+ seq_printf(seq, "\n");
+
+ return 0;
+}
+
+static int i2o_seq_show_dev_name(struct seq_file *seq, void *v)
+{
+ struct i2o_device *d = (struct i2o_device *)seq->private;
+
+ seq_printf(seq, "%s\n", dev_name(&d->device));
+
+ return 0;
+}
+
+/* Generic group F101h - DDM Identity (scalar) */
+static int i2o_seq_show_ddm_identity(struct seq_file *seq, void *v)
+{
+ struct i2o_device *d = (struct i2o_device *)seq->private;
+ int token;
+
+ struct {
+ u16 ddm_tid;
+ u8 module_name[24];
+ u8 module_rev[8];
+ u8 sn_format;
+ u8 serial_number[12];
+ u8 pad[256]; // allow up to 256 byte (max) serial number
+ } result;
+
+ char tmp[24 + 1];
+
+ token = i2o_parm_field_get(d, 0xF101, -1, &result, sizeof(result));
+
+ if (token < 0) {
+ i2o_report_query_status(seq, token, "0xF101 DDM Identity");
+ return 0;
+ }
+
+ seq_printf(seq, "Registering DDM TID : 0x%03x\n", result.ddm_tid);
+ seq_printf(seq, "Module name : %s\n",
+ chtostr(tmp, result.module_name, 24));
+ seq_printf(seq, "Module revision : %s\n",
+ chtostr(tmp, result.module_rev, 8));
+
+ seq_printf(seq, "Serial number : ");
+ print_serial_number(seq, result.serial_number, sizeof(result) - 36);
+ /* allow for SNLen plus possible trailing '\0' */
+
+ seq_printf(seq, "\n");
+
+ return 0;
+}
+
+/* Generic group F102h - User Information (scalar) */
+static int i2o_seq_show_uinfo(struct seq_file *seq, void *v)
+{
+ struct i2o_device *d = (struct i2o_device *)seq->private;
+ int token;
+
+ struct {
+ u8 device_name[64];
+ u8 service_name[64];
+ u8 physical_location[64];
+ u8 instance_number[4];
+ } result;
+
+ char tmp[64 + 1];
+
+ token = i2o_parm_field_get(d, 0xF102, -1,
&result, sizeof(result));
+
+ if (token < 0) {
+ i2o_report_query_status(seq, token, "0xF102 User Information");
+ return 0;
+ }
+
+ seq_printf(seq, "Device name : %s\n",
+ chtostr(tmp, result.device_name, 64));
+ seq_printf(seq, "Service name : %s\n",
+ chtostr(tmp, result.service_name, 64));
+ seq_printf(seq, "Physical name : %s\n",
+ chtostr(tmp, result.physical_location, 64));
+ seq_printf(seq, "Instance number : %s\n",
+ chtostr(tmp, result.instance_number, 4));
+
+ return 0;
+}
+
+/* Generic group F103h - SGL Operating Limits (scalar) */
+static int i2o_seq_show_sgl_limits(struct seq_file *seq, void *v)
+{
+ struct i2o_device *d = (struct i2o_device *)seq->private;
+ static u32 work32[12];
+ static u16 *work16 = (u16 *) work32;
+ static u8 *work8 = (u8 *) work32;
+ int token;
+
+ token = i2o_parm_field_get(d, 0xF103, -1, &work32, sizeof(work32));
+
+ if (token < 0) {
+ i2o_report_query_status(seq, token,
+ "0xF103 SGL Operating Limits");
+ return 0;
+ }
+
+ seq_printf(seq, "SGL chain size : %d\n", work32[0]);
+ seq_printf(seq, "Max SGL chain size : %d\n", work32[1]);
+ seq_printf(seq, "SGL chain size target : %d\n", work32[2]);
+ seq_printf(seq, "SGL frag count : %d\n", work16[6]);
+ seq_printf(seq, "Max SGL frag count : %d\n", work16[7]);
+ seq_printf(seq, "SGL frag count target : %d\n", work16[8]);
+
+/* FIXME
+ if (d->i2oversion == 0x02)
+ {
+*/
+ seq_printf(seq, "SGL data alignment : %d\n", work16[8]);
+ seq_printf(seq, "SGL addr limit : %d\n", work8[20]);
+ seq_printf(seq, "SGL addr sizes supported : ");
+ if (work8[21] & 0x01)
+ seq_printf(seq, "32 bit ");
+ if (work8[21] & 0x02)
+ seq_printf(seq, "64 bit ");
+ if (work8[21] & 0x04)
+ seq_printf(seq, "96 bit ");
+ if (work8[21] & 0x08)
+ seq_printf(seq, "128 bit ");
+ seq_printf(seq, "\n");
+/*
+ }
+*/
+
+ return 0;
+}
+
+/* Generic group F200h - Sensors (scalar) */
+static int i2o_seq_show_sensors(struct seq_file *seq, void *v)
+{
+ struct i2o_device *d = (struct i2o_device *)seq->private;
+ int token;
+
+ struct {
+ u16 sensor_instance;
+ u8 component;
+ u16 component_instance;
+ u8 sensor_class;
+ u8 sensor_type;
+ u8 scaling_exponent;
+ u32 actual_reading;
+ u32 minimum_reading;
+ u32 low2lowcat_threshold;
+ u32 lowcat2low_threshold;
+ u32 lowwarn2low_threshold;
+ u32 low2lowwarn_threshold;
+ u32 norm2lowwarn_threshold;
+ u32 lowwarn2norm_threshold;
+ u32 nominal_reading;
+ u32 hiwarn2norm_threshold;
+ u32 norm2hiwarn_threshold;
+ u32 high2hiwarn_threshold;
+ u32 hiwarn2high_threshold;
+ u32 hicat2high_threshold;
+ u32 hi2hicat_threshold;
+ u32 maximum_reading;
+ u8 sensor_state;
+ u16 event_enable;
+ } result;
+
+ token = i2o_parm_field_get(d, 0xF200, -1, &result, sizeof(result));
+
+ if (token < 0) {
+ i2o_report_query_status(seq, token,
+ "0xF200 Sensors (optional)");
+ return 0;
+ }
+
+ seq_printf(seq, "Sensor instance : %d\n", result.sensor_instance);
+
+ seq_printf(seq, "Component : %d = ", result.component);
+ switch (result.component) {
+ case 0:
+ seq_printf(seq, "Other");
+ break;
+ case 1:
+ seq_printf(seq, "Planar logic Board");
+ break;
+ case 2:
+ seq_printf(seq, "CPU");
+ break;
+ case 3:
+ seq_printf(seq, "Chassis");
+ break;
+ case 4:
+ seq_printf(seq, "Power Supply");
+ break;
+ case 5:
+ seq_printf(seq, "Storage");
+ break;
+ case 6:
+ seq_printf(seq, "External");
+ break;
+ }
+ seq_printf(seq, "\n");
+
+ seq_printf(seq, "Component instance : %d\n",
+ result.component_instance);
+ seq_printf(seq, "Sensor class : %s\n",
+ result.sensor_class ? "Analog" : "Digital");
+
+ seq_printf(seq, "Sensor type : %d = ", result.sensor_type);
+ switch (result.sensor_type) {
+ case 0:
+ seq_printf(seq, "Other\n");
+ break;
+ case 1:
+ seq_printf(seq, "Thermal\n");
+ break;
+ case 2:
+ seq_printf(seq, "DC voltage (DC volts)\n");
+ break;
+ case 3:
+ seq_printf(seq, "AC voltage (AC volts)\n");
+ break;
+ case 4:
+ seq_printf(seq, "DC current (DC amps)\n");
+ break;
+ case 5:
+ seq_printf(seq, "AC current (AC amps)\n");
+ break;
+ case 6:
+ seq_printf(seq, "Door open\n");
+ break;
+ case 7:
+ seq_printf(seq, "Fan operational\n");
+ break;
+ }
+
+ seq_printf(seq, "Scaling exponent : %d\n",
+ result.scaling_exponent);
+ seq_printf(seq, "Actual reading : %d\n", result.actual_reading);
+ seq_printf(seq, "Minimum reading : %d\n", result.minimum_reading);
+ seq_printf(seq, "Low2LowCat threshold : %d\n",
+ result.low2lowcat_threshold);
+ seq_printf(seq, "LowCat2Low threshold : %d\n",
+ result.lowcat2low_threshold);
+ seq_printf(seq, "LowWarn2Low threshold : %d\n",
+ result.lowwarn2low_threshold);
+ seq_printf(seq, "Low2LowWarn threshold : %d\n",
+ result.low2lowwarn_threshold);
+ seq_printf(seq, "Norm2LowWarn threshold : %d\n",
+ result.norm2lowwarn_threshold);
+ seq_printf(seq, "LowWarn2Norm threshold : %d\n",
+ result.lowwarn2norm_threshold);
+ seq_printf(seq, "Nominal reading : %d\n", result.nominal_reading);
+ seq_printf(seq, "HiWarn2Norm threshold : %d\n",
+ result.hiwarn2norm_threshold);
+ seq_printf(seq, "Norm2HiWarn threshold : %d\n",
+ result.norm2hiwarn_threshold);
+ seq_printf(seq, "High2HiWarn threshold : %d\n",
+ result.high2hiwarn_threshold);
+ seq_printf(seq, "HiWarn2High threshold : %d\n",
+ result.hiwarn2high_threshold);
+ seq_printf(seq, "HiCat2High threshold : %d\n",
+ result.hicat2high_threshold);
+ seq_printf(seq, "High2HiCat threshold : %d\n",
+ result.hi2hicat_threshold);
+ seq_printf(seq, "Maximum reading : %d\n", result.maximum_reading);
+
+ seq_printf(seq, "Sensor state : %d = ", result.sensor_state);
+ switch (result.sensor_state) {
+ case 0:
+ seq_printf(seq, "Normal\n");
+ break;
+ case 1:
+ seq_printf(seq, "Abnormal\n");
+ break;
+ case 2:
+ seq_printf(seq, "Unknown\n");
+ break;
+ case 3:
+ seq_printf(seq, "Low Catastrophic (LoCat)\n");
+ break;
+ case 4:
+ seq_printf(seq, "Low (Low)\n");
+ break;
+ case 5:
+ seq_printf(seq, "Low Warning (LoWarn)\n");
+ break;
+ case 6:
+ seq_printf(seq, "High Warning (HiWarn)\n");
+ break;
+ case 7:
+ seq_printf(seq, "High (High)\n");
+ break;
+ case 8:
+ seq_printf(seq, "High Catastrophic (HiCat)\n");
+ break;
+ }
+
+ seq_printf(seq, "Event_enable : 0x%02X\n", result.event_enable);
+ seq_printf(seq, " [%s] Operational state change. \n",
+ (result.event_enable & 0x01) ? "+" : "-");
+ seq_printf(seq, " [%s] Low catastrophic. \n",
+ (result.event_enable & 0x02) ? "+" : "-");
+ seq_printf(seq, " [%s] Low reading. \n",
+ (result.event_enable & 0x04) ? "+" : "-");
+ seq_printf(seq, " [%s] Low warning. \n",
+ (result.event_enable & 0x08) ? "+" : "-");
+ seq_printf(seq,
+ " [%s] Change back to normal from out of range state. \n",
+ (result.event_enable & 0x10) ? "+" : "-");
+ seq_printf(seq, " [%s] High warning. \n",
+ (result.event_enable & 0x20) ? "+" : "-");
+ seq_printf(seq, " [%s] High reading. \n",
+ (result.event_enable & 0x40) ? "+" : "-");
+ seq_printf(seq, " [%s] High catastrophic. \n",
+ (result.event_enable & 0x80) ?
"+" : "-"); + + return 0; +} + +static int i2o_seq_open_hrt(struct inode *inode, struct file *file) +{ + return single_open(file, i2o_seq_show_hrt, PDE_DATA(inode)); +}; + +static int i2o_seq_open_lct(struct inode *inode, struct file *file) +{ + return single_open(file, i2o_seq_show_lct, PDE_DATA(inode)); +}; + +static int i2o_seq_open_status(struct inode *inode, struct file *file) +{ + return single_open(file, i2o_seq_show_status, PDE_DATA(inode)); +}; + +static int i2o_seq_open_hw(struct inode *inode, struct file *file) +{ + return single_open(file, i2o_seq_show_hw, PDE_DATA(inode)); +}; + +static int i2o_seq_open_ddm_table(struct inode *inode, struct file *file) +{ + return single_open(file, i2o_seq_show_ddm_table, PDE_DATA(inode)); +}; + +static int i2o_seq_open_driver_store(struct inode *inode, struct file *file) +{ + return single_open(file, i2o_seq_show_driver_store, PDE_DATA(inode)); +}; + +static int i2o_seq_open_drivers_stored(struct inode *inode, struct file *file) +{ + return single_open(file, i2o_seq_show_drivers_stored, PDE_DATA(inode)); +}; + +static int i2o_seq_open_groups(struct inode *inode, struct file *file) +{ + return single_open(file, i2o_seq_show_groups, PDE_DATA(inode)); +}; + +static int i2o_seq_open_phys_device(struct inode *inode, struct file *file) +{ + return single_open(file, i2o_seq_show_phys_device, PDE_DATA(inode)); +}; + +static int i2o_seq_open_claimed(struct inode *inode, struct file *file) +{ + return single_open(file, i2o_seq_show_claimed, PDE_DATA(inode)); +}; + +static int i2o_seq_open_users(struct inode *inode, struct file *file) +{ + return single_open(file, i2o_seq_show_users, PDE_DATA(inode)); +}; + +static int i2o_seq_open_priv_msgs(struct inode *inode, struct file *file) +{ + return single_open(file, i2o_seq_show_priv_msgs, PDE_DATA(inode)); +}; + +static int i2o_seq_open_authorized_users(struct inode *inode, struct file *file) +{ + return single_open(file, i2o_seq_show_authorized_users, + PDE_DATA(inode)); +}; + +static int i2o_seq_open_dev_identity(struct inode *inode, struct file *file) +{ + return single_open(file, i2o_seq_show_dev_identity, PDE_DATA(inode)); +}; + +static int i2o_seq_open_ddm_identity(struct inode *inode, struct file *file) +{ + return single_open(file, i2o_seq_show_ddm_identity, PDE_DATA(inode)); +}; + +static int i2o_seq_open_uinfo(struct inode *inode, struct file *file) +{ + return single_open(file, i2o_seq_show_uinfo, PDE_DATA(inode)); +}; + +static int i2o_seq_open_sgl_limits(struct inode *inode, struct file *file) +{ + return single_open(file, i2o_seq_show_sgl_limits, PDE_DATA(inode)); +}; + +static int i2o_seq_open_sensors(struct inode *inode, struct file *file) +{ + return single_open(file, i2o_seq_show_sensors, PDE_DATA(inode)); +}; + +static int i2o_seq_open_dev_name(struct inode *inode, struct file *file) +{ + return single_open(file, i2o_seq_show_dev_name, PDE_DATA(inode)); +}; + +static const struct file_operations i2o_seq_fops_lct = { + .open = i2o_seq_open_lct, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations i2o_seq_fops_hrt = { + .open = i2o_seq_open_hrt, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations i2o_seq_fops_status = { + .open = i2o_seq_open_status, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations i2o_seq_fops_hw = { + .open = i2o_seq_open_hw, + .read = seq_read, + .llseek = seq_lseek, + .release = 
single_release, +}; + +static const struct file_operations i2o_seq_fops_ddm_table = { + .open = i2o_seq_open_ddm_table, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations i2o_seq_fops_driver_store = { + .open = i2o_seq_open_driver_store, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations i2o_seq_fops_drivers_stored = { + .open = i2o_seq_open_drivers_stored, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations i2o_seq_fops_groups = { + .open = i2o_seq_open_groups, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations i2o_seq_fops_phys_device = { + .open = i2o_seq_open_phys_device, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations i2o_seq_fops_claimed = { + .open = i2o_seq_open_claimed, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations i2o_seq_fops_users = { + .open = i2o_seq_open_users, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations i2o_seq_fops_priv_msgs = { + .open = i2o_seq_open_priv_msgs, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations i2o_seq_fops_authorized_users = { + .open = i2o_seq_open_authorized_users, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations i2o_seq_fops_dev_name = { + .open = i2o_seq_open_dev_name, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations i2o_seq_fops_dev_identity = { + .open = i2o_seq_open_dev_identity, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations i2o_seq_fops_ddm_identity = { + .open = i2o_seq_open_ddm_identity, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations i2o_seq_fops_uinfo = { + .open = i2o_seq_open_uinfo, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations i2o_seq_fops_sgl_limits = { + .open = i2o_seq_open_sgl_limits, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations i2o_seq_fops_sensors = { + .open = i2o_seq_open_sensors, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/* + * IOP specific entries...write field just in case someone + * ever wants one. 
+ */ +static i2o_proc_entry i2o_proc_generic_iop_entries[] = { + {"hrt", S_IFREG | S_IRUGO, &i2o_seq_fops_hrt}, + {"lct", S_IFREG | S_IRUGO, &i2o_seq_fops_lct}, + {"status", S_IFREG | S_IRUGO, &i2o_seq_fops_status}, + {"hw", S_IFREG | S_IRUGO, &i2o_seq_fops_hw}, + {"ddm_table", S_IFREG | S_IRUGO, &i2o_seq_fops_ddm_table}, + {"driver_store", S_IFREG | S_IRUGO, &i2o_seq_fops_driver_store}, + {"drivers_stored", S_IFREG | S_IRUGO, &i2o_seq_fops_drivers_stored}, + {NULL, 0, NULL} +}; + +/* + * Device specific entries + */ +static i2o_proc_entry generic_dev_entries[] = { + {"groups", S_IFREG | S_IRUGO, &i2o_seq_fops_groups}, + {"phys_dev", S_IFREG | S_IRUGO, &i2o_seq_fops_phys_device}, + {"claimed", S_IFREG | S_IRUGO, &i2o_seq_fops_claimed}, + {"users", S_IFREG | S_IRUGO, &i2o_seq_fops_users}, + {"priv_msgs", S_IFREG | S_IRUGO, &i2o_seq_fops_priv_msgs}, + {"authorized_users", S_IFREG | S_IRUGO, &i2o_seq_fops_authorized_users}, + {"dev_identity", S_IFREG | S_IRUGO, &i2o_seq_fops_dev_identity}, + {"ddm_identity", S_IFREG | S_IRUGO, &i2o_seq_fops_ddm_identity}, + {"user_info", S_IFREG | S_IRUGO, &i2o_seq_fops_uinfo}, + {"sgl_limits", S_IFREG | S_IRUGO, &i2o_seq_fops_sgl_limits}, + {"sensors", S_IFREG | S_IRUGO, &i2o_seq_fops_sensors}, + {NULL, 0, NULL} +}; + +/* + * Storage unit specific entries (SCSI Periph, BS) with device names + */ +static i2o_proc_entry rbs_dev_entries[] = { + {"dev_name", S_IFREG | S_IRUGO, &i2o_seq_fops_dev_name}, + {NULL, 0, NULL} +}; + +/** + * i2o_proc_create_entries - Creates proc dir entries + * @dir: proc dir entry under which the entries should be placed + * @i2o_pe: pointer to the entries which should be added + * @data: pointer to I2O controller or device + * + * Create proc dir entries for a I2O controller or I2O device. + * + * Returns 0 on success or negative error code on failure. + */ +static int i2o_proc_create_entries(struct proc_dir_entry *dir, + i2o_proc_entry * i2o_pe, void *data) +{ + struct proc_dir_entry *tmp; + + while (i2o_pe->name) { + tmp = proc_create_data(i2o_pe->name, i2o_pe->mode, dir, + i2o_pe->fops, data); + if (!tmp) + return -1; + + i2o_pe++; + } + + return 0; +} + +/** + * i2o_proc_device_add - Add an I2O device to the proc dir + * @dir: proc dir entry to which the device should be added + * @dev: I2O device which should be added + * + * Add an I2O device to the proc dir entry dir and create the entries for + * the device depending on the class of the I2O device. + */ +static void i2o_proc_device_add(struct proc_dir_entry *dir, + struct i2o_device *dev) +{ + char buff[10]; + struct proc_dir_entry *devdir; + i2o_proc_entry *i2o_pe = NULL; + + sprintf(buff, "%03x", dev->lct_data.tid); + + osm_debug("adding device /proc/i2o/%s/%s\n", dev->iop->name, buff); + + devdir = proc_mkdir_data(buff, 0, dir, dev); + if (!devdir) { + osm_warn("Could not allocate procdir!\n"); + return; + } + + i2o_proc_create_entries(devdir, generic_dev_entries, dev); + + /* Inform core that we want updates about this device's status */ + switch (dev->lct_data.class_id) { + case I2O_CLASS_SCSI_PERIPHERAL: + case I2O_CLASS_RANDOM_BLOCK_STORAGE: + i2o_pe = rbs_dev_entries; + break; + default: + break; + } + if (i2o_pe) + i2o_proc_create_entries(devdir, i2o_pe, dev); +} + +/** + * i2o_proc_iop_add - Add an I2O controller to the i2o proc tree + * @dir: parent proc dir entry + * @c: I2O controller which should be added + * + * Add the entries to the parent proc dir entry. Also each device is added + * to the controllers proc dir entry. 
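The NULL-terminated i2o_proc_entry tables feed the single registration loop in i2o_proc_create_entries() above, so exposing one more read-only file only takes another show function and one extra table row. A minimal sketch of what that would look like; the "foo" entry, its show function, and the state it prints are hypothetical:

/* hypothetical extra entry, registered with the same helper as above */
static int i2o_seq_show_foo(struct seq_file *seq, void *v)
{
	struct i2o_controller *c = seq->private;	/* set via PDE_DATA() */

	seq_printf(seq, "%s: example output\n", c->name);	/* made up */
	return 0;
}

static int i2o_seq_open_foo(struct inode *inode, struct file *file)
{
	return single_open(file, i2o_seq_show_foo, PDE_DATA(inode));
}

static const struct file_operations i2o_seq_fops_foo = {
	.open = i2o_seq_open_foo,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

static i2o_proc_entry foo_entries[] = {
	{"foo", S_IFREG | S_IRUGO, &i2o_seq_fops_foo},
	{NULL, 0, NULL}
};

Registration would then be a single i2o_proc_create_entries(dir, foo_entries, c) call.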
+ * + * Returns 0 on success or negative error code on failure. + */ +static int i2o_proc_iop_add(struct proc_dir_entry *dir, + struct i2o_controller *c) +{ + struct proc_dir_entry *iopdir; + struct i2o_device *dev; + + osm_debug("adding IOP /proc/i2o/%s\n", c->name); + + iopdir = proc_mkdir_data(c->name, 0, dir, c); + if (!iopdir) + return -1; + + i2o_proc_create_entries(iopdir, i2o_proc_generic_iop_entries, c); + + list_for_each_entry(dev, &c->devices, list) + i2o_proc_device_add(iopdir, dev); + + return 0; +} + +/** + * i2o_proc_fs_create - Create the i2o proc fs. + * + * Iterate over each I2O controller and create the entries for it. + * + * Returns 0 on success or negative error code on failure. + */ +static int __init i2o_proc_fs_create(void) +{ + struct i2o_controller *c; + + i2o_proc_dir_root = proc_mkdir("i2o", NULL); + if (!i2o_proc_dir_root) + return -1; + + list_for_each_entry(c, &i2o_controllers, list) + i2o_proc_iop_add(i2o_proc_dir_root, c); + + return 0; +}; + +/** + * i2o_proc_fs_destroy - Cleanup the all i2o proc entries + * + * Iterate over each I2O controller and remove the entries for it. + * + * Returns 0 on success or negative error code on failure. + */ +static int __exit i2o_proc_fs_destroy(void) +{ + remove_proc_subtree("i2o", NULL); + + return 0; +}; + +/** + * i2o_proc_init - Init function for procfs + * + * Registers Proc OSM and creates procfs entries. + * + * Returns 0 on success or negative error code on failure. + */ +static int __init i2o_proc_init(void) +{ + int rc; + + printk(KERN_INFO OSM_DESCRIPTION " v" OSM_VERSION "\n"); + + rc = i2o_driver_register(&i2o_proc_driver); + if (rc) + return rc; + + rc = i2o_proc_fs_create(); + if (rc) { + i2o_driver_unregister(&i2o_proc_driver); + return rc; + } + + return 0; +}; + +/** + * i2o_proc_exit - Exit function for procfs + * + * Unregisters Proc OSM and removes procfs entries. + */ +static void __exit i2o_proc_exit(void) +{ + i2o_driver_unregister(&i2o_proc_driver); + i2o_proc_fs_destroy(); +}; + +MODULE_AUTHOR("Deepak Saxena"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION(OSM_DESCRIPTION); +MODULE_VERSION(OSM_VERSION); + +module_init(i2o_proc_init); +module_exit(i2o_proc_exit); diff --git a/drivers/staging/i2o/i2o_scsi.c b/drivers/staging/i2o/i2o_scsi.c new file mode 100644 index 000000000000..1b11dcb3faea --- /dev/null +++ b/drivers/staging/i2o/i2o_scsi.c @@ -0,0 +1,814 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * For the avoidance of doubt the "preferred form" of this code is one which + * is in an open non patent encumbered format. Where cryptographic key signing + * forms part of the process of creating an executable the information + * including keys needed to generate an equivalently functional executable + * are deemed to be part of the source code. + * + * Complications for I2O scsi + * + * o Each (bus,lun) is a logical device in I2O. We keep a map + * table. We spoof failed selection for unmapped units + * o Request sense buffers can come back for free. + * o Scatter gather is a bit dynamic. We have to investigate at + * setup time. 
+ * o Some of our resources are dynamically shared. The i2o core + * needs a message reservation protocol to avoid swap v net + * deadlocking. We need to back off queue requests. + * + * In general the firmware wants to help. Where its help isn't performance + * useful we just ignore the aid. Its not worth the code in truth. + * + * Fixes/additions: + * Steve Ralston: + * Scatter gather now works + * Markus Lidel : + * Minor fixes for 2.6. + * + * To Do: + * 64bit cleanups + * Fix the resource management problems. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "i2o.h" +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#define OSM_NAME "scsi-osm" +#define OSM_VERSION "1.316" +#define OSM_DESCRIPTION "I2O SCSI Peripheral OSM" + +static struct i2o_driver i2o_scsi_driver; + +static unsigned int i2o_scsi_max_id = 16; +static unsigned int i2o_scsi_max_lun = 255; + +struct i2o_scsi_host { + struct Scsi_Host *scsi_host; /* pointer to the SCSI host */ + struct i2o_controller *iop; /* pointer to the I2O controller */ + u64 lun; /* lun's used for block devices */ + struct i2o_device *channel[0]; /* channel->i2o_dev mapping table */ +}; + +static struct scsi_host_template i2o_scsi_host_template; + +#define I2O_SCSI_CAN_QUEUE 4 + +/* SCSI OSM class handling definition */ +static struct i2o_class_id i2o_scsi_class_id[] = { + {I2O_CLASS_SCSI_PERIPHERAL}, + {I2O_CLASS_END} +}; + +static struct i2o_scsi_host *i2o_scsi_host_alloc(struct i2o_controller *c) +{ + struct i2o_scsi_host *i2o_shost; + struct i2o_device *i2o_dev; + struct Scsi_Host *scsi_host; + int max_channel = 0; + u8 type; + int i; + size_t size; + u16 body_size = 6; + +#ifdef CONFIG_I2O_EXT_ADAPTEC + if (c->adaptec) + body_size = 8; +#endif + + list_for_each_entry(i2o_dev, &c->devices, list) + if (i2o_dev->lct_data.class_id == I2O_CLASS_BUS_ADAPTER) { + if (!i2o_parm_field_get(i2o_dev, 0x0000, 0, &type, 1) + && (type == 0x01)) /* SCSI bus */ + max_channel++; + } + + if (!max_channel) { + osm_warn("no channels found on %s\n", c->name); + return ERR_PTR(-EFAULT); + } + + size = max_channel * sizeof(struct i2o_device *) + + sizeof(struct i2o_scsi_host); + + scsi_host = scsi_host_alloc(&i2o_scsi_host_template, size); + if (!scsi_host) { + osm_warn("Could not allocate SCSI host\n"); + return ERR_PTR(-ENOMEM); + } + + scsi_host->max_channel = max_channel - 1; + scsi_host->max_id = i2o_scsi_max_id; + scsi_host->max_lun = i2o_scsi_max_lun; + scsi_host->this_id = c->unit; + scsi_host->sg_tablesize = i2o_sg_tablesize(c, body_size); + + i2o_shost = (struct i2o_scsi_host *)scsi_host->hostdata; + i2o_shost->scsi_host = scsi_host; + i2o_shost->iop = c; + i2o_shost->lun = 1; + + i = 0; + list_for_each_entry(i2o_dev, &c->devices, list) + if (i2o_dev->lct_data.class_id == I2O_CLASS_BUS_ADAPTER) { + if (!i2o_parm_field_get(i2o_dev, 0x0000, 0, &type, 1) + && (type == 0x01)) /* only SCSI bus */ + i2o_shost->channel[i++] = i2o_dev; + + if (i >= max_channel) + break; + } + + return i2o_shost; +}; + +/** + * i2o_scsi_get_host - Get an I2O SCSI host + * @c: I2O controller to for which to get the SCSI host + * + * If the I2O controller already exists as SCSI host, the SCSI host + * is returned, otherwise the I2O controller is added to the SCSI + * core. + * + * Returns pointer to the I2O SCSI host on success or NULL on failure. 
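The sizing in i2o_scsi_host_alloc() above packs the per-channel device table directly behind struct i2o_scsi_host inside the Scsi_Host private data, so one allocation covers both the header and the table. A self-contained userspace illustration of the same trailing-array sizing idiom (all names and types here are invented for the example):

#include <stdio.h>
#include <stdlib.h>

struct host {
	int nchan;
	void *channel[];	/* flexible array member */
};

static struct host *host_alloc(int nchan)
{
	/* one allocation: header plus nchan trailing pointers */
	struct host *h = calloc(1, sizeof(*h) + nchan * sizeof(void *));

	if (h)
		h->nchan = nchan;
	return h;
}

int main(void)
{
	struct host *h = host_alloc(4);

	if (!h)
		return 1;
	h->channel[3] = &h->nchan;	/* indices 0 .. nchan-1 are valid */
	printf("allocated %d channels\n", h->nchan);
	free(h);
	return 0;
}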
+ */ +static struct i2o_scsi_host *i2o_scsi_get_host(struct i2o_controller *c) +{ + return c->driver_data[i2o_scsi_driver.context]; +}; + +/** + * i2o_scsi_remove - Remove I2O device from SCSI core + * @dev: device which should be removed + * + * Removes the I2O device from the SCSI core again. + * + * Returns 0 on success. + */ +static int i2o_scsi_remove(struct device *dev) +{ + struct i2o_device *i2o_dev = to_i2o_device(dev); + struct i2o_controller *c = i2o_dev->iop; + struct i2o_scsi_host *i2o_shost; + struct scsi_device *scsi_dev; + + osm_info("device removed (TID: %03x)\n", i2o_dev->lct_data.tid); + + i2o_shost = i2o_scsi_get_host(c); + + shost_for_each_device(scsi_dev, i2o_shost->scsi_host) + if (scsi_dev->hostdata == i2o_dev) { + sysfs_remove_link(&i2o_dev->device.kobj, "scsi"); + scsi_remove_device(scsi_dev); + scsi_device_put(scsi_dev); + break; + } + + return 0; +}; + +/** + * i2o_scsi_probe - verify if dev is a I2O SCSI device and install it + * @dev: device to verify if it is a I2O SCSI device + * + * Retrieve channel, id and lun for I2O device. If everything goes well + * register the I2O device as SCSI device on the I2O SCSI controller. + * + * Returns 0 on success or negative error code on failure. + */ +static int i2o_scsi_probe(struct device *dev) +{ + struct i2o_device *i2o_dev = to_i2o_device(dev); + struct i2o_controller *c = i2o_dev->iop; + struct i2o_scsi_host *i2o_shost; + struct Scsi_Host *scsi_host; + struct i2o_device *parent; + struct scsi_device *scsi_dev; + u32 id = -1; + u64 lun = -1; + int channel = -1; + int i, rc; + + i2o_shost = i2o_scsi_get_host(c); + if (!i2o_shost) + return -EFAULT; + + scsi_host = i2o_shost->scsi_host; + + switch (i2o_dev->lct_data.class_id) { + case I2O_CLASS_RANDOM_BLOCK_STORAGE: + case I2O_CLASS_EXECUTIVE: +#ifdef CONFIG_I2O_EXT_ADAPTEC + if (c->adaptec) { + u8 type; + struct i2o_device *d = i2o_shost->channel[0]; + + if (!i2o_parm_field_get(d, 0x0000, 0, &type, 1) + && (type == 0x01)) /* SCSI bus */ + if (!i2o_parm_field_get(d, 0x0200, 4, &id, 4)) { + channel = 0; + if (i2o_dev->lct_data.class_id == + I2O_CLASS_RANDOM_BLOCK_STORAGE) + lun = + cpu_to_le64(i2o_shost-> + lun++); + else + lun = 0; + } + } +#endif + break; + + case I2O_CLASS_SCSI_PERIPHERAL: + if (i2o_parm_field_get(i2o_dev, 0x0000, 3, &id, 4)) + return -EFAULT; + + if (i2o_parm_field_get(i2o_dev, 0x0000, 4, &lun, 8)) + return -EFAULT; + + parent = i2o_iop_find_device(c, i2o_dev->lct_data.parent_tid); + if (!parent) { + osm_warn("can not find parent of device %03x\n", + i2o_dev->lct_data.tid); + return -EFAULT; + } + + for (i = 0; i <= i2o_shost->scsi_host->max_channel; i++) + if (i2o_shost->channel[i] == parent) + channel = i; + break; + + default: + return -EFAULT; + } + + if (channel == -1) { + osm_warn("can not find channel of device %03x\n", + i2o_dev->lct_data.tid); + return -EFAULT; + } + + if (le32_to_cpu(id) >= scsi_host->max_id) { + osm_warn("SCSI device id (%d) >= max_id of I2O host (%d)", + le32_to_cpu(id), scsi_host->max_id); + return -EFAULT; + } + + if (le64_to_cpu(lun) >= scsi_host->max_lun) { + osm_warn("SCSI device lun (%llu) >= max_lun of I2O host (%llu)", + le64_to_cpu(lun), scsi_host->max_lun); + return -EFAULT; + } + + scsi_dev = + __scsi_add_device(i2o_shost->scsi_host, channel, le32_to_cpu(id), + le64_to_cpu(lun), i2o_dev); + + if (IS_ERR(scsi_dev)) { + osm_warn("can not add SCSI device %03x\n", + i2o_dev->lct_data.tid); + return PTR_ERR(scsi_dev); + } + + rc = sysfs_create_link(&i2o_dev->device.kobj, + &scsi_dev->sdev_gendev.kobj, "scsi"); + 
if (rc)
+		goto err;
+
+	osm_info("device added (TID: %03x) channel: %d, id: %d, lun: %llu\n",
+		 i2o_dev->lct_data.tid, channel, le32_to_cpu(id),
+		 le64_to_cpu(lun));
+
+	return 0;
+
+err:
+	scsi_remove_device(scsi_dev);
+	return rc;
+};
+
+static const char *i2o_scsi_info(struct Scsi_Host *SChost)
+{
+	struct i2o_scsi_host *hostdata;
+
+	hostdata = (struct i2o_scsi_host *)SChost->hostdata;
+	return hostdata->iop->name;
+}
+
+/**
+ * i2o_scsi_reply - SCSI OSM message reply handler
+ * @c: controller issuing the reply
+ * @m: message id for flushing
+ * @msg: the message from the controller
+ *
+ * Process reply messages (interrupts in normal scsi controller think).
+ * We can get a variety of messages to process. The normal path is
+ * scsi command completions. We must also deal with IOP failures,
+ * the reply to a bus reset and the reply to a LUN query.
+ *
+ * Returns 0 on success and if the reply should not be flushed or > 0
+ * on success and if the reply should be flushed. Returns negative error
+ * code on failure and if the reply should be flushed.
+ */
+static int i2o_scsi_reply(struct i2o_controller *c, u32 m,
+			  struct i2o_message *msg)
+{
+	struct scsi_cmnd *cmd;
+	u32 error;
+
+	cmd = i2o_cntxt_list_get(c, le32_to_cpu(msg->u.s.tcntxt));
+	if (unlikely(!cmd)) {
+		osm_err("NULL reply received!\n");
+		return -1;
+	}
+
+	/*
+	 * Low byte is device status, next is adapter status,
+	 * (then one byte reserved), then request status.
+	 */
+	error = le32_to_cpu(msg->body[0]);
+
+	osm_debug("Completed %p\n", cmd);
+
+	cmd->result = error & 0xff;
+	/*
+	 * if DeviceStatus is not SCSI_SUCCESS copy over the sense data and let
+	 * the SCSI layer handle the error
+	 */
+	if (cmd->result)
+		memcpy(cmd->sense_buffer, &msg->body[3],
+		       min(SCSI_SENSE_BUFFERSIZE, 40));
+
+	/* only output error code if AdapterStatus is not HBA_SUCCESS */
+	if ((error >> 8) & 0xff)
+		osm_err("SCSI error %08x\n", error);
+
+	scsi_dma_unmap(cmd);
+
+	cmd->scsi_done(cmd);
+
+	return 1;
+};
+
+/**
+ * i2o_scsi_notify_device_add - Retrieve notifications of added devices
+ * @i2o_dev: the I2O device which was added
+ *
+ * If an I2O device is added we catch the notification, because I2O classes
+ * other than SCSI peripheral will not be received through
+ * i2o_scsi_probe().
+ */
+static void i2o_scsi_notify_device_add(struct i2o_device *i2o_dev)
+{
+	switch (i2o_dev->lct_data.class_id) {
+	case I2O_CLASS_EXECUTIVE:
+	case I2O_CLASS_RANDOM_BLOCK_STORAGE:
+		i2o_scsi_probe(&i2o_dev->device);
+		break;
+
+	default:
+		break;
+	}
+};
+
+/**
+ * i2o_scsi_notify_device_remove - Retrieve notifications of removed devices
+ * @i2o_dev: the I2O device which was removed
+ *
+ * If an I2O device is removed, we catch the notification to remove the
+ * corresponding SCSI device.
+ */
+static void i2o_scsi_notify_device_remove(struct i2o_device *i2o_dev)
+{
+	switch (i2o_dev->lct_data.class_id) {
+	case I2O_CLASS_EXECUTIVE:
+	case I2O_CLASS_RANDOM_BLOCK_STORAGE:
+		i2o_scsi_remove(&i2o_dev->device);
+		break;
+
+	default:
+		break;
+	}
+};
+
+/**
+ * i2o_scsi_notify_controller_add - Retrieve notifications of added controllers
+ * @c: the controller which was added
+ *
+ * If an I2O controller is added, we catch the notification to add a
+ * corresponding Scsi_Host.
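As the comment in i2o_scsi_reply() above spells out, the reply status is a single 32-bit word packing byte-wide fields: device status in byte 0, adapter status in byte 1, a reserved byte, and the request status in byte 3. A standalone sketch of that unpacking (the sample value is made up):

#include <stdint.h>
#include <stdio.h>

/* layout per the comment in i2o_scsi_reply(): byte 0 = device status,
 * byte 1 = adapter status, byte 2 = reserved, byte 3 = request status */
static uint8_t device_status(uint32_t error)  { return error & 0xff; }
static uint8_t adapter_status(uint32_t error) { return (error >> 8) & 0xff; }
static uint8_t request_status(uint32_t error) { return (error >> 24) & 0xff; }

int main(void)
{
	uint32_t error = 0x01000208;	/* hypothetical reply word */

	printf("dev=%02x adapter=%02x req=%02x\n",
	       device_status(error), adapter_status(error),
	       request_status(error));
	return 0;
}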
+ */ +static void i2o_scsi_notify_controller_add(struct i2o_controller *c) +{ + struct i2o_scsi_host *i2o_shost; + int rc; + + i2o_shost = i2o_scsi_host_alloc(c); + if (IS_ERR(i2o_shost)) { + osm_err("Could not initialize SCSI host\n"); + return; + } + + rc = scsi_add_host(i2o_shost->scsi_host, &c->device); + if (rc) { + osm_err("Could not add SCSI host\n"); + scsi_host_put(i2o_shost->scsi_host); + return; + } + + c->driver_data[i2o_scsi_driver.context] = i2o_shost; + + osm_debug("new I2O SCSI host added\n"); +}; + +/** + * i2o_scsi_notify_controller_remove - Retrieve notifications of removed controllers + * @c: the controller which was removed + * + * If a I2O controller is removed, we catch the notification to remove the + * corresponding Scsi_Host. + */ +static void i2o_scsi_notify_controller_remove(struct i2o_controller *c) +{ + struct i2o_scsi_host *i2o_shost; + i2o_shost = i2o_scsi_get_host(c); + if (!i2o_shost) + return; + + c->driver_data[i2o_scsi_driver.context] = NULL; + + scsi_remove_host(i2o_shost->scsi_host); + scsi_host_put(i2o_shost->scsi_host); + osm_debug("I2O SCSI host removed\n"); +}; + +/* SCSI OSM driver struct */ +static struct i2o_driver i2o_scsi_driver = { + .name = OSM_NAME, + .reply = i2o_scsi_reply, + .classes = i2o_scsi_class_id, + .notify_device_add = i2o_scsi_notify_device_add, + .notify_device_remove = i2o_scsi_notify_device_remove, + .notify_controller_add = i2o_scsi_notify_controller_add, + .notify_controller_remove = i2o_scsi_notify_controller_remove, + .driver = { + .probe = i2o_scsi_probe, + .remove = i2o_scsi_remove, + }, +}; + +/** + * i2o_scsi_queuecommand - queue a SCSI command + * @SCpnt: scsi command pointer + * @done: callback for completion + * + * Issue a scsi command asynchronously. Return 0 on success or 1 if + * we hit an error (normally message queue congestion). The only + * minor complication here is that I2O deals with the device addressing + * so we have to map the bus/dev/lun back to an I2O handle as well + * as faking absent devices ourself. + * + * Locks: takes the controller lock on error path only + */ + +static int i2o_scsi_queuecommand_lck(struct scsi_cmnd *SCpnt, + void (*done) (struct scsi_cmnd *)) +{ + struct i2o_controller *c; + struct i2o_device *i2o_dev; + int tid; + struct i2o_message *msg; + /* + * ENABLE_DISCONNECT + * SIMPLE_TAG + * RETURN_SENSE_DATA_IN_REPLY_MESSAGE_FRAME + */ + u32 scsi_flags = 0x20a00000; + u32 sgl_offset; + u32 *mptr; + u32 cmd = I2O_CMD_SCSI_EXEC << 24; + int rc = 0; + + /* + * Do the incoming paperwork + */ + i2o_dev = SCpnt->device->hostdata; + + SCpnt->scsi_done = done; + + if (unlikely(!i2o_dev)) { + osm_warn("no I2O device in request\n"); + SCpnt->result = DID_NO_CONNECT << 16; + done(SCpnt); + goto exit; + } + c = i2o_dev->iop; + tid = i2o_dev->lct_data.tid; + + osm_debug("qcmd: Tid = %03x\n", tid); + osm_debug("Real scsi messages.\n"); + + /* + * Put together a scsi execscb message + */ + switch (SCpnt->sc_data_direction) { + case PCI_DMA_NONE: + /* DATA NO XFER */ + sgl_offset = SGL_OFFSET_0; + break; + + case PCI_DMA_TODEVICE: + /* DATA OUT (iop-->dev) */ + scsi_flags |= 0x80000000; + sgl_offset = SGL_OFFSET_10; + break; + + case PCI_DMA_FROMDEVICE: + /* DATA IN (iop<--dev) */ + scsi_flags |= 0x40000000; + sgl_offset = SGL_OFFSET_10; + break; + + default: + /* Unknown - kill the command */ + SCpnt->result = DID_NO_CONNECT << 16; + done(SCpnt); + goto exit; + } + + /* + * Obtain an I2O message. 
If there are none free then
+	 * throw it back to the scsi layer
+	 */
+
+	msg = i2o_msg_get(c);
+	if (IS_ERR(msg)) {
+		rc = SCSI_MLQUEUE_HOST_BUSY;
+		goto exit;
+	}
+
+	mptr = &msg->body[0];
+
+#if 0	/* this code can't work */
+#ifdef CONFIG_I2O_EXT_ADAPTEC
+	if (c->adaptec) {
+		u32 adpt_flags = 0;
+
+		if (SCpnt->sc_request && SCpnt->sc_request->upper_private_data) {
+			i2o_sg_io_hdr_t __user *usr_ptr =
+			    ((Sg_request *) (SCpnt->sc_request->
+					     upper_private_data))->header.
+			    usr_ptr;
+
+			if (usr_ptr)
+				get_user(adpt_flags, &usr_ptr->flags);
+		}
+
+		switch (i2o_dev->lct_data.class_id) {
+		case I2O_CLASS_EXECUTIVE:
+		case I2O_CLASS_RANDOM_BLOCK_STORAGE:
+			/* interpret flag has to be set for executive */
+			adpt_flags ^= I2O_DPT_SG_FLAG_INTERPRET;
+			break;
+
+		default:
+			break;
+		}
+
+		/*
+		 * for Adaptec controllers we use the PRIVATE command, because
+		 * the normal SCSI EXEC doesn't support all SCSI commands on
+		 * all controllers (for example READ CAPACITY).
+		 */
+		if (sgl_offset == SGL_OFFSET_10)
+			sgl_offset = SGL_OFFSET_12;
+		cmd = I2O_CMD_PRIVATE << 24;
+		*mptr++ = cpu_to_le32(I2O_VENDOR_DPT << 16 | I2O_CMD_SCSI_EXEC);
+		*mptr++ = cpu_to_le32(adpt_flags | tid);
+	}
+#endif
+#endif
+
+	msg->u.head[1] = cpu_to_le32(cmd | HOST_TID << 12 | tid);
+	msg->u.s.icntxt = cpu_to_le32(i2o_scsi_driver.context);
+
+	/* We want the SCSI control block back */
+	msg->u.s.tcntxt = cpu_to_le32(i2o_cntxt_list_add(c, SCpnt));
+
+	/* LSI_920_PCI_QUIRK
+	 *
+	 *	Intermittent msg frame word data corruption observed on
+	 *	msg[4] after:
+	 *	  WRITE, READ-MODIFY-WRITE
+	 *	operations. 19990606 -sralston
+	 *
+	 *	(Hence we build this word via tag. It's good practice anyway;
+	 *	 we don't want needless fetches over PCI.)
+	 */
+
+	/* Attach tags to the devices */
+	/* FIXME: implement
+	   if(SCpnt->device->tagged_supported) {
+		if(SCpnt->tag == HEAD_OF_QUEUE_TAG)
+			scsi_flags |= 0x01000000;
+		else if(SCpnt->tag == ORDERED_QUEUE_TAG)
+			scsi_flags |= 0x01800000;
+	   }
+	 */
+
+	*mptr++ = cpu_to_le32(scsi_flags | SCpnt->cmd_len);
+
+	/* Write SCSI command into the message - always 16 byte block */
+	memcpy(mptr, SCpnt->cmnd, 16);
+	mptr += 4;
+
+	if (sgl_offset != SGL_OFFSET_0) {
+		/* write size of data addressed by SGL */
+		*mptr++ = cpu_to_le32(scsi_bufflen(SCpnt));
+
+		/* Now fill in the SGList and command */
+
+		if (scsi_sg_count(SCpnt)) {
+			if (!i2o_dma_map_sg(c, scsi_sglist(SCpnt),
+					    scsi_sg_count(SCpnt),
+					    SCpnt->sc_data_direction, &mptr))
+				goto nomem;
+		}
+	}
+
+	/* Stick the headers on */
+	msg->u.head[0] =
+	    cpu_to_le32(I2O_MESSAGE_SIZE(mptr - &msg->u.head[0]) | sgl_offset);
+
+	/* Queue the message */
+	i2o_msg_post(c, msg);
+
+	osm_debug("Issued %p\n", SCpnt);
+
+	return 0;
+
+nomem:
+	rc = -ENOMEM;
+	i2o_msg_nop(c, msg);
+
+exit:
+	return rc;
+}
+
+static DEF_SCSI_QCMD(i2o_scsi_queuecommand)
+
+/**
+ * i2o_scsi_abort - abort a running command
+ * @SCpnt: command to abort
+ *
+ * Ask the I2O controller to abort a command. This is an asynchronous
+ * process and our callback handler will see the command complete with an
+ * aborted message if it succeeds.
+ *
+ * Returns SUCCESS if the abort was acknowledged, FAILED otherwise, or
+ * SCSI_MLQUEUE_HOST_BUSY if no message frame could be obtained.
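The scsi_flags word assembled above starts from 0x20a00000 (ENABLE_DISCONNECT, SIMPLE_TAG and RETURN_SENSE_DATA_IN_REPLY_MESSAGE_FRAME, per the comment in i2o_scsi_queuecommand_lck()), ORs in a direction bit, and carries the CDB length in the low byte. A sketch with the magic numbers given names; the constant names are invented here, only the values come from the code above:

#include <stdint.h>

/* values taken from i2o_scsi_queuecommand_lck(); names are illustrative */
#define I2O_SCSI_FLAGS_BASE	0x20a00000u	/* disconnect + simple tag + sense in reply */
#define I2O_SCSI_DATA_OUT	0x80000000u	/* iop -> device */
#define I2O_SCSI_DATA_IN	0x40000000u	/* device -> iop */

static uint32_t build_scsi_flags(int writing, int reading, uint8_t cdb_len)
{
	uint32_t flags = I2O_SCSI_FLAGS_BASE;

	if (writing)
		flags |= I2O_SCSI_DATA_OUT;
	else if (reading)
		flags |= I2O_SCSI_DATA_IN;
	return flags | cdb_len;	/* low byte carries the CDB length */
}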
+ */ +static int i2o_scsi_abort(struct scsi_cmnd *SCpnt) +{ + struct i2o_device *i2o_dev; + struct i2o_controller *c; + struct i2o_message *msg; + int tid; + int status = FAILED; + + osm_warn("Aborting command block.\n"); + + i2o_dev = SCpnt->device->hostdata; + c = i2o_dev->iop; + tid = i2o_dev->lct_data.tid; + + msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET); + if (IS_ERR(msg)) + return SCSI_MLQUEUE_HOST_BUSY; + + msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0); + msg->u.head[1] = + cpu_to_le32(I2O_CMD_SCSI_ABORT << 24 | HOST_TID << 12 | tid); + msg->body[0] = cpu_to_le32(i2o_cntxt_list_get_ptr(c, SCpnt)); + + if (!i2o_msg_post_wait(c, msg, I2O_TIMEOUT_SCSI_SCB_ABORT)) + status = SUCCESS; + + return status; +} + +/** + * i2o_scsi_bios_param - Invent disk geometry + * @sdev: scsi device + * @dev: block layer device + * @capacity: size in sectors + * @ip: geometry array + * + * This is anyone's guess quite frankly. We use the same rules everyone + * else appears to and hope. It seems to work. + */ + +static int i2o_scsi_bios_param(struct scsi_device *sdev, + struct block_device *dev, sector_t capacity, + int *ip) +{ + int size; + + size = capacity; + ip[0] = 64; /* heads */ + ip[1] = 32; /* sectors */ + if ((ip[2] = size >> 11) > 1024) { /* cylinders, test for big disk */ + ip[0] = 255; /* heads */ + ip[1] = 63; /* sectors */ + ip[2] = size / (255 * 63); /* cylinders */ + } + return 0; +} + +static struct scsi_host_template i2o_scsi_host_template = { + .proc_name = OSM_NAME, + .name = OSM_DESCRIPTION, + .info = i2o_scsi_info, + .queuecommand = i2o_scsi_queuecommand, + .eh_abort_handler = i2o_scsi_abort, + .bios_param = i2o_scsi_bios_param, + .can_queue = I2O_SCSI_CAN_QUEUE, + .sg_tablesize = 8, + .cmd_per_lun = 6, + .use_clustering = ENABLE_CLUSTERING, +}; + +/** + * i2o_scsi_init - SCSI OSM initialization function + * + * Register SCSI OSM into I2O core. + * + * Returns 0 on success or negative error code on failure. + */ +static int __init i2o_scsi_init(void) +{ + int rc; + + printk(KERN_INFO OSM_DESCRIPTION " v" OSM_VERSION "\n"); + + /* Register SCSI OSM into I2O core */ + rc = i2o_driver_register(&i2o_scsi_driver); + if (rc) { + osm_err("Could not register SCSI driver\n"); + return rc; + } + + return 0; +}; + +/** + * i2o_scsi_exit - SCSI OSM exit function + * + * Unregisters SCSI OSM from I2O core. + */ +static void __exit i2o_scsi_exit(void) +{ + /* Unregister I2O SCSI OSM from I2O core */ + i2o_driver_unregister(&i2o_scsi_driver); +}; + +MODULE_AUTHOR("Red Hat Software"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION(OSM_DESCRIPTION); +MODULE_VERSION(OSM_VERSION); + +module_init(i2o_scsi_init); +module_exit(i2o_scsi_exit); diff --git a/drivers/staging/i2o/iop.c b/drivers/staging/i2o/iop.c new file mode 100644 index 000000000000..52334fc8b547 --- /dev/null +++ b/drivers/staging/i2o/iop.c @@ -0,0 +1,1247 @@ +/* + * Functions to handle I2O controllers and I2O message handling + * + * Copyright (C) 1999-2002 Red Hat Software + * + * Written by Alan Cox, Building Number Three Ltd + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. 
+ *
+ * A lot of the I2O message side code from this is taken from the
+ * Red Creek RCPCI45 adapter driver by Red Creek Communications
+ *
+ * Fixes/additions:
+ *	Philipp Rumpf
+ *	Juha Sievänen
+ *	Auvo Häkkinen
+ *	Deepak Saxena
+ *	Boji T Kannanthanam
+ *	Alan Cox :
+ *		Ported to Linux 2.5.
+ *	Markus Lidel :
+ *		Minor fixes for 2.6.
+ */
+
+#include
+#include "i2o.h"
+#include
+#include
+#include
+#include "core.h"
+
+#define OSM_NAME "i2o"
+#define OSM_VERSION "1.325"
+#define OSM_DESCRIPTION "I2O subsystem"
+
+/* global I2O controller list */
+LIST_HEAD(i2o_controllers);
+
+/*
+ * global I2O System Table. Contains information about all the IOPs in the
+ * system. Used to inform IOPs about each other's existence.
+ */
+static struct i2o_dma i2o_systab;
+
+static int i2o_hrt_get(struct i2o_controller *c);
+
+/**
+ * i2o_msg_get_wait - obtain an I2O message from the IOP
+ * @c: I2O controller
+ * @wait: how long to wait until timeout
+ *
+ * This function waits up to wait seconds for a message slot to be
+ * available.
+ *
+ * On success the message is returned. If no message becomes available
+ * before the timeout expires, ERR_PTR(-ETIMEDOUT) is returned.
+ */
+struct i2o_message *i2o_msg_get_wait(struct i2o_controller *c, int wait)
+{
+	unsigned long timeout = jiffies + wait * HZ;
+	struct i2o_message *msg;
+
+	while (IS_ERR(msg = i2o_msg_get(c))) {
+		if (time_after(jiffies, timeout)) {
+			osm_debug("%s: Timeout waiting for message frame.\n",
+				  c->name);
+			return ERR_PTR(-ETIMEDOUT);
+		}
+		schedule_timeout_uninterruptible(1);
+	}
+
+	return msg;
+};
+
+#if BITS_PER_LONG == 64
+/**
+ * i2o_cntxt_list_add - Append a pointer to context list and return an id
+ * @c: controller to which the context list belongs
+ * @ptr: pointer to add to the context list
+ *
+ * Because the context field in I2O is only 32-bit large, on 64-bit the
+ * pointer is too large to fit in the context field. The i2o_cntxt_list
+ * functions therefore map pointers to context fields.
+ *
+ * Returns context id > 0 on success or 0 on failure.
+ */
+u32 i2o_cntxt_list_add(struct i2o_controller * c, void *ptr)
+{
+	struct i2o_context_list_element *entry;
+	unsigned long flags;
+
+	if (!ptr)
+		osm_err("%s: couldn't add NULL pointer to context list!\n",
+			c->name);
+
+	entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
+	if (!entry) {
+		osm_err("%s: Could not allocate memory for context list element"
+			"\n", c->name);
+		return 0;
+	}
+
+	entry->ptr = ptr;
+	entry->timestamp = jiffies;
+	INIT_LIST_HEAD(&entry->list);
+
+	spin_lock_irqsave(&c->context_list_lock, flags);
+
+	if (unlikely(atomic_inc_and_test(&c->context_list_counter)))
+		atomic_inc(&c->context_list_counter);
+
+	entry->context = atomic_read(&c->context_list_counter);
+
+	list_add(&entry->list, &c->context_list);
+
+	spin_unlock_irqrestore(&c->context_list_lock, flags);
+
+	osm_debug("%s: Add context to list %p -> %d\n", c->name, ptr,
+		  entry->context);
+
+	return entry->context;
+};
+
+/**
+ * i2o_cntxt_list_remove - Remove a pointer from the context list
+ * @c: controller to which the context list belongs
+ * @ptr: pointer which should be removed from the context list
+ *
+ * Removes a previously added pointer from the context list and returns
+ * the matching context id.
+ *
+ * Returns context id on success or 0 on failure.
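Because of this mapping, a 64-bit driver never hands the IOP a raw pointer: it stores the pointer, sends the 32-bit id in the transaction context, and resolves the id back when the reply arrives (exactly what the SCSI OSM earlier in this series does with its scsi_cmnd pointers). A toy userspace model of the idea, assuming nothing from the kernel side:

#include <stdint.h>
#include <stdio.h>

/* toy model of the 64-bit handle table: pointers live in a table and
 * the "IOP" only ever sees the 32-bit index */
#define NSLOTS 8
static void *slots[NSLOTS];

static uint32_t ctx_add(void *ptr)
{
	for (uint32_t i = 0; i < NSLOTS; i++)
		if (!slots[i]) {
			slots[i] = ptr;
			return i + 1;	/* 0 means "no context" */
		}
	return 0;
}

static void *ctx_get(uint32_t id)
{
	void *p;

	if (!id || id > NSLOTS)
		return NULL;
	p = slots[id - 1];
	slots[id - 1] = NULL;	/* get also removes, like i2o_cntxt_list_get() */
	return p;
}

int main(void)
{
	int cmd = 42;
	uint32_t id = ctx_add(&cmd);

	printf("ctx %u -> %d\n", id, *(int *)ctx_get(id));
	return 0;
}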
+ */ +u32 i2o_cntxt_list_remove(struct i2o_controller * c, void *ptr) +{ + struct i2o_context_list_element *entry; + u32 context = 0; + unsigned long flags; + + spin_lock_irqsave(&c->context_list_lock, flags); + list_for_each_entry(entry, &c->context_list, list) + if (entry->ptr == ptr) { + list_del(&entry->list); + context = entry->context; + kfree(entry); + break; + } + spin_unlock_irqrestore(&c->context_list_lock, flags); + + if (!context) + osm_warn("%s: Could not remove nonexistent ptr %p\n", c->name, + ptr); + + osm_debug("%s: remove ptr from context list %d -> %p\n", c->name, + context, ptr); + + return context; +}; + +/** + * i2o_cntxt_list_get - Get a pointer from the context list and remove it + * @c: controller to which the context list belong + * @context: context id to which the pointer belong + * + * Returns pointer to the matching context id on success or NULL on + * failure. + */ +void *i2o_cntxt_list_get(struct i2o_controller *c, u32 context) +{ + struct i2o_context_list_element *entry; + unsigned long flags; + void *ptr = NULL; + + spin_lock_irqsave(&c->context_list_lock, flags); + list_for_each_entry(entry, &c->context_list, list) + if (entry->context == context) { + list_del(&entry->list); + ptr = entry->ptr; + kfree(entry); + break; + } + spin_unlock_irqrestore(&c->context_list_lock, flags); + + if (!ptr) + osm_warn("%s: context id %d not found\n", c->name, context); + + osm_debug("%s: get ptr from context list %d -> %p\n", c->name, context, + ptr); + + return ptr; +}; + +/** + * i2o_cntxt_list_get_ptr - Get a context id from the context list + * @c: controller to which the context list belong + * @ptr: pointer to which the context id should be fetched + * + * Returns context id which matches to the pointer on success or 0 on + * failure. + */ +u32 i2o_cntxt_list_get_ptr(struct i2o_controller * c, void *ptr) +{ + struct i2o_context_list_element *entry; + u32 context = 0; + unsigned long flags; + + spin_lock_irqsave(&c->context_list_lock, flags); + list_for_each_entry(entry, &c->context_list, list) + if (entry->ptr == ptr) { + context = entry->context; + break; + } + spin_unlock_irqrestore(&c->context_list_lock, flags); + + if (!context) + osm_warn("%s: Could not find nonexistent ptr %p\n", c->name, + ptr); + + osm_debug("%s: get context id from context list %p -> %d\n", c->name, + ptr, context); + + return context; +}; +#endif + +/** + * i2o_iop_find - Find an I2O controller by id + * @unit: unit number of the I2O controller to search for + * + * Lookup the I2O controller on the controller list. + * + * Returns pointer to the I2O controller on success or NULL if not found. + */ +struct i2o_controller *i2o_find_iop(int unit) +{ + struct i2o_controller *c; + + list_for_each_entry(c, &i2o_controllers, list) { + if (c->unit == unit) + return c; + } + + return NULL; +}; + +/** + * i2o_iop_find_device - Find a I2O device on an I2O controller + * @c: I2O controller where the I2O device hangs on + * @tid: TID of the I2O device to search for + * + * Searches the devices of the I2O controller for a device with TID tid and + * returns it. + * + * Returns a pointer to the I2O device if found, otherwise NULL. + */ +struct i2o_device *i2o_iop_find_device(struct i2o_controller *c, u16 tid) +{ + struct i2o_device *dev; + + list_for_each_entry(dev, &c->devices, list) + if (dev->lct_data.tid == tid) + return dev; + + return NULL; +}; + +/** + * i2o_quiesce_controller - quiesce controller + * @c: controller + * + * Quiesce an IOP. 
Causes IOP to make external operation quiescent + * (i2o 'READY' state). Internal operation of the IOP continues normally. + * + * Returns 0 on success or negative error code on failure. + */ +static int i2o_iop_quiesce(struct i2o_controller *c) +{ + struct i2o_message *msg; + i2o_status_block *sb = c->status_block.virt; + int rc; + + i2o_status_get(c); + + /* SysQuiesce discarded if IOP not in READY or OPERATIONAL state */ + if ((sb->iop_state != ADAPTER_STATE_READY) && + (sb->iop_state != ADAPTER_STATE_OPERATIONAL)) + return 0; + + msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET); + if (IS_ERR(msg)) + return PTR_ERR(msg); + + msg->u.head[0] = cpu_to_le32(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0); + msg->u.head[1] = + cpu_to_le32(I2O_CMD_SYS_QUIESCE << 24 | HOST_TID << 12 | + ADAPTER_TID); + + /* Long timeout needed for quiesce if lots of devices */ + if ((rc = i2o_msg_post_wait(c, msg, 240))) + osm_info("%s: Unable to quiesce (status=%#x).\n", c->name, -rc); + else + osm_debug("%s: Quiesced.\n", c->name); + + i2o_status_get(c); // Entered READY state + + return rc; +}; + +/** + * i2o_iop_enable - move controller from ready to OPERATIONAL + * @c: I2O controller + * + * Enable IOP. This allows the IOP to resume external operations and + * reverses the effect of a quiesce. Returns zero or an error code if + * an error occurs. + */ +static int i2o_iop_enable(struct i2o_controller *c) +{ + struct i2o_message *msg; + i2o_status_block *sb = c->status_block.virt; + int rc; + + i2o_status_get(c); + + /* Enable only allowed on READY state */ + if (sb->iop_state != ADAPTER_STATE_READY) + return -EINVAL; + + msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET); + if (IS_ERR(msg)) + return PTR_ERR(msg); + + msg->u.head[0] = cpu_to_le32(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0); + msg->u.head[1] = + cpu_to_le32(I2O_CMD_SYS_ENABLE << 24 | HOST_TID << 12 | + ADAPTER_TID); + + /* How long of a timeout do we need? */ + if ((rc = i2o_msg_post_wait(c, msg, 240))) + osm_err("%s: Could not enable (status=%#x).\n", c->name, -rc); + else + osm_debug("%s: Enabled.\n", c->name); + + i2o_status_get(c); // entered OPERATIONAL state + + return rc; +}; + +/** + * i2o_iop_quiesce_all - Quiesce all I2O controllers on the system + * + * Quiesce all I2O controllers which are connected to the system. + */ +static inline void i2o_iop_quiesce_all(void) +{ + struct i2o_controller *c, *tmp; + + list_for_each_entry_safe(c, tmp, &i2o_controllers, list) { + if (!c->no_quiesce) + i2o_iop_quiesce(c); + } +}; + +/** + * i2o_iop_enable_all - Enables all controllers on the system + * + * Enables all I2O controllers which are connected to the system. + */ +static inline void i2o_iop_enable_all(void) +{ + struct i2o_controller *c, *tmp; + + list_for_each_entry_safe(c, tmp, &i2o_controllers, list) + i2o_iop_enable(c); +}; + +/** + * i2o_clear_controller - Bring I2O controller into HOLD state + * @c: controller + * + * Clear an IOP to HOLD state, ie. terminate external operations, clear all + * input queues and prepare for a system restart. IOP's internal operation + * continues normally and the outbound queue is alive. The IOP is not + * expected to rebuild its LCT. + * + * Returns 0 on success or negative error code on failure. 
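Quiesce, enable and clear (below) each move the IOP between a handful of well-defined states; pulling the transitions used in this file into one place gives roughly the following picture (a simplification: INIT, FAULTED and FAILED are left out):

/*
 * IOP lifecycle as driven by this file (simplified sketch):
 *
 *   RESET --activate--> HOLD --systab set--> READY --enable--> OPERATIONAL
 *   OPERATIONAL --quiesce--> READY
 *   (clear drops back to HOLD; reset drops back to RESET)
 */
enum iop_lifecycle_sketch {
	IOP_SKETCH_RESET,
	IOP_SKETCH_HOLD,
	IOP_SKETCH_READY,
	IOP_SKETCH_OPERATIONAL,
};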
+ */ +static int i2o_iop_clear(struct i2o_controller *c) +{ + struct i2o_message *msg; + int rc; + + msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET); + if (IS_ERR(msg)) + return PTR_ERR(msg); + + /* Quiesce all IOPs first */ + i2o_iop_quiesce_all(); + + msg->u.head[0] = cpu_to_le32(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0); + msg->u.head[1] = + cpu_to_le32(I2O_CMD_ADAPTER_CLEAR << 24 | HOST_TID << 12 | + ADAPTER_TID); + + if ((rc = i2o_msg_post_wait(c, msg, 30))) + osm_info("%s: Unable to clear (status=%#x).\n", c->name, -rc); + else + osm_debug("%s: Cleared.\n", c->name); + + /* Enable all IOPs */ + i2o_iop_enable_all(); + + return rc; +} + +/** + * i2o_iop_init_outbound_queue - setup the outbound message queue + * @c: I2O controller + * + * Clear and (re)initialize IOP's outbound queue and post the message + * frames to the IOP. + * + * Returns 0 on success or negative error code on failure. + */ +static int i2o_iop_init_outbound_queue(struct i2o_controller *c) +{ + u32 m; + volatile u8 *status = c->status.virt; + struct i2o_message *msg; + ulong timeout; + int i; + + osm_debug("%s: Initializing Outbound Queue...\n", c->name); + + memset(c->status.virt, 0, 4); + + msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET); + if (IS_ERR(msg)) + return PTR_ERR(msg); + + msg->u.head[0] = cpu_to_le32(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_6); + msg->u.head[1] = + cpu_to_le32(I2O_CMD_OUTBOUND_INIT << 24 | HOST_TID << 12 | + ADAPTER_TID); + msg->u.s.icntxt = cpu_to_le32(i2o_exec_driver.context); + msg->u.s.tcntxt = cpu_to_le32(0x00000000); + msg->body[0] = cpu_to_le32(PAGE_SIZE); + /* Outbound msg frame size in words and Initcode */ + msg->body[1] = cpu_to_le32(I2O_OUTBOUND_MSG_FRAME_SIZE << 16 | 0x80); + msg->body[2] = cpu_to_le32(0xd0000004); + msg->body[3] = cpu_to_le32(i2o_dma_low(c->status.phys)); + msg->body[4] = cpu_to_le32(i2o_dma_high(c->status.phys)); + + i2o_msg_post(c, msg); + + timeout = jiffies + I2O_TIMEOUT_INIT_OUTBOUND_QUEUE * HZ; + while (*status <= I2O_CMD_IN_PROGRESS) { + if (time_after(jiffies, timeout)) { + osm_warn("%s: Timeout Initializing\n", c->name); + return -ETIMEDOUT; + } + schedule_timeout_uninterruptible(1); + } + + m = c->out_queue.phys; + + /* Post frames */ + for (i = 0; i < I2O_MAX_OUTBOUND_MSG_FRAMES; i++) { + i2o_flush_reply(c, m); + udelay(1); /* Promise */ + m += I2O_OUTBOUND_MSG_FRAME_SIZE * sizeof(u32); + } + + return 0; +} + +/** + * i2o_iop_reset - reset an I2O controller + * @c: controller to reset + * + * Reset the IOP into INIT state and wait until IOP gets into RESET state. + * Terminate all external operations, clear IOP's inbound and outbound + * queues, terminate all DDMs, and reload the IOP's operating environment + * and all local DDMs. The IOP rebuilds its LCT. 
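i2o_iop_init_outbound_queue() above waits on the status byte with the same jiffies-based polling idiom this file uses everywhere (message get, status get, reset). Its skeleton, extracted as a sketch; cond() is a placeholder for whatever state is being waited on:

/* sketch of the polling idiom used throughout this file */
static int wait_for(struct i2o_controller *c,
		    int (*cond)(struct i2o_controller *), int wait_secs)
{
	unsigned long timeout = jiffies + wait_secs * HZ;

	while (!cond(c)) {
		if (time_after(jiffies, timeout))
			return -ETIMEDOUT;
		schedule_timeout_uninterruptible(1);	/* yield for a tick */
	}
	return 0;
}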
+ */ +static int i2o_iop_reset(struct i2o_controller *c) +{ + volatile u8 *status = c->status.virt; + struct i2o_message *msg; + unsigned long timeout; + i2o_status_block *sb = c->status_block.virt; + int rc = 0; + + osm_debug("%s: Resetting controller\n", c->name); + + msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET); + if (IS_ERR(msg)) + return PTR_ERR(msg); + + memset(c->status_block.virt, 0, 8); + + /* Quiesce all IOPs first */ + i2o_iop_quiesce_all(); + + msg->u.head[0] = cpu_to_le32(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_0); + msg->u.head[1] = + cpu_to_le32(I2O_CMD_ADAPTER_RESET << 24 | HOST_TID << 12 | + ADAPTER_TID); + msg->u.s.icntxt = cpu_to_le32(i2o_exec_driver.context); + msg->u.s.tcntxt = cpu_to_le32(0x00000000); + msg->body[0] = cpu_to_le32(0x00000000); + msg->body[1] = cpu_to_le32(0x00000000); + msg->body[2] = cpu_to_le32(i2o_dma_low(c->status.phys)); + msg->body[3] = cpu_to_le32(i2o_dma_high(c->status.phys)); + + i2o_msg_post(c, msg); + + /* Wait for a reply */ + timeout = jiffies + I2O_TIMEOUT_RESET * HZ; + while (!*status) { + if (time_after(jiffies, timeout)) + break; + + schedule_timeout_uninterruptible(1); + } + + switch (*status) { + case I2O_CMD_REJECTED: + osm_warn("%s: IOP reset rejected\n", c->name); + rc = -EPERM; + break; + + case I2O_CMD_IN_PROGRESS: + /* + * Once the reset is sent, the IOP goes into the INIT state + * which is indeterminate. We need to wait until the IOP has + * rebooted before we can let the system talk to it. We read + * the inbound Free_List until a message is available. If we + * can't read one in the given amount of time, we assume the + * IOP could not reboot properly. + */ + osm_debug("%s: Reset in progress, waiting for reboot...\n", + c->name); + + while (IS_ERR(msg = i2o_msg_get_wait(c, I2O_TIMEOUT_RESET))) { + if (time_after(jiffies, timeout)) { + osm_err("%s: IOP reset timeout.\n", c->name); + rc = PTR_ERR(msg); + goto exit; + } + schedule_timeout_uninterruptible(1); + } + i2o_msg_nop(c, msg); + + /* from here all quiesce commands are safe */ + c->no_quiesce = 0; + + /* verify if controller is in state RESET */ + i2o_status_get(c); + + if (!c->promise && (sb->iop_state != ADAPTER_STATE_RESET)) + osm_warn("%s: reset completed, but adapter not in RESET" + " state.\n", c->name); + else + osm_debug("%s: reset completed.\n", c->name); + + break; + + default: + osm_err("%s: IOP reset timeout.\n", c->name); + rc = -ETIMEDOUT; + break; + } + + exit: + /* Enable all IOPs */ + i2o_iop_enable_all(); + + return rc; +}; + +/** + * i2o_iop_activate - Bring controller up to HOLD + * @c: controller + * + * This function brings an I2O controller into HOLD state. The adapter + * is reset if necessary and then the queues and resource table are read. + * + * Returns 0 on success or negative error code on failure. + */ +static int i2o_iop_activate(struct i2o_controller *c) +{ + i2o_status_block *sb = c->status_block.virt; + int rc; + int state; + + /* In INIT state, Wait Inbound Q to initialize (in i2o_status_get) */ + /* In READY state, Get status */ + + rc = i2o_status_get(c); + if (rc) { + osm_info("%s: Unable to obtain status, attempting a reset.\n", + c->name); + rc = i2o_iop_reset(c); + if (rc) + return rc; + } + + if (sb->i2o_version > I2OVER15) { + osm_err("%s: Not running version 1.5 of the I2O Specification." 
+ "\n", c->name); + return -ENODEV; + } + + switch (sb->iop_state) { + case ADAPTER_STATE_FAULTED: + osm_err("%s: hardware fault\n", c->name); + return -EFAULT; + + case ADAPTER_STATE_READY: + case ADAPTER_STATE_OPERATIONAL: + case ADAPTER_STATE_HOLD: + case ADAPTER_STATE_FAILED: + osm_debug("%s: already running, trying to reset...\n", c->name); + rc = i2o_iop_reset(c); + if (rc) + return rc; + } + + /* preserve state */ + state = sb->iop_state; + + rc = i2o_iop_init_outbound_queue(c); + if (rc) + return rc; + + /* if adapter was not in RESET state clear now */ + if (state != ADAPTER_STATE_RESET) + i2o_iop_clear(c); + + i2o_status_get(c); + + if (sb->iop_state != ADAPTER_STATE_HOLD) { + osm_err("%s: failed to bring IOP into HOLD state\n", c->name); + return -EIO; + } + + return i2o_hrt_get(c); +}; + +static void i2o_res_alloc(struct i2o_controller *c, unsigned long flags) +{ + i2o_status_block *sb = c->status_block.virt; + struct resource *res = &c->mem_resource; + resource_size_t size, align; + int err; + + res->name = c->pdev->bus->name; + res->flags = flags; + res->start = 0; + res->end = 0; + osm_info("%s: requires private memory resources.\n", c->name); + + if (flags & IORESOURCE_MEM) { + size = sb->desired_mem_size; + align = 1 << 20; /* unspecified, use 1Mb and play safe */ + } else { + size = sb->desired_io_size; + align = 1 << 12; /* unspecified, use 4Kb and play safe */ + } + + err = pci_bus_alloc_resource(c->pdev->bus, res, size, align, 0, 0, + NULL, NULL); + if (err < 0) + return; + + if (flags & IORESOURCE_MEM) { + c->mem_alloc = 1; + sb->current_mem_size = resource_size(res); + sb->current_mem_base = res->start; + } else if (flags & IORESOURCE_IO) { + c->io_alloc = 1; + sb->current_io_size = resource_size(res); + sb->current_io_base = res->start; + } + osm_info("%s: allocated PCI space %pR\n", c->name, res); +} + +/** + * i2o_iop_systab_set - Set the I2O System Table of the specified IOP + * @c: I2O controller to which the system table should be send + * + * Before the systab could be set i2o_systab_build() must be called. + * + * Returns 0 on success or negative error code on failure. 
+ */
+static int i2o_iop_systab_set(struct i2o_controller *c)
+{
+	struct i2o_message *msg;
+	i2o_status_block *sb = c->status_block.virt;
+	struct device *dev = &c->pdev->dev;
+	int rc;
+
+	if (sb->current_mem_size < sb->desired_mem_size)
+		i2o_res_alloc(c, IORESOURCE_MEM);
+
+	if (sb->current_io_size < sb->desired_io_size)
+		i2o_res_alloc(c, IORESOURCE_IO);
+
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+
+	i2o_systab.phys = dma_map_single(dev, i2o_systab.virt, i2o_systab.len,
+					 PCI_DMA_TODEVICE);
+	if (!i2o_systab.phys) {
+		i2o_msg_nop(c, msg);
+		return -ENOMEM;
+	}
+
+	msg->u.head[0] = cpu_to_le32(I2O_MESSAGE_SIZE(12) | SGL_OFFSET_6);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_SYS_TAB_SET << 24 | HOST_TID << 12 |
+			ADAPTER_TID);
+
+	/*
+	 * Provide three SGL-elements:
+	 * System table (SysTab), Private memory space declaration and
+	 * Private i/o space declaration
+	 */
+
+	msg->body[0] = cpu_to_le32(c->unit + 2);
+	msg->body[1] = cpu_to_le32(0x00000000);
+	msg->body[2] = cpu_to_le32(0x54000000 | i2o_systab.len);
+	msg->body[3] = cpu_to_le32(i2o_systab.phys);
+	msg->body[4] = cpu_to_le32(0x54000000 | sb->current_mem_size);
+	msg->body[5] = cpu_to_le32(sb->current_mem_base);
+	msg->body[6] = cpu_to_le32(0xd4000000 | sb->current_io_size);
+	msg->body[7] = cpu_to_le32(sb->current_io_base);
+
+	rc = i2o_msg_post_wait(c, msg, 120);
+
+	dma_unmap_single(dev, i2o_systab.phys, i2o_systab.len,
+			 PCI_DMA_TODEVICE);
+
+	if (rc < 0)
+		osm_err("%s: Unable to set SysTab (status=%#x).\n", c->name,
+			-rc);
+	else
+		osm_debug("%s: SysTab set.\n", c->name);
+
+	return rc;
+}
+
+/**
+ * i2o_iop_online - Bring a controller online into OPERATIONAL state.
+ * @c: I2O controller
+ *
+ * Send the system table and enable the I2O controller.
+ *
+ * Returns 0 on success or negative error code on failure.
+ */
+static int i2o_iop_online(struct i2o_controller *c)
+{
+	int rc;
+
+	rc = i2o_iop_systab_set(c);
+	if (rc)
+		return rc;
+
+	/* In READY state */
+	osm_debug("%s: Attempting to enable...\n", c->name);
+	rc = i2o_iop_enable(c);
+	if (rc)
+		return rc;
+
+	return 0;
+};
+
+/**
+ * i2o_iop_remove - Remove the I2O controller from the I2O core
+ * @c: I2O controller
+ *
+ * Remove the I2O controller from the I2O core. If devices are attached to
+ * the controller remove these also and finally reset the controller.
+ */
+void i2o_iop_remove(struct i2o_controller *c)
+{
+	struct i2o_device *dev, *tmp;
+
+	osm_debug("%s: deleting controller\n", c->name);
+
+	i2o_driver_notify_controller_remove_all(c);
+
+	list_del(&c->list);
+
+	list_for_each_entry_safe(dev, tmp, &c->devices, list)
+		i2o_device_remove(dev);
+
+	device_del(&c->device);
+
+	/* Ask the IOP to switch to RESET state */
+	i2o_iop_reset(c);
+}
+
+/**
+ * i2o_systab_build - Build system table
+ *
+ * The system table contains information about all the IOPs in the system
+ * (duh) and is used by the Executives on the IOPs to establish peer2peer
+ * connections. We're not supporting peer2peer at the moment, but this
+ * will be needed down the road for things like lan2lan forwarding.
+ *
+ * Returns 0 on success or negative error code on failure.
+ */
+static int i2o_systab_build(void)
+{
+	struct i2o_controller *c, *tmp;
+	int num_controllers = 0;
+	u32 change_ind = 0;
+	int count = 0;
+	struct i2o_sys_tbl *systab = i2o_systab.virt;
+
+	list_for_each_entry_safe(c, tmp, &i2o_controllers, list)
+		num_controllers++;
+
+	if (systab) {
+		change_ind = systab->change_ind;
+		kfree(i2o_systab.virt);
+	}
+
+	/* Header + IOPs */
+	i2o_systab.len = sizeof(struct i2o_sys_tbl) + num_controllers *
+	    sizeof(struct i2o_sys_tbl_entry);
+
+	systab = i2o_systab.virt = kzalloc(i2o_systab.len, GFP_KERNEL);
+	if (!systab) {
+		osm_err("unable to allocate memory for System Table\n");
+		return -ENOMEM;
+	}
+
+	systab->version = I2OVERSION;
+	systab->change_ind = change_ind + 1;
+
+	list_for_each_entry_safe(c, tmp, &i2o_controllers, list) {
+		i2o_status_block *sb;
+
+		if (count >= num_controllers) {
+			osm_err("controller added while building system table\n");
+			break;
+		}
+
+		sb = c->status_block.virt;
+
+		/*
+		 * Get updated IOP state so we have the latest information
+		 *
+		 * We should delete the controller at this point if it
+		 * doesn't respond since if it's not on the system table
+		 * it is technically not part of the I2O subsystem...
+		 */
+		if (unlikely(i2o_status_get(c))) {
+			osm_err("%s: Deleting b/c could not get status while "
+				"attempting to build system table\n", c->name);
+			i2o_iop_remove(c);
+			continue;	// try the next one
+		}
+
+		systab->iops[count].org_id = sb->org_id;
+		systab->iops[count].iop_id = c->unit + 2;
+		systab->iops[count].seg_num = 0;
+		systab->iops[count].i2o_version = sb->i2o_version;
+		systab->iops[count].iop_state = sb->iop_state;
+		systab->iops[count].msg_type = sb->msg_type;
+		systab->iops[count].frame_size = sb->inbound_frame_size;
+		systab->iops[count].last_changed = change_ind;
+		systab->iops[count].iop_capabilities = sb->iop_capabilities;
+		systab->iops[count].inbound_low =
+		    i2o_dma_low(c->base.phys + I2O_IN_PORT);
+		systab->iops[count].inbound_high =
+		    i2o_dma_high(c->base.phys + I2O_IN_PORT);
+
+		count++;
+	}
+
+	systab->num_entries = count;
+
+	return 0;
+};
+
+/**
+ * i2o_parse_hrt - Parse the hardware resource table.
+ * @c: I2O controller
+ *
+ * We don't do anything with it except dumping it (in debug mode).
+ *
+ * Returns 0.
+ */
+static int i2o_parse_hrt(struct i2o_controller *c)
+{
+	i2o_dump_hrt(c);
+	return 0;
+};
+
+/**
+ * i2o_status_get - Get the status block from the I2O controller
+ * @c: I2O controller
+ *
+ * Issue a status query on the controller. This updates the attached
+ * status block. The status block can then be accessed through
+ * c->status_block.
+ *
+ * Returns 0 on success or negative error code on failure.
+ */ +int i2o_status_get(struct i2o_controller *c) +{ + struct i2o_message *msg; + volatile u8 *status_block; + unsigned long timeout; + + status_block = (u8 *) c->status_block.virt; + memset(c->status_block.virt, 0, sizeof(i2o_status_block)); + + msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET); + if (IS_ERR(msg)) + return PTR_ERR(msg); + + msg->u.head[0] = cpu_to_le32(NINE_WORD_MSG_SIZE | SGL_OFFSET_0); + msg->u.head[1] = + cpu_to_le32(I2O_CMD_STATUS_GET << 24 | HOST_TID << 12 | + ADAPTER_TID); + msg->u.s.icntxt = cpu_to_le32(i2o_exec_driver.context); + msg->u.s.tcntxt = cpu_to_le32(0x00000000); + msg->body[0] = cpu_to_le32(0x00000000); + msg->body[1] = cpu_to_le32(0x00000000); + msg->body[2] = cpu_to_le32(i2o_dma_low(c->status_block.phys)); + msg->body[3] = cpu_to_le32(i2o_dma_high(c->status_block.phys)); + msg->body[4] = cpu_to_le32(sizeof(i2o_status_block)); /* always 88 bytes */ + + i2o_msg_post(c, msg); + + /* Wait for a reply */ + timeout = jiffies + I2O_TIMEOUT_STATUS_GET * HZ; + while (status_block[87] != 0xFF) { + if (time_after(jiffies, timeout)) { + osm_err("%s: Get status timeout.\n", c->name); + return -ETIMEDOUT; + } + + schedule_timeout_uninterruptible(1); + } + +#ifdef DEBUG + i2o_debug_state(c); +#endif + + return 0; +} + +/* + * i2o_hrt_get - Get the Hardware Resource Table from the I2O controller + * @c: I2O controller from which the HRT should be fetched + * + * The HRT contains information about possible hidden devices but is + * mostly useless to us. + * + * Returns 0 on success or negative error code on failure. + */ +static int i2o_hrt_get(struct i2o_controller *c) +{ + int rc; + int i; + i2o_hrt *hrt = c->hrt.virt; + u32 size = sizeof(i2o_hrt); + struct device *dev = &c->pdev->dev; + + for (i = 0; i < I2O_HRT_GET_TRIES; i++) { + struct i2o_message *msg; + + msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET); + if (IS_ERR(msg)) + return PTR_ERR(msg); + + msg->u.head[0] = cpu_to_le32(SIX_WORD_MSG_SIZE | SGL_OFFSET_4); + msg->u.head[1] = + cpu_to_le32(I2O_CMD_HRT_GET << 24 | HOST_TID << 12 | + ADAPTER_TID); + msg->body[0] = cpu_to_le32(0xd0000000 | c->hrt.len); + msg->body[1] = cpu_to_le32(c->hrt.phys); + + rc = i2o_msg_post_wait_mem(c, msg, 20, &c->hrt); + + if (rc < 0) { + osm_err("%s: Unable to get HRT (status=%#x)\n", c->name, + -rc); + return rc; + } + + size = hrt->num_entries * hrt->entry_len << 2; + if (size > c->hrt.len) { + if (i2o_dma_realloc(dev, &c->hrt, size)) + return -ENOMEM; + else + hrt = c->hrt.virt; + } else + return i2o_parse_hrt(c); + } + + osm_err("%s: Unable to get HRT after %d tries, giving up\n", c->name, + I2O_HRT_GET_TRIES); + + return -EBUSY; +} + +/** + * i2o_iop_release - release the memory for a I2O controller + * @dev: I2O controller which should be released + * + * Release the allocated memory. This function is called if refcount of + * device reaches 0 automatically. + */ +static void i2o_iop_release(struct device *dev) +{ + struct i2o_controller *c = to_i2o_controller(dev); + + i2o_iop_free(c); +}; + +/** + * i2o_iop_alloc - Allocate and initialize a i2o_controller struct + * + * Allocate the necessary memory for a i2o_controller struct and + * initialize the lists and message mempool. + * + * Returns a pointer to the I2O controller or a negative error code on + * failure. 
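i2o_hrt_get() above is a textbook negotiate-and-retry read: issue the request with the current buffer, let the IOP report how big the table really is, grow the buffer, try again, and give up after a fixed number of rounds. The bare shape of that loop as a userspace sketch; fetch() and parse() are placeholders standing in for the message exchange and the HRT handling:

#include <stdlib.h>

/* placeholders for the I2O message exchange and HRT processing */
extern size_t fetch(void *buf, size_t len);	/* returns size device wants */
extern void parse(const void *buf);

static int fetch_with_retry(void)
{
	size_t len = 256;	/* initial guess, like sizeof(i2o_hrt) above */
	void *buf = malloc(len);

	if (!buf)
		return -1;
	for (int i = 0; i < 3; i++) {	/* cf. I2O_HRT_GET_TRIES */
		size_t needed = fetch(buf, len);

		if (needed <= len) {	/* reply fit: done */
			parse(buf);
			free(buf);
			return 0;
		}
		void *bigger = realloc(buf, needed);	/* grow and retry */
		if (!bigger)
			break;
		buf = bigger;
		len = needed;
	}
	free(buf);
	return -1;	/* kept growing; give up like i2o_hrt_get() */
}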
+ */
+struct i2o_controller *i2o_iop_alloc(void)
+{
+	static int unit = 0;	/* 0 and 1 are NULL IOP and Local Host */
+	struct i2o_controller *c;
+	char poolname[32];
+
+	c = kzalloc(sizeof(*c), GFP_KERNEL);
+	if (!c) {
+		osm_err("i2o: Insufficient memory to allocate an I2O controller.\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	c->unit = unit++;
+	sprintf(c->name, "iop%d", c->unit);
+
+	snprintf(poolname, sizeof(poolname), "i2o_%s_msg_inpool", c->name);
+	if (i2o_pool_alloc
+	    (&c->in_msg, poolname, I2O_INBOUND_MSG_FRAME_SIZE * 4 + sizeof(u32),
+	     I2O_MSG_INPOOL_MIN)) {
+		kfree(c);
+		return ERR_PTR(-ENOMEM);
+	};
+
+	INIT_LIST_HEAD(&c->devices);
+	spin_lock_init(&c->lock);
+	mutex_init(&c->lct_lock);
+
+	device_initialize(&c->device);
+
+	c->device.release = &i2o_iop_release;
+
+	dev_set_name(&c->device, "iop%d", c->unit);
+
+#if BITS_PER_LONG == 64
+	spin_lock_init(&c->context_list_lock);
+	atomic_set(&c->context_list_counter, 0);
+	INIT_LIST_HEAD(&c->context_list);
+#endif
+
+	return c;
+};
+
+/**
+ * i2o_iop_add - Initialize the I2O controller and add it to the I2O core
+ * @c: controller
+ *
+ * Initialize the I2O controller and, if no error occurs, add it to the
+ * I2O core.
+ *
+ * Returns 0 on success or negative error code on failure.
+ */
+int i2o_iop_add(struct i2o_controller *c)
+{
+	int rc;
+
+	if ((rc = device_add(&c->device))) {
+		osm_err("%s: could not add controller\n", c->name);
+		goto iop_reset;
+	}
+
+	osm_info("%s: Activating I2O controller...\n", c->name);
+	osm_info("%s: This may take a few minutes if there are many devices\n",
+		 c->name);
+
+	if ((rc = i2o_iop_activate(c))) {
+		osm_err("%s: could not activate controller\n", c->name);
+		goto device_del;
+	}
+
+	osm_debug("%s: building sys table...\n", c->name);
+
+	if ((rc = i2o_systab_build()))
+		goto device_del;
+
+	osm_debug("%s: online controller...\n", c->name);
+
+	if ((rc = i2o_iop_online(c)))
+		goto device_del;
+
+	osm_debug("%s: getting LCT...\n", c->name);
+
+	if ((rc = i2o_exec_lct_get(c)))
+		goto device_del;
+
+	list_add(&c->list, &i2o_controllers);
+
+	i2o_driver_notify_controller_add_all(c);
+
+	osm_info("%s: Controller added\n", c->name);
+
+	return 0;
+
+      device_del:
+	device_del(&c->device);
+
+      iop_reset:
+	i2o_iop_reset(c);
+
+	return rc;
+};
+
+/**
+ * i2o_event_register - Turn on/off event notification for an I2O device
+ * @dev: I2O device which should receive the event registration request
+ * @drv: driver which wants to get notified
+ * @tcntxt: transaction context to use with this notifier
+ * @evt_mask: mask of events
+ *
+ * Creates and posts an event registration message to the task. No reply
+ * is waited for, or expected. If you do not want further notifications,
+ * call i2o_event_register() again with an evt_mask of 0.
+ *
+ * Returns 0 on success or negative error code on failure.
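Since the register/unregister contract above is described only in prose, here is an illustrative caller sketch: an OSM subscribes with a non-zero event mask and later unsubscribes by re-registering with a mask of 0. The driver variable and transaction context are assumed names, not part of the patch:

```c
/* Hypothetical OSM subscribing to all events on a device. */
#define MY_EVT_TCNTXT 0x1234	/* assumed transaction context */

rc = i2o_event_register(i2o_dev, &my_driver, MY_EVT_TCNTXT, 0xffffffff);
if (rc)
	return rc;

/* ... event replies now arrive through my_driver's handlers ... */

/* a mask of 0 turns notification off again */
i2o_event_register(i2o_dev, &my_driver, MY_EVT_TCNTXT, 0);
```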
+ */ +int i2o_event_register(struct i2o_device *dev, struct i2o_driver *drv, + int tcntxt, u32 evt_mask) +{ + struct i2o_controller *c = dev->iop; + struct i2o_message *msg; + + msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET); + if (IS_ERR(msg)) + return PTR_ERR(msg); + + msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0); + msg->u.head[1] = + cpu_to_le32(I2O_CMD_UTIL_EVT_REGISTER << 24 | HOST_TID << 12 | dev-> + lct_data.tid); + msg->u.s.icntxt = cpu_to_le32(drv->context); + msg->u.s.tcntxt = cpu_to_le32(tcntxt); + msg->body[0] = cpu_to_le32(evt_mask); + + i2o_msg_post(c, msg); + + return 0; +}; + +/** + * i2o_iop_init - I2O main initialization function + * + * Initialize the I2O drivers (OSM) functions, register the Executive OSM, + * initialize the I2O PCI part and finally initialize I2O device stuff. + * + * Returns 0 on success or negative error code on failure. + */ +static int __init i2o_iop_init(void) +{ + int rc = 0; + + printk(KERN_INFO OSM_DESCRIPTION " v" OSM_VERSION "\n"); + + if ((rc = i2o_driver_init())) + goto exit; + + if ((rc = i2o_exec_init())) + goto driver_exit; + + if ((rc = i2o_pci_init())) + goto exec_exit; + + return 0; + + exec_exit: + i2o_exec_exit(); + + driver_exit: + i2o_driver_exit(); + + exit: + return rc; +} + +/** + * i2o_iop_exit - I2O main exit function + * + * Removes I2O controllers from PCI subsystem and shut down OSMs. + */ +static void __exit i2o_iop_exit(void) +{ + i2o_pci_exit(); + i2o_exec_exit(); + i2o_driver_exit(); +}; + +module_init(i2o_iop_init); +module_exit(i2o_iop_exit); + +MODULE_AUTHOR("Red Hat Software"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION(OSM_DESCRIPTION); +MODULE_VERSION(OSM_VERSION); + +#if BITS_PER_LONG == 64 +EXPORT_SYMBOL(i2o_cntxt_list_add); +EXPORT_SYMBOL(i2o_cntxt_list_get); +EXPORT_SYMBOL(i2o_cntxt_list_remove); +EXPORT_SYMBOL(i2o_cntxt_list_get_ptr); +#endif +EXPORT_SYMBOL(i2o_msg_get_wait); +EXPORT_SYMBOL(i2o_find_iop); +EXPORT_SYMBOL(i2o_iop_find_device); +EXPORT_SYMBOL(i2o_event_register); +EXPORT_SYMBOL(i2o_status_get); +EXPORT_SYMBOL(i2o_controllers); diff --git a/drivers/staging/i2o/memory.c b/drivers/staging/i2o/memory.c new file mode 100644 index 000000000000..8f9509d275a4 --- /dev/null +++ b/drivers/staging/i2o/memory.c @@ -0,0 +1,313 @@ +/* + * Functions to handle I2O memory + * + * Pulled from the inlines in i2o headers and uninlined + * + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +#include +#include "i2o.h" +#include +#include +#include +#include "core.h" + +/* Protects our 32/64bit mask switching */ +static DEFINE_MUTEX(mem_lock); + +/** + * i2o_sg_tablesize - Calculate the maximum number of elements in a SGL + * @c: I2O controller for which the calculation should be done + * @body_size: maximum body size used for message in 32-bit words. + * + * Return the maximum number of SG elements in a SG list. + */ +u16 i2o_sg_tablesize(struct i2o_controller *c, u16 body_size) +{ + i2o_status_block *sb = c->status_block.virt; + u16 sg_count = + (sb->inbound_frame_size - sizeof(struct i2o_message) / 4) - + body_size; + + if (c->pae_support) { + /* + * for 64-bit a SG attribute element must be added and each + * SG element needs 12 bytes instead of 8. 
+ */ + sg_count -= 2; + sg_count /= 3; + } else + sg_count /= 2; + + if (c->short_req && (sg_count > 8)) + sg_count = 8; + + return sg_count; +} +EXPORT_SYMBOL_GPL(i2o_sg_tablesize); + + +/** + * i2o_dma_map_single - Map pointer to controller and fill in I2O message. + * @c: I2O controller + * @ptr: pointer to the data which should be mapped + * @size: size of data in bytes + * @direction: DMA_TO_DEVICE / DMA_FROM_DEVICE + * @sg_ptr: pointer to the SG list inside the I2O message + * + * This function does all necessary DMA handling and also writes the I2O + * SGL elements into the I2O message. For details on DMA handling see also + * dma_map_single(). The pointer sg_ptr will only be set to the end of the + * SG list if the allocation was successful. + * + * Returns DMA address which must be checked for failures using + * dma_mapping_error(). + */ +dma_addr_t i2o_dma_map_single(struct i2o_controller *c, void *ptr, + size_t size, + enum dma_data_direction direction, + u32 ** sg_ptr) +{ + u32 sg_flags; + u32 *mptr = *sg_ptr; + dma_addr_t dma_addr; + + switch (direction) { + case DMA_TO_DEVICE: + sg_flags = 0xd4000000; + break; + case DMA_FROM_DEVICE: + sg_flags = 0xd0000000; + break; + default: + return 0; + } + + dma_addr = dma_map_single(&c->pdev->dev, ptr, size, direction); + if (!dma_mapping_error(&c->pdev->dev, dma_addr)) { +#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64 + if ((sizeof(dma_addr_t) > 4) && c->pae_support) { + *mptr++ = cpu_to_le32(0x7C020002); + *mptr++ = cpu_to_le32(PAGE_SIZE); + } +#endif + + *mptr++ = cpu_to_le32(sg_flags | size); + *mptr++ = cpu_to_le32(i2o_dma_low(dma_addr)); +#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64 + if ((sizeof(dma_addr_t) > 4) && c->pae_support) + *mptr++ = cpu_to_le32(i2o_dma_high(dma_addr)); +#endif + *sg_ptr = mptr; + } + return dma_addr; +} +EXPORT_SYMBOL_GPL(i2o_dma_map_single); + +/** + * i2o_dma_map_sg - Map a SG List to controller and fill in I2O message. + * @c: I2O controller + * @sg: SG list to be mapped + * @sg_count: number of elements in the SG list + * @direction: DMA_TO_DEVICE / DMA_FROM_DEVICE + * @sg_ptr: pointer to the SG list inside the I2O message + * + * This function does all necessary DMA handling and also writes the I2O + * SGL elements into the I2O message. For details on DMA handling see also + * dma_map_sg(). The pointer sg_ptr will only be set to the end of the SG + * list if the allocation was successful. + * + * Returns 0 on failure or 1 on success. 
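Before moving on to the SG mapping helpers, the sizing rule in i2o_sg_tablesize() above is easiest to see with numbers. The following worked example assumes a 128-word inbound frame, a 10-word message body, and sizeof(struct i2o_message) = 16 bytes (4 words); all values are illustrative, not taken from the patch:

```c
/* Worked example of the i2o_sg_tablesize() arithmetic (values assumed). */
bool pae_support = true;	/* assumed controller capabilities */
bool short_req = false;
u16 sg_count = 128 - 16 / 4 - 10;	/* 114 words left for the SG list */

if (pae_support) {		/* 64-bit SGLs */
	sg_count -= 2;		/* 2-word PAGE_SIZE attribute element */
	sg_count /= 3;		/* 3 words per element -> 37 */
} else {
	sg_count /= 2;		/* 2 words per element -> 57 */
}

if (short_req && sg_count > 8)
	sg_count = 8;		/* small-frame controllers cap at 8 */
```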
+ */
+int i2o_dma_map_sg(struct i2o_controller *c, struct scatterlist *sg,
+		   int sg_count, enum dma_data_direction direction, u32 ** sg_ptr)
+{
+	u32 sg_flags;
+	u32 *mptr = *sg_ptr;
+
+	switch (direction) {
+	case DMA_TO_DEVICE:
+		sg_flags = 0x14000000;
+		break;
+	case DMA_FROM_DEVICE:
+		sg_flags = 0x10000000;
+		break;
+	default:
+		return 0;
+	}
+
+	sg_count = dma_map_sg(&c->pdev->dev, sg, sg_count, direction);
+	if (!sg_count)
+		return 0;
+
+#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
+	if ((sizeof(dma_addr_t) > 4) && c->pae_support) {
+		*mptr++ = cpu_to_le32(0x7C020002);
+		*mptr++ = cpu_to_le32(PAGE_SIZE);
+	}
+#endif
+
+	while (sg_count-- > 0) {
+		if (!sg_count)
+			sg_flags |= 0xC0000000;
+		*mptr++ = cpu_to_le32(sg_flags | sg_dma_len(sg));
+		*mptr++ = cpu_to_le32(i2o_dma_low(sg_dma_address(sg)));
+#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
+		if ((sizeof(dma_addr_t) > 4) && c->pae_support)
+			*mptr++ = cpu_to_le32(i2o_dma_high(sg_dma_address(sg)));
+#endif
+		sg = sg_next(sg);
+	}
+	*sg_ptr = mptr;
+
+	return 1;
+}
+EXPORT_SYMBOL_GPL(i2o_dma_map_sg);
+
+/**
+ * i2o_dma_alloc - Allocate DMA memory
+ * @dev: struct device pointer to the PCI device of the I2O controller
+ * @addr: i2o_dma struct which should get the DMA buffer
+ * @len: length of the new DMA memory
+ *
+ * Allocate coherent DMA memory and write the pointers into addr.
+ *
+ * Returns 0 on success or -ENOMEM on failure.
+ */
+int i2o_dma_alloc(struct device *dev, struct i2o_dma *addr, size_t len)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	int dma_64 = 0;
+
+	mutex_lock(&mem_lock);
+	if ((sizeof(dma_addr_t) > 4) && (pdev->dma_mask == DMA_BIT_MASK(64))) {
+		dma_64 = 1;
+		if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) {
+			mutex_unlock(&mem_lock);
+			return -ENOMEM;
+		}
+	}
+
+	addr->virt = dma_alloc_coherent(dev, len, &addr->phys, GFP_KERNEL);
+
+	if ((sizeof(dma_addr_t) > 4) && dma_64)
+		if (pci_set_dma_mask(pdev, DMA_BIT_MASK(64)))
+			printk(KERN_WARNING "i2o: unable to set 64-bit DMA\n");
+	mutex_unlock(&mem_lock);
+
+	if (!addr->virt)
+		return -ENOMEM;
+
+	memset(addr->virt, 0, len);
+	addr->len = len;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(i2o_dma_alloc);
+
+
+/**
+ * i2o_dma_free - Free DMA memory
+ * @dev: struct device pointer to the PCI device of the I2O controller
+ * @addr: i2o_dma struct which contains the DMA buffer
+ *
+ * Free coherent DMA memory and set the virtual address of addr to NULL.
+ */
+void i2o_dma_free(struct device *dev, struct i2o_dma *addr)
+{
+	if (addr->virt) {
+		if (addr->phys)
+			dma_free_coherent(dev, addr->len, addr->virt,
+					  addr->phys);
+		else
+			kfree(addr->virt);
+		addr->virt = NULL;
+	}
+}
+EXPORT_SYMBOL_GPL(i2o_dma_free);
+
+
+/**
+ * i2o_dma_realloc - Realloc DMA memory
+ * @dev: struct device pointer to the PCI device of the I2O controller
+ * @addr: pointer to an i2o_dma struct DMA buffer
+ * @len: new length of memory
+ *
+ * If there was something allocated in addr, free it first. If len > 0,
+ * then try to allocate it and write the addresses back to the addr
+ * structure. If len == 0, set the virtual address to NULL.
+ *
+ * Returns 0 on success or negative error code on failure.
+ */
+int i2o_dma_realloc(struct device *dev, struct i2o_dma *addr, size_t len)
+{
+	i2o_dma_free(dev, addr);
+
+	if (len)
+		return i2o_dma_alloc(dev, addr, len);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(i2o_dma_realloc);
+
+/**
+ * i2o_pool_alloc - Allocate a slab cache and mempool
+ * @pool: pointer to struct i2o_pool to write data into.
+ * @name: name which is used to identify cache
+ * @size: size of each object
+ * @min_nr: minimum number of objects
+ *
+ * First allocates a slab cache with name and size. Then allocates a
+ * mempool which uses the slab cache for allocation and freeing.
+ *
+ * Returns 0 on success or negative error code on failure.
+ */
+int i2o_pool_alloc(struct i2o_pool *pool, const char *name,
+		   size_t size, int min_nr)
+{
+	pool->name = kmalloc(strlen(name) + 1, GFP_KERNEL);
+	if (!pool->name)
+		goto exit;
+	strcpy(pool->name, name);
+
+	pool->slab =
+	    kmem_cache_create(pool->name, size, 0, SLAB_HWCACHE_ALIGN, NULL);
+	if (!pool->slab)
+		goto free_name;
+
+	pool->mempool = mempool_create_slab_pool(min_nr, pool->slab);
+	if (!pool->mempool)
+		goto free_slab;
+
+	return 0;
+
+free_slab:
+	kmem_cache_destroy(pool->slab);
+
+free_name:
+	kfree(pool->name);
+
+exit:
+	return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(i2o_pool_alloc);
+
+/**
+ * i2o_pool_free - Free the slab cache and mempool
+ * @pool: pointer to struct i2o_pool which should be freed
+ *
+ * Note that you have to return all objects to the mempool before
+ * calling i2o_pool_free().
+ */
+void i2o_pool_free(struct i2o_pool *pool)
+{
+	mempool_destroy(pool->mempool);
+	kmem_cache_destroy(pool->slab);
+	kfree(pool->name);
+};
+EXPORT_SYMBOL_GPL(i2o_pool_free);
diff --git a/drivers/staging/i2o/pci.c b/drivers/staging/i2o/pci.c
new file mode 100644
index 000000000000..b3b8a61dd4a6
--- /dev/null
+++ b/drivers/staging/i2o/pci.c
@@ -0,0 +1,497 @@
+/*
+ * PCI handling of I2O controller
+ *
+ * Copyright (C) 1999-2002 Red Hat Software
+ *
+ * Written by Alan Cox, Building Number Three Ltd
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * A lot of the I2O message side code from this is taken from the Red
+ * Creek RCPCI45 adapter driver by Red Creek Communications
+ *
+ * Fixes/additions:
+ *	Philipp Rumpf
+ *	Juha Sievänen
+ *	Auvo Häkkinen
+ *	Deepak Saxena
+ *	Boji T Kannanthanam
+ *	Alan Cox :
+ *		Ported to Linux 2.5.
+ *	Markus Lidel :
+ *		Minor fixes for 2.6.
+ *	Markus Lidel :
+ *		Support for sysfs included.
+ */
+
+#include
+#include
+#include
+#include "i2o.h"
+#include
+#include "core.h"
+
+#define OSM_DESCRIPTION	"I2O-subsystem"
+
+/* PCI device id table for all I2O controllers */
+static struct pci_device_id i2o_pci_ids[] = {
+	{PCI_DEVICE_CLASS(PCI_CLASS_INTELLIGENT_I2O << 8, 0xffff00)},
+	{PCI_DEVICE(PCI_VENDOR_ID_DPT, 0xa511)},
+	{.vendor = PCI_VENDOR_ID_INTEL,.device = 0x1962,
+	 .subvendor = PCI_VENDOR_ID_PROMISE,.subdevice = PCI_ANY_ID},
+	{0}
+};
+
+/**
+ * i2o_pci_free - Frees the DMA memory for the I2O controller
+ * @c: I2O controller to free
+ *
+ * Remove all allocated DMA memory and unmap memory IO regions. If MTRR
+ * is enabled, also remove it again.
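The i2o_pool_alloc()/i2o_pool_free() pair just above ties a named slab cache to a mempool; consumers then allocate and free frames only through the mempool. An illustrative usage sketch, where the pool name, object size and reserve count are assumed values:

```c
struct i2o_pool pool;
void *frame;

if (i2o_pool_alloc(&pool, "i2o_example_pool", 512, 32))	/* values assumed */
	return -ENOMEM;

frame = mempool_alloc(pool.mempool, GFP_KERNEL);
if (frame) {
	/* ... use the 512-byte object ... */
	mempool_free(frame, pool.mempool);
}

/* every object must be back in the pool before teardown */
i2o_pool_free(&pool);
```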
+ */ +static void i2o_pci_free(struct i2o_controller *c) +{ + struct device *dev; + + dev = &c->pdev->dev; + + i2o_dma_free(dev, &c->out_queue); + i2o_dma_free(dev, &c->status_block); + kfree(c->lct); + i2o_dma_free(dev, &c->dlct); + i2o_dma_free(dev, &c->hrt); + i2o_dma_free(dev, &c->status); + + if (c->raptor && c->in_queue.virt) + iounmap(c->in_queue.virt); + + if (c->base.virt) + iounmap(c->base.virt); + + pci_release_regions(c->pdev); +} + +/** + * i2o_pci_alloc - Allocate DMA memory, map IO memory for I2O controller + * @c: I2O controller + * + * Allocate DMA memory for a PCI (or in theory AGP) I2O controller. All + * IO mappings are also done here. If MTRR is enabled, also do add memory + * regions here. + * + * Returns 0 on success or negative error code on failure. + */ +static int i2o_pci_alloc(struct i2o_controller *c) +{ + struct pci_dev *pdev = c->pdev; + struct device *dev = &pdev->dev; + int i; + + if (pci_request_regions(pdev, OSM_DESCRIPTION)) { + printk(KERN_ERR "%s: device already claimed\n", c->name); + return -ENODEV; + } + + for (i = 0; i < 6; i++) { + /* Skip I/O spaces */ + if (!(pci_resource_flags(pdev, i) & IORESOURCE_IO)) { + if (!c->base.phys) { + c->base.phys = pci_resource_start(pdev, i); + c->base.len = pci_resource_len(pdev, i); + + /* + * If we know what card it is, set the size + * correctly. Code is taken from dpt_i2o.c + */ + if (pdev->device == 0xa501) { + if (pdev->subsystem_device >= 0xc032 && + pdev->subsystem_device <= 0xc03b) { + if (c->base.len > 0x400000) + c->base.len = 0x400000; + } else { + if (c->base.len > 0x100000) + c->base.len = 0x100000; + } + } + if (!c->raptor) + break; + } else { + c->in_queue.phys = pci_resource_start(pdev, i); + c->in_queue.len = pci_resource_len(pdev, i); + break; + } + } + } + + if (i == 6) { + printk(KERN_ERR "%s: I2O controller has no memory regions" + " defined.\n", c->name); + i2o_pci_free(c); + return -EINVAL; + } + + /* Map the I2O controller */ + if (c->raptor) { + printk(KERN_INFO "%s: PCI I2O controller\n", c->name); + printk(KERN_INFO " BAR0 at 0x%08lX size=%ld\n", + (unsigned long)c->base.phys, (unsigned long)c->base.len); + printk(KERN_INFO " BAR1 at 0x%08lX size=%ld\n", + (unsigned long)c->in_queue.phys, + (unsigned long)c->in_queue.len); + } else + printk(KERN_INFO "%s: PCI I2O controller at %08lX size=%ld\n", + c->name, (unsigned long)c->base.phys, + (unsigned long)c->base.len); + + c->base.virt = ioremap_nocache(c->base.phys, c->base.len); + if (!c->base.virt) { + printk(KERN_ERR "%s: Unable to map controller.\n", c->name); + i2o_pci_free(c); + return -ENOMEM; + } + + if (c->raptor) { + c->in_queue.virt = + ioremap_nocache(c->in_queue.phys, c->in_queue.len); + if (!c->in_queue.virt) { + printk(KERN_ERR "%s: Unable to map controller.\n", + c->name); + i2o_pci_free(c); + return -ENOMEM; + } + } else + c->in_queue = c->base; + + c->irq_status = c->base.virt + I2O_IRQ_STATUS; + c->irq_mask = c->base.virt + I2O_IRQ_MASK; + c->in_port = c->base.virt + I2O_IN_PORT; + c->out_port = c->base.virt + I2O_OUT_PORT; + + /* Motorola/Freescale chip does not follow spec */ + if (pdev->vendor == PCI_VENDOR_ID_MOTOROLA && pdev->device == 0x18c0) { + /* Check if CPU is enabled */ + if (be32_to_cpu(readl(c->base.virt + 0x10000)) & 0x10000000) { + printk(KERN_INFO "%s: MPC82XX needs CPU running to " + "service I2O.\n", c->name); + i2o_pci_free(c); + return -ENODEV; + } else { + c->irq_status += I2O_MOTOROLA_PORT_OFFSET; + c->irq_mask += I2O_MOTOROLA_PORT_OFFSET; + c->in_port += I2O_MOTOROLA_PORT_OFFSET; + c->out_port += 
I2O_MOTOROLA_PORT_OFFSET;
+			printk(KERN_INFO "%s: MPC82XX workarounds activated.\n",
+			       c->name);
+		}
+	}
+
+	if (i2o_dma_alloc(dev, &c->status, 8)) {
+		i2o_pci_free(c);
+		return -ENOMEM;
+	}
+
+	if (i2o_dma_alloc(dev, &c->hrt, sizeof(i2o_hrt))) {
+		i2o_pci_free(c);
+		return -ENOMEM;
+	}
+
+	if (i2o_dma_alloc(dev, &c->dlct, 8192)) {
+		i2o_pci_free(c);
+		return -ENOMEM;
+	}
+
+	if (i2o_dma_alloc(dev, &c->status_block, sizeof(i2o_status_block))) {
+		i2o_pci_free(c);
+		return -ENOMEM;
+	}
+
+	if (i2o_dma_alloc(dev, &c->out_queue,
+			  I2O_MAX_OUTBOUND_MSG_FRAMES * I2O_OUTBOUND_MSG_FRAME_SIZE *
+			  sizeof(u32))) {
+		i2o_pci_free(c);
+		return -ENOMEM;
+	}
+
+	pci_set_drvdata(pdev, c);
+
+	return 0;
+}
+
+/**
+ * i2o_pci_interrupt - Interrupt handler for I2O controller
+ * @irq: interrupt line
+ * @dev_id: pointer to the I2O controller
+ *
+ * Handle an interrupt from a PCI based I2O controller. This turns out
+ * to be rather simple. We keep the controller pointer in the cookie.
+ */
+static irqreturn_t i2o_pci_interrupt(int irq, void *dev_id)
+{
+	struct i2o_controller *c = dev_id;
+	u32 m;
+	irqreturn_t rc = IRQ_NONE;
+
+	while (readl(c->irq_status) & I2O_IRQ_OUTBOUND_POST) {
+		m = readl(c->out_port);
+		if (m == I2O_QUEUE_EMPTY) {
+			/*
+			 * Old 960 steppings had a bug in the I2O unit that
+			 * caused the queue to appear empty when it wasn't.
+			 */
+			m = readl(c->out_port);
+			if (unlikely(m == I2O_QUEUE_EMPTY))
+				break;
+		}
+
+		/* dispatch it */
+		if (i2o_driver_dispatch(c, m))
+			/* flush it if result != 0 */
+			i2o_flush_reply(c, m);
+
+		rc = IRQ_HANDLED;
+	}
+
+	return rc;
+}
+
+/**
+ * i2o_pci_irq_enable - Allocate interrupt for I2O controller
+ * @c: i2o_controller that the request is for
+ *
+ * Allocate an interrupt for the I2O controller, and activate interrupts
+ * on the I2O controller.
+ *
+ * Returns 0 on success or negative error code on failure.
+ */
+static int i2o_pci_irq_enable(struct i2o_controller *c)
+{
+	struct pci_dev *pdev = c->pdev;
+	int rc;
+
+	writel(0xffffffff, c->irq_mask);
+
+	if (pdev->irq) {
+		rc = request_irq(pdev->irq, i2o_pci_interrupt, IRQF_SHARED,
+				 c->name, c);
+		if (rc < 0) {
+			printk(KERN_ERR "%s: unable to allocate interrupt %d.\n",
+			       c->name, pdev->irq);
+			return rc;
+		}
+	}
+
+	writel(0x00000000, c->irq_mask);
+
+	printk(KERN_INFO "%s: Installed at IRQ %d\n", c->name, pdev->irq);
+
+	return 0;
+}
+
+/**
+ * i2o_pci_irq_disable - Free interrupt for I2O controller
+ * @c: I2O controller
+ *
+ * Disable interrupts on the I2O controller and then free the interrupt.
+ */
+static void i2o_pci_irq_disable(struct i2o_controller *c)
+{
+	writel(0xffffffff, c->irq_mask);
+
+	if (c->pdev->irq > 0)
+		free_irq(c->pdev->irq, c);
+}
+
+/**
+ * i2o_pci_probe - Probe the PCI device for an I2O controller
+ * @pdev: PCI device to test
+ * @id: matching entry in the PCI device id table
+ *
+ * Probe the PCI device for any device which is a member of the
+ * Intelligent I/O (I2O) class or an Adaptec Zero Channel Controller. We
+ * attempt to set up each such device and register it with the core.
+ *
+ * Returns 0 on success or negative error code on failure.
+ */
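One detail of i2o_pci_interrupt() above deserves emphasis: the line is requested with IRQF_SHARED, so the handler must return IRQ_NONE whenever the controller shows no pending outbound post, which lets the kernel offer the interrupt to the other devices sharing the line. The shape, condensed to a sketch:

```c
/* Condensed shape of a shared-line handler like i2o_pci_interrupt(). */
static irqreturn_t example_isr(int irq, void *dev_id)
{
	struct i2o_controller *c = dev_id;
	irqreturn_t rc = IRQ_NONE;

	while (readl(c->irq_status) & I2O_IRQ_OUTBOUND_POST) {
		/* ... pop and dispatch one reply frame ... */
		rc = IRQ_HANDLED;	/* we consumed at least one event */
	}

	return rc;	/* IRQ_NONE lets the other sharers be tried */
}
```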
+static int i2o_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	struct i2o_controller *c;
+	int rc;
+	struct pci_dev *i960 = NULL;
+
+	printk(KERN_INFO "i2o: Checking for PCI I2O controllers...\n");
+
+	if ((pdev->class & 0xff) > 1) {
+		printk(KERN_WARNING "i2o: %s does not support I2O 1.5 "
+		       "(skipping).\n", pci_name(pdev));
+		return -ENODEV;
+	}
+
+	if ((rc = pci_enable_device(pdev))) {
+		printk(KERN_WARNING "i2o: couldn't enable device %s\n",
+		       pci_name(pdev));
+		return rc;
+	}
+
+	if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) {
+		printk(KERN_WARNING "i2o: no suitable DMA found for %s\n",
+		       pci_name(pdev));
+		rc = -ENODEV;
+		goto disable;
+	}
+
+	pci_set_master(pdev);
+
+	c = i2o_iop_alloc();
+	if (IS_ERR(c)) {
+		printk(KERN_ERR "i2o: couldn't allocate memory for %s\n",
+		       pci_name(pdev));
+		rc = PTR_ERR(c);
+		goto disable;
+	} else
+		printk(KERN_INFO "%s: controller found (%s)\n", c->name,
+		       pci_name(pdev));
+
+	c->pdev = pdev;
+	c->device.parent = &pdev->dev;
+
+	/* Cards that fall apart if you hit them with large I/O loads... */
+	if (pdev->vendor == PCI_VENDOR_ID_NCR && pdev->device == 0x0630) {
+		c->short_req = 1;
+		printk(KERN_INFO "%s: Symbios FC920 workarounds activated.\n",
+		       c->name);
+	}
+
+	if (pdev->subsystem_vendor == PCI_VENDOR_ID_PROMISE) {
+		/*
+		 * Expose the chip behind the i960 for initialization, or it
+		 * will fail
+		 */
+		i960 = pci_get_slot(c->pdev->bus,
+				    PCI_DEVFN(PCI_SLOT(c->pdev->devfn), 0));
+
+		if (i960) {
+			pci_write_config_word(i960, 0x42, 0);
+			pci_dev_put(i960);
+		}
+
+		c->promise = 1;
+		c->limit_sectors = 1;
+	}
+
+	if (pdev->subsystem_vendor == PCI_VENDOR_ID_DPT)
+		c->adaptec = 1;
+
+	/* Cards that go bananas if you quiesce them before you reset them. */
+	if (pdev->vendor == PCI_VENDOR_ID_DPT) {
+		c->no_quiesce = 1;
+		if (pdev->device == 0xa511)
+			c->raptor = 1;
+
+		if (pdev->subsystem_device == 0xc05a) {
+			c->limit_sectors = 1;
+			printk(KERN_INFO
+			       "%s: limit sectors per request to %d\n", c->name,
+			       I2O_MAX_SECTORS_LIMITED);
+		}
+#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
+		if (sizeof(dma_addr_t) > 4) {
+			if (pci_set_dma_mask(pdev, DMA_BIT_MASK(64)))
+				printk(KERN_INFO "%s: 64-bit DMA unavailable\n",
+				       c->name);
+			else {
+				c->pae_support = 1;
+				printk(KERN_INFO "%s: using 64-bit DMA\n",
+				       c->name);
+			}
+		}
+#endif
+	}
+
+	if ((rc = i2o_pci_alloc(c))) {
+		printk(KERN_ERR "%s: DMA / IO allocation for I2O controller "
+		       "failed\n", c->name);
+		goto free_controller;
+	}
+
+	if (i2o_pci_irq_enable(c)) {
+		printk(KERN_ERR "%s: unable to enable interrupts for I2O "
+		       "controller\n", c->name);
+		goto free_pci;
+	}
+
+	if ((rc = i2o_iop_add(c)))
+		goto uninstall;
+
+	if (i960)
+		pci_write_config_word(i960, 0x42, 0x03ff);
+
+	return 0;
+
+      uninstall:
+	i2o_pci_irq_disable(c);
+
+      free_pci:
+	i2o_pci_free(c);
+
+      free_controller:
+	i2o_iop_free(c);
+
+      disable:
+	pci_disable_device(pdev);
+
+	return rc;
+}
+
+/**
+ * i2o_pci_remove - Removes an I2O controller from the system
+ * @pdev: I2O controller which should be removed
+ *
+ * Reset the I2O controller, disable interrupts and remove all allocated
+ * resources.
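The probe path above unwinds through labeled gotos in strict reverse order of setup, and i2o_pci_remove() repeats the same order at unplug time. The ladder, reduced to its skeleton:

```c
/* Skeleton of the i2o_pci_probe() unwind ladder. */
rc = pci_enable_device(pdev);
if (rc)
	return rc;

c = i2o_iop_alloc();			/* undone by i2o_iop_free() */
if (IS_ERR(c)) {
	rc = PTR_ERR(c);
	goto disable;
}

rc = i2o_pci_alloc(c);			/* undone by i2o_pci_free() */
if (rc)
	goto free_controller;

rc = i2o_pci_irq_enable(c);		/* undone by i2o_pci_irq_disable() */
if (rc)
	goto free_pci;

rc = i2o_iop_add(c);
if (rc)
	goto uninstall;
return 0;

uninstall:
	i2o_pci_irq_disable(c);
free_pci:
	i2o_pci_free(c);
free_controller:
	i2o_iop_free(c);
disable:
	pci_disable_device(pdev);
	return rc;
```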
+ */ +static void i2o_pci_remove(struct pci_dev *pdev) +{ + struct i2o_controller *c; + c = pci_get_drvdata(pdev); + + i2o_iop_remove(c); + i2o_pci_irq_disable(c); + i2o_pci_free(c); + + pci_disable_device(pdev); + + printk(KERN_INFO "%s: Controller removed.\n", c->name); + + put_device(&c->device); +}; + +/* PCI driver for I2O controller */ +static struct pci_driver i2o_pci_driver = { + .name = "PCI_I2O", + .id_table = i2o_pci_ids, + .probe = i2o_pci_probe, + .remove = i2o_pci_remove, +}; + +/** + * i2o_pci_init - registers I2O PCI driver in PCI subsystem + * + * Returns > 0 on success or negative error code on failure. + */ +int __init i2o_pci_init(void) +{ + return pci_register_driver(&i2o_pci_driver); +}; + +/** + * i2o_pci_exit - unregisters I2O PCI driver from PCI subsystem + */ +void __exit i2o_pci_exit(void) +{ + pci_unregister_driver(&i2o_pci_driver); +}; + +MODULE_DEVICE_TABLE(pci, i2o_pci_ids); diff --git a/include/linux/i2o.h b/include/linux/i2o.h deleted file mode 100644 index d23c3c20b201..000000000000 --- a/include/linux/i2o.h +++ /dev/null @@ -1,988 +0,0 @@ -/* - * I2O kernel space accessible structures/APIs - * - * (c) Copyright 1999, 2000 Red Hat Software - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - ************************************************************************* - * - * This header file defined the I2O APIs/structures for use by - * the I2O kernel modules. - * - */ - -#ifndef _I2O_H -#define _I2O_H - -#include - -/* How many different OSM's are we allowing */ -#define I2O_MAX_DRIVERS 8 - -#include -#include -#include -#include -#include -#include /* work_struct */ -#include -#include -#include -#include /* Needed for MUTEX init macros */ - -#include - -/* message queue empty */ -#define I2O_QUEUE_EMPTY 0xffffffff - -/* - * Cache strategies - */ - -/* The NULL strategy leaves everything up to the controller. This tends to be a - * pessimal but functional choice. - */ -#define CACHE_NULL 0 -/* Prefetch data when reading. We continually attempt to load the next 32 sectors - * into the controller cache. - */ -#define CACHE_PREFETCH 1 -/* Prefetch data when reading. We sometimes attempt to load the next 32 sectors - * into the controller cache. When an I/O is less <= 8K we assume its probably - * not sequential and don't prefetch (default) - */ -#define CACHE_SMARTFETCH 2 -/* Data is written to the cache and then out on to the disk. The I/O must be - * physically on the medium before the write is acknowledged (default without - * NVRAM) - */ -#define CACHE_WRITETHROUGH 17 -/* Data is written to the cache and then out on to the disk. The controller - * is permitted to write back the cache any way it wants. (default if battery - * backed NVRAM is present). It can be useful to set this for swap regardless of - * battery state. - */ -#define CACHE_WRITEBACK 18 -/* Optimise for under powered controllers, especially on RAID1 and RAID0. We - * write large I/O's directly to disk bypassing the cache to avoid the extra - * memory copy hits. Small writes are writeback cached - */ -#define CACHE_SMARTBACK 19 -/* Optimise for under powered controllers, especially on RAID1 and RAID0. We - * write large I/O's directly to disk bypassing the cache to avoid the extra - * memory copy hits. Small writes are writethrough cached. 
Suitable for devices - * lacking battery backup - */ -#define CACHE_SMARTTHROUGH 20 - -/* - * Ioctl structures - */ - -#define BLKI2OGRSTRAT _IOR('2', 1, int) -#define BLKI2OGWSTRAT _IOR('2', 2, int) -#define BLKI2OSRSTRAT _IOW('2', 3, int) -#define BLKI2OSWSTRAT _IOW('2', 4, int) - -/* - * I2O Function codes - */ - -/* - * Executive Class - */ -#define I2O_CMD_ADAPTER_ASSIGN 0xB3 -#define I2O_CMD_ADAPTER_READ 0xB2 -#define I2O_CMD_ADAPTER_RELEASE 0xB5 -#define I2O_CMD_BIOS_INFO_SET 0xA5 -#define I2O_CMD_BOOT_DEVICE_SET 0xA7 -#define I2O_CMD_CONFIG_VALIDATE 0xBB -#define I2O_CMD_CONN_SETUP 0xCA -#define I2O_CMD_DDM_DESTROY 0xB1 -#define I2O_CMD_DDM_ENABLE 0xD5 -#define I2O_CMD_DDM_QUIESCE 0xC7 -#define I2O_CMD_DDM_RESET 0xD9 -#define I2O_CMD_DDM_SUSPEND 0xAF -#define I2O_CMD_DEVICE_ASSIGN 0xB7 -#define I2O_CMD_DEVICE_RELEASE 0xB9 -#define I2O_CMD_HRT_GET 0xA8 -#define I2O_CMD_ADAPTER_CLEAR 0xBE -#define I2O_CMD_ADAPTER_CONNECT 0xC9 -#define I2O_CMD_ADAPTER_RESET 0xBD -#define I2O_CMD_LCT_NOTIFY 0xA2 -#define I2O_CMD_OUTBOUND_INIT 0xA1 -#define I2O_CMD_PATH_ENABLE 0xD3 -#define I2O_CMD_PATH_QUIESCE 0xC5 -#define I2O_CMD_PATH_RESET 0xD7 -#define I2O_CMD_STATIC_MF_CREATE 0xDD -#define I2O_CMD_STATIC_MF_RELEASE 0xDF -#define I2O_CMD_STATUS_GET 0xA0 -#define I2O_CMD_SW_DOWNLOAD 0xA9 -#define I2O_CMD_SW_UPLOAD 0xAB -#define I2O_CMD_SW_REMOVE 0xAD -#define I2O_CMD_SYS_ENABLE 0xD1 -#define I2O_CMD_SYS_MODIFY 0xC1 -#define I2O_CMD_SYS_QUIESCE 0xC3 -#define I2O_CMD_SYS_TAB_SET 0xA3 - -/* - * Utility Class - */ -#define I2O_CMD_UTIL_NOP 0x00 -#define I2O_CMD_UTIL_ABORT 0x01 -#define I2O_CMD_UTIL_CLAIM 0x09 -#define I2O_CMD_UTIL_RELEASE 0x0B -#define I2O_CMD_UTIL_PARAMS_GET 0x06 -#define I2O_CMD_UTIL_PARAMS_SET 0x05 -#define I2O_CMD_UTIL_EVT_REGISTER 0x13 -#define I2O_CMD_UTIL_EVT_ACK 0x14 -#define I2O_CMD_UTIL_CONFIG_DIALOG 0x10 -#define I2O_CMD_UTIL_DEVICE_RESERVE 0x0D -#define I2O_CMD_UTIL_DEVICE_RELEASE 0x0F -#define I2O_CMD_UTIL_LOCK 0x17 -#define I2O_CMD_UTIL_LOCK_RELEASE 0x19 -#define I2O_CMD_UTIL_REPLY_FAULT_NOTIFY 0x15 - -/* - * SCSI Host Bus Adapter Class - */ -#define I2O_CMD_SCSI_EXEC 0x81 -#define I2O_CMD_SCSI_ABORT 0x83 -#define I2O_CMD_SCSI_BUSRESET 0x27 - -/* - * Bus Adapter Class - */ -#define I2O_CMD_BUS_ADAPTER_RESET 0x85 -#define I2O_CMD_BUS_RESET 0x87 -#define I2O_CMD_BUS_SCAN 0x89 -#define I2O_CMD_BUS_QUIESCE 0x8b - -/* - * Random Block Storage Class - */ -#define I2O_CMD_BLOCK_READ 0x30 -#define I2O_CMD_BLOCK_WRITE 0x31 -#define I2O_CMD_BLOCK_CFLUSH 0x37 -#define I2O_CMD_BLOCK_MLOCK 0x49 -#define I2O_CMD_BLOCK_MUNLOCK 0x4B -#define I2O_CMD_BLOCK_MMOUNT 0x41 -#define I2O_CMD_BLOCK_MEJECT 0x43 -#define I2O_CMD_BLOCK_POWER 0x70 - -#define I2O_CMD_PRIVATE 0xFF - -/* Command status values */ - -#define I2O_CMD_IN_PROGRESS 0x01 -#define I2O_CMD_REJECTED 0x02 -#define I2O_CMD_FAILED 0x03 -#define I2O_CMD_COMPLETED 0x04 - -/* I2O API function return values */ - -#define I2O_RTN_NO_ERROR 0 -#define I2O_RTN_NOT_INIT 1 -#define I2O_RTN_FREE_Q_EMPTY 2 -#define I2O_RTN_TCB_ERROR 3 -#define I2O_RTN_TRANSACTION_ERROR 4 -#define I2O_RTN_ADAPTER_ALREADY_INIT 5 -#define I2O_RTN_MALLOC_ERROR 6 -#define I2O_RTN_ADPTR_NOT_REGISTERED 7 -#define I2O_RTN_MSG_REPLY_TIMEOUT 8 -#define I2O_RTN_NO_STATUS 9 -#define I2O_RTN_NO_FIRM_VER 10 -#define I2O_RTN_NO_LINK_SPEED 11 - -/* Reply message status defines for all messages */ - -#define I2O_REPLY_STATUS_SUCCESS 0x00 -#define I2O_REPLY_STATUS_ABORT_DIRTY 0x01 -#define I2O_REPLY_STATUS_ABORT_NO_DATA_TRANSFER 0x02 -#define 
I2O_REPLY_STATUS_ABORT_PARTIAL_TRANSFER 0x03 -#define I2O_REPLY_STATUS_ERROR_DIRTY 0x04 -#define I2O_REPLY_STATUS_ERROR_NO_DATA_TRANSFER 0x05 -#define I2O_REPLY_STATUS_ERROR_PARTIAL_TRANSFER 0x06 -#define I2O_REPLY_STATUS_PROCESS_ABORT_DIRTY 0x08 -#define I2O_REPLY_STATUS_PROCESS_ABORT_NO_DATA_TRANSFER 0x09 -#define I2O_REPLY_STATUS_PROCESS_ABORT_PARTIAL_TRANSFER 0x0A -#define I2O_REPLY_STATUS_TRANSACTION_ERROR 0x0B -#define I2O_REPLY_STATUS_PROGRESS_REPORT 0x80 - -/* Status codes and Error Information for Parameter functions */ - -#define I2O_PARAMS_STATUS_SUCCESS 0x00 -#define I2O_PARAMS_STATUS_BAD_KEY_ABORT 0x01 -#define I2O_PARAMS_STATUS_BAD_KEY_CONTINUE 0x02 -#define I2O_PARAMS_STATUS_BUFFER_FULL 0x03 -#define I2O_PARAMS_STATUS_BUFFER_TOO_SMALL 0x04 -#define I2O_PARAMS_STATUS_FIELD_UNREADABLE 0x05 -#define I2O_PARAMS_STATUS_FIELD_UNWRITEABLE 0x06 -#define I2O_PARAMS_STATUS_INSUFFICIENT_FIELDS 0x07 -#define I2O_PARAMS_STATUS_INVALID_GROUP_ID 0x08 -#define I2O_PARAMS_STATUS_INVALID_OPERATION 0x09 -#define I2O_PARAMS_STATUS_NO_KEY_FIELD 0x0A -#define I2O_PARAMS_STATUS_NO_SUCH_FIELD 0x0B -#define I2O_PARAMS_STATUS_NON_DYNAMIC_GROUP 0x0C -#define I2O_PARAMS_STATUS_OPERATION_ERROR 0x0D -#define I2O_PARAMS_STATUS_SCALAR_ERROR 0x0E -#define I2O_PARAMS_STATUS_TABLE_ERROR 0x0F -#define I2O_PARAMS_STATUS_WRONG_GROUP_TYPE 0x10 - -/* DetailedStatusCode defines for Executive, DDM, Util and Transaction error - * messages: Table 3-2 Detailed Status Codes.*/ - -#define I2O_DSC_SUCCESS 0x0000 -#define I2O_DSC_BAD_KEY 0x0002 -#define I2O_DSC_TCL_ERROR 0x0003 -#define I2O_DSC_REPLY_BUFFER_FULL 0x0004 -#define I2O_DSC_NO_SUCH_PAGE 0x0005 -#define I2O_DSC_INSUFFICIENT_RESOURCE_SOFT 0x0006 -#define I2O_DSC_INSUFFICIENT_RESOURCE_HARD 0x0007 -#define I2O_DSC_CHAIN_BUFFER_TOO_LARGE 0x0009 -#define I2O_DSC_UNSUPPORTED_FUNCTION 0x000A -#define I2O_DSC_DEVICE_LOCKED 0x000B -#define I2O_DSC_DEVICE_RESET 0x000C -#define I2O_DSC_INAPPROPRIATE_FUNCTION 0x000D -#define I2O_DSC_INVALID_INITIATOR_ADDRESS 0x000E -#define I2O_DSC_INVALID_MESSAGE_FLAGS 0x000F -#define I2O_DSC_INVALID_OFFSET 0x0010 -#define I2O_DSC_INVALID_PARAMETER 0x0011 -#define I2O_DSC_INVALID_REQUEST 0x0012 -#define I2O_DSC_INVALID_TARGET_ADDRESS 0x0013 -#define I2O_DSC_MESSAGE_TOO_LARGE 0x0014 -#define I2O_DSC_MESSAGE_TOO_SMALL 0x0015 -#define I2O_DSC_MISSING_PARAMETER 0x0016 -#define I2O_DSC_TIMEOUT 0x0017 -#define I2O_DSC_UNKNOWN_ERROR 0x0018 -#define I2O_DSC_UNKNOWN_FUNCTION 0x0019 -#define I2O_DSC_UNSUPPORTED_VERSION 0x001A -#define I2O_DSC_DEVICE_BUSY 0x001B -#define I2O_DSC_DEVICE_NOT_AVAILABLE 0x001C - -/* DetailedStatusCode defines for Block Storage Operation: Table 6-7 Detailed - Status Codes.*/ - -#define I2O_BSA_DSC_SUCCESS 0x0000 -#define I2O_BSA_DSC_MEDIA_ERROR 0x0001 -#define I2O_BSA_DSC_ACCESS_ERROR 0x0002 -#define I2O_BSA_DSC_DEVICE_FAILURE 0x0003 -#define I2O_BSA_DSC_DEVICE_NOT_READY 0x0004 -#define I2O_BSA_DSC_MEDIA_NOT_PRESENT 0x0005 -#define I2O_BSA_DSC_MEDIA_LOCKED 0x0006 -#define I2O_BSA_DSC_MEDIA_FAILURE 0x0007 -#define I2O_BSA_DSC_PROTOCOL_FAILURE 0x0008 -#define I2O_BSA_DSC_BUS_FAILURE 0x0009 -#define I2O_BSA_DSC_ACCESS_VIOLATION 0x000A -#define I2O_BSA_DSC_WRITE_PROTECTED 0x000B -#define I2O_BSA_DSC_DEVICE_RESET 0x000C -#define I2O_BSA_DSC_VOLUME_CHANGED 0x000D -#define I2O_BSA_DSC_TIMEOUT 0x000E - -/* FailureStatusCodes, Table 3-3 Message Failure Codes */ - -#define I2O_FSC_TRANSPORT_SERVICE_SUSPENDED 0x81 -#define I2O_FSC_TRANSPORT_SERVICE_TERMINATED 0x82 -#define I2O_FSC_TRANSPORT_CONGESTION 0x83 -#define 
I2O_FSC_TRANSPORT_FAILURE 0x84 -#define I2O_FSC_TRANSPORT_STATE_ERROR 0x85 -#define I2O_FSC_TRANSPORT_TIME_OUT 0x86 -#define I2O_FSC_TRANSPORT_ROUTING_FAILURE 0x87 -#define I2O_FSC_TRANSPORT_INVALID_VERSION 0x88 -#define I2O_FSC_TRANSPORT_INVALID_OFFSET 0x89 -#define I2O_FSC_TRANSPORT_INVALID_MSG_FLAGS 0x8A -#define I2O_FSC_TRANSPORT_FRAME_TOO_SMALL 0x8B -#define I2O_FSC_TRANSPORT_FRAME_TOO_LARGE 0x8C -#define I2O_FSC_TRANSPORT_INVALID_TARGET_ID 0x8D -#define I2O_FSC_TRANSPORT_INVALID_INITIATOR_ID 0x8E -#define I2O_FSC_TRANSPORT_INVALID_INITIATOR_CONTEXT 0x8F -#define I2O_FSC_TRANSPORT_UNKNOWN_FAILURE 0xFF - -/* Device Claim Types */ -#define I2O_CLAIM_PRIMARY 0x01000000 -#define I2O_CLAIM_MANAGEMENT 0x02000000 -#define I2O_CLAIM_AUTHORIZED 0x03000000 -#define I2O_CLAIM_SECONDARY 0x04000000 - -/* Message header defines for VersionOffset */ -#define I2OVER15 0x0001 -#define I2OVER20 0x0002 - -/* Default is 1.5 */ -#define I2OVERSION I2OVER15 - -#define SGL_OFFSET_0 I2OVERSION -#define SGL_OFFSET_4 (0x0040 | I2OVERSION) -#define SGL_OFFSET_5 (0x0050 | I2OVERSION) -#define SGL_OFFSET_6 (0x0060 | I2OVERSION) -#define SGL_OFFSET_7 (0x0070 | I2OVERSION) -#define SGL_OFFSET_8 (0x0080 | I2OVERSION) -#define SGL_OFFSET_9 (0x0090 | I2OVERSION) -#define SGL_OFFSET_10 (0x00A0 | I2OVERSION) -#define SGL_OFFSET_11 (0x00B0 | I2OVERSION) -#define SGL_OFFSET_12 (0x00C0 | I2OVERSION) -#define SGL_OFFSET(x) (((x)<<4) | I2OVERSION) - -/* Transaction Reply Lists (TRL) Control Word structure */ -#define TRL_SINGLE_FIXED_LENGTH 0x00 -#define TRL_SINGLE_VARIABLE_LENGTH 0x40 -#define TRL_MULTIPLE_FIXED_LENGTH 0x80 - - /* msg header defines for MsgFlags */ -#define MSG_STATIC 0x0100 -#define MSG_64BIT_CNTXT 0x0200 -#define MSG_MULTI_TRANS 0x1000 -#define MSG_FAIL 0x2000 -#define MSG_FINAL 0x4000 -#define MSG_REPLY 0x8000 - - /* minimum size msg */ -#define THREE_WORD_MSG_SIZE 0x00030000 -#define FOUR_WORD_MSG_SIZE 0x00040000 -#define FIVE_WORD_MSG_SIZE 0x00050000 -#define SIX_WORD_MSG_SIZE 0x00060000 -#define SEVEN_WORD_MSG_SIZE 0x00070000 -#define EIGHT_WORD_MSG_SIZE 0x00080000 -#define NINE_WORD_MSG_SIZE 0x00090000 -#define TEN_WORD_MSG_SIZE 0x000A0000 -#define ELEVEN_WORD_MSG_SIZE 0x000B0000 -#define I2O_MESSAGE_SIZE(x) ((x)<<16) - -/* special TID assignments */ -#define ADAPTER_TID 0 -#define HOST_TID 1 - -/* outbound queue defines */ -#define I2O_MAX_OUTBOUND_MSG_FRAMES 128 -#define I2O_OUTBOUND_MSG_FRAME_SIZE 128 /* in 32-bit words */ - -/* inbound queue definitions */ -#define I2O_MSG_INPOOL_MIN 32 -#define I2O_INBOUND_MSG_FRAME_SIZE 128 /* in 32-bit words */ - -#define I2O_POST_WAIT_OK 0 -#define I2O_POST_WAIT_TIMEOUT -ETIMEDOUT - -#define I2O_CONTEXT_LIST_MIN_LENGTH 15 -#define I2O_CONTEXT_LIST_USED 0x01 -#define I2O_CONTEXT_LIST_DELETED 0x02 - -/* timeouts */ -#define I2O_TIMEOUT_INIT_OUTBOUND_QUEUE 15 -#define I2O_TIMEOUT_MESSAGE_GET 5 -#define I2O_TIMEOUT_RESET 30 -#define I2O_TIMEOUT_STATUS_GET 5 -#define I2O_TIMEOUT_LCT_GET 360 -#define I2O_TIMEOUT_SCSI_SCB_ABORT 240 - -/* retries */ -#define I2O_HRT_GET_TRIES 3 -#define I2O_LCT_GET_TRIES 3 - -/* defines for max_sectors and max_phys_segments */ -#define I2O_MAX_SECTORS 1024 -#define I2O_MAX_SECTORS_LIMITED 128 -#define I2O_MAX_PHYS_SEGMENTS BLK_MAX_SEGMENTS - -/* - * Message structures - */ -struct i2o_message { - union { - struct { - u8 version_offset; - u8 flags; - u16 size; - u32 target_tid:12; - u32 init_tid:12; - u32 function:8; - u32 icntxt; /* initiator context */ - u32 tcntxt; /* transaction context */ - } s; - u32 head[4]; - } u; - /* List 
follows */ - u32 body[0]; -}; - -/* MFA and I2O message used by mempool */ -struct i2o_msg_mfa { - u32 mfa; /* MFA returned by the controller */ - struct i2o_message msg; /* I2O message */ -}; - -/* - * Each I2O device entity has one of these. There is one per device. - */ -struct i2o_device { - i2o_lct_entry lct_data; /* Device LCT information */ - - struct i2o_controller *iop; /* Controlling IOP */ - struct list_head list; /* node in IOP devices list */ - - struct device device; - - struct mutex lock; /* device lock */ -}; - -/* - * Event structure provided to the event handling function - */ -struct i2o_event { - struct work_struct work; - struct i2o_device *i2o_dev; /* I2O device pointer from which the - event reply was initiated */ - u16 size; /* Size of data in 32-bit words */ - u32 tcntxt; /* Transaction context used at - registration */ - u32 event_indicator; /* Event indicator from reply */ - u32 data[0]; /* Event data from reply */ -}; - -/* - * I2O classes which could be handled by the OSM - */ -struct i2o_class_id { - u16 class_id:12; -}; - -/* - * I2O driver structure for OSMs - */ -struct i2o_driver { - char *name; /* OSM name */ - int context; /* Low 8 bits of the transaction info */ - struct i2o_class_id *classes; /* I2O classes that this OSM handles */ - - /* Message reply handler */ - int (*reply) (struct i2o_controller *, u32, struct i2o_message *); - - /* Event handler */ - work_func_t event; - - struct workqueue_struct *event_queue; /* Event queue */ - - struct device_driver driver; - - /* notification of changes */ - void (*notify_controller_add) (struct i2o_controller *); - void (*notify_controller_remove) (struct i2o_controller *); - void (*notify_device_add) (struct i2o_device *); - void (*notify_device_remove) (struct i2o_device *); - - struct semaphore lock; -}; - -/* - * Contains DMA mapped address information - */ -struct i2o_dma { - void *virt; - dma_addr_t phys; - size_t len; -}; - -/* - * Contains slab cache and mempool information - */ -struct i2o_pool { - char *name; - struct kmem_cache *slab; - mempool_t *mempool; -}; - -/* - * Contains IO mapped address information - */ -struct i2o_io { - void __iomem *virt; - unsigned long phys; - unsigned long len; -}; - -/* - * Context queue entry, used for 32-bit context on 64-bit systems - */ -struct i2o_context_list_element { - struct list_head list; - u32 context; - void *ptr; - unsigned long timestamp; -}; - -/* - * Each I2O controller has one of these objects - */ -struct i2o_controller { - char name[16]; - int unit; - int type; - - struct pci_dev *pdev; /* PCI device */ - - unsigned int promise:1; /* Promise controller */ - unsigned int adaptec:1; /* DPT / Adaptec controller */ - unsigned int raptor:1; /* split bar */ - unsigned int no_quiesce:1; /* dont quiesce before reset */ - unsigned int short_req:1; /* use small block sizes */ - unsigned int limit_sectors:1; /* limit number of sectors / request */ - unsigned int pae_support:1; /* controller has 64-bit SGL support */ - - struct list_head devices; /* list of I2O devices */ - struct list_head list; /* Controller list */ - - void __iomem *in_port; /* Inbout port address */ - void __iomem *out_port; /* Outbound port address */ - void __iomem *irq_status; /* Interrupt status register address */ - void __iomem *irq_mask; /* Interrupt mask register address */ - - struct i2o_dma status; /* IOP status block */ - - struct i2o_dma hrt; /* HW Resource Table */ - i2o_lct *lct; /* Logical Config Table */ - struct i2o_dma dlct; /* Temp LCT */ - struct mutex lct_lock; /* 
Lock for LCT updates */ - struct i2o_dma status_block; /* IOP status block */ - - struct i2o_io base; /* controller messaging unit */ - struct i2o_io in_queue; /* inbound message queue Host->IOP */ - struct i2o_dma out_queue; /* outbound message queue IOP->Host */ - - struct i2o_pool in_msg; /* mempool for inbound messages */ - - unsigned int battery:1; /* Has a battery backup */ - unsigned int io_alloc:1; /* An I/O resource was allocated */ - unsigned int mem_alloc:1; /* A memory resource was allocated */ - - struct resource io_resource; /* I/O resource allocated to the IOP */ - struct resource mem_resource; /* Mem resource allocated to the IOP */ - - struct device device; - struct i2o_device *exec; /* Executive */ -#if BITS_PER_LONG == 64 - spinlock_t context_list_lock; /* lock for context_list */ - atomic_t context_list_counter; /* needed for unique contexts */ - struct list_head context_list; /* list of context id's - and pointers */ -#endif - spinlock_t lock; /* lock for controller - configuration */ - void *driver_data[I2O_MAX_DRIVERS]; /* storage for drivers */ -}; - -/* - * I2O System table entry - * - * The system table contains information about all the IOPs in the - * system. It is sent to all IOPs so that they can create peer2peer - * connections between them. - */ -struct i2o_sys_tbl_entry { - u16 org_id; - u16 reserved1; - u32 iop_id:12; - u32 reserved2:20; - u16 seg_num:12; - u16 i2o_version:4; - u8 iop_state; - u8 msg_type; - u16 frame_size; - u16 reserved3; - u32 last_changed; - u32 iop_capabilities; - u32 inbound_low; - u32 inbound_high; -}; - -struct i2o_sys_tbl { - u8 num_entries; - u8 version; - u16 reserved1; - u32 change_ind; - u32 reserved2; - u32 reserved3; - struct i2o_sys_tbl_entry iops[0]; -}; - -extern struct list_head i2o_controllers; - -/* Message functions */ -extern struct i2o_message *i2o_msg_get_wait(struct i2o_controller *, int); -extern int i2o_msg_post_wait_mem(struct i2o_controller *, struct i2o_message *, - unsigned long, struct i2o_dma *); - -/* IOP functions */ -extern int i2o_status_get(struct i2o_controller *); - -extern int i2o_event_register(struct i2o_device *, struct i2o_driver *, int, - u32); -extern struct i2o_device *i2o_iop_find_device(struct i2o_controller *, u16); -extern struct i2o_controller *i2o_find_iop(int); - -/* Functions needed for handling 64-bit pointers in 32-bit context */ -#if BITS_PER_LONG == 64 -extern u32 i2o_cntxt_list_add(struct i2o_controller *, void *); -extern void *i2o_cntxt_list_get(struct i2o_controller *, u32); -extern u32 i2o_cntxt_list_remove(struct i2o_controller *, void *); -extern u32 i2o_cntxt_list_get_ptr(struct i2o_controller *, void *); - -static inline u32 i2o_ptr_low(void *ptr) -{ - return (u32) (u64) ptr; -}; - -static inline u32 i2o_ptr_high(void *ptr) -{ - return (u32) ((u64) ptr >> 32); -}; - -static inline u32 i2o_dma_low(dma_addr_t dma_addr) -{ - return (u32) (u64) dma_addr; -}; - -static inline u32 i2o_dma_high(dma_addr_t dma_addr) -{ - return (u32) ((u64) dma_addr >> 32); -}; -#else -static inline u32 i2o_cntxt_list_add(struct i2o_controller *c, void *ptr) -{ - return (u32) ptr; -}; - -static inline void *i2o_cntxt_list_get(struct i2o_controller *c, u32 context) -{ - return (void *)context; -}; - -static inline u32 i2o_cntxt_list_remove(struct i2o_controller *c, void *ptr) -{ - return (u32) ptr; -}; - -static inline u32 i2o_cntxt_list_get_ptr(struct i2o_controller *c, void *ptr) -{ - return (u32) ptr; -}; - -static inline u32 i2o_ptr_low(void *ptr) -{ - return (u32) ptr; -}; - -static inline 
u32 i2o_ptr_high(void *ptr) -{ - return 0; -}; - -static inline u32 i2o_dma_low(dma_addr_t dma_addr) -{ - return (u32) dma_addr; -}; - -static inline u32 i2o_dma_high(dma_addr_t dma_addr) -{ - return 0; -}; -#endif - -extern u16 i2o_sg_tablesize(struct i2o_controller *c, u16 body_size); -extern dma_addr_t i2o_dma_map_single(struct i2o_controller *c, void *ptr, - size_t size, - enum dma_data_direction direction, - u32 ** sg_ptr); -extern int i2o_dma_map_sg(struct i2o_controller *c, - struct scatterlist *sg, int sg_count, - enum dma_data_direction direction, - u32 ** sg_ptr); -extern int i2o_dma_alloc(struct device *dev, struct i2o_dma *addr, size_t len); -extern void i2o_dma_free(struct device *dev, struct i2o_dma *addr); -extern int i2o_dma_realloc(struct device *dev, struct i2o_dma *addr, - size_t len); -extern int i2o_pool_alloc(struct i2o_pool *pool, const char *name, - size_t size, int min_nr); -extern void i2o_pool_free(struct i2o_pool *pool); -/* I2O driver (OSM) functions */ -extern int i2o_driver_register(struct i2o_driver *); -extern void i2o_driver_unregister(struct i2o_driver *); - -/** - * i2o_driver_notify_controller_add - Send notification of added controller - * @drv: I2O driver - * @c: I2O controller - * - * Send notification of added controller to a single registered driver. - */ -static inline void i2o_driver_notify_controller_add(struct i2o_driver *drv, - struct i2o_controller *c) -{ - if (drv->notify_controller_add) - drv->notify_controller_add(c); -}; - -/** - * i2o_driver_notify_controller_remove - Send notification of removed controller - * @drv: I2O driver - * @c: I2O controller - * - * Send notification of removed controller to a single registered driver. - */ -static inline void i2o_driver_notify_controller_remove(struct i2o_driver *drv, - struct i2o_controller *c) -{ - if (drv->notify_controller_remove) - drv->notify_controller_remove(c); -}; - -/** - * i2o_driver_notify_device_add - Send notification of added device - * @drv: I2O driver - * @i2o_dev: the added i2o_device - * - * Send notification of added device to a single registered driver. - */ -static inline void i2o_driver_notify_device_add(struct i2o_driver *drv, - struct i2o_device *i2o_dev) -{ - if (drv->notify_device_add) - drv->notify_device_add(i2o_dev); -}; - -/** - * i2o_driver_notify_device_remove - Send notification of removed device - * @drv: I2O driver - * @i2o_dev: the added i2o_device - * - * Send notification of removed device to a single registered driver. 
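The i2o_dma_low()/i2o_dma_high() helpers defined in both branches above let message-building code always emit a 64-bit bus address as two u32 words, with the high word degenerating to 0 on 32-bit builds. A worked example with an assumed address, mirroring the body[2]/body[3] usage in i2o_status_get():

```c
/* Assumed 64-bit build and address; illustrative only. */
dma_addr_t dma_addr = 0x0000001234567890ULL;

u32 lo = i2o_dma_low(dma_addr);		/* 0x34567890 */
u32 hi = i2o_dma_high(dma_addr);	/* 0x00000012 */

msg->body[2] = cpu_to_le32(lo);		/* low word first */
msg->body[3] = cpu_to_le32(hi);
```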
- */ -static inline void i2o_driver_notify_device_remove(struct i2o_driver *drv, - struct i2o_device *i2o_dev) -{ - if (drv->notify_device_remove) - drv->notify_device_remove(i2o_dev); -}; - -extern void i2o_driver_notify_controller_add_all(struct i2o_controller *); -extern void i2o_driver_notify_controller_remove_all(struct i2o_controller *); -extern void i2o_driver_notify_device_add_all(struct i2o_device *); -extern void i2o_driver_notify_device_remove_all(struct i2o_device *); - -/* I2O device functions */ -extern int i2o_device_claim(struct i2o_device *); -extern int i2o_device_claim_release(struct i2o_device *); - -/* Exec OSM functions */ -extern int i2o_exec_lct_get(struct i2o_controller *); - -/* device / driver / kobject conversion functions */ -#define to_i2o_driver(drv) container_of(drv,struct i2o_driver, driver) -#define to_i2o_device(dev) container_of(dev, struct i2o_device, device) -#define to_i2o_controller(dev) container_of(dev, struct i2o_controller, device) - -/** - * i2o_out_to_virt - Turn an I2O message to a virtual address - * @c: controller - * @m: message engine value - * - * Turn a receive message from an I2O controller bus address into - * a Linux virtual address. The shared page frame is a linear block - * so we simply have to shift the offset. This function does not - * work for sender side messages as they are ioremap objects - * provided by the I2O controller. - */ -static inline struct i2o_message *i2o_msg_out_to_virt(struct i2o_controller *c, - u32 m) -{ - BUG_ON(m < c->out_queue.phys - || m >= c->out_queue.phys + c->out_queue.len); - - return c->out_queue.virt + (m - c->out_queue.phys); -}; - -/** - * i2o_msg_in_to_virt - Turn an I2O message to a virtual address - * @c: controller - * @m: message engine value - * - * Turn a send message from an I2O controller bus address into - * a Linux virtual address. The shared page frame is a linear block - * so we simply have to shift the offset. This function does not - * work for receive side messages as they are kmalloc objects - * in a different pool. - */ -static inline struct i2o_message __iomem *i2o_msg_in_to_virt(struct - i2o_controller *c, - u32 m) -{ - return c->in_queue.virt + m; -}; - -/** - * i2o_msg_get - obtain an I2O message from the IOP - * @c: I2O controller - * - * This function tries to get a message frame. If no message frame is - * available do not wait until one is available (see also i2o_msg_get_wait). - * The returned pointer to the message frame is not in I/O memory, it is - * allocated from a mempool. But because a MFA is allocated from the - * controller too it is guaranteed that i2o_msg_post() will never fail. - * - * On a success a pointer to the message frame is returned. If the message - * queue is empty -EBUSY is returned and if no memory is available -ENOMEM - * is returned. - */ -static inline struct i2o_message *i2o_msg_get(struct i2o_controller *c) -{ - struct i2o_msg_mfa *mmsg = mempool_alloc(c->in_msg.mempool, GFP_ATOMIC); - if (!mmsg) - return ERR_PTR(-ENOMEM); - - mmsg->mfa = readl(c->in_port); - if (unlikely(mmsg->mfa >= c->in_queue.len)) { - u32 mfa = mmsg->mfa; - - mempool_free(mmsg, c->in_msg.mempool); - - if (mfa == I2O_QUEUE_EMPTY) - return ERR_PTR(-EBUSY); - return ERR_PTR(-EFAULT); - } - - return &mmsg->msg; -}; - -/** - * i2o_msg_post - Post I2O message to I2O controller - * @c: I2O controller to which the message should be send - * @msg: message returned by i2o_msg_get() - * - * Post the message to the I2O controller and return immediately. 
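Taken together, the helpers above impose a strict frame lifecycle: every frame obtained through i2o_msg_get() or i2o_msg_get_wait() must end in exactly one of i2o_msg_post() or i2o_msg_nop(), otherwise both the MFA and the mempool object leak. An illustrative caller, where prepare() is a hypothetical stand-in for filling in the message:

```c
struct i2o_message *msg;

msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
if (IS_ERR(msg))
	return PTR_ERR(msg);

if (prepare(msg) < 0) {		/* prepare() is hypothetical */
	i2o_msg_nop(c, msg);	/* hand the MFA and mempool object back */
	return -EINVAL;
}

i2o_msg_post(c, msg);		/* frame consumed; msg must not be touched */
return 0;
```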
- */ -static inline void i2o_msg_post(struct i2o_controller *c, - struct i2o_message *msg) -{ - struct i2o_msg_mfa *mmsg; - - mmsg = container_of(msg, struct i2o_msg_mfa, msg); - memcpy_toio(i2o_msg_in_to_virt(c, mmsg->mfa), msg, - (le32_to_cpu(msg->u.head[0]) >> 16) << 2); - writel(mmsg->mfa, c->in_port); - mempool_free(mmsg, c->in_msg.mempool); -}; - -/** - * i2o_msg_post_wait - Post and wait a message and wait until return - * @c: controller - * @msg: message to post - * @timeout: time in seconds to wait - * - * This API allows an OSM to post a message and then be told whether or - * not the system received a successful reply. If the message times out - * then the value '-ETIMEDOUT' is returned. - * - * Returns 0 on success or negative error code on failure. - */ -static inline int i2o_msg_post_wait(struct i2o_controller *c, - struct i2o_message *msg, - unsigned long timeout) -{ - return i2o_msg_post_wait_mem(c, msg, timeout, NULL); -}; - -/** - * i2o_msg_nop_mfa - Returns a fetched MFA back to the controller - * @c: I2O controller from which the MFA was fetched - * @mfa: MFA which should be returned - * - * This function must be used for preserved messages, because i2o_msg_nop() - * also returns the allocated memory back to the msg_pool mempool. - */ -static inline void i2o_msg_nop_mfa(struct i2o_controller *c, u32 mfa) -{ - struct i2o_message __iomem *msg; - u32 nop[3] = { - THREE_WORD_MSG_SIZE | SGL_OFFSET_0, - I2O_CMD_UTIL_NOP << 24 | HOST_TID << 12 | ADAPTER_TID, - 0x00000000 - }; - - msg = i2o_msg_in_to_virt(c, mfa); - memcpy_toio(msg, nop, sizeof(nop)); - writel(mfa, c->in_port); -}; - -/** - * i2o_msg_nop - Returns a message which is not used - * @c: I2O controller from which the message was created - * @msg: message which should be returned - * - * If you fetch a message via i2o_msg_get, and can't use it, you must - * return the message with this function. Otherwise the MFA is lost as well - * as the allocated memory from the mempool. - */ -static inline void i2o_msg_nop(struct i2o_controller *c, - struct i2o_message *msg) -{ - struct i2o_msg_mfa *mmsg; - mmsg = container_of(msg, struct i2o_msg_mfa, msg); - - i2o_msg_nop_mfa(c, mmsg->mfa); - mempool_free(mmsg, c->in_msg.mempool); -}; - -/** - * i2o_flush_reply - Flush reply from I2O controller - * @c: I2O controller - * @m: the message identifier - * - * The I2O controller must be informed that the reply message is not needed - * anymore. If you forget to flush the reply, the message frame can't be - * used by the controller anymore and is therefore lost. - */ -static inline void i2o_flush_reply(struct i2o_controller *c, u32 m) -{ - writel(m, c->out_port); -}; - -/* - * Endian handling wrapped into the macro - keeps the core code - * cleaner. - */ - -#define i2o_raw_writel(val, mem) __raw_writel(cpu_to_le32(val), mem) - -extern int i2o_parm_field_get(struct i2o_device *, int, int, void *, int); -extern int i2o_parm_table_get(struct i2o_device *, int, int, int, void *, int, - void *, int); - -/* debugging and troubleshooting/diagnostic helpers. */ -#define osm_printk(level, format, arg...) \ - printk(level "%s: " format, OSM_NAME , ## arg) - -#ifdef DEBUG -#define osm_debug(format, arg...) \ - osm_printk(KERN_DEBUG, format , ## arg) -#else -#define osm_debug(format, arg...) \ - do { } while (0) -#endif - -#define osm_err(format, arg...) \ - osm_printk(KERN_ERR, format , ## arg) -#define osm_info(format, arg...) \ - osm_printk(KERN_INFO, format , ## arg) -#define osm_warn(format, arg...) 
\ - osm_printk(KERN_WARNING, format , ## arg) - -/* debugging functions */ -extern void i2o_report_status(const char *, const char *, struct i2o_message *); -extern void i2o_dump_message(struct i2o_message *); -extern void i2o_dump_hrt(struct i2o_controller *c); -extern void i2o_debug_state(struct i2o_controller *c); - -#endif /* _I2O_H */ -- cgit v1.2.3 From 09aa8ac0f9a76b7d650cf3a27a2cfcb4d3d612fb Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 3 Feb 2015 17:07:35 -0600 Subject: dma: mmp_tdma: Fix build for ARM64 sram_get_gpool is only used for legacy, non-DT MMP/PXA platforms. Provide an empty version in order to build on ARM64. Signed-off-by: Rob Herring Signed-off-by: Vinod Koul --- include/linux/platform_data/dma-mmp_tdma.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/dma-mmp_tdma.h b/include/linux/platform_data/dma-mmp_tdma.h index 66574ea39f97..0c72886030ef 100644 --- a/include/linux/platform_data/dma-mmp_tdma.h +++ b/include/linux/platform_data/dma-mmp_tdma.h @@ -28,6 +28,13 @@ struct sram_platdata { int granularity; }; +#ifdef CONFIG_ARM extern struct gen_pool *sram_get_gpool(char *pool_name); +#else +static inline struct gen_pool *sram_get_gpool(char *pool_name) +{ + return NULL; +} +#endif #endif /* __DMA_MMP_TDMA_H */ -- cgit v1.2.3 From 851c63e3b381fdbf5aca1a797f37d8606b5588d2 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Mon, 5 Jan 2015 15:43:39 +0000 Subject: of: Fix brace position for struct of_device_id definition Currently it is not easy to grep for the definition of struct of_device_id. This is trivially fixed by moving the brace to the right place. Signed-off-by: Daniel Thompson Cc: Grant Likely Cc: Rob Herring Signed-off-by: Rob Herring --- include/linux/mod_devicetable.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 745def862580..bbf85d612be5 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -220,8 +220,7 @@ struct serio_device_id { /* * Struct used for matching a device */ -struct of_device_id -{ +struct of_device_id { char name[32]; char type[32]; char compatible[128]; -- cgit v1.2.3 From 4c946d9c11d173c2ea6b9081b248f8072e6b46f1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 27 Nov 2014 19:52:04 -0500 Subject: vmci: propagate msghdr all way down to __qp_memcpy_to_queue() Switch from passing msg->iov_iter.iov to passing msg itself Signed-off-by: Al Viro --- drivers/misc/vmw_vmci/vmci_queue_pair.c | 16 ++++++++-------- include/linux/vmw_vmci_api.h | 2 +- net/vmw_vsock/vmci_transport.c | 3 +-- 3 files changed, 10 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/misc/vmw_vmci/vmci_queue_pair.c b/drivers/misc/vmw_vmci/vmci_queue_pair.c index 7aaaf51e1596..35f19a683822 100644 --- a/drivers/misc/vmw_vmci/vmci_queue_pair.c +++ b/drivers/misc/vmw_vmci/vmci_queue_pair.c @@ -370,12 +370,12 @@ static int __qp_memcpy_to_queue(struct vmci_queue *queue, to_copy = size - bytes_copied; if (is_iovec) { - struct iovec *iov = (struct iovec *)src; + struct msghdr *msg = (struct msghdr *)src; int err; /* The iovec will track bytes_copied internally. 
*/ - err = memcpy_fromiovec((u8 *)va + page_offset, - iov, to_copy); + err = memcpy_from_msg((u8 *)va + page_offset, + msg, to_copy); if (err != 0) { if (kernel_if->host) kunmap(kernel_if->u.h.page[page_index]); @@ -580,7 +580,7 @@ static int qp_memcpy_from_queue(void *dest, */ static int qp_memcpy_to_queue_iov(struct vmci_queue *queue, u64 queue_offset, - const void *src, + const void *msg, size_t src_offset, size_t size) { @@ -588,7 +588,7 @@ static int qp_memcpy_to_queue_iov(struct vmci_queue *queue, * We ignore src_offset because src is really a struct iovec * and will * maintain offset internally. */ - return __qp_memcpy_to_queue(queue, queue_offset, src, size, true); + return __qp_memcpy_to_queue(queue, queue_offset, msg, size, true); } /* @@ -3223,13 +3223,13 @@ EXPORT_SYMBOL_GPL(vmci_qpair_peek); * of bytes enqueued or < 0 on error. */ ssize_t vmci_qpair_enquev(struct vmci_qp *qpair, - void *iov, + struct msghdr *msg, size_t iov_size, int buf_type) { ssize_t result; - if (!qpair || !iov) + if (!qpair) return VMCI_ERROR_INVALID_ARGS; qp_lock(qpair); @@ -3238,7 +3238,7 @@ ssize_t vmci_qpair_enquev(struct vmci_qp *qpair, result = qp_enqueue_locked(qpair->produce_q, qpair->consume_q, qpair->produce_q_size, - iov, iov_size, + msg, iov_size, qp_memcpy_to_queue_iov); if (result == VMCI_ERROR_QUEUEPAIR_NOT_READY && diff --git a/include/linux/vmw_vmci_api.h b/include/linux/vmw_vmci_api.h index 5691f752ce8f..63df3a2a8ce5 100644 --- a/include/linux/vmw_vmci_api.h +++ b/include/linux/vmw_vmci_api.h @@ -74,7 +74,7 @@ ssize_t vmci_qpair_dequeue(struct vmci_qp *qpair, ssize_t vmci_qpair_peek(struct vmci_qp *qpair, void *buf, size_t buf_size, int mode); ssize_t vmci_qpair_enquev(struct vmci_qp *qpair, - void *iov, size_t iov_size, int mode); + struct msghdr *msg, size_t iov_size, int mode); ssize_t vmci_qpair_dequev(struct vmci_qp *qpair, struct msghdr *msg, size_t iov_size, int mode); ssize_t vmci_qpair_peekv(struct vmci_qp *qpair, struct msghdr *msg, size_t iov_size, diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 02d2e5229240..7f3255084a6c 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -1850,8 +1850,7 @@ static ssize_t vmci_transport_stream_enqueue( struct msghdr *msg, size_t len) { - /* XXX: stripping const */ - return vmci_qpair_enquev(vmci_trans(vsk)->qpair, (struct iovec *)msg->msg_iter.iov, len, 0); + return vmci_qpair_enquev(vmci_trans(vsk)->qpair, msg, len, 0); } static s64 vmci_transport_stream_has_data(struct vsock_sock *vsk) -- cgit v1.2.3 From af2b040e470b470bfc881981db3c796072853eae Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 27 Nov 2014 21:44:24 -0500 Subject: rxrpc: switch rxrpc_send_data() to iov_iter primitives Convert skb_add_data() to iov_iter; this allows us to get rid of the explicit messing with iovec in its only caller - skb_add_data() will keep advancing ->msg_iter for us, so there's no need to simulate that manually.
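For illustration, the shape of the converted copy loop is roughly the following (a minimal sketch with invented names, not code from this patch; the real logic lives in skb_add_data() and rxrpc_send_data() below). copy_from_iter() pulls from the current position of msg->msg_iter and advances it on success, so the caller no longer tracks iovec segments or offsets by hand:

#include <linux/skbuff.h>
#include <linux/uio.h>

static int example_fill_skb(struct sk_buff *skb, struct msghdr *msg,
			    size_t len)
{
	while (len) {
		size_t chunk = min_t(size_t, len, skb_tailroom(skb));

		if (!chunk)
			return -EMSGSIZE;	/* sketch only: no tailroom left */
		/* copies from the iterator's current position and advances it */
		if (copy_from_iter(skb_put(skb, chunk), chunk,
				   &msg->msg_iter) != chunk)
			return -EFAULT;
		len -= chunk;
	}
	return 0;
}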
Signed-off-by: Al Viro --- include/linux/skbuff.h | 11 +++++------ net/rxrpc/ar-output.c | 43 ++++++++++--------------------------------- 2 files changed, 15 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 85ab7d72b54c..9a8bafee1b67 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2484,19 +2484,18 @@ static inline int skb_put_padto(struct sk_buff *skb, unsigned int len) } static inline int skb_add_data(struct sk_buff *skb, - char __user *from, int copy) + struct iov_iter *from, int copy) { const int off = skb->len; if (skb->ip_summed == CHECKSUM_NONE) { - int err = 0; - __wsum csum = csum_and_copy_from_user(from, skb_put(skb, copy), - copy, 0, &err); - if (!err) { + __wsum csum = 0; + if (csum_and_copy_from_iter(skb_put(skb, copy), copy, + &csum, from) == copy) { skb->csum = csum_block_add(skb->csum, csum, off); return 0; } - } else if (!copy_from_user(skb_put(skb, copy), from, copy)) + } else if (copy_from_iter(skb_put(skb, copy), copy, from) == copy) return 0; __skb_trim(skb, off); diff --git a/net/rxrpc/ar-output.c b/net/rxrpc/ar-output.c index e1a9373e5979..963a5b91f3e8 100644 --- a/net/rxrpc/ar-output.c +++ b/net/rxrpc/ar-output.c @@ -529,13 +529,11 @@ static int rxrpc_send_data(struct kiocb *iocb, struct msghdr *msg, size_t len) { struct rxrpc_skb_priv *sp; - unsigned char __user *from; struct sk_buff *skb; - const struct iovec *iov; struct sock *sk = &rx->sk; long timeo; bool more; - int ret, ioc, segment, copied; + int ret, copied; timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); @@ -545,25 +543,17 @@ static int rxrpc_send_data(struct kiocb *iocb, if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) return -EPIPE; - iov = msg->msg_iter.iov; - ioc = msg->msg_iter.nr_segs - 1; - from = iov->iov_base; - segment = iov->iov_len; - iov++; more = msg->msg_flags & MSG_MORE; skb = call->tx_pending; call->tx_pending = NULL; copied = 0; - do { + if (len > iov_iter_count(&msg->msg_iter)) + len = iov_iter_count(&msg->msg_iter); + while (len) { int copy; - if (segment > len) - segment = len; - - _debug("SEGMENT %d @%p", segment, from); - if (!skb) { size_t size, chunk, max, space; @@ -631,13 +621,13 @@ static int rxrpc_send_data(struct kiocb *iocb, /* append next segment of data to the current buffer */ copy = skb_tailroom(skb); ASSERTCMP(copy, >, 0); - if (copy > segment) - copy = segment; + if (copy > len) + copy = len; if (copy > sp->remain) copy = sp->remain; _debug("add"); - ret = skb_add_data(skb, from, copy); + ret = skb_add_data(skb, &msg->msg_iter, copy); _debug("added"); if (ret < 0) goto efault; @@ -646,18 +636,6 @@ static int rxrpc_send_data(struct kiocb *iocb, copied += copy; len -= copy; - segment -= copy; - from += copy; - while (segment == 0 && ioc > 0) { - from = iov->iov_base; - segment = iov->iov_len; - iov++; - ioc--; - } - if (len == 0) { - segment = 0; - ioc = 0; - } /* check for the far side aborting the call or a network error * occurring */ @@ -665,7 +643,7 @@ static int rxrpc_send_data(struct kiocb *iocb, goto call_aborted; /* add the packet to the send queue if it's now full */ - if (sp->remain <= 0 || (segment == 0 && !more)) { + if (sp->remain <= 0 || (!len && !more)) { struct rxrpc_connection *conn = call->conn; uint32_t seq; size_t pad; @@ -711,11 +689,10 @@ static int rxrpc_send_data(struct kiocb *iocb, memcpy(skb->head, &sp->hdr, sizeof(struct rxrpc_header)); - rxrpc_queue_packet(call, skb, segment == 0 && !more); + rxrpc_queue_packet(call, skb, 
!iov_iter_count(&msg->msg_iter) && !more); skb = NULL; } - - } while (segment > 0); + } success: ret = copied; -- cgit v1.2.3 From 21226abb4e9f14d88238964d89b279e461ddc30c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 28 Nov 2014 15:48:29 -0500 Subject: net: switch memcpy_fromiovec()/memcpy_fromiovecend() users to copy_from_iter() That takes care of the majority of ->sendmsg() instances - most of them via memcpy_to_msg() or assorted getfrag() callbacks. One place where we still keep memcpy_fromiovecend() is tipc - there we potentially read the same data over and over; that will be dealt with in a separate patch. Signed-off-by: Al Viro --- include/linux/skbuff.h | 3 +-- include/net/udplite.h | 3 +-- net/ipv4/ip_output.c | 6 ++---- net/ipv4/ping.c | 14 +++++++------- net/ipv4/raw.c | 2 +- net/ipv4/tcp_input.c | 2 +- net/ipv6/raw.c | 2 +- 7 files changed, 14 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9a8bafee1b67..b349c96dc80a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2692,8 +2692,7 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci); static inline int memcpy_from_msg(void *data, struct msghdr *msg, int len) { - /* XXX: stripping const */ - return memcpy_fromiovec(data, (struct iovec *)msg->msg_iter.iov, len); + return copy_from_iter(data, len, &msg->msg_iter) == len ? 0 : -EFAULT; } static inline int memcpy_to_msg(struct msghdr *msg, void *data, int len) diff --git a/include/net/udplite.h b/include/net/udplite.h index ae7c8d1fbcad..80761938b9a7 100644 --- a/include/net/udplite.h +++ b/include/net/udplite.h @@ -20,8 +20,7 @@ static __inline__ int udplite_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) { struct msghdr *msg = from; - /* XXX: stripping const */ - return memcpy_fromiovecend(to, (struct iovec *)msg->msg_iter.iov, offset, len); + return copy_from_iter(to, len, &msg->msg_iter) != len ?
-EFAULT : 0; } /* Designate sk as UDP-Lite socket */ diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index b50861b22b6b..f998bc87ae38 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -755,13 +755,11 @@ ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk struct msghdr *msg = from; if (skb->ip_summed == CHECKSUM_PARTIAL) { - /* XXX: stripping const */ - if (memcpy_fromiovecend(to, (struct iovec *)msg->msg_iter.iov, offset, len) < 0) + if (copy_from_iter(to, len, &msg->msg_iter) != len) return -EFAULT; } else { __wsum csum = 0; - /* XXX: stripping const */ - if (csum_partial_copy_fromiovecend(to, (struct iovec *)msg->msg_iter.iov, offset, len, &csum) < 0) + if (csum_and_copy_from_iter(to, len, &csum, &msg->msg_iter) != len) return -EFAULT; skb->csum = csum_block_add(skb->csum, csum, odd); } diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 9e15ba701401..e9f66e1cda50 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -599,18 +599,18 @@ int ping_getfrag(void *from, char *to, struct pingfakehdr *pfh = (struct pingfakehdr *)from; if (offset == 0) { - if (fraglen < sizeof(struct icmphdr)) + fraglen -= sizeof(struct icmphdr); + if (fraglen < 0) BUG(); - if (csum_partial_copy_fromiovecend(to + sizeof(struct icmphdr), - pfh->msg->msg_iter.iov, 0, fraglen - sizeof(struct icmphdr), - &pfh->wcheck)) + if (csum_and_copy_from_iter(to + sizeof(struct icmphdr), + fraglen, &pfh->wcheck, + &pfh->msg->msg_iter) != fraglen) return -EFAULT; } else if (offset < sizeof(struct icmphdr)) { BUG(); } else { - if (csum_partial_copy_fromiovecend - (to, pfh->msg->msg_iter.iov, offset - sizeof(struct icmphdr), - fraglen, &pfh->wcheck)) + if (csum_and_copy_from_iter(to, fraglen, &pfh->wcheck, + &pfh->msg->msg_iter) != fraglen) return -EFAULT; } diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 2c9d252072a2..f027a708b7e0 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -382,7 +382,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, skb->transport_header = skb->network_header; err = -EFAULT; - if (memcpy_fromiovecend((void *)iph, msg->msg_iter.iov, 0, length)) + if (memcpy_from_msg(iph, msg, length)) goto error_free; iphlen = iph->ihl * 4; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 93c74829cbce..71fb37c70581 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4368,7 +4368,7 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) goto err_free; - if (copy_from_iter(skb_put(skb, size), size, &msg->msg_iter) != size) + if (memcpy_from_msg(skb_put(skb, size), msg, size)) goto err_free; TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 0dbb328fa688..dae7f1a1e464 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -648,7 +648,7 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, skb->ip_summed = CHECKSUM_NONE; skb->transport_header = skb->network_header; - err = memcpy_fromiovecend((void *)iph, msg->msg_iter.iov, 0, length); + err = memcpy_from_msg(iph, msg, length); if (err) goto error_fault; -- cgit v1.2.3 From 31a25fae85956e3a9c778141d29e5e803fb0b124 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 28 Nov 2014 15:53:57 -0500 Subject: net: bury net/core/iovec.c - nothing in there is used anymore Signed-off-by: Al Viro --- include/linux/socket.h | 7 --- net/core/Makefile | 2 +- net/core/iovec.c | 137 ------------------------------------------------- 3 files changed, 1 
insertion(+), 145 deletions(-) delete mode 100644 net/core/iovec.c (limited to 'include/linux') diff --git a/include/linux/socket.h b/include/linux/socket.h index 6e49a14365dc..5c19cba34dce 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -318,13 +318,6 @@ struct ucred { /* IPX options */ #define IPX_TYPE 1 -extern int csum_partial_copy_fromiovecend(unsigned char *kdata, - struct iovec *iov, - int offset, - unsigned int len, __wsum *csump); -extern unsigned long iov_pages(const struct iovec *iov, int offset, - unsigned long nr_segs); - extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *kaddr); extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data); diff --git a/net/core/Makefile b/net/core/Makefile index 235e6c50708d..fec0856dd6c0 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -2,7 +2,7 @@ # Makefile for the Linux networking core. # -obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \ +obj-y := sock.o request_sock.o skbuff.o datagram.o stream.o scm.o \ gen_stats.o gen_estimator.o net_namespace.o secure_seq.o flow_dissector.o obj-$(CONFIG_SYSCTL) += sysctl_net_core.o diff --git a/net/core/iovec.c b/net/core/iovec.c deleted file mode 100644 index dcbe98b3726a..000000000000 --- a/net/core/iovec.c +++ /dev/null @@ -1,137 +0,0 @@ -/* - * iovec manipulation routines. - * - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Fixes: - * Andrew Lunn : Errors in iovec copying. - * Pedro Roque : Added memcpy_fromiovecend and - * csum_..._fromiovecend. - * Andi Kleen : fixed error handling for 2.1 - * Alexey Kuznetsov: 2.1 optimisations - * Andi Kleen : Fix csum*fromiovecend for IPv6. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * And now for the all-in-one: copy and checksum from a user iovec - * directly to a datagram - * Calls to csum_partial but the last must be in 32 bit chunks - * - * ip_build_xmit must ensure that when fragmenting only the last - * call to this function will be unaligned also. - */ -int csum_partial_copy_fromiovecend(unsigned char *kdata, struct iovec *iov, - int offset, unsigned int len, __wsum *csump) -{ - __wsum csum = *csump; - int partial_cnt = 0, err = 0; - - /* Skip over the finished iovecs */ - while (offset >= iov->iov_len) { - offset -= iov->iov_len; - iov++; - } - - while (len > 0) { - u8 __user *base = iov->iov_base + offset; - int copy = min_t(unsigned int, len, iov->iov_len - offset); - - offset = 0; - - /* There is a remnant from previous iov. */ - if (partial_cnt) { - int par_len = 4 - partial_cnt; - - /* iov component is too short ... 
*/ - if (par_len > copy) { - if (copy_from_user(kdata, base, copy)) - goto out_fault; - kdata += copy; - base += copy; - partial_cnt += copy; - len -= copy; - iov++; - if (len) - continue; - *csump = csum_partial(kdata - partial_cnt, - partial_cnt, csum); - goto out; - } - if (copy_from_user(kdata, base, par_len)) - goto out_fault; - csum = csum_partial(kdata - partial_cnt, 4, csum); - kdata += par_len; - base += par_len; - copy -= par_len; - len -= par_len; - partial_cnt = 0; - } - - if (len > copy) { - partial_cnt = copy % 4; - if (partial_cnt) { - copy -= partial_cnt; - if (copy_from_user(kdata + copy, base + copy, - partial_cnt)) - goto out_fault; - } - } - - if (copy) { - csum = csum_and_copy_from_user(base, kdata, copy, - csum, &err); - if (err) - goto out; - } - len -= copy + partial_cnt; - kdata += copy + partial_cnt; - iov++; - } - *csump = csum; -out: - return err; - -out_fault: - err = -EFAULT; - goto out; -} -EXPORT_SYMBOL(csum_partial_copy_fromiovecend); - -unsigned long iov_pages(const struct iovec *iov, int offset, - unsigned long nr_segs) -{ - unsigned long seg, base; - int pages = 0, len, size; - - while (nr_segs && (offset >= iov->iov_len)) { - offset -= iov->iov_len; - ++iov; - --nr_segs; - } - - for (seg = 0; seg < nr_segs; seg++) { - base = (unsigned long)iov[seg].iov_base + offset; - len = iov[seg].iov_len - offset; - size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; - pages += size; - offset = 0; - } - - return pages; -} -EXPORT_SYMBOL(iov_pages); -- cgit v1.2.3 From aad9a1cec7dcd1d45809b64643fce37061b17788 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 10 Dec 2014 14:49:01 -0500 Subject: vhost: switch vhost get_indirect() to iov_iter, kill memcpy_fromiovec() Cc: Michael S. Tsirkin Cc: kvm@vger.kernel.org Cc: virtualization@lists.linux-foundation.org Signed-off-by: Al Viro --- drivers/vhost/vhost.c | 6 ++++-- include/linux/uio.h | 1 - lib/iovec.c | 25 ------------------------- 3 files changed, 4 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index cb807d0ea498..2ee28266fd07 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1125,6 +1125,7 @@ static int get_indirect(struct vhost_virtqueue *vq, struct vring_desc desc; unsigned int i = 0, count, found = 0; u32 len = vhost32_to_cpu(vq, indirect->len); + struct iov_iter from; int ret; /* Sanity check */ @@ -1142,6 +1143,7 @@ static int get_indirect(struct vhost_virtqueue *vq, vq_err(vq, "Translation failure %d in indirect.\n", ret); return ret; } + iov_iter_init(&from, READ, vq->indirect, ret, len); /* We will use the result as an address to read from, so most * architectures only need a compiler barrier here. 
*/ @@ -1164,8 +1166,8 @@ static int get_indirect(struct vhost_virtqueue *vq, i, count); return -EINVAL; } - if (unlikely(memcpy_fromiovec((unsigned char *)&desc, - vq->indirect, sizeof desc))) { + if (unlikely(copy_from_iter(&desc, sizeof(desc), &from) != + sizeof(desc))) { vq_err(vq, "Failed indirect descriptor: idx %d, %zx\n", i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc); return -EINVAL; diff --git a/include/linux/uio.h b/include/linux/uio.h index 1c5e453f7ea9..af3439f4ebf2 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -135,7 +135,6 @@ static inline void iov_iter_reexpand(struct iov_iter *i, size_t count) size_t csum_and_copy_to_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i); size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i); -int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len); int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov, int offset, int len); int memcpy_toiovecend(const struct iovec *v, unsigned char *kdata, diff --git a/lib/iovec.c b/lib/iovec.c index 2d99cb4a5006..4a90875c64ae 100644 --- a/lib/iovec.c +++ b/lib/iovec.c @@ -2,31 +2,6 @@ #include #include -/* - * Copy iovec to kernel. Returns -EFAULT on error. - * - * Note: this modifies the original iovec. - */ - -int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len) -{ - while (len > 0) { - if (iov->iov_len) { - int copy = min_t(unsigned int, len, iov->iov_len); - if (copy_from_user(kdata, iov->iov_base, copy)) - return -EFAULT; - len -= copy; - kdata += copy; - iov->iov_base += copy; - iov->iov_len -= copy; - } - iov++; - } - - return 0; -} -EXPORT_SYMBOL(memcpy_fromiovec); - /* * Copy kernel to iovec. Returns -EFAULT on error. */ -- cgit v1.2.3 From ba7438aed924133df54a60e4cd5499d359bcf2a8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 10 Dec 2014 15:51:28 -0500 Subject: vhost: don't bother copying iovecs in handle_rx(), kill memcpy_toiovecend() Cc: Michael S. Tsirkin Cc: kvm@vger.kernel.org Cc: virtualization@lists.linux-foundation.org Signed-off-by: Al Viro --- drivers/vhost/net.c | 82 +++++++++++++++-------------------------------------- include/linux/uio.h | 3 -- lib/iovec.c | 26 ----------------- 3 files changed, 23 insertions(+), 88 deletions(-) (limited to 'include/linux') diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index d86cc9bb9ea4..e022cc40303d 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -84,10 +84,6 @@ struct vhost_net_ubuf_ref { struct vhost_net_virtqueue { struct vhost_virtqueue vq; - /* hdr is used to store the virtio header. - * Since each iovec has >= 1 byte length, we never need more than - * header length entries to store the header. */ - struct iovec hdr[sizeof(struct virtio_net_hdr_mrg_rxbuf)]; size_t vhost_hlen; size_t sock_hlen; /* vhost zerocopy support fields below: */ @@ -235,44 +231,6 @@ static bool vhost_sock_zcopy(struct socket *sock) sock_flag(sock->sk, SOCK_ZEROCOPY); } -/* Pop first len bytes from iovec. Return number of segments used. */ -static int move_iovec_hdr(struct iovec *from, struct iovec *to, - size_t len, int iov_count) -{ - int seg = 0; - size_t size; - - while (len && seg < iov_count) { - size = min(from->iov_len, len); - to->iov_base = from->iov_base; - to->iov_len = size; - from->iov_len -= size; - from->iov_base += size; - len -= size; - ++from; - ++to; - ++seg; - } - return seg; -} -/* Copy iovec entries for len bytes from iovec. 
*/ -static void copy_iovec_hdr(const struct iovec *from, struct iovec *to, - size_t len, int iovcount) -{ - int seg = 0; - size_t size; - - while (len && seg < iovcount) { - size = min(from->iov_len, len); - to->iov_base = from->iov_base; - to->iov_len = size; - len -= size; - ++from; - ++to; - ++seg; - } -} - /* In case of DMA done not in order in lower device driver for some reason. * upend_idx is used to track end of used idx, done_idx is used to track head * of used idx. Once lower device DMA done contiguously, we will signal KVM @@ -570,9 +528,9 @@ static void handle_rx(struct vhost_net *net) .msg_controllen = 0, .msg_flags = MSG_DONTWAIT, }; - struct virtio_net_hdr_mrg_rxbuf hdr = { - .hdr.flags = 0, - .hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE + struct virtio_net_hdr hdr = { + .flags = 0, + .gso_type = VIRTIO_NET_HDR_GSO_NONE }; size_t total_len = 0; int err, mergeable; @@ -580,6 +538,7 @@ static void handle_rx(struct vhost_net *net) size_t vhost_hlen, sock_hlen; size_t vhost_len, sock_len; struct socket *sock; + struct iov_iter fixup; mutex_lock(&vq->mutex); sock = vq->private_data; @@ -624,14 +583,19 @@ static void handle_rx(struct vhost_net *net) break; } /* We don't need to be notified again. */ - if (unlikely((vhost_hlen))) - /* Skip header. TODO: support TSO. */ - move_iovec_hdr(vq->iov, nvq->hdr, vhost_hlen, in); - else - /* Copy the header for use in VIRTIO_NET_F_MRG_RXBUF: - * needed because recvmsg can modify msg_iov. */ - copy_iovec_hdr(vq->iov, nvq->hdr, sock_hlen, in); - iov_iter_init(&msg.msg_iter, READ, vq->iov, in, sock_len); + iov_iter_init(&msg.msg_iter, READ, vq->iov, in, vhost_len); + fixup = msg.msg_iter; + if (unlikely((vhost_hlen))) { + /* We will supply the header ourselves + * TODO: support TSO. + */ + iov_iter_advance(&msg.msg_iter, vhost_hlen); + } else { + /* It'll come from socket; we'll need to patch + * ->num_buffers over if VIRTIO_NET_F_MRG_RXBUF + */ + iov_iter_advance(&fixup, sizeof(hdr)); + } err = sock->ops->recvmsg(NULL, sock, &msg, sock_len, MSG_DONTWAIT | MSG_TRUNC); /* Userspace might have consumed the packet meanwhile: @@ -643,18 +607,18 @@ static void handle_rx(struct vhost_net *net) vhost_discard_vq_desc(vq, headcount); continue; } + /* Supply virtio_net_hdr if VHOST_NET_F_VIRTIO_NET_HDR */ if (unlikely(vhost_hlen) && - memcpy_toiovecend(nvq->hdr, (unsigned char *)&hdr, 0, - vhost_hlen)) { + copy_to_iter(&hdr, sizeof(hdr), &fixup) != sizeof(hdr)) { vq_err(vq, "Unable to write vnet_hdr at addr %p\n", vq->iov->iov_base); break; } - /* TODO: Should check and handle checksum. */ + /* Supply (or replace) ->num_buffers if VIRTIO_NET_F_MRG_RXBUF + * TODO: Should check and handle checksum. 
+ */ if (likely(mergeable) && - memcpy_toiovecend(nvq->hdr, (unsigned char *)&headcount, - offsetof(typeof(hdr), num_buffers), - sizeof hdr.num_buffers)) { + copy_to_iter(&headcount, 2, &fixup) != 2) { vq_err(vq, "Failed num_buffers write"); vhost_discard_vq_desc(vq, headcount); break; diff --git a/include/linux/uio.h b/include/linux/uio.h index af3439f4ebf2..02bd8a92038a 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -137,7 +137,4 @@ size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct io int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov, int offset, int len); -int memcpy_toiovecend(const struct iovec *v, unsigned char *kdata, - int offset, int len); - #endif diff --git a/lib/iovec.c b/lib/iovec.c index 4a90875c64ae..d8f17a9b1ccf 100644 --- a/lib/iovec.c +++ b/lib/iovec.c @@ -2,32 +2,6 @@ #include #include -/* - * Copy kernel to iovec. Returns -EFAULT on error. - */ - -int memcpy_toiovecend(const struct iovec *iov, unsigned char *kdata, - int offset, int len) -{ - int copy; - for (; len > 0; ++iov) { - /* Skip over the finished iovecs */ - if (unlikely(offset >= iov->iov_len)) { - offset -= iov->iov_len; - continue; - } - copy = min_t(unsigned int, iov->iov_len - offset, len); - if (copy_to_user(iov->iov_base + offset, kdata, copy)) - return -EFAULT; - offset = 0; - kdata += copy; - len -= copy; - } - - return 0; -} -EXPORT_SYMBOL(memcpy_toiovecend); - /* * Copy iovec to kernel. Returns -EFAULT on error. */ -- cgit v1.2.3 From 57dd8a0735aabff4862025cf64ad94da3d80e620 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 10 Dec 2014 16:03:43 -0500 Subject: vhost: vhost_scsi_handle_vq() should just use copy_from_user() it has just verified that it asks no more than the length of the first segment of iovec. And with that the last user of stuff in lib/iovec.c is gone. RIP. Cc: Michael S. Tsirkin Cc: Nicholas A. 
Bellinger Cc: kvm@vger.kernel.org Cc: virtualization@lists.linux-foundation.org Signed-off-by: Al Viro --- drivers/vhost/scsi.c | 2 +- include/linux/uio.h | 2 -- lib/Makefile | 2 +- lib/iovec.c | 36 ------------------------------------ 4 files changed, 2 insertions(+), 40 deletions(-) delete mode 100644 lib/iovec.c (limited to 'include/linux') diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index d695b1673ae5..dc78d87e0fc2 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -1079,7 +1079,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) req_size, vq->iov[0].iov_len); break; } - ret = memcpy_fromiovecend(req, &vq->iov[0], 0, req_size); + ret = copy_from_user(req, vq->iov[0].iov_base, req_size); if (unlikely(ret)) { vq_err(vq, "Faulted on virtio_scsi_cmd_req\n"); break; diff --git a/include/linux/uio.h b/include/linux/uio.h index 02bd8a92038a..3e0cb4ea3905 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -135,6 +135,4 @@ static inline void iov_iter_reexpand(struct iov_iter *i, size_t count) size_t csum_and_copy_to_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i); size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i); -int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov, - int offset, int len); #endif diff --git a/lib/Makefile b/lib/Makefile index 3c3b30b9e020..1071d06398c3 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -24,7 +24,7 @@ obj-y += lockref.o obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \ bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \ - gcd.o lcm.o list_sort.o uuid.o flex_array.o iovec.o clz_ctz.o \ + gcd.o lcm.o list_sort.o uuid.o flex_array.o clz_ctz.o \ bsearch.o find_last_bit.o find_next_bit.o llist.o memweight.o kfifo.o \ percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o obj-y += string_helpers.o diff --git a/lib/iovec.c b/lib/iovec.c deleted file mode 100644 index d8f17a9b1ccf..000000000000 --- a/lib/iovec.c +++ /dev/null @@ -1,36 +0,0 @@ -#include -#include -#include - -/* - * Copy iovec to kernel. Returns -EFAULT on error. - */ - -int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov, - int offset, int len) -{ - /* No data? Done! */ - if (len == 0) - return 0; - - /* Skip over the finished iovecs */ - while (offset >= iov->iov_len) { - offset -= iov->iov_len; - iov++; - } - - while (len > 0) { - u8 __user *base = iov->iov_base + offset; - int copy = min_t(unsigned int, len, iov->iov_len - offset); - - offset = 0; - if (copy_from_user(kdata, base, copy)) - return -EFAULT; - len -= copy; - kdata += copy; - iov++; - } - - return 0; -} -EXPORT_SYMBOL(memcpy_fromiovecend); -- cgit v1.2.3 From 44fc0e5eec00db5fba748803c95920098089c4cc Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 30 Jan 2015 13:14:36 +0200 Subject: sched/wait: Introduce wait_on_bit_timeout() Add a new wait_on_bit_timeout() helper, basically the same as wait_on_bit() except that it also takes a 'timeout' parameter. All the building blocks like bit_wait_timeout() and out_of_line_wait_on_bit_timeout() are already in place so the addition is rather simple. 
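As a usage illustration (hypothetical driver and bit name, not part of this patch), a caller that must give up after one second might look like:

#include <linux/wait.h>
#include <linux/sched.h>

#define EXAMPLE_BUSY	0	/* hypothetical bit number */

static int example_wait_idle(unsigned long *flags)
{
	/* returns 0 if the bit cleared within HZ jiffies */
	if (wait_on_bit_timeout(flags, EXAMPLE_BUSY,
				TASK_UNINTERRUPTIBLE, HZ))
		return -ETIMEDOUT;	/* error mapping chosen for this sketch */
	return 0;
}

With TASK_UNINTERRUPTIBLE a non-zero return can only mean the timeout elapsed; with an interruptible mode it may also mean a signal arrived, as the kernel-doc below spells out.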
Signed-off-by: Johan Hedberg Signed-off-by: Peter Zijlstra (Intel) Cc: davem@davemloft.net Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1422616476-2917-2-git-send-email-johan.hedberg@gmail.com Signed-off-by: Ingo Molnar --- include/linux/wait.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index 37423e0e1379..537d58eea8a0 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -989,6 +989,32 @@ wait_on_bit_io(void *word, int bit, unsigned mode) mode); } +/** + * wait_on_bit_timeout - wait for a bit to be cleared or a timeout elapses + * @word: the word being waited on, a kernel virtual address + * @bit: the bit of the word being waited on + * @mode: the task state to sleep in + * @timeout: timeout, in jiffies + * + * Use the standard hashed waitqueue table to wait for a bit + * to be cleared. This is similar to wait_on_bit(), except also takes a + * timeout parameter. + * + * Returned value will be zero if the bit was cleared before the + * @timeout elapsed, or non-zero if the @timeout elapsed or process + * received a signal and the mode permitted wakeup on that signal. + */ +static inline int +wait_on_bit_timeout(void *word, int bit, unsigned mode, unsigned long timeout) +{ + might_sleep(); + if (!test_bit(bit, word)) + return 0; + return out_of_line_wait_on_bit_timeout(word, bit, + bit_wait_timeout, + mode, timeout); +} + /** * wait_on_bit_action - wait for a bit to be cleared * @word: the word being waited on, a kernel virtual address -- cgit v1.2.3 From 2fde4f94e0a9531251e706fa57131b51b0df042e Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 7 Jan 2015 15:01:54 +0000 Subject: perf: Decouple unthrottling and rotating Currently the adjustments made as part of perf_event_task_tick() use the percpu rotation lists to iterate over any active PMU contexts, but these are not used by the context rotation code, having been replaced by separate (per-context) hrtimer callbacks. However, some manipulation of the rotation lists (i.e. removal of contexts) has remained in perf_rotate_context(). This leads to the following issues: * Contexts are not always removed from the rotation lists. Removal of PMUs which have been placed in rotation lists, but have not been removed by a hrtimer callback can result in corruption of the rotation lists (when memory backing the context is freed). This has been observed to result in hangs when PMU drivers built as modules are inserted and removed around the creation of events for said PMUs. * Contexts which do not require rotation may be removed from the rotation lists as a result of a hrtimer, and will not be considered by the unthrottling code in perf_event_task_tick. This patch fixes the issue by updating the rotation list when events are scheduled in/out, ensuring that each rotation list stays in sync with the HW state. As each event holds a refcount on the module of its PMU, this ensures that when a PMU module is unloaded none of its CPU contexts can be in a rotation list. By maintaining a list of perf_event_contexts rather than perf_event_cpu_contexts, we don't need separate paths to handle the cpu and task contexts, which also makes the code a little simpler. As the rotation_list variables are not used for rotation, these are renamed to active_ctx_list, which better matches their current function. perf_pmu_rotate_{start,stop} are renamed to perf_pmu_ctx_{activate,deactivate}.
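The invariant the patch establishes can be condensed to two transitions (abridged from the event_sched_in()/event_sched_out() hunks below): a context sits on the per-cpu active_ctx_list exactly while ctx->nr_active is non-zero, so a context whose events are all gone (for instance because its PMU module was unloaded) can never linger on the list:

	/* event_sched_in(): 0 -> 1 puts the context on the list */
	if (!ctx->nr_active++)
		perf_event_ctx_activate(ctx);

	/* event_sched_out(): 1 -> 0 takes it back off */
	if (!--ctx->nr_active)
		perf_event_ctx_deactivate(ctx);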
Reported-by: Johannes Jensen Signed-off-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Cc: Will Deacon Cc: Arnaldo Carvalho de Melo Cc: Fengguang Wu Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20150129134511.GR17721@leverpostej Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 2 +- kernel/events/core.c | 81 +++++++++++++++++----------------------------- 2 files changed, 30 insertions(+), 53 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 216653466a67..5cad0e6f3552 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -469,6 +469,7 @@ struct perf_event_context { */ struct mutex mutex; + struct list_head active_ctx_list; struct list_head pinned_groups; struct list_head flexible_groups; struct list_head event_list; @@ -519,7 +520,6 @@ struct perf_cpu_context { int exclusive; struct hrtimer hrtimer; ktime_t hrtimer_interval; - struct list_head rotation_list; struct pmu *unique_pmu; struct perf_cgroup *cgrp; }; diff --git a/kernel/events/core.c b/kernel/events/core.c index 37cc20e8aa3b..7f2fbb8b5069 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -872,22 +872,32 @@ void perf_pmu_enable(struct pmu *pmu) pmu->pmu_enable(pmu); } -static DEFINE_PER_CPU(struct list_head, rotation_list); +static DEFINE_PER_CPU(struct list_head, active_ctx_list); /* - * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized - * because they're strictly cpu affine and rotate_start is called with IRQs - * disabled, while rotate_context is called from IRQ context. + * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and + * perf_event_task_tick() are fully serialized because they're strictly cpu + * affine and perf_event_ctx{activate,deactivate} are called with IRQs + * disabled, while perf_event_task_tick is called from IRQ context. 
*/ -static void perf_pmu_rotate_start(struct pmu *pmu) +static void perf_event_ctx_activate(struct perf_event_context *ctx) { - struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - struct list_head *head = this_cpu_ptr(&rotation_list); + struct list_head *head = this_cpu_ptr(&active_ctx_list); WARN_ON(!irqs_disabled()); - if (list_empty(&cpuctx->rotation_list)) - list_add(&cpuctx->rotation_list, head); + WARN_ON(!list_empty(&ctx->active_ctx_list)); + + list_add(&ctx->active_ctx_list, head); +} + +static void perf_event_ctx_deactivate(struct perf_event_context *ctx) +{ + WARN_ON(!irqs_disabled()); + + WARN_ON(list_empty(&ctx->active_ctx_list)); + + list_del_init(&ctx->active_ctx_list); } static void get_ctx(struct perf_event_context *ctx) @@ -1233,8 +1243,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) ctx->nr_branch_stack++; list_add_rcu(&event->event_entry, &ctx->event_list); - if (!ctx->nr_events) - perf_pmu_rotate_start(ctx->pmu); ctx->nr_events++; if (event->attr.inherit_stat) ctx->nr_stat++; @@ -1561,7 +1569,8 @@ event_sched_out(struct perf_event *event, if (!is_software_event(event)) cpuctx->active_oncpu--; - ctx->nr_active--; + if (!--ctx->nr_active) + perf_event_ctx_deactivate(ctx); if (event->attr.freq && event->attr.sample_freq) ctx->nr_freq--; if (event->attr.exclusive || !cpuctx->active_oncpu) @@ -1885,7 +1894,8 @@ event_sched_in(struct perf_event *event, if (!is_software_event(event)) cpuctx->active_oncpu++; - ctx->nr_active++; + if (!ctx->nr_active++) + perf_event_ctx_activate(ctx); if (event->attr.freq && event->attr.sample_freq) ctx->nr_freq++; @@ -2742,12 +2752,6 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, perf_pmu_enable(ctx->pmu); perf_ctx_unlock(cpuctx, ctx); - - /* - * Since these rotations are per-cpu, we need to ensure the - * cpu-context we got scheduled on is actually rotating. - */ - perf_pmu_rotate_start(ctx->pmu); } /* @@ -3035,25 +3039,18 @@ static void rotate_ctx(struct perf_event_context *ctx) list_rotate_left(&ctx->flexible_groups); } -/* - * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized - * because they're strictly cpu affine and rotate_start is called with IRQs - * disabled, while rotate_context is called from IRQ context. 
- */ static int perf_rotate_context(struct perf_cpu_context *cpuctx) { struct perf_event_context *ctx = NULL; - int rotate = 0, remove = 1; + int rotate = 0; if (cpuctx->ctx.nr_events) { - remove = 0; if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) rotate = 1; } ctx = cpuctx->task_ctx; if (ctx && ctx->nr_events) { - remove = 0; if (ctx->nr_events != ctx->nr_active) rotate = 1; } @@ -3077,8 +3074,6 @@ static int perf_rotate_context(struct perf_cpu_context *cpuctx) perf_pmu_enable(cpuctx->ctx.pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); done: - if (remove) - list_del_init(&cpuctx->rotation_list); return rotate; } @@ -3096,9 +3091,8 @@ bool perf_event_can_stop_tick(void) void perf_event_task_tick(void) { - struct list_head *head = this_cpu_ptr(&rotation_list); - struct perf_cpu_context *cpuctx, *tmp; - struct perf_event_context *ctx; + struct list_head *head = this_cpu_ptr(&active_ctx_list); + struct perf_event_context *ctx, *tmp; int throttled; WARN_ON(!irqs_disabled()); @@ -3106,14 +3100,8 @@ void perf_event_task_tick(void) __this_cpu_inc(perf_throttled_seq); throttled = __this_cpu_xchg(perf_throttled_count, 0); - list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) { - ctx = &cpuctx->ctx; + list_for_each_entry_safe(ctx, tmp, head, active_ctx_list) perf_adjust_freq_unthr_context(ctx, throttled); - - ctx = cpuctx->task_ctx; - if (ctx) - perf_adjust_freq_unthr_context(ctx, throttled); - } } static int event_enable_on_exec(struct perf_event *event, @@ -3272,6 +3260,7 @@ static void __perf_event_init_context(struct perf_event_context *ctx) { raw_spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); + INIT_LIST_HEAD(&ctx->active_ctx_list); INIT_LIST_HEAD(&ctx->pinned_groups); INIT_LIST_HEAD(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); @@ -6954,7 +6943,6 @@ skip_type: __perf_cpu_hrtimer_init(cpuctx, cpu); - INIT_LIST_HEAD(&cpuctx->rotation_list); cpuctx->unique_pmu = pmu; } @@ -8384,7 +8372,7 @@ static void __init perf_event_init_all_cpus(void) for_each_possible_cpu(cpu) { swhash = &per_cpu(swevent_htable, cpu); mutex_init(&swhash->hlist_mutex); - INIT_LIST_HEAD(&per_cpu(rotation_list, cpu)); + INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu)); } } @@ -8405,22 +8393,11 @@ static void perf_event_init_cpu(int cpu) } #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC -static void perf_pmu_rotate_stop(struct pmu *pmu) -{ - struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - - WARN_ON(!irqs_disabled()); - - list_del_init(&cpuctx->rotation_list); -} - static void __perf_event_exit_context(void *__info) { struct remove_event re = { .detach_group = true }; struct perf_event_context *ctx = __info; - perf_pmu_rotate_stop(ctx->pmu); - rcu_read_lock(); list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry) __perf_remove_from_context(&re); -- cgit v1.2.3 From 12cf89b550d13eb7cb86ef182bd6c04345a33a1f Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 3 Feb 2015 16:45:18 -0600 Subject: livepatch: rename config to CONFIG_LIVEPATCH Rename CONFIG_LIVE_PATCHING to CONFIG_LIVEPATCH to make the naming of the config and the code more consistent. 
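One practical consequence (hypothetical out-of-tree snippet, not from this patch) is that any code testing the old symbol must follow the rename:

#include <linux/kconfig.h>

#if IS_ENABLED(CONFIG_LIVEPATCH)	/* formerly CONFIG_LIVE_PATCHING */
/* live-patching-specific code */
#endif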
Signed-off-by: Josh Poimboeuf Reviewed-by: Jingoo Han Signed-off-by: Jiri Kosina --- arch/x86/Kconfig | 2 +- arch/x86/include/asm/livepatch.h | 4 ++-- arch/x86/kernel/Makefile | 2 +- include/linux/livepatch.h | 4 ++-- kernel/livepatch/Kconfig | 6 +++--- kernel/livepatch/Makefile | 2 +- samples/Kconfig | 4 ++-- samples/livepatch/Makefile | 2 +- 8 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 29b095231276..11970b076862 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -17,7 +17,7 @@ config X86_64 depends on 64BIT select X86_DEV_DMA_OPS select ARCH_USE_CMPXCHG_LOCKREF - select HAVE_LIVE_PATCHING + select HAVE_LIVEPATCH ### Arch settings config X86 diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h index 26e58134c8cb..a455a53d789a 100644 --- a/arch/x86/include/asm/livepatch.h +++ b/arch/x86/include/asm/livepatch.h @@ -24,7 +24,7 @@ #include #include -#ifdef CONFIG_LIVE_PATCHING +#ifdef CONFIG_LIVEPATCH static inline int klp_check_compiler_support(void) { #ifndef CC_USING_FENTRY @@ -40,7 +40,7 @@ static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip) regs->ip = ip; } #else -#error Live patching support is disabled; check CONFIG_LIVE_PATCHING +#error Live patching support is disabled; check CONFIG_LIVEPATCH #endif #endif /* _ASM_X86_LIVEPATCH_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 316b34e74c15..732223496968 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -63,7 +63,7 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-y += apic/ obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o -obj-$(CONFIG_LIVE_PATCHING) += livepatch.o +obj-$(CONFIG_LIVEPATCH) += livepatch.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_X86_TSC) += trace_clock.o diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h index f14c6fb262b4..95023fd8b00d 100644 --- a/include/linux/livepatch.h +++ b/include/linux/livepatch.h @@ -24,7 +24,7 @@ #include #include -#if IS_ENABLED(CONFIG_LIVE_PATCHING) +#if IS_ENABLED(CONFIG_LIVEPATCH) #include @@ -128,6 +128,6 @@ extern int klp_unregister_patch(struct klp_patch *); extern int klp_enable_patch(struct klp_patch *); extern int klp_disable_patch(struct klp_patch *); -#endif /* CONFIG_LIVE_PATCHING */ +#endif /* CONFIG_LIVEPATCH */ #endif /* _LINUX_LIVEPATCH_H_ */ diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig index 347ee2221137..045022557936 100644 --- a/kernel/livepatch/Kconfig +++ b/kernel/livepatch/Kconfig @@ -1,15 +1,15 @@ -config HAVE_LIVE_PATCHING +config HAVE_LIVEPATCH bool help Arch supports kernel live patching -config LIVE_PATCHING +config LIVEPATCH bool "Kernel Live Patching" depends on DYNAMIC_FTRACE_WITH_REGS depends on MODULES depends on SYSFS depends on KALLSYMS_ALL - depends on HAVE_LIVE_PATCHING + depends on HAVE_LIVEPATCH help Say Y here if you want to support kernel live patching. 
This option has no runtime impact until a kernel "patch" diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile index 7c1f00861428..e8780c0901d9 100644 --- a/kernel/livepatch/Makefile +++ b/kernel/livepatch/Makefile @@ -1,3 +1,3 @@ -obj-$(CONFIG_LIVE_PATCHING) += livepatch.o +obj-$(CONFIG_LIVEPATCH) += livepatch.o livepatch-objs := core.o diff --git a/samples/Kconfig b/samples/Kconfig index 0aed20df5f0b..224ebb46bed5 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -63,9 +63,9 @@ config SAMPLE_RPMSG_CLIENT to communicate with an AMP-configured remote processor over the rpmsg bus. -config SAMPLE_LIVE_PATCHING +config SAMPLE_LIVEPATCH tristate "Build live patching sample -- loadable modules only" - depends on LIVE_PATCHING && m + depends on LIVEPATCH && m help Builds a sample live patch that replaces the procfs handler for /proc/cmdline to print "this has been live patched". diff --git a/samples/livepatch/Makefile b/samples/livepatch/Makefile index 7f1cdc131a02..10319d7ea0b1 100644 --- a/samples/livepatch/Makefile +++ b/samples/livepatch/Makefile @@ -1 +1 @@ -obj-$(CONFIG_SAMPLE_LIVE_PATCHING) += livepatch-sample.o +obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-sample.o -- cgit v1.2.3 From 1e0fb9ec679c9273a641f1d6f3d25ea47baef2bb Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 24 Oct 2014 15:58:10 -0700 Subject: perf: Add pmu callbacks to track event mapping and unmapping Signed-off-by: Andy Lutomirski Signed-off-by: Peter Zijlstra (Intel) Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Kees Cook Cc: Andrea Arcangeli Cc: Vince Weaver Cc: "hillf.zj" Cc: Valdis Kletnieks Cc: Linus Torvalds Link: http://lkml.kernel.org/r/266afcba1d1f91ea5501e4e16e94bbbc1a9339b6.1414190806.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 7 +++++++ kernel/events/core.c | 9 +++++++++ 2 files changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 5cad0e6f3552..33262004c310 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -202,6 +202,13 @@ struct pmu { */ int (*event_init) (struct perf_event *event); + /* + * Notification that the event was mapped or unmapped. Called + * in the context of the mapping task. 
+ */ + void (*event_mapped) (struct perf_event *event); /*optional*/ + void (*event_unmapped) (struct perf_event *event); /*optional*/ + #define PERF_EF_START 0x01 /* start the counter when adding */ #define PERF_EF_RELOAD 0x02 /* reload the counter when starting */ #define PERF_EF_UPDATE 0x04 /* update the counter when stopping */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 7f2fbb8b5069..cc1487145d33 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4293,6 +4293,9 @@ static void perf_mmap_open(struct vm_area_struct *vma) atomic_inc(&event->mmap_count); atomic_inc(&event->rb->mmap_count); + + if (event->pmu->event_mapped) + event->pmu->event_mapped(event); } /* @@ -4312,6 +4315,9 @@ static void perf_mmap_close(struct vm_area_struct *vma) int mmap_locked = rb->mmap_locked; unsigned long size = perf_data_size(rb); + if (event->pmu->event_unmapped) + event->pmu->event_unmapped(event); + atomic_dec(&rb->mmap_count); if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) @@ -4513,6 +4519,9 @@ unlock: vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &perf_mmap_vmops; + if (event->pmu->event_mapped) + event->pmu->event_mapped(event); + return ret; } -- cgit v1.2.3 From 509102760da3a21831e763560ba4715760e3fbda Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 4 Feb 2015 11:45:28 +0100 Subject: regulator: Fix build breakage on !REGULATOR Add missing stubs for regulator_suspend_prepare() and regulator_suspend_finish() to fix exynos_defconfig build without REGULATOR: arch/arm/mach-exynos/built-in.o: In function `exynos_suspend_finish': arch/arm/mach-exynos/suspend.c:537: undefined reference to `regulator_suspend_finish' arch/arm/mach-exynos/built-in.o: In function `exynos_suspend_prepare': arch/arm/mach-exynos/suspend.c:520: undefined reference to `regulator_suspend_prepare' make: *** [vmlinux] Error 1 Signed-off-by: Krzysztof Kozlowski Reported-by: Joerg Roedel Reported-by: Marek Szyprowski Signed-off-by: Mark Brown --- include/linux/regulator/machine.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h index 0b08d05d470b..b07562e082c4 100644 --- a/include/linux/regulator/machine.h +++ b/include/linux/regulator/machine.h @@ -191,15 +191,22 @@ struct regulator_init_data { void *driver_data; /* core does not touch this */ }; -int regulator_suspend_prepare(suspend_state_t state); -int regulator_suspend_finish(void); - #ifdef CONFIG_REGULATOR void regulator_has_full_constraints(void); +int regulator_suspend_prepare(suspend_state_t state); +int regulator_suspend_finish(void); #else static inline void regulator_has_full_constraints(void) { } +static inline int regulator_suspend_prepare(suspend_state_t state) +{ + return 0; +} +static inline int regulator_suspend_finish(void) +{ + return 0; +} #endif #endif -- cgit v1.2.3 From 6ae373394c4257bad562817aa60464ff7fe8f9c4 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 30 Jan 2015 14:21:14 -0500 Subject: NFSv4.1: Ask for no delegation on OPEN if using O_DIRECT If we're using NFSv4.1, then we have the ability to let the server know whether or not we believe that returning a delegation as part of our OPEN request would be useful. The feature needs to be used with care, since the client sending the request doesn't necessarily know how other clients are using that file, and how they may be affected by the delegation. 
For this reason, our initial use of the feature will be to let the server know when the client believes that handing out a delegation would not be useful. The first application for this function is when opening the file using O_DIRECT. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 30 +++++++++++++++++++ fs/nfs/nfs4xdr.c | 79 +++++++++++++++++++++++++++++++++---------------- include/linux/nfs_xdr.h | 2 ++ 3 files changed, 85 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 6e1c9b2d92c5..cd4295d84d54 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -940,6 +940,31 @@ static bool nfs4_clear_cap_atomic_open_v1(struct nfs_server *server, return true; } +static u32 +nfs4_map_atomic_open_share(struct nfs_server *server, + fmode_t fmode, int openflags) +{ + u32 res = 0; + + switch (fmode & (FMODE_READ | FMODE_WRITE)) { + case FMODE_READ: + res = NFS4_SHARE_ACCESS_READ; + break; + case FMODE_WRITE: + res = NFS4_SHARE_ACCESS_WRITE; + break; + case FMODE_READ|FMODE_WRITE: + res = NFS4_SHARE_ACCESS_BOTH; + } + if (!(server->caps & NFS_CAP_ATOMIC_OPEN_V1)) + goto out; + /* Want no delegation if we're using O_DIRECT */ + if (openflags & O_DIRECT) + res |= NFS4_SHARE_WANT_NO_DELEG; +out: + return res; +} + static enum open_claim_type4 nfs4_map_atomic_open_claim(struct nfs_server *server, enum open_claim_type4 claim) @@ -1002,6 +1027,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, atomic_inc(&sp->so_count); p->o_arg.open_flags = flags; p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE); + p->o_arg.share_access = nfs4_map_atomic_open_share(server, + fmode, flags); /* don't put an ACCESS op in OPEN compound if O_EXCL, because ACCESS * will return permission denied for all bits until close */ if (!(flags & O_EXCL)) { @@ -2695,6 +2722,9 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) goto out_wait; } } + calldata->arg.share_access = + nfs4_map_atomic_open_share(NFS_SERVER(inode), + calldata->arg.fmode, 0); nfs_fattr_init(calldata->res.fattr); calldata->timestamp = jiffies; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index a2329d69502b..e23a0a664e12 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1351,24 +1351,12 @@ static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struc encode_string(xdr, name->len, name->name); } -static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode) +static void encode_share_access(struct xdr_stream *xdr, u32 share_access) { __be32 *p; p = reserve_space(xdr, 8); - switch (fmode & (FMODE_READ|FMODE_WRITE)) { - case FMODE_READ: - *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_READ); - break; - case FMODE_WRITE: - *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_WRITE); - break; - case FMODE_READ|FMODE_WRITE: - *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_BOTH); - break; - default: - *p++ = cpu_to_be32(0); - } + *p++ = cpu_to_be32(share_access); *p = cpu_to_be32(0); /* for linux, share_deny = 0 always */ } @@ -1380,7 +1368,7 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena * owner 4 = 32 */ encode_nfs4_seqid(xdr, arg->seqid); - encode_share_access(xdr, arg->fmode); + encode_share_access(xdr, arg->share_access); p = reserve_space(xdr, 36); p = xdr_encode_hyper(p, arg->clientid); *p++ = cpu_to_be32(24); @@ -1535,7 +1523,7 @@ static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_close encode_op_hdr(xdr, OP_OPEN_DOWNGRADE, decode_open_downgrade_maxsz, hdr); encode_nfs4_stateid(xdr, 
&arg->stateid); encode_nfs4_seqid(xdr, arg->seqid); - encode_share_access(xdr, arg->fmode); + encode_share_access(xdr, arg->share_access); } static void @@ -4935,20 +4923,13 @@ out_overflow: return -EIO; } -static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) +static int decode_rw_delegation(struct xdr_stream *xdr, + uint32_t delegation_type, + struct nfs_openres *res) { __be32 *p; - uint32_t delegation_type; int status; - p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) - goto out_overflow; - delegation_type = be32_to_cpup(p); - if (delegation_type == NFS4_OPEN_DELEGATE_NONE) { - res->delegation_type = 0; - return 0; - } status = decode_stateid(xdr, &res->delegation); if (unlikely(status)) return status; @@ -4972,6 +4953,52 @@ out_overflow: return -EIO; } +static int decode_no_delegation(struct xdr_stream *xdr, struct nfs_openres *res) +{ + __be32 *p; + uint32_t why_no_delegation; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + why_no_delegation = be32_to_cpup(p); + switch (why_no_delegation) { + case WND4_CONTENTION: + case WND4_RESOURCE: + xdr_inline_decode(xdr, 4); + /* Ignore for now */ + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) +{ + __be32 *p; + uint32_t delegation_type; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + delegation_type = be32_to_cpup(p); + res->delegation_type = 0; + switch (delegation_type) { + case NFS4_OPEN_DELEGATE_NONE: + return 0; + case NFS4_OPEN_DELEGATE_READ: + case NFS4_OPEN_DELEGATE_WRITE: + return decode_rw_delegation(xdr, delegation_type, res); + case NFS4_OPEN_DELEGATE_NONE_EXT: + return decode_no_delegation(xdr, res); + } + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) { __be32 *p; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 81401125ab2d..2c35e2affa6f 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -326,6 +326,7 @@ struct nfs_openargs { struct nfs_seqid * seqid; int open_flags; fmode_t fmode; + u32 share_access; u32 access; __u64 clientid; struct stateowner_id id; @@ -393,6 +394,7 @@ struct nfs_closeargs { nfs4_stateid stateid; struct nfs_seqid * seqid; fmode_t fmode; + u32 share_access; const u32 * bitmask; }; -- cgit v1.2.3 From 2c561246524c3319473bf47b558354f7ff47f0cf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 3 Feb 2015 12:55:31 +0100 Subject: block: Simplify bsg complete all It took me a few tries to figure out what this code did; let's rewrite it into a more regular form. The thing that makes this one 'special' is the BSG_F_BLOCK flag: if that is not set, we're not supposed/allowed to block and should spin-wait for completion. The (new) io_wait_event() will never see a false condition in case of the spinning and we will therefore not block.
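The new helper lets the old open-coded wait loop collapse to a single call; conceptually (an annotated restatement of the call the patch adds below):

	/*
	 * io_wait_event() sleeps in TASK_UNINTERRUPTIBLE and accounts the
	 * sleep as I/O wait via io_schedule(); if the condition is already
	 * true it returns without sleeping, which is what keeps the
	 * spinning (!BSG_F_BLOCK) case from ever blocking.
	 */
	io_wait_event(bd->wq_done, bsg_complete(bd));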
Cc: Linus Torvalds Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Jens Axboe --- block/bsg.c | 72 ++++++++++++++++++---------------------------------- include/linux/wait.h | 15 +++++++++++ 2 files changed, 40 insertions(+), 47 deletions(-) (limited to 'include/linux') diff --git a/block/bsg.c b/block/bsg.c index 276e869e686c..d214e929ce18 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -136,42 +136,6 @@ static inline struct hlist_head *bsg_dev_idx_hash(int index) return &bsg_device_list[index & (BSG_LIST_ARRAY_SIZE - 1)]; } -static int bsg_io_schedule(struct bsg_device *bd) -{ - DEFINE_WAIT(wait); - int ret = 0; - - spin_lock_irq(&bd->lock); - - BUG_ON(bd->done_cmds > bd->queued_cmds); - - /* - * -ENOSPC or -ENODATA? I'm going for -ENODATA, meaning "I have no - * work to do", even though we return -ENOSPC after this same test - * during bsg_write() -- there, it means our buffer can't have more - * bsg_commands added to it, thus has no space left. - */ - if (bd->done_cmds == bd->queued_cmds) { - ret = -ENODATA; - goto unlock; - } - - if (!test_bit(BSG_F_BLOCK, &bd->flags)) { - ret = -EAGAIN; - goto unlock; - } - - prepare_to_wait(&bd->wq_done, &wait, TASK_UNINTERRUPTIBLE); - spin_unlock_irq(&bd->lock); - io_schedule(); - finish_wait(&bd->wq_done, &wait); - - return ret; -unlock: - spin_unlock_irq(&bd->lock); - return ret; -} - static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, struct sg_io_v4 *hdr, struct bsg_device *bd, fmode_t has_write_perm) @@ -482,6 +446,30 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, return ret; } +static bool bsg_complete(struct bsg_device *bd) +{ + bool ret = false; + bool spin; + + do { + spin_lock_irq(&bd->lock); + + BUG_ON(bd->done_cmds > bd->queued_cmds); + + /* + * All commands consumed. + */ + if (bd->done_cmds == bd->queued_cmds) + ret = true; + + spin = !test_bit(BSG_F_BLOCK, &bd->flags); + + spin_unlock_irq(&bd->lock); + } while (!ret && spin); + + return ret; +} + static int bsg_complete_all_commands(struct bsg_device *bd) { struct bsg_command *bc; @@ -492,17 +480,7 @@ static int bsg_complete_all_commands(struct bsg_device *bd) /* * wait for all commands to complete */ - ret = 0; - do { - ret = bsg_io_schedule(bd); - /* - * look for -ENODATA specifically -- we'll sometimes get - * -ERESTARTSYS when we've taken a signal, but we can't - * return until we're done freeing the queue, so ignore - * it. The signal will get handled when we're done freeing - * the bsg_device. 
- */ - } while (ret != -ENODATA); + io_wait_event(bd->wq_done, bsg_complete(bd)); /* * discard done commands diff --git a/include/linux/wait.h b/include/linux/wait.h index 2232ed16635a..71fc1d31e48d 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -267,6 +267,21 @@ do { \ __wait_event(wq, condition); \ } while (0) +#define __io_wait_event(wq, condition) \ + (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ + io_schedule()) + +/* + * io_wait_event() -- like wait_event() but with io_schedule() + */ +#define io_wait_event(wq, condition) \ +do { \ + might_sleep(); \ + if (condition) \ + break; \ + __io_wait_event(wq, condition); \ +} while (0) + #define __wait_event_freezable(wq, condition) \ ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \ schedule(); try_to_freeze()) -- cgit v1.2.3 From c4546f246636ccf4cda092bcfcafcb5f5f752ec7 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Mon, 26 Jan 2015 09:22:22 -0700 Subject: coresight: remove the unnecessary function coresight_is_bit_set() This function coresight_is_bit_set() isn't called, so we should remove it. Signed-off-by: Kaixu Xia Signed-off-by: Mathieu Poirier Signed-off-by: Greg Kroah-Hartman --- include/linux/coresight.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index cd6d4c384ca3..3486b9082adb 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -227,7 +227,6 @@ coresight_register(struct coresight_desc *desc); extern void coresight_unregister(struct coresight_device *csdev); extern int coresight_enable(struct coresight_device *csdev); extern void coresight_disable(struct coresight_device *csdev); -extern int coresight_is_bit_set(u32 val, int position, int value); extern int coresight_timeout(void __iomem *addr, u32 offset, int position, int value); #else @@ -237,8 +236,6 @@ static inline void coresight_unregister(struct coresight_device *csdev) {} static inline int coresight_enable(struct coresight_device *csdev) { return -ENOSYS; } static inline void coresight_disable(struct coresight_device *csdev) {} -static inline int coresight_is_bit_set(u32 val, int position, int value) - { return 0; } static inline int coresight_timeout(void __iomem *addr, u32 offset, int position, int value) { return 1; } #endif -- cgit v1.2.3 From 9064bf3c3cdf92f6b9ac6634ff570dedf0035992 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 3 Feb 2015 17:03:35 -0600 Subject: spi: spi-pxa2xx: only include mach/dma.h for legacy DMA Move the include of mach/dma.h to the legacy PXA DMA code where it is used. This enables building spi-pxa2xx on ARM64. 
Signed-off-by: Rob Herring Signed-off-by: Mark Brown --- drivers/spi/spi-pxa2xx-pxadma.c | 1 + include/linux/spi/pxa2xx_spi.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/spi/spi-pxa2xx-pxadma.c b/drivers/spi/spi-pxa2xx-pxadma.c index 6c82d496b58e..3fecaaa5b0a5 100644 --- a/drivers/spi/spi-pxa2xx-pxadma.c +++ b/drivers/spi/spi-pxa2xx-pxadma.c @@ -25,6 +25,7 @@ #include #include +#include <mach/dma.h> #include "spi-pxa2xx.h" #define DMA_INT_MASK (DCSR_ENDINTR | DCSR_STARTINTR | DCSR_BUSERR) diff --git a/include/linux/spi/pxa2xx_spi.h b/include/linux/spi/pxa2xx_spi.h index d5a316550177..46d8fa942631 100644 --- a/include/linux/spi/pxa2xx_spi.h +++ b/include/linux/spi/pxa2xx_spi.h @@ -57,7 +57,6 @@ struct pxa2xx_spi_chip { #if defined(CONFIG_ARCH_PXA) || defined(CONFIG_ARCH_MMP) #include -#include <mach/dma.h> extern void pxa2xx_set_spi_info(unsigned id, struct pxa2xx_spi_master *info); -- cgit v1.2.3 From 2bd82484bb4c5db1d5dc983ac7c409b2782e0154 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 3 Feb 2015 23:48:24 -0800 Subject: xps: fix xps for stacked devices A typical qdisc setup is the following : bond0 : bonding device, using HTB hierarchy eth1/eth2 : slaves, multiqueue NIC, using MQ + FQ qdisc XPS allows spreading packets across specific tx queues, based on the cpu doing the send. The problem is that dequeues from the bond0 qdisc can happen on random cpus, due to the fact that qdisc_run() can dequeue a batch of packets. CPUA -> queue packet P1 on bond0 qdisc, P1->ooo_okay=1 CPUA -> queue packet P2 on bond0 qdisc, P2->ooo_okay=0 CPUB -> dequeue packet P1 from bond0 enqueue packet on eth1/eth2 CPUC -> dequeue packet P2 from bond0 enqueue packet on eth1/eth2 using sk cache (ooo_okay is 0) get_xps_queue() then might select the wrong queue for P1, since the current cpu might be different from CPUA. P2 might be sent on the old queue (stored in sk->sk_tx_queue_mapping), if CPUC runs a bit faster (or CPUB spins a bit on the qdisc lock). The effect of this bug is TCP reordering and, more generally, suboptimal TX queue placement. (A victim bulk flow can be migrated to the wrong TX queue for a while.) To fix this, we have to record the sender cpu number the first time dev_queue_xmit() is called for one tx skb. We can union napi_id (used on the receive path) and sender_cpu, granted we clear sender_cpu in skb_scrub_packet() (credit to Willem for this union idea) Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Cc: Nandita Dukkipati Cc: Yuchung Cheng Signed-off-by: David S.
Miller --- include/linux/skbuff.h | 7 +++++-- net/core/flow_dissector.c | 7 ++++++- net/core/skbuff.c | 4 ++++ 3 files changed, 15 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 85ab7d72b54c..2748ff639144 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -626,8 +626,11 @@ struct sk_buff { __u32 hash; __be16 vlan_proto; __u16 vlan_tci; -#ifdef CONFIG_NET_RX_BUSY_POLL - unsigned int napi_id; +#if defined(CONFIG_NET_RX_BUSY_POLL) || defined(CONFIG_XPS) + union { + unsigned int napi_id; + unsigned int sender_cpu; + }; #endif #ifdef CONFIG_NETWORK_SECMARK __u32 secmark; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index beb83d1ac1c6..2c35c02a931e 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -422,7 +422,7 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) dev_maps = rcu_dereference(dev->xps_maps); if (dev_maps) { map = rcu_dereference( - dev_maps->cpu_map[raw_smp_processor_id()]); + dev_maps->cpu_map[skb->sender_cpu - 1]); if (map) { if (map->len == 1) queue_index = map->queues[0]; @@ -468,6 +468,11 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev, { int queue_index = 0; +#ifdef CONFIG_XPS + if (skb->sender_cpu == 0) + skb->sender_cpu = raw_smp_processor_id() + 1; +#endif + if (dev->real_num_tx_queues != 1) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_select_queue) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a5bff2767f15..88c613eab142 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -825,6 +825,9 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) #ifdef CONFIG_NET_RX_BUSY_POLL CHECK_SKB_FIELD(napi_id); #endif +#ifdef CONFIG_XPS + CHECK_SKB_FIELD(sender_cpu); +#endif #ifdef CONFIG_NET_SCHED CHECK_SKB_FIELD(tc_index); #ifdef CONFIG_NET_CLS_ACT @@ -4169,6 +4172,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) skb->ignore_df = 0; skb_dst_drop(skb); skb->mark = 0; + skb->sender_cpu = 0; skb_init_secmark(skb); secpath_reset(skb); nf_reset(skb); -- cgit v1.2.3 From dcdc8994697faa789669c3fdaca1a8bc27a8f356 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 2 Feb 2015 16:07:34 -0800 Subject: net: add skb functions to process remote checksum offload This patch adds skb_remcsum_process and skb_gro_remcsum_process to perform the appropriate adjustments to the skb when receiving remote checksum offload. Updated vxlan and gue to use these functions. Tested: Ran TCP_RR and TCP_STREAM netperf for VXLAN and GUE, did not see any change in performance. Signed-off-by: Tom Herbert Signed-off-by: David S. 
Miller --- drivers/net/vxlan.c | 18 ++---------------- include/linux/netdevice.h | 15 +++++++++++++++ include/linux/skbuff.h | 21 +++++++++++++++++++++ net/ipv4/fou.c | 18 ++---------------- 4 files changed, 40 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 31bac2a21ce3..c184717e8b28 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -558,7 +558,6 @@ static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb, u32 data) { size_t start, offset, plen; - __wsum delta; if (skb->remcsum_offload) return vh; @@ -580,12 +579,7 @@ static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb, return NULL; } - delta = remcsum_adjust((void *)vh + hdrlen, - NAPI_GRO_CB(skb)->csum, start, offset); - - /* Adjust skb->csum since we changed the packet */ - skb->csum = csum_add(skb->csum, delta); - NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta); + skb_gro_remcsum_process(skb, (void *)vh + hdrlen, start, offset); skb->remcsum_offload = 1; @@ -1159,7 +1153,6 @@ static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh, size_t hdrlen, u32 data) { size_t start, offset, plen; - __wsum delta; if (skb->remcsum_offload) { /* Already processed in GRO path */ @@ -1179,14 +1172,7 @@ static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh, vh = (struct vxlanhdr *)(udp_hdr(skb) + 1); - if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE)) - __skb_checksum_complete(skb); - - delta = remcsum_adjust((void *)vh + hdrlen, - skb->csum, start, offset); - - /* Adjust skb->csum since we changed the packet */ - skb->csum = csum_add(skb->csum, delta); + skb_remcsum_process(skb, (void *)vh + hdrlen, start, offset); return vh; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 16251e96e6aa..1347ac50d2af 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2318,6 +2318,21 @@ do { \ compute_pseudo(skb, proto)); \ } while (0) +static inline void skb_gro_remcsum_process(struct sk_buff *skb, void *ptr, + int start, int offset) +{ + __wsum delta; + + BUG_ON(!NAPI_GRO_CB(skb)->csum_valid); + + delta = remcsum_adjust(ptr, NAPI_GRO_CB(skb)->csum, start, offset); + + /* Adjust skb->csum since we changed the packet */ + skb->csum = csum_add(skb->csum, delta); + NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta); +} + + static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2748ff639144..5405dfe02572 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3099,6 +3099,27 @@ do { \ compute_pseudo(skb, proto)); \ } while (0) +/* Update skb and packet to reflect the remote checksum offload operation. + * When called, ptr indicates the starting point for skb->csum when + * ip_summed is CHECKSUM_COMPLETE. If we need to create the checksum complete + * here, skb_postpull_rcsum is done so that skb->csum starts at ptr.
+ */ +static inline void skb_remcsum_process(struct sk_buff *skb, void *ptr, + int start, int offset) +{ + __wsum delta; + + if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE)) { + __skb_checksum_complete(skb); + skb_postpull_rcsum(skb, skb->data, ptr - (void *)skb->data); + } + + delta = remcsum_adjust(ptr, skb->csum, start, offset); + + /* Adjust skb->csum since we changed the packet */ + skb->csum = csum_add(skb->csum, delta); +} + #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) void nf_conntrack_destroy(struct nf_conntrack *nfct); static inline void nf_conntrack_put(struct nf_conntrack *nfct) diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 3bc0cf07661c..92ddea1e6457 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -70,7 +70,6 @@ static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr, size_t start = ntohs(pd[0]); size_t offset = ntohs(pd[1]); size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start); - __wsum delta; if (skb->remcsum_offload) { /* Already processed in GRO path */ @@ -82,14 +81,7 @@ static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr, return NULL; guehdr = (struct guehdr *)&udp_hdr(skb)[1]; - if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE)) - __skb_checksum_complete(skb); - - delta = remcsum_adjust((void *)guehdr + hdrlen, - skb->csum, start, offset); - - /* Adjust skb->csum since we changed the packet */ - skb->csum = csum_add(skb->csum, delta); + skb_remcsum_process(skb, (void *)guehdr + hdrlen, start, offset); return guehdr; } @@ -228,7 +220,6 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off, size_t start = ntohs(pd[0]); size_t offset = ntohs(pd[1]); size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start); - __wsum delta; if (skb->remcsum_offload) return guehdr; @@ -243,12 +234,7 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off, return NULL; } - delta = remcsum_adjust((void *)guehdr + hdrlen, - NAPI_GRO_CB(skb)->csum, start, offset); - - /* Adjust skb->csum since we changed the packet */ - skb->csum = csum_add(skb->csum, delta); + skb_gro_remcsum_process(skb, (void *)guehdr + hdrlen, start, offset); skb->remcsum_offload = 1; -- cgit v1.2.3 From 61bd3857ff2c7daf756d49b41e6277bbdaa8f789 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Tue, 3 Feb 2015 16:48:29 +0200 Subject: net/core: Add event for a change in slave state Add an event which provides an indication of a change in the state of a bonding slave. The event handler should cast the pointer to the appropriate type (struct netdev_bonding_info) in order to get the full info about the slave. Signed-off-by: Moni Shoua Signed-off-by: Or Gerlitz Signed-off-by: David S.
Miller --- include/linux/netdevice.h | 15 +++++++++++++++ net/core/dev.c | 20 ++++++++++++++++++++ net/core/rtnetlink.c | 1 + 3 files changed, 36 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1347ac50d2af..ce784d5018e0 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -51,6 +51,7 @@ #include #include #include +#include struct netpoll_info; struct device; @@ -2056,6 +2057,7 @@ struct pcpu_sw_netstats { #define NETDEV_RESEND_IGMP 0x0016 #define NETDEV_PRECHANGEMTU 0x0017 /* notify before mtu change happened */ #define NETDEV_CHANGEINFODATA 0x0018 +#define NETDEV_BONDING_INFO 0x0019 int register_netdevice_notifier(struct notifier_block *nb); int unregister_netdevice_notifier(struct notifier_block *nb); @@ -3494,6 +3496,19 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, netdev_features_t features); +struct netdev_bonding_info { + ifslave slave; + ifbond master; +}; + +struct netdev_notifier_bonding_info { + struct netdev_notifier_info info; /* must be first */ + struct netdev_bonding_info bonding_info; +}; + +void netdev_bonding_info_change(struct net_device *dev, + struct netdev_bonding_info *bonding_info); + static inline struct sk_buff *skb_gso_segment(struct sk_buff *skb, netdev_features_t features) { diff --git a/net/core/dev.c b/net/core/dev.c index 1d564d68e31a..ede0b161b115 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5355,6 +5355,26 @@ void netdev_upper_dev_unlink(struct net_device *dev, } EXPORT_SYMBOL(netdev_upper_dev_unlink); +/** + * netdev_bonding_info_change - Dispatch event about slave change + * @dev: device + * @bonding_info: info to dispatch + * + * Send NETDEV_BONDING_INFO to netdev notifiers with info. + * The caller must hold the RTNL lock. + */ +void netdev_bonding_info_change(struct net_device *dev, + struct netdev_bonding_info *bonding_info) +{ + struct netdev_notifier_bonding_info info; + + memcpy(&info.bonding_info, bonding_info, + sizeof(struct netdev_bonding_info)); + call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev, + &info.info); +} +EXPORT_SYMBOL(netdev_bonding_info_change); + void netdev_adjacent_add_links(struct net_device *dev) { struct netdev_adjacent *iter; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 673cb4c6f391..4cd5e350d129 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3180,6 +3180,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi case NETDEV_UNREGISTER_FINAL: case NETDEV_RELEASE: case NETDEV_JOIN: + case NETDEV_BONDING_INFO: break; default: rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); -- cgit v1.2.3 From 59e14e325066be49b49b6c2503337c69a9ee29fc Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Tue, 3 Feb 2015 16:48:32 +0200 Subject: net/mlx4_core: Port aggregation low level interface Implement the hardware interface required for port aggregation. 1. Disable RX port check on receive - don't perform a validity check that matches the QP's port against the port where the packet is received. 2. Virtual to physical port remap - configure the virtual to physical port mapping (the port remap capability for virtual functions). Signed-off-by: Moni Shoua Signed-off-by: Or Gerlitz Signed-off-by: David S.
Miller --- drivers/net/ethernet/mellanox/mlx4/cmd.c | 9 +++++ drivers/net/ethernet/mellanox/mlx4/fw.c | 56 +++++++++++++++++++++++++++++--- include/linux/mlx4/cmd.h | 7 ++++ include/linux/mlx4/device.h | 10 +++++- include/linux/mlx4/qp.h | 1 + 5 files changed, 77 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c index 154effbfd8be..a681d7c0bb9f 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -1583,6 +1583,15 @@ static struct mlx4_cmd_info cmd_info[] = { .verify = NULL, .wrapper = mlx4_CMD_EPERM_wrapper }, + { + .opcode = MLX4_CMD_VIRT_PORT_MAP, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_CMD_EPERM_wrapper + }, }; static int mlx4_master_process_vhcr(struct mlx4_dev *dev, int slave, diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index dbabfae3a3de..4b08a393ebcb 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -142,7 +142,8 @@ static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags) [17] = "Asymmetric EQs support", [18] = "More than 80 VFs support", [19] = "Performance optimized for limited rule configuration flow steering support", - [20] = "Recoverable error events support" + [20] = "Recoverable error events support", + [21] = "Port Remap support" }; int i; @@ -863,6 +864,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_EQE_STRIDE; MLX4_GET(dev_cap->bmme_flags, outbox, QUERY_DEV_CAP_BMME_FLAGS_OFFSET); + if (dev_cap->bmme_flags & MLX4_FLAG_PORT_REMAP) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_PORT_REMAP; MLX4_GET(field, outbox, QUERY_DEV_CAP_CONFIG_DEV_OFFSET); if (field & 0x20) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_CONFIG_DEV; @@ -1120,9 +1123,10 @@ int mlx4_QUERY_DEV_CAP_wrapper(struct mlx4_dev *dev, int slave, field &= 0x7f; MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_BF_OFFSET); - /* For guests, disable mw type 2 */ + /* For guests, disable mw type 2 and port remap*/ MLX4_GET(bmme_flags, outbox->buf, QUERY_DEV_CAP_BMME_FLAGS_OFFSET); bmme_flags &= ~MLX4_BMME_FLAG_TYPE_2_WIN; + bmme_flags &= ~MLX4_FLAG_PORT_REMAP; MLX4_PUT(outbox->buf, bmme_flags, QUERY_DEV_CAP_BMME_FLAGS_OFFSET); /* turn off device-managed steering capability if not enabled */ @@ -2100,13 +2104,16 @@ struct mlx4_config_dev { __be32 rsvd1[3]; __be16 vxlan_udp_dport; __be16 rsvd2; - __be32 rsvd3[27]; - __be16 rsvd4; - u8 rsvd5; + __be32 rsvd3; + __be32 roce_flags; + __be32 rsvd4[25]; + __be16 rsvd5; + u8 rsvd6; u8 rx_checksum_val; }; #define MLX4_VXLAN_UDP_DPORT (1 << 0) +#define MLX4_DISABLE_RX_PORT BIT(18) static int mlx4_CONFIG_DEV_set(struct mlx4_dev *dev, struct mlx4_config_dev *config_dev) { @@ -2209,6 +2216,45 @@ int mlx4_config_vxlan_port(struct mlx4_dev *dev, __be16 udp_port) } EXPORT_SYMBOL_GPL(mlx4_config_vxlan_port); +#define CONFIG_DISABLE_RX_PORT BIT(15) +int mlx4_disable_rx_port_check(struct mlx4_dev *dev, bool dis) +{ + struct mlx4_config_dev config_dev; + + memset(&config_dev, 0, sizeof(config_dev)); + config_dev.update_flags = cpu_to_be32(MLX4_DISABLE_RX_PORT); + if (dis) + config_dev.roce_flags = + cpu_to_be32(CONFIG_DISABLE_RX_PORT); + + return mlx4_CONFIG_DEV_set(dev, &config_dev); +} + +int mlx4_virt2phy_port_map(struct mlx4_dev *dev, u32 port1, u32 port2) +{ + struct 
mlx4_cmd_mailbox *mailbox; + struct { + __be32 v_port1; + __be32 v_port2; + } *v2p; + int err; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return -ENOMEM; + + v2p = mailbox->buf; + v2p->v_port1 = cpu_to_be32(port1); + v2p->v_port2 = cpu_to_be32(port2); + + err = mlx4_cmd(dev, mailbox->dma, 0, + MLX4_SET_PORT_VIRT2PHY, MLX4_CMD_VIRT_PORT_MAP, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + int mlx4_SET_ICM_SIZE(struct mlx4_dev *dev, u64 icm_size, u64 *aux_pages) { diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h index ae95adc78509..7b6d4e9ff603 100644 --- a/include/linux/mlx4/cmd.h +++ b/include/linux/mlx4/cmd.h @@ -71,6 +71,7 @@ enum { /*master notify fw on finish for slave's flr*/ MLX4_CMD_INFORM_FLR_DONE = 0x5b, + MLX4_CMD_VIRT_PORT_MAP = 0x5c, MLX4_CMD_GET_OP_REQ = 0x59, /* TPT commands */ @@ -170,6 +171,12 @@ enum { MLX4_CMD_TIME_CLASS_C = 60000, }; +enum { + /* virtual to physical port mapping opcode modifiers */ + MLX4_GET_PORT_VIRT2PHY = 0x0, + MLX4_SET_PORT_VIRT2PHY = 0x1, +}; + enum { MLX4_MAILBOX_SIZE = 4096, MLX4_ACCESS_MEM_ALIGN = 256, diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index c95d659a39f2..d9afd99dde39 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -201,7 +201,8 @@ enum { MLX4_DEV_CAP_FLAG2_SYS_EQS = 1LL << 17, MLX4_DEV_CAP_FLAG2_80_VFS = 1LL << 18, MLX4_DEV_CAP_FLAG2_FS_A0 = 1LL << 19, - MLX4_DEV_CAP_FLAG2_RECOVERABLE_ERROR_EVENT = 1LL << 20 + MLX4_DEV_CAP_FLAG2_RECOVERABLE_ERROR_EVENT = 1LL << 20, + MLX4_DEV_CAP_FLAG2_PORT_REMAP = 1LL << 21 }; enum { @@ -253,9 +254,14 @@ enum { MLX4_BMME_FLAG_TYPE_2_WIN = 1 << 9, MLX4_BMME_FLAG_RESERVED_LKEY = 1 << 10, MLX4_BMME_FLAG_FAST_REG_WR = 1 << 11, + MLX4_BMME_FLAG_PORT_REMAP = 1 << 24, MLX4_BMME_FLAG_VSD_INIT2RTR = 1 << 28, }; +enum { + MLX4_FLAG_PORT_REMAP = MLX4_BMME_FLAG_PORT_REMAP +}; + enum mlx4_event { MLX4_EVENT_TYPE_COMP = 0x00, MLX4_EVENT_TYPE_PATH_MIG = 0x01, @@ -1378,6 +1384,8 @@ int mlx4_phys_to_slave_port(struct mlx4_dev *dev, int slave, int port); int mlx4_get_base_gid_ix(struct mlx4_dev *dev, int slave, int port); int mlx4_config_vxlan_port(struct mlx4_dev *dev, __be16 udp_port); +int mlx4_disable_rx_port_check(struct mlx4_dev *dev, bool dis); +int mlx4_virt2phy_port_map(struct mlx4_dev *dev, u32 port1, u32 port2); int mlx4_vf_smi_enabled(struct mlx4_dev *dev, int slave, int port); int mlx4_vf_get_enable_smi_admin(struct mlx4_dev *dev, int slave, int port); int mlx4_vf_set_enable_smi_admin(struct mlx4_dev *dev, int slave, int port, diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h index 467ccdf94c98..2bbc62aa818a 100644 --- a/include/linux/mlx4/qp.h +++ b/include/linux/mlx4/qp.h @@ -96,6 +96,7 @@ enum { MLX4_QP_BIT_RRE = 1 << 15, MLX4_QP_BIT_RWE = 1 << 14, MLX4_QP_BIT_RAE = 1 << 13, + MLX4_QP_BIT_FPP = 1 << 3, MLX4_QP_BIT_RIC = 1 << 4, }; -- cgit v1.2.3 From 53f33ae295a5098f12218da1400f55ad7df7447c Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Tue, 3 Feb 2015 16:48:33 +0200 Subject: net/mlx4_core: Port aggregation upper layer interface Supply interface functions to bond and unbond ports for mlx4 internal interfaces. An example of such an interface is the one registered by the mlx4 IB driver under RoCE. There are: 1. Functions to go in and out of bonded mode. 2. A function to remap virtual ports to physical ports. The bond_mutex prevents simultaneous access to data that keeps the status of the device in bonded mode.
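As a usage sketch, an upper-layer consumer of these entry points might do something like the following (a hypothetical caller; my_enable_bonding() and the chosen port values are made up, while mlx4_bond(), mlx4_unbond() and mlx4_port_map_set() are the functions added by this patch):

	/* Hypothetical consumer: enter bonded mode, then steer both
	 * virtual ports to physical port 1; error handling trimmed. */
	static int my_enable_bonding(struct mlx4_dev *dev)
	{
		struct mlx4_port_map v2p = { .port1 = 1, .port2 = 1 };
		int err;

		err = mlx4_bond(dev);
		if (err)
			return err;

		err = mlx4_port_map_set(dev, &v2p);
		if (err)
			mlx4_unbond(dev);	/* roll back on failure */
		return err;
	}

Note that mlx4_port_map_set() treats a zero port value as "keep the current mapping for this port" and rejects the cross mapping (port1 == 2 with port2 == 1).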
An upper mlx4 interface marks to the mlx4 core module that it wants to be subject to such bonding by setting the MLX4_INTFF_BONDING flag. An interface which goes to/from bonded mode is re-created. The mlx4 Ethernet driver does not set this flag when registering the interface; the IB driver does. Signed-off-by: Moni Shoua Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/en_resources.c | 8 +- drivers/net/ethernet/mellanox/mlx4/intf.c | 54 +++++++++++++ drivers/net/ethernet/mellanox/mlx4/main.c | 89 ++++++++++++++++++++++ drivers/net/ethernet/mellanox/mlx4/mlx4.h | 3 + drivers/net/ethernet/mellanox/mlx4/qp.c | 2 + .../net/ethernet/mellanox/mlx4/resource_tracker.c | 3 + include/linux/mlx4/device.h | 1 + include/linux/mlx4/driver.h | 19 +++++ 8 files changed, 177 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/en_resources.c b/drivers/net/ethernet/mellanox/mlx4/en_resources.c index f1a5500ff72d..34f2fdf4fe5d 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_resources.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_resources.c @@ -50,10 +50,14 @@ void mlx4_en_fill_qp_context(struct mlx4_en_priv *priv, int size, int stride, context->mtu_msgmax = 0xff; if (!is_tx && !rss) context->rq_size_stride = ilog2(size) << 3 | (ilog2(stride) - 4); - if (is_tx) + if (is_tx) { context->sq_size_stride = ilog2(size) << 3 | (ilog2(stride) - 4); - else + if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_PORT_REMAP) + context->params2 |= MLX4_QP_BIT_FPP; + + } else { context->sq_size_stride = ilog2(TXBB_SIZE) - 4; + } context->usr_page = cpu_to_be32(mdev->priv_uar.index); context->local_qpn = cpu_to_be32(qpn); context->pri_path.ackto = 1 & 0x07; diff --git a/drivers/net/ethernet/mellanox/mlx4/intf.c b/drivers/net/ethernet/mellanox/mlx4/intf.c index 68d2bad325d5..6fce58718837 100644 --- a/drivers/net/ethernet/mellanox/mlx4/intf.c +++ b/drivers/net/ethernet/mellanox/mlx4/intf.c @@ -33,11 +33,13 @@ #include #include +#include #include "mlx4.h" struct mlx4_device_context { struct list_head list; + struct list_head bond_list; struct mlx4_interface *intf; void *context; }; @@ -115,6 +117,58 @@ void mlx4_unregister_interface(struct mlx4_interface *intf) } EXPORT_SYMBOL_GPL(mlx4_unregister_interface); +int mlx4_do_bond(struct mlx4_dev *dev, bool enable) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_device_context *dev_ctx = NULL, *temp_dev_ctx; + unsigned long flags; + int ret; + LIST_HEAD(bond_list); + + if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_PORT_REMAP)) + return -ENOTSUPP; + + ret = mlx4_disable_rx_port_check(dev, enable); + if (ret) { + mlx4_err(dev, "Fail to %s rx port check\n", + enable ?
"enable" : "disable"); + return ret; + } + if (enable) { + dev->flags |= MLX4_FLAG_BONDED; + } else { + ret = mlx4_virt2phy_port_map(dev, 1, 2); + if (ret) { + mlx4_err(dev, "Fail to reset port map\n"); + return ret; + } + dev->flags &= ~MLX4_FLAG_BONDED; + } + + spin_lock_irqsave(&priv->ctx_lock, flags); + list_for_each_entry_safe(dev_ctx, temp_dev_ctx, &priv->ctx_list, list) { + if (dev_ctx->intf->flags & MLX4_INTFF_BONDING) { + list_add_tail(&dev_ctx->bond_list, &bond_list); + list_del(&dev_ctx->list); + } + } + spin_unlock_irqrestore(&priv->ctx_lock, flags); + + list_for_each_entry(dev_ctx, &bond_list, bond_list) { + dev_ctx->intf->remove(dev, dev_ctx->context); + dev_ctx->context = dev_ctx->intf->add(dev); + + spin_lock_irqsave(&priv->ctx_lock, flags); + list_add_tail(&dev_ctx->list, &priv->ctx_list); + spin_unlock_irqrestore(&priv->ctx_lock, flags); + + mlx4_dbg(dev, "Inrerface for protocol %d restarted with when bonded mode is %s\n", + dev_ctx->intf->protocol, enable ? + "enabled" : "disabled"); + } + return 0; +} + void mlx4_dispatch_event(struct mlx4_dev *dev, enum mlx4_dev_event type, unsigned long param) { diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index cc9f48439244..f3245fe0f442 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -1160,6 +1160,91 @@ err_set_port: return err ? err : count; } +int mlx4_bond(struct mlx4_dev *dev) +{ + int ret = 0; + struct mlx4_priv *priv = mlx4_priv(dev); + + mutex_lock(&priv->bond_mutex); + + if (!mlx4_is_bonded(dev)) + ret = mlx4_do_bond(dev, true); + else + ret = 0; + + mutex_unlock(&priv->bond_mutex); + if (ret) + mlx4_err(dev, "Failed to bond device: %d\n", ret); + else + mlx4_dbg(dev, "Device is bonded\n"); + return ret; +} +EXPORT_SYMBOL_GPL(mlx4_bond); + +int mlx4_unbond(struct mlx4_dev *dev) +{ + int ret = 0; + struct mlx4_priv *priv = mlx4_priv(dev); + + mutex_lock(&priv->bond_mutex); + + if (mlx4_is_bonded(dev)) + ret = mlx4_do_bond(dev, false); + + mutex_unlock(&priv->bond_mutex); + if (ret) + mlx4_err(dev, "Failed to unbond device: %d\n", ret); + else + mlx4_dbg(dev, "Device is unbonded\n"); + return ret; +} +EXPORT_SYMBOL_GPL(mlx4_unbond); + + +int mlx4_port_map_set(struct mlx4_dev *dev, struct mlx4_port_map *v2p) +{ + u8 port1 = v2p->port1; + u8 port2 = v2p->port2; + struct mlx4_priv *priv = mlx4_priv(dev); + int err; + + if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_PORT_REMAP)) + return -ENOTSUPP; + + mutex_lock(&priv->bond_mutex); + + /* zero means keep current mapping for this port */ + if (port1 == 0) + port1 = priv->v2p.port1; + if (port2 == 0) + port2 = priv->v2p.port2; + + if ((port1 < 1) || (port1 > MLX4_MAX_PORTS) || + (port2 < 1) || (port2 > MLX4_MAX_PORTS) || + (port1 == 2 && port2 == 1)) { + /* besides boundary checks cross mapping makes + * no sense and therefore not allowed */ + err = -EINVAL; + } else if ((port1 == priv->v2p.port1) && + (port2 == priv->v2p.port2)) { + err = 0; + } else { + err = mlx4_virt2phy_port_map(dev, port1, port2); + if (!err) { + mlx4_dbg(dev, "port map changed: [%d][%d]\n", + port1, port2); + priv->v2p.port1 = port1; + priv->v2p.port2 = port2; + } else { + mlx4_err(dev, "Failed to change port mape: %d\n", err); + } + } + + mutex_unlock(&priv->bond_mutex); + return err; +} +EXPORT_SYMBOL_GPL(mlx4_port_map_set); + static int mlx4_load_fw(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); @@ -2638,6 +2723,7 @@ static int mlx4_load_one(struct pci_dev *pdev, int pci_dev_data, 
spin_lock_init(&priv->ctx_lock); mutex_init(&priv->port_mutex); + mutex_init(&priv->bond_mutex); INIT_LIST_HEAD(&priv->pgdir_list); mutex_init(&priv->pgdir_mutex); @@ -2934,6 +3020,9 @@ slave_start: goto err_port; } + priv->v2p.port1 = 1; + priv->v2p.port2 = 2; + err = mlx4_register_device(dev); if (err) goto err_port; diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index 148dc0945aab..803f17653da7 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -885,6 +885,8 @@ struct mlx4_priv { int reserved_mtts; int fs_hash_mode; u8 virt2phys_pkey[MLX4_MFUNC_MAX][MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS]; + struct mlx4_port_map v2p; /* cached port mapping configuration */ + struct mutex bond_mutex; /* for bond mode */ __be64 slave_node_guids[MLX4_MFUNC_MAX]; atomic_t opreq_count; @@ -1364,6 +1366,7 @@ int mlx4_get_slave_num_gids(struct mlx4_dev *dev, int slave, int port); /* Returns the VF index of slave */ int mlx4_get_vf_indx(struct mlx4_dev *dev, int slave); int mlx4_config_mad_demux(struct mlx4_dev *dev); +int mlx4_do_bond(struct mlx4_dev *dev, bool enable); enum mlx4_zone_flags { MLX4_ZONE_ALLOW_ALLOC_FROM_LOWER_PRIO = 1UL << 0, diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c b/drivers/net/ethernet/mellanox/mlx4/qp.c index 1586ecce13c7..2bb8553bd905 100644 --- a/drivers/net/ethernet/mellanox/mlx4/qp.c +++ b/drivers/net/ethernet/mellanox/mlx4/qp.c @@ -882,6 +882,8 @@ int mlx4_qp_to_ready(struct mlx4_dev *dev, struct mlx4_mtt *mtt, for (i = 0; i < ARRAY_SIZE(states) - 1; i++) { context->flags &= cpu_to_be32(~(0xf << 28)); context->flags |= cpu_to_be32(states[i + 1] << 28); + if (states[i + 1] != MLX4_QP_STATE_RTR) + context->params2 &= ~MLX4_QP_BIT_FPP; err = mlx4_qp_modify(dev, mtt, states[i], states[i + 1], context, 0, 0, qp); if (err) { diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c index 79feeb6b0d87..c5f3dfca226b 100644 --- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c +++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c @@ -2944,6 +2944,9 @@ static int verify_qp_parameters(struct mlx4_dev *dev, qp_type = (be32_to_cpu(qp_ctx->flags) >> 16) & 0xff; optpar = be32_to_cpu(*(__be32 *) inbox->buf); + if (slave != mlx4_master_func_num(dev)) + qp_ctx->params2 &= ~MLX4_QP_BIT_FPP; + switch (qp_type) { case MLX4_QP_ST_RC: case MLX4_QP_ST_XRC: diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index d9afd99dde39..977b0b164431 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -70,6 +70,7 @@ enum { MLX4_FLAG_SLAVE = 1 << 3, MLX4_FLAG_SRIOV = 1 << 4, MLX4_FLAG_OLD_REG_MAC = 1 << 6, + MLX4_FLAG_BONDED = 1 << 7 }; enum { diff --git a/include/linux/mlx4/driver.h b/include/linux/mlx4/driver.h index 022055c8fb26..9553a73d2049 100644 --- a/include/linux/mlx4/driver.h +++ b/include/linux/mlx4/driver.h @@ -49,6 +49,10 @@ enum mlx4_dev_event { MLX4_DEV_EVENT_SLAVE_SHUTDOWN, }; +enum { + MLX4_INTFF_BONDING = 1 << 0 +}; + struct mlx4_interface { void * (*add) (struct mlx4_dev *dev); void (*remove)(struct mlx4_dev *dev, void *context); @@ -57,11 +61,26 @@ struct mlx4_interface { void * (*get_dev)(struct mlx4_dev *dev, void *context, u8 port); struct list_head list; enum mlx4_protocol protocol; + int flags; }; int mlx4_register_interface(struct mlx4_interface *intf); void mlx4_unregister_interface(struct mlx4_interface *intf); +int mlx4_bond(struct mlx4_dev *dev); +int 
mlx4_unbond(struct mlx4_dev *dev); +static inline int mlx4_is_bonded(struct mlx4_dev *dev) +{ + return !!(dev->flags & MLX4_FLAG_BONDED); +} + +struct mlx4_port_map { + u8 port1; + u8 port2; +}; + +int mlx4_port_map_set(struct mlx4_dev *dev, struct mlx4_port_map *v2p); + void *mlx4_get_protocol_dev(struct mlx4_dev *dev, enum mlx4_protocol proto, int port); static inline u64 mlx4_mac_to_u64(u8 *addr) -- cgit v1.2.3 From f2dba9c6ff0d9a515b4c3f1b037cd65c8b2a868c Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 4 Feb 2015 07:33:23 +1100 Subject: rhashtable: Introduce rhashtable_walk_* Some existing rhashtable users get too intimate with it by walking the buckets directly. This prevents us from easily changing the internals of rhashtable. This patch adds the helpers rhashtable_walk_init/exit/start/next/stop which will replace these custom walkers. They are meant to be usable for both procfs seq_file walks as well as walking by a netlink dump. The iterator structure should fit inside a netlink dump cb structure, with at least one element to spare. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 35 ++++++++++ lib/rhashtable.c | 163 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 198 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index e0337844358e..58851275fed9 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -18,6 +18,7 @@ #ifndef _LINUX_RHASHTABLE_H #define _LINUX_RHASHTABLE_H +#include #include #include #include @@ -111,6 +112,7 @@ struct rhashtable_params { * @p: Configuration parameters * @run_work: Deferred worker to expand/shrink asynchronously * @mutex: Mutex to protect current/future table swapping + * @walkers: List of active walkers * @being_destroyed: True if table is set up for destruction */ struct rhashtable { @@ -121,9 +123,36 @@ struct rhashtable { struct rhashtable_params p; struct work_struct run_work; struct mutex mutex; + struct list_head walkers; bool being_destroyed; }; +/** + * struct rhashtable_walker - Hash table walker + * @list: List entry on list of walkers + * @resize: Resize event occurred + */ +struct rhashtable_walker { + struct list_head list; + bool resize; +}; + +/** + * struct rhashtable_iter - Hash table iterator, fits into netlink cb + * @ht: Table to iterate through + * @p: Current pointer + * @walker: Associated rhashtable walker + * @slot: Current slot + * @skip: Number of entries to skip in slot + */ +struct rhashtable_iter { + struct rhashtable *ht; + struct rhash_head *p; + struct rhashtable_walker *walker; + unsigned int slot; + unsigned int skip; +}; + static inline unsigned long rht_marker(const struct rhashtable *ht, u32 hash) { return NULLS_MARKER(ht->p.nulls_base + hash); @@ -179,6 +208,12 @@ bool rhashtable_lookup_compare_insert(struct rhashtable *ht, bool (*compare)(void *, void *), void *arg); +int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter); +void rhashtable_walk_exit(struct rhashtable_iter *iter); +int rhashtable_walk_start(struct rhashtable_iter *iter) __acquires(RCU); +void *rhashtable_walk_next(struct rhashtable_iter *iter); +void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU); + void rhashtable_destroy(struct rhashtable *ht); #define rht_dereference(p, ht) \ diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 904b419b72f5..057919164e23 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -484,6 +484,7 @@ static void
rht_deferred_worker(struct work_struct *work) { struct rhashtable *ht; struct bucket_table *tbl; + struct rhashtable_walker *walker; ht = container_of(work, struct rhashtable, run_work); mutex_lock(&ht->mutex); @@ -492,6 +493,9 @@ static void rht_deferred_worker(struct work_struct *work) tbl = rht_dereference(ht->tbl, ht); + list_for_each_entry(walker, &ht->walkers, list) + walker->resize = true; + if (ht->p.grow_decision && ht->p.grow_decision(ht, tbl->size)) rhashtable_expand(ht); else if (ht->p.shrink_decision && ht->p.shrink_decision(ht, tbl->size)) @@ -822,6 +826,164 @@ exit: } EXPORT_SYMBOL_GPL(rhashtable_lookup_compare_insert); +/** + * rhashtable_walk_init - Initialise an iterator + * @ht: Table to walk over + * @iter: Hash table Iterator + * + * This function prepares a hash table walk. + * + * Note that if you restart a walk after rhashtable_walk_stop you + * may see the same object twice. Also, you may miss objects if + * there are removals in between rhashtable_walk_stop and the next + * call to rhashtable_walk_start. + * + * For a completely stable walk you should construct your own data + * structure outside the hash table. + * + * This function may sleep so you must not call it from interrupt + * context or with spin locks held. + * + * You must call rhashtable_walk_exit if this function returns + * successfully. + */ +int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter) +{ + iter->ht = ht; + iter->p = NULL; + iter->slot = 0; + iter->skip = 0; + + iter->walker = kmalloc(sizeof(*iter->walker), GFP_KERNEL); + if (!iter->walker) + return -ENOMEM; + + mutex_lock(&ht->mutex); + list_add(&iter->walker->list, &ht->walkers); + mutex_unlock(&ht->mutex); + + return 0; +} +EXPORT_SYMBOL_GPL(rhashtable_walk_init); + +/** + * rhashtable_walk_exit - Free an iterator + * @iter: Hash table Iterator + * + * This function frees resources allocated by rhashtable_walk_init. + */ +void rhashtable_walk_exit(struct rhashtable_iter *iter) +{ + mutex_lock(&iter->ht->mutex); + list_del(&iter->walker->list); + mutex_unlock(&iter->ht->mutex); + kfree(iter->walker); +} +EXPORT_SYMBOL_GPL(rhashtable_walk_exit); + +/** + * rhashtable_walk_start - Start a hash table walk + * @iter: Hash table iterator + * + * Start a hash table walk. Note that we take the RCU lock in all + * cases including when we return an error. So you must always call + * rhashtable_walk_stop to clean up. + * + * Returns zero if successful. + * + * Returns -EAGAIN if resize event occurred. Note that the iterator + * will rewind back to the beginning and you may use it immediately + * by calling rhashtable_walk_next. + */ +int rhashtable_walk_start(struct rhashtable_iter *iter) +{ + rcu_read_lock(); + + if (iter->walker->resize) { + iter->slot = 0; + iter->skip = 0; + iter->walker->resize = false; + return -EAGAIN; + } + + return 0; +} +EXPORT_SYMBOL_GPL(rhashtable_walk_start); + +/** + * rhashtable_walk_next - Return the next object and advance the iterator + * @iter: Hash table iterator + * + * Note that you must call rhashtable_walk_stop when you are finished + * with the walk. + * + * Returns the next object or NULL when the end of the table is reached. + * + * Returns -EAGAIN if resize event occurred. Note that the iterator + * will rewind back to the beginning and you may continue to use it.
+ */ +void *rhashtable_walk_next(struct rhashtable_iter *iter) +{ + const struct bucket_table *tbl; + struct rhashtable *ht = iter->ht; + struct rhash_head *p = iter->p; + void *obj = NULL; + + tbl = rht_dereference_rcu(ht->tbl, ht); + + if (p) { + p = rht_dereference_bucket_rcu(p->next, tbl, iter->slot); + goto next; + } + + for (; iter->slot < tbl->size; iter->slot++) { + int skip = iter->skip; + + rht_for_each_rcu(p, tbl, iter->slot) { + if (!skip) + break; + skip--; + } + +next: + if (!rht_is_a_nulls(p)) { + iter->skip++; + iter->p = p; + obj = rht_obj(ht, p); + goto out; + } + + iter->skip = 0; + } + + iter->p = NULL; + +out: + if (iter->walker->resize) { + iter->p = NULL; + iter->slot = 0; + iter->skip = 0; + iter->walker->resize = false; + return ERR_PTR(-EAGAIN); + } + + return obj; +} +EXPORT_SYMBOL_GPL(rhashtable_walk_next); + +/** + * rhashtable_walk_stop - Finish a hash table walk + * @iter: Hash table iterator + * + * Finish a hash table walk. + */ +void rhashtable_walk_stop(struct rhashtable_iter *iter) +{ + rcu_read_unlock(); + iter->p = NULL; +} +EXPORT_SYMBOL_GPL(rhashtable_walk_stop); + static size_t rounded_hashtable_size(struct rhashtable_params *params) { return max(roundup_pow_of_two(params->nelem_hint * 4 / 3), @@ -894,6 +1056,7 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params) memset(ht, 0, sizeof(*ht)); mutex_init(&ht->mutex); memcpy(&ht->p, params, sizeof(*params)); + INIT_LIST_HEAD(&ht->walkers); if (params->locks_mul) ht->p.locks_mul = roundup_pow_of_two(params->locks_mul); -- cgit v1.2.3 From cfd8fef322305bbe9955817464f0d2054ce545c0 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 13 Jan 2015 19:08:13 +0200 Subject: dmaengine: dw: amend description of dma_dev field The dma_dev field is widely used in filter functions to match with a proper DMA controller device. Thus it's not deprecated. The patch fixes the description of that field. There is no functional change. Signed-off-by: Andy Shevchenko Signed-off-by: Vinod Koul --- include/linux/platform_data/dma-dw.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/dma-dw.h b/include/linux/platform_data/dma-dw.h index d8155c005242..359127d290e7 100644 --- a/include/linux/platform_data/dma-dw.h +++ b/include/linux/platform_data/dma-dw.h @@ -16,7 +16,7 @@ /** * struct dw_dma_slave - Controller-specific information about a slave * - * @dma_dev: required DMA master device. Depricated. + * @dma_dev: required DMA master device * @src_id: src request line * @dst_id: dst request line * @src_master: src master for transfers on allocated channel. -- cgit v1.2.3 From d8ded50f8b26a224df48f9f93e49440c6a39b77f Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 13 Jan 2015 19:08:14 +0200 Subject: dmaengine: dw: define DW_DMA_MAX_NR_MASTERS Instead of using a magic number in the code, the patch provides the DW_DMA_MAX_NR_MASTERS constant. While here, restrict the reading of the data width array to the actual number of AHB masters.
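For instance, a board file using the constant might now look like this (a sketch only; the values and the my_dmac_pdata name are illustrative, while the field names come from struct dw_dma_platform_data in the diff below):

	/* Hypothetical platform data: two AHB masters, each with a
	 * 64-bit (2^3 byte) data width; data_width[] is now sized by
	 * DW_DMA_MAX_NR_MASTERS rather than a bare 4. */
	static struct dw_dma_platform_data my_dmac_pdata = {
		.nr_channels	= 8,
		.block_size	= 4095U,
		.nr_masters	= 2,
		.data_width	= { 3, 3 },
	};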
Signed-off-by: Andy Shevchenko Signed-off-by: Vinod Koul --- Documentation/devicetree/bindings/dma/snps-dma.txt | 2 +- arch/arc/boot/dts/abilis_tb10x.dtsi | 2 +- arch/arm/boot/dts/spear13xx.dtsi | 4 ++-- arch/avr32/mach-at32ap/at32ap700x.c | 2 +- drivers/dma/dw/core.c | 3 ++- drivers/dma/dw/platform.c | 4 ++-- drivers/dma/dw/regs.h | 2 +- include/linux/platform_data/dma-dw.h | 4 +++- 8 files changed, 13 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/dma/snps-dma.txt b/Documentation/devicetree/bindings/dma/snps-dma.txt index d58675ea1abf..c261598164a7 100644 --- a/Documentation/devicetree/bindings/dma/snps-dma.txt +++ b/Documentation/devicetree/bindings/dma/snps-dma.txt @@ -38,7 +38,7 @@ Example: chan_allocation_order = <1>; chan_priority = <1>; block_size = <0xfff>; - data_width = <3 3 0 0>; + data_width = <3 3>; }; DMA clients connected to the Designware DMA controller must use the format diff --git a/arch/arc/boot/dts/abilis_tb10x.dtsi b/arch/arc/boot/dts/abilis_tb10x.dtsi index a098d7c05e96..cfb5052239a1 100644 --- a/arch/arc/boot/dts/abilis_tb10x.dtsi +++ b/arch/arc/boot/dts/abilis_tb10x.dtsi @@ -112,7 +112,7 @@ chan_allocation_order = <0>; chan_priority = <1>; block_size = <0x7ff>; - data_width = <2 0 0 0>; + data_width = <2>; clocks = <&ahb_clk>; clock-names = "hclk"; }; diff --git a/arch/arm/boot/dts/spear13xx.dtsi b/arch/arm/boot/dts/spear13xx.dtsi index a6eb5436d26d..40accc87e3a2 100644 --- a/arch/arm/boot/dts/spear13xx.dtsi +++ b/arch/arm/boot/dts/spear13xx.dtsi @@ -117,7 +117,7 @@ chan_priority = <1>; block_size = <0xfff>; dma-masters = <2>; - data_width = <3 3 0 0>; + data_width = <3 3>; }; dma@eb000000 { @@ -133,7 +133,7 @@ chan_allocation_order = <1>; chan_priority = <1>; block_size = <0xfff>; - data_width = <3 3 0 0>; + data_width = <3 3>; }; fsmc: flash@b0000000 { diff --git a/arch/avr32/mach-at32ap/at32ap700x.c b/arch/avr32/mach-at32ap/at32ap700x.c index cc92cdb9994c..1d8b147282cf 100644 --- a/arch/avr32/mach-at32ap/at32ap700x.c +++ b/arch/avr32/mach-at32ap/at32ap700x.c @@ -607,7 +607,7 @@ static struct dw_dma_platform_data dw_dmac0_data = { .nr_channels = 3, .block_size = 4095U, .nr_masters = 2, - .data_width = { 2, 2, 0, 0 }, + .data_width = { 2, 2 }, }; static struct resource dw_dmac0_resource[] = { diff --git a/drivers/dma/dw/core.c b/drivers/dma/dw/core.c index fcb9a916e6f6..0469d8eda253 100644 --- a/drivers/dma/dw/core.c +++ b/drivers/dma/dw/core.c @@ -1562,7 +1562,8 @@ int dw_dma_probe(struct dw_dma_chip *chip, struct dw_dma_platform_data *pdata) } } else { dw->nr_masters = pdata->nr_masters; - memcpy(dw->data_width, pdata->data_width, 4); + for (i = 0; i < dw->nr_masters; i++) + dw->data_width[i] = pdata->data_width[i]; } /* Calculate all channel mask before DMA setup */ diff --git a/drivers/dma/dw/platform.c b/drivers/dma/dw/platform.c index a630161473a4..aaff37f53523 100644 --- a/drivers/dma/dw/platform.c +++ b/drivers/dma/dw/platform.c @@ -99,7 +99,7 @@ dw_dma_parse_dt(struct platform_device *pdev) { struct device_node *np = pdev->dev.of_node; struct dw_dma_platform_data *pdata; - u32 tmp, arr[4]; + u32 tmp, arr[DW_DMA_MAX_NR_MASTERS]; if (!np) { dev_err(&pdev->dev, "Missing DT data\n"); @@ -126,7 +126,7 @@ dw_dma_parse_dt(struct platform_device *pdev) pdata->block_size = tmp; if (!of_property_read_u32(np, "dma-masters", &tmp)) { - if (tmp > 4) + if (tmp > DW_DMA_MAX_NR_MASTERS) return NULL; pdata->nr_masters = tmp; diff --git a/drivers/dma/dw/regs.h b/drivers/dma/dw/regs.h index 
254a1db03680..241ff2b1402b 100644 --- a/drivers/dma/dw/regs.h +++ b/drivers/dma/dw/regs.h @@ -285,7 +285,7 @@ struct dw_dma { /* hardware configuration */ unsigned char nr_masters; - unsigned char data_width[4]; + unsigned char data_width[DW_DMA_MAX_NR_MASTERS]; }; static inline struct dw_dma_regs __iomem *__dw_regs(struct dw_dma *dw) diff --git a/include/linux/platform_data/dma-dw.h b/include/linux/platform_data/dma-dw.h index 359127d290e7..87ac14c584f2 100644 --- a/include/linux/platform_data/dma-dw.h +++ b/include/linux/platform_data/dma-dw.h @@ -13,6 +13,8 @@ #include +#define DW_DMA_MAX_NR_MASTERS 4 + /** * struct dw_dma_slave - Controller-specific information about a slave * @@ -53,7 +55,7 @@ struct dw_dma_platform_data { unsigned char chan_priority; unsigned short block_size; unsigned char nr_masters; - unsigned char data_width[4]; + unsigned char data_width[DW_DMA_MAX_NR_MASTERS]; }; #endif /* _PLATFORM_DATA_DMA_DW_H */ -- cgit v1.2.3 From 0ae45f63d4ef8d8eeec49c7d8b44a1775fff13e8 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 2 Feb 2015 00:37:00 -0500 Subject: vfs: add support for a lazytime mount option Add a new mount option which enables a new "lazytime" mode. This mode causes atime, mtime, and ctime updates to only be made to the in-memory version of the inode. The on-disk times will only get updated when (a) the inode needs to be updated for some non-time-related change, (b) userspace calls fsync(), syncfs() or sync(), or (c) just before an undeleted inode is evicted from memory. This is OK according to POSIX because there are no guarantees after a crash unless userspace explicitly requests it via an fsync(2) call. For workloads which feature a large number of random writes to a preallocated file, the lazytime mount option significantly reduces writes to the inode table. The repeated 4k writes to a single block will result in undesirable stress on flash devices and SMR disk drives. Even on conventional HDD's, the repeated writes to the inode table block will trigger Adjacent Track Interference (ATI) remediation latencies, which very negatively impact long tail latencies --- which is a very big deal for web serving tiers (for example). Google-Bug-Id: 18297052 Signed-off-by: Theodore Ts'o Signed-off-by: Al Viro --- fs/ext4/inode.c | 6 ++++ fs/fs-writeback.c | 62 +++++++++++++++++++++++++++++++++------- fs/gfs2/file.c | 4 +-- fs/inode.c | 56 +++++++++++++++++++++++++----------- fs/jfs/file.c | 2 +- fs/libfs.c | 2 +- fs/proc_namespace.c | 1 + fs/sync.c | 8 ++++++ include/linux/backing-dev.h | 1 + include/linux/fs.h | 5 ++++ include/trace/events/writeback.h | 60 +++++++++++++++++++++++++++++++++++++- include/uapi/linux/fs.h | 4 ++- mm/backing-dev.c | 10 +++++-- 13 files changed, 186 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5653fa42930b..628df5ba44a6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4840,11 +4840,17 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) * If the inode is marked synchronous, we don't honour that here - doing * so would cause a commit on atime updates, which we don't bother doing. * We handle synchronous inodes at the highest possible level. + * + * If only the I_DIRTY_TIME flag is set, we can skip everything. If + * I_DIRTY_TIME and I_DIRTY_SYNC are set, the only inode fields we need + * to copy into the on-disk inode structure are the timestamp fields.
*/ void ext4_dirty_inode(struct inode *inode, int flags) { handle_t *handle; + if (flags == I_DIRTY_TIME) + return; handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) goto out; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 2d609a5fbfea..004686191354 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -247,14 +247,19 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t) return ret; } +#define EXPIRE_DIRTY_ATIME 0x0001 + /* * Move expired (dirtied before work->older_than_this) dirty inodes from * @delaying_queue to @dispatch_queue. */ static int move_expired_inodes(struct list_head *delaying_queue, struct list_head *dispatch_queue, + int flags, struct wb_writeback_work *work) { + unsigned long *older_than_this = NULL; + unsigned long expire_time; LIST_HEAD(tmp); struct list_head *pos, *node; struct super_block *sb = NULL; @@ -262,13 +267,21 @@ static int move_expired_inodes(struct list_head *delaying_queue, int do_sb_sort = 0; int moved = 0; + if ((flags & EXPIRE_DIRTY_ATIME) == 0) + older_than_this = work->older_than_this; + else if ((work->reason == WB_REASON_SYNC) == 0) { + expire_time = jiffies - (HZ * 86400); + older_than_this = &expire_time; + } while (!list_empty(delaying_queue)) { inode = wb_inode(delaying_queue->prev); - if (work->older_than_this && - inode_dirtied_after(inode, *work->older_than_this)) + if (older_than_this && + inode_dirtied_after(inode, *older_than_this)) break; list_move(&inode->i_wb_list, &tmp); moved++; + if (flags & EXPIRE_DIRTY_ATIME) + set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state); if (sb_is_blkdev_sb(inode->i_sb)) continue; if (sb && sb != inode->i_sb) @@ -309,9 +322,12 @@ out: static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work) { int moved; + assert_spin_locked(&wb->list_lock); list_splice_init(&wb->b_more_io, &wb->b_io); - moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work); + moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work); + moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io, + EXPIRE_DIRTY_ATIME, work); trace_writeback_queue_io(wb, work, moved); } @@ -435,6 +451,8 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, * updates after data IO completion. */ redirty_tail(inode, wb); + } else if (inode->i_state & I_DIRTY_TIME) { + list_move(&inode->i_wb_list, &wb->b_dirty_time); } else { /* The inode is clean. Remove from writeback lists. */ list_del_init(&inode->i_wb_list); @@ -481,7 +499,13 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) spin_lock(&inode->i_lock); dirty = inode->i_state & I_DIRTY; - inode->i_state &= ~I_DIRTY; + if (((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) && + (inode->i_state & I_DIRTY_TIME)) || + (inode->i_state & I_DIRTY_TIME_EXPIRED)) { + dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED; + trace_writeback_lazytime(inode); + } + inode->i_state &= ~dirty; /* * Paired with smp_mb() in __mark_inode_dirty(). 
This allows @@ -501,8 +525,10 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) spin_unlock(&inode->i_lock); + if (dirty & I_DIRTY_TIME) + mark_inode_dirty_sync(inode); /* Don't write the inode if only I_DIRTY_PAGES was set */ - if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { + if (dirty & ~I_DIRTY_PAGES) { int err = write_inode(inode, wbc); if (ret == 0) ret = err; @@ -550,7 +576,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, * make sure inode is on some writeback list and leave it there unless * we have completely cleaned the inode. */ - if (!(inode->i_state & I_DIRTY) && + if (!(inode->i_state & I_DIRTY_ALL) && (wbc->sync_mode != WB_SYNC_ALL || !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))) goto out; @@ -565,7 +591,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, * If inode is clean, remove it from writeback lists. Otherwise don't * touch it. See comment above for explanation. */ - if (!(inode->i_state & I_DIRTY)) + if (!(inode->i_state & I_DIRTY_ALL)) list_del_init(&inode->i_wb_list); spin_unlock(&wb->list_lock); inode_sync_complete(inode); @@ -707,7 +733,7 @@ static long writeback_sb_inodes(struct super_block *sb, wrote += write_chunk - wbc.nr_to_write; spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); - if (!(inode->i_state & I_DIRTY)) + if (!(inode->i_state & I_DIRTY_ALL)) wrote++; requeue_inode(inode, wb, &wbc); inode_sync_complete(inode); @@ -1145,16 +1171,20 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode) * page->mapping->host, so the page-dirtying time is recorded in the internal * blockdev inode. */ +#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) void __mark_inode_dirty(struct inode *inode, int flags) { struct super_block *sb = inode->i_sb; struct backing_dev_info *bdi = NULL; + int dirtytime; + + trace_writeback_mark_inode_dirty(inode, flags); /* * Don't do this for I_DIRTY_PAGES - that doesn't actually * dirty the inode itself */ - if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { + if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_TIME)) { trace_writeback_dirty_inode_start(inode, flags); if (sb->s_op->dirty_inode) @@ -1162,6 +1192,9 @@ void __mark_inode_dirty(struct inode *inode, int flags) trace_writeback_dirty_inode(inode, flags); } + if (flags & I_DIRTY_INODE) + flags &= ~I_DIRTY_TIME; + dirtytime = flags & I_DIRTY_TIME; /* * Paired with smp_mb() in __writeback_single_inode() for the @@ -1169,16 +1202,21 @@ void __mark_inode_dirty(struct inode *inode, int flags) */ smp_mb(); - if ((inode->i_state & flags) == flags) + if (((inode->i_state & flags) == flags) || + (dirtytime && (inode->i_state & I_DIRTY_INODE))) return; if (unlikely(block_dump)) block_dump___mark_inode_dirty(inode); spin_lock(&inode->i_lock); + if (dirtytime && (inode->i_state & I_DIRTY_INODE)) + goto out_unlock_inode; if ((inode->i_state & flags) != flags) { const int was_dirty = inode->i_state & I_DIRTY; + if (flags & I_DIRTY_INODE) + inode->i_state &= ~I_DIRTY_TIME; inode->i_state |= flags; /* @@ -1225,8 +1263,10 @@ void __mark_inode_dirty(struct inode *inode, int flags) } inode->dirtied_when = jiffies; - list_move(&inode->i_wb_list, &bdi->wb.b_dirty); + list_move(&inode->i_wb_list, dirtytime ? 
+ &bdi->wb.b_dirty_time : &bdi->wb.b_dirty); spin_unlock(&bdi->wb.list_lock); + trace_writeback_dirty_inode_enqueue(inode); if (wakeup_bdi) bdi_wakeup_thread_delayed(bdi); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 6e600abf694a..15c44cf457cc 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -655,7 +655,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, { struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - int sync_state = inode->i_state & I_DIRTY; + int sync_state = inode->i_state & I_DIRTY_ALL; struct gfs2_inode *ip = GFS2_I(inode); int ret = 0, ret1 = 0; @@ -668,7 +668,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, if (!gfs2_is_jdata(ip)) sync_state &= ~I_DIRTY_PAGES; if (datasync) - sync_state &= ~I_DIRTY_SYNC; + sync_state &= ~(I_DIRTY_SYNC | I_DIRTY_TIME); if (sync_state) { ret = sync_inode_metadata(inode, 1); diff --git a/fs/inode.c b/fs/inode.c index aa149e7262ac..4feb85cc125f 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -18,6 +18,7 @@ #include /* for inode_has_buffers */ #include #include +#include #include "internal.h" /* @@ -30,7 +31,7 @@ * inode_sb_list_lock protects: * sb->s_inodes, inode->i_sb_list * bdi->wb.list_lock protects: - * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list + * bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_wb_list * inode_hash_lock protects: * inode_hashtable, inode->i_hash * @@ -416,7 +417,8 @@ static void inode_lru_list_add(struct inode *inode) */ void inode_add_lru(struct inode *inode) { - if (!(inode->i_state & (I_DIRTY | I_SYNC | I_FREEING | I_WILL_FREE)) && + if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC | + I_FREEING | I_WILL_FREE)) && !atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE) inode_lru_list_add(inode); } @@ -647,7 +649,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) spin_unlock(&inode->i_lock); continue; } - if (inode->i_state & I_DIRTY && !kill_dirty) { + if (inode->i_state & I_DIRTY_ALL && !kill_dirty) { spin_unlock(&inode->i_lock); busy = 1; continue; @@ -1432,11 +1434,20 @@ static void iput_final(struct inode *inode) */ void iput(struct inode *inode) { - if (inode) { - BUG_ON(inode->i_state & I_CLEAR); - - if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) - iput_final(inode); + if (!inode) + return; + BUG_ON(inode->i_state & I_CLEAR); +retry: + if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) { + if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) { + atomic_inc(&inode->i_count); + inode->i_state &= ~I_DIRTY_TIME; + spin_unlock(&inode->i_lock); + trace_writeback_lazytime_iput(inode); + mark_inode_dirty_sync(inode); + goto retry; + } + iput_final(inode); } } EXPORT_SYMBOL(iput); @@ -1495,14 +1506,9 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, return 0; } -/* - * This does the actual work of updating an inodes time or version. Must have - * had called mnt_want_write() before calling this. 
- */ -static int update_time(struct inode *inode, struct timespec *time, int flags) +int generic_update_time(struct inode *inode, struct timespec *time, int flags) { - if (inode->i_op->update_time) - return inode->i_op->update_time(inode, time, flags); + int iflags = I_DIRTY_TIME; if (flags & S_ATIME) inode->i_atime = *time; @@ -1512,9 +1518,27 @@ static int update_time(struct inode *inode, struct timespec *time, int flags) inode->i_ctime = *time; if (flags & S_MTIME) inode->i_mtime = *time; - mark_inode_dirty_sync(inode); + + if (!(inode->i_sb->s_flags & MS_LAZYTIME) || (flags & S_VERSION)) + iflags |= I_DIRTY_SYNC; + __mark_inode_dirty(inode, iflags); return 0; } +EXPORT_SYMBOL(generic_update_time); + +/* + * This does the actual work of updating an inodes time or version. Must have + * had called mnt_want_write() before calling this. + */ +static int update_time(struct inode *inode, struct timespec *time, int flags) +{ + int (*update_time)(struct inode *, struct timespec *, int); + + update_time = inode->i_op->update_time ? inode->i_op->update_time : + generic_update_time; + + return update_time(inode, time, flags); +} /** * touch_atime - update the access time diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 33aa0cc1f8b8..10815f8dfd8b 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -39,7 +39,7 @@ int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) return rc; mutex_lock(&inode->i_mutex); - if (!(inode->i_state & I_DIRTY) || + if (!(inode->i_state & I_DIRTY_ALL) || (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) { /* Make sure committed changes hit the disk */ jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1); diff --git a/fs/libfs.c b/fs/libfs.c index 005843ce5dbd..b2ffdb045be4 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -948,7 +948,7 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end, mutex_lock(&inode->i_mutex); ret = sync_mapping_buffers(inode->i_mapping); - if (!(inode->i_state & I_DIRTY)) + if (!(inode->i_state & I_DIRTY_ALL)) goto out; if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) goto out; diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 0f96f71ab32b..8db932da4009 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -44,6 +44,7 @@ static int show_sb_opts(struct seq_file *m, struct super_block *sb) { MS_SYNCHRONOUS, ",sync" }, { MS_DIRSYNC, ",dirsync" }, { MS_MANDLOCK, ",mand" }, + { MS_LAZYTIME, ",lazytime" }, { 0, NULL } }; const struct proc_fs_info *fs_infop; diff --git a/fs/sync.c b/fs/sync.c index 01d9f18a70b5..fbc98ee62044 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -177,8 +177,16 @@ SYSCALL_DEFINE1(syncfs, int, fd) */ int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync) { + struct inode *inode = file->f_mapping->host; + if (!file->f_op->fsync) return -EINVAL; + if (!datasync && (inode->i_state & I_DIRTY_TIME)) { + spin_lock(&inode->i_lock); + inode->i_state &= ~I_DIRTY_TIME; + spin_unlock(&inode->i_lock); + mark_inode_dirty_sync(inode); + } return file->f_op->fsync(file, start, end, datasync); } EXPORT_SYMBOL(vfs_fsync_range); diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 5da6012b7a14..4cdf7336f64a 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -55,6 +55,7 @@ struct bdi_writeback { struct list_head b_dirty; /* dirty inodes */ struct list_head b_io; /* parked for writeback */ struct list_head b_more_io; /* parked for more writeback */ + struct list_head b_dirty_time; /* time stamps are dirty */ spinlock_t list_lock; 
/* protects the b_* lists */ }; diff --git a/include/linux/fs.h b/include/linux/fs.h index 42efe13077b6..cd027ce2c705 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1746,8 +1746,12 @@ struct super_operations { #define __I_DIO_WAKEUP 9 #define I_DIO_WAKEUP (1 << I_DIO_WAKEUP) #define I_LINKABLE (1 << 10) +#define I_DIRTY_TIME (1 << 11) +#define __I_DIRTY_TIME_EXPIRED 12 +#define I_DIRTY_TIME_EXPIRED (1 << __I_DIRTY_TIME_EXPIRED) #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) +#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME) extern void __mark_inode_dirty(struct inode *, int); static inline void mark_inode_dirty(struct inode *inode) @@ -1910,6 +1914,7 @@ extern int current_umask(void); extern void ihold(struct inode * inode); extern void iput(struct inode *); +extern int generic_update_time(struct inode *, struct timespec *, int); static inline struct inode *file_inode(const struct file *f) { diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index cee02d65ab3f..5ecb4c234625 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -18,6 +18,8 @@ {I_FREEING, "I_FREEING"}, \ {I_CLEAR, "I_CLEAR"}, \ {I_SYNC, "I_SYNC"}, \ + {I_DIRTY_TIME, "I_DIRTY_TIME"}, \ + {I_DIRTY_TIME_EXPIRED, "I_DIRTY_TIME_EXPIRED"}, \ {I_REFERENCED, "I_REFERENCED"} \ ) @@ -68,6 +70,7 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template, TP_STRUCT__entry ( __array(char, name, 32) __field(unsigned long, ino) + __field(unsigned long, state) __field(unsigned long, flags) ), @@ -78,16 +81,25 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template, strncpy(__entry->name, bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32); __entry->ino = inode->i_ino; + __entry->state = inode->i_state; __entry->flags = flags; ), - TP_printk("bdi %s: ino=%lu flags=%s", + TP_printk("bdi %s: ino=%lu state=%s flags=%s", __entry->name, __entry->ino, + show_inode_state(__entry->state), show_inode_state(__entry->flags) ) ); +DEFINE_EVENT(writeback_dirty_inode_template, writeback_mark_inode_dirty, + + TP_PROTO(struct inode *inode, int flags), + + TP_ARGS(inode, flags) +); + DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode_start, TP_PROTO(struct inode *inode, int flags), @@ -598,6 +610,52 @@ DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode, TP_ARGS(inode, wbc, nr_to_write) ); +DECLARE_EVENT_CLASS(writeback_lazytime_template, + TP_PROTO(struct inode *inode), + + TP_ARGS(inode), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field(unsigned long, ino ) + __field(unsigned long, state ) + __field( __u16, mode ) + __field(unsigned long, dirtied_when ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->state = inode->i_state; + __entry->mode = inode->i_mode; + __entry->dirtied_when = inode->dirtied_when; + ), + + TP_printk("dev %d,%d ino %lu dirtied %lu state %s mode 0%o", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, __entry->dirtied_when, + show_inode_state(__entry->state), __entry->mode) +); + +DEFINE_EVENT(writeback_lazytime_template, writeback_lazytime, + TP_PROTO(struct inode *inode), + + TP_ARGS(inode) +); + +DEFINE_EVENT(writeback_lazytime_template, writeback_lazytime_iput, + TP_PROTO(struct inode *inode), + + TP_ARGS(inode) +); + +DEFINE_EVENT(writeback_lazytime_template, writeback_dirty_inode_enqueue, + + TP_PROTO(struct inode *inode), + + TP_ARGS(inode) +); + #endif /* _TRACE_WRITEBACK_H */ /* This part must be outside protection */ diff --git 
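A minimal usage sketch (hypothetical, not part of this patch; the function and callback names are invented for illustration): a filesystem that hashes inodes by inode number could implement a lookup that never blocks by pairing find_inode_nowait() with a match callback that does its own i_lock handling and reference counting:

/*
 * Hypothetical sketch only.  The match callback runs with
 * inode_hash_lock held, so it must not sleep; it takes i_lock itself,
 * refuses inodes that are being freed or initialized, and bumps the
 * reference count before returning 1, as the interface requires.
 */
static int nowait_match(struct inode *inode, unsigned long ino, void *data)
{
	if (inode->i_ino != ino)
		return 0;	/* not a match, keep scanning the hash chain */
	spin_lock(&inode->i_lock);
	if (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) {
		spin_unlock(&inode->i_lock);
		return -1;	/* stop the search rather than wait */
	}
	__iget(inode);		/* reference taken under i_lock */
	spin_unlock(&inode->i_lock);
	return 1;		/* find_inode_nowait() returns this inode */
}

static struct inode *lookup_inode_nowait(struct super_block *sb,
					 unsigned long ino)
{
	return find_inode_nowait(sb, ino, nowait_match, NULL);
}

Returning -1 for an inode in I_FREEING, I_WILL_FREE or I_NEW state makes find_inode_nowait() return NULL immediately; that is precisely the case where find_inode() would have blocked in __wait_on_freeing_inode().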
a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 3735fa0a6784..9b964a5920af 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -90,6 +90,7 @@ struct inodes_stat_t { #define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */ #define MS_I_VERSION (1<<23) /* Update inode I_version field */ #define MS_STRICTATIME (1<<24) /* Always perform atime updates */ +#define MS_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */ /* These sb flags are internal to the kernel */ #define MS_NOSEC (1<<28) @@ -100,7 +101,8 @@ struct inodes_stat_t { /* * Superblock flags that can be altered by MS_REMOUNT */ -#define MS_RMT_MASK (MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_I_VERSION) +#define MS_RMT_MASK (MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_I_VERSION|\ + MS_LAZYTIME) /* * Old magic mount flag and mask diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 0ae0df55000b..915feea94c66 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -69,10 +69,10 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) unsigned long background_thresh; unsigned long dirty_thresh; unsigned long bdi_thresh; - unsigned long nr_dirty, nr_io, nr_more_io; + unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time; struct inode *inode; - nr_dirty = nr_io = nr_more_io = 0; + nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0; spin_lock(&wb->list_lock); list_for_each_entry(inode, &wb->b_dirty, i_wb_list) nr_dirty++; @@ -80,6 +80,9 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) nr_io++; list_for_each_entry(inode, &wb->b_more_io, i_wb_list) nr_more_io++; + list_for_each_entry(inode, &wb->b_dirty_time, i_wb_list) + if (inode->i_state & I_DIRTY_TIME) + nr_dirty_time++; spin_unlock(&wb->list_lock); global_dirty_limits(&background_thresh, &dirty_thresh); @@ -98,6 +101,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) "b_dirty: %10lu\n" "b_io: %10lu\n" "b_more_io: %10lu\n" + "b_dirty_time: %10lu\n" "bdi_list: %10u\n" "state: %10lx\n", (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), @@ -111,6 +115,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) nr_dirty, nr_io, nr_more_io, + nr_dirty_time, !list_empty(&bdi->bdi_list), bdi->state); #undef K @@ -418,6 +423,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) INIT_LIST_HEAD(&wb->b_dirty); INIT_LIST_HEAD(&wb->b_io); INIT_LIST_HEAD(&wb->b_more_io); + INIT_LIST_HEAD(&wb->b_dirty_time); spin_lock_init(&wb->list_lock); INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn); } -- cgit v1.2.3 From fe032c422c5ba562ba9c2d316f55e258e03259c6 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 2 Feb 2015 00:37:01 -0500 Subject: vfs: add find_inode_nowait() function Add a new function find_inode_nowait() which is an even more general version of ilookup5_nowait(). It is designed for callers which need very fine grained control over when the function is allowed to block or increment the inode's reference count. 
Signed-off-by: Theodore Ts'o Signed-off-by: Al Viro --- fs/inode.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/fs.h | 5 +++++ 2 files changed, 55 insertions(+) (limited to 'include/linux') diff --git a/fs/inode.c b/fs/inode.c index 4feb85cc125f..740cba79c2b9 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1284,6 +1284,56 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino) } EXPORT_SYMBOL(ilookup); +/** + * find_inode_nowait - find an inode in the inode cache + * @sb: super block of file system to search + * @hashval: hash value (usually inode number) to search for + * @match: callback used for comparisons between inodes + * @data: opaque data pointer to pass to @match + * + * Search for the inode specified by @hashval and @data in the inode + * cache, where the helper function @match will return 0 if the inode + * does not match, 1 if the inode does match, and -1 if the search + * should be stopped. The @match function must be responsible for + * taking the i_lock spin_lock and checking i_state for an inode being + * freed or being initialized, and incrementing the reference count + * before returning 1. It also must not sleep, since it is called with + * the inode_hash_lock spinlock held. + * + * This is a even more generalized version of ilookup5() when the + * function must never block --- find_inode() can block in + * __wait_on_freeing_inode() --- or when the caller can not increment + * the reference count because the resulting iput() might cause an + * inode eviction. The tradeoff is that the @match funtion must be + * very carefully implemented. + */ +struct inode *find_inode_nowait(struct super_block *sb, + unsigned long hashval, + int (*match)(struct inode *, unsigned long, + void *), + void *data) +{ + struct hlist_head *head = inode_hashtable + hash(sb, hashval); + struct inode *inode, *ret_inode = NULL; + int mval; + + spin_lock(&inode_hash_lock); + hlist_for_each_entry(inode, head, i_hash) { + if (inode->i_sb != sb) + continue; + mval = match(inode, hashval, data); + if (mval == 0) + continue; + if (mval == 1) + ret_inode = inode; + goto out; + } +out: + spin_unlock(&inode_hash_lock); + return ret_inode; +} +EXPORT_SYMBOL(find_inode_nowait); + int insert_inode_locked(struct inode *inode) { struct super_block *sb = inode->i_sb; diff --git a/include/linux/fs.h b/include/linux/fs.h index cd027ce2c705..5ea8b6e46a3d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2441,6 +2441,11 @@ extern struct inode *ilookup(struct super_block *sb, unsigned long ino); extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *); extern struct inode * iget_locked(struct super_block *, unsigned long); +extern struct inode *find_inode_nowait(struct super_block *, + unsigned long, + int (*match)(struct inode *, + unsigned long, void *), + void *data); extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *); extern int insert_inode_locked(struct inode *); #ifdef CONFIG_DEBUG_LOCK_ALLOC -- cgit v1.2.3 From 1c2b364b225a5a93dbd1f317bd000d2fec2694be Mon Sep 17 00:00:00 2001 From: Tiejun Chen Date: Thu, 5 Feb 2015 17:22:26 +0800 Subject: kvm: remove KVM_MMIO_SIZE After f78146b0f923, "KVM: Fix page-crossing MMIO", and 87da7e66a405, "KVM: x86: fix vcpu->mmio_fragments overflow", actually KVM_MMIO_SIZE is gone. 
Signed-off-by: Tiejun Chen Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 -- include/linux/kvm_host.h | 4 ---- 2 files changed, 6 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9dbc7435cbc2..848947ac6ade 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -38,8 +38,6 @@ #define KVM_PRIVATE_MEM_SLOTS 3 #define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS) -#define KVM_MMIO_SIZE 16 - #define KVM_PIO_PAGE_OFFSET 1 #define KVM_COALESCED_MMIO_PAGE_OFFSET 2 diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 32d057571bf6..8a82838034f1 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -33,10 +33,6 @@ #include -#ifndef KVM_MMIO_SIZE -#define KVM_MMIO_SIZE 8 -#endif - /* * The bit 16 ~ bit 31 of kvm_memory_region::flags are internally used * in kvm, other bits are visible for userspace which are defined in -- cgit v1.2.3 From 7fbc1067f06098c6b674e672fbb17e758fcc9402 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 28 Oct 2013 10:32:35 +0100 Subject: exportfs: add methods for block layout exports Add three methods to allow exporting pnfs block layout volumes: - get_uuid: get a filesystem unique signature exposed to clients - map_blocks: map and if nessecary allocate blocks for a layout - commit_blocks: commit blocks in a layout once the client is done with them For now we stick the external pnfs block layout interfaces into s_export_op to avoid mixing them up with the internal interface between the NFS server and the layout drivers. Once we've fully internalized the latter interface we can redecide if these methods should stay in s_export_ops. Signed-off-by: Christoph Hellwig --- include/linux/exportfs.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index 41b223a59a63..fa05e04c5531 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -4,6 +4,7 @@ #include struct dentry; +struct iattr; struct inode; struct super_block; struct vfsmount; @@ -180,6 +181,21 @@ struct fid { * get_name is not (which is possibly inconsistent) */ +/* types of block ranges for multipage write mappings. 
*/ +#define IOMAP_HOLE 0x01 /* no blocks allocated, need allocation */ +#define IOMAP_DELALLOC 0x02 /* delayed allocation blocks */ +#define IOMAP_MAPPED 0x03 /* blocks allocated @blkno */ +#define IOMAP_UNWRITTEN 0x04 /* blocks allocated @blkno in unwritten state */ + +#define IOMAP_NULL_BLOCK -1LL /* blkno is not valid */ + +struct iomap { + sector_t blkno; /* first sector of mapping */ + loff_t offset; /* file offset of mapping, bytes */ + u64 length; /* length of mapping, bytes */ + int type; /* type of mapping */ +}; + struct export_operations { int (*encode_fh)(struct inode *inode, __u32 *fh, int *max_len, struct inode *parent); @@ -191,6 +207,13 @@ struct export_operations { struct dentry *child); struct dentry * (*get_parent)(struct dentry *child); int (*commit_metadata)(struct inode *inode); + + int (*get_uuid)(struct super_block *sb, u8 *buf, u32 *len, u64 *offset); + int (*map_blocks)(struct inode *inode, loff_t offset, + u64 len, struct iomap *iomap, + bool write, u32 *device_generation); + int (*commit_blocks)(struct inode *inode, struct iomap *iomaps, + int nr_iomaps, struct iattr *iattr); }; extern int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid, -- cgit v1.2.3 From ae58d882bfd3e537b1ed4a4c3577ca9ba853f0d8 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Mon, 19 Jan 2015 12:00:55 +0000 Subject: MIPS: cevt-r4k: Drop GIC special case The cevt-r4k driver used to call into the GIC driver to find whether the timer was pending, but only with External Interrupt Controller (EIC) mode, where the Cause.IP bits can't be used as they encode the interrupt priority level (Cause.RIPL) instead. However commit e9de688dac65 ("irqchip: mips-gic: Support local interrupts") changed the condition from cpu_has_veic to gic_present. This fails on cores such as P5600 which have a GIC but the local interrupts aren't routable by the GIC, causing c0_compare_int_usable() to consider the interrupt unusable so r4k_clockevent_init() fails. The previous behaviour, added in commit 98b67c37db33 ("MIPS: Add EIC support for GIC."), wasn't really correct either as far as I can tell, since P5600 apparently supports EIC mode too, and in any case the use of Cause.TI with r2 should have been sufficient anyway since commit 010c108d7af7 ("MIPS: PowerTV: Fix support for timer interrupts with > 64 external IRQs"). Therefore drop the call into the gic driver altogether, and add a comment in c0_compare_int_pending() to clarify that Cause.TI does get checked since MIPS r2. Signed-off-by: James Hogan Fixes: e9de688dac65 ("irqchip: mips-gic: Support local interrupts") Reviewed-by: Andrew Bresticker Cc: Ralf Baechle Cc: Steven J. 
Hill Cc: Qais Yousef Cc: Jason Cooper Cc: Thomas Gleixner Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/9077/ Signed-off-by: Ralf Baechle --- arch/mips/kernel/cevt-r4k.c | 6 +----- drivers/irqchip/irq-mips-gic.c | 8 -------- include/linux/irqchip/mips-gic.h | 1 - 3 files changed, 1 insertion(+), 14 deletions(-) (limited to 'include/linux') diff --git a/arch/mips/kernel/cevt-r4k.c b/arch/mips/kernel/cevt-r4k.c index 6acaad0480af..28bfdf2c59a5 100644 --- a/arch/mips/kernel/cevt-r4k.c +++ b/arch/mips/kernel/cevt-r4k.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include @@ -85,10 +84,7 @@ void mips_event_handler(struct clock_event_device *dev) */ static int c0_compare_int_pending(void) { -#ifdef CONFIG_MIPS_GIC - if (gic_present) - return gic_get_timer_pending(); -#endif + /* When cpu_has_mips_r2, this checks Cause.TI instead of Cause.IP7 */ return (read_c0_cause() >> cp0_compare_irq_shift) & (1ul << CAUSEB_IP); } diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 2b0468e3df6a..e58600b1de28 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -191,14 +191,6 @@ static bool gic_local_irq_is_routable(int intr) } } -unsigned int gic_get_timer_pending(void) -{ - unsigned int vpe_pending; - - vpe_pending = gic_read(GIC_REG(VPE_LOCAL, GIC_VPE_PEND)); - return vpe_pending & GIC_VPE_PEND_TIMER_MSK; -} - static void gic_bind_eic_interrupt(int irq, int set) { /* Convert irq vector # to hw int # */ diff --git a/include/linux/irqchip/mips-gic.h b/include/linux/irqchip/mips-gic.h index 420f77b34d02..e6a6aac451db 100644 --- a/include/linux/irqchip/mips-gic.h +++ b/include/linux/irqchip/mips-gic.h @@ -243,7 +243,6 @@ extern void gic_write_cpu_compare(cycle_t cnt, int cpu); extern void gic_send_ipi(unsigned int intr); extern unsigned int plat_ipi_call_int_xlate(unsigned int); extern unsigned int plat_ipi_resched_int_xlate(unsigned int); -extern unsigned int gic_get_timer_pending(void); extern int gic_get_c0_compare_int(void); extern int gic_get_c0_perfcount_int(void); #endif /* __LINUX_IRQCHIP_MIPS_GIC_H */ -- cgit v1.2.3 From 90e97820619dc912b52cc9d103272819d8b51259 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Thu, 5 Feb 2015 13:44:43 +0800 Subject: resources: Move struct resource_list_entry from ACPI into resource core Currently ACPI, PCI and pnp all implement the same resource list management with different data structures. We need to convert from one data structure to another when passing resources from one subsystem into another. So move struct resource_list_entry from ACPI into the resource core and rename it resource_entry, so that it can be reused by different subsystems and the data structure conversions can be avoided. Introduce a dedicated header file resource_ext.h instead of embedding it into ioport.h to avoid header file inclusion order issues. Signed-off-by: Jiang Liu Acked-by: Vinod Koul Signed-off-by: Rafael J.
Wysocki --- drivers/acpi/acpi_lpss.c | 8 ++--- drivers/acpi/acpi_platform.c | 4 +-- drivers/acpi/resource.c | 17 ++++------ drivers/dma/acpi-dma.c | 10 +++--- include/linux/acpi.h | 12 +------ include/linux/resource_ext.h | 77 ++++++++++++++++++++++++++++++++++++++++++++ kernel/resource.c | 25 ++++++++++++++ 7 files changed, 120 insertions(+), 33 deletions(-) create mode 100644 include/linux/resource_ext.h (limited to 'include/linux') diff --git a/drivers/acpi/acpi_lpss.c b/drivers/acpi/acpi_lpss.c index 4f3febf8a589..dfd1b8095dad 100644 --- a/drivers/acpi/acpi_lpss.c +++ b/drivers/acpi/acpi_lpss.c @@ -313,7 +313,7 @@ static int acpi_lpss_create_device(struct acpi_device *adev, { struct lpss_device_desc *dev_desc; struct lpss_private_data *pdata; - struct resource_list_entry *rentry; + struct resource_entry *rentry; struct list_head resource_list; struct platform_device *pdev; int ret; @@ -333,12 +333,12 @@ static int acpi_lpss_create_device(struct acpi_device *adev, goto err_out; list_for_each_entry(rentry, &resource_list, node) - if (resource_type(&rentry->res) == IORESOURCE_MEM) { + if (resource_type(rentry->res) == IORESOURCE_MEM) { if (dev_desc->prv_size_override) pdata->mmio_size = dev_desc->prv_size_override; else - pdata->mmio_size = resource_size(&rentry->res); - pdata->mmio_base = ioremap(rentry->res.start, + pdata->mmio_size = resource_size(rentry->res); + pdata->mmio_base = ioremap(rentry->res->start, pdata->mmio_size); break; } diff --git a/drivers/acpi/acpi_platform.c b/drivers/acpi/acpi_platform.c index 6ba8beb6b9d2..1284138e42ab 100644 --- a/drivers/acpi/acpi_platform.c +++ b/drivers/acpi/acpi_platform.c @@ -45,7 +45,7 @@ struct platform_device *acpi_create_platform_device(struct acpi_device *adev) struct platform_device *pdev = NULL; struct acpi_device *acpi_parent; struct platform_device_info pdevinfo; - struct resource_list_entry *rentry; + struct resource_entry *rentry; struct list_head resource_list; struct resource *resources = NULL; int count; @@ -71,7 +71,7 @@ struct platform_device *acpi_create_platform_device(struct acpi_device *adev) } count = 0; list_for_each_entry(rentry, &resource_list, node) - resources[count++] = rentry->res; + resources[count++] = *rentry->res; acpi_dev_free_resource_list(&resource_list); } diff --git a/drivers/acpi/resource.c b/drivers/acpi/resource.c index 3ea0d17eb951..4752b9939987 100644 --- a/drivers/acpi/resource.c +++ b/drivers/acpi/resource.c @@ -444,12 +444,7 @@ EXPORT_SYMBOL_GPL(acpi_dev_resource_interrupt); */ void acpi_dev_free_resource_list(struct list_head *list) { - struct resource_list_entry *rentry, *re; - - list_for_each_entry_safe(rentry, re, list, node) { - list_del(&rentry->node); - kfree(rentry); - } + resource_list_free(list); } EXPORT_SYMBOL_GPL(acpi_dev_free_resource_list); @@ -464,16 +459,16 @@ struct res_proc_context { static acpi_status acpi_dev_new_resource_entry(struct resource_win *win, struct res_proc_context *c) { - struct resource_list_entry *rentry; + struct resource_entry *rentry; - rentry = kmalloc(sizeof(*rentry), GFP_KERNEL); + rentry = resource_list_create_entry(NULL, 0); if (!rentry) { c->error = -ENOMEM; return AE_NO_MEMORY; } - rentry->res = win->res; + *rentry->res = win->res; rentry->offset = win->offset; - list_add_tail(&rentry->node, c->list); + resource_list_add_tail(rentry, c->list); c->count++; return AE_OK; } @@ -534,7 +529,7 @@ static acpi_status acpi_dev_process_resource(struct acpi_resource *ares, * returned as the final error code. 
* * The resultant struct resource objects are put on the list pointed to by - * @list, that must be empty initially, as members of struct resource_list_entry + * @list, that must be empty initially, as members of struct resource_entry * objects. Callers of this routine should use %acpi_dev_free_resource_list() to * free that list. * diff --git a/drivers/dma/acpi-dma.c b/drivers/dma/acpi-dma.c index de361a156b34..5a635646e05c 100644 --- a/drivers/dma/acpi-dma.c +++ b/drivers/dma/acpi-dma.c @@ -43,7 +43,7 @@ static int acpi_dma_parse_resource_group(const struct acpi_csrt_group *grp, { const struct acpi_csrt_shared_info *si; struct list_head resource_list; - struct resource_list_entry *rentry; + struct resource_entry *rentry; resource_size_t mem = 0, irq = 0; int ret; @@ -56,10 +56,10 @@ static int acpi_dma_parse_resource_group(const struct acpi_csrt_group *grp, return 0; list_for_each_entry(rentry, &resource_list, node) { - if (resource_type(&rentry->res) == IORESOURCE_MEM) - mem = rentry->res.start; - else if (resource_type(&rentry->res) == IORESOURCE_IRQ) - irq = rentry->res.start; + if (resource_type(rentry->res) == IORESOURCE_MEM) + mem = rentry->res->start; + else if (resource_type(rentry->res) == IORESOURCE_IRQ) + irq = rentry->res->start; } acpi_dev_free_resource_list(&resource_list); diff --git a/include/linux/acpi.h b/include/linux/acpi.h index e818decb631f..e53822148b6a 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -27,6 +27,7 @@ #include #include /* for struct resource */ +#include #include #include @@ -285,11 +286,6 @@ extern int pnpacpi_disabled; #define PXM_INVAL (-1) -struct resource_win { - struct resource res; - resource_size_t offset; -}; - bool acpi_dev_resource_memory(struct acpi_resource *ares, struct resource *res); bool acpi_dev_resource_io(struct acpi_resource *ares, struct resource *res); bool acpi_dev_resource_address_space(struct acpi_resource *ares, @@ -300,12 +296,6 @@ unsigned long acpi_dev_irq_flags(u8 triggering, u8 polarity, u8 shareable); bool acpi_dev_resource_interrupt(struct acpi_resource *ares, int index, struct resource *res); -struct resource_list_entry { - struct list_head node; - struct resource res; - resource_size_t offset; -}; - void acpi_dev_free_resource_list(struct list_head *list); int acpi_dev_get_resources(struct acpi_device *adev, struct list_head *list, int (*preproc)(struct acpi_resource *, void *), diff --git a/include/linux/resource_ext.h b/include/linux/resource_ext.h new file mode 100644 index 000000000000..e2bf63d881d4 --- /dev/null +++ b/include/linux/resource_ext.h @@ -0,0 +1,77 @@ +/* + * Copyright (C) 2015, Intel Corporation + * Author: Jiang Liu + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ +#ifndef _LINUX_RESOURCE_EXT_H +#define _LINUX_RESOURCE_EXT_H +#include +#include +#include +#include + +/* Represent resource window for bridge devices */ +struct resource_win { + struct resource res; /* In master (CPU) address space */ + resource_size_t offset; /* Translation offset for bridge */ +}; + +/* + * Common resource list management data structure and interfaces to support + * ACPI, PNP and PCI host bridge etc. + */ +struct resource_entry { + struct list_head node; + struct resource *res; /* In master (CPU) address space */ + resource_size_t offset; /* Translation offset for bridge */ + struct resource __res; /* Default storage for res */ +}; + +extern struct resource_entry * +resource_list_create_entry(struct resource *res, size_t extra_size); +extern void resource_list_free(struct list_head *head); + +static inline void resource_list_add(struct resource_entry *entry, + struct list_head *head) +{ + list_add(&entry->node, head); +} + +static inline void resource_list_add_tail(struct resource_entry *entry, + struct list_head *head) +{ + list_add_tail(&entry->node, head); +} + +static inline void resource_list_del(struct resource_entry *entry) +{ + list_del(&entry->node); +} + +static inline void resource_list_free_entry(struct resource_entry *entry) +{ + kfree(entry); +} + +static inline void +resource_list_destroy_entry(struct resource_entry *entry) +{ + resource_list_del(entry); + resource_list_free_entry(entry); +} + +#define resource_list_for_each_entry(entry, list) \ + list_for_each_entry((entry), (list), node) + +#define resource_list_for_each_entry_safe(entry, tmp, list) \ + list_for_each_entry_safe((entry), (tmp), (list), node) + +#endif /* _LINUX_RESOURCE_EXT_H */ diff --git a/kernel/resource.c b/kernel/resource.c index 0bcebffc4e77..19f2357dfda3 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -22,6 +22,7 @@ #include #include #include +#include #include @@ -1529,6 +1530,30 @@ int iomem_is_exclusive(u64 addr) return err; } +struct resource_entry *resource_list_create_entry(struct resource *res, + size_t extra_size) +{ + struct resource_entry *entry; + + entry = kzalloc(sizeof(*entry) + extra_size, GFP_KERNEL); + if (entry) { + INIT_LIST_HEAD(&entry->node); + entry->res = res ? res : &entry->__res; + } + + return entry; +} +EXPORT_SYMBOL(resource_list_create_entry); + +void resource_list_free(struct list_head *head) +{ + struct resource_entry *entry, *tmp; + + list_for_each_entry_safe(entry, tmp, head, node) + resource_list_destroy_entry(entry); +} +EXPORT_SYMBOL(resource_list_free); + static int __init strict_iomem(char *str) { if (strstr(str, "relaxed")) -- cgit v1.2.3 From 14d76b68f2819a1d0b50236a7e9e9f2ea69869d9 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Thu, 5 Feb 2015 13:44:44 +0800 Subject: PCI: Use common resource list management code instead of private implementation Use common resource list management data structure and interfaces instead of private implementation. Signed-off-by: Jiang Liu Acked-by: Will Deacon Acked-by: Bjorn Helgaas Signed-off-by: Rafael J. 
Wysocki --- arch/arm/kernel/bios32.c | 5 ++--- arch/x86/pci/bus_numa.c | 4 ++-- drivers/pci/bus.c | 18 ++++++------------ drivers/pci/host-bridge.c | 8 ++++---- drivers/pci/host/pci-host-generic.c | 4 ++-- drivers/pci/host/pci-xgene.c | 4 ++-- drivers/pci/host/pcie-xilinx.c | 4 ++-- drivers/pci/probe.c | 10 +++++----- include/linux/pci.h | 9 ++------- 9 files changed, 27 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/kernel/bios32.c b/arch/arm/kernel/bios32.c index a4effd6d8f2f..016991792b0b 100644 --- a/arch/arm/kernel/bios32.c +++ b/arch/arm/kernel/bios32.c @@ -422,17 +422,16 @@ static int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) static int pcibios_init_resources(int busnr, struct pci_sys_data *sys) { int ret; - struct pci_host_bridge_window *window; + struct resource_entry *window; if (list_empty(&sys->resources)) { pci_add_resource_offset(&sys->resources, &iomem_resource, sys->mem_offset); } - list_for_each_entry(window, &sys->resources, list) { + resource_list_for_each_entry(window, &sys->resources) if (resource_type(window->res) == IORESOURCE_IO) return 0; - } sys->io_res.start = (busnr * SZ_64K) ? : pcibios_min_io; sys->io_res.end = (busnr + 1) * SZ_64K - 1; diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c index f3a2cfc14125..7bcf06a7cd12 100644 --- a/arch/x86/pci/bus_numa.c +++ b/arch/x86/pci/bus_numa.c @@ -31,7 +31,7 @@ void x86_pci_root_bus_resources(int bus, struct list_head *resources) { struct pci_root_info *info = x86_find_pci_root_info(bus); struct pci_root_res *root_res; - struct pci_host_bridge_window *window; + struct resource_entry *window; bool found = false; if (!info) @@ -41,7 +41,7 @@ void x86_pci_root_bus_resources(int bus, struct list_head *resources) bus); /* already added by acpi ? 
*/ - list_for_each_entry(window, resources, list) + resource_list_for_each_entry(window, resources) if (window->res->flags & IORESOURCE_BUS) { found = true; break; diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c index 8fb16188cd82..90fa3a78fb7c 100644 --- a/drivers/pci/bus.c +++ b/drivers/pci/bus.c @@ -20,17 +20,16 @@ void pci_add_resource_offset(struct list_head *resources, struct resource *res, resource_size_t offset) { - struct pci_host_bridge_window *window; + struct resource_entry *entry; - window = kzalloc(sizeof(struct pci_host_bridge_window), GFP_KERNEL); - if (!window) { + entry = resource_list_create_entry(res, 0); + if (!entry) { printk(KERN_ERR "PCI: can't add host bridge window %pR\n", res); return; } - window->res = res; - window->offset = offset; - list_add_tail(&window->list, resources); + entry->offset = offset; + resource_list_add_tail(entry, resources); } EXPORT_SYMBOL(pci_add_resource_offset); @@ -42,12 +41,7 @@ EXPORT_SYMBOL(pci_add_resource); void pci_free_resource_list(struct list_head *resources) { - struct pci_host_bridge_window *window, *tmp; - - list_for_each_entry_safe(window, tmp, resources, list) { - list_del(&window->list); - kfree(window); - } + resource_list_free(resources); } EXPORT_SYMBOL(pci_free_resource_list); diff --git a/drivers/pci/host-bridge.c b/drivers/pci/host-bridge.c index 0e5f3c95af5b..39b2dbe585aa 100644 --- a/drivers/pci/host-bridge.c +++ b/drivers/pci/host-bridge.c @@ -35,10 +35,10 @@ void pcibios_resource_to_bus(struct pci_bus *bus, struct pci_bus_region *region, struct resource *res) { struct pci_host_bridge *bridge = find_pci_host_bridge(bus); - struct pci_host_bridge_window *window; + struct resource_entry *window; resource_size_t offset = 0; - list_for_each_entry(window, &bridge->windows, list) { + resource_list_for_each_entry(window, &bridge->windows) { if (resource_contains(window->res, res)) { offset = window->offset; break; @@ -60,10 +60,10 @@ void pcibios_bus_to_resource(struct pci_bus *bus, struct resource *res, struct pci_bus_region *region) { struct pci_host_bridge *bridge = find_pci_host_bridge(bus); - struct pci_host_bridge_window *window; + struct resource_entry *window; resource_size_t offset = 0; - list_for_each_entry(window, &bridge->windows, list) { + resource_list_for_each_entry(window, &bridge->windows) { struct pci_bus_region bus_region; if (resource_type(res) != resource_type(window->res)) diff --git a/drivers/pci/host/pci-host-generic.c b/drivers/pci/host/pci-host-generic.c index 6eb1aa75bd37..aee3c620ecf9 100644 --- a/drivers/pci/host/pci-host-generic.c +++ b/drivers/pci/host/pci-host-generic.c @@ -149,14 +149,14 @@ static int gen_pci_parse_request_of_pci_ranges(struct gen_pci *pci) struct device *dev = pci->host.dev.parent; struct device_node *np = dev->of_node; resource_size_t iobase; - struct pci_host_bridge_window *win; + struct resource_entry *win; err = of_pci_get_host_bridge_resources(np, 0, 0xff, &pci->resources, &iobase); if (err) return err; - list_for_each_entry(win, &pci->resources, list) { + resource_list_for_each_entry(win, &pci->resources) { struct resource *parent, *res = win->res; switch (resource_type(res)) { diff --git a/drivers/pci/host/pci-xgene.c b/drivers/pci/host/pci-xgene.c index b1d0596457c5..a704257bab7f 100644 --- a/drivers/pci/host/pci-xgene.c +++ b/drivers/pci/host/pci-xgene.c @@ -401,11 +401,11 @@ static int xgene_pcie_map_ranges(struct xgene_pcie_port *port, struct list_head *res, resource_size_t io_base) { - struct pci_host_bridge_window *window; + struct resource_entry *window; 
struct device *dev = port->dev; int ret; - list_for_each_entry(window, res, list) { + resource_list_for_each_entry(window, res) { struct resource *res = window->res; u64 restype = resource_type(res); diff --git a/drivers/pci/host/pcie-xilinx.c b/drivers/pci/host/pcie-xilinx.c index ef3ebaf9a738..601261df7663 100644 --- a/drivers/pci/host/pcie-xilinx.c +++ b/drivers/pci/host/pcie-xilinx.c @@ -737,7 +737,7 @@ static int xilinx_pcie_parse_and_add_res(struct xilinx_pcie_port *port) resource_size_t offset; struct of_pci_range_parser parser; struct of_pci_range range; - struct pci_host_bridge_window *win; + struct resource_entry *win; int err = 0, mem_resno = 0; /* Get the ranges */ @@ -807,7 +807,7 @@ static int xilinx_pcie_parse_and_add_res(struct xilinx_pcie_port *port) free_resources: release_child_resources(&iomem_resource); - list_for_each_entry(win, &port->resources, list) + resource_list_for_each_entry(win, &port->resources) devm_kfree(dev, win->res); pci_free_resource_list(&port->resources); diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 23212f8ae09b..8d2f400e96cb 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1895,7 +1895,7 @@ struct pci_bus *pci_create_root_bus(struct device *parent, int bus, int error; struct pci_host_bridge *bridge; struct pci_bus *b, *b2; - struct pci_host_bridge_window *window, *n; + struct resource_entry *window, *n; struct resource *res; resource_size_t offset; char bus_addr[64]; @@ -1959,8 +1959,8 @@ struct pci_bus *pci_create_root_bus(struct device *parent, int bus, printk(KERN_INFO "PCI host bridge to bus %s\n", dev_name(&b->dev)); /* Add initial resources to the bus */ - list_for_each_entry_safe(window, n, resources, list) { - list_move_tail(&window->list, &bridge->windows); + resource_list_for_each_entry_safe(window, n, resources) { + list_move_tail(&window->node, &bridge->windows); res = window->res; offset = window->offset; if (res->flags & IORESOURCE_BUS) @@ -2060,12 +2060,12 @@ void pci_bus_release_busn_res(struct pci_bus *b) struct pci_bus *pci_scan_root_bus(struct device *parent, int bus, struct pci_ops *ops, void *sysdata, struct list_head *resources) { - struct pci_host_bridge_window *window; + struct resource_entry *window; bool found = false; struct pci_bus *b; int max; - list_for_each_entry(window, resources, list) + resource_list_for_each_entry(window, resources) if (window->res->flags & IORESOURCE_BUS) { found = true; break; diff --git a/include/linux/pci.h b/include/linux/pci.h index 9603094ed59b..faa60fa26314 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -397,16 +398,10 @@ static inline int pci_channel_offline(struct pci_dev *pdev) return (pdev->error_state != pci_channel_io_normal); } -struct pci_host_bridge_window { - struct list_head list; - struct resource *res; /* host bridge aperture (CPU address) */ - resource_size_t offset; /* bus address + offset = CPU address */ -}; - struct pci_host_bridge { struct device dev; struct pci_bus *bus; /* root bus */ - struct list_head windows; /* pci_host_bridge_windows */ + struct list_head windows; /* resource_entry */ void (*release_fn)(struct pci_host_bridge *); void *release_data; }; -- cgit v1.2.3 From ecf5636dcd59cd5508641f995cc4c2bafedbb995 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 5 Feb 2015 13:44:48 +0800 Subject: ACPI: Add interfaces to parse IOAPIC ID for IOAPIC hotplug We need to parse APIC ID for IOAPIC registration for IOAPIC hotplug. 
ACPI _MAT method and MADT table are used to figure out IOAPIC ID, just like parsing CPU APIC ID for CPU hotplug. [ tglx: Fixed docbook comment ] Signed-off-by: Yinghai Lu Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Bjorn Helgaas Cc: Randy Dunlap Cc: Borislav Petkov Cc: Len Brown Link: http://lkml.kernel.org/r/1414387308-27148-8-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner Signed-off-by: Rafael J. Wysocki --- drivers/acpi/processor_core.c | 123 ++++++++++++++++++++++++++++++++++++++---- include/linux/acpi.h | 4 ++ 2 files changed, 118 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c index 02e48394276c..7962651cdbd4 100644 --- a/drivers/acpi/processor_core.c +++ b/drivers/acpi/processor_core.c @@ -4,6 +4,10 @@ * * Alex Chiang * - Unified x86/ia64 implementations + * + * I/O APIC hotplug support + * Yinghai Lu + * Jiang Liu */ #include #include @@ -12,6 +16,21 @@ #define _COMPONENT ACPI_PROCESSOR_COMPONENT ACPI_MODULE_NAME("processor_core"); +static struct acpi_table_madt *get_madt_table(void) +{ + static struct acpi_table_madt *madt; + static int read_madt; + + if (!read_madt) { + if (ACPI_FAILURE(acpi_get_table(ACPI_SIG_MADT, 0, + (struct acpi_table_header **)&madt))) + madt = NULL; + read_madt++; + } + + return madt; +} + static int map_lapic_id(struct acpi_subtable_header *entry, u32 acpi_id, int *apic_id) { @@ -67,17 +86,10 @@ static int map_lsapic_id(struct acpi_subtable_header *entry, static int map_madt_entry(int type, u32 acpi_id) { unsigned long madt_end, entry; - static struct acpi_table_madt *madt; - static int read_madt; int phys_id = -1; /* CPU hardware ID */ + struct acpi_table_madt *madt; - if (!read_madt) { - if (ACPI_FAILURE(acpi_get_table(ACPI_SIG_MADT, 0, - (struct acpi_table_header **)&madt))) - madt = NULL; - read_madt++; - } - + madt = get_madt_table(); if (!madt) return phys_id; @@ -203,3 +215,96 @@ int acpi_get_cpuid(acpi_handle handle, int type, u32 acpi_id) return acpi_map_cpuid(phys_id, acpi_id); } EXPORT_SYMBOL_GPL(acpi_get_cpuid); + +#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC +static int get_ioapic_id(struct acpi_subtable_header *entry, u32 gsi_base, + u64 *phys_addr, int *ioapic_id) +{ + struct acpi_madt_io_apic *ioapic = (struct acpi_madt_io_apic *)entry; + + if (ioapic->global_irq_base != gsi_base) + return 0; + + *phys_addr = ioapic->address; + *ioapic_id = ioapic->id; + return 1; +} + +static int parse_madt_ioapic_entry(u32 gsi_base, u64 *phys_addr) +{ + struct acpi_subtable_header *hdr; + unsigned long madt_end, entry; + struct acpi_table_madt *madt; + int apic_id = -1; + + madt = get_madt_table(); + if (!madt) + return apic_id; + + entry = (unsigned long)madt; + madt_end = entry + madt->header.length; + + /* Parse all entries looking for a match. 
*/ + entry += sizeof(struct acpi_table_madt); + while (entry + sizeof(struct acpi_subtable_header) < madt_end) { + hdr = (struct acpi_subtable_header *)entry; + if (hdr->type == ACPI_MADT_TYPE_IO_APIC && + get_ioapic_id(hdr, gsi_base, phys_addr, &apic_id)) + break; + else + entry += hdr->length; + } + + return apic_id; +} + +static int parse_mat_ioapic_entry(acpi_handle handle, u32 gsi_base, + u64 *phys_addr) +{ + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + struct acpi_subtable_header *header; + union acpi_object *obj; + int apic_id = -1; + + if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer))) + goto exit; + + if (!buffer.length || !buffer.pointer) + goto exit; + + obj = buffer.pointer; + if (obj->type != ACPI_TYPE_BUFFER || + obj->buffer.length < sizeof(struct acpi_subtable_header)) + goto exit; + + header = (struct acpi_subtable_header *)obj->buffer.pointer; + if (header->type == ACPI_MADT_TYPE_IO_APIC) + get_ioapic_id(header, gsi_base, phys_addr, &apic_id); + +exit: + kfree(buffer.pointer); + return apic_id; +} + +/** + * acpi_get_ioapic_id - Get IOAPIC ID and physical address matching @gsi_base + * @handle: ACPI object for IOAPIC device + * @gsi_base: GSI base to match with + * @phys_addr: Pointer to store physical address of matching IOAPIC record + * + * Walk resources returned by ACPI_MAT method, then ACPI MADT table, to search + * for an ACPI IOAPIC record matching @gsi_base. + * Return IOAPIC id and store physical address in @phys_addr if found a match, + * otherwise return <0. + */ +int acpi_get_ioapic_id(acpi_handle handle, u32 gsi_base, u64 *phys_addr) +{ + int apic_id; + + apic_id = parse_mat_ioapic_entry(handle, gsi_base, phys_addr); + if (apic_id == -1) + apic_id = parse_madt_ioapic_entry(gsi_base, phys_addr); + + return apic_id; +} +#endif /* CONFIG_ACPI_HOTPLUG_IOAPIC */ diff --git a/include/linux/acpi.h b/include/linux/acpi.h index e53822148b6a..24c7aa8b1d20 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -152,6 +152,10 @@ int acpi_map_cpu(acpi_handle handle, int physid, int *pcpu); int acpi_unmap_cpu(int cpu); #endif /* CONFIG_ACPI_HOTPLUG_CPU */ +#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC +int acpi_get_ioapic_id(acpi_handle handle, u32 gsi_base, u64 *phys_addr); +#endif + int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base); int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base); int acpi_ioapic_registered(acpi_handle handle, u32 gsi_base); -- cgit v1.2.3 From ddad8dd0a162fde61646a627a3017c258601dc8a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Jan 2015 16:16:29 +0100 Subject: block: use blk_rq_map_user_iov to implement blk_rq_map_user Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/bio.c | 53 +------------------- block/blk-map.c | 137 +++++----------------------------------------------- include/linux/bio.h | 4 -- 3 files changed, 14 insertions(+), 180 deletions(-) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index 54da51ed43de..879921e6b049 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1102,7 +1102,7 @@ static int __bio_copy_iov(struct bio *bio, const struct sg_iovec *iov, int iov_c * bio_uncopy_user - finish previously mapped bio * @bio: bio being terminated * - * Free pages allocated from bio_copy_user() and write back data + * Free pages allocated from bio_copy_user_iov() and write back data * to user space in case of a read. 
*/ int bio_uncopy_user(struct bio *bio) @@ -1256,32 +1256,6 @@ out_bmd: return ERR_PTR(ret); } -/** - * bio_copy_user - copy user data to bio - * @q: destination block queue - * @map_data: pointer to the rq_map_data holding pages (if necessary) - * @uaddr: start of user address - * @len: length in bytes - * @write_to_vm: bool indicating writing to pages or not - * @gfp_mask: memory allocation flags - * - * Prepares and returns a bio for indirect user io, bouncing data - * to/from kernel pages as necessary. Must be paired with - * call bio_uncopy_user() on io completion. - */ -struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data, - unsigned long uaddr, unsigned int len, - int write_to_vm, gfp_t gfp_mask) -{ - struct sg_iovec iov; - - iov.iov_base = (void __user *)uaddr; - iov.iov_len = len; - - return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask); -} -EXPORT_SYMBOL(bio_copy_user); - static struct bio *__bio_map_user_iov(struct request_queue *q, struct block_device *bdev, const struct sg_iovec *iov, int iov_count, @@ -1394,31 +1368,6 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, return ERR_PTR(ret); } -/** - * bio_map_user - map user address into bio - * @q: the struct request_queue for the bio - * @bdev: destination block device - * @uaddr: start of user address - * @len: length in bytes - * @write_to_vm: bool indicating writing to pages or not - * @gfp_mask: memory allocation flags - * - * Map the user space address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. - */ -struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev, - unsigned long uaddr, unsigned int len, int write_to_vm, - gfp_t gfp_mask) -{ - struct sg_iovec iov; - - iov.iov_base = (void __user *)uaddr; - iov.iov_len = len; - - return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask); -} -EXPORT_SYMBOL(bio_map_user); - /** * bio_map_user_iov - map user sg_iovec table into bio * @q: the struct request_queue for the bio diff --git a/block/blk-map.c b/block/blk-map.c index f890d4345b0c..152a5fe5d85e 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -39,130 +39,6 @@ static int __blk_rq_unmap_user(struct bio *bio) return ret; } -static int __blk_rq_map_user(struct request_queue *q, struct request *rq, - struct rq_map_data *map_data, void __user *ubuf, - unsigned int len, gfp_t gfp_mask) -{ - unsigned long uaddr; - struct bio *bio, *orig_bio; - int reading, ret; - - reading = rq_data_dir(rq) == READ; - - /* - * if alignment requirement is satisfied, map in user pages for - * direct dma. 
else, set up kernel bounce buffers - */ - uaddr = (unsigned long) ubuf; - if (blk_rq_aligned(q, uaddr, len) && !map_data) - bio = bio_map_user(q, NULL, uaddr, len, reading, gfp_mask); - else - bio = bio_copy_user(q, map_data, uaddr, len, reading, gfp_mask); - - if (IS_ERR(bio)) - return PTR_ERR(bio); - - if (map_data && map_data->null_mapped) - bio->bi_flags |= (1 << BIO_NULL_MAPPED); - - orig_bio = bio; - blk_queue_bounce(q, &bio); - - /* - * We link the bounce buffer in and could have to traverse it - * later so we have to get a ref to prevent it from being freed - */ - bio_get(bio); - - ret = blk_rq_append_bio(q, rq, bio); - if (!ret) - return bio->bi_iter.bi_size; - - /* if it was boucned we must call the end io function */ - bio_endio(bio, 0); - __blk_rq_unmap_user(orig_bio); - bio_put(bio); - return ret; -} - -/** - * blk_rq_map_user - map user data to a request, for REQ_TYPE_BLOCK_PC usage - * @q: request queue where request should be inserted - * @rq: request structure to fill - * @map_data: pointer to the rq_map_data holding pages (if necessary) - * @ubuf: the user buffer - * @len: length of user data - * @gfp_mask: memory allocation flags - * - * Description: - * Data will be mapped directly for zero copy I/O, if possible. Otherwise - * a kernel bounce buffer is used. - * - * A matching blk_rq_unmap_user() must be issued at the end of I/O, while - * still in process context. - * - * Note: The mapped bio may need to be bounced through blk_queue_bounce() - * before being submitted to the device, as pages mapped may be out of - * reach. It's the callers responsibility to make sure this happens. The - * original bio must be passed back in to blk_rq_unmap_user() for proper - * unmapping. - */ -int blk_rq_map_user(struct request_queue *q, struct request *rq, - struct rq_map_data *map_data, void __user *ubuf, - unsigned long len, gfp_t gfp_mask) -{ - unsigned long bytes_read = 0; - struct bio *bio = NULL; - int ret; - - if (len > (queue_max_hw_sectors(q) << 9)) - return -EINVAL; - if (!len) - return -EINVAL; - - if (!ubuf && (!map_data || !map_data->null_mapped)) - return -EINVAL; - - while (bytes_read != len) { - unsigned long map_len, end, start; - - map_len = min_t(unsigned long, len - bytes_read, BIO_MAX_SIZE); - end = ((unsigned long)ubuf + map_len + PAGE_SIZE - 1) - >> PAGE_SHIFT; - start = (unsigned long)ubuf >> PAGE_SHIFT; - - /* - * A bad offset could cause us to require BIO_MAX_PAGES + 1 - * pages. 
If this happens we just lower the requested - * mapping len by a page so that we can fit - */ - if (end - start > BIO_MAX_PAGES) - map_len -= PAGE_SIZE; - - ret = __blk_rq_map_user(q, rq, map_data, ubuf, map_len, - gfp_mask); - if (ret < 0) - goto unmap_rq; - if (!bio) - bio = rq->bio; - bytes_read += ret; - ubuf += ret; - - if (map_data) - map_data->offset += ret; - } - - if (!bio_flagged(bio, BIO_USER_MAPPED)) - rq->cmd_flags |= REQ_COPY_USER; - - return 0; -unmap_rq: - blk_rq_unmap_user(bio); - rq->bio = NULL; - return ret; -} -EXPORT_SYMBOL(blk_rq_map_user); - /** * blk_rq_map_user_iov - map user data to a request, for REQ_TYPE_BLOCK_PC usage * @q: request queue where request should be inserted @@ -241,6 +117,19 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, } EXPORT_SYMBOL(blk_rq_map_user_iov); +int blk_rq_map_user(struct request_queue *q, struct request *rq, + struct rq_map_data *map_data, void __user *ubuf, + unsigned long len, gfp_t gfp_mask) +{ + struct sg_iovec iov; + + iov.iov_base = (void __user *)ubuf; + iov.iov_len = len; + + return blk_rq_map_user_iov(q, rq, map_data, &iov, 1, len, gfp_mask); +} +EXPORT_SYMBOL(blk_rq_map_user); + /** * blk_rq_unmap_user - unmap a request with user data * @bio: start of bio list diff --git a/include/linux/bio.h b/include/linux/bio.h index efead0b532c4..d0d6735d61da 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -428,8 +428,6 @@ extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int); extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *, unsigned int, unsigned int); extern int bio_get_nr_vecs(struct block_device *); -extern struct bio *bio_map_user(struct request_queue *, struct block_device *, - unsigned long, unsigned int, int, gfp_t); struct sg_iovec; struct rq_map_data; extern struct bio *bio_map_user_iov(struct request_queue *, @@ -462,8 +460,6 @@ static inline void bio_flush_dcache_pages(struct bio *bi) extern void bio_copy_data(struct bio *dst, struct bio *src); extern int bio_alloc_pages(struct bio *bio, gfp_t gfp); -extern struct bio *bio_copy_user(struct request_queue *, struct rq_map_data *, - unsigned long, unsigned int, int, gfp_t); extern struct bio *bio_copy_user_iov(struct request_queue *, struct rq_map_data *, const struct sg_iovec *, -- cgit v1.2.3 From 26e49cfc7e988a76bf1e55cef0d9e438e5489180 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 18 Jan 2015 16:16:31 +0100 Subject: block: pass iov_iter to the BLOCK_PC mapping functions Make use of a new interface provided by iov_iter, backed by scatter-gather list of iovec, instead of the old interface based on sg_iovec. Also use iov_iter_advance() instead of manual iteration. This commit should contain only literal replacements, without functional changes. Cc: Christoph Hellwig Cc: Jens Axboe Cc: Doug Gilbert Cc: "James E.J. 
Bottomley" Signed-off-by: Kent Overstreet [dpark: add more description in commit message] Signed-off-by: Dongsu Park [hch: fixed to do a deep clone of the iov_iter, and to properly use the iov_iter direction] Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/bio.c | 146 +++++++++++++++++++++++-------------------------- block/blk-map.c | 38 ++++++------- block/scsi_ioctl.c | 17 ++---- drivers/scsi/sg.c | 15 ++--- include/linux/bio.h | 7 +-- include/linux/blkdev.h | 4 +- 6 files changed, 101 insertions(+), 126 deletions(-) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index 0895f694f440..7d8c6555e3f3 100644 --- a/block/bio.c +++ b/block/bio.c @@ -28,7 +28,6 @@ #include #include #include -#include /* for struct sg_iovec */ #include @@ -1022,21 +1021,11 @@ void bio_copy_data(struct bio *dst, struct bio *src) EXPORT_SYMBOL(bio_copy_data); struct bio_map_data { - int nr_sgvecs; int is_our_pages; - struct sg_iovec sgvecs[]; + struct iov_iter iter; + struct iovec iov[]; }; -static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio, - const struct sg_iovec *iov, int iov_count, - int is_our_pages) -{ - memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count); - bmd->nr_sgvecs = iov_count; - bmd->is_our_pages = is_our_pages; - bio->bi_private = bmd; -} - static struct bio_map_data *bio_alloc_map_data(unsigned int iov_count, gfp_t gfp_mask) { @@ -1044,36 +1033,33 @@ static struct bio_map_data *bio_alloc_map_data(unsigned int iov_count, return NULL; return kmalloc(sizeof(struct bio_map_data) + - sizeof(struct sg_iovec) * iov_count, gfp_mask); + sizeof(struct iovec) * iov_count, gfp_mask); } -static int __bio_copy_iov(struct bio *bio, const struct sg_iovec *iov, int iov_count, - int to_user, int from_user) +static int __bio_copy_iov(struct bio *bio, const struct iov_iter *iter, + int to_user, int from_user) { int ret = 0, i; struct bio_vec *bvec; - int iov_idx = 0; - unsigned int iov_off = 0; + struct iov_iter iov_iter = *iter; bio_for_each_segment_all(bvec, bio, i) { char *bv_addr = page_address(bvec->bv_page); unsigned int bv_len = bvec->bv_len; - while (bv_len && iov_idx < iov_count) { - unsigned int bytes; - char __user *iov_addr; - - bytes = min_t(unsigned int, - iov[iov_idx].iov_len - iov_off, bv_len); - iov_addr = iov[iov_idx].iov_base + iov_off; + while (bv_len && iov_iter.count) { + struct iovec iov = iov_iter_iovec(&iov_iter); + unsigned int bytes = min_t(unsigned int, bv_len, + iov.iov_len); if (!ret) { if (to_user) - ret = copy_to_user(iov_addr, bv_addr, - bytes); + ret = copy_to_user(iov.iov_base, + bv_addr, bytes); if (from_user) - ret = copy_from_user(bv_addr, iov_addr, + ret = copy_from_user(bv_addr, + iov.iov_base, bytes); if (ret) @@ -1082,13 +1068,7 @@ static int __bio_copy_iov(struct bio *bio, const struct sg_iovec *iov, int iov_c bv_len -= bytes; bv_addr += bytes; - iov_addr += bytes; - iov_off += bytes; - - if (iov[iov_idx].iov_len == iov_off) { - iov_idx++; - iov_off = 0; - } + iov_iter_advance(&iov_iter, bytes); } } @@ -1122,7 +1102,7 @@ int bio_uncopy_user(struct bio *bio) * don't copy into a random user address space, just free. 
*/ if (current->mm) - ret = __bio_copy_iov(bio, bmd->sgvecs, bmd->nr_sgvecs, + ret = __bio_copy_iov(bio, &bmd->iter, bio_data_dir(bio) == READ, 0); if (bmd->is_our_pages) bio_free_pages(bio); @@ -1135,12 +1115,10 @@ EXPORT_SYMBOL(bio_uncopy_user); /** * bio_copy_user_iov - copy user data to bio - * @q: destination block queue - * @map_data: pointer to the rq_map_data holding pages (if necessary) - * @iov: the iovec. - * @iov_count: number of elements in the iovec - * @write_to_vm: bool indicating writing to pages or not - * @gfp_mask: memory allocation flags + * @q: destination block queue + * @map_data: pointer to the rq_map_data holding pages (if necessary) + * @iter: iovec iterator + * @gfp_mask: memory allocation flags * * Prepares and returns a bio for indirect user io, bouncing data * to/from kernel pages as necessary. Must be paired with @@ -1148,24 +1126,25 @@ EXPORT_SYMBOL(bio_uncopy_user); */ struct bio *bio_copy_user_iov(struct request_queue *q, struct rq_map_data *map_data, - const struct sg_iovec *iov, int iov_count, - int write_to_vm, gfp_t gfp_mask) + const struct iov_iter *iter, + gfp_t gfp_mask) { struct bio_map_data *bmd; struct page *page; struct bio *bio; int i, ret; int nr_pages = 0; - unsigned int len = 0; + unsigned int len = iter->count; unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0; - for (i = 0; i < iov_count; i++) { + for (i = 0; i < iter->nr_segs; i++) { unsigned long uaddr; unsigned long end; unsigned long start; - uaddr = (unsigned long)iov[i].iov_base; - end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; + uaddr = (unsigned long) iter->iov[i].iov_base; + end = (uaddr + iter->iov[i].iov_len + PAGE_SIZE - 1) + >> PAGE_SHIFT; start = uaddr >> PAGE_SHIFT; /* @@ -1175,22 +1154,31 @@ struct bio *bio_copy_user_iov(struct request_queue *q, return ERR_PTR(-EINVAL); nr_pages += end - start; - len += iov[i].iov_len; } if (offset) nr_pages++; - bmd = bio_alloc_map_data(iov_count, gfp_mask); + bmd = bio_alloc_map_data(iter->nr_segs, gfp_mask); if (!bmd) return ERR_PTR(-ENOMEM); + /* + * We need to do a deep copy of the iov_iter including the iovecs. + * The caller provided iov might point to an on-stack or otherwise + * shortlived one. + */ + bmd->is_our_pages = map_data ? 0 : 1; + memcpy(bmd->iov, iter->iov, sizeof(struct iovec) * iter->nr_segs); + iov_iter_init(&bmd->iter, iter->type, bmd->iov, + iter->nr_segs, iter->count); + ret = -ENOMEM; bio = bio_kmalloc(gfp_mask, nr_pages); if (!bio) goto out_bmd; - if (!write_to_vm) + if (iter->type & WRITE) bio->bi_rw |= REQ_WRITE; ret = 0; @@ -1238,14 +1226,14 @@ struct bio *bio_copy_user_iov(struct request_queue *q, /* * success */ - if ((!write_to_vm && (!map_data || !map_data->null_mapped)) || + if (((iter->type & WRITE) && (!map_data || !map_data->null_mapped)) || (map_data && map_data->from_user)) { - ret = __bio_copy_iov(bio, iov, iov_count, 0, 1); + ret = __bio_copy_iov(bio, iter, 0, 1); if (ret) goto cleanup; } - bio_set_map_data(bmd, bio, iov, iov_count, map_data ? 
0 : 1); + bio->bi_private = bmd; return bio; cleanup: if (!map_data) @@ -1258,19 +1246,21 @@ out_bmd: static struct bio *__bio_map_user_iov(struct request_queue *q, struct block_device *bdev, - const struct sg_iovec *iov, int iov_count, - int write_to_vm, gfp_t gfp_mask) + const struct iov_iter *iter, + gfp_t gfp_mask) { - int i, j; + int j; int nr_pages = 0; struct page **pages; struct bio *bio; int cur_page = 0; int ret, offset; + struct iov_iter i; + struct iovec iov; - for (i = 0; i < iov_count; i++) { - unsigned long uaddr = (unsigned long)iov[i].iov_base; - unsigned long len = iov[i].iov_len; + iov_for_each(iov, i, *iter) { + unsigned long uaddr = (unsigned long) iov.iov_base; + unsigned long len = iov.iov_len; unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; unsigned long start = uaddr >> PAGE_SHIFT; @@ -1300,16 +1290,17 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, if (!pages) goto out; - for (i = 0; i < iov_count; i++) { - unsigned long uaddr = (unsigned long)iov[i].iov_base; - unsigned long len = iov[i].iov_len; + iov_for_each(iov, i, *iter) { + unsigned long uaddr = (unsigned long) iov.iov_base; + unsigned long len = iov.iov_len; unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; unsigned long start = uaddr >> PAGE_SHIFT; const int local_nr_pages = end - start; const int page_limit = cur_page + local_nr_pages; ret = get_user_pages_fast(uaddr, local_nr_pages, - write_to_vm, &pages[cur_page]); + (iter->type & WRITE) != WRITE, + &pages[cur_page]); if (ret < local_nr_pages) { ret = -EFAULT; goto out_unmap; @@ -1349,7 +1340,7 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, /* * set data direction, and check if mapped pages need bouncing */ - if (!write_to_vm) + if (iter->type & WRITE) bio->bi_rw |= REQ_WRITE; bio->bi_bdev = bdev; @@ -1357,10 +1348,10 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, return bio; out_unmap: - for (i = 0; i < nr_pages; i++) { - if(!pages[i]) + for (j = 0; j < nr_pages; j++) { + if (!pages[j]) break; - page_cache_release(pages[i]); + page_cache_release(pages[j]); } out: kfree(pages); @@ -1369,25 +1360,22 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, } /** - * bio_map_user_iov - map user sg_iovec table into bio - * @q: the struct request_queue for the bio - * @bdev: destination block device - * @iov: the iovec. - * @iov_count: number of elements in the iovec - * @write_to_vm: bool indicating writing to pages or not - * @gfp_mask: memory allocation flags + * bio_map_user_iov - map user iovec into bio + * @q: the struct request_queue for the bio + * @bdev: destination block device + * @iter: iovec iterator + * @gfp_mask: memory allocation flags * * Map the user space address into a bio suitable for io to a block * device. Returns an error pointer in case of error. 
*/ struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev, - const struct sg_iovec *iov, int iov_count, - int write_to_vm, gfp_t gfp_mask) + const struct iov_iter *iter, + gfp_t gfp_mask) { struct bio *bio; - bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm, - gfp_mask); + bio = __bio_map_user_iov(q, bdev, iter, gfp_mask); if (IS_ERR(bio)) return bio; diff --git a/block/blk-map.c b/block/blk-map.c index 152a5fe5d85e..30e6bb871c5c 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -5,7 +5,7 @@ #include #include #include -#include /* for struct sg_iovec */ +#include #include "blk.h" @@ -44,9 +44,7 @@ static int __blk_rq_unmap_user(struct bio *bio) * @q: request queue where request should be inserted * @rq: request to map data to * @map_data: pointer to the rq_map_data holding pages (if necessary) - * @iov: pointer to the iovec - * @iov_count: number of elements in the iovec - * @len: I/O byte count + * @iter: iovec iterator * @gfp_mask: memory allocation flags * * Description: @@ -63,20 +61,21 @@ static int __blk_rq_unmap_user(struct bio *bio) * unmapping. */ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, - struct rq_map_data *map_data, const struct sg_iovec *iov, - int iov_count, unsigned int len, gfp_t gfp_mask) + struct rq_map_data *map_data, + const struct iov_iter *iter, gfp_t gfp_mask) { struct bio *bio; - int i, read = rq_data_dir(rq) == READ; int unaligned = 0; + struct iov_iter i; + struct iovec iov; - if (!iov || iov_count <= 0) + if (!iter || !iter->count) return -EINVAL; - for (i = 0; i < iov_count; i++) { - unsigned long uaddr = (unsigned long)iov[i].iov_base; + iov_for_each(iov, i, *iter) { + unsigned long uaddr = (unsigned long) iov.iov_base; - if (!iov[i].iov_len) + if (!iov.iov_len) return -EINVAL; /* @@ -86,16 +85,15 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, unaligned = 1; } - if (unaligned || (q->dma_pad_mask & len) || map_data) - bio = bio_copy_user_iov(q, map_data, iov, iov_count, read, - gfp_mask); + if (unaligned || (q->dma_pad_mask & iter->count) || map_data) + bio = bio_copy_user_iov(q, map_data, iter, gfp_mask); else - bio = bio_map_user_iov(q, NULL, iov, iov_count, read, gfp_mask); + bio = bio_map_user_iov(q, NULL, iter, gfp_mask); if (IS_ERR(bio)) return PTR_ERR(bio); - if (bio->bi_iter.bi_size != len) { + if (bio->bi_iter.bi_size != iter->count) { /* * Grab an extra reference to this bio, as bio_unmap_user() * expects to be able to drop it twice as it happens on the @@ -121,12 +119,14 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq, struct rq_map_data *map_data, void __user *ubuf, unsigned long len, gfp_t gfp_mask) { - struct sg_iovec iov; + struct iovec iov; + struct iov_iter i; - iov.iov_base = (void __user *)ubuf; + iov.iov_base = ubuf; iov.iov_len = len; + iov_iter_init(&i, rq_data_dir(rq), &iov, 1, len); - return blk_rq_map_user_iov(q, rq, map_data, &iov, 1, len, gfp_mask); + return blk_rq_map_user_iov(q, rq, map_data, &i, gfp_mask); } EXPORT_SYMBOL(blk_rq_map_user); diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 28163fad3c5d..e1f71c396193 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -332,7 +332,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, ret = 0; if (hdr->iovec_count) { - size_t iov_data_len; + struct iov_iter i; struct iovec *iov = NULL; ret = rw_copy_check_uvector(-1, hdr->dxferp, hdr->iovec_count, @@ -342,20 +342,11 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, goto 
out_free_cdb; } - iov_data_len = ret; - ret = 0; - /* SG_IO howto says that the shorter of the two wins */ - if (hdr->dxfer_len < iov_data_len) { - hdr->iovec_count = iov_shorten(iov, - hdr->iovec_count, - hdr->dxfer_len); - iov_data_len = hdr->dxfer_len; - } + iov_iter_init(&i, rq_data_dir(rq), iov, hdr->iovec_count, + min_t(unsigned, ret, hdr->dxfer_len)); - ret = blk_rq_map_user_iov(q, rq, NULL, (struct sg_iovec *) iov, - hdr->iovec_count, - iov_data_len, GFP_KERNEL); + ret = blk_rq_map_user_iov(q, rq, NULL, &i, GFP_KERNEL); kfree(iov); } else if (hdr->dxfer_len) ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len, diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index b14f64cb9724..4052e592818c 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1734,22 +1734,19 @@ sg_start_req(Sg_request *srp, unsigned char *cmd) } if (iov_count) { - int len, size = sizeof(struct sg_iovec) * iov_count; + int size = sizeof(struct iovec) * iov_count; struct iovec *iov; + struct iov_iter i; iov = memdup_user(hp->dxferp, size); if (IS_ERR(iov)) return PTR_ERR(iov); - len = iov_length(iov, iov_count); - if (hp->dxfer_len < len) { - iov_count = iov_shorten(iov, iov_count, hp->dxfer_len); - len = hp->dxfer_len; - } + iov_iter_init(&i, rw, iov, iov_count, + min_t(size_t, hp->dxfer_len, + iov_length(iov, iov_count))); - res = blk_rq_map_user_iov(q, rq, md, (struct sg_iovec *)iov, - iov_count, - len, GFP_ATOMIC); + res = blk_rq_map_user_iov(q, rq, md, &i, GFP_ATOMIC); kfree(iov); } else res = blk_rq_map_user(q, rq, md, hp->dxferp, diff --git a/include/linux/bio.h b/include/linux/bio.h index d0d6735d61da..0d6105b34ffa 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -428,11 +428,10 @@ extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int); extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *, unsigned int, unsigned int); extern int bio_get_nr_vecs(struct block_device *); -struct sg_iovec; struct rq_map_data; extern struct bio *bio_map_user_iov(struct request_queue *, struct block_device *, - const struct sg_iovec *, int, int, gfp_t); + const struct iov_iter *, gfp_t); extern void bio_unmap_user(struct bio *); extern struct bio *bio_map_kern(struct request_queue *, void *, unsigned int, gfp_t); @@ -462,8 +461,8 @@ extern int bio_alloc_pages(struct bio *bio, gfp_t gfp); extern struct bio *bio_copy_user_iov(struct request_queue *, struct rq_map_data *, - const struct sg_iovec *, - int, int, gfp_t); + const struct iov_iter *, + gfp_t); extern int bio_uncopy_user(struct bio *); void zero_fill_bio(struct bio *bio); extern struct bio_vec *bvec_alloc(gfp_t, int, unsigned long *, mempool_t *); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 13e16401a7ce..bf4ef666d191 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -855,8 +855,8 @@ extern int blk_rq_map_user(struct request_queue *, struct request *, extern int blk_rq_unmap_user(struct bio *); extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, unsigned int, gfp_t); extern int blk_rq_map_user_iov(struct request_queue *, struct request *, - struct rq_map_data *, const struct sg_iovec *, - int, unsigned int, gfp_t); + struct rq_map_data *, const struct iov_iter *, + gfp_t); extern int blk_execute_rq(struct request_queue *, struct gendisk *, struct request *, int); extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, -- cgit v1.2.3 From 37f19e57a0de3c4a3417aa13ff4d04f1e0dee4b3 Mon Sep 17 00:00:00 2001 From: 
Christoph Hellwig Date: Sun, 18 Jan 2015 16:16:33 +0100 Subject: block: merge __bio_map_user_iov into bio_map_user_iov And also remove the unused bdev argument. Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/bio.c | 56 +++++++++++++++++++---------------------------------- block/blk-map.c | 2 +- include/linux/bio.h | 1 - 3 files changed, 21 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index a69a9c9e7c93..0723d4ce8589 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1244,10 +1244,18 @@ out_bmd: return ERR_PTR(ret); } -static struct bio *__bio_map_user_iov(struct request_queue *q, - struct block_device *bdev, - const struct iov_iter *iter, - gfp_t gfp_mask) +/** + * bio_map_user_iov - map user iovec into bio + * @q: the struct request_queue for the bio + * @iter: iovec iterator + * @gfp_mask: memory allocation flags + * + * Map the user space address into a bio suitable for io to a block + * device. Returns an error pointer in case of error. + */ +struct bio *bio_map_user_iov(struct request_queue *q, + const struct iov_iter *iter, + gfp_t gfp_mask) { int j; int nr_pages = 0; @@ -1343,8 +1351,15 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, if (iter->type & WRITE) bio->bi_rw |= REQ_WRITE; - bio->bi_bdev = bdev; bio->bi_flags |= (1 << BIO_USER_MAPPED); + + /* + * subtle -- if __bio_map_user() ended up bouncing a bio, + * it would normally disappear when its bi_end_io is run. + * however, we need it for the unmap, so grab an extra + * reference to it + */ + bio_get(bio); return bio; out_unmap: @@ -1359,37 +1374,6 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, return ERR_PTR(ret); } -/** - * bio_map_user_iov - map user iovec into bio - * @q: the struct request_queue for the bio - * @bdev: destination block device - * @iter: iovec iterator - * @gfp_mask: memory allocation flags - * - * Map the user space address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. - */ -struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev, - const struct iov_iter *iter, - gfp_t gfp_mask) -{ - struct bio *bio; - - bio = __bio_map_user_iov(q, bdev, iter, gfp_mask); - if (IS_ERR(bio)) - return bio; - - /* - * subtle -- if __bio_map_user() ended up bouncing a bio, - * it would normally disappear when its bi_end_io is run. 
- * however, we need it for the unmap, so grab an extra - * reference to it - */ - bio_get(bio); - - return bio; -} - static void __bio_unmap_user(struct bio *bio) { struct bio_vec *bvec; diff --git a/block/blk-map.c b/block/blk-map.c index 30e6bb871c5c..0f22911f17dc 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -88,7 +88,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, if (unaligned || (q->dma_pad_mask & iter->count) || map_data) bio = bio_copy_user_iov(q, map_data, iter, gfp_mask); else - bio = bio_map_user_iov(q, NULL, iter, gfp_mask); + bio = bio_map_user_iov(q, iter, gfp_mask); if (IS_ERR(bio)) return PTR_ERR(bio); diff --git a/include/linux/bio.h b/include/linux/bio.h index 0d6105b34ffa..da3a127c9958 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -430,7 +430,6 @@ extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *, extern int bio_get_nr_vecs(struct block_device *); struct rq_map_data; extern struct bio *bio_map_user_iov(struct request_queue *, - struct block_device *, const struct iov_iter *, gfp_t); extern void bio_unmap_user(struct bio *); extern struct bio *bio_map_kern(struct request_queue *, void *, unsigned int, -- cgit v1.2.3 From 472e259449819d939b5a5188b6f4c7d59aa4304c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 5 Feb 2015 16:50:30 -0500 Subject: NFSv4.1: Pin the inode and super block in asynchronous layoutcommit If we're sending an asynchronous layoutcommit, then we need to ensure that the inode and the super block remain pinned. Signed-off-by: Trond Myklebust Reviewed-by: Peng Tao --- fs/nfs/nfs4proc.c | 19 +++++++++++-------- include/linux/nfs_xdr.h | 1 + 2 files changed, 12 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index dd892a4e7eb3..e092b8540e2e 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7989,6 +7989,7 @@ static void nfs4_layoutcommit_release(void *calldata) nfs_post_op_update_inode_force_wcc(data->args.inode, data->res.fattr); put_rpccred(data->cred); + nfs_iput_and_deactive(data->inode); kfree(data); } @@ -8013,7 +8014,6 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync) .rpc_message = &msg, .callback_ops = &nfs4_layoutcommit_ops, .callback_data = data, - .flags = RPC_TASK_ASYNC, }; struct rpc_task *task; int status = 0; @@ -8024,18 +8024,21 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync) data->args.lastbytewritten, data->args.inode->i_ino); + if (!sync) { + data->inode = nfs_igrab_and_active(data->args.inode); + if (data->inode == NULL) { + nfs4_layoutcommit_release(data); + return -EAGAIN; + } + task_setup_data.flags = RPC_TASK_ASYNC; + } nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); - if (sync == false) - goto out; - status = nfs4_wait_for_completion_rpc_task(task); - if (status != 0) - goto out; - status = task->tk_status; + if (sync) + status = task->tk_status; trace_nfs4_layoutcommit(data->args.inode, status); -out: dprintk("%s: status %d\n", __func__, status); rpc_put_task(task); return status; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 2c35e2affa6f..bb0d56f737e0 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -285,6 +285,7 @@ struct nfs4_layoutcommit_data { struct nfs_fattr fattr; struct list_head lseg_list; struct rpc_cred *cred; + struct inode *inode; struct nfs4_layoutcommit_args args; struct 
nfs4_layoutcommit_res res; }; -- cgit v1.2.3 From 5a0ec8acb945e302ce819b4a9787796ccf284548 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 5 Feb 2015 16:35:16 -0500 Subject: NFSv4.1: Pin the inode and super block in asynchronous layoutreturns If we're sending an asynchronous layoutreturn, then we need to ensure that the inode and the super block remain pinned. Cc: Peng Tao Signed-off-by: Trond Myklebust Reviewed-by: Peng Tao --- fs/nfs/nfs4proc.c | 19 +++++++++++-------- include/linux/nfs_xdr.h | 1 + 2 files changed, 12 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index e092b8540e2e..2e7c9f7a6f7c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7856,6 +7856,7 @@ static void nfs4_layoutreturn_release(void *calldata) lo->plh_block_lgets--; spin_unlock(&lo->plh_inode->i_lock); pnfs_put_layout_hdr(lrp->args.layout); + nfs_iput_and_deactive(lrp->inode); kfree(calldata); dprintk("<-- %s\n", __func__); } @@ -7880,23 +7881,25 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync) .rpc_message = &msg, .callback_ops = &nfs4_layoutreturn_call_ops, .callback_data = lrp, - .flags = RPC_TASK_ASYNC, }; int status = 0; dprintk("--> %s\n", __func__); + if (!sync) { + lrp->inode = nfs_igrab_and_active(lrp->args.inode); + if (!lrp->inode) { + nfs4_layoutreturn_release(lrp); + return -EAGAIN; + } + task_setup_data.flags |= RPC_TASK_ASYNC; + } nfs4_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1); task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); - if (sync == false) - goto out; - status = nfs4_wait_for_completion_rpc_task(task); - if (status != 0) - goto out; - status = task->tk_status; + if (sync) + status = task->tk_status; trace_nfs4_layoutreturn(lrp->args.inode, status); -out: dprintk("<-- %s status=%d\n", __func__, status); rpc_put_task(task); return status; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index bb0d56f737e0..38d96ba935c2 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -310,6 +310,7 @@ struct nfs4_layoutreturn { struct nfs4_layoutreturn_res res; struct rpc_cred *cred; struct nfs_client *clp; + struct inode *inode; int rpc_status; }; -- cgit v1.2.3 From a05d59a5673339ef6936d6940cdf68172ce75b9f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 6 Feb 2015 14:30:50 -0500 Subject: tracing: Add condition check to RCU lockdep checks The trace_tlb_flush() tracepoint can be called when a CPU is going offline. When a CPU is offline, RCU is no longer watching that CPU and since the tracepoint is protected by RCU, it must not be called. To prevent the tlb_flush tracepoint from being called when the CPU is offline, it was converted to a TRACE_EVENT_CONDITION where the condition checks if the CPU is online before calling the tracepoint. Unfortunately, this was not enough to stop lockdep from complaining about it. Even though the RCU protected code of the tracepoint will never be called, the condition is hidden within the tracepoint, and even though the condition prevents RCU code from being called, the lockdep checks are outside the tracepoint (this is to test tracepoints even when they are not enabled). Even though tracepoints should be checked to be RCU safe when they are not enabled, the condition should still be considered when checking RCU. 
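For reference, a conditional tracepoint is declared with TRACE_EVENT_CONDITION(), along the lines of the sketch below. This is only an illustration of the pattern the tlb_flush tracepoint follows; the event fields and the TP_printk() format here are simplified assumptions, not the literal definition from include/trace/events/tlb.h.

	TRACE_EVENT_CONDITION(tlb_flush,

		TP_PROTO(int reason, unsigned long pages),
		TP_ARGS(reason, pages),

		/*
		 * The condition gates the event: the RCU-protected
		 * callback code only runs while this CPU is online.
		 * The lockdep-only RCU check must honor the same
		 * condition, which is what this patch fixes.
		 */
		TP_CONDITION(cpu_online(smp_processor_id())),

		TP_STRUCT__entry(
			__field(int, reason)
			__field(unsigned long, pages)
		),

		TP_fast_assign(
			__entry->reason = reason;
			__entry->pages = pages;
		),

		TP_printk("pages=%lu reason=%d", __entry->pages, __entry->reason)
	);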
Link: http://lkml.kernel.org/r/CA+icZUUGiGDoL5NU8RuxKzFjoLjEKRtUWx=JB8B9a0EQv-eGzQ@mail.gmail.com Fixes: 3a630178fd5f "tracing: generate RCU warnings even when tracepoints are disabled" Cc: stable@vger.kernel.org # 3.18+ Acked-by: Dave Hansen Reported-by: Sedat Dilek Tested-by: Sedat Dilek Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index e08e21e5f601..c72851328ca9 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -173,7 +173,7 @@ extern void syscall_unregfunc(void); TP_PROTO(data_proto), \ TP_ARGS(data_args), \ TP_CONDITION(cond),,); \ - if (IS_ENABLED(CONFIG_LOCKDEP)) { \ + if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \ rcu_read_lock_sched_notrace(); \ rcu_dereference_sched(__tracepoint_##name.funcs);\ rcu_read_unlock_sched_notrace(); \ -- cgit v1.2.3 From a9b2c06dbef48ed31cff1764c5ce824829106f4f Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Fri, 6 Feb 2015 16:04:39 -0500 Subject: tcp: mitigate ACK loops for connections as tcp_request_sock In the SYN_RECV state, where the TCP connection is represented by tcp_request_sock, we now rate-limit SYNACKs in response to a client's retransmitted SYNs: we do not send a SYNACK in response to a client SYN if it has been less than sysctl_tcp_invalid_ratelimit (default 500ms) since we last sent a SYNACK in response to a client's retransmitted SYN. This allows the vast majority of legitimate client connections to proceed unimpeded, even for the most aggressive platforms, iOS and MacOS, which actually retransmit SYNs at 1-second intervals several times in a row. They use SYN RTO timeouts following the progression: 1,1,1,1,1,2,4,8,16,32. Reported-by: Avery Fay Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + include/net/tcp.h | 1 + net/ipv4/tcp_minisocks.c | 6 +++++- 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 67309ece0772..bcc828d3b9b9 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -115,6 +115,7 @@ struct tcp_request_sock { u32 rcv_isn; u32 snt_isn; u32 snt_synack; /* synack sent time */ + u32 last_oow_ack_time; /* last SYNACK */ u32 rcv_nxt; /* the ack # by SYNACK. For * FastOpen it's the seq# * after data-in-SYN. diff --git a/include/net/tcp.h b/include/net/tcp.h index b81f45c67b2e..da4196fb78db 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1145,6 +1145,7 @@ static inline void tcp_openreq_init(struct request_sock *req, tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; tcp_rsk(req)->snt_synack = tcp_time_stamp; + tcp_rsk(req)->last_oow_ack_time = 0; req->mss = rx_opt->mss_clamp; req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0; ireq->tstamp_ok = rx_opt->tstamp_ok; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index bc9216dc9de1..131aa4950d1c 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -605,7 +605,11 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * Reset timer after retransmitting SYNACK, similar to * the idea of fast retransmit in recovery.
*/ - if (!inet_rtx_syn_ack(sk, req)) + if (!tcp_oow_rate_limited(sock_net(sk), skb, + LINUX_MIB_TCPACKSKIPPEDSYNRECV, + &tcp_rsk(req)->last_oow_ack_time) && + + !inet_rtx_syn_ack(sk, req)) req->expires = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX) + jiffies; return NULL; -- cgit v1.2.3 From f2b2c582e82429270d5818fbabe653f4359d7024 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Fri, 6 Feb 2015 16:04:40 -0500 Subject: tcp: mitigate ACK loops for connections as tcp_sock Ensure that in state ESTABLISHED, where the connection is represented by a tcp_sock, we rate limit dupacks in response to incoming packets (a) with TCP timestamps that fail PAWS checks, or (b) with sequence numbers or ACK numbers that are out of the acceptable window. We do not send a dupack in response to out-of-window packets if it has been less than sysctl_tcp_invalid_ratelimit (default 500ms) since we last sent a dupack in response to an out-of-window packet. There is already a similar (although global) rate-limiting mechanism for "challenge ACKs". When deciding whether to send a challenge ACK, we first consult the new per-connection rate limit, and then the global rate limit. Reported-by: Avery Fay Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + net/ipv4/tcp_input.c | 29 ++++++++++++++++++++++------- net/ipv4/tcp_minisocks.c | 1 + 3 files changed, 24 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index bcc828d3b9b9..66d85a80a1ec 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -153,6 +153,7 @@ struct tcp_sock { u32 snd_sml; /* Last byte of the most recently transmitted small packet */ u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ u32 lsndtime; /* timestamp of last sent data packet (for restart window) */ + u32 last_oow_ack_time; /* timestamp of last out-of-window ACK */ u32 tsoffset; /* timestamp offset */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9401aa43b814..8fdd27b17306 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3322,13 +3322,22 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 } /* RFC 5961 7 [ACK Throttling] */ -static void tcp_send_challenge_ack(struct sock *sk) +static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) { /* unprotected vars, we dont care of overwrites */ static u32 challenge_timestamp; static unsigned int challenge_count; - u32 now = jiffies / HZ; + struct tcp_sock *tp = tcp_sk(sk); + u32 now; + + /* First check our per-socket dupack rate limit. */ + if (tcp_oow_rate_limited(sock_net(sk), skb, + LINUX_MIB_TCPACKSKIPPEDCHALLENGE, + &tp->last_oow_ack_time)) + return; + /* Then check the host-wide RFC 5961 rate limit.
*/ + now = jiffies / HZ; if (now != challenge_timestamp) { challenge_timestamp = now; challenge_count = 0; @@ -3424,7 +3433,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (before(ack, prior_snd_una)) { /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */ if (before(ack, prior_snd_una - tp->max_window)) { - tcp_send_challenge_ack(sk); + tcp_send_challenge_ack(sk, skb); return -1; } goto old_ack; @@ -4993,7 +5002,10 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, tcp_paws_discard(sk, skb)) { if (!th->rst) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); - tcp_send_dupack(sk, skb); + if (!tcp_oow_rate_limited(sock_net(sk), skb, + LINUX_MIB_TCPACKSKIPPEDPAWS, + &tp->last_oow_ack_time)) + tcp_send_dupack(sk, skb); goto discard; } /* Reset is accepted even if it did not pass PAWS. */ @@ -5010,7 +5022,10 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, if (!th->rst) { if (th->syn) goto syn_challenge; - tcp_send_dupack(sk, skb); + if (!tcp_oow_rate_limited(sock_net(sk), skb, + LINUX_MIB_TCPACKSKIPPEDSEQ, + &tp->last_oow_ack_time)) + tcp_send_dupack(sk, skb); } goto discard; } @@ -5026,7 +5041,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) tcp_reset(sk); else - tcp_send_challenge_ack(sk); + tcp_send_challenge_ack(sk, skb); goto discard; } @@ -5040,7 +5055,7 @@ syn_challenge: if (syn_inerr) TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE); - tcp_send_challenge_ack(sk); + tcp_send_challenge_ack(sk, skb); goto discard; } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 131aa4950d1c..98a840561ec8 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -467,6 +467,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, tcp_enable_early_retrans(newtp); newtp->tlp_high_seq = 0; newtp->lsndtime = treq->snt_synack; + newtp->last_oow_ack_time = 0; newtp->total_retrans = req->num_retrans; /* So many TCP implementations out there (incorrectly) count the -- cgit v1.2.3 From 4fb17a6091674f469e8ac85dc770fbf9a9ba7cc8 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Fri, 6 Feb 2015 16:04:41 -0500 Subject: tcp: mitigate ACK loops for connections as tcp_timewait_sock Ensure that in state FIN_WAIT2 or TIME_WAIT, where the connection is represented by a tcp_timewait_sock, we rate limit dupacks in response to incoming packets (a) with TCP timestamps that fail PAWS checks, or (b) with sequence numbers that are out of the acceptable window. We do not send a dupack in response to out-of-window packets if it has been less than sysctl_tcp_invalid_ratelimit (default 500ms) since we last sent a dupack in response to an out-of-window packet. Reported-by: Avery Fay Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/tcp.h | 4 ++++ net/ipv4/tcp_minisocks.c | 29 ++++++++++++++++++++++++----- 2 files changed, 28 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 66d85a80a1ec..1a7adb411647 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -342,6 +342,10 @@ struct tcp_timewait_sock { u32 tw_rcv_wnd; u32 tw_ts_offset; u32 tw_ts_recent; + + /* The time we sent the last out-of-window ACK: */ + u32 tw_last_oow_ack_time; + long tw_ts_recent_stamp; #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *tw_md5_key; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 98a840561ec8..dd11ac7798c6 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -58,6 +58,25 @@ static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) return seq == e_win && seq == end_seq; } +static enum tcp_tw_status +tcp_timewait_check_oow_rate_limit(struct inet_timewait_sock *tw, + const struct sk_buff *skb, int mib_idx) +{ + struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); + + if (!tcp_oow_rate_limited(twsk_net(tw), skb, mib_idx, + &tcptw->tw_last_oow_ack_time)) { + /* Send ACK. Note, we do not put the bucket, + * it will be released by caller. + */ + return TCP_TW_ACK; + } + + /* We are rate-limiting, so just release the tw sock and drop skb. */ + inet_twsk_put(tw); + return TCP_TW_SUCCESS; +} + /* * * Main purpose of TIME-WAIT state is to close connection gracefully, * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN @@ -116,7 +135,8 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt, tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd)) - return TCP_TW_ACK; + return tcp_timewait_check_oow_rate_limit( + tw, skb, LINUX_MIB_TCPACKSKIPPEDFINWAIT2); if (th->rst) goto kill; @@ -250,10 +270,8 @@ kill: inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, TCP_TIMEWAIT_LEN); - /* Send ACK. Note, we do not put the bucket, - * it will be released by caller. - */ - return TCP_TW_ACK; + return tcp_timewait_check_oow_rate_limit( + tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT); } inet_twsk_put(tw); return TCP_TW_SUCCESS; @@ -289,6 +307,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tcptw->tw_ts_recent = tp->rx_opt.ts_recent; tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; tcptw->tw_ts_offset = tp->tsoffset; + tcptw->tw_last_oow_ack_time = 0; #if IS_ENABLED(CONFIG_IPV6) if (tw->tw_family == PF_INET6) { -- cgit v1.2.3 From 096a4cfa5807aa89c78ce12309c0b1c10cf88184 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 6 Feb 2015 18:54:19 +0100 Subject: net: fix a typo in skb_checksum_validate_zero_check Remove trailing underscore. Signed-off-by: Sabrina Dubroca Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 111e665455c3..1bb36edb66b9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3072,7 +3072,7 @@ static inline __wsum null_compute_pseudo(struct sk_buff *skb, int proto) #define skb_checksum_validate_zero_check(skb, proto, check, \ compute_pseudo) \ - __skb_checksum_validate_(skb, proto, true, true, check, compute_pseudo) + __skb_checksum_validate(skb, proto, true, true, check, compute_pseudo) #define skb_checksum_simple_validate(skb) \ __skb_checksum_validate(skb, 0, true, false, 0, null_compute_pseudo) -- cgit v1.2.3 From 567e4b79731c352a17d73c483959f795d3593e03 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 6 Feb 2015 12:59:01 -0800 Subject: net: rfs: add hash collision detection Receive Flow Steering is a nice solution but suffers from hash collisions when a mix of connected and unconnected traffic is received on the host, once the flow hash table is populated. Also, clearing the flow in inet_release() makes RFS not very good for short-lived flows, as many packets can follow close() (FIN, ACK packets, ...). This patch extends the information stored in the global hash table to include not only the cpu number but also the upper part of the hash value. I use a 32bit value, and dynamically split it in two parts. For a host with up to 64 possible cpus, this gives 6 bits for the cpu number, and 26 (32-6) bits for the upper part of the hash. Since hash bucket selection uses the low order bits of the hash, we have a full hash match if /proc/sys/net/core/rps_sock_flow_entries is big enough. If the hash found in the flow table does not match, we fall back to RPS (if it is enabled for the rxqueue). This means that a packet for a non-connected flow can avoid the IPI through an unrelated/victim CPU. This also means we no longer have to clear the table at socket close time, and this helps the performance of short-lived flows. Signed-off-by: Eric Dumazet Acked-by: Tom Herbert Signed-off-by: David S. Miller --- drivers/net/tun.c | 5 +---- include/linux/netdevice.h | 34 ++++++++++++++++---------------- include/net/sock.h | 24 +---------------------- net/core/dev.c | 48 ++++++++++++++++++++++++++-------------------- net/core/sysctl_net_core.c | 2 +- net/ipv4/af_inet.c | 2 -- 6 files changed, 47 insertions(+), 68 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index ad7d3d5f3ee5..857dca47bf80 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -256,7 +256,6 @@ static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e) { tun_debug(KERN_INFO, tun, "delete flow: hash %u index %u\n", e->rxhash, e->queue_index); - sock_rps_reset_flow_hash(e->rps_rxhash); hlist_del_rcu(&e->hash_link); kfree_rcu(e, rcu); --tun->flow_count; @@ -373,10 +372,8 @@ unlock: */ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash) { - if (unlikely(e->rps_rxhash != hash)) { - sock_rps_reset_flow_hash(e->rps_rxhash); + if (unlikely(e->rps_rxhash != hash)) e->rps_rxhash = hash; - } } /* We try to identify a flow through its rxhash first.
The reason that diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ce784d5018e0..ab3b7cef4638 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -644,39 +644,39 @@ struct rps_dev_flow_table { /* * The rps_sock_flow_table contains mappings of flows to the last CPU * on which they were processed by the application (set in recvmsg). + * Each entry is a 32bit value. Upper part is the high order bits + * of flow hash, lower part is cpu number. + * rps_cpu_mask is used to partition the space, depending on number of + * possible cpus : rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1 + * For example, if 64 cpus are possible, rps_cpu_mask = 0x3f, + * meaning we use 32-6=26 bits for the hash. */ struct rps_sock_flow_table { - unsigned int mask; - u16 ents[0]; + u32 mask; + u32 ents[0]; }; -#define RPS_SOCK_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_sock_flow_table) + \ - ((_num) * sizeof(u16))) +#define RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num])) #define RPS_NO_CPU 0xffff +extern u32 rps_cpu_mask; +extern struct rps_sock_flow_table __rcu *rps_sock_flow_table; + static inline void rps_record_sock_flow(struct rps_sock_flow_table *table, u32 hash) { if (table && hash) { - unsigned int cpu, index = hash & table->mask; + unsigned int index = hash & table->mask; + u32 val = hash & ~rps_cpu_mask; /* We only give a hint, preemption can change cpu under us */ - cpu = raw_smp_processor_id(); + val |= raw_smp_processor_id(); - if (table->ents[index] != cpu) - table->ents[index] = cpu; + if (table->ents[index] != val) + table->ents[index] = val; } } -static inline void rps_reset_sock_flow(struct rps_sock_flow_table *table, - u32 hash) -{ - if (table && hash) - table->ents[hash & table->mask] = RPS_NO_CPU; -} - -extern struct rps_sock_flow_table __rcu *rps_sock_flow_table; - #ifdef CONFIG_RFS_ACCEL bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id, u16 filter_id); diff --git a/include/net/sock.h b/include/net/sock.h index d28b8fededd6..e13824570b0f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -857,18 +857,6 @@ static inline void sock_rps_record_flow_hash(__u32 hash) #endif } -static inline void sock_rps_reset_flow_hash(__u32 hash) -{ -#ifdef CONFIG_RPS - struct rps_sock_flow_table *sock_flow_table; - - rcu_read_lock(); - sock_flow_table = rcu_dereference(rps_sock_flow_table); - rps_reset_sock_flow(sock_flow_table, hash); - rcu_read_unlock(); -#endif -} - static inline void sock_rps_record_flow(const struct sock *sk) { #ifdef CONFIG_RPS @@ -876,28 +864,18 @@ static inline void sock_rps_record_flow(const struct sock *sk) #endif } -static inline void sock_rps_reset_flow(const struct sock *sk) -{ -#ifdef CONFIG_RPS - sock_rps_reset_flow_hash(sk->sk_rxhash); -#endif -} - static inline void sock_rps_save_rxhash(struct sock *sk, const struct sk_buff *skb) { #ifdef CONFIG_RPS - if (unlikely(sk->sk_rxhash != skb->hash)) { - sock_rps_reset_flow(sk); + if (unlikely(sk->sk_rxhash != skb->hash)) sk->sk_rxhash = skb->hash; - } #endif } static inline void sock_rps_reset_rxhash(struct sock *sk) { #ifdef CONFIG_RPS - sock_rps_reset_flow(sk); sk->sk_rxhash = 0; #endif } diff --git a/net/core/dev.c b/net/core/dev.c index a3a96ffc67f4..8be38675e1a8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3030,6 +3030,8 @@ static inline void ____napi_schedule(struct softnet_data *sd, /* One global table that all flow-based protocols share. 
*/ struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; EXPORT_SYMBOL(rps_sock_flow_table); +u32 rps_cpu_mask __read_mostly; +EXPORT_SYMBOL(rps_cpu_mask); struct static_key rps_needed __read_mostly; @@ -3086,16 +3088,17 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow **rflowp) { - struct netdev_rx_queue *rxqueue; - struct rps_map *map; + const struct rps_sock_flow_table *sock_flow_table; + struct netdev_rx_queue *rxqueue = dev->_rx; struct rps_dev_flow_table *flow_table; - struct rps_sock_flow_table *sock_flow_table; + struct rps_map *map; int cpu = -1; - u16 tcpu; + u32 tcpu; u32 hash; if (skb_rx_queue_recorded(skb)) { u16 index = skb_get_rx_queue(skb); + if (unlikely(index >= dev->real_num_rx_queues)) { WARN_ONCE(dev->real_num_rx_queues > 1, "%s received packet on queue %u, but number " @@ -3103,39 +3106,40 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, dev->name, index, dev->real_num_rx_queues); goto done; } - rxqueue = dev->_rx + index; - } else - rxqueue = dev->_rx; + rxqueue += index; + } + /* Avoid computing hash if RFS/RPS is not active for this rxqueue */ + + flow_table = rcu_dereference(rxqueue->rps_flow_table); map = rcu_dereference(rxqueue->rps_map); - if (map) { - if (map->len == 1 && - !rcu_access_pointer(rxqueue->rps_flow_table)) { - tcpu = map->cpus[0]; - if (cpu_online(tcpu)) - cpu = tcpu; - goto done; - } - } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) { + if (!flow_table && !map) goto done; - } skb_reset_network_header(skb); hash = skb_get_hash(skb); if (!hash) goto done; - flow_table = rcu_dereference(rxqueue->rps_flow_table); sock_flow_table = rcu_dereference(rps_sock_flow_table); if (flow_table && sock_flow_table) { - u16 next_cpu; struct rps_dev_flow *rflow; + u32 next_cpu; + u32 ident; + + /* First check into global flow table if there is a match */ + ident = sock_flow_table->ents[hash & sock_flow_table->mask]; + if ((ident ^ hash) & ~rps_cpu_mask) + goto try_rps; + next_cpu = ident & rps_cpu_mask; + + /* OK, now we know there is a match, + * we can look at the local (per receive queue) flow table + */ rflow = &flow_table->flows[hash & flow_table->mask]; tcpu = rflow->cpu; - next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask]; - /* * If the desired CPU (where last recvmsg was done) is * different from current CPU (one in the rx-queue flow @@ -3162,6 +3166,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, } } +try_rps: + if (map) { tcpu = map->cpus[reciprocal_scale(hash, map->len)]; if (cpu_online(tcpu)) { diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index fde21d19e61b..7a31be5e361f 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -65,7 +65,7 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, mutex_unlock(&sock_flow_mutex); return -ENOMEM; } - + rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1; sock_table->mask = size - 1; } else sock_table = orig_sock_table; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index a44773c8346c..d2e49baaff63 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -395,8 +395,6 @@ int inet_release(struct socket *sock) if (sk) { long timeout; - sock_rps_reset_flow(sk); - /* Applications forget to leave groups before exiting */ ip_mc_drop_socket(sk); -- cgit v1.2.3 From 718ba5b87343df303017585200ee182e937eabfc Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 8 Feb 2015 
18:19:25 -0500 Subject: SUNRPC: Add helpers to prevent socket create from racing The socket lock is currently held by the task that is requesting the connection be established. While that is efficient in the case where the connection happens quickly, it is racy in the case where it doesn't. What we really want is for the connect helper to be able to block access to the socket while it is being set up. This patch does so by arranging to transfer the socket lock from the task that is requesting the connect attempt, and then releasing that lock once everything is done. This scheme also gives us automatic protection against collisions with the RPC close code, so we can kill the cancel_delayed_work_sync() call in xs_close(). Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 3 +++ net/sunrpc/xprt.c | 37 +++++++++++++++++++++++++++++++++---- net/sunrpc/xprtsock.c | 7 +++++-- 3 files changed, 41 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 9d27ac45b909..2926e618dbc6 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -347,6 +347,9 @@ void xprt_force_disconnect(struct rpc_xprt *xprt); void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie); int xs_swapper(struct rpc_xprt *xprt, int enable); +bool xprt_lock_connect(struct rpc_xprt *, struct rpc_task *, void *); +void xprt_unlock_connect(struct rpc_xprt *, void *); + /* * Reserved bit positions in xprt->state */ diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index ebbefad21a37..ff3574df8344 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -690,6 +690,37 @@ out_abort: spin_unlock(&xprt->transport_lock); } +bool xprt_lock_connect(struct rpc_xprt *xprt, + struct rpc_task *task, + void *cookie) +{ + bool ret = false; + + spin_lock_bh(&xprt->transport_lock); + if (!test_bit(XPRT_LOCKED, &xprt->state)) + goto out; + if (xprt->snd_task != task) + goto out; + xprt->snd_task = cookie; + ret = true; +out: + spin_unlock_bh(&xprt->transport_lock); + return ret; +} + +void xprt_unlock_connect(struct rpc_xprt *xprt, void *cookie) +{ + spin_lock_bh(&xprt->transport_lock); + if (xprt->snd_task != cookie) + goto out; + if (!test_bit(XPRT_LOCKED, &xprt->state)) + goto out; + xprt->snd_task =NULL; + xprt->ops->release_xprt(xprt, NULL); +out: + spin_unlock_bh(&xprt->transport_lock); +} + /** * xprt_connect - schedule a transport connect operation * @task: RPC task that is requesting the connect @@ -712,9 +743,7 @@ void xprt_connect(struct rpc_task *task) if (test_and_clear_bit(XPRT_CLOSE_WAIT, &xprt->state)) xprt->ops->close(xprt); - if (xprt_connected(xprt)) - xprt_release_write(xprt, task); - else { + if (!xprt_connected(xprt)) { task->tk_rqstp->rq_bytes_sent = 0; task->tk_timeout = task->tk_rqstp->rq_timeout; rpc_sleep_on(&xprt->pending, task, xprt_connect_status); @@ -726,6 +755,7 @@ void xprt_connect(struct rpc_task *task) xprt->stat.connect_start = jiffies; xprt->ops->connect(xprt, task); } + xprt_release_write(xprt, task); } static void xprt_connect_status(struct rpc_task *task) @@ -758,7 +788,6 @@ static void xprt_connect_status(struct rpc_task *task) dprintk("RPC: %5u xprt_connect_status: error %d connecting to " "server %s\n", task->tk_pid, -task->tk_status, xprt->servername); - xprt_release_write(xprt, task); task->tk_status = -EIO; } } diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 0fa7ed93dc20..e57d8ed2c4d8 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ 
-852,8 +852,6 @@ static void xs_close(struct rpc_xprt *xprt) dprintk("RPC: xs_close xprt %p\n", xprt); - cancel_delayed_work_sync(&transport->connect_worker); - xs_reset_transport(transport); xprt->reestablish_timeout = 0; @@ -2101,6 +2099,7 @@ static void xs_udp_setup_socket(struct work_struct *work) trace_rpc_socket_connect(xprt, sock, 0); status = 0; out: + xprt_unlock_connect(xprt, transport); xprt_clear_connecting(xprt); xprt_wake_pending_tasks(xprt, status); } @@ -2286,6 +2285,7 @@ static void xs_tcp_setup_socket(struct work_struct *work) case 0: case -EINPROGRESS: case -EALREADY: + xprt_unlock_connect(xprt, transport); xprt_clear_connecting(xprt); return; case -EINVAL: @@ -2303,6 +2303,7 @@ static void xs_tcp_setup_socket(struct work_struct *work) out_eagain: status = -EAGAIN; out: + xprt_unlock_connect(xprt, transport); xprt_clear_connecting(xprt); xprt_wake_pending_tasks(xprt, status); } @@ -2325,6 +2326,8 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + WARN_ON_ONCE(!xprt_lock_connect(xprt, task, transport)); + if (transport->sock != NULL && !RPC_IS_SOFTCONN(task)) { dprintk("RPC: xs_connect delayed xprt %p for %lu " "seconds\n", -- cgit v1.2.3 From 4efdd92c921135175a85452cd41273d9e2788db3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 8 Feb 2015 15:34:28 -0500 Subject: SUNRPC: Remove TCP client connection reset hack Instead we rely on SO_REUSEPORT to provide the reconnection semantics that we need for NFSv2/v3. Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 1 - net/sunrpc/xprtsock.c | 67 +-------------------------------------------- 2 files changed, 1 insertion(+), 67 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 2926e618dbc6..86af854338b5 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -363,7 +363,6 @@ void xprt_unlock_connect(struct rpc_xprt *, void *); #define XPRT_CONNECTION_ABORT (7) #define XPRT_CONNECTION_CLOSE (8) #define XPRT_CONGESTED (9) -#define XPRT_CONNECTION_REUSE (10) static inline void xprt_set_connected(struct rpc_xprt *xprt) { diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index e53a5ca03daf..dbf279cd4494 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -796,8 +796,6 @@ static void xs_error_report(struct sock *sk) dprintk("RPC: xs_error_report client %p, error=%d...\n", xprt, -err); trace_rpc_socket_error(xprt, sk->sk_socket, err); - if (test_bit(XPRT_CONNECTION_REUSE, &xprt->state)) - goto out; xprt_wake_pending_tasks(xprt, err); out: read_unlock_bh(&sk->sk_callback_lock); @@ -2102,57 +2100,6 @@ out: xprt_wake_pending_tasks(xprt, status); } -/* - * We need to preserve the port number so the reply cache on the server can - * find our cached RPC replies when we get around to reconnecting. - */ -static void xs_abort_connection(struct sock_xprt *transport) -{ - int result; - struct sockaddr any; - - dprintk("RPC: disconnecting xprt %p to reuse port\n", transport); - - /* - * Disconnect the transport socket by doing a connect operation - * with AF_UNSPEC. This should return immediately... 
- */ - memset(&any, 0, sizeof(any)); - any.sa_family = AF_UNSPEC; - result = kernel_connect(transport->sock, &any, sizeof(any), 0); - trace_rpc_socket_reset_connection(&transport->xprt, - transport->sock, result); - if (!result) - xs_sock_reset_connection_flags(&transport->xprt); - dprintk("RPC: AF_UNSPEC connect return code %d\n", result); -} - -static void xs_tcp_reuse_connection(struct sock_xprt *transport) -{ - unsigned int state = transport->inet->sk_state; - - if (state == TCP_CLOSE && transport->sock->state == SS_UNCONNECTED) { - /* we don't need to abort the connection if the socket - * hasn't undergone a shutdown - */ - if (transport->inet->sk_shutdown == 0) - return; - dprintk("RPC: %s: TCP_CLOSEd and sk_shutdown set to %d\n", - __func__, transport->inet->sk_shutdown); - } - if ((1 << state) & (TCPF_ESTABLISHED|TCPF_SYN_SENT)) { - /* we don't need to abort the connection if the socket - * hasn't undergone a shutdown - */ - if (transport->inet->sk_shutdown == 0) - return; - dprintk("RPC: %s: ESTABLISHED/SYN_SENT " - "sk_shutdown set to %d\n", - __func__, transport->inet->sk_shutdown); - } - xs_abort_connection(transport); -} - static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); @@ -2245,18 +2192,6 @@ static void xs_tcp_setup_socket(struct work_struct *work) status = PTR_ERR(sock); goto out; } - } else { - int abort_and_exit; - - abort_and_exit = test_and_clear_bit(XPRT_CONNECTION_ABORT, - &xprt->state); - /* "close" the socket, preserving the local port */ - set_bit(XPRT_CONNECTION_REUSE, &xprt->state); - xs_tcp_reuse_connection(transport); - clear_bit(XPRT_CONNECTION_REUSE, &xprt->state); - - if (abort_and_exit) - goto out_eagain; } dprintk("RPC: worker connecting xprt %p via %s to " @@ -2296,9 +2231,9 @@ static void xs_tcp_setup_socket(struct work_struct *work) case -EADDRINUSE: case -ENOBUFS: /* retry with existing socket, after a delay */ + xs_tcp_force_close(xprt); goto out; } -out_eagain: status = -EAGAIN; out: xprt_unlock_connect(xprt, transport); -- cgit v1.2.3 From 93c1af6ca94c1e763efba76a127b5c135e3d23a6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 8 Feb 2015 20:39:13 -0800 Subject: net:rfs: adjust table size checking Make sure root user does not try something stupid. Also make sure mask field in struct rps_sock_flow_table does not share a cache line with the potentially often dirtied flow table. Signed-off-by: Eric Dumazet Fixes: 567e4b79731c ("net: rfs: add hash collision detection") Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 3 ++- net/core/sysctl_net_core.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ab3b7cef4638..d115256ed5a2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -653,7 +653,8 @@ struct rps_dev_flow_table { */ struct rps_sock_flow_table { u32 mask; - u32 ents[0]; + + u32 ents[0] ____cacheline_aligned_in_smp; }; #define RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num])) diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 7a31be5e361f..eaa51ddf2368 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -52,7 +52,7 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, if (write) { if (size) { - if (size > 1<<30) { + if (size > 1<<29) { /* Enforce limit to prevent overflow */ mutex_unlock(&sock_flow_mutex); return -EINVAL; -- cgit v1.2.3 From 505936f59f1e4cd0ff92ae5abc7aae64fb74dbdb Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 8 Feb 2015 16:00:01 -0500 Subject: SUNRPC: Cleanup to remove remaining uses of XPRT_CONNECTION_ABORT Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 1 - net/sunrpc/xprtsock.c | 3 --- 2 files changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 86af854338b5..ae39d478a272 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -360,7 +360,6 @@ void xprt_unlock_connect(struct rpc_xprt *, void *); #define XPRT_BOUND (4) #define XPRT_BINDING (5) #define XPRT_CLOSING (6) -#define XPRT_CONNECTION_ABORT (7) #define XPRT_CONNECTION_CLOSE (8) #define XPRT_CONGESTED (9) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index c65f74019288..2f8db3499a17 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -804,7 +804,6 @@ static void xs_error_report(struct sock *sk) static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt) { smp_mb__before_atomic(); - clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); clear_bit(XPRT_CONNECTION_CLOSE, &xprt->state); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); clear_bit(XPRT_CLOSING, &xprt->state); @@ -1904,7 +1903,6 @@ static int xs_local_setup_socket(struct sock_xprt *transport) struct socket *sock; int status = -EIO; - clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); status = __sock_create(xprt->xprt_net, AF_LOCAL, SOCK_STREAM, 0, &sock, 1); if (status < 0) { @@ -2149,7 +2147,6 @@ static void xs_tcp_setup_socket(struct work_struct *work) int status = -EIO; if (!sock) { - clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); sock = xs_create_sock(xprt, transport, xs_addr(xprt)->sa_family, SOCK_STREAM, IPPROTO_TCP, true); -- cgit v1.2.3 From 9e2b9f37760e129cee053cc7b6e7288acc2a7134 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 8 Feb 2015 19:21:27 -0500 Subject: SUNRPC: Remove the redundant XPRT_CONNECTION_CLOSE flag Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 1 - net/sunrpc/xprt.c | 1 - net/sunrpc/xprtsock.c | 1 - 3 files changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index ae39d478a272..8b93ef53df3c 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -360,7 +360,6 @@ void xprt_unlock_connect(struct rpc_xprt *, void *); #define XPRT_BOUND (4) #define XPRT_BINDING (5) #define XPRT_CLOSING (6) -#define 
XPRT_CONNECTION_CLOSE (8) #define XPRT_CONGESTED (9) static inline void xprt_set_connected(struct rpc_xprt *xprt) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index ff3574df8344..e3015aede0d9 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -683,7 +683,6 @@ xprt_init_autodisconnect(unsigned long data) if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) goto out_abort; spin_unlock(&xprt->transport_lock); - set_bit(XPRT_CONNECTION_CLOSE, &xprt->state); queue_work(rpciod_workqueue, &xprt->task_cleanup); return; out_abort: diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 0279e8ffb14a..c72b13e2bdf5 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -804,7 +804,6 @@ static void xs_error_report(struct sock *sk) static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt) { smp_mb__before_atomic(); - clear_bit(XPRT_CONNECTION_CLOSE, &xprt->state); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); clear_bit(XPRT_CLOSING, &xprt->state); smp_mb__after_atomic(); -- cgit v1.2.3 From dbf9782c1078c537831201c73ac60c9623ae9370 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 16 Dec 2014 18:44:36 -0500 Subject: dm: remove exports for request-based interfaces without external callers Remove exports for dm_dispatch_request, dm_requeue_unmapped_request, and dm_kill_unmapped_request. Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 9 +++------ include/linux/device-mapper.h | 3 --- 2 files changed, 3 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 71e6b73fe78d..5920a9af7bd6 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1062,7 +1062,7 @@ static void dm_unprep_request(struct request *rq) /* * Requeue the original request of a clone. */ -void dm_requeue_unmapped_request(struct request *clone) +static void dm_requeue_unmapped_request(struct request *clone) { int rw = rq_data_dir(clone); struct dm_rq_target_io *tio = clone->end_io_data; @@ -1079,7 +1079,6 @@ void dm_requeue_unmapped_request(struct request *clone) rq_completed(md, rw, 0); } -EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request); static void __stop_queue(struct request_queue *q) { @@ -1177,7 +1176,7 @@ static void dm_complete_request(struct request *clone, int error) * Target's rq_end_io() function isn't called. * This may be used when the target's map_rq() function fails. 
*/ -void dm_kill_unmapped_request(struct request *clone, int error) +static void dm_kill_unmapped_request(struct request *clone, int error) { struct dm_rq_target_io *tio = clone->end_io_data; struct request *rq = tio->orig; @@ -1185,7 +1184,6 @@ void dm_kill_unmapped_request(struct request *clone, int error) rq->cmd_flags |= REQ_FAILED; dm_complete_request(clone, error); } -EXPORT_SYMBOL_GPL(dm_kill_unmapped_request); /* * Called with the queue lock held @@ -1686,7 +1684,7 @@ static void dm_request(struct request_queue *q, struct bio *bio) _dm_request(q, bio); } -void dm_dispatch_request(struct request *rq) +static void dm_dispatch_request(struct request *rq) { int r; @@ -1698,7 +1696,6 @@ void dm_dispatch_request(struct request *rq) if (r) dm_complete_request(rq, r); } -EXPORT_SYMBOL_GPL(dm_dispatch_request); static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, void *data) diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index ca6d2acc5eb7..19296fba58e8 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -600,9 +600,6 @@ static inline unsigned long to_bytes(sector_t n) /*----------------------------------------------------------------- * Helper for block layer and dm core operations *---------------------------------------------------------------*/ -void dm_dispatch_request(struct request *rq); -void dm_requeue_unmapped_request(struct request *rq); -void dm_kill_unmapped_request(struct request *rq, int error); int dm_underlying_device_busy(struct request_queue *q); #endif /* _LINUX_DEVICE_MAPPER_H */ -- cgit v1.2.3 From e5863d9ad754926e7d3f38b43ac8bd48ef73b097 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 17 Dec 2014 21:08:12 -0500 Subject: dm: allocate requests in target when stacking on blk-mq devices For blk-mq request-based DM the responsibility of allocating a cloned request is transfered from DM core to the target type. Doing so enables the cloned request to be allocated from the appropriate blk-mq request_queue's pool (only the DM target, e.g. multipath, can know which block device to send a given cloned request to). Care was taken to preserve compatibility with old-style block request completion that requires request-based DM _not_ acquire the clone request's queue lock in the completion path. As such, there are now 2 different request-based DM target_type interfaces: 1) the original .map_rq() interface will continue to be used for non-blk-mq devices -- the preallocated clone request is passed in from DM core. 2) a new .clone_and_map_rq() and .release_clone_rq() will be used for blk-mq devices -- blk_get_request() and blk_put_request() are used respectively from these hooks. dm_table_set_type() was updated to detect if the request-based target is being stacked on blk-mq devices, if so DM_TYPE_MQ_REQUEST_BASED is set. DM core disallows switching the DM table's type after it is set. This means that there is no mixing of non-blk-mq and blk-mq devices within the same request-based DM table. 
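To make the new contract concrete before the full diff, here is a minimal sketch of the two request-based hooks a target now implements. Names prefixed example_ are hypothetical (example_pick_queue stands in for whatever path/device selection the target does); the authoritative implementation is the multipath conversion in the diff below.

/*
 * Sketch only: the old .map_rq interface receives a clone that DM core
 * preallocated, while the new blk-mq pair .clone_and_map_rq and
 * .release_clone_rq makes the target allocate and free the clone from
 * the underlying device's own request_queue.
 */
static int example_map_rq(struct dm_target *ti, struct request *clone,
			  union map_info *map_context)
{
	/* Old-style: finish setting up the clone DM core handed us. */
	clone->q = example_pick_queue(ti);	/* hypothetical helper */
	return DM_MAPIO_REMAPPED;
}

static int example_clone_and_map_rq(struct dm_target *ti, struct request *rq,
				    union map_info *map_context,
				    struct request **clone)
{
	/* blk-mq style: allocate the clone from the chosen queue itself. */
	*clone = blk_get_request(example_pick_queue(ti),
				 rq_data_dir(rq), GFP_KERNEL);
	if (IS_ERR(*clone))
		return DM_MAPIO_REQUEUE;	/* e.g. -ENOMEM: requeue and retry */
	return DM_MAPIO_REMAPPED;
}

static void example_release_clone_rq(struct request *clone)
{
	blk_put_request(clone);
}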
[This patch was started by Keith and later heavily modified by Mike] Tested-by: Bart Van Assche Signed-off-by: Keith Busch Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 51 ++++++++++++++++--- drivers/md/dm-table.c | 34 +++++++++++-- drivers/md/dm-target.c | 15 +++++- drivers/md/dm.c | 114 +++++++++++++++++++++++++++++++----------- drivers/md/dm.h | 8 +-- include/linux/device-mapper.h | 7 +++ include/uapi/linux/dm-ioctl.h | 4 +- 7 files changed, 185 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 2552b88f8953..863fc8c1ac06 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -11,6 +11,7 @@ #include "dm-path-selector.h" #include "dm-uevent.h" +#include #include #include #include @@ -378,12 +379,13 @@ static int __must_push_back(struct multipath *m) /* * Map cloned requests */ -static int multipath_map(struct dm_target *ti, struct request *clone, - union map_info *map_context) +static int __multipath_map(struct dm_target *ti, struct request *clone, + union map_info *map_context, + struct request *rq, struct request **__clone) { struct multipath *m = (struct multipath *) ti->private; int r = DM_MAPIO_REQUEUE; - size_t nr_bytes = blk_rq_bytes(clone); + size_t nr_bytes = clone ? blk_rq_bytes(clone) : blk_rq_bytes(rq); struct pgpath *pgpath; struct block_device *bdev; struct dm_mpath_io *mpio; @@ -416,12 +418,25 @@ static int multipath_map(struct dm_target *ti, struct request *clone, bdev = pgpath->path.dev->bdev; - clone->q = bdev_get_queue(bdev); - clone->rq_disk = bdev->bd_disk; - clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; - spin_unlock_irq(&m->lock); + if (clone) { + /* Old request-based interface: allocated clone is passed in */ + clone->q = bdev_get_queue(bdev); + clone->rq_disk = bdev->bd_disk; + clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; + } else { + /* blk-mq request-based interface */ + *__clone = blk_get_request(bdev_get_queue(bdev), + rq_data_dir(rq), GFP_KERNEL); + if (IS_ERR(*__clone)) + /* ENOMEM, requeue */ + return r; + (*__clone)->bio = (*__clone)->biotail = NULL; + (*__clone)->rq_disk = bdev->bd_disk; + (*__clone)->cmd_flags |= REQ_FAILFAST_TRANSPORT; + } + if (pgpath->pg->ps.type->start_io) pgpath->pg->ps.type->start_io(&pgpath->pg->ps, &pgpath->path, @@ -434,6 +449,24 @@ out_unlock: return r; } +static int multipath_map(struct dm_target *ti, struct request *clone, + union map_info *map_context) +{ + return __multipath_map(ti, clone, map_context, NULL, NULL); +} + +static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, + union map_info *map_context, + struct request **clone) +{ + return __multipath_map(ti, NULL, map_context, rq, clone); +} + +static void multipath_release_clone(struct request *clone) +{ + blk_put_request(clone); +} + /* * If we run out of usable paths, should we queue I/O or error it? 
*/ @@ -1670,11 +1703,13 @@ out: *---------------------------------------------------------------*/ static struct target_type multipath_target = { .name = "multipath", - .version = {1, 7, 0}, + .version = {1, 8, 0}, .module = THIS_MODULE, .ctr = multipath_ctr, .dtr = multipath_dtr, .map_rq = multipath_map, + .clone_and_map_rq = multipath_clone_and_map, + .release_clone_rq = multipath_release_clone, .rq_end_io = multipath_end_io, .presuspend = multipath_presuspend, .postsuspend = multipath_postsuspend, diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 3afae9e062f8..2d7e373955f3 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -827,6 +827,7 @@ static int dm_table_set_type(struct dm_table *t) { unsigned i; unsigned bio_based = 0, request_based = 0, hybrid = 0; + bool use_blk_mq = false; struct dm_target *tgt; struct dm_dev_internal *dd; struct list_head *devices; @@ -872,11 +873,26 @@ static int dm_table_set_type(struct dm_table *t) /* Non-request-stackable devices can't be used for request-based dm */ devices = dm_table_get_devices(t); list_for_each_entry(dd, devices, list) { - if (!blk_queue_stackable(bdev_get_queue(dd->dm_dev->bdev))) { - DMWARN("table load rejected: including" - " non-request-stackable devices"); + struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev); + + if (!blk_queue_stackable(q)) { + DMERR("table load rejected: including" + " non-request-stackable devices"); return -EINVAL; } + + if (q->mq_ops) + use_blk_mq = true; + } + + if (use_blk_mq) { + /* verify _all_ devices in the table are blk-mq devices */ + list_for_each_entry(dd, devices, list) + if (!bdev_get_queue(dd->dm_dev->bdev)->mq_ops) { + DMERR("table load rejected: not all devices" + " are blk-mq request-stackable"); + return -EINVAL; + } } /* @@ -890,7 +906,7 @@ static int dm_table_set_type(struct dm_table *t) return -EINVAL; } - t->type = DM_TYPE_REQUEST_BASED; + t->type = !use_blk_mq ? 
DM_TYPE_REQUEST_BASED : DM_TYPE_MQ_REQUEST_BASED; return 0; } @@ -907,7 +923,15 @@ struct target_type *dm_table_get_immutable_target_type(struct dm_table *t) bool dm_table_request_based(struct dm_table *t) { - return dm_table_get_type(t) == DM_TYPE_REQUEST_BASED; + unsigned table_type = dm_table_get_type(t); + + return (table_type == DM_TYPE_REQUEST_BASED || + table_type == DM_TYPE_MQ_REQUEST_BASED); +} + +bool dm_table_mq_request_based(struct dm_table *t) +{ + return dm_table_get_type(t) == DM_TYPE_MQ_REQUEST_BASED; } static int dm_table_alloc_md_mempools(struct dm_table *t) diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index 242e3cec397a..925ec1b15e75 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c @@ -137,13 +137,26 @@ static int io_err_map_rq(struct dm_target *ti, struct request *clone, return -EIO; } +static int io_err_clone_and_map_rq(struct dm_target *ti, struct request *rq, + union map_info *map_context, + struct request **clone) +{ + return -EIO; +} + +static void io_err_release_clone_rq(struct request *clone) +{ +} + static struct target_type error_target = { .name = "error", - .version = {1, 2, 0}, + .version = {1, 3, 0}, .ctr = io_err_ctr, .dtr = io_err_dtr, .map = io_err_map, .map_rq = io_err_map_rq, + .clone_and_map_rq = io_err_clone_and_map_rq, + .release_clone_rq = io_err_release_clone_rq, }; int __init dm_target_init(void) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index ae1219893948..549b815999a1 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1044,7 +1044,10 @@ static void free_rq_clone(struct request *clone) struct dm_rq_target_io *tio = clone->end_io_data; blk_rq_unprep_clone(clone); - free_clone_request(tio->md, clone); + if (clone->q && clone->q->mq_ops) + tio->ti->type->release_clone_rq(clone); + else + free_clone_request(tio->md, clone); free_rq_tio(tio); } @@ -1086,7 +1089,8 @@ static void dm_unprep_request(struct request *rq) rq->special = NULL; rq->cmd_flags &= ~REQ_DONTPREP; - free_rq_clone(clone); + if (clone) + free_rq_clone(clone); } /* @@ -1185,6 +1189,13 @@ static void dm_softirq_done(struct request *rq) struct dm_rq_target_io *tio = rq->special; struct request *clone = tio->clone; + if (!clone) { + blk_end_request_all(rq, tio->error); + rq_completed(tio->md, rq_data_dir(rq), false); + free_rq_tio(tio); + return; + } + if (rq->cmd_flags & REQ_FAILED) mapped = false; @@ -1207,7 +1218,7 @@ static void dm_complete_request(struct request *rq, int error) * Complete the not-mapped clone and the original request with the error status * through softirq context. * Target's rq_end_io() function isn't called. - * This may be used when the target's map_rq() function fails. + * This may be used when the target's map_rq() or clone_and_map_rq() functions fail. */ static void dm_kill_unmapped_request(struct request *rq, int error) { @@ -1222,13 +1233,15 @@ static void end_clone_request(struct request *clone, int error) { struct dm_rq_target_io *tio = clone->end_io_data; - /* - * For just cleaning up the information of the queue in which - * the clone was dispatched. - * The clone is *NOT* freed actually here because it is alloced from - * dm own mempool and REQ_ALLOCED isn't set in clone->cmd_flags. - */ - __blk_put_request(clone->q, clone); + if (!clone->q->mq_ops) { + /* + * For just cleaning up the information of the queue in which + * the clone was dispatched. + * The clone is *NOT* freed actually here because it is alloced + * from dm own mempool (REQ_ALLOCED isn't set). 
+ */ + __blk_put_request(clone->q, clone); + } /* * Actual request completion is done in a softirq context which doesn't @@ -1789,6 +1802,8 @@ static struct dm_rq_target_io *prep_tio(struct request *rq, struct mapped_device *md, gfp_t gfp_mask) { struct dm_rq_target_io *tio; + int srcu_idx; + struct dm_table *table; tio = alloc_rq_tio(md, gfp_mask); if (!tio) @@ -1802,10 +1817,15 @@ static struct dm_rq_target_io *prep_tio(struct request *rq, memset(&tio->info, 0, sizeof(tio->info)); init_kthread_work(&tio->work, map_tio_request); - if (!clone_rq(rq, md, tio, gfp_mask)) { - free_rq_tio(tio); - return NULL; + table = dm_get_live_table(md, &srcu_idx); + if (!dm_table_mq_request_based(table)) { + if (!clone_rq(rq, md, tio, gfp_mask)) { + dm_put_live_table(md, srcu_idx); + free_rq_tio(tio); + return NULL; + } } + dm_put_live_table(md, srcu_idx); return tio; } @@ -1835,17 +1855,36 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq) /* * Returns: - * 0 : the request has been processed (not requeued) - * !0 : the request has been requeued + * 0 : the request has been processed + * DM_MAPIO_REQUEUE : the original request needs to be requeued + * < 0 : the request was completed due to failure */ static int map_request(struct dm_target *ti, struct request *rq, struct mapped_device *md) { - int r, requeued = 0; + int r; struct dm_rq_target_io *tio = rq->special; - struct request *clone = tio->clone; + struct request *clone = NULL; + + if (tio->clone) { + clone = tio->clone; + r = ti->type->map_rq(ti, clone, &tio->info); + } else { + r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone); + if (r < 0) { + /* The target wants to complete the I/O */ + dm_kill_unmapped_request(rq, r); + return r; + } + if (IS_ERR(clone)) + return DM_MAPIO_REQUEUE; + if (setup_clone(clone, rq, tio, GFP_KERNEL)) { + /* -ENOMEM */ + ti->type->release_clone_rq(clone); + return DM_MAPIO_REQUEUE; + } + } - r = ti->type->map_rq(ti, clone, &tio->info); switch (r) { case DM_MAPIO_SUBMITTED: /* The target has taken the I/O to submit by itself later */ @@ -1859,7 +1898,6 @@ static int map_request(struct dm_target *ti, struct request *rq, case DM_MAPIO_REQUEUE: /* The target wants to requeue the I/O */ dm_requeue_unmapped_request(clone); - requeued = 1; break; default: if (r > 0) { @@ -1869,17 +1907,20 @@ static int map_request(struct dm_target *ti, struct request *rq, /* The target wants to complete the I/O */ dm_kill_unmapped_request(rq, r); - break; + return r; } - return requeued; + return 0; } static void map_tio_request(struct kthread_work *work) { struct dm_rq_target_io *tio = container_of(work, struct dm_rq_target_io, work); + struct request *rq = tio->orig; + struct mapped_device *md = tio->md; - map_request(tio->ti, tio->orig, tio->md); + if (map_request(tio->ti, rq, md) == DM_MAPIO_REQUEUE) + dm_requeue_unmapped_original_request(md, rq); } static void dm_start_request(struct mapped_device *md, struct request *orig) @@ -2459,6 +2500,14 @@ unsigned dm_get_md_type(struct mapped_device *md) return md->type; } +static bool dm_md_type_request_based(struct mapped_device *md) +{ + unsigned table_type = dm_get_md_type(md); + + return (table_type == DM_TYPE_REQUEST_BASED || + table_type == DM_TYPE_MQ_REQUEST_BASED); +} + struct target_type *dm_get_immutable_target_type(struct mapped_device *md) { return md->immutable_target_type; @@ -2511,8 +2560,7 @@ static int dm_init_request_based_queue(struct mapped_device *md) */ int dm_setup_md_queue(struct mapped_device *md) { - if ((dm_get_md_type(md) == 
DM_TYPE_REQUEST_BASED) && - !dm_init_request_based_queue(md)) { + if (dm_md_type_request_based(md) && !dm_init_request_based_queue(md)) { DMWARN("Cannot initialize queue for request-based mapped device"); return -EINVAL; } @@ -3184,27 +3232,35 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u { struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL); struct kmem_cache *cachep; - unsigned int pool_size; + unsigned int pool_size = 0; unsigned int front_pad; if (!pools) return NULL; - if (type == DM_TYPE_BIO_BASED) { + switch (type) { + case DM_TYPE_BIO_BASED: cachep = _io_cache; pool_size = dm_get_reserved_bio_based_ios(); front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone); - } else if (type == DM_TYPE_REQUEST_BASED) { - cachep = _rq_tio_cache; + break; + case DM_TYPE_REQUEST_BASED: pool_size = dm_get_reserved_rq_based_ios(); pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache); if (!pools->rq_pool) goto out; + /* fall through to setup remaining rq-based pools */ + case DM_TYPE_MQ_REQUEST_BASED: + cachep = _rq_tio_cache; + if (!pool_size) + pool_size = dm_get_reserved_rq_based_ios(); front_pad = offsetof(struct dm_rq_clone_bio_info, clone); /* per_bio_data_size is not used. See __bind_mempools(). */ WARN_ON(per_bio_data_size != 0); - } else + break; + default: goto out; + } pools->io_pool = mempool_create_slab_pool(pool_size, cachep); if (!pools->io_pool) diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 84b0f9e4ba6c..84d79784b866 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -34,9 +34,10 @@ /* * Type of table and mapped_device's mempool */ -#define DM_TYPE_NONE 0 -#define DM_TYPE_BIO_BASED 1 -#define DM_TYPE_REQUEST_BASED 2 +#define DM_TYPE_NONE 0 +#define DM_TYPE_BIO_BASED 1 +#define DM_TYPE_REQUEST_BASED 2 +#define DM_TYPE_MQ_REQUEST_BASED 3 /* * List of devices that a metadevice uses and should open/close. 
@@ -73,6 +74,7 @@ int dm_table_any_busy_target(struct dm_table *t); unsigned dm_table_get_type(struct dm_table *t); struct target_type *dm_table_get_immutable_target_type(struct dm_table *t); bool dm_table_request_based(struct dm_table *t); +bool dm_table_mq_request_based(struct dm_table *t); void dm_table_free_md_mempools(struct dm_table *t); struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 19296fba58e8..2646aed1d3fe 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -48,6 +48,11 @@ typedef void (*dm_dtr_fn) (struct dm_target *ti); typedef int (*dm_map_fn) (struct dm_target *ti, struct bio *bio); typedef int (*dm_map_request_fn) (struct dm_target *ti, struct request *clone, union map_info *map_context); +typedef int (*dm_clone_and_map_request_fn) (struct dm_target *ti, + struct request *rq, + union map_info *map_context, + struct request **clone); +typedef void (*dm_release_clone_request_fn) (struct request *clone); /* * Returns: @@ -143,6 +148,8 @@ struct target_type { dm_dtr_fn dtr; dm_map_fn map; dm_map_request_fn map_rq; + dm_clone_and_map_request_fn clone_and_map_rq; + dm_release_clone_request_fn release_clone_rq; dm_endio_fn end_io; dm_request_endio_fn rq_end_io; dm_presuspend_fn presuspend; diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h index a570d7b5796c..889f3a5b7b18 100644 --- a/include/uapi/linux/dm-ioctl.h +++ b/include/uapi/linux/dm-ioctl.h @@ -267,9 +267,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 29 +#define DM_VERSION_MINOR 30 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2014-10-28)" +#define DM_VERSION_EXTRA "-ioctl (2014-12-22)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ -- cgit v1.2.3 From 35f05dabf95ac3ebc4c15bafd6833f7a3046e66f Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 8 Feb 2015 11:49:34 +0200 Subject: IB/mlx4: Reset flow support for IB kernel ULPs The driver exposes interfaces that directly relate to HW state. Upon fatal error, consumers of these interfaces (ULPs) that rely on completion of all their posted work-request could hang, thereby introducing dependencies in shutdown order. To prevent this from happening, we manage the relevant resources (CQs, QPs) that are used by the device. Upon a fatal error, we now generate simulated completions for outstanding WQEs that were not completed at the time the HW was reset. It includes invoking the completion event handler for all involved CQs so that the ULPs will poll those CQs. When polled we return simulated CQEs with IB_WC_WR_FLUSH_ERR return code enabling ULPs to clean up their resources and not wait forever for completions upon receiving remove_one. The above change requires an extra check in the data path to make sure that when device is in error state, the simulated CQEs will be returned and no further WQEs will be posted. Signed-off-by: Yishai Hadas Signed-off-by: Or Gerlitz Signed-off-by: David S. 
Miller --- drivers/infiniband/hw/mlx4/cq.c | 57 ++++++++++++++++++++++++++++++++ drivers/infiniband/hw/mlx4/main.c | 64 ++++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/mlx4/mlx4_ib.h | 9 +++++ drivers/infiniband/hw/mlx4/qp.c | 59 +++++++++++++++++++++++++++++---- drivers/infiniband/hw/mlx4/srq.c | 8 +++++ include/linux/mlx4/device.h | 2 ++ 6 files changed, 193 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index a3b70f6c4035..543ecdd8667b 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -188,6 +188,8 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector spin_lock_init(&cq->lock); cq->resize_buf = NULL; cq->resize_umem = NULL; + INIT_LIST_HEAD(&cq->send_qp_list); + INIT_LIST_HEAD(&cq->recv_qp_list); if (context) { struct mlx4_ib_create_cq ucmd; @@ -594,6 +596,55 @@ static int use_tunnel_data(struct mlx4_ib_qp *qp, struct mlx4_ib_cq *cq, struct return 0; } +static void mlx4_ib_qp_sw_comp(struct mlx4_ib_qp *qp, int num_entries, + struct ib_wc *wc, int *npolled, int is_send) +{ + struct mlx4_ib_wq *wq; + unsigned cur; + int i; + + wq = is_send ? &qp->sq : &qp->rq; + cur = wq->head - wq->tail; + + if (cur == 0) + return; + + for (i = 0; i < cur && *npolled < num_entries; i++) { + wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + wc->status = IB_WC_WR_FLUSH_ERR; + wc->vendor_err = MLX4_CQE_SYNDROME_WR_FLUSH_ERR; + wq->tail++; + (*npolled)++; + wc->qp = &qp->ibqp; + wc++; + } +} + +static void mlx4_ib_poll_sw_comp(struct mlx4_ib_cq *cq, int num_entries, + struct ib_wc *wc, int *npolled) +{ + struct mlx4_ib_qp *qp; + + *npolled = 0; + /* Find uncompleted WQEs belonging to that cq and retrun + * simulated FLUSH_ERR completions + */ + list_for_each_entry(qp, &cq->send_qp_list, cq_send_list) { + mlx4_ib_qp_sw_comp(qp, num_entries, wc, npolled, 1); + if (*npolled >= num_entries) + goto out; + } + + list_for_each_entry(qp, &cq->recv_qp_list, cq_recv_list) { + mlx4_ib_qp_sw_comp(qp, num_entries, wc + *npolled, npolled, 0); + if (*npolled >= num_entries) + goto out; + } + +out: + return; +} + static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq, struct mlx4_ib_qp **cur_qp, struct ib_wc *wc) @@ -836,8 +887,13 @@ int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) unsigned long flags; int npolled; int err = 0; + struct mlx4_ib_dev *mdev = to_mdev(cq->ibcq.device); spin_lock_irqsave(&cq->lock, flags); + if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { + mlx4_ib_poll_sw_comp(cq, num_entries, wc, &npolled); + goto out; + } for (npolled = 0; npolled < num_entries; ++npolled) { err = mlx4_ib_poll_one(cq, &cur_qp, wc + npolled); @@ -847,6 +903,7 @@ int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) mlx4_cq_set_ci(&cq->mcq); +out: spin_unlock_irqrestore(&cq->lock, flags); if (err == 0 || err == -EAGAIN) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 3140da518a07..eb8e215f1613 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -2308,6 +2308,8 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) spin_lock_init(&ibdev->sm_lock); mutex_init(&ibdev->cap_mask_mutex); + INIT_LIST_HEAD(&ibdev->qp_list); + spin_lock_init(&ibdev->reset_flow_resource_lock); if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED && ib_num_ports) { @@ -2622,6 +2624,67 @@ out: return; } +static void 
mlx4_ib_handle_catas_error(struct mlx4_ib_dev *ibdev) +{ + struct mlx4_ib_qp *mqp; + unsigned long flags_qp; + unsigned long flags_cq; + struct mlx4_ib_cq *send_mcq, *recv_mcq; + struct list_head cq_notify_list; + struct mlx4_cq *mcq; + unsigned long flags; + + pr_warn("mlx4_ib_handle_catas_error was started\n"); + INIT_LIST_HEAD(&cq_notify_list); + + /* Go over qp list reside on that ibdev, sync with create/destroy qp.*/ + spin_lock_irqsave(&ibdev->reset_flow_resource_lock, flags); + + list_for_each_entry(mqp, &ibdev->qp_list, qps_list) { + spin_lock_irqsave(&mqp->sq.lock, flags_qp); + if (mqp->sq.tail != mqp->sq.head) { + send_mcq = to_mcq(mqp->ibqp.send_cq); + spin_lock_irqsave(&send_mcq->lock, flags_cq); + if (send_mcq->mcq.comp && + mqp->ibqp.send_cq->comp_handler) { + if (!send_mcq->mcq.reset_notify_added) { + send_mcq->mcq.reset_notify_added = 1; + list_add_tail(&send_mcq->mcq.reset_notify, + &cq_notify_list); + } + } + spin_unlock_irqrestore(&send_mcq->lock, flags_cq); + } + spin_unlock_irqrestore(&mqp->sq.lock, flags_qp); + /* Now, handle the QP's receive queue */ + spin_lock_irqsave(&mqp->rq.lock, flags_qp); + /* no handling is needed for SRQ */ + if (!mqp->ibqp.srq) { + if (mqp->rq.tail != mqp->rq.head) { + recv_mcq = to_mcq(mqp->ibqp.recv_cq); + spin_lock_irqsave(&recv_mcq->lock, flags_cq); + if (recv_mcq->mcq.comp && + mqp->ibqp.recv_cq->comp_handler) { + if (!recv_mcq->mcq.reset_notify_added) { + recv_mcq->mcq.reset_notify_added = 1; + list_add_tail(&recv_mcq->mcq.reset_notify, + &cq_notify_list); + } + } + spin_unlock_irqrestore(&recv_mcq->lock, + flags_cq); + } + } + spin_unlock_irqrestore(&mqp->rq.lock, flags_qp); + } + + list_for_each_entry(mcq, &cq_notify_list, reset_notify) { + mcq->comp(mcq); + } + spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags); + pr_warn("mlx4_ib_handle_catas_error ended\n"); +} + static void handle_bonded_port_state_event(struct work_struct *work) { struct ib_event_work *ew = @@ -2701,6 +2764,7 @@ static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr, case MLX4_DEV_EVENT_CATASTROPHIC_ERROR: ibdev->ib_active = false; ibev.event = IB_EVENT_DEVICE_FATAL; + mlx4_ib_handle_catas_error(ibdev); break; case MLX4_DEV_EVENT_PORT_MGMT_CHANGE: diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 721540c9163d..f829fd935b79 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -110,6 +110,9 @@ struct mlx4_ib_cq { struct mutex resize_mutex; struct ib_umem *umem; struct ib_umem *resize_umem; + /* List of qps that it serves.*/ + struct list_head send_qp_list; + struct list_head recv_qp_list; }; struct mlx4_ib_mr { @@ -300,6 +303,9 @@ struct mlx4_ib_qp { struct mlx4_roce_smac_vlan_info pri; struct mlx4_roce_smac_vlan_info alt; u64 reg_id; + struct list_head qps_list; + struct list_head cq_recv_list; + struct list_head cq_send_list; }; struct mlx4_ib_srq { @@ -535,6 +541,9 @@ struct mlx4_ib_dev { /* lock when destroying qp1_proxy and getting netdev events */ struct mutex qp1_proxy_lock[MLX4_MAX_PORTS]; u8 bond_next_port; + /* protect resources needed as part of reset flow */ + spinlock_t reset_flow_resource_lock; + struct list_head qp_list; }; struct ib_event_work { diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 792f9dc86ada..dfc6ca128a7e 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -46,6 +46,11 @@ #include "mlx4_ib.h" #include "user.h" +static void mlx4_ib_lock_cqs(struct 
mlx4_ib_cq *send_cq, + struct mlx4_ib_cq *recv_cq); +static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, + struct mlx4_ib_cq *recv_cq); + enum { MLX4_IB_ACK_REQ_FREQ = 8, }; @@ -618,6 +623,8 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, struct mlx4_ib_sqp *sqp; struct mlx4_ib_qp *qp; enum mlx4_ib_qp_type qp_type = (enum mlx4_ib_qp_type) init_attr->qp_type; + struct mlx4_ib_cq *mcq; + unsigned long flags; /* When tunneling special qps, we use a plain UD qp */ if (sqpn) { @@ -828,6 +835,24 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, qp->mqp.event = mlx4_ib_qp_event; if (!*caller_qp) *caller_qp = qp; + + spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); + mlx4_ib_lock_cqs(to_mcq(init_attr->send_cq), + to_mcq(init_attr->recv_cq)); + /* Maintain device to QPs access, needed for further handling + * via reset flow + */ + list_add_tail(&qp->qps_list, &dev->qp_list); + /* Maintain CQ to QPs access, needed for further handling + * via reset flow + */ + mcq = to_mcq(init_attr->send_cq); + list_add_tail(&qp->cq_send_list, &mcq->send_qp_list); + mcq = to_mcq(init_attr->recv_cq); + list_add_tail(&qp->cq_recv_list, &mcq->recv_qp_list); + mlx4_ib_unlock_cqs(to_mcq(init_attr->send_cq), + to_mcq(init_attr->recv_cq)); + spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); return 0; err_qpn: @@ -886,13 +911,13 @@ static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv __acquires(&send_cq->lock) __acquires(&recv_cq->lock) { if (send_cq == recv_cq) { - spin_lock_irq(&send_cq->lock); + spin_lock(&send_cq->lock); __acquire(&recv_cq->lock); } else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { - spin_lock_irq(&send_cq->lock); + spin_lock(&send_cq->lock); spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING); } else { - spin_lock_irq(&recv_cq->lock); + spin_lock(&recv_cq->lock); spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING); } } @@ -902,13 +927,13 @@ static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *re { if (send_cq == recv_cq) { __release(&recv_cq->lock); - spin_unlock_irq(&send_cq->lock); + spin_unlock(&send_cq->lock); } else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { spin_unlock(&recv_cq->lock); - spin_unlock_irq(&send_cq->lock); + spin_unlock(&send_cq->lock); } else { spin_unlock(&send_cq->lock); - spin_unlock_irq(&recv_cq->lock); + spin_unlock(&recv_cq->lock); } } @@ -953,6 +978,7 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, int is_user) { struct mlx4_ib_cq *send_cq, *recv_cq; + unsigned long flags; if (qp->state != IB_QPS_RESET) { if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state), @@ -984,8 +1010,13 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, get_cqs(qp, &send_cq, &recv_cq); + spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); mlx4_ib_lock_cqs(send_cq, recv_cq); + /* del from lists under both locks above to protect reset flow paths */ + list_del(&qp->qps_list); + list_del(&qp->cq_send_list); + list_del(&qp->cq_recv_list); if (!is_user) { __mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn, qp->ibqp.srq ? 
to_msrq(qp->ibqp.srq): NULL); @@ -996,6 +1027,7 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, mlx4_qp_remove(dev->dev, &qp->mqp); mlx4_ib_unlock_cqs(send_cq, recv_cq); + spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); mlx4_qp_free(dev->dev, &qp->mqp); @@ -2618,8 +2650,15 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, __be32 uninitialized_var(lso_hdr_sz); __be32 blh; int i; + struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); spin_lock_irqsave(&qp->sq.lock, flags); + if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { + err = -EIO; + *bad_wr = wr; + nreq = 0; + goto out; + } ind = qp->sq_next_wqe; @@ -2917,10 +2956,18 @@ int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, int ind; int max_gs; int i; + struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); max_gs = qp->rq.max_gs; spin_lock_irqsave(&qp->rq.lock, flags); + if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { + err = -EIO; + *bad_wr = wr; + nreq = 0; + goto out; + } + ind = qp->rq.head & (qp->rq.wqe_cnt - 1); for (nreq = 0; wr; ++nreq, wr = wr->next) { diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c index 62d9285300af..dce5dfe3a70e 100644 --- a/drivers/infiniband/hw/mlx4/srq.c +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -316,8 +316,15 @@ int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, int err = 0; int nreq; int i; + struct mlx4_ib_dev *mdev = to_mdev(ibsrq->device); spin_lock_irqsave(&srq->lock, flags); + if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { + err = -EIO; + *bad_wr = wr; + nreq = 0; + goto out; + } for (nreq = 0; wr; ++nreq, wr = wr->next) { if (unlikely(wr->num_sge > srq->msrq.max_gs)) { @@ -362,6 +369,7 @@ int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, *srq->db.db = cpu_to_be32(srq->wqe_ctr); } +out: spin_unlock_irqrestore(&srq->lock, flags); diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index c116cb02475c..e4ebff7e9d02 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -689,6 +689,8 @@ struct mlx4_cq { void (*comp)(struct mlx4_cq *); void *priv; } tasklet_ctx; + int reset_notify_added; + struct list_head reset_notify; }; struct mlx4_qp { -- cgit v1.2.3 From 3cf385713460eb2bb4cb7ceb8ed89833b00b594b Mon Sep 17 00:00:00 2001 From: Antonios Motakis Date: Tue, 6 Jan 2015 11:15:11 +0100 Subject: ARM: 8256/1: driver coamba: add device binding path 'driver_override' As already demonstrated with PCI [1] and the platform bus [2], a driver_override property in sysfs can be used to bypass the id matching of a device to a AMBA driver. This can be used by VFIO to bind to any AMBA device requested by the user. 
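Schematically, the matching rule this adds is the same two-line pattern PCI and the platform bus already use; the actual amba_match() change appears in the diff below:

	/* When an override is set, only the named driver may bind. */
	if (pcdev->driver_override)
		return !strcmp(pcdev->driver_override, drv->name);
	/* Otherwise fall through to ordinary id-table matching. */

From userspace the override is driven through sysfs, per the ABI document added below: echo vfio-amba > /sys/bus/amba/devices/.../driver_override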
[1] http://lists-archives.com/linux-kernel/28030441-pci-introduce-new-device-binding-path-using-pci_dev-driver_override.html [2] https://www.redhat.com/archives/libvir-list/2014-April/msg00382.html Signed-off-by: Antonios Motakis Reviewed-by: Kim Phillips Signed-off-by: Russell King --- Documentation/ABI/testing/sysfs-bus-amba | 20 ++++++++++++++ drivers/amba/bus.c | 47 ++++++++++++++++++++++++++++++++ include/linux/amba/bus.h | 1 + 3 files changed, 68 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-bus-amba (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-bus-amba b/Documentation/ABI/testing/sysfs-bus-amba new file mode 100644 index 000000000000..e7b54677cfbe --- /dev/null +++ b/Documentation/ABI/testing/sysfs-bus-amba @@ -0,0 +1,20 @@ +What: /sys/bus/amba/devices/.../driver_override +Date: September 2014 +Contact: Antonios Motakis +Description: + This file allows the driver for a device to be specified which + will override standard OF, ACPI, ID table, and name matching. + When specified, only a driver with a name matching the value + written to driver_override will have an opportunity to bind to + the device. The override is specified by writing a string to the + driver_override file (echo vfio-amba > driver_override) and may + be cleared with an empty string (echo > driver_override). + This returns the device to standard matching rules binding. + Writing to driver_override does not automatically unbind the + device from its current driver or make any attempt to + automatically load the specified driver. If no driver with a + matching name is currently loaded in the kernel, the device will + not bind to any driver. This also allows devices to opt-out of + driver binding using a driver_override name such as "none". + Only a single driver may be specified in the override, there is + no support for parsing delimiters. diff --git a/drivers/amba/bus.c b/drivers/amba/bus.c index 52ddd9fbb55e..f0099360039e 100644 --- a/drivers/amba/bus.c +++ b/drivers/amba/bus.c @@ -18,6 +18,7 @@ #include #include #include +#include #include @@ -43,6 +44,10 @@ static int amba_match(struct device *dev, struct device_driver *drv) struct amba_device *pcdev = to_amba_device(dev); struct amba_driver *pcdrv = to_amba_driver(drv); + /* When driver_override is set, only bind to the matching driver */ + if (pcdev->driver_override) + return !strcmp(pcdev->driver_override, drv->name); + return amba_lookup(pcdrv->id_table, pcdev) != NULL; } @@ -59,6 +64,47 @@ static int amba_uevent(struct device *dev, struct kobj_uevent_env *env) return retval; } +static ssize_t driver_override_show(struct device *_dev, + struct device_attribute *attr, char *buf) +{ + struct amba_device *dev = to_amba_device(_dev); + + if (!dev->driver_override) + return 0; + + return sprintf(buf, "%s\n", dev->driver_override); +} + +static ssize_t driver_override_store(struct device *_dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct amba_device *dev = to_amba_device(_dev); + char *driver_override, *old = dev->driver_override, *cp; + + if (count > PATH_MAX) + return -EINVAL; + + driver_override = kstrndup(buf, count, GFP_KERNEL); + if (!driver_override) + return -ENOMEM; + + cp = strchr(driver_override, '\n'); + if (cp) + *cp = '\0'; + + if (strlen(driver_override)) { + dev->driver_override = driver_override; + } else { + kfree(driver_override); + dev->driver_override = NULL; + } + + kfree(old); + + return count; +} + #define amba_attr_func(name,fmt,arg...) 
\ static ssize_t name##_show(struct device *_dev, \ struct device_attribute *attr, char *buf) \ @@ -81,6 +127,7 @@ amba_attr_func(resource, "\t%016llx\t%016llx\t%016lx\n", static struct device_attribute amba_dev_attrs[] = { __ATTR_RO(id), __ATTR_RO(resource), + __ATTR_RW(driver_override), __ATTR_NULL, }; diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h index 0ab5f8e0dea2..50fc66868402 100644 --- a/include/linux/amba/bus.h +++ b/include/linux/amba/bus.h @@ -33,6 +33,7 @@ struct amba_device { struct clk *pclk; unsigned int periphid; unsigned int irq[AMBA_NR_IRQS]; + char *driver_override; }; struct amba_driver { -- cgit v1.2.3 From 13060b64b819c194909121b90b5f8dd9abb5ea4e Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 6 Feb 2015 15:05:07 -0700 Subject: vfio: Add and use device request op for vfio bus drivers When a request is made to unbind a device from a vfio bus driver, we need to wait for the device to become unused, ie. for userspace to release the device. However, we have a long standing TODO in the code to do something proactive to make that happen. To enable this, we add a request callback on the vfio bus driver struct, which is intended to signal the user through the vfio device interface to release the device. Instead of passively waiting for the device to become unused, we can now pester the user to give it up. Signed-off-by: Alex Williamson --- drivers/vfio/vfio.c | 24 ++++++++++++++++++++++-- include/linux/vfio.h | 2 ++ 2 files changed, 24 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 13e0f39d91e0..4cde85501444 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -710,6 +710,7 @@ void *vfio_del_group_dev(struct device *dev) struct vfio_group *group = device->group; void *device_data = device->device_data; struct vfio_unbound_dev *unbound; + unsigned int i = 0; /* * The group exists so long as we have a device reference. Get @@ -737,8 +738,27 @@ void *vfio_del_group_dev(struct device *dev) vfio_device_put(device); - /* TODO send a signal to encourage this to be released */ - wait_event(vfio.release_q, !vfio_dev_present(group, dev)); + /* + * If the device is still present in the group after the above + * 'put', then it is in use and we need to request it from the + * bus driver. The driver may in turn need to request the + * device from the user. We send the request on an arbitrary + * interval with counter to allow the driver to take escalating + * measures to release the device if it has the ability to do so. 
+ */ + do { + device = vfio_group_get_device(group, dev); + if (!device) + break; + + if (device->ops->request) + device->ops->request(device_data, i++); + + vfio_device_put(device); + + } while (wait_event_interruptible_timeout(vfio.release_q, + !vfio_dev_present(group, dev), + HZ * 10) <= 0); vfio_group_put(group); diff --git a/include/linux/vfio.h b/include/linux/vfio.h index d3204115f15d..2d67b8998fd8 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -26,6 +26,7 @@ * @ioctl: Perform ioctl(2) on device file descriptor, supporting VFIO_DEVICE_* * operations documented below * @mmap: Perform mmap(2) on a region of the device file descriptor + * @request: Request for the bus driver to release the device */ struct vfio_device_ops { char *name; @@ -38,6 +39,7 @@ struct vfio_device_ops { long (*ioctl)(void *device_data, unsigned int cmd, unsigned long arg); int (*mmap)(void *device_data, struct vm_area_struct *vma); + void (*request)(void *device_data, unsigned int count); }; extern int vfio_add_group_dev(struct device *dev, -- cgit v1.2.3 From 201f201c33220f53856fd300e1990b779538d67f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 10 Feb 2015 13:31:34 -0700 Subject: blk-mq: make blk_mq_run_queues() static We no longer use it outside of blk-mq.c, so we can make it static and stop exporting it. Additionally, kill the 'async' argument, as there's only one used of it. Signed-off-by: Jens Axboe --- block/blk-mq.c | 9 ++++----- include/linux/blk-mq.h | 1 - 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq.c b/block/blk-mq.c index eb8e694fda06..1e4d4599d9c5 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -33,6 +33,7 @@ static DEFINE_MUTEX(all_q_mutex); static LIST_HEAD(all_q_list); static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx); +static void blk_mq_run_queues(struct request_queue *q); /* * Check if any of the ctx's have pending work in this hardware queue @@ -117,7 +118,7 @@ void blk_mq_freeze_queue_start(struct request_queue *q) if (freeze) { percpu_ref_kill(&q->mq_usage_counter); - blk_mq_run_queues(q, false); + blk_mq_run_queues(q); } } EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start); @@ -853,7 +854,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) &hctx->run_work, 0); } -void blk_mq_run_queues(struct request_queue *q, bool async) +static void blk_mq_run_queues(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; int i; @@ -864,10 +865,9 @@ void blk_mq_run_queues(struct request_queue *q, bool async) test_bit(BLK_MQ_S_STOPPED, &hctx->state)) continue; - blk_mq_run_hw_queue(hctx, async); + blk_mq_run_hw_queue(hctx, false); } } -EXPORT_SYMBOL(blk_mq_run_queues); void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) { @@ -905,7 +905,6 @@ void blk_mq_start_hw_queues(struct request_queue *q) } EXPORT_SYMBOL(blk_mq_start_hw_queues); - void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) { struct blk_mq_hw_ctx *hctx; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 86b08b1a5eba..ac6c7f534de1 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -175,7 +175,6 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set); void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); void blk_mq_insert_request(struct request *, bool, bool, bool); -void blk_mq_run_queues(struct request_queue *q, bool async); void blk_mq_free_request(struct request *rq); void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *, struct request *rq); bool 
blk_mq_can_queue(struct blk_mq_hw_ctx *); -- cgit v1.2.3 From 6ee8e25fc3e916193bce4ebb43d5439e1e2144ab Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 10 Feb 2015 14:08:32 -0800 Subject: fsnotify: fix handling of renames in audit Commit e9fd702a58c4 ("audit: convert audit watches to use fsnotify instead of inotify") broke handling of renames in audit. Audit code wants to update inode number of an inode corresponding to watched name in a directory. When something gets renamed into a directory to a watched name, inotify previously passed moved inode to audit code however new fsnotify code passes directory inode where the change happened. That confuses audit and it starts watching parent directory instead of a file in a directory. This can be observed for example by doing: cd /tmp touch foo bar auditctl -w /tmp/foo touch foo mv bar foo touch foo In audit log we see events like: type=CONFIG_CHANGE msg=audit(1423563584.155:90): auid=1000 ses=2 op="updated rules" path="/tmp/foo" key=(null) list=4 res=1 ... type=PATH msg=audit(1423563584.155:91): item=2 name="bar" inode=1046884 dev=08:0 2 mode=0100644 ouid=0 ogid=0 rdev=00:00 nametype=DELETE type=PATH msg=audit(1423563584.155:91): item=3 name="foo" inode=1046842 dev=08:0 2 mode=0100644 ouid=0 ogid=0 rdev=00:00 nametype=DELETE type=PATH msg=audit(1423563584.155:91): item=4 name="foo" inode=1046884 dev=08:0 2 mode=0100644 ouid=0 ogid=0 rdev=00:00 nametype=CREATE ... and that's it - we see event for the first touch after creating the audit rule, we see events for rename but we don't see any event for the last touch. However we start seeing events for unrelated stuff happening in /tmp. Fix the problem by passing moved inode as data in the FS_MOVED_FROM and FS_MOVED_TO events instead of the directory where the change happens. This doesn't introduce any new problems because noone besides audit_watch.c cares about the passed value: fs/notify/fanotify/fanotify.c cares only about FSNOTIFY_EVENT_PATH events. fs/notify/dnotify/dnotify.c doesn't care about passed 'data' value at all. fs/notify/inotify/inotify_fsnotify.c uses 'data' only for FSNOTIFY_EVENT_PATH. kernel/audit_tree.c doesn't care about passed 'data' at all. kernel/audit_watch.c expects moved inode as 'data'. 
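The essence of the fix, pulled out of the diff below: the data argument passed to fsnotify() for the two rename events changes from the directory inode to the moved inode itself (source is the inode being renamed), e.g. for the old-directory event:

	/* Before: audit's handler saw the directory where the rename happened. */
	fsnotify(old_dir, old_dir_mask, old_dir, FSNOTIFY_EVENT_INODE, old_name, fs_cookie);
	/* After: audit's handler sees the inode that was actually moved. */
	fsnotify(old_dir, old_dir_mask, source, FSNOTIFY_EVENT_INODE, old_name, fs_cookie);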
Fixes: e9fd702a58c49db ("audit: convert audit watches to use fsnotify instead of inotify") Signed-off-by: Jan Kara Cc: Paul Moore Cc: Eric Paris Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fsnotify.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 1c804b057fb1..7ee1774edee5 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -101,8 +101,10 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir, new_dir_mask |= FS_ISDIR; } - fsnotify(old_dir, old_dir_mask, old_dir, FSNOTIFY_EVENT_INODE, old_name, fs_cookie); - fsnotify(new_dir, new_dir_mask, new_dir, FSNOTIFY_EVENT_INODE, new_name, fs_cookie); + fsnotify(old_dir, old_dir_mask, source, FSNOTIFY_EVENT_INODE, old_name, + fs_cookie); + fsnotify(new_dir, new_dir_mask, source, FSNOTIFY_EVENT_INODE, new_name, + fs_cookie); if (target) fsnotify_link_count(target); -- cgit v1.2.3 From ccaafd7fd039aebc9359a9799f8558b01f1c2adc Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 10 Feb 2015 14:09:35 -0800 Subject: mm: don't use compound_head() in virt_to_head_page() compound_head() is implemented with assumption that there would be race condition when checking tail flag. This assumption is only true when we try to access arbitrary positioned struct page. The situation that virt_to_head_page() is called is different case. We call virt_to_head_page() only in the range of allocated pages, so there is no race condition on tail flag. In this case, we don't need to handle race condition and we can reduce overhead slightly. This patch implements compound_head_fast() which is similar with compound_head() except tail flag race handling. And then, virt_to_head_page() uses this optimized function to improve performance. I saw 1.8% win in a fast-path loop over kmem_cache_alloc/free, (14.063 ns -> 13.810 ns) if target object is on tail page. Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index dd5ea3016fc4..2c6fd3c5424a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -446,6 +446,12 @@ static inline struct page *compound_head_by_tail(struct page *tail) return tail; } +/* + * Since either compound page could be dismantled asynchronously in THP + * or we access asynchronously arbitrary positioned struct page, there + * would be tail flag race. To handle this race, we should call + * smp_rmb() before checking tail flag. compound_head_by_tail() did it. + */ static inline struct page *compound_head(struct page *page) { if (unlikely(PageTail(page))) @@ -453,6 +459,18 @@ static inline struct page *compound_head(struct page *page) return page; } +/* + * If we access compound page synchronously such as access to + * allocated page, there is no need to handle tail flag race, so we can + * check tail flag directly without any synchronization primitive. 
+ */ +static inline struct page *compound_head_fast(struct page *page) +{ + if (unlikely(PageTail(page))) + return page->first_page; + return page; +} + /* * The atomic page->_mapcount, starts from -1: so that transitions * both from it and to it can be tracked, using atomic_inc_and_test @@ -531,7 +549,14 @@ static inline void get_page(struct page *page) static inline struct page *virt_to_head_page(const void *x) { struct page *page = virt_to_page(x); - return compound_head(page); + + /* + * We don't need to worry about synchronization of tail flag + * when we call virt_to_head_page() since it is only called for + * already allocated page and this page won't be freed until + * this virt_to_head_page() is finished. So use _fast variant. + */ + return compound_head_fast(page); } /* -- cgit v1.2.3 From c8d78c1823f46519473949d33f0d1d33fe21ea16 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:09:46 -0800 Subject: mm: replace remap_file_pages() syscall with emulation remap_file_pages(2) was invented to be able efficiently map parts of huge file into limited 32-bit virtual address space such as in database workloads. Nonlinear mappings are pain to support and it seems there's no legitimate use-cases nowadays since 64-bit systems are widely available. Let's drop it and get rid of all these special-cased code. The patch replaces the syscall with emulation which creates new VMA on each remap_file_pages(), unless they it can be merged with an adjacent one. I didn't find *any* real code that uses remap_file_pages(2) to test emulation impact on. I've checked Debian code search and source of all packages in ALT Linux. No real users: libc wrappers, mentions in strace, gdb, valgrind and this kind of stuff. There are few basic tests in LTP for the syscall. They work just fine with emulation. To test performance impact, I've written small test case which demonstrate pretty much worst case scenario: map 4G shmfs file, write to begin of every page pgoff of the page, remap pages in reverse order, read every page. The test creates 1 million of VMAs if emulation is in use, so I had to set vm.max_map_count to 1100000 to avoid -ENOMEM. Before: 23.3 ( +- 4.31% ) seconds After: 43.9 ( +- 0.85% ) seconds Slowdown: 1.88x I believe we can live with that. Test case: #define _GNU_SOURCE #include #include #include #include #define MB (1024UL * 1024) #define SIZE (4096 * MB) int main(int argc, char **argv) { unsigned long *p; long i, pass; for (pass = 0; pass < 10; pass++) { p = mmap(NULL, SIZE, PROT_READ|PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); if (p == MAP_FAILED) { perror("mmap"); return -1; } for (i = 0; i < SIZE / 4096; i++) p[i * 4096 / sizeof(*p)] = i; for (i = 0; i < SIZE / 4096; i++) { if (remap_file_pages(p + i * 4096 / sizeof(*p), 4096, 0, (SIZE - 4096 * (i + 1)) >> 12, 0)) { perror("remap_file_pages"); return -1; } } for (i = SIZE / 4096 - 1; i >= 0; i--) assert(p[i * 4096 / sizeof(*p)] == SIZE / 4096 - i - 1); munmap(p, SIZE); } return 0; } [akpm@linux-foundation.org: fix spello] [sasha.levin@oracle.com: initialize populate before usage] [sasha.levin@oracle.com: grab file ref to prevent race while mmaping] Signed-off-by: "Kirill A. 
Shutemov" Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Dave Jones Cc: Linus Torvalds Cc: Armin Rigo Signed-off-by: Sasha Levin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/remap_file_pages.txt | 7 +- include/linux/fs.h | 8 +- mm/Makefile | 2 +- mm/fremap.c | 283 ---------------------------------- mm/mmap.c | 69 +++++++++ mm/nommu.c | 8 - 6 files changed, 79 insertions(+), 298 deletions(-) delete mode 100644 mm/fremap.c (limited to 'include/linux') diff --git a/Documentation/vm/remap_file_pages.txt b/Documentation/vm/remap_file_pages.txt index 560e4363a55d..f609142f406a 100644 --- a/Documentation/vm/remap_file_pages.txt +++ b/Documentation/vm/remap_file_pages.txt @@ -18,10 +18,9 @@ on 32-bit systems to map files bigger than can linearly fit into 32-bit virtual address space. This use-case is not critical anymore since 64-bit systems are widely available. -The plan is to deprecate the syscall and replace it with an emulation. -The emulation will create new VMAs instead of nonlinear mappings. It's -going to work slower for rare users of remap_file_pages() but ABI is -preserved. +The syscall is deprecated and replaced it with an emulation now. The +emulation creates new VMAs instead of nonlinear mappings. It's going to +work slower for rare users of remap_file_pages() but ABI is preserved. One side effect of emulation (apart from performance) is that user can hit vm.max_map_count limit more easily due to additional VMAs. See comment for diff --git a/include/linux/fs.h b/include/linux/fs.h index 42efe13077b6..60c4996df7f3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2481,8 +2481,12 @@ extern int sb_min_blocksize(struct super_block *, int); extern int generic_file_mmap(struct file *, struct vm_area_struct *); extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); -extern int generic_file_remap_pages(struct vm_area_struct *, unsigned long addr, - unsigned long size, pgoff_t pgoff); +static inline int generic_file_remap_pages(struct vm_area_struct *vma, + unsigned long addr, unsigned long size, pgoff_t pgoff) +{ + BUG(); + return 0; +} int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk); extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); diff --git a/mm/Makefile b/mm/Makefile index 4bf586e66378..3548460ab7b6 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -3,7 +3,7 @@ # mmu-y := nommu.o -mmu-$(CONFIG_MMU) := fremap.o gup.o highmem.o memory.o mincore.o \ +mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ vmalloc.o pagewalk.o pgtable-generic.o diff --git a/mm/fremap.c b/mm/fremap.c deleted file mode 100644 index 2805d71cf476..000000000000 --- a/mm/fremap.c +++ /dev/null @@ -1,283 +0,0 @@ -/* - * linux/mm/fremap.c - * - * Explicit pagetable population and nonlinear (random) mappings support. - * - * started by Ingo Molnar, Copyright (C) 2002, 2003 - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "internal.h" - -static int mm_counter(struct page *page) -{ - return PageAnon(page) ? 
MM_ANONPAGES : MM_FILEPAGES; -} - -static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) -{ - pte_t pte = *ptep; - struct page *page; - swp_entry_t entry; - - if (pte_present(pte)) { - flush_cache_page(vma, addr, pte_pfn(pte)); - pte = ptep_clear_flush_notify(vma, addr, ptep); - page = vm_normal_page(vma, addr, pte); - if (page) { - if (pte_dirty(pte)) - set_page_dirty(page); - update_hiwater_rss(mm); - dec_mm_counter(mm, mm_counter(page)); - page_remove_rmap(page); - page_cache_release(page); - } - } else { /* zap_pte() is not called when pte_none() */ - if (!pte_file(pte)) { - update_hiwater_rss(mm); - entry = pte_to_swp_entry(pte); - if (non_swap_entry(entry)) { - if (is_migration_entry(entry)) { - page = migration_entry_to_page(entry); - dec_mm_counter(mm, mm_counter(page)); - } - } else { - free_swap_and_cache(entry); - dec_mm_counter(mm, MM_SWAPENTS); - } - } - pte_clear_not_present_full(mm, addr, ptep, 0); - } -} - -/* - * Install a file pte to a given virtual memory address, release any - * previously existing mapping. - */ -static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, unsigned long pgoff, pgprot_t prot) -{ - int err = -ENOMEM; - pte_t *pte, ptfile; - spinlock_t *ptl; - - pte = get_locked_pte(mm, addr, &ptl); - if (!pte) - goto out; - - ptfile = pgoff_to_pte(pgoff); - - if (!pte_none(*pte)) - zap_pte(mm, vma, addr, pte); - - set_pte_at(mm, addr, pte, pte_file_mksoft_dirty(ptfile)); - /* - * We don't need to run update_mmu_cache() here because the "file pte" - * being installed by install_file_pte() is not a real pte - it's a - * non-present entry (like a swap entry), noting what file offset should - * be mapped there when there's a fault (in a non-linear vma where - * that's not obvious). - */ - pte_unmap_unlock(pte, ptl); - err = 0; -out: - return err; -} - -int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, - unsigned long size, pgoff_t pgoff) -{ - struct mm_struct *mm = vma->vm_mm; - int err; - - do { - err = install_file_pte(mm, vma, addr, pgoff, vma->vm_page_prot); - if (err) - return err; - - size -= PAGE_SIZE; - addr += PAGE_SIZE; - pgoff++; - } while (size); - - return 0; -} -EXPORT_SYMBOL(generic_file_remap_pages); - -/** - * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma - * @start: start of the remapped virtual memory range - * @size: size of the remapped virtual memory range - * @prot: new protection bits of the range (see NOTE) - * @pgoff: to-be-mapped page of the backing store file - * @flags: 0 or MAP_NONBLOCKED - the later will cause no IO. - * - * sys_remap_file_pages remaps arbitrary pages of an existing VM_SHARED vma - * (shared backing store file). - * - * This syscall works purely via pagetables, so it's the most efficient - * way to map the same (large) file into a given virtual window. Unlike - * mmap()/mremap() it does not create any new vmas. The new mappings are - * also safe across swapout. - * - * NOTE: the @prot parameter right now is ignored (but must be zero), - * and the vma's default protection is used. Arbitrary protections - * might be implemented in the future. 
- */ -SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, - unsigned long, prot, unsigned long, pgoff, unsigned long, flags) -{ - struct mm_struct *mm = current->mm; - struct address_space *mapping; - struct vm_area_struct *vma; - int err = -EINVAL; - int has_write_lock = 0; - vm_flags_t vm_flags = 0; - - pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. " - "See Documentation/vm/remap_file_pages.txt.\n", - current->comm, current->pid); - - if (prot) - return err; - /* - * Sanitize the syscall parameters: - */ - start = start & PAGE_MASK; - size = size & PAGE_MASK; - - /* Does the address range wrap, or is the span zero-sized? */ - if (start + size <= start) - return err; - - /* Does pgoff wrap? */ - if (pgoff + (size >> PAGE_SHIFT) < pgoff) - return err; - - /* Can we represent this offset inside this architecture's pte's? */ -#if PTE_FILE_MAX_BITS < BITS_PER_LONG - if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS)) - return err; -#endif - - /* We need down_write() to change vma->vm_flags. */ - down_read(&mm->mmap_sem); - retry: - vma = find_vma(mm, start); - - /* - * Make sure the vma is shared, that it supports prefaulting, - * and that the remapped range is valid and fully within - * the single existing vma. - */ - if (!vma || !(vma->vm_flags & VM_SHARED)) - goto out; - - if (!vma->vm_ops || !vma->vm_ops->remap_pages) - goto out; - - if (start < vma->vm_start || start + size > vma->vm_end) - goto out; - - /* Must set VM_NONLINEAR before any pages are populated. */ - if (!(vma->vm_flags & VM_NONLINEAR)) { - /* - * vm_private_data is used as a swapout cursor - * in a VM_NONLINEAR vma. - */ - if (vma->vm_private_data) - goto out; - - /* Don't need a nonlinear mapping, exit success */ - if (pgoff == linear_page_index(vma, start)) { - err = 0; - goto out; - } - - if (!has_write_lock) { -get_write_lock: - up_read(&mm->mmap_sem); - down_write(&mm->mmap_sem); - has_write_lock = 1; - goto retry; - } - mapping = vma->vm_file->f_mapping; - /* - * page_mkclean doesn't work on nonlinear vmas, so if - * dirty pages need to be accounted, emulate with linear - * vmas. - */ - if (mapping_cap_account_dirty(mapping)) { - unsigned long addr; - struct file *file = get_file(vma->vm_file); - /* mmap_region may free vma; grab the info now */ - vm_flags = vma->vm_flags; - - addr = mmap_region(file, start, size, vm_flags, pgoff); - fput(file); - if (IS_ERR_VALUE(addr)) { - err = addr; - } else { - BUG_ON(addr != start); - err = 0; - } - goto out_freed; - } - i_mmap_lock_write(mapping); - flush_dcache_mmap_lock(mapping); - vma->vm_flags |= VM_NONLINEAR; - vma_interval_tree_remove(vma, &mapping->i_mmap); - vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); - flush_dcache_mmap_unlock(mapping); - i_mmap_unlock_write(mapping); - } - - if (vma->vm_flags & VM_LOCKED) { - /* - * drop PG_Mlocked flag for over-mapped range - */ - if (!has_write_lock) - goto get_write_lock; - vm_flags = vma->vm_flags; - munlock_vma_pages_range(vma, start, start + size); - vma->vm_flags = vm_flags; - } - - mmu_notifier_invalidate_range_start(mm, start, start + size); - err = vma->vm_ops->remap_pages(vma, start, size, pgoff); - mmu_notifier_invalidate_range_end(mm, start, start + size); - - /* - * We can't clear VM_NONLINEAR because we'd have to do - * it after ->populate completes, and that would prevent - * downgrading the lock. (Locks can't be upgraded). 
- */ - -out: - if (vma) - vm_flags = vma->vm_flags; -out_freed: - if (likely(!has_write_lock)) - up_read(&mm->mmap_sem); - else - up_write(&mm->mmap_sem); - if (!err && ((vm_flags & VM_LOCKED) || !(flags & MAP_NONBLOCK))) - mm_populate(start, size); - - return err; -} diff --git a/mm/mmap.c b/mm/mmap.c index 7f684d5a8087..e023dc5e59a8 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2634,6 +2634,75 @@ SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) return vm_munmap(addr, len); } + +/* + * Emulation of deprecated remap_file_pages() syscall. + */ +SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, + unsigned long, prot, unsigned long, pgoff, unsigned long, flags) +{ + + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long populate = 0; + unsigned long ret = -EINVAL; + struct file *file; + + pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. " + "See Documentation/vm/remap_file_pages.txt.\n", + current->comm, current->pid); + + if (prot) + return ret; + start = start & PAGE_MASK; + size = size & PAGE_MASK; + + if (start + size <= start) + return ret; + + /* Does pgoff wrap? */ + if (pgoff + (size >> PAGE_SHIFT) < pgoff) + return ret; + + down_write(&mm->mmap_sem); + vma = find_vma(mm, start); + + if (!vma || !(vma->vm_flags & VM_SHARED)) + goto out; + + if (start < vma->vm_start || start + size > vma->vm_end) + goto out; + + if (pgoff == linear_page_index(vma, start)) { + ret = 0; + goto out; + } + + prot |= vma->vm_flags & VM_READ ? PROT_READ : 0; + prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0; + prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0; + + flags &= MAP_NONBLOCK; + flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE; + if (vma->vm_flags & VM_LOCKED) { + flags |= MAP_LOCKED; + /* drop PG_Mlocked flag for over-mapped range */ + munlock_vma_pages_range(vma, start, start + size); + } + + file = get_file(vma->vm_file); + ret = do_mmap_pgoff(vma->vm_file, start, size, + prot, flags, pgoff, &populate); + fput(file); +out: + up_write(&mm->mmap_sem); + if (populate) + mm_populate(ret, populate); + if (!IS_ERR_VALUE(ret)) + ret = 0; + return ret; +} + static inline void verify_mm_writelocked(struct mm_struct *mm) { #ifdef CONFIG_DEBUG_VM diff --git a/mm/nommu.c b/mm/nommu.c index 28bd8c4dff6f..541bed64e348 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1984,14 +1984,6 @@ void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) } EXPORT_SYMBOL(filemap_map_pages); -int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, - unsigned long size, pgoff_t pgoff) -{ - BUG(); - return 0; -} -EXPORT_SYMBOL(generic_file_remap_pages); - static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, unsigned long addr, void *buf, int len, int write) { -- cgit v1.2.3 From 8a5f14a23177061ec11daeaa3d09d0765d785c47 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:09:49 -0800 Subject: mm: drop support of non-linear mapping from unmap/zap codepath We have had the remap_file_pages(2) emulation in the -mm tree for a few release cycles, and we plan to have it mainline in v3.20. This patchset removes the rest of the VM_NONLINEAR infrastructure. Patches 1-8 take care of the generic code. They are pretty straightforward and can be applied independently of the other patches. The remaining patches remove the pte_file()-related stuff from architecture-specific code. This usually frees up one bit in the non-present pte. I've tried to reuse that bit for the swap offset where I was able to figure out how to do that.
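A note on what the emulation above means in practice: each remap_file_pages() call that cannot be merged with an adjacent mapping leaves an extra VMA behind, which is the effect the worst-case test and the vm.max_map_count bump are about. The sketch below is a hypothetical demonstration, not part of this series; it assumes the glibc remap_file_pages() wrapper and a 4K page size, and the count_vmas() helper is made up for illustration (one line in /proc/self/maps per VMA).

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

/* Hypothetical helper: count this process's VMAs via /proc/self/maps. */
static int count_vmas(void)
{
	FILE *f = fopen("/proc/self/maps", "r");
	int c, n = 0;

	if (!f)
		return -1;
	while ((c = fgetc(f)) != EOF)
		if (c == '\n')
			n++;
	fclose(f);
	return n;
}

int main(void)
{
	long page = 4096;	/* assumes 4K pages */
	/* Shared anonymous memory is shmem-backed, so remapping works. */
	char *p = mmap(NULL, 4 * page, PROT_READ | PROT_WRITE,
		       MAP_SHARED | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	printf("VMAs before: %d\n", count_vmas());

	/* Map page 3 of the backing object at the mapping's first page. */
	if (remap_file_pages(p, page, 0, 3, 0)) {
		perror("remap_file_pages");
		return 1;
	}
	printf("VMAs after: %d\n", count_vmas());
	return 0;
}

With the old in-kernel implementation the two counts would match; under the emulation the second count is typically one higher, because the remapped page gets its own VMA whose pgoff is discontinuous with its neighbour and therefore cannot merge.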
For obvious reasons I cannot test all that arch-specific code, and I would like to see acks from the maintainers. In total, remap_file_pages(2) required about 1.4K lines of not-so-trivial kernel code. That's too much for functionality nobody uses. Tested-by: Felipe Balbi This patch (of 38): We don't create non-linear mappings anymore. Let's drop the code which handles them on unmap/zap. Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - mm/madvise.c | 9 +----- mm/memory.c | 82 ++++++++++++++---------------------------------- 3 files changed, 22 insertions(+), 70 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2c6fd3c5424a..600ef5ed4698 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1146,7 +1146,6 @@ extern void user_shm_unlock(size_t, struct user_struct *); * Parameter block passed down to zap_pte_range in exceptional cases. */ struct zap_details { - struct vm_area_struct *nonlinear_vma; /* Check page->index if set */ struct address_space *check_mapping; /* Check page->mapping if set */ pgoff_t first_index; /* Lowest page->index to unmap */ pgoff_t last_index; /* Highest page->index to unmap */ diff --git a/mm/madvise.c b/mm/madvise.c index a271adc93289..917754d26c17 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -278,14 +278,7 @@ static long madvise_dontneed(struct vm_area_struct *vma, if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) return -EINVAL; - if (unlikely(vma->vm_flags & VM_NONLINEAR)) { - struct zap_details details = { - .nonlinear_vma = vma, - .last_index = ULONG_MAX, - }; - zap_page_range(vma, start, end - start, &details); - } else - zap_page_range(vma, start, end - start, NULL); + zap_page_range(vma, start, end - start, NULL); return 0; } diff --git a/mm/memory.c b/mm/memory.c index 2c3536cc6c63..9a3e73b69dad 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1082,6 +1082,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, spinlock_t *ptl; pte_t *start_pte; pte_t *pte; + swp_entry_t entry; again: init_rss_vec(rss); @@ -1107,28 +1108,12 @@ again: if (details->check_mapping && details->check_mapping != page->mapping) continue; - /* - * Each page->index must be checked when - * invalidating or truncating nonlinear. - */ - if (details->nonlinear_vma && - (page->index < details->first_index || - page->index > details->last_index)) - continue; } ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); tlb_remove_tlb_entry(tlb, pte, addr); if (unlikely(!page)) continue; - if (unlikely(details) && details->nonlinear_vma - && linear_page_index(details->nonlinear_vma, - addr) != page->index) { - pte_t ptfile = pgoff_to_pte(page->index); - if (pte_soft_dirty(ptent)) - ptfile = pte_file_mksoft_dirty(ptfile); - set_pte_at(mm, addr, pte, ptfile); - } if (PageAnon(page)) rss[MM_ANONPAGES]--; else { @@ -1151,33 +1136,25 @@ again: } continue; } - /* - * If details->check_mapping, we leave swap entries; - * if details->nonlinear_vma, we leave file entries. - */ + /* If details->check_mapping, we leave swap entries.
*/ if (unlikely(details)) continue; - if (pte_file(ptent)) { - if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) - print_bad_pte(vma, addr, ptent, NULL); - } else { - swp_entry_t entry = pte_to_swp_entry(ptent); - if (!non_swap_entry(entry)) - rss[MM_SWAPENTS]--; - else if (is_migration_entry(entry)) { - struct page *page; + entry = pte_to_swp_entry(ptent); + if (!non_swap_entry(entry)) + rss[MM_SWAPENTS]--; + else if (is_migration_entry(entry)) { + struct page *page; - page = migration_entry_to_page(entry); + page = migration_entry_to_page(entry); - if (PageAnon(page)) - rss[MM_ANONPAGES]--; - else - rss[MM_FILEPAGES]--; - } - if (unlikely(!free_swap_and_cache(entry))) - print_bad_pte(vma, addr, ptent, NULL); + if (PageAnon(page)) + rss[MM_ANONPAGES]--; + else + rss[MM_FILEPAGES]--; } + if (unlikely(!free_swap_and_cache(entry))) + print_bad_pte(vma, addr, ptent, NULL); pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } while (pte++, addr += PAGE_SIZE, addr != end); @@ -1277,7 +1254,7 @@ static void unmap_page_range(struct mmu_gather *tlb, pgd_t *pgd; unsigned long next; - if (details && !details->check_mapping && !details->nonlinear_vma) + if (details && !details->check_mapping) details = NULL; BUG_ON(addr >= end); @@ -1371,7 +1348,7 @@ void unmap_vmas(struct mmu_gather *tlb, * @vma: vm_area_struct holding the applicable pages * @start: starting address of pages to zap * @size: number of bytes to zap - * @details: details of nonlinear truncation or shared cache invalidation + * @details: details of shared cache invalidation * * Caller must protect the VMA list */ @@ -1397,7 +1374,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, * @vma: vm_area_struct holding the applicable pages * @address: starting address of pages to zap * @size: number of bytes to zap - * @details: details of nonlinear truncation or shared cache invalidation + * @details: details of shared cache invalidation * * The range must fit into one VMA. */ @@ -2331,25 +2308,11 @@ static inline void unmap_mapping_range_tree(struct rb_root *root, } } -static inline void unmap_mapping_range_list(struct list_head *head, - struct zap_details *details) -{ - struct vm_area_struct *vma; - - /* - * In nonlinear VMAs there is no correspondence between virtual address - * offset and file offset. So we must perform an exhaustive search - * across *all* the pages in each nonlinear VMA, not just the pages - * whose virtual address lies outside the file truncation point. - */ - list_for_each_entry(vma, head, shared.nonlinear) { - details->nonlinear_vma = vma; - unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details); - } -} - /** - * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file. + * unmap_mapping_range - unmap the portion of all mmaps in the specified + * address_space corresponding to the specified page range in the underlying + * file. + * * @mapping: the address space containing mmaps to be unmapped. * @holebegin: byte in first page to unmap, relative to the start of * the underlying file. This will be rounded down to a PAGE_SIZE @@ -2378,7 +2341,6 @@ void unmap_mapping_range(struct address_space *mapping, } details.check_mapping = even_cows? 
NULL: mapping; - details.nonlinear_vma = NULL; details.first_index = hba; details.last_index = hba + hlen - 1; if (details.last_index < details.first_index) @@ -2388,8 +2350,6 @@ void unmap_mapping_range(struct address_space *mapping, i_mmap_lock_write(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) unmap_mapping_range_tree(&mapping->i_mmap, &details); - if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) - unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); i_mmap_unlock_write(mapping); } EXPORT_SYMBOL(unmap_mapping_range); -- cgit v1.2.3 From 9b4bdd2ffab9557ac43af7dff02e7dab1c8c58bd Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:09:51 -0800 Subject: mm: drop support of non-linear mapping from fault codepath We don't create non-linear mappings anymore. Let's drop code which handles them on page fault. Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 16 ++++++-------- mm/memory.c | 65 ++++++++---------------------------------------------- 2 files changed, 16 insertions(+), 65 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 600ef5ed4698..376e5c325dee 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -206,21 +206,19 @@ extern unsigned int kobjsize(const void *objp); extern pgprot_t protection_map[16]; #define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */ -#define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */ -#define FAULT_FLAG_MKWRITE 0x04 /* Fault was mkwrite of existing pte */ -#define FAULT_FLAG_ALLOW_RETRY 0x08 /* Retry fault if blocking */ -#define FAULT_FLAG_RETRY_NOWAIT 0x10 /* Don't drop mmap_sem and wait when retrying */ -#define FAULT_FLAG_KILLABLE 0x20 /* The fault task is in SIGKILL killable region */ -#define FAULT_FLAG_TRIED 0x40 /* second try */ -#define FAULT_FLAG_USER 0x80 /* The fault originated in userspace */ +#define FAULT_FLAG_MKWRITE 0x02 /* Fault was mkwrite of existing pte */ +#define FAULT_FLAG_ALLOW_RETRY 0x04 /* Retry fault if blocking */ +#define FAULT_FLAG_RETRY_NOWAIT 0x08 /* Don't drop mmap_sem and wait when retrying */ +#define FAULT_FLAG_KILLABLE 0x10 /* The fault task is in SIGKILL killable region */ +#define FAULT_FLAG_TRIED 0x20 /* Second try */ +#define FAULT_FLAG_USER 0x40 /* The fault originated in userspace */ /* * vm_fault is filled by the the pagefault handler and passed to the vma's * ->fault function. The vma's ->fault is responsible for returning a bitmask * of VM_FAULT_xxx flags that give details about how the fault was handled. * - * pgoff should be used in favour of virtual_address, if possible. If pgoff - * is used, one may implement ->remap_pages to get nonlinear mapping support. + * pgoff should be used in favour of virtual_address, if possible. */ struct vm_fault { unsigned int flags; /* FAULT_FLAG_xxx flags */ diff --git a/mm/memory.c b/mm/memory.c index 9a3e73b69dad..43a53743cbb4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1899,12 +1899,11 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, EXPORT_SYMBOL_GPL(apply_to_page_range); /* - * handle_pte_fault chooses page fault handler according to an entry - * which was read non-atomically. Before making any commitment, on - * those architectures or configurations (e.g. 
i386 with PAE) which - * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault - * must check under lock before unmapping the pte and proceeding - * (but do_wp_page is only called after already making such a check; + * handle_pte_fault chooses page fault handler according to an entry which was + * read non-atomically. Before making any commitment, on those architectures + * or configurations (e.g. i386 with PAE) which might give a mix of unmatched + * parts, do_swap_page must check under lock before unmapping the pte and + * proceeding (but do_wp_page is only called after already making such a check; * and do_anonymous_page can safely check later on). */ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, @@ -2710,8 +2709,6 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, entry = mk_pte(page, vma->vm_page_prot); if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); - else if (pte_file(*pte) && pte_file_soft_dirty(*pte)) - entry = pte_mksoft_dirty(entry); if (anon) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); @@ -2846,8 +2843,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, * if page by the offset is not ready to be mapped (cold cache or * something). */ - if (vma->vm_ops->map_pages && !(flags & FAULT_FLAG_NONLINEAR) && - fault_around_bytes >> PAGE_SHIFT > 1) { + if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { pte = pte_offset_map_lock(mm, pmd, address, &ptl); do_fault_around(vma, address, pte, pgoff, flags); if (!pte_same(*pte, orig_pte)) @@ -2992,7 +2988,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, * The mmap_sem may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). */ -static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, +static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, unsigned int flags, pte_t orig_pte) { @@ -3009,46 +3005,6 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } -/* - * Fault of a previously existing named mapping. Repopulate the pte - * from the encoded file_pte if possible. This enables swappable - * nonlinear vmas. - * - * We enter with non-exclusive mmap_sem (to exclude vma changes, - * but allow concurrent faults), and pte mapped but not yet locked. - * We return with pte unmapped and unlocked. - * The mmap_sem may have been released depending on flags and our - * return value. See filemap_fault() and __lock_page_or_retry(). - */ -static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *page_table, pmd_t *pmd, - unsigned int flags, pte_t orig_pte) -{ - pgoff_t pgoff; - - flags |= FAULT_FLAG_NONLINEAR; - - if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) - return 0; - - if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) { - /* - * Page table corrupted: show pte and kill process. 
- */ - print_bad_pte(vma, address, orig_pte, NULL); - return VM_FAULT_SIGBUS; - } - - pgoff = pte_to_pgoff(orig_pte); - if (!(flags & FAULT_FLAG_WRITE)) - return do_read_fault(mm, vma, address, pmd, pgoff, flags, - orig_pte); - if (!(vma->vm_flags & VM_SHARED)) - return do_cow_fault(mm, vma, address, pmd, pgoff, flags, - orig_pte); - return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); -} - static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, unsigned long addr, int page_nid, int *flags) @@ -3176,15 +3132,12 @@ static int handle_pte_fault(struct mm_struct *mm, if (pte_none(entry)) { if (vma->vm_ops) { if (likely(vma->vm_ops->fault)) - return do_linear_fault(mm, vma, address, - pte, pmd, flags, entry); + return do_fault(mm, vma, address, pte, + pmd, flags, entry); } return do_anonymous_page(mm, vma, address, pte, pmd, flags); } - if (pte_file(entry)) - return do_nonlinear_fault(mm, vma, address, - pte, pmd, flags, entry); return do_swap_page(mm, vma, address, pte, pmd, flags, entry); } -- cgit v1.2.3 From d83a08db5ba6072caa658745881f4baa9bad6a08 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:09:54 -0800 Subject: mm: drop vm_ops->remap_pages and generic_file_remap_pages() stub Nobody uses it anymore. [akpm@linux-foundation.org: fix filemap_xip.c] Signed-off-by: Kirill A. Shutemov Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/vfs_file.c | 2 -- fs/btrfs/file.c | 1 - fs/ceph/addr.c | 1 - fs/cifs/file.c | 1 - fs/ext4/file.c | 1 - fs/f2fs/file.c | 1 - fs/fuse/file.c | 1 - fs/gfs2/file.c | 1 - fs/nfs/file.c | 1 - fs/nilfs2/file.c | 1 - fs/ocfs2/mmap.c | 1 - fs/ubifs/file.c | 1 - fs/xfs/xfs_file.c | 1 - include/linux/fs.h | 6 ------ include/linux/mm.h | 3 --- mm/filemap.c | 1 - mm/filemap_xip.c | 1 - mm/shmem.c | 1 - 18 files changed, 26 deletions(-) (limited to 'include/linux') diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 5594505e6e73..b40133796b87 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -831,7 +831,6 @@ static const struct vm_operations_struct v9fs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = v9fs_vm_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static const struct vm_operations_struct v9fs_mmap_file_vm_ops = { @@ -839,7 +838,6 @@ static const struct vm_operations_struct v9fs_mmap_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = v9fs_vm_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e4090259569b..a606ab551296 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2081,7 +2081,6 @@ static const struct vm_operations_struct btrfs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = btrfs_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index c81c0e004588..24be059fd1f8 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1569,7 +1569,6 @@ out: static struct vm_operations_struct ceph_vmops = { .fault = ceph_filemap_fault, .page_mkwrite = ceph_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; int ceph_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 74f12877493a..294ff302a237 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3248,7 +3248,6 @@ static struct vm_operations_struct 
cifs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = cifs_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 8131be8c0af3..7cb592386121 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -195,7 +195,6 @@ static const struct vm_operations_struct ext4_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = ext4_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 3c27e0ecb3bc..5674ba13102b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -92,7 +92,6 @@ static const struct vm_operations_struct f2fs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = f2fs_vm_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int get_parent_ino(struct inode *inode, nid_t *pino) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 760b2c552197..d769e594855b 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2062,7 +2062,6 @@ static const struct vm_operations_struct fuse_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = fuse_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 6e600abf694a..ec9c2d33477a 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -498,7 +498,6 @@ static const struct vm_operations_struct gfs2_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = gfs2_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; /** diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 2ab6f00dba5b..94712fc781fa 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -646,7 +646,6 @@ static const struct vm_operations_struct nfs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = nfs_vm_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int nfs_need_sync_write(struct file *filp, struct inode *inode) diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 3a03e0aea1fb..a8c728acb7a8 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -128,7 +128,6 @@ static const struct vm_operations_struct nilfs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = nilfs_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 10d66c75cecb..9581d190f6e1 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -173,7 +173,6 @@ out: static const struct vm_operations_struct ocfs2_file_vm_ops = { .fault = ocfs2_fault, .page_mkwrite = ocfs2_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 538519ee37d9..035e51011444 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1536,7 +1536,6 @@ static const struct vm_operations_struct ubifs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = ubifs_vm_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 
13e974e6a889..ac7f1e8f92b3 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1384,5 +1384,4 @@ static const struct vm_operations_struct xfs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = xfs_vm_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; diff --git a/include/linux/fs.h b/include/linux/fs.h index 60c4996df7f3..47f557c7ef7e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2481,12 +2481,6 @@ extern int sb_min_blocksize(struct super_block *, int); extern int generic_file_mmap(struct file *, struct vm_area_struct *); extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); -static inline int generic_file_remap_pages(struct vm_area_struct *vma, - unsigned long addr, unsigned long size, pgoff_t pgoff) -{ - BUG(); - return 0; -} int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk); extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); diff --git a/include/linux/mm.h b/include/linux/mm.h index 376e5c325dee..2ddd9d1d6268 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -285,9 +285,6 @@ struct vm_operations_struct { struct mempolicy *(*get_policy)(struct vm_area_struct *vma, unsigned long addr); #endif - /* called by sys_remap_file_pages() to populate non-linear mapping */ - int (*remap_pages)(struct vm_area_struct *vma, unsigned long addr, - unsigned long size, pgoff_t pgoff); }; struct mmu_gather; diff --git a/mm/filemap.c b/mm/filemap.c index 673e4581a2e5..bf7a27142704 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2087,7 +2087,6 @@ const struct vm_operations_struct generic_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = filemap_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; /* This is used for a general mmap of a disk file */ diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 0d105aeff82f..70c09da1a419 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -301,7 +301,6 @@ out: static const struct vm_operations_struct xip_file_vm_ops = { .fault = xip_file_fault, .page_mkwrite = filemap_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; int xip_file_mmap(struct file * file, struct vm_area_struct * vma) diff --git a/mm/shmem.c b/mm/shmem.c index 993e6ba689cc..b3e403181981 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3201,7 +3201,6 @@ static const struct vm_operations_struct shmem_vm_ops = { .set_policy = shmem_set_policy, .get_policy = shmem_get_policy, #endif - .remap_pages = generic_file_remap_pages, }; static struct dentry *shmem_mount(struct file_system_type *fs_type, -- cgit v1.2.3 From 27ba0644ea9dfe6e7693abc85837b60e40583b96 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:09:59 -0800 Subject: rmap: drop support of non-linear mappings We don't create non-linear mappings anymore. Let's drop code which handles them in rmap. Signed-off-by: Kirill A. 
Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cachetlb.txt | 8 +- fs/inode.c | 1 - include/linux/fs.h | 4 +- include/linux/mm.h | 6 -- include/linux/mm_types.h | 4 +- include/linux/rmap.h | 2 - kernel/fork.c | 8 +- mm/migrate.c | 32 ------- mm/mmap.c | 24 ++--- mm/rmap.c | 225 +-------------------------------------------- mm/swap.c | 4 +- 11 files changed, 18 insertions(+), 300 deletions(-) (limited to 'include/linux') diff --git a/Documentation/cachetlb.txt b/Documentation/cachetlb.txt index d79b008e4a32..3f9f808b5119 100644 --- a/Documentation/cachetlb.txt +++ b/Documentation/cachetlb.txt @@ -317,10 +317,10 @@ maps this page at its virtual address. about doing this. The idea is, first at flush_dcache_page() time, if - page->mapping->i_mmap is an empty tree and ->i_mmap_nonlinear - an empty list, just mark the architecture private page flag bit. - Later, in update_mmu_cache(), a check is made of this flag bit, - and if set the flush is done and the flag bit is cleared. + page->mapping->i_mmap is an empty tree, just mark the architecture + private page flag bit. Later, in update_mmu_cache(), a check is + made of this flag bit, and if set the flush is done and the flag + bit is cleared. IMPORTANT NOTE: It is often important, if you defer the flush, that the actual flush occurs on the same CPU diff --git a/fs/inode.c b/fs/inode.c index aa149e7262ac..c760fac33c92 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -355,7 +355,6 @@ void address_space_init_once(struct address_space *mapping) INIT_LIST_HEAD(&mapping->private_list); spin_lock_init(&mapping->private_lock); mapping->i_mmap = RB_ROOT; - INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); } EXPORT_SYMBOL(address_space_init_once); diff --git a/include/linux/fs.h b/include/linux/fs.h index 47f557c7ef7e..60acab209701 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -401,7 +401,6 @@ struct address_space { spinlock_t tree_lock; /* and lock protecting it */ atomic_t i_mmap_writable;/* count VM_SHARED mappings */ struct rb_root i_mmap; /* tree of private and shared mappings */ - struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ struct rw_semaphore i_mmap_rwsem; /* protect tree, count, list */ /* Protected by tree_lock together with the radix tree */ unsigned long nrpages; /* number of total pages */ @@ -493,8 +492,7 @@ static inline void i_mmap_unlock_read(struct address_space *mapping) */ static inline int mapping_mapped(struct address_space *mapping) { - return !RB_EMPTY_ROOT(&mapping->i_mmap) || - !list_empty(&mapping->i_mmap_nonlinear); + return !RB_EMPTY_ROOT(&mapping->i_mmap); } /* diff --git a/include/linux/mm.h b/include/linux/mm.h index 2ddd9d1d6268..18391eec4864 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1796,12 +1796,6 @@ struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node, for (vma = vma_interval_tree_iter_first(root, start, last); \ vma; vma = vma_interval_tree_iter_next(vma, start, last)) -static inline void vma_nonlinear_insert(struct vm_area_struct *vma, - struct list_head *list) -{ - list_add_tail(&vma->shared.nonlinear, list); -} - void anon_vma_interval_tree_insert(struct anon_vma_chain *node, struct rb_root *root); void anon_vma_interval_tree_remove(struct anon_vma_chain *node, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6d34aa266a8c..3b1d20fb0848 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -273,15 +273,13 @@ struct vm_area_struct { /* * For areas with an address space and 
backing store, - * linkage into the address_space->i_mmap interval tree, or - * linkage of vma in the address_space->i_mmap_nonlinear list. + * linkage into the address_space->i_mmap interval tree. */ union { struct { struct rb_node rb; unsigned long rb_subtree_last; } linear; - struct list_head nonlinear; } shared; /* diff --git a/include/linux/rmap.h b/include/linux/rmap.h index d9d7e7e56352..b38f559130d5 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -246,7 +246,6 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); * arg: passed to rmap_one() and invalid_vma() * rmap_one: executed on each vma where page is mapped * done: for checking traversing termination condition - * file_nonlinear: for handling file nonlinear mapping * anon_lock: for getting anon_lock by optimized way rather than default * invalid_vma: for skipping uninterested vma */ @@ -255,7 +254,6 @@ struct rmap_walk_control { int (*rmap_one)(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *arg); int (*done)(struct page *page); - int (*file_nonlinear)(struct page *, struct address_space *, void *arg); struct anon_vma *(*anon_lock)(struct page *page); bool (*invalid_vma)(struct vm_area_struct *vma, void *arg); }; diff --git a/kernel/fork.c b/kernel/fork.c index 4dc2ddade9f1..b379d9abddc7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -438,12 +438,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) atomic_inc(&mapping->i_mmap_writable); flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ - if (unlikely(tmp->vm_flags & VM_NONLINEAR)) - vma_nonlinear_insert(tmp, - &mapping->i_mmap_nonlinear); - else - vma_interval_tree_insert_after(tmp, mpnt, - &mapping->i_mmap); + vma_interval_tree_insert_after(tmp, mpnt, + &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); i_mmap_unlock_write(mapping); } diff --git a/mm/migrate.c b/mm/migrate.c index 344cdf692fc8..6e284bcca8bb 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -178,37 +178,6 @@ out: return SWAP_AGAIN; } -/* - * Congratulations to trinity for discovering this bug. - * mm/fremap.c's remap_file_pages() accepts any range within a single vma to - * convert that vma to VM_NONLINEAR; and generic_file_remap_pages() will then - * replace the specified range by file ptes throughout (maybe populated after). - * If page migration finds a page within that range, while it's still located - * by vma_interval_tree rather than lost to i_mmap_nonlinear list, no problem: - * zap_pte() clears the temporary migration entry before mmap_sem is dropped. - * But if the migrating page is in a part of the vma outside the range to be - * remapped, then it will not be cleared, and remove_migration_ptes() needs to - * deal with it. Fortunately, this part of the vma is of course still linear, - * so we just need to use linear location on the nonlinear list. 
- */ -static int remove_linear_migration_ptes_from_nonlinear(struct page *page, - struct address_space *mapping, void *arg) -{ - struct vm_area_struct *vma; - /* hugetlbfs does not support remap_pages, so no huge pgoff worries */ - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - unsigned long addr; - - list_for_each_entry(vma, - &mapping->i_mmap_nonlinear, shared.nonlinear) { - - addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); - if (addr >= vma->vm_start && addr < vma->vm_end) - remove_migration_pte(page, vma, addr, arg); - } - return SWAP_AGAIN; -} - /* * Get rid of all migration entries and replace them by * references to the indicated page. @@ -218,7 +187,6 @@ static void remove_migration_ptes(struct page *old, struct page *new) struct rmap_walk_control rwc = { .rmap_one = remove_migration_pte, .arg = old, - .file_nonlinear = remove_linear_migration_ptes_from_nonlinear, }; rmap_walk(new, &rwc); diff --git a/mm/mmap.c b/mm/mmap.c index e023dc5e59a8..14d84666e8ba 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -243,10 +243,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, mapping_unmap_writable(mapping); flush_dcache_mmap_lock(mapping); - if (unlikely(vma->vm_flags & VM_NONLINEAR)) - list_del_init(&vma->shared.nonlinear); - else - vma_interval_tree_remove(vma, &mapping->i_mmap); + vma_interval_tree_remove(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); } @@ -649,10 +646,7 @@ static void __vma_link_file(struct vm_area_struct *vma) atomic_inc(&mapping->i_mmap_writable); flush_dcache_mmap_lock(mapping); - if (unlikely(vma->vm_flags & VM_NONLINEAR)) - vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); - else - vma_interval_tree_insert(vma, &mapping->i_mmap); + vma_interval_tree_insert(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); } } @@ -789,14 +783,11 @@ again: remove_next = 1 + (end > next->vm_end); if (file) { mapping = file->f_mapping; - if (!(vma->vm_flags & VM_NONLINEAR)) { - root = &mapping->i_mmap; - uprobe_munmap(vma, vma->vm_start, vma->vm_end); + root = &mapping->i_mmap; + uprobe_munmap(vma, vma->vm_start, vma->vm_end); - if (adjust_next) - uprobe_munmap(next, next->vm_start, - next->vm_end); - } + if (adjust_next) + uprobe_munmap(next, next->vm_start, next->vm_end); i_mmap_lock_write(mapping); if (insert) { @@ -3177,8 +3168,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) * * mmap_sem in write mode is required in order to block all operations * that could modify pagetables and free pages without need of - * altering the vma layout (for example populate_range() with - * nonlinear vmas). It's also needed in write mode to avoid new + * altering the vma layout. It's also needed in write mode to avoid new * anon_vmas to be associated with existing vmas. 
* * A single task can't take more than one mm_take_all_locks() in a row diff --git a/mm/rmap.c b/mm/rmap.c index 71cd5bd0c17d..70b32498d4f2 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -590,9 +590,8 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) if (!vma->anon_vma || !page__anon_vma || vma->anon_vma->root != page__anon_vma->root) return -EFAULT; - } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { - if (!vma->vm_file || - vma->vm_file->f_mapping != page->mapping) + } else if (page->mapping) { + if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping) return -EFAULT; } else return -EFAULT; @@ -1274,7 +1273,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); set_pte_at(mm, address, pte, swp_pte); - BUG_ON(pte_file(*pte)); } else if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION)) { /* Establish migration entry for a file page */ @@ -1316,211 +1314,6 @@ out_mlock: return ret; } -/* - * objrmap doesn't work for nonlinear VMAs because the assumption that - * offset-into-file correlates with offset-into-virtual-addresses does not hold. - * Consequently, given a particular page and its ->index, we cannot locate the - * ptes which are mapping that page without an exhaustive linear search. - * - * So what this code does is a mini "virtual scan" of each nonlinear VMA which - * maps the file to which the target page belongs. The ->vm_private_data field - * holds the current cursor into that scan. Successive searches will circulate - * around the vma's virtual address space. - * - * So as more replacement pressure is applied to the pages in a nonlinear VMA, - * more scanning pressure is placed against them as well. Eventually pages - * will become fully unmapped and are eligible for eviction. - * - * For very sparsely populated VMAs this is a little inefficient - chances are - * there there won't be many ptes located within the scan cluster. In this case - * maybe we could scan further - to the end of the pte page, perhaps. - * - * Mlocked pages: check VM_LOCKED under mmap_sem held for read, if we can - * acquire it without blocking. If vma locked, mlock the pages in the cluster, - * rather than unmapping them. If we encounter the "check_page" that vmscan is - * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN. - */ -#define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE) -#define CLUSTER_MASK (~(CLUSTER_SIZE - 1)) - -static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, - struct vm_area_struct *vma, struct page *check_page) -{ - struct mm_struct *mm = vma->vm_mm; - pmd_t *pmd; - pte_t *pte; - pte_t pteval; - spinlock_t *ptl; - struct page *page; - unsigned long address; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ - unsigned long end; - int ret = SWAP_AGAIN; - int locked_vma = 0; - - address = (vma->vm_start + cursor) & CLUSTER_MASK; - end = address + CLUSTER_SIZE; - if (address < vma->vm_start) - address = vma->vm_start; - if (end > vma->vm_end) - end = vma->vm_end; - - pmd = mm_find_pmd(mm, address); - if (!pmd) - return ret; - - mmun_start = address; - mmun_end = end; - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - - /* - * If we can acquire the mmap_sem for read, and vma is VM_LOCKED, - * keep the sem while scanning the cluster for mlocking pages. 
- */ - if (down_read_trylock(&vma->vm_mm->mmap_sem)) { - locked_vma = (vma->vm_flags & VM_LOCKED); - if (!locked_vma) - up_read(&vma->vm_mm->mmap_sem); /* don't need it */ - } - - pte = pte_offset_map_lock(mm, pmd, address, &ptl); - - /* Update high watermark before we lower rss */ - update_hiwater_rss(mm); - - for (; address < end; pte++, address += PAGE_SIZE) { - if (!pte_present(*pte)) - continue; - page = vm_normal_page(vma, address, *pte); - BUG_ON(!page || PageAnon(page)); - - if (locked_vma) { - if (page == check_page) { - /* we know we have check_page locked */ - mlock_vma_page(page); - ret = SWAP_MLOCK; - } else if (trylock_page(page)) { - /* - * If we can lock the page, perform mlock. - * Otherwise leave the page alone, it will be - * eventually encountered again later. - */ - mlock_vma_page(page); - unlock_page(page); - } - continue; /* don't unmap */ - } - - /* - * No need for _notify because we're within an - * mmu_notifier_invalidate_range_ {start|end} scope. - */ - if (ptep_clear_flush_young(vma, address, pte)) - continue; - - /* Nuke the page table entry. */ - flush_cache_page(vma, address, pte_pfn(*pte)); - pteval = ptep_clear_flush_notify(vma, address, pte); - - /* If nonlinear, store the file page offset in the pte. */ - if (page->index != linear_page_index(vma, address)) { - pte_t ptfile = pgoff_to_pte(page->index); - if (pte_soft_dirty(pteval)) - ptfile = pte_file_mksoft_dirty(ptfile); - set_pte_at(mm, address, pte, ptfile); - } - - /* Move the dirty bit to the physical page now the pte is gone. */ - if (pte_dirty(pteval)) - set_page_dirty(page); - - page_remove_rmap(page); - page_cache_release(page); - dec_mm_counter(mm, MM_FILEPAGES); - (*mapcount)--; - } - pte_unmap_unlock(pte - 1, ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - if (locked_vma) - up_read(&vma->vm_mm->mmap_sem); - return ret; -} - -static int try_to_unmap_nonlinear(struct page *page, - struct address_space *mapping, void *arg) -{ - struct vm_area_struct *vma; - int ret = SWAP_AGAIN; - unsigned long cursor; - unsigned long max_nl_cursor = 0; - unsigned long max_nl_size = 0; - unsigned int mapcount; - - list_for_each_entry(vma, - &mapping->i_mmap_nonlinear, shared.nonlinear) { - - cursor = (unsigned long) vma->vm_private_data; - if (cursor > max_nl_cursor) - max_nl_cursor = cursor; - cursor = vma->vm_end - vma->vm_start; - if (cursor > max_nl_size) - max_nl_size = cursor; - } - - if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */ - return SWAP_FAIL; - } - - /* - * We don't try to search for this page in the nonlinear vmas, - * and page_referenced wouldn't have found it anyway. Instead - * just walk the nonlinear vmas trying to age and unmap some. - * The mapcount of the page we came in with is irrelevant, - * but even so use it as a guide to how hard we should try? 
- */ - mapcount = page_mapcount(page); - if (!mapcount) - return ret; - - cond_resched(); - - max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; - if (max_nl_cursor == 0) - max_nl_cursor = CLUSTER_SIZE; - - do { - list_for_each_entry(vma, - &mapping->i_mmap_nonlinear, shared.nonlinear) { - - cursor = (unsigned long) vma->vm_private_data; - while (cursor < max_nl_cursor && - cursor < vma->vm_end - vma->vm_start) { - if (try_to_unmap_cluster(cursor, &mapcount, - vma, page) == SWAP_MLOCK) - ret = SWAP_MLOCK; - cursor += CLUSTER_SIZE; - vma->vm_private_data = (void *) cursor; - if ((int)mapcount <= 0) - return ret; - } - vma->vm_private_data = (void *) max_nl_cursor; - } - cond_resched(); - max_nl_cursor += CLUSTER_SIZE; - } while (max_nl_cursor <= max_nl_size); - - /* - * Don't loop forever (perhaps all the remaining pages are - * in locked vmas). Reset cursor on all unreserved nonlinear - * vmas, now forgetting on which ones it had fallen behind. - */ - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) - vma->vm_private_data = NULL; - - return ret; -} - bool is_vma_temporary_stack(struct vm_area_struct *vma) { int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); @@ -1566,7 +1359,6 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) .rmap_one = try_to_unmap_one, .arg = (void *)flags, .done = page_not_mapped, - .file_nonlinear = try_to_unmap_nonlinear, .anon_lock = page_lock_anon_vma_read, }; @@ -1612,12 +1404,6 @@ int try_to_munlock(struct page *page) .rmap_one = try_to_unmap_one, .arg = (void *)TTU_MUNLOCK, .done = page_not_mapped, - /* - * We don't bother to try to find the munlocked page in - * nonlinears. It's costly. Instead, later, page reclaim logic - * may call try_to_unmap() and recover PG_mlocked lazily. - */ - .file_nonlinear = NULL, .anon_lock = page_lock_anon_vma_read, }; @@ -1748,13 +1534,6 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) goto done; } - if (!rwc->file_nonlinear) - goto done; - - if (list_empty(&mapping->i_mmap_nonlinear)) - goto done; - - ret = rwc->file_nonlinear(page, mapping, rwc->arg); done: i_mmap_unlock_read(mapping); return ret; diff --git a/mm/swap.c b/mm/swap.c index 8a12b33936b4..5b3087228b99 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1140,10 +1140,8 @@ void __init swap_setup(void) if (bdi_init(swapper_spaces[0].backing_dev_info)) panic("Failed to init swap bdi"); - for (i = 0; i < MAX_SWAPFILES; i++) { + for (i = 0; i < MAX_SWAPFILES; i++) spin_lock_init(&swapper_spaces[i].tree_lock); - INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear); - } #endif /* Use a smaller cluster for small-memory machines */ -- cgit v1.2.3 From ac51b934f3912582d3c897c6c4d09b32ea57b2c7 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:10:02 -0800 Subject: mm: replace vma->sharead.linear with vma->shared After removing vma->shared.nonlinear we have only one member of vma->shared union, which doesn't make much sense. This patch drops the union and move struct vma->shared.linear to vma->shared. Signed-off-by: Kirill A. 
Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 8 +++----- mm/interval_tree.c | 34 +++++++++++++++++----------------- 2 files changed, 20 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3b1d20fb0848..07c8bd3f7b48 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -275,11 +275,9 @@ struct vm_area_struct { * For areas with an address space and backing store, * linkage into the address_space->i_mmap interval tree. */ - union { - struct { - struct rb_node rb; - unsigned long rb_subtree_last; - } linear; + struct { + struct rb_node rb; + unsigned long rb_subtree_last; } shared; /* diff --git a/mm/interval_tree.c b/mm/interval_tree.c index 8da581fa9060..f2c2492681bf 100644 --- a/mm/interval_tree.c +++ b/mm/interval_tree.c @@ -21,8 +21,8 @@ static inline unsigned long vma_last_pgoff(struct vm_area_struct *v) return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1; } -INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.linear.rb, - unsigned long, shared.linear.rb_subtree_last, +INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb, + unsigned long, shared.rb_subtree_last, vma_start_pgoff, vma_last_pgoff,, vma_interval_tree) /* Insert node immediately after prev in the interval tree */ @@ -36,26 +36,26 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node, VM_BUG_ON_VMA(vma_start_pgoff(node) != vma_start_pgoff(prev), node); - if (!prev->shared.linear.rb.rb_right) { + if (!prev->shared.rb.rb_right) { parent = prev; - link = &prev->shared.linear.rb.rb_right; + link = &prev->shared.rb.rb_right; } else { - parent = rb_entry(prev->shared.linear.rb.rb_right, - struct vm_area_struct, shared.linear.rb); - if (parent->shared.linear.rb_subtree_last < last) - parent->shared.linear.rb_subtree_last = last; - while (parent->shared.linear.rb.rb_left) { - parent = rb_entry(parent->shared.linear.rb.rb_left, - struct vm_area_struct, shared.linear.rb); - if (parent->shared.linear.rb_subtree_last < last) - parent->shared.linear.rb_subtree_last = last; + parent = rb_entry(prev->shared.rb.rb_right, + struct vm_area_struct, shared.rb); + if (parent->shared.rb_subtree_last < last) + parent->shared.rb_subtree_last = last; + while (parent->shared.rb.rb_left) { + parent = rb_entry(parent->shared.rb.rb_left, + struct vm_area_struct, shared.rb); + if (parent->shared.rb_subtree_last < last) + parent->shared.rb_subtree_last = last; } - link = &parent->shared.linear.rb.rb_left; + link = &parent->shared.rb.rb_left; } - node->shared.linear.rb_subtree_last = last; - rb_link_node(&node->shared.linear.rb, &parent->shared.linear.rb, link); - rb_insert_augmented(&node->shared.linear.rb, root, + node->shared.rb_subtree_last = last; + rb_link_node(&node->shared.rb, &parent->shared.rb, link); + rb_insert_augmented(&node->shared.rb, root, &vma_interval_tree_augment); } -- cgit v1.2.3 From 0661a33611fca12570cba48d9344ce68834ee86c Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:10:04 -0800 Subject: mm: remove rest usage of VM_NONLINEAR and pte_file() One bit in ->vm_flags is unused now! Signed-off-by: Kirill A. 
Shutemov Cc: Dan Carpenter Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/drm_vma_manager.c | 3 +- include/linux/mm.h | 1 - include/linux/swapops.h | 4 +- mm/debug.c | 1 - mm/gup.c | 2 +- mm/ksm.c | 2 +- mm/madvise.c | 4 +- mm/memcontrol.c | 7 +--- mm/memory.c | 78 +++++++++++++++++++-------------------- mm/mincore.c | 9 +---- mm/mprotect.c | 2 +- mm/mremap.c | 2 - mm/msync.c | 5 +-- 13 files changed, 49 insertions(+), 71 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/drm_vma_manager.c b/drivers/gpu/drm/drm_vma_manager.c index 63b471205072..68c1f32fb086 100644 --- a/drivers/gpu/drm/drm_vma_manager.c +++ b/drivers/gpu/drm/drm_vma_manager.c @@ -50,8 +50,7 @@ * * You must not use multiple offset managers on a single address_space. * Otherwise, mm-core will be unable to tear down memory mappings as the VM will - * no longer be linear. Please use VM_NONLINEAR in that case and implement your - * own offset managers. + * no longer be linear. * * This offset manager works on page-based addresses. That is, every argument * and return code (with the exception of drm_vma_node_offset_addr()) is given diff --git a/include/linux/mm.h b/include/linux/mm.h index 18391eec4864..a0da685bdb82 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -138,7 +138,6 @@ extern unsigned int kobjsize(const void *objp); #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ -#define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ #define VM_ARCH_2 0x02000000 #define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 6adfb7bfbf44..50cbc876be56 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -54,7 +54,7 @@ static inline pgoff_t swp_offset(swp_entry_t entry) /* check whether a pte points to a swap entry */ static inline int is_swap_pte(pte_t pte) { - return !pte_none(pte) && !pte_present_nonuma(pte) && !pte_file(pte); + return !pte_none(pte) && !pte_present_nonuma(pte); } #endif @@ -66,7 +66,6 @@ static inline swp_entry_t pte_to_swp_entry(pte_t pte) { swp_entry_t arch_entry; - BUG_ON(pte_file(pte)); if (pte_swp_soft_dirty(pte)) pte = pte_swp_clear_soft_dirty(pte); arch_entry = __pte_to_swp_entry(pte); @@ -82,7 +81,6 @@ static inline pte_t swp_entry_to_pte(swp_entry_t entry) swp_entry_t arch_entry; arch_entry = __swp_entry(swp_type(entry), swp_offset(entry)); - BUG_ON(pte_file(__swp_entry_to_pte(arch_entry))); return __swp_entry_to_pte(arch_entry); } diff --git a/mm/debug.c b/mm/debug.c index 0e58f3211f89..d69cb5a7ba9a 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -130,7 +130,6 @@ static const struct trace_print_flags vmaflags_names[] = { {VM_ACCOUNT, "account" }, {VM_NORESERVE, "noreserve" }, {VM_HUGETLB, "hugetlb" }, - {VM_NONLINEAR, "nonlinear" }, #if defined(CONFIG_X86) {VM_PAT, "pat" }, #elif defined(CONFIG_PPC) diff --git a/mm/gup.c b/mm/gup.c index 8dd50ce6326f..12bc2bc33da7 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -55,7 +55,7 @@ retry: */ if (likely(!(flags & FOLL_MIGRATION))) goto no_page; - if (pte_none(pte) || pte_file(pte)) + if (pte_none(pte)) goto no_page; entry = pte_to_swp_entry(pte); if (!is_migration_entry(entry)) diff --git a/mm/ksm.c b/mm/ksm.c index 15647fb0394f..4162dce2eb44 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1748,7 
+1748,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, */ if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | VM_PFNMAP | VM_IO | VM_DONTEXPAND | - VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP)) + VM_HUGETLB | VM_MIXEDMAP)) return 0; /* just ignore the advice */ #ifdef VM_SAO diff --git a/mm/madvise.c b/mm/madvise.c index 917754d26c17..d79fb5e8f80a 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -155,7 +155,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, pte = *(orig_pte + ((index - start) / PAGE_SIZE)); pte_unmap_unlock(orig_pte, ptl); - if (pte_present(pte) || pte_none(pte) || pte_file(pte)) + if (pte_present(pte) || pte_none(pte)) continue; entry = pte_to_swp_entry(pte); if (unlikely(non_swap_entry(entry))) @@ -296,7 +296,7 @@ static long madvise_remove(struct vm_area_struct *vma, *prev = NULL; /* tell sys_madvise we drop mmap_sem */ - if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) + if (vma->vm_flags & (VM_LOCKED | VM_HUGETLB)) return -EINVAL; f = vma->vm_file; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2f6893c2f01b..8b58701b9647 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4926,10 +4926,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, return NULL; mapping = vma->vm_file->f_mapping; - if (pte_none(ptent)) - pgoff = linear_page_index(vma, addr); - else /* pte_file(ptent) is true */ - pgoff = pte_to_pgoff(ptent); + pgoff = linear_page_index(vma, addr); /* page is moved even if it's not RSS of this task(page-faulted). */ #ifdef CONFIG_SWAP @@ -4961,7 +4958,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, page = mc_handle_present_pte(vma, addr, ptent); else if (is_swap_pte(ptent)) page = mc_handle_swap_pte(vma, addr, ptent, &ent); - else if (pte_none(ptent) || pte_file(ptent)) + else if (pte_none(ptent)) page = mc_handle_file_pte(vma, addr, ptent, &ent); if (!page && !ent.val) diff --git a/mm/memory.c b/mm/memory.c index 43a53743cbb4..9aa09217fe20 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -811,42 +811,40 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, /* pte contains position in swap or file, so copy. */ if (unlikely(!pte_present(pte))) { - if (!pte_file(pte)) { - swp_entry_t entry = pte_to_swp_entry(pte); - - if (likely(!non_swap_entry(entry))) { - if (swap_duplicate(entry) < 0) - return entry.val; - - /* make sure dst_mm is on swapoff's mmlist. */ - if (unlikely(list_empty(&dst_mm->mmlist))) { - spin_lock(&mmlist_lock); - if (list_empty(&dst_mm->mmlist)) - list_add(&dst_mm->mmlist, - &src_mm->mmlist); - spin_unlock(&mmlist_lock); - } - rss[MM_SWAPENTS]++; - } else if (is_migration_entry(entry)) { - page = migration_entry_to_page(entry); - - if (PageAnon(page)) - rss[MM_ANONPAGES]++; - else - rss[MM_FILEPAGES]++; - - if (is_write_migration_entry(entry) && - is_cow_mapping(vm_flags)) { - /* - * COW mappings require pages in both - * parent and child to be set to read. - */ - make_migration_entry_read(&entry); - pte = swp_entry_to_pte(entry); - if (pte_swp_soft_dirty(*src_pte)) - pte = pte_swp_mksoft_dirty(pte); - set_pte_at(src_mm, addr, src_pte, pte); - } + swp_entry_t entry = pte_to_swp_entry(pte); + + if (likely(!non_swap_entry(entry))) { + if (swap_duplicate(entry) < 0) + return entry.val; + + /* make sure dst_mm is on swapoff's mmlist. 
*/ + if (unlikely(list_empty(&dst_mm->mmlist))) { + spin_lock(&mmlist_lock); + if (list_empty(&dst_mm->mmlist)) + list_add(&dst_mm->mmlist, + &src_mm->mmlist); + spin_unlock(&mmlist_lock); + } + rss[MM_SWAPENTS]++; + } else if (is_migration_entry(entry)) { + page = migration_entry_to_page(entry); + + if (PageAnon(page)) + rss[MM_ANONPAGES]++; + else + rss[MM_FILEPAGES]++; + + if (is_write_migration_entry(entry) && + is_cow_mapping(vm_flags)) { + /* + * COW mappings require pages in both + * parent and child to be set to read. + */ + make_migration_entry_read(&entry); + pte = swp_entry_to_pte(entry); + if (pte_swp_soft_dirty(*src_pte)) + pte = pte_swp_mksoft_dirty(pte); + set_pte_at(src_mm, addr, src_pte, pte); } } goto out_set_pte; @@ -1020,11 +1018,9 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, * readonly mappings. The tradeoff is that copy_page_range is more * efficient than faulting. */ - if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR | - VM_PFNMAP | VM_MIXEDMAP))) { - if (!vma->anon_vma) - return 0; - } + if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) && + !vma->anon_vma) + return 0; if (is_vm_hugetlb_page(vma)) return copy_hugetlb_page_range(dst_mm, src_mm, vma); diff --git a/mm/mincore.c b/mm/mincore.c index c8c528b36641..46527c023e0c 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -124,17 +124,13 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); do { pte_t pte = *ptep; - pgoff_t pgoff; next = addr + PAGE_SIZE; if (pte_none(pte)) mincore_unmapped_range(vma, addr, next, vec); else if (pte_present(pte)) *vec = 1; - else if (pte_file(pte)) { - pgoff = pte_to_pgoff(pte); - *vec = mincore_page(vma->vm_file->f_mapping, pgoff); - } else { /* pte is a swap entry */ + else { /* pte is a swap entry */ swp_entry_t entry = pte_to_swp_entry(pte); if (non_swap_entry(entry)) { @@ -145,9 +141,8 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, *vec = 1; } else { #ifdef CONFIG_SWAP - pgoff = entry.val; *vec = mincore_page(swap_address_space(entry), - pgoff); + entry.val); #else WARN_ON(1); *vec = 1; diff --git a/mm/mprotect.c b/mm/mprotect.c index ace93454ce8e..33121662f08b 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -105,7 +105,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, } if (updated) pages++; - } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { + } else if (IS_ENABLED(CONFIG_MIGRATION)) { swp_entry_t entry = pte_to_swp_entry(oldpte); if (is_write_migration_entry(entry)) { diff --git a/mm/mremap.c b/mm/mremap.c index 17fa018f5f39..57dadc025c64 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -81,8 +81,6 @@ static pte_t move_soft_dirty_pte(pte_t pte) pte = pte_mksoft_dirty(pte); else if (is_swap_pte(pte)) pte = pte_swp_mksoft_dirty(pte); - else if (pte_file(pte)) - pte = pte_file_mksoft_dirty(pte); #endif return pte; } diff --git a/mm/msync.c b/mm/msync.c index 992a1673d488..bb04d53ae852 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -86,10 +86,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) (vma->vm_flags & VM_SHARED)) { get_file(file); up_read(&mm->mmap_sem); - if (vma->vm_flags & VM_NONLINEAR) - error = vfs_fsync(file, 1); - else - error = vfs_fsync_range(file, fstart, fend, 1); + error = vfs_fsync_range(file, fstart, fend, 1); fput(file); if (error || start >= end) goto out; -- cgit v1.2.3 From 753162cd849c45580fb5aaa7f3597c81e74e391c Mon Sep 17 00:00:00 2001 From: 
Andrey Ryabinin Date: Tue, 10 Feb 2015 14:11:36 -0800 Subject: mm: hugetlb: fix type of hugepages_treat_as_movable variable hugepages_treat_as_movable is declared as unsigned long, but proc_dointvec() is used for parsing it: static struct ctl_table vm_table[] = { ... { .procname = "hugepages_treat_as_movable", .data = &hugepages_treat_as_movable, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, This seems harmless, but it's better to use an int type here. Signed-off-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Manfred Spraul Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 2 +- mm/hugetlb.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 431b7fc605c9..7d7856359920 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -86,7 +86,7 @@ void free_huge_page(struct page *page); pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud); #endif -extern unsigned long hugepages_treat_as_movable; +extern int hugepages_treat_as_movable; extern int sysctl_hugetlb_shm_group; extern struct list_head huge_boot_pages; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 85032de5e20f..be0e5d0db5ec 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -35,7 +35,7 @@ #include #include "internal.h" -unsigned long hugepages_treat_as_movable; +int hugepages_treat_as_movable; int hugetlb_max_hstate __read_mostly; unsigned int default_hstate_idx; -- cgit v1.2.3 From dbf22eb6d8675fc173154d9f1bd1bd0fda53a001 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 10 Feb 2015 14:11:41 -0800 Subject: memcg: zap __memcg_{charge,uncharge}_slab They are simple wrappers around memcg_{charge,uncharge}_kmem, so let's zap them and call these functions directly.
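As a rough illustration, this is what the slab-side charge helper ends up doing once the wrapper is gone (a condensed sketch of the mm/slab.h hunk in this patch; the memcg_kmem_enabled() guard is assumed from the surrounding context rather than shown in the hunk):

static __always_inline int memcg_charge_slab(struct kmem_cache *s,
					     gfp_t gfp, int order)
{
	if (!memcg_kmem_enabled())
		return 0;
	if (is_root_cache(s))
		return 0;
	/* was: return __memcg_charge_slab(s, gfp, order); */
	return memcg_charge_kmem(s->memcg_params->memcg, gfp, 1 << order);
}

The uncharge side is symmetrical, passing 1 << order pages straight to memcg_uncharge_kmem().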
Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 +++-- mm/memcontrol.c | 21 +++------------------ mm/slab.h | 4 ++-- 3 files changed, 8 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 7c95af8d552c..18ccb2988979 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -403,8 +403,9 @@ void memcg_update_array_size(int num_groups); struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep); void __memcg_kmem_put_cache(struct kmem_cache *cachep); -int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order); -void __memcg_uncharge_slab(struct kmem_cache *cachep, int order); +int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, + unsigned long nr_pages); +void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages); int __memcg_cleanup_cache_params(struct kmem_cache *s); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8b58701b9647..e229e3ad615c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2495,8 +2495,8 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); } -static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, - unsigned long nr_pages) +int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, + unsigned long nr_pages) { struct page_counter *counter; int ret = 0; @@ -2533,8 +2533,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, return ret; } -static void memcg_uncharge_kmem(struct mem_cgroup *memcg, - unsigned long nr_pages) +void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages) { page_counter_uncharge(&memcg->memory, nr_pages); if (do_swap_account) @@ -2767,20 +2766,6 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg, current->memcg_kmem_skip_account = 0; } -int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) -{ - unsigned int nr_pages = 1 << order; - - return memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages); -} - -void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) -{ - unsigned int nr_pages = 1 << order; - - memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages); -} - /* * Return the kmem_cache we're supposed to use for a slab allocation. * We try to use the current memcg's version of the cache. 
diff --git a/mm/slab.h b/mm/slab.h index 1cf4005482dd..90430d6f665e 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -235,7 +235,7 @@ static __always_inline int memcg_charge_slab(struct kmem_cache *s, return 0; if (is_root_cache(s)) return 0; - return __memcg_charge_slab(s, gfp, order); + return memcg_charge_kmem(s->memcg_params->memcg, gfp, 1 << order); } static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) @@ -244,7 +244,7 @@ static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) return; if (is_root_cache(s)) return; - __memcg_uncharge_slab(s, order); + memcg_uncharge_kmem(s->memcg_params->memcg, 1 << order); } #else static inline bool is_root_cache(struct kmem_cache *s) -- cgit v1.2.3 From 3e0350a36414a73c5c2d1e354f8c0ab4ace1296d Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 10 Feb 2015 14:11:44 -0800 Subject: memcg: zap memcg_name argument of memcg_create_kmem_cache Instead of passing the name of the memory cgroup which the cache is created for in the memcg_name argument, let's obtain it immediately in memcg_create_kmem_cache. Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 3 +-- mm/memcontrol.c | 5 +---- mm/slab_common.c | 9 +++++---- 3 files changed, 7 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 9a139b637069..eca9ed303a1b 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -117,8 +117,7 @@ struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, void (*)(void *)); #ifdef CONFIG_MEMCG_KMEM struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *, - struct kmem_cache *, - const char *); + struct kmem_cache *); #endif void kmem_cache_destroy(struct kmem_cache *); int kmem_cache_shrink(struct kmem_cache *); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e229e3ad615c..baf7eb27e3ae 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2607,8 +2607,6 @@ void memcg_update_array_size(int num) static void memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *root_cache) { - static char memcg_name_buf[NAME_MAX + 1]; /* protected by - memcg_slab_mutex */ struct kmem_cache *cachep; int id; @@ -2624,8 +2622,7 @@ static void memcg_register_cache(struct mem_cgroup *memcg, if (cache_from_memcg_idx(root_cache, id)) return; - cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1); - cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf); + cachep = memcg_create_kmem_cache(memcg, root_cache); /* * If we could not create a memcg cache, do not complain, because * that's not critical at all as we can always proceed with the root diff --git a/mm/slab_common.c b/mm/slab_common.c index 67f182c10f24..1b782a2d3b3d 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -430,16 +430,15 @@ EXPORT_SYMBOL(kmem_cache_create); * memcg_create_kmem_cache - Create a cache for a memory cgroup. * @memcg: The memory cgroup the new cache is for. * @root_cache: The parent of the new cache. - * @memcg_name: The name of the memory cgroup (used for naming the new cache). * * This function attempts to create a kmem cache that will serve allocation * requests going from @memcg to @root_cache. The new cache inherits properties * from its parent. 
*/ struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, - struct kmem_cache *root_cache, - const char *memcg_name) + struct kmem_cache *root_cache) { + static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */ struct kmem_cache *s = NULL; char *cache_name; @@ -448,8 +447,10 @@ struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, mutex_lock(&slab_mutex); + cgroup_name(mem_cgroup_css(memcg)->cgroup, + memcg_name_buf, sizeof(memcg_name_buf)); cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, - memcg_cache_id(memcg), memcg_name); + memcg_cache_id(memcg), memcg_name_buf); if (!cache_name) goto out_unlock; -- cgit v1.2.3 From d5b3cf7139b8770af4ed8bb36a1ab9d290ac39e9 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 10 Feb 2015 14:11:47 -0800 Subject: memcg: zap memcg_slab_caches and memcg_slab_mutex mem_cgroup->memcg_slab_caches is a list of kmem caches corresponding to the given cgroup. Currently, it is only used on css free in order to destroy all caches corresponding to the memory cgroup being freed. The list is protected by memcg_slab_mutex. The mutex is also used to protect kmem_cache->memcg_params->memcg_caches arrays and synchronizes kmem_cache_destroy vs memcg_unregister_all_caches. However, we can perfectly get on without these two. To destroy all caches corresponding to a memory cgroup, we can walk over the global list of kmem caches, slab_caches, and we can do all the synchronization stuff using the slab_mutex instead of the memcg_slab_mutex. This patch therefore gets rid of the memcg_slab_caches and memcg_slab_mutex. Apart from this nice cleanup, it also: - assures that rcu_barrier() is called once at max when a root cache is destroyed or a memory cgroup is freed, no matter how many caches have SLAB_DESTROY_BY_RCU flag set; - fixes the race between kmem_cache_destroy and kmem_cache_create that exists, because memcg_cleanup_cache_params, which is called from kmem_cache_destroy after checking that kmem_cache->refcount=0, releases the slab_mutex, which gives kmem_cache_create a chance to make an alias to a cache doomed to be destroyed. Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Cc: Michal Hocko Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 2 - include/linux/slab.h | 6 +- mm/memcontrol.c | 156 +++++---------------------------------------- mm/slab_common.c | 142 +++++++++++++++++++++++++++++------------ 4 files changed, 120 insertions(+), 186 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 18ccb2988979..fb212e1d700d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -407,8 +407,6 @@ int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, unsigned long nr_pages); void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages); -int __memcg_cleanup_cache_params(struct kmem_cache *s); - /** * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed. * @gfp: the gfp allocation flags. 
diff --git a/include/linux/slab.h b/include/linux/slab.h index eca9ed303a1b..2e3b448cfa2d 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -116,8 +116,8 @@ struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, unsigned long, void (*)(void *)); #ifdef CONFIG_MEMCG_KMEM -struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *, - struct kmem_cache *); +void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *); +void memcg_destroy_kmem_caches(struct mem_cgroup *); #endif void kmem_cache_destroy(struct kmem_cache *); int kmem_cache_shrink(struct kmem_cache *); @@ -490,7 +490,6 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) * Child caches will hold extra metadata needed for its operation. Fields are: * * @memcg: pointer to the memcg this cache belongs to - * @list: list_head for the list of all caches in this memcg * @root_cache: pointer to the global, root cache, this cache was derived from */ struct memcg_cache_params { @@ -502,7 +501,6 @@ struct memcg_cache_params { }; struct { struct mem_cgroup *memcg; - struct list_head list; struct kmem_cache *root_cache; }; }; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index baf7eb27e3ae..f3f8a4f52a0c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -343,9 +343,6 @@ struct mem_cgroup { struct cg_proto tcp_mem; #endif #if defined(CONFIG_MEMCG_KMEM) - /* analogous to slab_common's slab_caches list, but per-memcg; - * protected by memcg_slab_mutex */ - struct list_head memcg_slab_caches; /* Index in the kmem_cache->memcg_params->memcg_caches array */ int kmemcg_id; #endif @@ -2476,25 +2473,6 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, } #ifdef CONFIG_MEMCG_KMEM -/* - * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or - * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists. - */ -static DEFINE_MUTEX(memcg_slab_mutex); - -/* - * This is a bit cumbersome, but it is rarely used and avoids a backpointer - * in the memcg_cache_params struct. - */ -static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) -{ - struct kmem_cache *cachep; - - VM_BUG_ON(p->is_root_cache); - cachep = p->root_cache; - return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); -} - int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, unsigned long nr_pages) { @@ -2578,10 +2556,7 @@ static int memcg_alloc_cache_id(void) else if (size > MEMCG_CACHES_MAX_SIZE) size = MEMCG_CACHES_MAX_SIZE; - mutex_lock(&memcg_slab_mutex); err = memcg_update_all_caches(size); - mutex_unlock(&memcg_slab_mutex); - if (err) { ida_simple_remove(&kmem_limited_groups, id); return err; @@ -2604,120 +2579,20 @@ void memcg_update_array_size(int num) memcg_limited_groups_array_size = num; } -static void memcg_register_cache(struct mem_cgroup *memcg, - struct kmem_cache *root_cache) -{ - struct kmem_cache *cachep; - int id; - - lockdep_assert_held(&memcg_slab_mutex); - - id = memcg_cache_id(memcg); - - /* - * Since per-memcg caches are created asynchronously on first - * allocation (see memcg_kmem_get_cache()), several threads can try to - * create the same cache, but only one of them may succeed. - */ - if (cache_from_memcg_idx(root_cache, id)) - return; - - cachep = memcg_create_kmem_cache(memcg, root_cache); - /* - * If we could not create a memcg cache, do not complain, because - * that's not critical at all as we can always proceed with the root - * cache. 
- */ - if (!cachep) - return; - - list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); - - /* - * Since readers won't lock (see cache_from_memcg_idx()), we need a - * barrier here to ensure nobody will see the kmem_cache partially - * initialized. - */ - smp_wmb(); - - BUG_ON(root_cache->memcg_params->memcg_caches[id]); - root_cache->memcg_params->memcg_caches[id] = cachep; -} - -static void memcg_unregister_cache(struct kmem_cache *cachep) -{ - struct kmem_cache *root_cache; - struct mem_cgroup *memcg; - int id; - - lockdep_assert_held(&memcg_slab_mutex); - - BUG_ON(is_root_cache(cachep)); - - root_cache = cachep->memcg_params->root_cache; - memcg = cachep->memcg_params->memcg; - id = memcg_cache_id(memcg); - - BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep); - root_cache->memcg_params->memcg_caches[id] = NULL; - - list_del(&cachep->memcg_params->list); - - kmem_cache_destroy(cachep); -} - -int __memcg_cleanup_cache_params(struct kmem_cache *s) -{ - struct kmem_cache *c; - int i, failed = 0; - - mutex_lock(&memcg_slab_mutex); - for_each_memcg_cache_index(i) { - c = cache_from_memcg_idx(s, i); - if (!c) - continue; - - memcg_unregister_cache(c); - - if (cache_from_memcg_idx(s, i)) - failed++; - } - mutex_unlock(&memcg_slab_mutex); - return failed; -} - -static void memcg_unregister_all_caches(struct mem_cgroup *memcg) -{ - struct kmem_cache *cachep; - struct memcg_cache_params *params, *tmp; - - if (!memcg_kmem_is_active(memcg)) - return; - - mutex_lock(&memcg_slab_mutex); - list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) { - cachep = memcg_params_to_cache(params); - memcg_unregister_cache(cachep); - } - mutex_unlock(&memcg_slab_mutex); -} - -struct memcg_register_cache_work { +struct memcg_kmem_cache_create_work { struct mem_cgroup *memcg; struct kmem_cache *cachep; struct work_struct work; }; -static void memcg_register_cache_func(struct work_struct *w) +static void memcg_kmem_cache_create_func(struct work_struct *w) { - struct memcg_register_cache_work *cw = - container_of(w, struct memcg_register_cache_work, work); + struct memcg_kmem_cache_create_work *cw = + container_of(w, struct memcg_kmem_cache_create_work, work); struct mem_cgroup *memcg = cw->memcg; struct kmem_cache *cachep = cw->cachep; - mutex_lock(&memcg_slab_mutex); - memcg_register_cache(memcg, cachep); - mutex_unlock(&memcg_slab_mutex); + memcg_create_kmem_cache(memcg, cachep); css_put(&memcg->css); kfree(cw); @@ -2726,10 +2601,10 @@ static void memcg_register_cache_func(struct work_struct *w) /* * Enqueue the creation of a per-memcg kmem_cache. 
*/ -static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, - struct kmem_cache *cachep) +static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, + struct kmem_cache *cachep) { - struct memcg_register_cache_work *cw; + struct memcg_kmem_cache_create_work *cw; cw = kmalloc(sizeof(*cw), GFP_NOWAIT); if (!cw) @@ -2739,18 +2614,18 @@ static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, cw->memcg = memcg; cw->cachep = cachep; + INIT_WORK(&cw->work, memcg_kmem_cache_create_func); - INIT_WORK(&cw->work, memcg_register_cache_func); schedule_work(&cw->work); } -static void memcg_schedule_register_cache(struct mem_cgroup *memcg, - struct kmem_cache *cachep) +static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, + struct kmem_cache *cachep) { /* * We need to stop accounting when we kmalloc, because if the * corresponding kmalloc cache is not yet created, the first allocation - * in __memcg_schedule_register_cache will recurse. + * in __memcg_schedule_kmem_cache_create will recurse. * * However, it is better to enclose the whole function. Depending on * the debugging options enabled, INIT_WORK(), for instance, can @@ -2759,7 +2634,7 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg, * the safest choice is to do it like this, wrapping the whole function. */ current->memcg_kmem_skip_account = 1; - __memcg_schedule_register_cache(memcg, cachep); + __memcg_schedule_kmem_cache_create(memcg, cachep); current->memcg_kmem_skip_account = 0; } @@ -2807,7 +2682,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) * could happen with the slab_mutex held. So it's better to * defer everything. */ - memcg_schedule_register_cache(memcg, cachep); + memcg_schedule_kmem_cache_create(memcg, cachep); out: css_put(&memcg->css); return cachep; @@ -4136,7 +4011,7 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) static void memcg_destroy_kmem(struct mem_cgroup *memcg) { - memcg_unregister_all_caches(memcg); + memcg_destroy_kmem_caches(memcg); mem_cgroup_sockets_destroy(memcg); } #else @@ -4664,7 +4539,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) spin_lock_init(&memcg->event_list_lock); #ifdef CONFIG_MEMCG_KMEM memcg->kmemcg_id = -1; - INIT_LIST_HEAD(&memcg->memcg_slab_caches); #endif return &memcg->css; diff --git a/mm/slab_common.c b/mm/slab_common.c index 1b782a2d3b3d..6e1e4cf65836 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -425,6 +425,49 @@ out_unlock: } EXPORT_SYMBOL(kmem_cache_create); +static int do_kmem_cache_shutdown(struct kmem_cache *s, + struct list_head *release, bool *need_rcu_barrier) +{ + if (__kmem_cache_shutdown(s) != 0) { + printk(KERN_ERR "kmem_cache_destroy %s: " + "Slab cache still has objects\n", s->name); + dump_stack(); + return -EBUSY; + } + + if (s->flags & SLAB_DESTROY_BY_RCU) + *need_rcu_barrier = true; + +#ifdef CONFIG_MEMCG_KMEM + if (!is_root_cache(s)) { + struct kmem_cache *root_cache = s->memcg_params->root_cache; + int memcg_id = memcg_cache_id(s->memcg_params->memcg); + + BUG_ON(root_cache->memcg_params->memcg_caches[memcg_id] != s); + root_cache->memcg_params->memcg_caches[memcg_id] = NULL; + } +#endif + list_move(&s->list, release); + return 0; +} + +static void do_kmem_cache_release(struct list_head *release, + bool need_rcu_barrier) +{ + struct kmem_cache *s, *s2; + + if (need_rcu_barrier) + rcu_barrier(); + + list_for_each_entry_safe(s, s2, release, list) { +#ifdef SLAB_SUPPORTS_SYSFS + sysfs_slab_remove(s); +#else + 
slab_kmem_cache_release(s); +#endif + } +} + #ifdef CONFIG_MEMCG_KMEM /* * memcg_create_kmem_cache - Create a cache for a memory cgroup. @@ -435,10 +478,11 @@ EXPORT_SYMBOL(kmem_cache_create); * requests going from @memcg to @root_cache. The new cache inherits properties * from its parent. */ -struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, - struct kmem_cache *root_cache) +void memcg_create_kmem_cache(struct mem_cgroup *memcg, + struct kmem_cache *root_cache) { static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */ + int memcg_id = memcg_cache_id(memcg); struct kmem_cache *s = NULL; char *cache_name; @@ -447,6 +491,14 @@ struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, mutex_lock(&slab_mutex); + /* + * Since per-memcg caches are created asynchronously on first + * allocation (see memcg_kmem_get_cache()), several threads can try to + * create the same cache, but only one of them may succeed. + */ + if (cache_from_memcg_idx(root_cache, memcg_id)) + goto out_unlock; + cgroup_name(mem_cgroup_css(memcg)->cgroup, memcg_name_buf, sizeof(memcg_name_buf)); cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, @@ -458,49 +510,73 @@ struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, root_cache->size, root_cache->align, root_cache->flags, root_cache->ctor, memcg, root_cache); + /* + * If we could not create a memcg cache, do not complain, because + * that's not critical at all as we can always proceed with the root + * cache. + */ if (IS_ERR(s)) { kfree(cache_name); - s = NULL; + goto out_unlock; } + /* + * Since readers won't lock (see cache_from_memcg_idx()), we need a + * barrier here to ensure nobody will see the kmem_cache partially + * initialized. + */ + smp_wmb(); + root_cache->memcg_params->memcg_caches[memcg_id] = s; + out_unlock: mutex_unlock(&slab_mutex); put_online_mems(); put_online_cpus(); - - return s; } -static int memcg_cleanup_cache_params(struct kmem_cache *s) +void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) { - int rc; + LIST_HEAD(release); + bool need_rcu_barrier = false; + struct kmem_cache *s, *s2; - if (!s->memcg_params || - !s->memcg_params->is_root_cache) - return 0; + get_online_cpus(); + get_online_mems(); - mutex_unlock(&slab_mutex); - rc = __memcg_cleanup_cache_params(s); mutex_lock(&slab_mutex); + list_for_each_entry_safe(s, s2, &slab_caches, list) { + if (is_root_cache(s) || s->memcg_params->memcg != memcg) + continue; + /* + * The cgroup is about to be freed and therefore has no charges + * left. Hence, all its caches must be empty by now. 
+ */ + BUG_ON(do_kmem_cache_shutdown(s, &release, &need_rcu_barrier)); + } + mutex_unlock(&slab_mutex); - return rc; -} -#else -static int memcg_cleanup_cache_params(struct kmem_cache *s) -{ - return 0; + put_online_mems(); + put_online_cpus(); + + do_kmem_cache_release(&release, need_rcu_barrier); } #endif /* CONFIG_MEMCG_KMEM */ void slab_kmem_cache_release(struct kmem_cache *s) { + memcg_free_cache_params(s); kfree(s->name); kmem_cache_free(kmem_cache, s); } void kmem_cache_destroy(struct kmem_cache *s) { + int i; + LIST_HEAD(release); + bool need_rcu_barrier = false; + bool busy = false; + get_online_cpus(); get_online_mems(); @@ -510,35 +586,23 @@ void kmem_cache_destroy(struct kmem_cache *s) if (s->refcount) goto out_unlock; - if (memcg_cleanup_cache_params(s) != 0) - goto out_unlock; + for_each_memcg_cache_index(i) { + struct kmem_cache *c = cache_from_memcg_idx(s, i); - if (__kmem_cache_shutdown(s) != 0) { - printk(KERN_ERR "kmem_cache_destroy %s: " - "Slab cache still has objects\n", s->name); - dump_stack(); - goto out_unlock; + if (c && do_kmem_cache_shutdown(c, &release, &need_rcu_barrier)) + busy = true; } - list_del(&s->list); - - mutex_unlock(&slab_mutex); - if (s->flags & SLAB_DESTROY_BY_RCU) - rcu_barrier(); - - memcg_free_cache_params(s); -#ifdef SLAB_SUPPORTS_SYSFS - sysfs_slab_remove(s); -#else - slab_kmem_cache_release(s); -#endif - goto out; + if (!busy) + do_kmem_cache_shutdown(s, &release, &need_rcu_barrier); out_unlock: mutex_unlock(&slab_mutex); -out: + put_online_mems(); put_online_cpus(); + + do_kmem_cache_release(&release, need_rcu_barrier); } EXPORT_SYMBOL(kmem_cache_destroy); -- cgit v1.2.3 From 18c137371b2ea86d263b75665a4904a0b8872990 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 11 Feb 2015 15:15:09 +1030 Subject: lguest: add operations to get/set a register from the Launcher. We use the ptrace API struct, and we currently don't let them set anything but the normal registers (we'd have to filter the others). Signed-off-by: Rusty Russell --- drivers/lguest/core.c | 8 +++++++ drivers/lguest/lg.h | 3 +++ drivers/lguest/lguest_user.c | 49 +++++++++++++++++++++++++++++++++++++++++ drivers/lguest/x86/core.c | 46 ++++++++++++++++++++++++++++++++++++++ include/linux/lguest_launcher.h | 2 ++ 5 files changed, 108 insertions(+) (limited to 'include/linux') diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index 6590558d1d31..cdb2f9aa5860 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -208,6 +208,14 @@ void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b, */ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) { + /* If the launcher asked for a register with LHREQ_GETREG */ + if (cpu->reg_read) { + if (put_user(*cpu->reg_read, user)) + return -EFAULT; + cpu->reg_read = NULL; + return sizeof(*cpu->reg_read); + } + /* We stop running once the Guest is dead. */ while (!cpu->lg->dead) { unsigned int irq; diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 2eef40be4c04..1c98bf74fd68 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -52,6 +52,8 @@ struct lg_cpu { unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */ + unsigned long *reg_read; /* register from LHREQ_GETREG */ + /* At end of a page shared mapped over lguest_pages in guest. 
*/ unsigned long regs_page; struct lguest_regs *regs; @@ -210,6 +212,7 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu); int lguest_arch_init_hypercalls(struct lg_cpu *cpu); int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args); void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start); +unsigned long *lguest_arch_regptr(struct lg_cpu *cpu, size_t reg_off, bool any); /* /switcher.S: */ extern char start_switcher_text[], end_switcher_text[], switch_to_guest[]; diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index 4263f4cc8c55..7f14c152dd23 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -173,6 +173,51 @@ static int attach_eventfd(struct lguest *lg, const unsigned long __user *input) return err; } +/* The Launcher can get the registers, and also set some of them. */ +static int getreg_setup(struct lg_cpu *cpu, const unsigned long __user *input) +{ + unsigned long which; + + /* We re-use the ptrace structure to specify which register to read. */ + if (get_user(which, input) != 0) + return -EFAULT; + + /* + * We set up the cpu register pointer, and their next read will + * actually get the value (instead of running the guest). + * + * The last argument 'true' says we can access any register. + */ + cpu->reg_read = lguest_arch_regptr(cpu, which, true); + if (!cpu->reg_read) + return -ENOENT; + + /* And because this is a write() call, we return the length used. */ + return sizeof(unsigned long) * 2; +} + +static int setreg(struct lg_cpu *cpu, const unsigned long __user *input) +{ + unsigned long which, value, *reg; + + /* We re-use the ptrace structure to specify which register to read. */ + if (get_user(which, input) != 0) + return -EFAULT; + input++; + if (get_user(value, input) != 0) + return -EFAULT; + + /* The last argument 'false' means we can't access all registers. */ + reg = lguest_arch_regptr(cpu, which, false); + if (!reg) + return -ENOENT; + + *reg = value; + + /* And because this is a write() call, we return the length used. */ + return sizeof(unsigned long) * 3; +} + /*L:050 * Sending an interrupt is done by writing LHREQ_IRQ and an interrupt * number to /dev/lguest. @@ -434,6 +479,10 @@ static ssize_t write(struct file *file, const char __user *in, return user_send_irq(cpu, input); case LHREQ_EVENTFD: return attach_eventfd(lg, input); + case LHREQ_GETREG: + return getreg_setup(cpu, input); + case LHREQ_SETREG: + return setreg(cpu, input); default: return -EINVAL; } diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 922a1acbf652..f7a16b4ea456 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -181,6 +181,52 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages) } /*:*/ +unsigned long *lguest_arch_regptr(struct lg_cpu *cpu, size_t reg_off, bool any) +{ + switch (reg_off) { + case offsetof(struct pt_regs, bx): + return &cpu->regs->ebx; + case offsetof(struct pt_regs, cx): + return &cpu->regs->ecx; + case offsetof(struct pt_regs, dx): + return &cpu->regs->edx; + case offsetof(struct pt_regs, si): + return &cpu->regs->esi; + case offsetof(struct pt_regs, di): + return &cpu->regs->edi; + case offsetof(struct pt_regs, bp): + return &cpu->regs->ebp; + case offsetof(struct pt_regs, ax): + return &cpu->regs->eax; + case offsetof(struct pt_regs, ip): + return &cpu->regs->eip; + case offsetof(struct pt_regs, sp): + return &cpu->regs->esp; + } + + /* Launcher can read these, but we don't allow any setting. 
*/ + if (any) { + switch (reg_off) { + case offsetof(struct pt_regs, ds): + return &cpu->regs->ds; + case offsetof(struct pt_regs, es): + return &cpu->regs->es; + case offsetof(struct pt_regs, fs): + return &cpu->regs->fs; + case offsetof(struct pt_regs, gs): + return &cpu->regs->gs; + case offsetof(struct pt_regs, cs): + return &cpu->regs->cs; + case offsetof(struct pt_regs, flags): + return &cpu->regs->eflags; + case offsetof(struct pt_regs, ss): + return &cpu->regs->ss; + } + } + + return NULL; +} + /*M:002 * There are hooks in the scheduler which we can register to tell when we * get kicked off the CPU (preempt_notifier_register()). This would allow us diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h index 495203ff221c..f27cae27b0c1 100644 --- a/include/linux/lguest_launcher.h +++ b/include/linux/lguest_launcher.h @@ -63,6 +63,8 @@ enum lguest_req LHREQ_IRQ, /* + irq */ LHREQ_BREAK, /* No longer used */ LHREQ_EVENTFD, /* + address, fd. */ + LHREQ_GETREG, /* + offset within struct pt_regs (then read value). */ + LHREQ_SETREG, /* + offset within struct pt_regs, value. */ }; /* -- cgit v1.2.3 From 69a09dc1742ffbb3b02f3a1e03da4801e96452e9 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 11 Feb 2015 15:15:09 +1030 Subject: lguest: write more information to userspace about pending traps. This is preparation for userspace handling MMIO and ioport accesses. Signed-off-by: Rusty Russell --- drivers/lguest/core.c | 7 ++++--- drivers/lguest/hypercalls.c | 7 ++++--- drivers/lguest/lg.h | 3 ++- drivers/lguest/lguest_user.c | 14 +++++++++----- include/linux/lguest_launcher.h | 13 +++++++++++++ tools/lguest/lguest.c | 16 ++++++++++------ 6 files changed, 42 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index cdb2f9aa5860..9159dbc583f6 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -229,16 +229,17 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) * It's possible the Guest did a NOTIFY hypercall to the * Launcher. */ - if (cpu->pending_notify) { + if (cpu->pending.trap) { /* * Does it just needs to write to a registered * eventfd (ie. the appropriate virtqueue thread)? */ if (!send_notify_to_eventfd(cpu)) { /* OK, we tell the main Launcher. */ - if (put_user(cpu->pending_notify, user)) + if (copy_to_user(user, &cpu->pending, + sizeof(cpu->pending))) return -EFAULT; - return sizeof(cpu->pending_notify); + return sizeof(cpu->pending); } } diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c index 83511eb0923d..5dd1fb8a6610 100644 --- a/drivers/lguest/hypercalls.c +++ b/drivers/lguest/hypercalls.c @@ -118,7 +118,8 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) cpu->halted = 1; break; case LHCALL_NOTIFY: - cpu->pending_notify = args->arg1; + cpu->pending.trap = LGUEST_TRAP_ENTRY; + cpu->pending.addr = args->arg1; break; default: /* It should be an architecture-specific hypercall. */ @@ -189,7 +190,7 @@ static void do_async_hcalls(struct lg_cpu *cpu) * Stop doing hypercalls if they want to notify the Launcher: * it needs to service this first. */ - if (cpu->pending_notify) + if (cpu->pending.trap) break; } } @@ -280,7 +281,7 @@ void do_hypercalls(struct lg_cpu *cpu) * NOTIFY to the Launcher, we want to return now. Otherwise we do * the hypercall. 
*/ - if (!cpu->pending_notify) { + if (!cpu->pending.trap) { do_hcall(cpu, cpu->hcall); /* * Tricky point: we reset the hcall pointer to mark the diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 1c98bf74fd68..020fec5bb072 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -50,7 +50,8 @@ struct lg_cpu { /* Bitmap of what has changed: see CHANGED_* above. */ int changed; - unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */ + /* Pending operation. */ + struct lguest_pending pending; unsigned long *reg_read; /* register from LHREQ_GETREG */ diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index 7f14c152dd23..dcf9efd94cf4 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -29,6 +29,10 @@ bool send_notify_to_eventfd(struct lg_cpu *cpu) unsigned int i; struct lg_eventfd_map *map; + /* We only connect LHCALL_NOTIFY to event fds, not other traps. */ + if (cpu->pending.trap != LGUEST_TRAP_ENTRY) + return false; + /* * This "rcu_read_lock()" helps track when someone is still looking at * the (RCU-using) eventfds array. It's not actually a lock at all; @@ -52,9 +56,9 @@ bool send_notify_to_eventfd(struct lg_cpu *cpu) * we'll continue to use the old array and just won't see the new one. */ for (i = 0; i < map->num; i++) { - if (map->map[i].addr == cpu->pending_notify) { + if (map->map[i].addr == cpu->pending.addr) { eventfd_signal(map->map[i].event, 1); - cpu->pending_notify = 0; + cpu->pending.trap = 0; break; } } @@ -62,7 +66,7 @@ bool send_notify_to_eventfd(struct lg_cpu *cpu) rcu_read_unlock(); /* If we cleared the notification, it's because we found a match. */ - return cpu->pending_notify == 0; + return cpu->pending.trap == 0; } /*L:055 @@ -282,8 +286,8 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) * If we returned from read() last time because the Guest sent I/O, * clear the flag. */ - if (cpu->pending_notify) - cpu->pending_notify = 0; + if (cpu->pending.trap) + cpu->pending.trap = 0; /* Run the Guest until something interesting happens. */ return run_guest(cpu, (unsigned long __user *)user); diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h index f27cae27b0c1..c4451ebece47 100644 --- a/include/linux/lguest_launcher.h +++ b/include/linux/lguest_launcher.h @@ -67,6 +67,19 @@ enum lguest_req LHREQ_SETREG, /* + offset within struct pt_regs, value. */ }; +/* + * This is what read() of the lguest fd populates. trap == + * LGUEST_TRAP_ENTRY for an LHCALL_NOTIFY (addr is the + * argument), 14 for a page fault in the MMIO region (addr is + * the trap address, insn is the instruction), or 13 for a GPF + * (insn is the instruction). + */ +struct lguest_pending { + __u8 trap; + __u8 insn[7]; + __u32 addr; +}; + /* * The alignment to use between consumer and producer parts of vring. * x86 pagesize for historical reasons. diff --git a/tools/lguest/lguest.c b/tools/lguest/lguest.c index 3f7f2326cd9a..0e754d04876d 100644 --- a/tools/lguest/lguest.c +++ b/tools/lguest/lguest.c @@ -1820,17 +1820,21 @@ static void __attribute__((noreturn)) restart_guest(void) static void __attribute__((noreturn)) run_guest(void) { for (;;) { - unsigned long notify_addr; + struct lguest_pending notify; int readval; /* We read from the /dev/lguest device to run the Guest. 
*/ - readval = pread(lguest_fd, ¬ify_addr, - sizeof(notify_addr), cpu_id); + readval = pread(lguest_fd, ¬ify, sizeof(notify), cpu_id); /* One unsigned long means the Guest did HCALL_NOTIFY */ - if (readval == sizeof(notify_addr)) { - verbose("Notify on address %#lx\n", notify_addr); - handle_output(notify_addr); + if (readval == sizeof(notify)) { + if (notify.trap == 0x1F) { + verbose("Notify on address %#08x\n", + notify.addr); + handle_output(notify.addr); + } else + errx(1, "Unknown trap %i addr %#08x\n", + notify.trap, notify.addr); /* ENOENT means the Guest died. Reading tells us why. */ } else if (errno == ENOENT) { char reason[1024] = { 0 }; -- cgit v1.2.3 From 8ed313001a892f240269dea05d4b925cbd150492 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 11 Feb 2015 15:15:09 +1030 Subject: lguest: add infrastructure for userspace to deliver a trap to the guest. This is required for instruction emulation to move to userspace. Signed-off-by: Rusty Russell --- drivers/lguest/lguest_user.c | 19 +++++++++++++++++++ include/linux/lguest_launcher.h | 1 + 2 files changed, 20 insertions(+) (limited to 'include/linux') diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index dcf9efd94cf4..be996d173615 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -243,6 +243,23 @@ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) return 0; } +/*L:053 + * Deliver a trap: this is used by the Launcher if it can't emulate + * an instruction. + */ +static int trap(struct lg_cpu *cpu, const unsigned long __user *input) +{ + unsigned long trapnum; + + if (get_user(trapnum, input) != 0) + return -EFAULT; + + if (!deliver_trap(cpu, trapnum)) + return -EINVAL; + + return 0; +} + /*L:040 * Once our Guest is initialized, the Launcher makes it run by reading * from /dev/lguest. @@ -487,6 +504,8 @@ static ssize_t write(struct file *file, const char __user *in, return getreg_setup(cpu, input); case LHREQ_SETREG: return setreg(cpu, input); + case LHREQ_TRAP: + return trap(cpu, input); default: return -EINVAL; } diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h index c4451ebece47..3c402b843e03 100644 --- a/include/linux/lguest_launcher.h +++ b/include/linux/lguest_launcher.h @@ -65,6 +65,7 @@ enum lguest_req LHREQ_EVENTFD, /* + address, fd. */ LHREQ_GETREG, /* + offset within struct pt_regs (then read value). */ LHREQ_SETREG, /* + offset within struct pt_regs, value. */ + LHREQ_TRAP, /* + trap number to deliver to guest. */ }; /* -- cgit v1.2.3 From b3e28b65de254570140832cf7c95255ab4d501bb Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 11 Feb 2015 15:22:01 +1030 Subject: lguest: remove lguest bus definitions from header. Signed-off-by: Rusty Russell --- include/linux/lguest_launcher.h | 49 ++--------------------------------------- 1 file changed, 2 insertions(+), 47 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h index 3c402b843e03..677cde735d4b 100644 --- a/include/linux/lguest_launcher.h +++ b/include/linux/lguest_launcher.h @@ -8,52 +8,13 @@ * * The Guest needs devices to do anything useful. Since we don't let it touch * real devices (think of the damage it could do!) we provide virtual devices. - * We could emulate a PCI bus with various devices on it, but that is a fairly - * complex burden for the Host and suboptimal for the Guest, so we have our own - * simple lguest bus and we use "virtio" drivers. 
These drivers need a set of - routines from us which will actually do the virtual I/O, but they handle all - the net/block/console stuff themselves. This means that if we want to add - a new device, we simply need to write a new virtio driver and create support - for it in the Launcher: this code won't need to change. + * We emulate a PCI bus with virtio devices on it; we used to have our own + * lguest bus which was far simpler, but this tests the virtio 1.0 standard. * * Virtio devices are also used by kvm, so we can simply reuse their optimized * device drivers. And one day when everyone uses virtio, my plan will be * complete. Bwahahahah! - * - * Devices are described by a simplified ID, a status byte, and some "config" - * bytes which describe this device's configuration. This is placed by the - * Launcher just above the top of physical memory: - */ -struct lguest_device_desc { - /* The device type: console, network, disk etc. Type 0 terminates. */ - __u8 type; - /* The number of virtqueues (first in config array) */ - __u8 num_vq; - /* - * The number of bytes of feature bits. Multiply by 2: one for host - * features and one for Guest acknowledgements. - */ - __u8 feature_len; - /* The number of bytes of the config array after virtqueues. */ - __u8 config_len; - /* A status byte, written by the Guest. */ - __u8 status; - __u8 config[0]; -}; - -/*D:135 - * This is how we expect the device configuration field for a virtqueue - * to be laid out in config space. */ -struct lguest_vqconfig { - /* The number of entries in the virtio_ring */ - __u16 num; - /* The interrupt we get when something happens. */ - __u16 irq; - /* The page number of the virtio ring for this device. */ - __u32 pfn; -}; -/*:*/ /* Write command first word is a request. */ enum lguest_req @@ -80,10 +41,4 @@ struct lguest_pending { __u8 insn[7]; __u32 addr; }; - -/* - * The alignment to use between consumer and producer parts of vring. - * x86 pagesize for historical reasons. - */ -#define LGUEST_VRING_ALIGN 4096 #endif /* _LINUX_LGUEST_LAUNCHER */ -- cgit v1.2.3 From d9bab50aa46ce46dd4537d455eb13b200cdac516 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 11 Feb 2015 15:28:01 +1030 Subject: lguest: remove NOTIFY call and eventfd facility. Disappointing, as this was kind of neat (especially getting to use RCU to manage the address -> eventfd mapping). But now that the devices are PCI and handled in userspace, we can get rid of both the NOTIFY hypercall and the interface to connect an eventfd.
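To sketch what the Launcher side looks like once this lands (a hypothetical condensation of the run_guest() loop in tools/lguest/lguest.c; emulate_mmio() and emulate_insn() stand in for the userspace emulation this series enables, and guest-death handling is elided):

static void launcher_run(int lguest_fd, unsigned long cpu_id)
{
	struct lguest_pending pending;

	for (;;) {
		/* Running the Guest is a read(); it returns when a trap is pending. */
		int len = pread(lguest_fd, &pending, sizeof(pending), cpu_id);

		if (len != sizeof(pending))
			break;		/* e.g. ENOENT: the Guest died. */
		switch (pending.trap) {
		case 14:	/* page fault in the MMIO region */
			emulate_mmio(pending.addr, pending.insn);
			break;
		case 13:	/* general protection fault */
			emulate_insn(pending.insn);
			break;
		default:
			errx(1, "unhandled trap %i addr %#08x",
			     pending.trap, pending.addr);
		}
	}
}

Every exit reason now comes back through read() as a struct lguest_pending; there is no longer a side channel to an eventfd.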
Signed-off-by: Rusty Russell --- arch/x86/include/asm/lguest_hcall.h | 1 - drivers/lguest/core.c | 20 +--- drivers/lguest/hypercalls.c | 4 - drivers/lguest/lg.h | 12 --- drivers/lguest/lguest_user.c | 186 +----------------------------------- include/linux/lguest_launcher.h | 2 +- 6 files changed, 10 insertions(+), 215 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index 879fd7d33877..ef01fef3eebc 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h @@ -16,7 +16,6 @@ #define LHCALL_SET_PTE 14 #define LHCALL_SET_PGD 15 #define LHCALL_LOAD_TLS 16 -#define LHCALL_NOTIFY 17 #define LHCALL_LOAD_GDT_ENTRY 18 #define LHCALL_SEND_INTERRUPTS 19 diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index 9159dbc583f6..7dc93aa004c8 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -225,22 +225,12 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) if (cpu->hcall) do_hypercalls(cpu); - /* - * It's possible the Guest did a NOTIFY hypercall to the - * Launcher. - */ + /* Do we have to tell the Launcher about a trap? */ if (cpu->pending.trap) { - /* - * Does it just needs to write to a registered - * eventfd (ie. the appropriate virtqueue thread)? - */ - if (!send_notify_to_eventfd(cpu)) { - /* OK, we tell the main Launcher. */ - if (copy_to_user(user, &cpu->pending, - sizeof(cpu->pending))) - return -EFAULT; - return sizeof(cpu->pending); - } + if (copy_to_user(user, &cpu->pending, + sizeof(cpu->pending))) + return -EFAULT; + return sizeof(cpu->pending); } /* diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c index 5dd1fb8a6610..1219af493c0f 100644 --- a/drivers/lguest/hypercalls.c +++ b/drivers/lguest/hypercalls.c @@ -117,10 +117,6 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) /* Similarly, this sets the halted flag for run_guest(). */ cpu->halted = 1; break; - case LHCALL_NOTIFY: - cpu->pending.trap = LGUEST_TRAP_ENTRY; - cpu->pending.addr = args->arg1; - break; default: /* It should be an architecture-specific hypercall. */ if (lguest_arch_do_hcall(cpu, args)) diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index eb81abc05995..307e8b39e7d1 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -81,16 +81,6 @@ struct lg_cpu { struct lg_cpu_arch arch; }; -struct lg_eventfd { - unsigned long addr; - struct eventfd_ctx *event; -}; - -struct lg_eventfd_map { - unsigned int num; - struct lg_eventfd map[]; -}; - /* The private info the thread maintains about the guest. */ struct lguest { struct lguest_data __user *lguest_data; @@ -117,8 +107,6 @@ struct lguest { unsigned int stack_pages; u32 tsc_khz; - struct lg_eventfd_map *eventfds; - /* Dead? */ const char *dead; }; diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index c8b0e8575b44..c4c6113eb9a6 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -2,182 +2,20 @@ * launcher controls and communicates with the Guest. For example, * the first write will tell us the Guest's memory layout and entry * point. A read will run the Guest until something happens, such as - * a signal or the Guest doing a NOTIFY out to the Launcher. There is - * also a way for the Launcher to attach eventfds to particular NOTIFY - * values instead of returning from the read() call. + * a signal or the Guest accessing a device. 
:*/ #include #include #include #include -#include #include #include #include #include "lg.h" -/*L:056 - * Before we move on, let's jump ahead and look at what the kernel does when - * it needs to look up the eventfds. That will complete our picture of how we - * use RCU. - * - * The notification value is in cpu->pending_notify: we return true if it went - * to an eventfd. - */ -bool send_notify_to_eventfd(struct lg_cpu *cpu) -{ - unsigned int i; - struct lg_eventfd_map *map; - - /* We only connect LHCALL_NOTIFY to event fds, not other traps. */ - if (cpu->pending.trap != LGUEST_TRAP_ENTRY) - return false; - - /* - * This "rcu_read_lock()" helps track when someone is still looking at - * the (RCU-using) eventfds array. It's not actually a lock at all; - * indeed it's a noop in many configurations. (You didn't expect me to - * explain all the RCU secrets here, did you?) - */ - rcu_read_lock(); - /* - * rcu_dereference is the counter-side of rcu_assign_pointer(); it - * makes sure we don't access the memory pointed to by - * cpu->lg->eventfds before cpu->lg->eventfds is set. Sounds crazy, - * but Alpha allows this! Paul McKenney points out that a really - * aggressive compiler could have the same effect: - * http://lists.ozlabs.org/pipermail/lguest/2009-July/001560.html - * - * So play safe, use rcu_dereference to get the rcu-protected pointer: - */ - map = rcu_dereference(cpu->lg->eventfds); - /* - * Simple array search: even if they add an eventfd while we do this, - * we'll continue to use the old array and just won't see the new one. - */ - for (i = 0; i < map->num; i++) { - if (map->map[i].addr == cpu->pending.addr) { - eventfd_signal(map->map[i].event, 1); - cpu->pending.trap = 0; - break; - } - } - /* We're done with the rcu-protected variable cpu->lg->eventfds. */ - rcu_read_unlock(); - - /* If we cleared the notification, it's because we found a match. */ - return cpu->pending.trap == 0; -} - -/*L:055 - * One of the more tricksy tricks in the Linux Kernel is a technique called - * Read Copy Update. Since one point of lguest is to teach lguest journeyers - * about kernel coding, I use it here. (In case you're curious, other purposes - * include learning about virtualization and instilling a deep appreciation for - * simplicity and puppies). - * - * We keep a simple array which maps LHCALL_NOTIFY values to eventfds, but we - * add new eventfds without ever blocking readers from accessing the array. - * The current Launcher only does this during boot, so that never happens. But - * Read Copy Update is cool, and adding a lock risks damaging even more puppies - * than this code does. - * - * We allocate a brand new one-larger array, copy the old one and add our new - * element. Then we make the lg eventfd pointer point to the new array. - * That's the easy part: now we need to free the old one, but we need to make - * sure no slow CPU somewhere is still looking at it. That's what - * synchronize_rcu does for us: waits until every CPU has indicated that it has - * moved on to know it's no longer using the old one. - * - * If that's unclear, see http://en.wikipedia.org/wiki/Read-copy-update. - */ -static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) -{ - struct lg_eventfd_map *new, *old = lg->eventfds; - - /* - * We don't allow notifications on value 0 anyway (pending_notify of - * 0 means "nothing pending"). - */ - if (!addr) - return -EINVAL; - - /* - * Replace the old array with the new one, carefully: others can - * be accessing it at the same time. 
- */ - new = kmalloc(sizeof(*new) + sizeof(new->map[0]) * (old->num + 1), - GFP_KERNEL); - if (!new) - return -ENOMEM; - - /* First make identical copy. */ - memcpy(new->map, old->map, sizeof(old->map[0]) * old->num); - new->num = old->num; - - /* Now append new entry. */ - new->map[new->num].addr = addr; - new->map[new->num].event = eventfd_ctx_fdget(fd); - if (IS_ERR(new->map[new->num].event)) { - int err = PTR_ERR(new->map[new->num].event); - kfree(new); - return err; - } - new->num++; - - /* - * Now put new one in place: rcu_assign_pointer() is a fancy way of - * doing "lg->eventfds = new", but it uses memory barriers to make - * absolutely sure that the contents of "new" written above is nailed - * down before we actually do the assignment. - * - * We have to think about these kinds of things when we're operating on - * live data without locks. - */ - rcu_assign_pointer(lg->eventfds, new); - - /* - * We're not in a big hurry. Wait until no one's looking at old - * version, then free it. - */ - synchronize_rcu(); - kfree(old); - - return 0; -} - /*L:052 - * Receiving notifications from the Guest is usually done by attaching a - * particular LHCALL_NOTIFY value to an event filedescriptor. The eventfd will - * become readable when the Guest does an LHCALL_NOTIFY with that value. - * - * This is really convenient for processing each virtqueue in a separate - * thread. - */ -static int attach_eventfd(struct lguest *lg, const unsigned long __user *input) -{ - unsigned long addr, fd; - int err; - - if (get_user(addr, input) != 0) - return -EFAULT; - input++; - if (get_user(fd, input) != 0) - return -EFAULT; - - /* - * Just make sure two callers don't add eventfds at once. We really - * only need to lock against callers adding to the same Guest, so using - * the Big Lguest Lock is overkill. But this is setup, not a fast path. - */ - mutex_lock(&lguest_lock); - err = add_eventfd(lg, addr, fd); - mutex_unlock(&lguest_lock); - - return err; -} - -/* The Launcher can get the registers, and also set some of them. */ + The Launcher can get the registers, and also set some of them. +*/ static int getreg_setup(struct lg_cpu *cpu, const unsigned long __user *input) { unsigned long which; @@ -409,13 +247,6 @@ static int initialize(struct file *file, const unsigned long __user *input) goto unlock; } - lg->eventfds = kmalloc(sizeof(*lg->eventfds), GFP_KERNEL); - if (!lg->eventfds) { - err = -ENOMEM; - goto free_lg; - } - lg->eventfds->num = 0; - /* Populate the easy fields of our "struct lguest" */ lg->mem_base = (void __user *)args[0]; lg->pfn_limit = args[1]; @@ -424,7 +255,7 @@ static int initialize(struct file *file, const unsigned long __user *input) /* This is the first cpu (cpu 0) and it will start booting at args[2] */ err = lg_cpu_start(&lg->cpus[0], 0, args[2]); if (err) - goto free_eventfds; + goto free_lg; /* * Initialize the Guest's shadow page tables. 
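For reference, the eventfd(2) semantics that the removed attach_eventfd()/LHREQ_EVENTFD plumbing relied on can be demonstrated standalone. This is ordinary Linux userspace API usage, not lguest code: a write() adds to the kernel-side counter (what eventfd_signal() does from kernel context) and a read() returns and resets it:

#include <stdint.h>
#include <stdio.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
	int efd = eventfd(0, 0);	/* kernel-maintained 64-bit counter */
	uint64_t val = 1;

	if (efd < 0) {
		perror("eventfd");
		return 1;
	}
	/* Two signals: each write() adds to the counter. */
	write(efd, &val, sizeof(val));
	write(efd, &val, sizeof(val));

	/* The waiter's read() returns the accumulated count and resets it. */
	read(efd, &val, sizeof(val));
	printf("woke with count %llu\n", (unsigned long long)val);

	close(efd);
	return 0;
}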
This allocates @@ -445,8 +276,6 @@ static int initialize(struct file *file, const unsigned long __user *input) free_regs: /* FIXME: This should be in free_vcpu */ free_page(lg->cpus[0].regs_page); -free_eventfds: - kfree(lg->eventfds); free_lg: kfree(lg); unlock: @@ -499,8 +328,6 @@ static ssize_t write(struct file *file, const char __user *in, return initialize(file, input); case LHREQ_IRQ: return user_send_irq(cpu, input); - case LHREQ_EVENTFD: - return attach_eventfd(lg, input); case LHREQ_GETREG: return getreg_setup(cpu, input); case LHREQ_SETREG: @@ -551,11 +378,6 @@ static int close(struct inode *inode, struct file *file) mmput(lg->cpus[i].mm); } - /* Release any eventfds they registered. */ - for (i = 0; i < lg->eventfds->num; i++) - eventfd_ctx_put(lg->eventfds->map[i].event); - kfree(lg->eventfds); - /* * If lg->dead doesn't contain an error code it will be NULL or a * kmalloc()ed string, either of which is ok to hand to kfree(). diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h index 677cde735d4b..acd5b12565cc 100644 --- a/include/linux/lguest_launcher.h +++ b/include/linux/lguest_launcher.h @@ -23,7 +23,7 @@ enum lguest_req LHREQ_GETDMA, /* No longer used */ LHREQ_IRQ, /* + irq */ LHREQ_BREAK, /* No longer used */ - LHREQ_EVENTFD, /* + address, fd. */ + LHREQ_EVENTFD, /* No longer used. */ LHREQ_GETREG, /* + offset within struct pt_regs (then read value). */ LHREQ_SETREG, /* + offset within struct pt_regs, value. */ LHREQ_TRAP, /* + trap number to deliver to guest. */ -- cgit v1.2.3 From d427e3c82ef4fc5fbb22c0cef0b040e6767b1028 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Feb 2015 14:07:50 +0100 Subject: block: remove unused function blk_bio_map_sg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-merge.c | 29 ----------------------------- include/linux/blkdev.h | 2 -- 2 files changed, 31 deletions(-) (limited to 'include/linux') diff --git a/block/blk-merge.c b/block/blk-merge.c index 9476b1528ded..fc1ff3b1ea1f 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -283,35 +283,6 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, } EXPORT_SYMBOL(blk_rq_map_sg); -/** - * blk_bio_map_sg - map a bio to a scatterlist - * @q: request_queue in question - * @bio: bio being mapped - * @sglist: scatterlist being mapped - * - * Note: - * Caller must make sure sg can hold bio->bi_phys_segments entries - * - * Will return the number of sg entries setup - */ -int blk_bio_map_sg(struct request_queue *q, struct bio *bio, - struct scatterlist *sglist) -{ - struct scatterlist *sg = NULL; - int nsegs; - struct bio *next = bio->bi_next; - bio->bi_next = NULL; - - nsegs = __blk_bios_map_sg(q, bio, sglist, &sg); - bio->bi_next = next; - if (sg) - sg_mark_end(sg); - - BUG_ON(bio->bi_phys_segments && nsegs > bio->bi_phys_segments); - return nsegs; -} -EXPORT_SYMBOL(blk_bio_map_sg); - static inline int ll_new_hw_segment(struct request_queue *q, struct request *req, struct bio *bio) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bf4ef666d191..7f9a516f24de 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1049,8 +1049,6 @@ extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable); extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *); -extern int blk_bio_map_sg(struct request_queue *q, struct bio *bio, - struct scatterlist *sglist); extern 
void blk_dump_rq_flags(struct request *, char *); extern long nr_blockdev_pages(void); -- cgit v1.2.3 From f7219b527b5710ae0c4add8faa4d0ea529722cb5 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 10 Feb 2015 12:55:03 -0800 Subject: treewide: Remove unnecessary BCMA_CORETABLE_END macro Use the normal {} instead of a macro to terminate an array. Remove the macro too. Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bgmac.c | 2 +- drivers/net/wireless/b43/main.c | 2 +- drivers/net/wireless/brcm80211/brcmsmac/mac80211_if.c | 2 +- drivers/spi/spi-bcm53xx.c | 2 +- drivers/usb/host/bcma-hcd.c | 2 +- include/linux/mod_devicetable.h | 2 -- 6 files changed, 5 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/broadcom/bgmac.c b/drivers/net/ethernet/broadcom/bgmac.c index 3007d95fbb9f..ea63cb085d19 100644 --- a/drivers/net/ethernet/broadcom/bgmac.c +++ b/drivers/net/ethernet/broadcom/bgmac.c @@ -21,7 +21,7 @@ static const struct bcma_device_id bgmac_bcma_tbl[] = { BCMA_CORE(BCMA_MANUF_BCM, BCMA_CORE_4706_MAC_GBIT, BCMA_ANY_REV, BCMA_ANY_CLASS), BCMA_CORE(BCMA_MANUF_BCM, BCMA_CORE_MAC_GBIT, BCMA_ANY_REV, BCMA_ANY_CLASS), - BCMA_CORETABLE_END + {}, }; MODULE_DEVICE_TABLE(bcma, bgmac_bcma_tbl); diff --git a/drivers/net/wireless/b43/main.c b/drivers/net/wireless/b43/main.c index 2c9088633ec6..6acb2b51ab8f 100644 --- a/drivers/net/wireless/b43/main.c +++ b/drivers/net/wireless/b43/main.c @@ -127,7 +127,7 @@ static const struct bcma_device_id b43_bcma_tbl[] = { BCMA_CORE(BCMA_MANUF_BCM, BCMA_CORE_80211, 0x1E, BCMA_ANY_CLASS), BCMA_CORE(BCMA_MANUF_BCM, BCMA_CORE_80211, 0x28, BCMA_ANY_CLASS), BCMA_CORE(BCMA_MANUF_BCM, BCMA_CORE_80211, 0x2A, BCMA_ANY_CLASS), - BCMA_CORETABLE_END + {}, }; MODULE_DEVICE_TABLE(bcma, b43_bcma_tbl); #endif diff --git a/drivers/net/wireless/brcm80211/brcmsmac/mac80211_if.c b/drivers/net/wireless/brcm80211/brcmsmac/mac80211_if.c index f95b52442281..48135063347e 100644 --- a/drivers/net/wireless/brcm80211/brcmsmac/mac80211_if.c +++ b/drivers/net/wireless/brcm80211/brcmsmac/mac80211_if.c @@ -99,7 +99,7 @@ static struct bcma_device_id brcms_coreid_table[] = { BCMA_CORE(BCMA_MANUF_BCM, BCMA_CORE_80211, 17, BCMA_ANY_CLASS), BCMA_CORE(BCMA_MANUF_BCM, BCMA_CORE_80211, 23, BCMA_ANY_CLASS), BCMA_CORE(BCMA_MANUF_BCM, BCMA_CORE_80211, 24, BCMA_ANY_CLASS), - BCMA_CORETABLE_END + {}, }; MODULE_DEVICE_TABLE(bcma, brcms_coreid_table); diff --git a/drivers/spi/spi-bcm53xx.c b/drivers/spi/spi-bcm53xx.c index 17b34cbadc03..3fb91c81015a 100644 --- a/drivers/spi/spi-bcm53xx.c +++ b/drivers/spi/spi-bcm53xx.c @@ -216,7 +216,7 @@ static struct spi_board_info bcm53xx_info = { static const struct bcma_device_id bcm53xxspi_bcma_tbl[] = { BCMA_CORE(BCMA_MANUF_BCM, BCMA_CORE_NS_QSPI, BCMA_ANY_REV, BCMA_ANY_CLASS), - BCMA_CORETABLE_END + {}, }; MODULE_DEVICE_TABLE(bcma, bcm53xxspi_bcma_tbl); diff --git a/drivers/usb/host/bcma-hcd.c b/drivers/usb/host/bcma-hcd.c index cd6d0afb6b8f..526cfab41d5f 100644 --- a/drivers/usb/host/bcma-hcd.c +++ b/drivers/usb/host/bcma-hcd.c @@ -306,7 +306,7 @@ static int bcma_hcd_resume(struct bcma_device *dev) static const struct bcma_device_id bcma_hcd_table[] = { BCMA_CORE(BCMA_MANUF_BCM, BCMA_CORE_USB20_HOST, BCMA_ANY_REV, BCMA_ANY_CLASS), - BCMA_CORETABLE_END + {}, }; MODULE_DEVICE_TABLE(bcma, bcma_hcd_table); diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 745def862580..6711d57f9c05 100644 --- a/include/linux/mod_devicetable.h +++ 
b/include/linux/mod_devicetable.h @@ -381,8 +381,6 @@ struct bcma_device_id { } __attribute__((packed,aligned(2))); #define BCMA_CORE(_manuf, _id, _rev, _class) \ { .manuf = _manuf, .id = _id, .rev = _rev, .class = _class, } -#define BCMA_CORETABLE_END \ - { 0, }, #define BCMA_ANY_MANUF 0xFFFF #define BCMA_ANY_ID 0xFFFF -- cgit v1.2.3 From 673e2baaa6d986e2fcd9c867661d8113f6c7dc7b Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 10 Feb 2015 13:19:24 -0800 Subject: treewide: Remove unnecessary SSB_DEVTABLE_END macro Use the normal {} instead of a macro to terminate an array. Remove the macro too. Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/b44.c | 2 +- drivers/net/wireless/b43/main.c | 2 +- drivers/net/wireless/b43legacy/main.c | 2 +- drivers/ssb/driver_gige.c | 2 +- drivers/usb/host/ssb-hcd.c | 2 +- include/linux/mod_devicetable.h | 2 -- 6 files changed, 5 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/broadcom/b44.c b/drivers/net/ethernet/broadcom/b44.c index d86d6baf9681..bd5916a60cb5 100644 --- a/drivers/net/ethernet/broadcom/b44.c +++ b/drivers/net/ethernet/broadcom/b44.c @@ -121,7 +121,7 @@ static struct pci_driver b44_pci_driver = { static const struct ssb_device_id b44_ssb_tbl[] = { SSB_DEVICE(SSB_VENDOR_BROADCOM, SSB_DEV_ETHERNET, SSB_ANY_REV), - SSB_DEVTABLE_END + {}, }; MODULE_DEVICE_TABLE(ssb, b44_ssb_tbl); diff --git a/drivers/net/wireless/b43/main.c b/drivers/net/wireless/b43/main.c index 6acb2b51ab8f..ccbdb05b28cd 100644 --- a/drivers/net/wireless/b43/main.c +++ b/drivers/net/wireless/b43/main.c @@ -144,7 +144,7 @@ static const struct ssb_device_id b43_ssb_tbl[] = { SSB_DEVICE(SSB_VENDOR_BROADCOM, SSB_DEV_80211, 13), SSB_DEVICE(SSB_VENDOR_BROADCOM, SSB_DEV_80211, 15), SSB_DEVICE(SSB_VENDOR_BROADCOM, SSB_DEV_80211, 16), - SSB_DEVTABLE_END + {}, }; MODULE_DEVICE_TABLE(ssb, b43_ssb_tbl); #endif diff --git a/drivers/net/wireless/b43legacy/main.c b/drivers/net/wireless/b43legacy/main.c index 1aec2146a2bf..4e58c0069830 100644 --- a/drivers/net/wireless/b43legacy/main.c +++ b/drivers/net/wireless/b43legacy/main.c @@ -86,7 +86,7 @@ MODULE_PARM_DESC(fwpostfix, "Postfix for the firmware files to load."); static const struct ssb_device_id b43legacy_ssb_tbl[] = { SSB_DEVICE(SSB_VENDOR_BROADCOM, SSB_DEV_80211, 2), SSB_DEVICE(SSB_VENDOR_BROADCOM, SSB_DEV_80211, 4), - SSB_DEVTABLE_END + {}, }; MODULE_DEVICE_TABLE(ssb, b43legacy_ssb_tbl); diff --git a/drivers/ssb/driver_gige.c b/drivers/ssb/driver_gige.c index 21f71a1581fa..e9734051e3c4 100644 --- a/drivers/ssb/driver_gige.c +++ b/drivers/ssb/driver_gige.c @@ -24,7 +24,7 @@ MODULE_LICENSE("GPL"); static const struct ssb_device_id ssb_gige_tbl[] = { SSB_DEVICE(SSB_VENDOR_BROADCOM, SSB_DEV_ETHERNET_GBIT, SSB_ANY_REV), - SSB_DEVTABLE_END + {}, }; /* MODULE_DEVICE_TABLE(ssb, ssb_gige_tbl); */ diff --git a/drivers/usb/host/ssb-hcd.c b/drivers/usb/host/ssb-hcd.c index 0196f766df73..ffc32f4b1b1b 100644 --- a/drivers/usb/host/ssb-hcd.c +++ b/drivers/usb/host/ssb-hcd.c @@ -251,7 +251,7 @@ static const struct ssb_device_id ssb_hcd_table[] = { SSB_DEVICE(SSB_VENDOR_BROADCOM, SSB_DEV_USB11_HOSTDEV, SSB_ANY_REV), SSB_DEVICE(SSB_VENDOR_BROADCOM, SSB_DEV_USB11_HOST, SSB_ANY_REV), SSB_DEVICE(SSB_VENDOR_BROADCOM, SSB_DEV_USB20_HOST, SSB_ANY_REV), - SSB_DEVTABLE_END + {}, }; MODULE_DEVICE_TABLE(ssb, ssb_hcd_table); diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 6711d57f9c05..2bb97175b6cc 100644 --- 
a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -365,8 +365,6 @@ struct ssb_device_id { } __attribute__((packed, aligned(2))); #define SSB_DEVICE(_vendor, _coreid, _revision) \ { .vendor = _vendor, .coreid = _coreid, .revision = _revision, } -#define SSB_DEVTABLE_END \ - { 0, }, #define SSB_ANY_VENDOR 0xFFFF #define SSB_ANY_ID 0xFFFF -- cgit v1.2.3 From 26c4f7da3e413da697a7beb22ad496390eda7da0 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 10 Feb 2015 16:30:27 -0800 Subject: net: Fix remcsum in GRO path to not change packet Remote checksum offload processing is currently the same for both the GRO and non-GRO path. When the remote checksum offload option is encountered, the checksum field referred to is modified in the packet. So in the GRO case, the packet is modified in the GRO path and then the operation is skipped when the packet goes through the normal path based on skb->remcsum_offload. There is a problem in that the packet may be modified in the GRO path, but then forwarded off host still containing the remote checksum option. A remote host will again perform RCO but now the checksum verification will fail since GRO RCO already modified the checksum. To fix this, we ensure that GRO restores a packet to its original state before returning. In this model, when GRO processes a remote checksum option it still changes the checksum per the algorithm but on return from lower layer processing the checksum is restored to its original value. In this patch we define a gro_remcsum structure which is passed to skb_gro_remcsum_process to save offset and delta for the checksum being changed. After lower layer processing, skb_gro_remcsum_cleanup is called to restore the checksum before returning from GRO. Signed-off-by: Tom Herbert Signed-off-by: David S.
Miller --- drivers/net/vxlan.c | 19 +++++++++---------- include/linux/netdevice.h | 25 +++++++++++++++++++++++-- include/net/checksum.h | 5 +++++ net/ipv4/fou.c | 20 ++++++++++---------- 4 files changed, 47 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 0e57e862c399..30310a63475a 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -555,12 +555,12 @@ static int vxlan_fdb_append(struct vxlan_fdb *f, static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb, unsigned int off, struct vxlanhdr *vh, size_t hdrlen, - u32 data) + u32 data, struct gro_remcsum *grc) { size_t start, offset, plen; if (skb->remcsum_offload) - return vh; + return NULL; if (!NAPI_GRO_CB(skb)->csum_valid) return NULL; @@ -579,7 +579,8 @@ static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb, return NULL; } - skb_gro_remcsum_process(skb, (void *)vh + hdrlen, start, offset); + skb_gro_remcsum_process(skb, (void *)vh + hdrlen, + start, offset, grc); skb->remcsum_offload = 1; @@ -597,6 +598,9 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, struct vxlan_sock *vs = container_of(uoff, struct vxlan_sock, udp_offloads); u32 flags; + struct gro_remcsum grc; + + skb_gro_remcsum_init(&grc); off_vx = skb_gro_offset(skb); hlen = off_vx + sizeof(*vh); @@ -614,7 +618,7 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) { vh = vxlan_gro_remcsum(skb, off_vx, vh, sizeof(struct vxlanhdr), - ntohl(vh->vx_vni)); + ntohl(vh->vx_vni), &grc); if (!vh) goto out; @@ -637,6 +641,7 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, pp = eth_gro_receive(head, skb); out: + skb_gro_remcsum_cleanup(skb, &grc); NAPI_GRO_CB(skb)->flush |= flush; return pp; @@ -1154,12 +1159,6 @@ static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh, { size_t start, offset, plen; - if (skb->remcsum_offload) { - /* Already processed in GRO path */ - skb->remcsum_offload = 0; - return vh; - } - start = (data & VXLAN_RCO_MASK) << VXLAN_RCO_SHIFT; offset = start + ((data & VXLAN_RCO_UDP) ? 
offsetof(struct udphdr, check) : diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d115256ed5a2..3aa02457fe4f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2321,8 +2321,19 @@ do { \ compute_pseudo(skb, proto)); \ } while (0) +struct gro_remcsum { + int offset; + __wsum delta; +}; + +static inline void skb_gro_remcsum_init(struct gro_remcsum *grc) +{ + grc->delta = 0; +} + static inline void skb_gro_remcsum_process(struct sk_buff *skb, void *ptr, - int start, int offset) + int start, int offset, + struct gro_remcsum *grc) { __wsum delta; @@ -2331,10 +2342,20 @@ static inline void skb_gro_remcsum_process(struct sk_buff *skb, void *ptr, delta = remcsum_adjust(ptr, NAPI_GRO_CB(skb)->csum, start, offset); /* Adjust skb->csum since we changed the packet */ - skb->csum = csum_add(skb->csum, delta); NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta); + + grc->offset = (ptr + offset) - (void *)skb->head; + grc->delta = delta; } +static inline void skb_gro_remcsum_cleanup(struct sk_buff *skb, + struct gro_remcsum *grc) +{ + if (!grc->delta) + return; + + remcsum_unadjust((__sum16 *)(skb->head + grc->offset), grc->delta); +} static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, diff --git a/include/net/checksum.h b/include/net/checksum.h index e339a9513e29..0a55ac715077 100644 --- a/include/net/checksum.h +++ b/include/net/checksum.h @@ -167,4 +167,9 @@ static inline __wsum remcsum_adjust(void *ptr, __wsum csum, return delta; } +static inline void remcsum_unadjust(__sum16 *psum, __wsum delta) +{ + *psum = csum_fold(csum_sub(delta, *psum)); +} + #endif diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 92ddea1e6457..7fa8d36e56d4 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -71,12 +71,6 @@ static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr, size_t offset = ntohs(pd[1]); size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start); - if (skb->remcsum_offload) { - /* Already processed in GRO path */ - skb->remcsum_offload = 0; - return guehdr; - } - if (!pskb_may_pull(skb, plen)) return NULL; guehdr = (struct guehdr *)&udp_hdr(skb)[1]; @@ -214,7 +208,8 @@ out_unlock: static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off, struct guehdr *guehdr, void *data, - size_t hdrlen, u8 ipproto) + size_t hdrlen, u8 ipproto, + struct gro_remcsum *grc) { __be16 *pd = data; size_t start = ntohs(pd[0]); @@ -222,7 +217,7 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off, size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start); if (skb->remcsum_offload) - return guehdr; + return NULL; if (!NAPI_GRO_CB(skb)->csum_valid) return NULL; @@ -234,7 +229,8 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off, return NULL; } - skb_gro_remcsum_process(skb, (void *)guehdr + hdrlen, start, offset); + skb_gro_remcsum_process(skb, (void *)guehdr + hdrlen, + start, offset, grc); skb->remcsum_offload = 1; @@ -254,6 +250,9 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, void *data; u16 doffset = 0; int flush = 1; + struct gro_remcsum grc; + + skb_gro_remcsum_init(&grc); off = skb_gro_offset(skb); len = off + sizeof(*guehdr); @@ -295,7 +294,7 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, if (flags & GUE_PFLAG_REMCSUM) { guehdr = gue_gro_remcsum(skb, off, guehdr, data + doffset, hdrlen, - guehdr->proto_ctype); + guehdr->proto_ctype, &grc); if (!guehdr) goto out; @@ 
-345,6 +344,7 @@ out_unlock: rcu_read_unlock(); out: NAPI_GRO_CB(skb)->flush |= flush; + skb_gro_remcsum_cleanup(skb, &grc); return pp; } -- cgit v1.2.3 From 6edec0e61b1e52f9144935d28c9088f5b202e61c Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 10 Feb 2015 16:30:28 -0800 Subject: net: Clarify meaning of CHECKSUM_PARTIAL for receive path The current meaning of CHECKSUM_PARTIAL for validating checksums is that _all_ checksums in the packet are considered valid. However, given the manner in which CHECKSUM_PARTIAL is set, only the checksum at csum_start+csum_offset and any preceding checksums may be considered valid. If there are checksums in the packet after csum_offset it is possible they have not been verified. This patch changes CHECKSUM_PARTIAL logic in skb_csum_unnecessary and __skb_gro_checksum_validate_needed to only consider checksums referring to csum_start and any preceding checksums (with starting offset before csum_start) to be verified. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 +++- include/linux/skbuff.h | 17 ++++++++++++----- 2 files changed, 15 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3aa02457fe4f..9e9be221e4bf 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2246,7 +2246,9 @@ static inline bool __skb_gro_checksum_validate_needed(struct sk_buff *skb, bool zero_okay, __sum16 check) { - return (skb->ip_summed != CHECKSUM_PARTIAL && + return ((skb->ip_summed != CHECKSUM_PARTIAL || + skb_checksum_start_offset(skb) < + skb_gro_offset(skb)) && NAPI_GRO_CB(skb)->csum_cnt == 0 && (!zero_okay || check)); } diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 1bb36edb66b9..da6028a50687 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -83,11 +83,15 @@ * * CHECKSUM_PARTIAL: * - * This is identical to the case for output below. This may occur on a packet + * A checksum is set up to be offloaded to a device as described in the + * output description for CHECKSUM_PARTIAL. This may occur on a packet * received directly from another Linux OS, e.g., a virtualized Linux kernel - * on the same host. The packet can be treated in the same way as - * CHECKSUM_UNNECESSARY, except that on output (i.e., forwarding) the - * checksum must be filled in by the OS or the hardware. + * on the same host, or it may be set in the input path in GRO or remote + * checksum offload. For the purposes of checksum verification, the checksum + * referred to by skb->csum_start + skb->csum_offset and any preceding + * checksums in the packet are considered verified. Any checksums in the + * packet that are after the checksum being offloaded are not considered to + * be verified. * * B. Checksumming on output. * @@ -2915,7 +2919,10 @@ __sum16 __skb_checksum_complete(struct sk_buff *skb); static inline int skb_csum_unnecessary(const struct sk_buff *skb) { - return ((skb->ip_summed & CHECKSUM_UNNECESSARY) || skb->csum_valid); + return ((skb->ip_summed == CHECKSUM_UNNECESSARY) || + skb->csum_valid || + (skb->ip_summed == CHECKSUM_PARTIAL && + skb_checksum_start_offset(skb) >= 0)); } /** -- cgit v1.2.3 From baa32ff42871f2d4aca9c08c9403d0e497325564 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 10 Feb 2015 16:30:30 -0800 Subject: net: Use more bit fields in napi_gro_cb This patch moves the free and same_flow fields to be bit fields (2 and 1 bit sized respectively). This frees up some space for u16's.
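The space saving works because C packs adjacent bit fields into shared storage units. A standalone sketch (cb_before/cb_after are invented names; exact sizes are ABI-dependent, but the shrink mirrors what happens in napi_gro_cb):

#include <stdint.h>
#include <stdio.h>

/* Every flag as a whole byte: each u8 occupies its own byte. */
struct cb_before {
	uint16_t count;
	uint8_t same_flow;	/* only ever 0 or 1 */
	uint8_t free;		/* only ever 0..2  */
	uint8_t udp_mark;	/* only ever 0 or 1 */
	uint8_t is_ipv6;	/* only ever 0 or 1 */
};

/* The same flags as bit fields: all five bits share one byte. */
struct cb_after {
	uint16_t count;
	uint8_t same_flow:1;
	uint8_t free:2;
	uint8_t udp_mark:1;
	uint8_t is_ipv6:1;
	/* 3 bits still free for future flags */
};

int main(void)
{
	/* Typically prints 6 vs 4 on common ABIs. */
	printf("before=%zu after=%zu\n",
	       sizeof(struct cb_before), sizeof(struct cb_after));
	return 0;
}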
Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdevice.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9e9be221e4bf..43fd0a4dcd04 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1923,20 +1923,15 @@ struct napi_gro_cb { /* Number of segments aggregated. */ u16 count; - /* This is non-zero if the packet may be of the same flow. */ - u8 same_flow; - - /* Free the skb? */ - u8 free; -#define NAPI_GRO_FREE 1 -#define NAPI_GRO_FREE_STOLEN_HEAD 2 - /* jiffies when first packet was created/queued */ unsigned long age; /* Used in ipv6_gro_receive() and foo-over-udp */ u16 proto; + /* This is non-zero if the packet may be of the same flow. */ + u8 same_flow:1; + /* Used in udp_gro_receive */ u8 udp_mark:1; @@ -1946,9 +1941,16 @@ struct napi_gro_cb { /* Number of checksums via CHECKSUM_UNNECESSARY */ u8 csum_cnt:3; + /* Free the skb? */ + u8 free:2; +#define NAPI_GRO_FREE 1 +#define NAPI_GRO_FREE_STOLEN_HEAD 2 + /* Used in foo-over-udp, set in udp[46]_gro_receive */ u8 is_ipv6:1; + /* 7 bit hole */ + /* used to support CHECKSUM_COMPLETE for tunneling protocols */ __wsum csum; -- cgit v1.2.3 From 15e2396d4e3ce23188852b74d924107982c63b42 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 10 Feb 2015 16:30:31 -0800 Subject: net: Infrastructure for CHECKSUM_PARTIAL with remote checksum offload This patch adds infrastructure so that remote checksum offload can set CHECKSUM_PARTIAL instead of calling csum_partial and writing the modified checksum field. Add skb_remcsum_adjust_partial function to set an skb for using CHECKSUM_PARTIAL with remote checksum offload. Changed skb_remcsum_process and skb_gro_remcsum_process to take a boolean argument to indicate whether CHECKSUM_PARTIAL can be set or the checksum needs to be modified using the normal algorithm. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 4 ++-- include/linux/netdevice.h | 19 ++++++++++++++++++- include/linux/skbuff.h | 15 ++++++++++++++- net/core/dev.c | 1 + net/ipv4/fou.c | 4 ++-- 5 files changed, 37 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 30310a63475a..4f04443cfd33 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -580,7 +580,7 @@ static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb, } skb_gro_remcsum_process(skb, (void *)vh + hdrlen, - start, offset, grc); + start, offset, grc, true); skb->remcsum_offload = 1; @@ -1171,7 +1171,7 @@ static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh, vh = (struct vxlanhdr *)(udp_hdr(skb) + 1); - skb_remcsum_process(skb, (void *)vh + hdrlen, start, offset); + skb_remcsum_process(skb, (void *)vh + hdrlen, start, offset, true); return vh; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 43fd0a4dcd04..5897b4ea5a3f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1923,6 +1923,9 @@ struct napi_gro_cb { /* Number of segments aggregated.
*/ u16 count; + /* Start offset for remote checksum offload */ + u16 gro_remcsum_start; + /* jiffies when first packet was created/queued */ unsigned long age; @@ -2244,6 +2247,12 @@ static inline void skb_gro_postpull_rcsum(struct sk_buff *skb, __sum16 __skb_gro_checksum_complete(struct sk_buff *skb); +static inline bool skb_at_gro_remcsum_start(struct sk_buff *skb) +{ + return (NAPI_GRO_CB(skb)->gro_remcsum_start - skb_headroom(skb) == + skb_gro_offset(skb)); +} + static inline bool __skb_gro_checksum_validate_needed(struct sk_buff *skb, bool zero_okay, __sum16 check) @@ -2251,6 +2260,7 @@ static inline bool __skb_gro_checksum_validate_needed(struct sk_buff *skb, return ((skb->ip_summed != CHECKSUM_PARTIAL || skb_checksum_start_offset(skb) < skb_gro_offset(skb)) && + !skb_at_gro_remcsum_start(skb) && NAPI_GRO_CB(skb)->csum_cnt == 0 && (!zero_okay || check)); } @@ -2337,12 +2347,19 @@ static inline void skb_gro_remcsum_init(struct gro_remcsum *grc) static inline void skb_gro_remcsum_process(struct sk_buff *skb, void *ptr, int start, int offset, - struct gro_remcsum *grc) + struct gro_remcsum *grc, + bool nopartial) { __wsum delta; BUG_ON(!NAPI_GRO_CB(skb)->csum_valid); + if (!nopartial) { + NAPI_GRO_CB(skb)->gro_remcsum_start = + ((unsigned char *)ptr + start) - skb->head; + return; + } + delta = remcsum_adjust(ptr, NAPI_GRO_CB(skb)->csum, start, offset); /* Adjust skb->csum since we changed the packet */ diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index da6028a50687..30007afe70b3 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3104,16 +3104,29 @@ do { \ compute_pseudo(skb, proto)); \ } while (0) +static inline void skb_remcsum_adjust_partial(struct sk_buff *skb, void *ptr, + u16 start, u16 offset) +{ + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = ((unsigned char *)ptr + start) - skb->head; + skb->csum_offset = offset - start; +} + /* Update skbuf and packet to reflect the remote checksum offload operation. * When called, ptr indicates the starting point for skb->csum when * ip_summed is CHECKSUM_COMPLETE. If we need create checksum complete * here, skb_postpull_rcsum is done so skb->csum start is ptr. 
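The delta bookkeeping above is one's-complement checksum arithmetic. A self-contained model of the incremental-update idea behind remcsum_adjust()/remcsum_unadjust(): RFC 1071 folding plus the RFC 1624 update formula, with invented helper names rather than the kernel's csum API:

#include <stdint.h>
#include <stdio.h>

/* RFC 1071: one's-complement sum over 16-bit words, then invert. */
static uint16_t csum(const uint16_t *p, size_t n)
{
	uint32_t sum = 0;

	while (n--) {
		sum += *p++;
		sum = (sum & 0xffff) + (sum >> 16);	/* end-around carry */
	}
	return (uint16_t)~sum;
}

/* RFC 1624 incremental update: HC' = ~(~HC + ~m + m') */
static uint16_t csum_update(uint16_t hc, uint16_t m_old, uint16_t m_new)
{
	uint32_t sum = (uint16_t)~hc;

	sum += (uint16_t)~m_old;
	sum += m_new;
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	uint16_t pkt[8] = { 0x1234, 0xabcd, 0x0000, 0x0f0f,
			    0x00ff, 0x4242, 0x0007, 0x0009 };
	uint16_t hc = csum(pkt, 8);
	uint16_t old = pkt[3];

	pkt[3] = 0xbeef;	/* rewrite one field, as remcsum does */
	hc = csum_update(hc, old, pkt[3]);

	/* A full recompute agrees with the incremental update. */
	printf("full=%04x incremental=%04x\n", csum(pkt, 8), hc);
	return 0;
}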
*/ static inline void skb_remcsum_process(struct sk_buff *skb, void *ptr, - int start, int offset) + int start, int offset, bool nopartial) { __wsum delta; + if (!nopartial) { + skb_remcsum_adjust_partial(skb, ptr, start, offset); + return; + } + if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE)) { __skb_checksum_complete(skb); skb_postpull_rcsum(skb, skb->data, ptr - (void *)skb->data); diff --git a/net/core/dev.c b/net/core/dev.c index d030575532a2..48c6ecb18f3c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4024,6 +4024,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff NAPI_GRO_CB(skb)->flush = 0; NAPI_GRO_CB(skb)->free = 0; NAPI_GRO_CB(skb)->udp_mark = 0; + NAPI_GRO_CB(skb)->gro_remcsum_start = 0; /* Setup for GRO checksum validation */ switch (skb->ip_summed) { diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 7fa8d36e56d4..d320f575cc62 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -75,7 +75,7 @@ static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr, return NULL; guehdr = (struct guehdr *)&udp_hdr(skb)[1]; - skb_remcsum_process(skb, (void *)guehdr + hdrlen, start, offset); + skb_remcsum_process(skb, (void *)guehdr + hdrlen, start, offset, true); return guehdr; } @@ -230,7 +230,7 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off, } skb_gro_remcsum_process(skb, (void *)guehdr + hdrlen, - start, offset, grc); + start, offset, grc, true); skb->remcsum_offload = 1; -- cgit v1.2.3 From 119ee9144534141822462e3e8a5ccc8dc537f712 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 29 Jan 2015 11:45:33 -0800 Subject: f2fs: split UMOUNT and FASTBOOT flags This patch adds a FASTBOOT flag to the checkpoint as follows. - CP_UMOUNT_FLAG is set when the system is umounted. - CP_FASTBOOT_FLAG is set when an intermediate checkpoint containing node summaries was written. So, if you get CP_UMOUNT_FLAG from the checkpoint, the system was umounted cleanly. Instead, if there was a sudden power-off, you can get CP_FASTBOOT_FLAG or nothing.
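The reason-to-flag mapping in this patch is a small state machine and can be sketched in plain C. A model using the CP_FASTBOOT_FLAG value from the diff below; the CP_UMOUNT_FLAG value (0x01 here) is assumed for illustration since it does not appear in this patch:

#include <stdio.h>

enum cp_reason { CP_UMOUNT, CP_FASTBOOT, CP_SYNC, CP_DISCARD };

#define CP_FASTBOOT_FLAG	0x00000020u	/* value from the diff */
#define CP_UMOUNT_FLAG		0x00000001u	/* assumed for illustration */

/* Each checkpoint records exactly one of the two flags (or neither). */
static unsigned int ckpt_flags(enum cp_reason reason, unsigned int flags)
{
	if (reason == CP_UMOUNT)
		flags |= CP_UMOUNT_FLAG;
	else
		flags &= ~CP_UMOUNT_FLAG;

	if (reason == CP_FASTBOOT)
		flags |= CP_FASTBOOT_FLAG;
	else
		flags &= ~CP_FASTBOOT_FLAG;
	return flags;
}

/* Node summaries are written for both umount and fastboot checkpoints. */
static int remain_node_summaries(enum cp_reason reason)
{
	return reason == CP_UMOUNT || reason == CP_FASTBOOT;
}

int main(void)
{
	unsigned int f = ckpt_flags(CP_FASTBOOT, 0);

	printf("fastboot: flags=%#x keep_summaries=%d\n",
	       f, remain_node_summaries(CP_FASTBOOT));
	f = ckpt_flags(CP_UMOUNT, f);	/* a later clean umount */
	printf("umount:   flags=%#x\n", f);
	return 0;
}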
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 19 +++++++++++++------ fs/f2fs/f2fs.h | 23 +++++++++++++++++++++++ fs/f2fs/gc.c | 3 +-- fs/f2fs/segment.c | 11 +++++------ fs/f2fs/super.c | 5 ++--- include/linux/f2fs_fs.h | 1 + include/trace/events/f2fs.h | 1 + 7 files changed, 46 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 22165fb1d0d1..f7cdcad31943 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -956,17 +956,24 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks + orphan_blocks); - if (cpc->reason == CP_UMOUNT) { - set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); + if (__remain_node_summaries(cpc->reason)) ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS+ cp_payload_blks + data_sum_blocks + orphan_blocks + NR_CURSEG_NODE_TYPE); - } else { - clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); + else ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS + cp_payload_blks + data_sum_blocks + orphan_blocks); - } + + if (cpc->reason == CP_UMOUNT) + set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); + else + clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); + + if (cpc->reason == CP_FASTBOOT) + set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); + else + clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); if (orphan_num) set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); @@ -1010,7 +1017,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) write_data_summaries(sbi, start_blk); start_blk += data_sum_blocks; - if (cpc->reason == CP_UMOUNT) { + if (__remain_node_summaries(cpc->reason)) { write_node_summaries(sbi, start_blk); start_blk += NR_CURSEG_NODE_TYPE; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 5abe0836c179..8231a599a305 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -100,6 +100,7 @@ enum { enum { CP_UMOUNT, + CP_FASTBOOT, CP_SYNC, CP_DISCARD, }; @@ -764,6 +765,28 @@ static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) up_write(&sbi->cp_rwsem); } +static inline int __get_cp_reason(struct f2fs_sb_info *sbi) +{ + int reason = CP_SYNC; + + if (test_opt(sbi, FASTBOOT)) + reason = CP_FASTBOOT; + if (is_sbi_flag_set(sbi, SBI_IS_CLOSE)) + reason = CP_UMOUNT; + return reason; +} + +static inline bool __remain_node_summaries(int reason) +{ + return (reason == CP_UMOUNT || reason == CP_FASTBOOT); +} + +static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi) +{ + return (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG) || + is_set_ckpt_flags(F2FS_CKPT(sbi), CP_FASTBOOT_FLAG)); +} + /* * Check whether the given nid is within node id range. */ diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index ba89e27f394f..76adbc3641f1 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -698,8 +698,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi) .iroot = RADIX_TREE_INIT(GFP_NOFS), }; - cpc.reason = test_opt(sbi, FASTBOOT) ? 
CP_UMOUNT : CP_SYNC; - + cpc.reason = __get_cp_reason(sbi); gc_more: if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) goto stop; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 31c4e5702c7d..5ea57ec153d1 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1401,7 +1401,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) segno = le32_to_cpu(ckpt->cur_data_segno[type]); blk_off = le16_to_cpu(ckpt->cur_data_blkoff[type - CURSEG_HOT_DATA]); - if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG)) + if (__exist_node_summaries(sbi)) blk_addr = sum_blk_addr(sbi, NR_CURSEG_TYPE, type); else blk_addr = sum_blk_addr(sbi, NR_CURSEG_DATA_TYPE, type); @@ -1410,7 +1410,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) CURSEG_HOT_NODE]); blk_off = le16_to_cpu(ckpt->cur_node_blkoff[type - CURSEG_HOT_NODE]); - if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG)) + if (__exist_node_summaries(sbi)) blk_addr = sum_blk_addr(sbi, NR_CURSEG_NODE_TYPE, type - CURSEG_HOT_NODE); else @@ -1421,7 +1421,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) sum = (struct f2fs_summary_block *)page_address(new); if (IS_NODESEG(type)) { - if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG)) { + if (__exist_node_summaries(sbi)) { struct f2fs_summary *ns = &sum->entries[0]; int i; for (i = 0; i < sbi->blocks_per_seg; i++, ns++) { @@ -1470,7 +1470,7 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) type = CURSEG_HOT_NODE; } - if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG)) + if (__exist_node_summaries(sbi)) ra_meta_pages(sbi, sum_blk_addr(sbi, NR_CURSEG_TYPE, type), NR_CURSEG_TYPE - type, META_CP); @@ -1567,8 +1567,7 @@ void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk) void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk) { - if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG)) - write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE); + write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE); } int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 06d903b5d6c8..bfeab3c81a48 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -500,9 +500,8 @@ int f2fs_sync_fs(struct super_block *sb, int sync) if (sync) { struct cp_control cpc; - cpc.reason = (test_opt(sbi, FASTBOOT) || - is_sbi_flag_set(sbi, SBI_IS_CLOSE)) ? 
- CP_UMOUNT : CP_SYNC; + cpc.reason = __get_cp_reason(sbi); + mutex_lock(&sbi->gc_mutex); write_checkpoint(sbi, &cpc); mutex_unlock(&sbi->gc_mutex); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index e993b0bc9abf..09805720f4bf 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -87,6 +87,7 @@ struct f2fs_super_block { /* * For checkpoint */ +#define CP_FASTBOOT_FLAG 0x00000020 #define CP_FSCK_FLAG 0x00000010 #define CP_ERROR_FLAG 0x00000008 #define CP_COMPACT_SUM_FLAG 0x00000004 diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 5e1c0292250c..69629826c2ba 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -72,6 +72,7 @@ #define show_cpreason(type) \ __print_symbolic(type, \ { CP_UMOUNT, "Umount" }, \ + { CP_FASTBOOT, "Fastboot" }, \ { CP_SYNC, "Sync" }, \ { CP_DISCARD, "Discard" }) -- cgit v1.2.3 From f7ef9b83b583640111039b30e13263b71c3a6ed5 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 9 Feb 2015 12:02:44 -0800 Subject: f2fs: introduce macros to convert bytes and blocks in f2fs This patch adds two macros for transition between byte and block offsets. Currently, f2fs only supports 4KB blocks, so use the default size for now. Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 7 +++---- fs/f2fs/file.c | 3 +-- fs/f2fs/segment.c | 8 ++++---- include/linux/f2fs_fs.h | 4 ++++ 4 files changed, 12 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 31a715b5fd5c..58d88df0d632 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -981,15 +981,14 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* write out checkpoint buffer at block 0 */ cp_page = grab_meta_page(sbi, start_blk++); kaddr = page_address(cp_page); - memcpy(kaddr, ckpt, (1 << sbi->log_blocksize)); + memcpy(kaddr, ckpt, F2FS_BLKSIZE); set_page_dirty(cp_page); f2fs_put_page(cp_page, 1); for (i = 1; i < 1 + cp_payload_blks; i++) { cp_page = grab_meta_page(sbi, start_blk++); kaddr = page_address(cp_page); - memcpy(kaddr, (char *)ckpt + i * F2FS_BLKSIZE, - (1 << sbi->log_blocksize)); + memcpy(kaddr, (char *)ckpt + i * F2FS_BLKSIZE, F2FS_BLKSIZE); set_page_dirty(cp_page); f2fs_put_page(cp_page, 1); } @@ -1009,7 +1008,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* writeout checkpoint block */ cp_page = grab_meta_page(sbi, start_blk); kaddr = page_address(cp_page); - memcpy(kaddr, ckpt, (1 << sbi->log_blocksize)); + memcpy(kaddr, ckpt, F2FS_BLKSIZE); set_page_dirty(cp_page); f2fs_put_page(cp_page, 1); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7188a2ac64b5..f3b007540a48 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -491,8 +491,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) trace_f2fs_truncate_blocks_enter(inode, from); - free_from = (pgoff_t) - ((from + blocksize - 1) >> (sbi->log_blocksize)); + free_from = (pgoff_t)F2FS_BYTES_TO_BLK(from + blocksize - 1); if (lock) f2fs_lock_op(sbi); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 9f278d156d88..877a272a8146 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1048,8 +1048,8 @@ static const struct segment_allocation default_salloc_ops = { int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) { - __u64 start = range->start >> sbi->log_blocksize; - __u64 end = start + (range->len >> sbi->log_blocksize) - 1; + __u64 start = F2FS_BYTES_TO_BLK(range->start); + __u64 end = start + 
F2FS_BYTES_TO_BLK(range->len) - 1; unsigned int start_segno, end_segno; struct cp_control cpc; @@ -1066,7 +1066,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 : GET_SEGNO(sbi, end); cpc.reason = CP_DISCARD; - cpc.trim_minlen = range->minlen >> sbi->log_blocksize; + cpc.trim_minlen = F2FS_BYTES_TO_BLK(range->minlen); /* do checkpoint to issue discard commands safely */ for (; start_segno <= end_segno; start_segno = cpc.trim_end + 1) { @@ -1080,7 +1080,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) mutex_unlock(&sbi->gc_mutex); } out: - range->len = cpc.trimmed << sbi->log_blocksize; + range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); return 0; } diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 09805720f4bf..a23556c32703 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -19,12 +19,16 @@ #define F2FS_MAX_LOG_SECTOR_SIZE 12 /* 12 bits for 4096 bytes */ #define F2FS_LOG_SECTORS_PER_BLOCK 3 /* log number for sector/blk */ #define F2FS_BLKSIZE 4096 /* support only 4KB block */ +#define F2FS_BLKSIZE_BITS 12 /* bits for F2FS_BLKSIZE */ #define F2FS_MAX_EXTENSION 64 /* # of extension entries */ #define F2FS_BLK_ALIGN(x) (((x) + F2FS_BLKSIZE - 1) / F2FS_BLKSIZE) #define NULL_ADDR ((block_t)0) /* used as block_t addresses */ #define NEW_ADDR ((block_t)-1) /* used as block_t addresses */ +#define F2FS_BYTES_TO_BLK(bytes) ((bytes) >> F2FS_BLKSIZE_BITS) +#define F2FS_BLK_TO_BYTES(blk) ((blk) << F2FS_BLKSIZE_BITS) + /* 0, 1(node nid), 2(meta nid) are reserved node id */ #define F2FS_RESERVED_NODE_NUM 3 -- cgit v1.2.3 From e4b294c2d8f73af4cd41ff30638ad0e4769dc56a Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 11 Feb 2015 15:24:46 -0800 Subject: mm: add fields for compound destructor and order into struct page Currently, we use lru.next/lru.prev plus cast to access or set destructor and order of compound page. Let's replace it with explicit fields in struct page. Signed-off-by: Kirill A. Shutemov Acked-by: Jerome Marchand Acked-by: Christoph Lameter Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 9 ++++----- include/linux/mm_types.h | 8 ++++++++ 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 65db4aee738a..8dd4fde9d2e5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -627,29 +627,28 @@ int split_free_page(struct page *page); * prototype for that function and accessor functions. * These are _only_ valid on the head of a PG_compound page. 
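The change amounts to replacing pointer casts through lru.next/lru.prev with named members that overlay the same storage. A trimmed userspace model (the two-element page array and demo_dtor are invented for the example; the real struct page union is far larger):

#include <stdio.h>

struct page;
typedef void compound_page_dtor(struct page *);

/*
 * A trimmed model of struct page: the same words either hold list
 * pointers or, on the first tail page, destructor and order.
 */
struct page {
	union {
		struct {
			struct page *next;
			struct page *prev;
		} lru;
		struct {	/* first tail page of a compound page */
			compound_page_dtor *compound_dtor;
			unsigned long compound_order;
		};
	};
};

static void demo_dtor(struct page *page)
{
	(void)page;
	puts("destructor called");
}

int main(void)
{
	struct page pages[2] = { 0 };	/* head page + first tail page */

	/* After the patch: named fields, no casts through lru.next/prev. */
	pages[1].compound_dtor = demo_dtor;
	pages[1].compound_order = 9;	/* 2MB huge page at 4KB granularity */

	printf("order=%lu\n", pages[1].compound_order);
	pages[1].compound_dtor(&pages[0]);
	return 0;
}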
*/ -typedef void compound_page_dtor(struct page *); static inline void set_compound_page_dtor(struct page *page, compound_page_dtor *dtor) { - page[1].lru.next = (void *)dtor; + page[1].compound_dtor = dtor; } static inline compound_page_dtor *get_compound_page_dtor(struct page *page) { - return (compound_page_dtor *)page[1].lru.next; + return page[1].compound_dtor; } static inline int compound_order(struct page *page) { if (!PageHead(page)) return 0; - return (unsigned long)page[1].lru.prev; + return page[1].compound_order; } static inline void set_compound_order(struct page *page, unsigned long order) { - page[1].lru.prev = (void *)order; + page[1].compound_order = order; } #ifdef CONFIG_MMU diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 07c8bd3f7b48..20ff2105b564 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -28,6 +28,8 @@ struct mem_cgroup; IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK)) #define ALLOC_SPLIT_PTLOCKS (SPINLOCK_SIZE > BITS_PER_LONG/8) +typedef void compound_page_dtor(struct page *); + /* * Each physical page in the system has a struct page associated with * it to keep track of whatever it is we are using the page for at the @@ -142,6 +144,12 @@ struct page { struct rcu_head rcu_head; /* Used by SLAB * when destroying via RCU */ + /* First tail page of compound page */ + struct { + compound_page_dtor *compound_dtor; + unsigned long compound_order; + }; + #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS pgtable_t pmd_huge_pte; /* protected by page->ptl */ #endif -- cgit v1.2.3 From 1d148e218a0d0566b1c06f2f45f1436d53b049b2 Mon Sep 17 00:00:00 2001 From: "Wang, Yalin" Date: Wed, 11 Feb 2015 15:24:48 -0800 Subject: mm: add VM_BUG_ON_PAGE() to page_mapcount() Add VM_BUG_ON_PAGE() for slab pages. _mapcount is an union with slab struct in struct page, so we must avoid accessing _mapcount if this page is a slab page. Also remove the unneeded bracket. Signed-off-by: Yalin Wang Acked-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8dd4fde9d2e5..c6bf813a6b3d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -484,7 +484,8 @@ static inline void page_mapcount_reset(struct page *page) static inline int page_mapcount(struct page *page) { - return atomic_read(&(page)->_mapcount) + 1; + VM_BUG_ON_PAGE(PageSlab(page), page); + return atomic_read(&page->_mapcount) + 1; } static inline int page_count(struct page *page) -- cgit v1.2.3 From 56873f43abdcd574b25105867a990f067747b2f4 Mon Sep 17 00:00:00 2001 From: "Wang, Yalin" Date: Wed, 11 Feb 2015 15:24:51 -0800 Subject: mm:add KPF_ZERO_PAGE flag for /proc/kpageflags Add KPF_ZERO_PAGE flag for zero_page, so that userspace processes can detect zero_page in /proc/kpageflags, and then do memory analysis more accurately. Signed-off-by: Yalin Wang Acked-by: Kirill A. 
Shutemov Cc: Konstantin Khlebnikov Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/pagemap.txt | 8 ++++++++ fs/proc/page.c | 16 +++++++++++++--- include/linux/huge_mm.h | 12 ++++++++++++ include/uapi/linux/kernel-page-flags.h | 1 + mm/huge_memory.c | 7 +------ tools/vm/page-types.c | 1 + 6 files changed, 36 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt index 5948e455c4d2..6fbd55ef6b45 100644 --- a/Documentation/vm/pagemap.txt +++ b/Documentation/vm/pagemap.txt @@ -62,6 +62,8 @@ There are three components to pagemap: 20. NOPAGE 21. KSM 22. THP + 23. BALLOON + 24. ZERO_PAGE Short descriptions to the page flags: @@ -102,6 +104,12 @@ Short descriptions to the page flags: 22. THP contiguous pages which construct transparent hugepages +23. BALLOON + balloon compaction page + +24. ZERO_PAGE + zero page for pfn_zero or huge_zero page + [IO related page flags] 1. ERROR IO error occurred 3. UPTODATE page has up-to-date data diff --git a/fs/proc/page.c b/fs/proc/page.c index 1e3187da1fed..7eee2d8b97d9 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -121,9 +122,18 @@ u64 stable_page_flags(struct page *page) * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon * to make sure a given page is a thp, not a non-huge compound page. */ - else if (PageTransCompound(page) && (PageLRU(compound_head(page)) || - PageAnon(compound_head(page)))) - u |= 1 << KPF_THP; + else if (PageTransCompound(page)) { + struct page *head = compound_head(page); + + if (PageLRU(head) || PageAnon(head)) + u |= 1 << KPF_THP; + else if (is_huge_zero_page(head)) { + u |= 1 << KPF_ZERO_PAGE; + u |= 1 << KPF_THP; + } + } else if (is_zero_pfn(page_to_pfn(page))) + u |= 1 << KPF_ZERO_PAGE; + /* * Caveats on high order pages: page->_count will only be set diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index ad9051bab267..f10b20f05159 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -157,6 +157,13 @@ static inline int hpage_nr_pages(struct page *page) extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pmd_t pmd, pmd_t *pmdp); +extern struct page *huge_zero_page; + +static inline bool is_huge_zero_page(struct page *page) +{ + return ACCESS_ONCE(huge_zero_page) == page; +} + #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) @@ -206,6 +213,11 @@ static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_str return 0; } +static inline bool is_huge_zero_page(struct page *page) +{ + return false; +} + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* _LINUX_HUGE_MM_H */ diff --git a/include/uapi/linux/kernel-page-flags.h b/include/uapi/linux/kernel-page-flags.h index 2f96d233c980..a6c4962e5d46 100644 --- a/include/uapi/linux/kernel-page-flags.h +++ b/include/uapi/linux/kernel-page-flags.h @@ -32,6 +32,7 @@ #define KPF_KSM 21 #define KPF_THP 22 #define KPF_BALLOON 23 +#define KPF_ZERO_PAGE 24 #endif /* _UAPILINUX_KERNEL_PAGE_FLAGS_H */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 817a875f2b8c..889713180980 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -171,12 +171,7 @@ static int start_khugepaged(void) } static atomic_t huge_zero_refcount; -static struct page *huge_zero_page __read_mostly; - -static inline bool 
is_huge_zero_page(struct page *page) -{ - return ACCESS_ONCE(huge_zero_page) == page; -} +struct page *huge_zero_page __read_mostly; static inline bool is_huge_zero_pmd(pmd_t pmd) { diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c index 264fbc297e0b..8bdf16b8ba60 100644 --- a/tools/vm/page-types.c +++ b/tools/vm/page-types.c @@ -133,6 +133,7 @@ static const char * const page_flag_names[] = { [KPF_KSM] = "x:ksm", [KPF_THP] = "t:thp", [KPF_BALLOON] = "o:balloon", + [KPF_ZERO_PAGE] = "z:zero_page", [KPF_RESERVED] = "r:reserved", [KPF_MLOCKED] = "m:mlocked", -- cgit v1.2.3 From 93aa7d95248d04b934eb8e89717c7b8d6400bf2b Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 11 Feb 2015 15:24:59 -0800 Subject: swap: remove unused mem_cgroup_uncharge_swapcache declaration The body of this function was removed by commit 0a31bc97c80c ("mm: memcontrol: rewrite uncharge API"). Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 15 --------------- mm/shmem.c | 2 +- 2 files changed, 1 insertion(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 34e8b60ab973..7067eca501e2 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -437,16 +437,6 @@ extern int reuse_swap_page(struct page *); extern int try_to_free_swap(struct page *); struct backing_dev_info; -#ifdef CONFIG_MEMCG -extern void -mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout); -#else -static inline void -mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) -{ -} -#endif - #else /* CONFIG_SWAP */ #define swap_address_space(entry) (NULL) @@ -547,11 +537,6 @@ static inline swp_entry_t get_swap_page(void) return entry; } -static inline void -mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) -{ -} - #endif /* CONFIG_SWAP */ #endif /* __KERNEL__*/ #endif /* _LINUX_SWAP_H */ diff --git a/mm/shmem.c b/mm/shmem.c index b3e403181981..864c878401e6 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1131,7 +1131,7 @@ repeat: * truncated or holepunched since swap was confirmed. * shmem_undo_range() will have done some of the * unaccounting, now delete_from_swap_cache() will do - * the rest (including mem_cgroup_uncharge_swapcache). + * the rest. * Reset swap.val? No, leave it so "failed" goes back to * "repeat": reading a hole and writing should succeed. */ -- cgit v1.2.3 From 6de226191d12fce30331ebf024ca3ed24834f0ee Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 11 Feb 2015 15:25:01 -0800 Subject: mm: memcontrol: track move_lock state internally The complexity of memcg page stat synchronization is currently leaking into the callsites, forcing them to keep track of the move_lock state and the IRQ flags. Simplify the API by tracking it in the memcg. 
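The API simplification can be modeled with a plain mutex: the owner and its saved state live in the object itself, so the end function needs no extra parameters. A userspace sketch with pthreads (counter, begin_stat and end_stat are invented names; the real code also has a lockless fast path and saves IRQ flags rather than a boolean):

#include <pthread.h>
#include <stdio.h>

struct counter {
	pthread_mutex_t move_lock;
	pthread_t move_lock_task;
	int move_locked;	/* stands in for the saved IRQ flags */
	long stat;
};

static void begin_stat(struct counter *c)
{
	pthread_mutex_lock(&c->move_lock);
	c->move_lock_task = pthread_self();	/* record the owner */
	c->move_locked = 1;
}

static void end_stat(struct counter *c)
{
	/* Only the recorded owner unlocks; anyone else never locked. */
	if (c->move_locked && pthread_equal(c->move_lock_task, pthread_self())) {
		c->move_locked = 0;
		pthread_mutex_unlock(&c->move_lock);
	}
}

int main(void)
{
	struct counter c = { .move_lock = PTHREAD_MUTEX_INITIALIZER };

	begin_stat(&c);
	c.stat++;	/* the "page state" update done under the lock */
	end_stat(&c);
	printf("stat=%ld\n", c.stat);
	return 0;
}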
Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Reviewed-by: Vladimir Davydov Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 12 +++----- mm/memcontrol.c | 68 ++++++++++++++++++++++++++-------------------- mm/page-writeback.c | 12 +++----- mm/rmap.c | 12 +++----- 4 files changed, 51 insertions(+), 53 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index fb212e1d700d..76b4084b8d08 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -138,12 +138,10 @@ static inline bool mem_cgroup_disabled(void) return false; } -struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, bool *locked, - unsigned long *flags); -void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool *locked, - unsigned long *flags); +struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page); void mem_cgroup_update_page_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx, int val); +void mem_cgroup_end_page_stat(struct mem_cgroup *memcg); static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx) @@ -285,14 +283,12 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { } -static inline struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, - bool *locked, unsigned long *flags) +static inline struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page) { return NULL; } -static inline void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, - bool *locked, unsigned long *flags) +static inline void mem_cgroup_end_page_stat(struct mem_cgroup *memcg) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f3f8a4f52a0c..028d07c79104 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -325,9 +325,11 @@ struct mem_cgroup { /* * set > 0 if pages under this cgroup are moving to other cgroup. */ - atomic_t moving_account; + atomic_t moving_account; /* taken only while moving_account > 0 */ - spinlock_t move_lock; + spinlock_t move_lock; + struct task_struct *move_lock_task; + unsigned long move_lock_flags; /* * percpu counter. */ @@ -1977,34 +1979,33 @@ cleanup: /** * mem_cgroup_begin_page_stat - begin a page state statistics transaction * @page: page that is going to change accounted state - * @locked: &memcg->move_lock slowpath was taken - * @flags: IRQ-state flags for &memcg->move_lock * * This function must mark the beginning of an accounted page state * change to prevent double accounting when the page is concurrently * being moved to another memcg: * - * memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); + * memcg = mem_cgroup_begin_page_stat(page); * if (TestClearPageState(page)) * mem_cgroup_update_page_stat(memcg, state, -1); - * mem_cgroup_end_page_stat(memcg, locked, flags); - * - * The RCU lock is held throughout the transaction. The fast path can - * get away without acquiring the memcg->move_lock (@locked is false) - * because page moving starts with an RCU grace period. - * - * The RCU lock also protects the memcg from being freed when the page - * state that is going to change is the only thing preventing the page - * from being uncharged. E.g. end-writeback clearing PageWriteback(), - * which allows migration to go ahead and uncharge the page before the - * account transaction might be complete. 
+ * mem_cgroup_end_page_stat(memcg); */ -struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, - bool *locked, - unsigned long *flags) +struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page) { struct mem_cgroup *memcg; + unsigned long flags; + /* + * The RCU lock is held throughout the transaction. The fast + * path can get away without acquiring the memcg->move_lock + * because page moving starts with an RCU grace period. + * + * The RCU lock also protects the memcg from being freed when + * the page state that is going to change is the only thing + * preventing the page from being uncharged. + * E.g. end-writeback clearing PageWriteback(), which allows + * migration to go ahead and uncharge the page before the + * account transaction might be complete. + */ rcu_read_lock(); if (mem_cgroup_disabled()) @@ -2014,16 +2015,22 @@ again: if (unlikely(!memcg)) return NULL; - *locked = false; if (atomic_read(&memcg->moving_account) <= 0) return memcg; - spin_lock_irqsave(&memcg->move_lock, *flags); + spin_lock_irqsave(&memcg->move_lock, flags); if (memcg != page->mem_cgroup) { - spin_unlock_irqrestore(&memcg->move_lock, *flags); + spin_unlock_irqrestore(&memcg->move_lock, flags); goto again; } - *locked = true; + + /* + * When charge migration first begins, we can have locked and + * unlocked page stat updates happening concurrently. Track + * the task who has the lock for mem_cgroup_end_page_stat(). + */ + memcg->move_lock_task = current; + memcg->move_lock_flags = flags; return memcg; } @@ -2031,14 +2038,17 @@ again: /** * mem_cgroup_end_page_stat - finish a page state statistics transaction * @memcg: the memcg that was accounted against - * @locked: value received from mem_cgroup_begin_page_stat() - * @flags: value received from mem_cgroup_begin_page_stat() */ -void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool *locked, - unsigned long *flags) +void mem_cgroup_end_page_stat(struct mem_cgroup *memcg) { - if (memcg && *locked) - spin_unlock_irqrestore(&memcg->move_lock, *flags); + if (memcg && memcg->move_lock_task == current) { + unsigned long flags = memcg->move_lock_flags; + + memcg->move_lock_task = NULL; + memcg->move_lock_flags = 0; + + spin_unlock_irqrestore(&memcg->move_lock, flags); + } rcu_read_unlock(); } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 6f4335238e33..fb71e9deca85 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2308,12 +2308,10 @@ EXPORT_SYMBOL(clear_page_dirty_for_io); int test_clear_page_writeback(struct page *page) { struct address_space *mapping = page_mapping(page); - unsigned long memcg_flags; struct mem_cgroup *memcg; - bool locked; int ret; - memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags); + memcg = mem_cgroup_begin_page_stat(page); if (mapping) { struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long flags; @@ -2338,19 +2336,17 @@ int test_clear_page_writeback(struct page *page) dec_zone_page_state(page, NR_WRITEBACK); inc_zone_page_state(page, NR_WRITTEN); } - mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags); + mem_cgroup_end_page_stat(memcg); return ret; } int __test_set_page_writeback(struct page *page, bool keep_write) { struct address_space *mapping = page_mapping(page); - unsigned long memcg_flags; struct mem_cgroup *memcg; - bool locked; int ret; - memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags); + memcg = mem_cgroup_begin_page_stat(page); if (mapping) { struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long flags; @@ 
-2380,7 +2376,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); inc_zone_page_state(page, NR_WRITEBACK); } - mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags); + mem_cgroup_end_page_stat(memcg); return ret; } diff --git a/mm/rmap.c b/mm/rmap.c index 70b32498d4f2..5e3e09081164 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1085,24 +1085,20 @@ void page_add_new_anon_rmap(struct page *page, void page_add_file_rmap(struct page *page) { struct mem_cgroup *memcg; - unsigned long flags; - bool locked; - memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); + memcg = mem_cgroup_begin_page_stat(page); if (atomic_inc_and_test(&page->_mapcount)) { __inc_zone_page_state(page, NR_FILE_MAPPED); mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); } - mem_cgroup_end_page_stat(memcg, &locked, &flags); + mem_cgroup_end_page_stat(memcg); } static void page_remove_file_rmap(struct page *page) { struct mem_cgroup *memcg; - unsigned long flags; - bool locked; - memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); + memcg = mem_cgroup_begin_page_stat(page); /* page still mapped by someone else? */ if (!atomic_add_negative(-1, &page->_mapcount)) @@ -1123,7 +1119,7 @@ static void page_remove_file_rmap(struct page *page) if (unlikely(PageMlocked(page))) clear_page_mlock(page); out: - mem_cgroup_end_page_stat(memcg, &locked, &flags); + mem_cgroup_end_page_stat(memcg); } /** -- cgit v1.2.3 From 44628d9755e249aab9a6e1a17407d2f4278047ee Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 11 Feb 2015 15:25:10 -0800 Subject: mm: fix typo of MIGRATE_RESERVE in comment Found this when trying to jump to the definition of MIGRATE_RESERVE with ctags. Signed-off-by: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 2f0856d14b21..b41829701334 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -426,7 +426,7 @@ struct zone { const char *name; /* - * Number of MIGRATE_RESEVE page block. To maintain for just + * Number of MIGRATE_RESERVE page block. To maintain for just * optimization. Protected by zone->lock. */ int nr_migrate_reserve_block; -- cgit v1.2.3 From e66f17ff71772b209eed39de35aaa99ba819c93d Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Feb 2015 15:25:22 -0800 Subject: mm/hugetlb: take page table lock in follow_huge_pmd() We have a race condition between move_pages() and freeing hugepages, where move_pages() calls follow_page(FOLL_GET) for hugepages internally and tries to get its refcount without preventing concurrent freeing. This race crashes the kernel, so this patch fixes it by moving the FOLL_GET code for hugepages into follow_huge_pmd() while taking the page table lock. This patch intentionally removes the page==NULL check after pte_page(). This is justified because pte_page() never returns NULL for any architecture or configuration. This patch changes the behavior of follow_huge_pmd() for tail pages so that tail pages can be pinned and returned; the callers must therefore be changed to properly handle the returned tail pages. We could add similar locking to follow_huge_(addr|pud) for consistency, but it is not necessary because those functions currently don't support the FOLL_GET flag, so let's leave that for future development.
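In essence, the change moves the lookup-then-pin sequence under the pmd lock. A condensed sketch, simplified from the diff below (not the verbatim kernel code):

/* Before: follow_page_mask() pinned the page without any page table
 * lock, so the pmd could be cleared and the hugepage freed between the
 * lookup and get_page(). */
page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
if (flags & FOLL_GET) {
	if (PageHead(page))
		get_page(page);		/* races with concurrent freeing */
	else
		page = NULL;
}

/* After: lookup and refcount bump happen under the pmd split lock, and
 * a migration entry is waited on and retried instead of dereferenced. */
retry:
ptl = pmd_lockptr(mm, pmd);
spin_lock(ptl);
if (pmd_present(*pmd)) {
	page = pte_page(*(pte_t *)pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
	if (flags & FOLL_GET)
		get_page(page);	/* safe: *pmd cannot change while ptl is held */
} else if (is_hugetlb_entry_migration(huge_ptep_get((pte_t *)pmd))) {
	spin_unlock(ptl);
	__migration_entry_wait(mm, (pte_t *)pmd, ptl);
	goto retry;
}
spin_unlock(ptl);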
Here is the reproducer: $ cat movepages.c #include <stdlib.h> #include <sys/types.h> #include <numa.h> #include <numaif.h> #define ADDR_INPUT 0x700000000000UL #define HPS 0x200000 #define PS 0x1000 int main(int argc, char *argv[]) { int i; int nr_hp = strtol(argv[1], NULL, 0); int nr_p = nr_hp * HPS / PS; int ret; void **addrs; int *status; int *nodes; pid_t pid; pid = strtol(argv[2], NULL, 0); addrs = malloc(sizeof(char *) * nr_p + 1); status = malloc(sizeof(char *) * nr_p + 1); nodes = malloc(sizeof(char *) * nr_p + 1); while (1) { for (i = 0; i < nr_p; i++) { addrs[i] = (void *)ADDR_INPUT + i * PS; nodes[i] = 1; status[i] = 0; } ret = numa_move_pages(pid, nr_p, addrs, nodes, status, MPOL_MF_MOVE_ALL); if (ret == -1) err("move_pages"); for (i = 0; i < nr_p; i++) { addrs[i] = (void *)ADDR_INPUT + i * PS; nodes[i] = 0; status[i] = 0; } ret = numa_move_pages(pid, nr_p, addrs, nodes, status, MPOL_MF_MOVE_ALL); if (ret == -1) err("move_pages"); } return 0; } $ cat hugepage.c #include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/mman.h> #define ADDR_INPUT 0x700000000000UL #define HPS 0x200000 int main(int argc, char *argv[]) { int nr_hp = strtol(argv[1], NULL, 0); char *p; while (1) { p = mmap((void *)ADDR_INPUT, nr_hp * HPS, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); if (p != (void *)ADDR_INPUT) { perror("mmap"); break; } memset(p, 0, nr_hp * HPS); munmap(p, nr_hp * HPS); } } $ sysctl vm.nr_hugepages=40 $ ./hugepage 10 & $ ./movepages 10 $(pgrep -f hugepage) Fixes: e632a938d914 ("mm: migrate: add hugepage migration code to move_pages()") Signed-off-by: Naoya Horiguchi Reported-by: Hugh Dickins Cc: James Hogan Cc: David Rientjes Cc: Mel Gorman Cc: Johannes Weiner Cc: Michal Hocko Cc: Rik van Riel Cc: Andrea Arcangeli Cc: Luiz Capitulino Cc: Nishanth Aravamudan Cc: Lee Schermerhorn Cc: Steve Capper Cc: [3.12+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 8 ++++---- include/linux/swapops.h | 4 ++++ mm/gup.c | 25 ++++++++----------------- mm/hugetlb.c | 48 ++++++++++++++++++++++++++++++++++-------------- mm/migrate.c | 5 +++-- 5 files changed, 53 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 7d7856359920..7b5785032049 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -99,9 +99,9 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep); struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, int write); struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmd, int write); + pmd_t *pmd, int flags); struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, - pud_t *pud, int write); + pud_t *pud, int flags); int pmd_huge(pmd_t pmd); int pud_huge(pud_t pmd); unsigned long hugetlb_change_protection(struct vm_area_struct *vma, @@ -133,8 +133,8 @@ static inline void hugetlb_report_meminfo(struct seq_file *m) static inline void hugetlb_show_meminfo(void) { } -#define follow_huge_pmd(mm, addr, pmd, write) NULL -#define follow_huge_pud(mm, addr, pud, write) NULL +#define follow_huge_pmd(mm, addr, pmd, flags) NULL +#define follow_huge_pud(mm, addr, pud, flags) NULL #define prepare_hugepage_range(file, addr, len) (-EINVAL) #define pmd_huge(x) 0 #define pud_huge(x) 0 diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 50cbc876be56..831a3168ab35 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -135,6 +135,8 @@ static inline void make_migration_entry_read(swp_entry_t *entry) *entry =
swp_entry(SWP_MIGRATION_READ, swp_offset(*entry)); } +extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, + spinlock_t *ptl); extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address); extern void migration_entry_wait_huge(struct vm_area_struct *vma, @@ -148,6 +150,8 @@ static inline int is_migration_entry(swp_entry_t swp) } #define migration_entry_to_page(swp) NULL static inline void make_migration_entry_read(swp_entry_t *entryp) { } +static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, + spinlock_t *ptl) { } static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { } static inline void migration_entry_wait_huge(struct vm_area_struct *vma, diff --git a/mm/gup.c b/mm/gup.c index 12bc2bc33da7..1a8ab05918e0 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -167,10 +167,10 @@ struct page *follow_page_mask(struct vm_area_struct *vma, if (pud_none(*pud)) return no_page_table(vma, flags); if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { - if (flags & FOLL_GET) - return NULL; - page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); - return page; + page = follow_huge_pud(mm, address, pud, flags); + if (page) + return page; + return no_page_table(vma, flags); } if (unlikely(pud_bad(*pud))) return no_page_table(vma, flags); @@ -179,19 +179,10 @@ struct page *follow_page_mask(struct vm_area_struct *vma, if (pmd_none(*pmd)) return no_page_table(vma, flags); if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { - page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); - if (flags & FOLL_GET) { - /* - * Refcount on tail pages are not well-defined and - * shouldn't be taken. The caller should handle a NULL - * return when trying to follow tail pages. - */ - if (PageHead(page)) - get_page(page); - else - page = NULL; - } - return page; + page = follow_huge_pmd(mm, address, pmd, flags); + if (page) + return page; + return no_page_table(vma, flags); } if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) return no_page_table(vma, flags); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d96b8bfa748f..5aca3707450f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3675,28 +3675,48 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, struct page * __weak follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmd, int write) + pmd_t *pmd, int flags) { - struct page *page; - - if (!pmd_present(*pmd)) - return NULL; - page = pte_page(*(pte_t *)pmd); - if (page) - page += ((address & ~PMD_MASK) >> PAGE_SHIFT); + struct page *page = NULL; + spinlock_t *ptl; +retry: + ptl = pmd_lockptr(mm, pmd); + spin_lock(ptl); + /* + * make sure that the address range covered by this pmd is not + * unmapped from other threads. + */ + if (!pmd_huge(*pmd)) + goto out; + if (pmd_present(*pmd)) { + page = pte_page(*(pte_t *)pmd) + + ((address & ~PMD_MASK) >> PAGE_SHIFT); + if (flags & FOLL_GET) + get_page(page); + } else { + if (is_hugetlb_entry_migration(huge_ptep_get((pte_t *)pmd))) { + spin_unlock(ptl); + __migration_entry_wait(mm, (pte_t *)pmd, ptl); + goto retry; + } + /* + * hwpoisoned entry is treated as no_page_table in + * follow_page_mask(). 
+ */ + } +out: + spin_unlock(ptl); return page; } struct page * __weak follow_huge_pud(struct mm_struct *mm, unsigned long address, - pud_t *pud, int write) + pud_t *pud, int flags) { - struct page *page; + if (flags & FOLL_GET) + return NULL; - page = pte_page(*(pte_t *)pud); - if (page) - page += ((address & ~PUD_MASK) >> PAGE_SHIFT); - return page; + return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); } #ifdef CONFIG_MEMORY_FAILURE diff --git a/mm/migrate.c b/mm/migrate.c index 6e284bcca8bb..f98067e5d353 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -197,7 +197,7 @@ static void remove_migration_ptes(struct page *old, struct page *new) * get to the page and wait until migration is finished. * When we return from this function the fault will be retried. */ -static void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, +void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, spinlock_t *ptl) { pte_t pte; @@ -1236,7 +1236,8 @@ static int do_move_page_to_node_array(struct mm_struct *mm, goto put_and_set; if (PageHuge(page)) { - isolate_huge_page(page, &pagelist); + if (PageHead(page)) + isolate_huge_page(page, &pagelist); goto put_and_set; } -- cgit v1.2.3 From 1a6d53a105406d97396c87511afd6f09b4dc8ad2 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 11 Feb 2015 15:25:44 -0800 Subject: mm: reduce try_to_compact_pages parameters Expand the usage of the struct alloc_context introduced in the previous patch to also cover calling try_to_compact_pages(), reducing the number of its parameters. Since that function is in a different compilation unit, we need to move the alloc_context definition into the shared mm/internal.h header. With this change we get simpler code and small savings of code size and stack usage: add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-27 (-27) function old new delta __alloc_pages_direct_compact 283 256 -27 add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-13 (-13) function old new delta try_to_compact_pages 582 569 -13 Stack usage of __alloc_pages_direct_compact goes from 24 to none (per scripts/checkstack.pl). Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Mel Gorman Cc: Zhang Yanfei Cc: Minchan Kim Cc: David Rientjes Cc: Rik van Riel Cc: "Aneesh Kumar K.V" Cc: "Kirill A.
Shutemov" Cc: Johannes Weiner Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 17 +++++++++-------- mm/compaction.c | 23 +++++++++++------------ mm/internal.h | 22 ++++++++++++++++++++++ mm/page_alloc.c | 27 ++------------------------- 4 files changed, 44 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 3238ffa33f68..f2efda2e6ac6 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -21,6 +21,8 @@ /* Zone lock or lru_lock was contended in async compaction */ #define COMPACT_CONTENDED_LOCK 2 +struct alloc_context; /* in mm/internal.h */ + #ifdef CONFIG_COMPACTION extern int sysctl_compact_memory; extern int sysctl_compaction_handler(struct ctl_table *table, int write, @@ -30,10 +32,9 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos); extern int fragmentation_index(struct zone *zone, unsigned int order); -extern unsigned long try_to_compact_pages(struct zonelist *zonelist, - int order, gfp_t gfp_mask, nodemask_t *mask, - enum migrate_mode mode, int *contended, - int alloc_flags, int classzone_idx); +extern unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, + int alloc_flags, const struct alloc_context *ac, + enum migrate_mode mode, int *contended); extern void compact_pgdat(pg_data_t *pgdat, int order); extern void reset_isolation_suitable(pg_data_t *pgdat); extern unsigned long compaction_suitable(struct zone *zone, int order, @@ -101,10 +102,10 @@ static inline bool compaction_restarting(struct zone *zone, int order) } #else -static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, - int order, gfp_t gfp_mask, nodemask_t *nodemask, - enum migrate_mode mode, int *contended, - int alloc_flags, int classzone_idx) +static inline unsigned long try_to_compact_pages(gfp_t gfp_mask, + unsigned int order, int alloc_flags, + const struct alloc_context *ac, + enum migrate_mode mode, int *contended) { return COMPACT_CONTINUE; } diff --git a/mm/compaction.c b/mm/compaction.c index 546e571e9d60..9c7e6909dd29 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1335,22 +1335,20 @@ int sysctl_extfrag_threshold = 500; /** * try_to_compact_pages - Direct compact to satisfy a high-order allocation - * @zonelist: The zonelist used for the current allocation - * @order: The order of the current allocation * @gfp_mask: The GFP mask of the current allocation - * @nodemask: The allowed nodes to allocate from + * @order: The order of the current allocation + * @alloc_flags: The allocation flags of the current allocation + * @ac: The context of current allocation * @mode: The migration mode for async, sync light, or sync migration * @contended: Return value that determines if compaction was aborted due to * need_resched() or lock contention * * This is the main entry point for direct page compaction. 
*/ -unsigned long try_to_compact_pages(struct zonelist *zonelist, - int order, gfp_t gfp_mask, nodemask_t *nodemask, - enum migrate_mode mode, int *contended, - int alloc_flags, int classzone_idx) +unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, + int alloc_flags, const struct alloc_context *ac, + enum migrate_mode mode, int *contended) { - enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; int may_perform_io = gfp_mask & __GFP_IO; struct zoneref *z; @@ -1365,8 +1363,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, return COMPACT_SKIPPED; /* Compact each zone in the list */ - for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, - nodemask) { + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, + ac->nodemask) { int status; int zone_contended; @@ -1374,7 +1372,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, continue; status = compact_zone_order(zone, order, gfp_mask, mode, - &zone_contended, alloc_flags, classzone_idx); + &zone_contended, alloc_flags, + ac->classzone_idx); rc = max(status, rc); /* * It takes at least one zone that wasn't lock contended @@ -1384,7 +1383,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, /* If a normal allocation would succeed, stop compacting */ if (zone_watermark_ok(zone, order, low_wmark_pages(zone), - classzone_idx, alloc_flags)) { + ac->classzone_idx, alloc_flags)) { /* * We think the allocation will succeed in this zone, * but it is not certain, hence the false. The caller diff --git a/mm/internal.h b/mm/internal.h index efad241f7014..c4d6c9b43491 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -109,6 +109,28 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); * in mm/page_alloc.c */ +/* + * Structure for holding the mostly immutable allocation parameters passed + * between functions involved in allocations, including the alloc_pages* + * family of functions. + * + * nodemask, migratetype and high_zoneidx are initialized only once in + * __alloc_pages_nodemask() and then never change. + * + * zonelist, preferred_zone and classzone_idx are set first in + * __alloc_pages_nodemask() for the fast path, and might be later changed + * in __alloc_pages_slowpath(). All other functions pass the whole strucure + * by a const pointer. + */ +struct alloc_context { + struct zonelist *zonelist; + nodemask_t *nodemask; + struct zone *preferred_zone; + int classzone_idx; + int migratetype; + enum zone_type high_zoneidx; +}; + /* * Locate the struct page for both the matching buddy in our * pair (buddy1) and the combined O(n+1) page they form (page). diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4aead0bd8d44..d664eb922a7d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -232,27 +232,6 @@ EXPORT_SYMBOL(nr_node_ids); EXPORT_SYMBOL(nr_online_nodes); #endif -/* - * Structure for holding the mostly immutable allocation parameters passed - * between alloc_pages* family of functions. - * - * nodemask, migratetype and high_zoneidx are initialized only once in - * __alloc_pages_nodemask() and then never change. - * - * zonelist, preferred_zone and classzone_idx are set first in - * __alloc_pages_nodemask() for the fast path, and might be later changed - * in __alloc_pages_slowpath(). All other functions pass the whole strucure - * by a const pointer. 
- */ -struct alloc_context { - struct zonelist *zonelist; - nodemask_t *nodemask; - struct zone *preferred_zone; - int classzone_idx; - int migratetype; - enum zone_type high_zoneidx; -}; - int page_group_by_mobility_disabled __read_mostly; void set_pageblock_migratetype(struct page *page, int migratetype) @@ -2429,10 +2408,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, return NULL; current->flags |= PF_MEMALLOC; - compact_result = try_to_compact_pages(ac->zonelist, order, gfp_mask, - ac->nodemask, mode, - contended_compaction, - alloc_flags, ac->classzone_idx); + compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, + mode, contended_compaction); current->flags &= ~PF_MEMALLOC; switch (compact_result) { -- cgit v1.2.3 From 05891fb06517d19ae5357c9dc44e96bbe0300a3c Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 11 Feb 2015 15:25:47 -0800 Subject: mm: microoptimize zonelist operations next_zones_zonelist() returns a zoneref pointer, as well as a zone pointer via extra parameter. Since the latter can be trivially obtained by dereferencing the former, the overhead of the extra parameter is unjustified. This patch thus removes the zone parameter from next_zones_zonelist(). Both callers happen to be in the same header file, so it's simple to add the zoneref dereference inline. We save some bytes of code size. add/remove: 0/0 grow/shrink: 0/3 up/down: 0/-105 (-105) function old new delta nr_free_zone_pages 129 115 -14 __alloc_pages_nodemask 2300 2285 -15 get_page_from_freelist 2652 2576 -76 add/remove: 0/0 grow/shrink: 1/0 up/down: 10/0 (10) function old new delta try_to_compact_pages 569 579 +10 Signed-off-by: Vlastimil Babka Cc: Mel Gorman Cc: Zhang Yanfei Cc: Minchan Kim Cc: David Rientjes Cc: Rik van Riel Cc: "Aneesh Kumar K.V" Cc: "Kirill A. 
Shutemov" Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 13 +++++++------ mm/mmzone.c | 4 +--- 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b41829701334..f279d9c158cd 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -970,7 +970,6 @@ static inline int zonelist_node_idx(struct zoneref *zoneref) * @z - The cursor used as a starting point for the search * @highest_zoneidx - The zone index of the highest zone to return * @nodes - An optional nodemask to filter the zonelist with - * @zone - The first suitable zone found is returned via this parameter * * This function returns the next zone at or below a given zone index that is * within the allowed nodemask using a cursor as the starting point for the @@ -980,8 +979,7 @@ static inline int zonelist_node_idx(struct zoneref *zoneref) */ struct zoneref *next_zones_zonelist(struct zoneref *z, enum zone_type highest_zoneidx, - nodemask_t *nodes, - struct zone **zone); + nodemask_t *nodes); /** * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist @@ -1000,8 +998,10 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, nodemask_t *nodes, struct zone **zone) { - return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes, - zone); + struct zoneref *z = next_zones_zonelist(zonelist->_zonerefs, + highest_zoneidx, nodes); + *zone = zonelist_zone(z); + return z; } /** @@ -1018,7 +1018,8 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, #define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \ for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone); \ zone; \ - z = next_zones_zonelist(++z, highidx, nodemask, &zone)) \ + z = next_zones_zonelist(++z, highidx, nodemask), \ + zone = zonelist_zone(z)) \ /** * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index diff --git a/mm/mmzone.c b/mm/mmzone.c index bf34fb8556db..7d87ebb0d632 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -54,8 +54,7 @@ static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes) /* Returns the next zone at or below highest_zoneidx in a zonelist */ struct zoneref *next_zones_zonelist(struct zoneref *z, enum zone_type highest_zoneidx, - nodemask_t *nodes, - struct zone **zone) + nodemask_t *nodes) { /* * Find the next suitable zone to use for the allocation. @@ -69,7 +68,6 @@ struct zoneref *next_zones_zonelist(struct zoneref *z, (z->zone && !zref_in_nodemask(z, nodes))) z++; - *zone = zonelist_zone(z); return z; } -- cgit v1.2.3 From 90cbc2508827e1e15dca23361c33cc26dd2b9e99 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 11 Feb 2015 15:25:55 -0800 Subject: vmscan: force scan offline memory cgroups Since commit b2052564e66d ("mm: memcontrol: continue cache reclaim from offlined groups") pages charged to a memory cgroup are not reparented when the cgroup is removed. Instead, they are supposed to be reclaimed in a regular way, along with pages accounted to online memory cgroups. However, an lruvec of an offline memory cgroup will sooner or later get so small that it will be scanned only at low scan priorities (see get_scan_count()). 
Therefore, if there are enough reclaimable pages in big lruvecs, pages accounted to offline memory cgroups will never be scanned at all, wasting memory. Fix this by unconditionally forcing scanning dead lruvecs from kswapd. [akpm@linux-foundation.org: fix build] Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 ++++++ mm/memcontrol.c | 14 ++++++++++++++ mm/vmscan.c | 8 ++++++-- 3 files changed, 26 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 76b4084b8d08..353537a5981a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -102,6 +102,7 @@ void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); * For memory reclaim. */ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec); +bool mem_cgroup_lruvec_online(struct lruvec *lruvec); int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list); void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int); @@ -266,6 +267,11 @@ mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) return 1; } +static inline bool mem_cgroup_lruvec_online(struct lruvec *lruvec) +{ + return true; +} + static inline unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 028d07c79104..6187ca4d5dc2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1367,6 +1367,20 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) return inactive * inactive_ratio < active; } +bool mem_cgroup_lruvec_online(struct lruvec *lruvec) +{ + struct mem_cgroup_per_zone *mz; + struct mem_cgroup *memcg; + + if (mem_cgroup_disabled()) + return true; + + mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); + memcg = mz->memcg; + + return !!(memcg->css.flags & CSS_ONLINE); +} + #define mem_cgroup_from_counter(counter, member) \ container_of(counter, struct mem_cgroup, member) diff --git a/mm/vmscan.c b/mm/vmscan.c index f756a202d5d5..b6dfa0081a8e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1903,8 +1903,12 @@ static void get_scan_count(struct lruvec *lruvec, int swappiness, * latencies, so it's better to scan a minimum amount there as * well. */ - if (current_is_kswapd() && !zone_reclaimable(zone)) - force_scan = true; + if (current_is_kswapd()) { + if (!zone_reclaimable(zone)) + force_scan = true; + if (!mem_cgroup_lruvec_online(lruvec)) + force_scan = true; + } if (!global_reclaim(sc)) force_scan = true; -- cgit v1.2.3 From 650c5e565492f9092552bfe4d65935196c7d9567 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 11 Feb 2015 15:26:03 -0800 Subject: mm: page_counter: pull "-1" handling out of page_counter_memparse() The unified hierarchy interface for memory cgroups will no longer use "-1" to mean maximum possible resource value. In preparation for this, make the string an argument and let the caller supply it. 
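As a hypothetical userspace model of the resulting calling convention (not the kernel code; strtoul() stands in for the kernel's memparse(), and PAGE_COUNTER_MAX is given a stand-in value):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_COUNTER_MAX (~0UL)	/* stand-in; the kernel value differs */

/* Model of page_counter_memparse(): the "maximum" token is now supplied
 * by the caller instead of being hardcoded to "-1". */
static int memparse_model(const char *buf, const char *max,
			  unsigned long *nr_pages)
{
	char *end;

	if (!strcmp(buf, max)) {
		*nr_pages = PAGE_COUNTER_MAX;
		return 0;
	}
	*nr_pages = strtoul(buf, &end, 0);
	return *end == '\0' ? 0 : -1;
}

int main(void)
{
	unsigned long n;

	memparse_model("-1", "-1", &n);			/* legacy cgroup file */
	printf("legacy -1 -> %lu\n", n);
	memparse_model("infinity", "infinity", &n);	/* unified hierarchy */
	printf("infinity  -> %lu\n", n);
	memparse_model("4096", "infinity", &n);
	printf("4096      -> %lu\n", n);
	return 0;
}

With this shape, the legacy control files keep passing "-1" as the maximum token, while unified hierarchy files can pass "infinity", as the hunks below show.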
Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: Greg Thelen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_counter.h | 3 ++- mm/hugetlb_cgroup.c | 2 +- mm/memcontrol.c | 4 ++-- mm/page_counter.c | 7 ++++--- net/ipv4/tcp_memcontrol.c | 2 +- 5 files changed, 10 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 955421575d16..17fa4f8de3a6 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -41,7 +41,8 @@ int page_counter_try_charge(struct page_counter *counter, struct page_counter **fail); void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages); int page_counter_limit(struct page_counter *counter, unsigned long limit); -int page_counter_memparse(const char *buf, unsigned long *nr_pages); +int page_counter_memparse(const char *buf, const char *max, + unsigned long *nr_pages); static inline void page_counter_reset_watermark(struct page_counter *counter) { diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 037e1c00a5b7..6e0057439a46 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -279,7 +279,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, return -EINVAL; buf = strstrip(buf); - ret = page_counter_memparse(buf, &nr_pages); + ret = page_counter_memparse(buf, "-1", &nr_pages); if (ret) return ret; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index dc5c4cd0afdf..6453ea5a27aa 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3414,7 +3414,7 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, int ret; buf = strstrip(buf); - ret = page_counter_memparse(buf, &nr_pages); + ret = page_counter_memparse(buf, "-1", &nr_pages); if (ret) return ret; @@ -3786,7 +3786,7 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, unsigned long usage; int i, size, ret; - ret = page_counter_memparse(args, &threshold); + ret = page_counter_memparse(args, "-1", &threshold); if (ret) return ret; diff --git a/mm/page_counter.c b/mm/page_counter.c index a009574fbba9..11b4beda14ba 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -166,18 +166,19 @@ int page_counter_limit(struct page_counter *counter, unsigned long limit) /** * page_counter_memparse - memparse() for page counter limits * @buf: string to parse + * @max: string meaning maximum possible value * @nr_pages: returns the result in number of pages * * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be * limited to %PAGE_COUNTER_MAX. 
*/ -int page_counter_memparse(const char *buf, unsigned long *nr_pages) +int page_counter_memparse(const char *buf, const char *max, + unsigned long *nr_pages) { - char unlimited[] = "-1"; char *end; u64 bytes; - if (!strncmp(buf, unlimited, sizeof(unlimited))) { + if (!strcmp(buf, max)) { *nr_pages = PAGE_COUNTER_MAX; return 0; } diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 272327134a1b..c2a75c6957a1 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -120,7 +120,7 @@ static ssize_t tcp_cgroup_write(struct kernfs_open_file *of, switch (of_cft(of)->private) { case RES_LIMIT: /* see memcontrol.c */ - ret = page_counter_memparse(buf, &nr_pages); + ret = page_counter_memparse(buf, "-1", &nr_pages); if (ret) break; mutex_lock(&tcp_limit_mutex); -- cgit v1.2.3 From 241994ed8649f7300667be8b13a9e04ae04e05a1 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 11 Feb 2015 15:26:06 -0800 Subject: mm: memcontrol: default hierarchy interface for memory Introduce the basic control files to account, partition, and limit memory using cgroups in default hierarchy mode. This interface versioning allows us to address fundamental design issues in the existing memory cgroup interface, further explained below. The old interface will be maintained indefinitely, but a clearer model and improved workload performance should encourage existing users to switch over to the new one eventually. The control files are thus: - memory.current shows the current consumption of the cgroup and its descendants, in bytes. - memory.low configures the lower end of the cgroup's expected memory consumption range. The kernel considers memory below that boundary to be a reserve - the minimum that the workload needs in order to make forward progress - and generally avoids reclaiming it, unless there is an imminent risk of entering an OOM situation. - memory.high configures the upper end of the cgroup's expected memory consumption range. A cgroup whose consumption grows beyond this threshold is forced into direct reclaim, to work off the excess and to throttle new allocations heavily, but is generally allowed to continue and the OOM killer is not invoked. - memory.max configures the hard maximum amount of memory that the cgroup is allowed to consume before the OOM killer is invoked. - memory.events shows event counters that indicate how often the cgroup was reclaimed while below memory.low, how often it was forced to reclaim excess beyond memory.high, how often it hit memory.max, and how often it entered OOM due to memory.max. This allows users to identify configuration problems when observing a degradation in workload performance. An overcommitted system will have an increased rate of low boundary breaches, whereas increased rates of high limit breaches, maximum hits, or even OOM situations will indicate internally overcommitted cgroups. For existing users of memory cgroups, the following deviations from the current interface are worth pointing out and explaining: - The original lower boundary, the soft limit, is defined as a limit that is per default unset. As a result, the set of cgroups that global reclaim prefers is opt-in, rather than opt-out. The costs for optimizing these mostly negative lookups are so high that the implementation, despite its enormous size, does not even provide the basic desirable behavior. First off, the soft limit has no hierarchical meaning. 
All configured groups are organized in a global rbtree and treated like equal peers, regardless where they are located in the hierarchy. This makes subtree delegation impossible. Second, the soft limit reclaim pass is so aggressive that it not just introduces high allocation latencies into the system, but also impacts system performance due to overreclaim, to the point where the feature becomes self-defeating. The memory.low boundary on the other hand is a top-down allocated reserve. A cgroup enjoys reclaim protection when it and all its ancestors are below their low boundaries, which makes delegation of subtrees possible. Secondly, new cgroups have no reserve per default and in the common case most cgroups are eligible for the preferred reclaim pass. This allows the new low boundary to be efficiently implemented with just a minor addition to the generic reclaim code, without the need for out-of-band data structures and reclaim passes. Because the generic reclaim code considers all cgroups except for the ones running low in the preferred first reclaim pass, overreclaim of individual groups is eliminated as well, resulting in much better overall workload performance. - The original high boundary, the hard limit, is defined as a strict limit that can not budge, even if the OOM killer has to be called. But this generally goes against the goal of making the most out of the available memory. The memory consumption of workloads varies during runtime, and that requires users to overcommit. But doing that with a strict upper limit requires either a fairly accurate prediction of the working set size or adding slack to the limit. Since working set size estimation is hard and error prone, and getting it wrong results in OOM kills, most users tend to err on the side of a looser limit and end up wasting precious resources. The memory.high boundary on the other hand can be set much more conservatively. When hit, it throttles allocations by forcing them into direct reclaim to work off the excess, but it never invokes the OOM killer. As a result, a high boundary that is chosen too aggressively will not terminate the processes, but instead it will lead to gradual performance degradation. The user can monitor this and make corrections until the minimal memory footprint that still gives acceptable performance is found. In extreme cases, with many concurrent allocations and a complete breakdown of reclaim progress within the group, the high boundary can be exceeded. But even then it's mostly better to satisfy the allocation from the slack available in other groups or the rest of the system than killing the group. Otherwise, memory.max is there to limit this type of spillover and ultimately contain buggy or even malicious applications. - The original control file names are unwieldy and inconsistent in many different ways. For example, the upper boundary hit count is exported in the memory.failcnt file, but an OOM event count has to be manually counted by listening to memory.oom_control events, and lower boundary / soft limit events have to be counted by first setting a threshold for that value and then counting those events. Also, usage and limit files encode their units in the filename. That makes the filenames very long, even though this is not information that a user needs to be reminded of every time they type out those names. 
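As a concrete, hypothetical illustration of the new control files in use (the cgroup2 mount point /sys/fs/cgroup and the group name "job" are assumptions, not part of the patch; values are parsed as bytes, and "infinity" lifts a boundary):

#include <stdio.h>

/* Write a value string to one of the new memory interface files. */
static int write_file(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	fprintf(f, "%s\n", val);
	return fclose(f);
}

int main(void)
{
	/* Reserve ~64M, throttle above 512M, hard-cap at 1G. */
	write_file("/sys/fs/cgroup/job/memory.low", "67108864");
	write_file("/sys/fs/cgroup/job/memory.high", "536870912");
	write_file("/sys/fs/cgroup/job/memory.max", "1073741824");
	/* "infinity" sets a boundary back to the highest possible value. */
	write_file("/sys/fs/cgroup/job/memory.max", "infinity");
	return 0;
}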
To address these naming issues, as well as to signal clearly that the new interface carries a new configuration model, the naming conventions in it necessarily differ from the old interface. - The original limit files indicate the state of an unset limit with a very high number, and a configured limit can be unset by echoing -1 into those files. But that very high number is implementation and architecture dependent and not very descriptive. And while -1 can be understood as an underflow into the highest possible value, -2 or -10M etc. do not work, so it's not consistent. memory.low, memory.high, and memory.max will use the string "infinity" to indicate and set the highest possible value. [akpm@linux-foundation.org: use seq_puts() for basic strings] Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: Greg Thelen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/unified-hierarchy.txt | 79 ++++++++++ include/linux/memcontrol.h | 32 ++++ mm/memcontrol.c | 229 ++++++++++++++++++++++++++-- mm/vmscan.c | 22 ++- 4 files changed, 348 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt index 4f4563277864..71daa35ec2d9 100644 --- a/Documentation/cgroups/unified-hierarchy.txt +++ b/Documentation/cgroups/unified-hierarchy.txt @@ -327,6 +327,85 @@ supported and the interface files "release_agent" and - use_hierarchy is on by default and the cgroup file for the flag is not created. +- The original lower boundary, the soft limit, is defined as a limit + that is per default unset. As a result, the set of cgroups that + global reclaim prefers is opt-in, rather than opt-out. The costs + for optimizing these mostly negative lookups are so high that the + implementation, despite its enormous size, does not even provide the + basic desirable behavior. First off, the soft limit has no + hierarchical meaning. All configured groups are organized in a + global rbtree and treated like equal peers, regardless where they + are located in the hierarchy. This makes subtree delegation + impossible. Second, the soft limit reclaim pass is so aggressive + that it not just introduces high allocation latencies into the + system, but also impacts system performance due to overreclaim, to + the point where the feature becomes self-defeating. + + The memory.low boundary on the other hand is a top-down allocated + reserve. A cgroup enjoys reclaim protection when it and all its + ancestors are below their low boundaries, which makes delegation of + subtrees possible. Secondly, new cgroups have no reserve per + default and in the common case most cgroups are eligible for the + preferred reclaim pass. This allows the new low boundary to be + efficiently implemented with just a minor addition to the generic + reclaim code, without the need for out-of-band data structures and + reclaim passes. Because the generic reclaim code considers all + cgroups except for the ones running low in the preferred first + reclaim pass, overreclaim of individual groups is eliminated as + well, resulting in much better overall workload performance. + +- The original high boundary, the hard limit, is defined as a strict + limit that can not budge, even if the OOM killer has to be called. + But this generally goes against the goal of making the most out of + the available memory. The memory consumption of workloads varies + during runtime, and that requires users to overcommit.
But doing + that with a strict upper limit requires either a fairly accurate + prediction of the working set size or adding slack to the limit. + Since working set size estimation is hard and error prone, and + getting it wrong results in OOM kills, most users tend to err on the + side of a looser limit and end up wasting precious resources. + + The memory.high boundary on the other hand can be set much more + conservatively. When hit, it throttles allocations by forcing them + into direct reclaim to work off the excess, but it never invokes the + OOM killer. As a result, a high boundary that is chosen too + aggressively will not terminate the processes, but instead it will + lead to gradual performance degradation. The user can monitor this + and make corrections until the minimal memory footprint that still + gives acceptable performance is found. + + In extreme cases, with many concurrent allocations and a complete + breakdown of reclaim progress within the group, the high boundary + can be exceeded. But even then it's mostly better to satisfy the + allocation from the slack available in other groups or the rest of + the system than killing the group. Otherwise, memory.max is there + to limit this type of spillover and ultimately contain buggy or even + malicious applications. + +- The original control file names are unwieldy and inconsistent in + many different ways. For example, the upper boundary hit count is + exported in the memory.failcnt file, but an OOM event count has to + be manually counted by listening to memory.oom_control events, and + lower boundary / soft limit events have to be counted by first + setting a threshold for that value and then counting those events. + Also, usage and limit files encode their units in the filename. + That makes the filenames very long, even though this is not + information that a user needs to be reminded of every time they type + out those names. + + To address these naming issues, as well as to signal clearly that + the new interface carries a new configuration model, the naming + conventions in it necessarily differ from the old interface. + +- The original limit files indicate the state of an unset limit with a + Very High Number, and a configured limit can be unset by echoing -1 + into those files. But that very high number is implementation and + architecture dependent and not very descriptive. And while -1 can + be understood as an underflow into the highest possible value, -2 or + -10M etc. do not work, so it's not consistent. + + memory.low, memory.high, and memory.max will use the string + "infinity" to indicate and set the highest possible value. 5. 
Planned Changes diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 353537a5981a..6cfd934c7c9b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -52,7 +52,27 @@ struct mem_cgroup_reclaim_cookie { unsigned int generation; }; +enum mem_cgroup_events_index { + MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ + MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ + MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ + MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ + MEM_CGROUP_EVENTS_NSTATS, + /* default hierarchy events */ + MEMCG_LOW = MEM_CGROUP_EVENTS_NSTATS, + MEMCG_HIGH, + MEMCG_MAX, + MEMCG_OOM, + MEMCG_NR_EVENTS, +}; + #ifdef CONFIG_MEMCG +void mem_cgroup_events(struct mem_cgroup *memcg, + enum mem_cgroup_events_index idx, + unsigned int nr); + +bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg); + int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, struct mem_cgroup **memcgp); void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, @@ -175,6 +195,18 @@ void mem_cgroup_split_huge_fixup(struct page *head); #else /* CONFIG_MEMCG */ struct mem_cgroup; +static inline void mem_cgroup_events(struct mem_cgroup *memcg, + enum mem_cgroup_events_index idx, + unsigned int nr) +{ +} + +static inline bool mem_cgroup_low(struct mem_cgroup *root, + struct mem_cgroup *memcg) +{ + return false; +} + static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, struct mem_cgroup **memcgp) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6453ea5a27aa..ee97c9ac62c0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -97,14 +97,6 @@ static const char * const mem_cgroup_stat_names[] = { "swap", }; -enum mem_cgroup_events_index { - MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ - MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ - MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ - MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ - MEM_CGROUP_EVENTS_NSTATS, -}; - static const char * const mem_cgroup_events_names[] = { "pgpgin", "pgpgout", @@ -138,7 +130,7 @@ enum mem_cgroup_events_target { struct mem_cgroup_stat_cpu { long count[MEM_CGROUP_STAT_NSTATS]; - unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; + unsigned long events[MEMCG_NR_EVENTS]; unsigned long nr_page_events; unsigned long targets[MEM_CGROUP_NTARGETS]; }; @@ -284,6 +276,10 @@ struct mem_cgroup { struct page_counter memsw; struct page_counter kmem; + /* Normal memory consumption range */ + unsigned long low; + unsigned long high; + unsigned long soft_limit; /* vmpressure notifications */ @@ -2315,6 +2311,8 @@ retry: if (!(gfp_mask & __GFP_WAIT)) goto nomem; + mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1); + nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, gfp_mask, may_swap); @@ -2356,6 +2354,8 @@ retry: if (fatal_signal_pending(current)) goto bypass; + mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1); + mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages)); nomem: if (!(gfp_mask & __GFP_NOFAIL)) @@ -2367,6 +2367,16 @@ done_restock: css_get_many(&memcg->css, batch); if (batch > nr_pages) refill_stock(memcg, batch - nr_pages); + /* + * If the hierarchy is above the normal consumption range, + * make the charging task trim their excess contribution. 
+ */ + do { + if (page_counter_read(&memcg->memory) <= memcg->high) + continue; + mem_cgroup_events(memcg, MEMCG_HIGH, 1); + try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); + } while ((memcg = parent_mem_cgroup(memcg))); done: return ret; } @@ -4276,7 +4286,7 @@ out_kfree: return ret; } -static struct cftype mem_cgroup_files[] = { +static struct cftype mem_cgroup_legacy_files[] = { { .name = "usage_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), @@ -4552,6 +4562,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (parent_css == NULL) { root_mem_cgroup = memcg; page_counter_init(&memcg->memory, NULL); + memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; page_counter_init(&memcg->memsw, NULL); page_counter_init(&memcg->kmem, NULL); @@ -4597,6 +4608,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) if (parent->use_hierarchy) { page_counter_init(&memcg->memory, &parent->memory); + memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; page_counter_init(&memcg->memsw, &parent->memsw); page_counter_init(&memcg->kmem, &parent->kmem); @@ -4607,6 +4619,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) */ } else { page_counter_init(&memcg->memory, NULL); + memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; page_counter_init(&memcg->memsw, NULL); page_counter_init(&memcg->kmem, NULL); @@ -4682,6 +4695,8 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX); mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX); memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX); + memcg->low = 0; + memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; } @@ -5267,6 +5282,147 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) mem_cgroup_from_css(root_css)->use_hierarchy = true; } +static u64 memory_current_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return mem_cgroup_usage(mem_cgroup_from_css(css), false); +} + +static int memory_low_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long low = ACCESS_ONCE(memcg->low); + + if (low == PAGE_COUNTER_MAX) + seq_puts(m, "infinity\n"); + else + seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE); + + return 0; +} + +static ssize_t memory_low_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long low; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "infinity", &low); + if (err) + return err; + + memcg->low = low; + + return nbytes; +} + +static int memory_high_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long high = ACCESS_ONCE(memcg->high); + + if (high == PAGE_COUNTER_MAX) + seq_puts(m, "infinity\n"); + else + seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE); + + return 0; +} + +static ssize_t memory_high_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long high; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "infinity", &high); + if (err) + return err; + + memcg->high = high; + + return nbytes; +} + +static int memory_max_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long max = ACCESS_ONCE(memcg->memory.limit); + + if (max 
== PAGE_COUNTER_MAX) + seq_puts(m, "infinity\n"); + else + seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE); + + return 0; +} + +static ssize_t memory_max_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long max; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "infinity", &max); + if (err) + return err; + + err = mem_cgroup_resize_limit(memcg, max); + if (err) + return err; + + return nbytes; +} + +static int memory_events_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + + seq_printf(m, "low %lu\n", mem_cgroup_read_events(memcg, MEMCG_LOW)); + seq_printf(m, "high %lu\n", mem_cgroup_read_events(memcg, MEMCG_HIGH)); + seq_printf(m, "max %lu\n", mem_cgroup_read_events(memcg, MEMCG_MAX)); + seq_printf(m, "oom %lu\n", mem_cgroup_read_events(memcg, MEMCG_OOM)); + + return 0; +} + +static struct cftype memory_files[] = { + { + .name = "current", + .read_u64 = memory_current_read, + }, + { + .name = "low", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_low_show, + .write = memory_low_write, + }, + { + .name = "high", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_high_show, + .write = memory_high_write, + }, + { + .name = "max", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_max_show, + .write = memory_max_write, + }, + { + .name = "events", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_events_show, + }, + { } /* terminate */ +}; + struct cgroup_subsys memory_cgrp_subsys = { .css_alloc = mem_cgroup_css_alloc, .css_online = mem_cgroup_css_online, @@ -5277,7 +5433,8 @@ struct cgroup_subsys memory_cgrp_subsys = { .cancel_attach = mem_cgroup_cancel_attach, .attach = mem_cgroup_move_task, .bind = mem_cgroup_bind, - .legacy_cftypes = mem_cgroup_files, + .dfl_cftypes = memory_files, + .legacy_cftypes = mem_cgroup_legacy_files, .early_init = 0, }; @@ -5312,6 +5469,56 @@ static void __init enable_swap_cgroup(void) } #endif +/** + * mem_cgroup_events - count memory events against a cgroup + * @memcg: the memory cgroup + * @idx: the event index + * @nr: the number of events to account for + */ +void mem_cgroup_events(struct mem_cgroup *memcg, + enum mem_cgroup_events_index idx, + unsigned int nr) +{ + this_cpu_add(memcg->stat->events[idx], nr); +} + +/** + * mem_cgroup_low - check if memory consumption is below the normal range + * @root: the highest ancestor to consider + * @memcg: the memory cgroup to check + * + * Returns %true if memory consumption of @memcg, and that of all + * configurable ancestors up to @root, is below the normal range. + */ +bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) +{ + if (mem_cgroup_disabled()) + return false; + + /* + * The toplevel group doesn't have a configurable range, so + * it's never low when looked at directly, and it is not + * considered an ancestor when assessing the hierarchy. 
+ */ + + if (memcg == root_mem_cgroup) + return false; + + if (page_counter_read(&memcg->memory) > memcg->low) + return false; + + while (memcg != root) { + memcg = parent_mem_cgroup(memcg); + + if (memcg == root_mem_cgroup) + break; + + if (page_counter_read(&memcg->memory) > memcg->low) + return false; + } + return true; +} + #ifdef CONFIG_MEMCG_SWAP /** * mem_cgroup_swapout - transfer a memsw charge to swap diff --git a/mm/vmscan.c b/mm/vmscan.c index b6dfa0081a8e..8e645ee52045 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -91,6 +91,9 @@ struct scan_control { /* Can pages be swapped as part of reclaim? */ unsigned int may_swap:1; + /* Can cgroups be reclaimed below their normal consumption range? */ + unsigned int may_thrash:1; + unsigned int hibernation_mode:1; /* One of the zones is ready for compaction */ @@ -2294,6 +2297,12 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, struct lruvec *lruvec; int swappiness; + if (mem_cgroup_low(root, memcg)) { + if (!sc->may_thrash) + continue; + mem_cgroup_events(memcg, MEMCG_LOW, 1); + } + lruvec = mem_cgroup_zone_lruvec(zone, memcg); swappiness = mem_cgroup_swappiness(memcg); @@ -2315,8 +2324,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, mem_cgroup_iter_break(root, memcg); break; } - memcg = mem_cgroup_iter(root, memcg, &reclaim); - } while (memcg); + } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim))); /* * Shrink the slab caches in the same proportion that @@ -2519,10 +2527,11 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct scan_control *sc) { + int initial_priority = sc->priority; unsigned long total_scanned = 0; unsigned long writeback_threshold; bool zones_reclaimable; - +retry: delayacct_freepages_start(); if (global_reclaim(sc)) @@ -2572,6 +2581,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, if (sc->compaction_ready) return 1; + /* Untapped cgroup reserves? Don't OOM, retry. */ + if (!sc->may_thrash) { + sc->priority = initial_priority; + sc->may_thrash = 1; + goto retry; + } + /* Any of the zones still reclaimable? Don't OOM. */ if (zones_reclaimable) return 1; -- cgit v1.2.3 From 49550b605587924b3336386caae53200c68969d3 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 11 Feb 2015 15:26:12 -0800 Subject: oom: add helpers for setting and clearing TIF_MEMDIE This patchset addresses a race which was described in the changelog for 5695be142e20 ("OOM, PM: OOM killed task shouldn't escape PM suspend"): : PM freezer relies on having all tasks frozen by the time devices are : getting frozen so that no task will touch them while they are getting : frozen. But OOM killer is allowed to kill an already frozen task in order : to handle OOM situtation. In order to protect from late wake ups OOM : killer is disabled after all tasks are frozen. This, however, still keeps : a window open when a killed task didn't manage to die by the time : freeze_processes finishes. The original patch hasn't closed the race window completely because that would require a more complex solution, as can be seen from this patchset. The primary motivation was to close the race condition between OOM killer and PM freezer _completely_. As Tejun pointed out, even though the race condition is unlikely, it would be all the harder to debug weird bugs deep in the PM freezer when the debugging options are reduced considerably.
I can only speculate what might happen when a task is still runnable unexpectedly. On the plus side, and as a side effect, the oom enable/disable has better (full barrier) semantics without polluting hot paths. I have tested the series in KVM with 100M RAM: - many small tasks (20M anon mmap) which are triggering OOM continually - s2ram which resumes automatically is triggered in a loop echo processors > /sys/power/pm_test while true do echo mem > /sys/power/state sleep 1s done - a simple module which allocates and frees 20M in 8K chunks. If it sees freezing(current) then it tries another round of allocation before calling try_to_freeze - debugging messages of PM stages and OOM killer enable/disable/fail added and unmark_oom_victim is delayed by 1s after it clears TIF_MEMDIE and before it wakes up waiters. - rebased on top of the current mmotm which means some necessary updates in mm/oom_kill.c. mark_tsk_oom_victim is now called under task_lock but I think this should be OK because __thaw_task shouldn't interfere with any locking down wake_up_process. Oleg? As expected there are no OOM killed tasks after oom is disabled, and allocations requested by the kernel thread are failing after all the tasks are frozen and OOM disabled. I wasn't able to catch a race where oom_killer_disable would really have to wait, but I kind of expected that, since the race is really unlikely. [ 242.609330] Killed process 2992 (mem_eater) total-vm:24412kB, anon-rss:2164kB, file-rss:4kB [ 243.628071] Unmarking 2992 OOM victim. oom_victims: 1 [ 243.636072] (elapsed 2.837 seconds) done. [ 243.641985] Trying to disable OOM killer [ 243.643032] Waiting for concurent OOM victims [ 243.644342] OOM killer disabled [ 243.645447] Freezing remaining freezable tasks ... (elapsed 0.005 seconds) done. [ 243.652983] Suspending console(s) (use no_console_suspend to debug) [ 243.903299] kmem_eater: page allocation failure: order:1, mode:0x204010 [...] [ 243.992600] PM: suspend of devices complete after 336.667 msecs [ 243.993264] PM: late suspend of devices complete after 0.660 msecs [ 243.994713] PM: noirq suspend of devices complete after 1.446 msecs [ 243.994717] ACPI: Preparing to enter system sleep state S3 [ 243.994795] PM: Saving platform NVS memory [ 243.994796] Disabling non-boot CPUs ... The first 2 patches are simple cleanups for OOM. They should go in regardless of the rest, IMO. Patches 3 and 4 are a trivial printk -> pr_info conversion and should likewise go in. The main patch is the last one and I would appreciate acks from Tejun and Rafael. I think the OOM part should be OK (except for __thaw_task vs. task_lock, where a look from Oleg would be appreciated), but I am not so sure I haven't screwed anything up in the freezer code. I have found several surprises there. This patch (of 5): This patch is just preparatory and it doesn't introduce any functional change. Note: I am utterly unhappy about the lowmemorykiller abusing TIF_MEMDIE just to wait for the oom victim and to prevent new killing. This is just a side effect of the flag. The primary meaning is to give the oom victim access to the memory reserves and that shouldn't be necessary here. Signed-off-by: Michal Hocko Cc: Tejun Heo Cc: David Rientjes Cc: Johannes Weiner Cc: Oleg Nesterov Cc: Cong Wang Cc: "Rafael J.
Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/staging/android/lowmemorykiller.c | 7 ++++++- include/linux/oom.h | 4 ++++ kernel/exit.c | 2 +- mm/memcontrol.c | 2 +- mm/oom_kill.c | 23 ++++++++++++++++++++--- 5 files changed, 32 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/staging/android/lowmemorykiller.c b/drivers/staging/android/lowmemorykiller.c index b545d3d1da3e..feafa172b155 100644 --- a/drivers/staging/android/lowmemorykiller.c +++ b/drivers/staging/android/lowmemorykiller.c @@ -160,7 +160,12 @@ static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc) selected->pid, selected->comm, selected_oom_score_adj, selected_tasksize); lowmem_deathpending_timeout = jiffies + HZ; - set_tsk_thread_flag(selected, TIF_MEMDIE); + /* + * FIXME: lowmemorykiller shouldn't abuse global OOM killer + * infrastructure. There is no real reason why the selected + * task should have access to the memory reserves. + */ + mark_tsk_oom_victim(selected); send_sig(SIGKILL, selected, 0); rem += selected_tasksize; } diff --git a/include/linux/oom.h b/include/linux/oom.h index 76200984d1e2..b42b80f88c3a 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -47,6 +47,10 @@ static inline bool oom_task_origin(const struct task_struct *p) return !!(p->signal->oom_flags & OOM_FLAG_ORIGIN); } +extern void mark_tsk_oom_victim(struct task_struct *tsk); + +extern void unmark_oom_victim(void); + extern unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages); diff --git a/kernel/exit.c b/kernel/exit.c index 6806c55475ee..02b3d1ab2ec0 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -435,7 +435,7 @@ static void exit_mm(struct task_struct *tsk) task_unlock(tsk); mm_update_next_owner(mm); mmput(mm); - clear_thread_flag(TIF_MEMDIE); + unmark_oom_victim(); } static struct task_struct *find_alive_thread(struct task_struct *p) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 11c9e6a1dad5..fe4d258ef32b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1556,7 +1556,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, * quickly exit and free its memory. */ if (fatal_signal_pending(current) || task_will_free_mem(current)) { - set_thread_flag(TIF_MEMDIE); + mark_tsk_oom_victim(current); return; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 294493a7ae4b..80b34e285f96 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -416,6 +416,23 @@ void note_oom_kill(void) atomic_inc(&oom_kills); } +/** + * mark_tsk_oom_victim - marks the given taks as OOM victim. + * @tsk: task to mark + */ +void mark_tsk_oom_victim(struct task_struct *tsk) +{ + set_tsk_thread_flag(tsk, TIF_MEMDIE); +} + +/** + * unmark_oom_victim - unmarks the current task as OOM victim. 
+ */ +void unmark_oom_victim(void) +{ + clear_thread_flag(TIF_MEMDIE); +} + #define K(x) ((x) << (PAGE_SHIFT-10)) /* * Must be called while holding a reference to p, which will be released upon @@ -440,7 +457,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, */ task_lock(p); if (p->mm && task_will_free_mem(p)) { - set_tsk_thread_flag(p, TIF_MEMDIE); + mark_tsk_oom_victim(p); task_unlock(p); put_task_struct(p); return; @@ -495,7 +512,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, /* mm cannot safely be dereferenced after task_unlock(victim) */ mm = victim->mm; - set_tsk_thread_flag(victim, TIF_MEMDIE); + mark_tsk_oom_victim(victim); pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), K(get_mm_counter(victim->mm, MM_ANONPAGES)), @@ -652,7 +669,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, */ if (current->mm && (fatal_signal_pending(current) || task_will_free_mem(current))) { - set_thread_flag(TIF_MEMDIE); + mark_tsk_oom_victim(current); return; } -- cgit v1.2.3 From c32b3cbe0d067a9cfae85aa70ba1e97ceba0ced7 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 11 Feb 2015 15:26:24 -0800 Subject: oom, PM: make OOM detection in the freezer path raceless Commit 5695be142e20 ("OOM, PM: OOM killed task shouldn't escape PM suspend") has left a race window when the OOM killer manages to note_oom_kill after freeze_processes checks the counter. The race window is quite small and really unlikely, and a partial solution was deemed sufficient at the time of submission. Tejun wasn't happy about this partial solution though and insisted on a full solution. That requires full exclusion between the OOM killer and the freezer's task freezing, though. This is done by this patch which introduces the oom_sem RW lock and turns oom_killer_disable() into a full OOM barrier. The oom_killer_disabled check is moved from the allocation path to the OOM level and we take oom_sem for reading for both the check and the whole OOM invocation. oom_killer_disable() takes oom_sem for writing so it waits for all currently running OOM killer invocations. Then it disables all further OOMs by setting oom_killer_disabled and checks for any oom victims. Victims are counted via mark_tsk_oom_victim and unmark_oom_victim respectively. The last victim wakes up all waiters enqueued by oom_killer_disable(). Therefore this function acts as the full OOM barrier. The page fault path is covered now as well although it was assumed to be safe before. As per Tejun, "We used to have freezing points deep in file system code which may be reachable from page fault." so it would be better and more robust to not rely on freezing points here. The same applies to the memcg OOM killer. out_of_memory tells the caller whether the OOM was allowed to trigger and the callers are supposed to handle the situation. The page allocation path simply fails the allocation the same as before. The page fault path will retry the fault (more on that later) and the Sysrq OOM trigger will simply complain to the log. Normally there wouldn't be any unfrozen user tasks after try_to_freeze_tasks so the function will not block. But if there was an OOM killer racing with try_to_freeze_tasks and the OOM victim didn't finish yet then we have to wait for it.
This should complete in a finite time, though, because:
- the victim cannot loop in the page fault handler (it would die on the way out from the exception)
- it cannot loop in the page allocator because all further allocations would fail and __GFP_NOFAIL allocations are not acceptable at this stage
- it shouldn't be blocked on any locks held by frozen tasks (try_to_freeze expects a lockless context) and kernel threads and work queues are not frozen yet
Signed-off-by: Michal Hocko Suggested-by: Tejun Heo Cc: David Rientjes Cc: Johannes Weiner Cc: Oleg Nesterov Cc: Cong Wang Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/tty/sysrq.c | 5 +- include/linux/oom.h | 14 ++---- kernel/exit.c | 3 +- kernel/power/process.c | 50 ++++--------------- mm/memcontrol.c | 2 +- mm/oom_kill.c | 132 +++++++++++++++++++++++++++++++++++++++++-------- mm/page_alloc.c | 17 +------ 7 files changed, 132 insertions(+), 91 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 0071469ecbf1..259a4d5a4e8f 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -355,8 +355,9 @@ static struct sysrq_key_op sysrq_term_op = { static void moom_callback(struct work_struct *ignored) { - out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL), GFP_KERNEL, - 0, NULL, true); + if (!out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL), + GFP_KERNEL, 0, NULL, true)) + pr_info("OOM request ignored because killer is disabled\n"); } static DECLARE_WORK(moom_work, moom_callback); diff --git a/include/linux/oom.h b/include/linux/oom.h index b42b80f88c3a..d5771bed59c9 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -72,22 +72,14 @@ extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task, unsigned long totalpages, const nodemask_t *nodemask, bool force_kill); -extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, +extern bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order, nodemask_t *mask, bool force_kill); extern int register_oom_notifier(struct notifier_block *nb); extern int unregister_oom_notifier(struct notifier_block *nb); extern bool oom_killer_disabled; - -static inline void oom_killer_disable(void) -{ - oom_killer_disabled = true; -} - -static inline void oom_killer_enable(void) -{ - oom_killer_disabled = false; -} +extern bool oom_killer_disable(void); +extern void oom_killer_enable(void); extern struct task_struct *find_lock_task_mm(struct task_struct *p); diff --git a/kernel/exit.c b/kernel/exit.c index 02b3d1ab2ec0..feff10bbb307 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -435,7 +435,8 @@ static void exit_mm(struct task_struct *tsk) task_unlock(tsk); mm_update_next_owner(mm); mmput(mm); - unmark_oom_victim(); + if (test_thread_flag(TIF_MEMDIE)) + unmark_oom_victim(); } static struct task_struct *find_alive_thread(struct task_struct *p) diff --git a/kernel/power/process.c b/kernel/power/process.c index 3ac45f192e9f..564f786df470 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -108,30 +108,6 @@ static int try_to_freeze_tasks(bool user_only) return todo ?
-EBUSY : 0; } -static bool __check_frozen_processes(void) -{ - struct task_struct *g, *p; - - for_each_process_thread(g, p) - if (p != current && !freezer_should_skip(p) && !frozen(p)) - return false; - - return true; -} - -/* - * Returns true if all freezable tasks (except for current) are frozen already - */ -static bool check_frozen_processes(void) -{ - bool ret; - - read_lock(&tasklist_lock); - ret = __check_frozen_processes(); - read_unlock(&tasklist_lock); - return ret; -} - /** * freeze_processes - Signal user space processes to enter the refrigerator. * The current thread will not be frozen. The same process that calls @@ -142,7 +118,6 @@ static bool check_frozen_processes(void) int freeze_processes(void) { int error; - int oom_kills_saved; error = __usermodehelper_disable(UMH_FREEZING); if (error) @@ -157,29 +132,22 @@ int freeze_processes(void) pm_wakeup_clear(); pr_info("Freezing user space processes ... "); pm_freezing = true; - oom_kills_saved = oom_kills_count(); error = try_to_freeze_tasks(true); if (!error) { __usermodehelper_set_disable_depth(UMH_DISABLED); - oom_killer_disable(); - - /* - * There might have been an OOM kill while we were - * freezing tasks and the killed task might be still - * on the way out so we have to double check for race. - */ - if (oom_kills_count() != oom_kills_saved && - !check_frozen_processes()) { - __usermodehelper_set_disable_depth(UMH_ENABLED); - pr_cont("OOM in progress."); - error = -EBUSY; - } else { - pr_cont("done."); - } + pr_cont("done."); } pr_cont("\n"); BUG_ON(in_atomic()); + /* + * Now that the whole userspace is frozen we need to disable + * the OOM killer to disallow any further interference with + * killable tasks. + */ + if (!error && !oom_killer_disable()) + error = -EBUSY; + if (error) thaw_processes(); return error; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fe4d258ef32b..fbf64e6f64e4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1930,7 +1930,7 @@ bool mem_cgroup_oom_synchronize(bool handle) if (!memcg) return false; - if (!handle) + if (!handle || oom_killer_disabled) goto cleanup; owait.memcg = memcg; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3cbd76b8c13b..b8df76ee2be3 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -398,30 +398,27 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, } /* - * Number of OOM killer invocations (including memcg OOM killer). - * Primarily used by PM freezer to check for potential races with - * OOM killed frozen task. + * Number of OOM victims in flight */ -static atomic_t oom_kills = ATOMIC_INIT(0); +static atomic_t oom_victims = ATOMIC_INIT(0); +static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait); -int oom_kills_count(void) -{ - return atomic_read(&oom_kills); -} - -void note_oom_kill(void) -{ - atomic_inc(&oom_kills); -} +bool oom_killer_disabled __read_mostly; +static DECLARE_RWSEM(oom_sem); /** * mark_tsk_oom_victim - marks the given task as OOM victim. * @tsk: task to mark + * + * Has to be called with oom_sem taken for read and never after + * oom has been disabled already.
*/ void mark_tsk_oom_victim(struct task_struct *tsk) { - set_tsk_thread_flag(tsk, TIF_MEMDIE); - + WARN_ON(oom_killer_disabled); + /* OOM killer might race with memcg OOM */ + if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE)) + return; /* * Make sure that the task is woken up from uninterruptible sleep * if it is frozen because OOM killer wouldn't be able to free @@ -429,14 +426,70 @@ void mark_tsk_oom_victim(struct task_struct *tsk) * that TIF_MEMDIE tasks should be ignored. */ __thaw_task(tsk); + atomic_inc(&oom_victims); } /** * unmark_oom_victim - unmarks the current task as OOM victim. + * + * Wakes up all waiters in oom_killer_disable() */ void unmark_oom_victim(void) { - clear_thread_flag(TIF_MEMDIE); + if (!test_and_clear_thread_flag(TIF_MEMDIE)) + return; + + down_read(&oom_sem); + /* + * There is no need to signal the last oom_victim if there + * is nobody who cares. + */ + if (!atomic_dec_return(&oom_victims) && oom_killer_disabled) + wake_up_all(&oom_victims_wait); + up_read(&oom_sem); +} + +/** + * oom_killer_disable - disable OOM killer + * + * Forces all page allocations to fail rather than trigger OOM killer. + * Will block and wait until all OOM victims are killed. + * + * The function cannot be called when there are runnable user tasks because + * the userspace would see unexpected allocation failures as a result. Any + * new usage of this function should be consulted with MM people. + * + * Returns true if successful and false if the OOM killer cannot be + * disabled. + */ +bool oom_killer_disable(void) +{ + /* + * Make sure to not race with an ongoing OOM killer + * and that the current is not the victim. + */ + down_write(&oom_sem); + if (test_thread_flag(TIF_MEMDIE)) { + up_write(&oom_sem); + return false; + } + + oom_killer_disabled = true; + up_write(&oom_sem); + + wait_event(oom_victims_wait, !atomic_read(&oom_victims)); + + return true; +} + +/** + * oom_killer_enable - enable OOM killer + */ +void oom_killer_enable(void) +{ + down_write(&oom_sem); + oom_killer_disabled = false; + up_write(&oom_sem); } #define K(x) ((x) << (PAGE_SHIFT-10)) @@ -637,7 +690,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask) } /** - * out_of_memory - kill the "best" process when we run out of memory + * __out_of_memory - kill the "best" process when we run out of memory * @zonelist: zonelist pointer * @gfp_mask: memory allocation flags * @order: amount of memory being requested as a power of 2 @@ -649,7 +702,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask) * OR try to be smart about which process to kill. Note that we * don't have to be perfect here, we just have to be good. */ -void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, +static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order, nodemask_t *nodemask, bool force_kill) { const nodemask_t *mpol_mask; @@ -718,6 +771,32 @@ out: schedule_timeout_killable(1); } +/** + * out_of_memory - tries to invoke OOM killer. + * @zonelist: zonelist pointer + * @gfp_mask: memory allocation flags + * @order: amount of memory being requested as a power of 2 + * @nodemask: nodemask passed to page allocator + * @force_kill: true if a task must be killed, even if others are exiting + * + * invokes __out_of_memory if the OOM killer has not been disabled by + * oom_killer_disable() and returns true. Otherwise returns false.
+ */ +bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, + int order, nodemask_t *nodemask, bool force_kill) +{ + bool ret = false; + + down_read(&oom_sem); + if (!oom_killer_disabled) { + __out_of_memory(zonelist, gfp_mask, order, nodemask, force_kill); + ret = true; + } + up_read(&oom_sem); + + return ret; +} + /* * The pagefault handler calls here because it is out of memory, so kill a * memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a @@ -727,12 +806,25 @@ void pagefault_out_of_memory(void) { struct zonelist *zonelist; + down_read(&oom_sem); if (mem_cgroup_oom_synchronize(true)) - return; + goto unlock; zonelist = node_zonelist(first_memory_node, GFP_KERNEL); if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) { - out_of_memory(NULL, 0, 0, NULL, false); + if (!oom_killer_disabled) + __out_of_memory(NULL, 0, 0, NULL, false); + else + /* + * There shouldn't be any user tasks runnable while the + * OOM killer is disabled so the current task has to + * be a racing OOM victim which oom_killer_disable() + * is waiting for. + */ + WARN_ON(test_thread_flag(TIF_MEMDIE)); + oom_zonelist_unlock(zonelist, GFP_KERNEL); } +unlock: + up_read(&oom_sem); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 641d5a9a8617..134e25525044 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -244,8 +244,6 @@ void set_pageblock_migratetype(struct page *page, int migratetype) PB_migrate, PB_migrate_end); } -bool oom_killer_disabled __read_mostly; - #ifdef CONFIG_DEBUG_VM static int page_outside_zone_boundaries(struct zone *zone, struct page *page) { @@ -2317,9 +2315,6 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, *did_some_progress = 0; - if (oom_killer_disabled) - return NULL; - /* * Acquire the per-zone oom lock for each zone. If that * fails, somebody else is making progress for us. @@ -2330,14 +2325,6 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, return NULL; } - /* - * PM-freezer should be notified that there might be an OOM killer on - * its way to kill and wake somebody up. This is too early and we might - * end up not killing anything but false positives are acceptable. - * See freeze_processes. - */ - note_oom_kill(); - /* * Go through the zonelist yet one more time, keep very high watermark * here, this is only to catch a parallel oom killing, we must fail if @@ -2372,8 +2359,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, goto out; } /* Exhausted what can be done so it's blamo time */ - out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false); - *did_some_progress = 1; + if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false)) + *did_some_progress = 1; out: oom_zonelist_unlock(ac->zonelist, gfp_mask); return page; -- cgit v1.2.3 From dc6c9a35b66b520cf67e05d8ca60ebecad3b0479 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 11 Feb 2015 15:26:50 -0800 Subject: mm: account pmd page tables to the process Dave noticed that an unprivileged process can allocate a significant amount of memory -- >500 MiB on x86_64 -- and stay unnoticed by the oom-killer and memory cgroup. The trick is to allocate a lot of PMD page tables. The Linux kernel doesn't account PMD tables to the process, only PTEs. The use-cases below use a few tricks to allocate a lot of PMD page tables while keeping VmRSS and VmPTE low. oom_score for the process will be 0.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/prctl.h>

#define PUD_SIZE (1UL << 30)
#define PMD_SIZE (1UL << 21)

#define NR_PUD 130000

int main(void)
{
    char *addr = NULL;
    unsigned long i;

    prctl(PR_SET_THP_DISABLE);
    for (i = 0; i < NR_PUD ; i++) {
        addr = mmap(addr + PUD_SIZE, PUD_SIZE, PROT_WRITE|PROT_READ,
                MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
        if (addr == MAP_FAILED) {
            perror("mmap");
            break;
        }
        *addr = 'x';
        munmap(addr, PMD_SIZE);
        mmap(addr, PMD_SIZE, PROT_WRITE|PROT_READ,
                MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED, -1, 0);
        if (addr == MAP_FAILED)
            perror("re-mmap"), exit(1);
    }
    printf("PID %d consumed %lu KiB in PMD page tables\n",
            getpid(), i * 4096 >> 10);
    return pause();
}
The patch addresses the issue by accounting PMD tables to the process the same way we account PTEs. The main places where PMD tables are accounted are __pmd_alloc() and free_pmd_range(). But there are a few corner cases:
- HugeTLB can share PMD page tables. The patch handles this by accounting the table to all processes who share it.
- x86 PAE pre-allocates a few PMD tables on fork.
- Architectures with FIRST_USER_ADDRESS > 0. We need to adjust the sanity check on exit(2).
Accounting only happens on configurations where the PMD page table level is present (PMD is not folded). As with nr_ptes we use a per-mm counter. The counter value is used to calculate the baseline for the badness score by the oom-killer. Signed-off-by: Kirill A. Shutemov Reported-by: Dave Hansen Cc: Hugh Dickins Reviewed-by: Cyrill Gorcunov Cc: Pavel Emelyanov Cc: David Rientjes Tested-by: Sedat Dilek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 12 ++++++------ arch/x86/mm/pgtable.c | 14 +++++++++----- fs/proc/task_mmu.c | 9 ++++++--- include/linux/mm.h | 24 ++++++++++++++++++++++++ include/linux/mm_types.h | 3 ++- kernel/fork.c | 3 +++ mm/debug.c | 3 ++- mm/hugetlb.c | 8 ++++++-- mm/memory.c | 15 +++++++++------ mm/mmap.c | 4 +++- mm/oom_kill.c | 9 +++++---- 11 files changed, 75 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 4415aa915681..e9c706e4627a 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -555,12 +555,12 @@ this is causing problems for your system/application. oom_dump_tasks -Enables a system-wide task dump (excluding kernel threads) to be -produced when the kernel performs an OOM-killing and includes such -information as pid, uid, tgid, vm size, rss, nr_ptes, swapents, -oom_score_adj score, and name. This is helpful to determine why the -OOM killer was invoked, to identify the rogue task that caused it, -and to determine why the OOM killer chose the task it did to kill. +Enables a system-wide task dump (excluding kernel threads) to be produced +when the kernel performs an OOM-killing and includes such information as +pid, uid, tgid, vm size, rss, nr_ptes, nr_pmds, swapents, oom_score_adj +score, and name. This is helpful to determine why the OOM killer was +invoked, to identify the rogue task that caused it, and to determine why +the OOM killer chose the task it did to kill. If this is set to zero, this information is suppressed.
On very large systems with thousands of tasks it may not be feasible to dump diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 6fb6927f9e76..7b22adaad4f1 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -190,7 +190,7 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) #endif /* CONFIG_X86_PAE */ -static void free_pmds(pmd_t *pmds[]) +static void free_pmds(struct mm_struct *mm, pmd_t *pmds[]) { int i; @@ -198,10 +198,11 @@ static void free_pmds(pmd_t *pmds[]) if (pmds[i]) { pgtable_pmd_page_dtor(virt_to_page(pmds[i])); free_page((unsigned long)pmds[i]); + mm_dec_nr_pmds(mm); } } -static int preallocate_pmds(pmd_t *pmds[]) +static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[]) { int i; bool failed = false; @@ -215,11 +216,13 @@ static int preallocate_pmds(pmd_t *pmds[]) pmd = NULL; failed = true; } + if (pmd) + mm_inc_nr_pmds(mm); pmds[i] = pmd; } if (failed) { - free_pmds(pmds); + free_pmds(mm, pmds); return -ENOMEM; } @@ -246,6 +249,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); pmd_free(mm, pmd); + mm_dec_nr_pmds(mm); } } } @@ -283,7 +287,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) mm->pgd = pgd; - if (preallocate_pmds(pmds) != 0) + if (preallocate_pmds(mm, pmds) != 0) goto out_free_pgd; if (paravirt_pgd_alloc(mm) != 0) @@ -304,7 +308,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) return pgd; out_free_pmds: - free_pmds(pmds); + free_pmds(mm, pmds); out_free_pgd: free_page((unsigned long)pgd); out: diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 6396f88c6687..e6e0abeb5d12 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -21,7 +21,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) { - unsigned long data, text, lib, swap; + unsigned long data, text, lib, swap, ptes, pmds; unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; /* @@ -42,6 +42,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; swap = get_mm_counter(mm, MM_SWAPENTS); + ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes); + pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm); seq_printf(m, "VmPeak:\t%8lu kB\n" "VmSize:\t%8lu kB\n" @@ -54,6 +56,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) "VmExe:\t%8lu kB\n" "VmLib:\t%8lu kB\n" "VmPTE:\t%8lu kB\n" + "VmPMD:\t%8lu kB\n" "VmSwap:\t%8lu kB\n", hiwater_vm << (PAGE_SHIFT-10), total_vm << (PAGE_SHIFT-10), @@ -63,8 +66,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) total_rss << (PAGE_SHIFT-10), data << (PAGE_SHIFT-10), mm->stack_vm << (PAGE_SHIFT-10), text, lib, - (PTRS_PER_PTE * sizeof(pte_t) * - atomic_long_read(&mm->nr_ptes)) >> 10, + ptes >> 10, + pmds >> 10, swap << (PAGE_SHIFT-10)); } diff --git a/include/linux/mm.h b/include/linux/mm.h index c6bf813a6b3d..644990b83cda 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1438,8 +1438,32 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, { return 0; } + +static inline unsigned long mm_nr_pmds(struct mm_struct *mm) +{ + return 0; +} + +static inline void mm_inc_nr_pmds(struct mm_struct *mm) {} +static inline void mm_dec_nr_pmds(struct mm_struct *mm) {} + #else int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); + +static inline unsigned long mm_nr_pmds(struct mm_struct *mm) +{ + return atomic_long_read(&mm->nr_pmds); +} + +static inline void 
mm_inc_nr_pmds(struct mm_struct *mm) +{ + atomic_long_inc(&mm->nr_pmds); +} + +static inline void mm_dec_nr_pmds(struct mm_struct *mm) +{ + atomic_long_dec(&mm->nr_pmds); +} #endif int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 20ff2105b564..199a03aab8dc 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -363,7 +363,8 @@ struct mm_struct { pgd_t * pgd; atomic_t mm_users; /* How many users with user space? */ atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ - atomic_long_t nr_ptes; /* Page table pages */ + atomic_long_t nr_ptes; /* PTE page table pages */ + atomic_long_t nr_pmds; /* PMD page table pages */ int map_count; /* number of VMAs */ spinlock_t page_table_lock; /* Protects page tables and some counters */ diff --git a/kernel/fork.c b/kernel/fork.c index b379d9abddc7..c99098c52641 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -555,6 +555,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; atomic_long_set(&mm->nr_ptes, 0); +#ifndef __PAGETABLE_PMD_FOLDED + atomic_long_set(&mm->nr_pmds, 0); +#endif mm->map_count = 0; mm->locked_vm = 0; mm->pinned_vm = 0; diff --git a/mm/debug.c b/mm/debug.c index d69cb5a7ba9a..3eb3ac2fcee7 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -173,7 +173,7 @@ void dump_mm(const struct mm_struct *mm) "get_unmapped_area %p\n" #endif "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" - "pgd %p mm_users %d mm_count %d nr_ptes %lu map_count %d\n" + "pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n" "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" "pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n" "start_code %lx end_code %lx start_data %lx end_data %lx\n" @@ -206,6 +206,7 @@ void dump_mm(const struct mm_struct *mm) mm->pgd, atomic_read(&mm->mm_users), atomic_read(&mm->mm_count), atomic_long_read((atomic_long_t *)&mm->nr_ptes), + mm_nr_pmds((struct mm_struct *)mm), mm->map_count, mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index fd28d6ba5e5d..0a9ac6c26832 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3598,6 +3598,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) if (saddr) { spte = huge_pte_offset(svma->vm_mm, saddr); if (spte) { + mm_inc_nr_pmds(mm); get_page(virt_to_page(spte)); break; } @@ -3609,11 +3610,13 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte); spin_lock(ptl); - if (pud_none(*pud)) + if (pud_none(*pud)) { pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK)); - else + } else { put_page(virt_to_page(spte)); + mm_inc_nr_pmds(mm); + } spin_unlock(ptl); out: pte = (pte_t *)pmd_alloc(mm, pud, addr); @@ -3644,6 +3647,7 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) pud_clear(pud); put_page(virt_to_page(ptep)); + mm_dec_nr_pmds(mm); *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; return 1; } diff --git a/mm/memory.c b/mm/memory.c index d63849b5188f..bbe6a73a899d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -428,6 +428,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, pmd = pmd_offset(pud, start); pud_clear(pud); pmd_free_tlb(tlb, pmd, start); + mm_dec_nr_pmds(tlb->mm); } 
static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, @@ -3322,15 +3323,17 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) spin_lock(&mm->page_table_lock); #ifndef __ARCH_HAS_4LEVEL_HACK - if (pud_present(*pud)) /* Another has populated it */ - pmd_free(mm, new); - else + if (!pud_present(*pud)) { + mm_inc_nr_pmds(mm); pud_populate(mm, pud, new); -#else - if (pgd_present(*pud)) /* Another has populated it */ + } else /* Another has populated it */ pmd_free(mm, new); - else +#else + if (!pgd_present(*pud)) { + mm_inc_nr_pmds(mm); pgd_populate(mm, pud, new); + } else /* Another has populated it */ + pmd_free(mm, new); #endif /* __ARCH_HAS_4LEVEL_HACK */ spin_unlock(&mm->page_table_lock); return 0; diff --git a/mm/mmap.c b/mm/mmap.c index 14d84666e8ba..6a7d36d133fb 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2853,7 +2853,9 @@ void exit_mmap(struct mm_struct *mm) vm_unacct_memory(nr_accounted); WARN_ON(atomic_long_read(&mm->nr_ptes) > - (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); + round_up(FIRST_USER_ADDRESS, PMD_SIZE) >> PMD_SHIFT); + WARN_ON(mm_nr_pmds(mm) > + round_up(FIRST_USER_ADDRESS, PUD_SIZE) >> PUD_SHIFT); } /* Insert vm structure into process list sorted by address diff --git a/mm/oom_kill.c b/mm/oom_kill.c index b8df76ee2be3..642f38cb175a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -169,8 +169,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, * The baseline for the badness score is the proportion of RAM that each * task's rss, pagetable and swap space use. */ - points = get_mm_rss(p->mm) + atomic_long_read(&p->mm->nr_ptes) + - get_mm_counter(p->mm, MM_SWAPENTS); + points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) + + atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm); task_unlock(p); /* @@ -351,7 +351,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) struct task_struct *p; struct task_struct *task; - pr_info("[ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name\n"); + pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds swapents oom_score_adj name\n"); rcu_read_lock(); for_each_process(p) { if (oom_unkillable_task(p, memcg, nodemask)) @@ -367,10 +367,11 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) continue; } - pr_info("[%5d] %5d %5d %8lu %8lu %7ld %8lu %5hd %s\n", + pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu %5hd %s\n", task->pid, from_kuid(&init_user_ns, task_uid(task)), task->tgid, task->mm->total_vm, get_mm_rss(task->mm), atomic_long_read(&task->mm->nr_ptes), + mm_nr_pmds(task->mm), get_mm_counter(task->mm, MM_SWAPENTS), task->signal->oom_score_adj, task->comm); task_unlock(task); -- cgit v1.2.3 From 16c4a097a035c01809aa0c0abd458ca1fe4ff3d0 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Feb 2015 15:27:01 -0800 Subject: mm/compaction: enhance tracepoint output for compaction begin/end We now have a tracepoint for the begin event of compaction which prints the start position of both scanners, but the tracepoint for the end event of compaction doesn't print the finish position of both scanners. It would also be useful to know the finish position of both scanners, so this patch adds it. It will help to find odd behavior or problems in compaction's internal logic. Mode is also added to both begin/end tracepoint output, since compaction behavior is quite different depending on the mode. And lastly, the status format is changed to a string rather than a status number for readability.
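To make the new output concrete, a begin/end pair in the trace would render along the following lines; the pfn values here are made-up placeholders, and the field layout simply follows the TP_printk format strings in the diff below:

    mm_compaction_begin: zone_start=0x200 migrate_pfn=0x200 free_pfn=0x37d00 zone_end=0x37e00, mode=async
    mm_compaction_end: zone_start=0x200 migrate_pfn=0x5a0 free_pfn=0x37b00 zone_end=0x37e00, mode=async status=complete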
[akpm@linux-foundation.org: fix sparse warning] Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: David Rientjes Cc: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 1 + include/trace/events/compaction.h | 49 ++++++++++++++++++++++++++++----------- mm/compaction.c | 15 ++++++++++-- 3 files changed, 49 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index f2efda2e6ac6..db64cae06530 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -12,6 +12,7 @@ #define COMPACT_PARTIAL 3 /* The full zone was compacted */ #define COMPACT_COMPLETE 4 +/* When adding new state, please change compaction_status_string, too */ /* Used to signal whether compaction detected need_sched() or lock contention */ /* No contention detected */ diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index 1337d9e01e3d..839f6fac921a 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -85,46 +85,67 @@ TRACE_EVENT(mm_compaction_migratepages, ); TRACE_EVENT(mm_compaction_begin, - TP_PROTO(unsigned long zone_start, unsigned long migrate_start, - unsigned long free_start, unsigned long zone_end), + TP_PROTO(unsigned long zone_start, unsigned long migrate_pfn, + unsigned long free_pfn, unsigned long zone_end, bool sync), - TP_ARGS(zone_start, migrate_start, free_start, zone_end), + TP_ARGS(zone_start, migrate_pfn, free_pfn, zone_end, sync), TP_STRUCT__entry( __field(unsigned long, zone_start) - __field(unsigned long, migrate_start) - __field(unsigned long, free_start) + __field(unsigned long, migrate_pfn) + __field(unsigned long, free_pfn) __field(unsigned long, zone_end) + __field(bool, sync) ), TP_fast_assign( __entry->zone_start = zone_start; - __entry->migrate_start = migrate_start; - __entry->free_start = free_start; + __entry->migrate_pfn = migrate_pfn; + __entry->free_pfn = free_pfn; __entry->zone_end = zone_end; + __entry->sync = sync; ), - TP_printk("zone_start=0x%lx migrate_start=0x%lx free_start=0x%lx zone_end=0x%lx", + TP_printk("zone_start=0x%lx migrate_pfn=0x%lx free_pfn=0x%lx zone_end=0x%lx, mode=%s", __entry->zone_start, - __entry->migrate_start, - __entry->free_start, - __entry->zone_end) + __entry->migrate_pfn, + __entry->free_pfn, + __entry->zone_end, + __entry->sync ? "sync" : "async") ); TRACE_EVENT(mm_compaction_end, - TP_PROTO(int status), + TP_PROTO(unsigned long zone_start, unsigned long migrate_pfn, + unsigned long free_pfn, unsigned long zone_end, bool sync, + int status), - TP_ARGS(status), + TP_ARGS(zone_start, migrate_pfn, free_pfn, zone_end, sync, status), TP_STRUCT__entry( + __field(unsigned long, zone_start) + __field(unsigned long, migrate_pfn) + __field(unsigned long, free_pfn) + __field(unsigned long, zone_end) + __field(bool, sync) __field(int, status) ), TP_fast_assign( + __entry->zone_start = zone_start; + __entry->migrate_pfn = migrate_pfn; + __entry->free_pfn = free_pfn; + __entry->zone_end = zone_end; + __entry->sync = sync; __entry->status = status; ), - TP_printk("status=%d", __entry->status) + TP_printk("zone_start=0x%lx migrate_pfn=0x%lx free_pfn=0x%lx zone_end=0x%lx, mode=%s status=%s", + __entry->zone_start, + __entry->migrate_pfn, + __entry->free_pfn, + __entry->zone_end, + __entry->sync ? 
"sync" : "async", + compaction_status_string[__entry->status]) ); #endif /* _TRACE_COMPACTION_H */ diff --git a/mm/compaction.c b/mm/compaction.c index 9c7e6909dd29..66f7c365e888 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -34,6 +34,15 @@ static inline void count_compact_events(enum vm_event_item item, long delta) #endif #if defined CONFIG_COMPACTION || defined CONFIG_CMA +#ifdef CONFIG_TRACEPOINTS +static const char *const compaction_status_string[] = { + "deferred", + "skipped", + "continue", + "partial", + "complete", +}; +#endif #define CREATE_TRACE_POINTS #include @@ -1197,7 +1206,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; } - trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn); + trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, + cc->free_pfn, end_pfn, sync); migrate_prep_local(); @@ -1299,7 +1309,8 @@ out: zone->compact_cached_free_pfn = free_pfn; } - trace_mm_compaction_end(ret); + trace_mm_compaction_end(start_pfn, cc->migrate_pfn, + cc->free_pfn, end_pfn, sync, ret); return ret; } -- cgit v1.2.3 From 837d026d560c5ef26abeca0441713d82e4e82cad Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Feb 2015 15:27:06 -0800 Subject: mm/compaction: more trace to understand when/why compaction start/finish It is not well analyzed that when/why compaction start/finish or not. With these new tracepoints, we can know much more about start/finish reason of compaction. I can find following bug with these tracepoint. http://www.spinics.net/lists/linux-mm/msg81582.html Signed-off-by: Joonsoo Kim Cc: Vlastimil Babka Cc: Mel Gorman Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 3 ++ include/trace/events/compaction.h | 74 +++++++++++++++++++++++++++++++++++++++ mm/compaction.c | 38 +++++++++++++++++--- 3 files changed, 111 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index db64cae06530..501d7513aac1 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -12,6 +12,9 @@ #define COMPACT_PARTIAL 3 /* The full zone was compacted */ #define COMPACT_COMPLETE 4 +/* For more detailed tracepoint output */ +#define COMPACT_NO_SUITABLE_PAGE 5 +#define COMPACT_NOT_SUITABLE_ZONE 6 /* When adding new state, please change compaction_status_string, too */ /* Used to signal whether compaction detected need_sched() or lock contention */ diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index 139020b55612..d46535801f63 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -164,6 +164,80 @@ TRACE_EVENT(mm_compaction_end, compaction_status_string[__entry->status]) ); +TRACE_EVENT(mm_compaction_try_to_compact_pages, + + TP_PROTO( + int order, + gfp_t gfp_mask, + enum migrate_mode mode), + + TP_ARGS(order, gfp_mask, mode), + + TP_STRUCT__entry( + __field(int, order) + __field(gfp_t, gfp_mask) + __field(enum migrate_mode, mode) + ), + + TP_fast_assign( + __entry->order = order; + __entry->gfp_mask = gfp_mask; + __entry->mode = mode; + ), + + TP_printk("order=%d gfp_mask=0x%x mode=%d", + __entry->order, + __entry->gfp_mask, + (int)__entry->mode) +); + +DECLARE_EVENT_CLASS(mm_compaction_suitable_template, + + TP_PROTO(struct zone *zone, + int order, + int ret), + + TP_ARGS(zone, order, ret), + + TP_STRUCT__entry( + __field(int, nid) + __field(char *, name) + 
__field(int, order) + __field(int, ret) + ), + + TP_fast_assign( + __entry->nid = zone_to_nid(zone); + __entry->name = (char *)zone->name; + __entry->order = order; + __entry->ret = ret; + ), + + TP_printk("node=%d zone=%-8s order=%d ret=%s", + __entry->nid, + __entry->name, + __entry->order, + compaction_status_string[__entry->ret]) +); + +DEFINE_EVENT(mm_compaction_suitable_template, mm_compaction_finished, + + TP_PROTO(struct zone *zone, + int order, + int ret), + + TP_ARGS(zone, order, ret) +); + +DEFINE_EVENT(mm_compaction_suitable_template, mm_compaction_suitable, + + TP_PROTO(struct zone *zone, + int order, + int ret), + + TP_ARGS(zone, order, ret) +); + #endif /* _TRACE_COMPACTION_H */ /* This part must be outside protection */ diff --git a/mm/compaction.c b/mm/compaction.c index b12df9fe10b4..b6ede459c1bb 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -41,6 +41,8 @@ static const char *const compaction_status_string[] = { "continue", "partial", "complete", + "no_suitable_page", + "not_suitable_zone", }; #endif @@ -1049,7 +1051,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE; } -static int compact_finished(struct zone *zone, struct compact_control *cc, +static int __compact_finished(struct zone *zone, struct compact_control *cc, const int migratetype) { unsigned int order; @@ -1104,7 +1106,20 @@ static int compact_finished(struct zone *zone, struct compact_control *cc, return COMPACT_PARTIAL; } - return COMPACT_CONTINUE; + return COMPACT_NO_SUITABLE_PAGE; +} + +static int compact_finished(struct zone *zone, struct compact_control *cc, + const int migratetype) +{ + int ret; + + ret = __compact_finished(zone, cc, migratetype); + trace_mm_compaction_finished(zone, cc->order, ret); + if (ret == COMPACT_NO_SUITABLE_PAGE) + ret = COMPACT_CONTINUE; + + return ret; } /* @@ -1114,7 +1129,7 @@ static int compact_finished(struct zone *zone, struct compact_control *cc, * COMPACT_PARTIAL - If the allocation would succeed without compaction * COMPACT_CONTINUE - If compaction should run now */ -unsigned long compaction_suitable(struct zone *zone, int order, +static unsigned long __compaction_suitable(struct zone *zone, int order, int alloc_flags, int classzone_idx) { int fragindex; @@ -1158,11 +1173,24 @@ unsigned long compaction_suitable(struct zone *zone, int order, */ fragindex = fragmentation_index(zone, order); if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) - return COMPACT_SKIPPED; + return COMPACT_NOT_SUITABLE_ZONE; return COMPACT_CONTINUE; } +unsigned long compaction_suitable(struct zone *zone, int order, + int alloc_flags, int classzone_idx) +{ + unsigned long ret; + + ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx); + trace_mm_compaction_suitable(zone, order, ret); + if (ret == COMPACT_NOT_SUITABLE_ZONE) + ret = COMPACT_SKIPPED; + + return ret; +} + static int compact_zone(struct zone *zone, struct compact_control *cc) { int ret; @@ -1376,6 +1404,8 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, if (!order || !may_enter_fs || !may_perform_io) return COMPACT_SKIPPED; + trace_mm_compaction_try_to_compact_pages(order, gfp_mask, mode); + /* Compact each zone in the list */ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, ac->nodemask) { -- cgit v1.2.3 From 24e2716f63e613cf15d3beba3faa0711bcacc427 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Feb 2015 15:27:09 -0800 Subject: mm/compaction: add tracepoint to 
observe behaviour of compaction defer Compaction deferring logic is a heavy hammer that blocks the way to compaction. It doesn't consider the overall system state, so it could falsely prevent the user from doing compaction. In other words, even if the system has a suitable range of memory to compact, compaction would be skipped due to the compaction deferring logic. This patch adds a new tracepoint to understand the work of the deferring logic. This will also help to check compaction success and failure. Signed-off-by: Joonsoo Kim Cc: Vlastimil Babka Cc: Mel Gorman Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 65 +++-------------------------- include/trace/events/compaction.h | 56 ++++++++++++++++++++++++++++ mm/compaction.c | 71 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 132 insertions(+), 60 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 501d7513aac1..a014559e4a49 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -44,66 +44,11 @@ extern void reset_isolation_suitable(pg_data_t *pgdat); extern unsigned long compaction_suitable(struct zone *zone, int order, int alloc_flags, int classzone_idx); -/* Do not skip compaction more than 64 times */ -#define COMPACT_MAX_DEFER_SHIFT 6 - -/* - * Compaction is deferred when compaction fails to result in a page - * allocation success. 1 << compact_defer_limit compactions are skipped up - * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT - */ -static inline void defer_compaction(struct zone *zone, int order) -{ - zone->compact_considered = 0; - zone->compact_defer_shift++; - - if (order < zone->compact_order_failed) - zone->compact_order_failed = order; - - if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT) - zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT; -} - -/* Returns true if compaction should be skipped this time */ -static inline bool compaction_deferred(struct zone *zone, int order) -{ - unsigned long defer_limit = 1UL << zone->compact_defer_shift; - - if (order < zone->compact_order_failed) - return false; - - /* Avoid possible overflow */ - if (++zone->compact_considered > defer_limit) - zone->compact_considered = defer_limit; - - return zone->compact_considered < defer_limit; -} - -/* - * Update defer tracking counters after successful compaction of given order, - * which means an allocation either succeeded (alloc_success == true) or is - * expected to succeed.
- */ -static inline void compaction_defer_reset(struct zone *zone, int order, - bool alloc_success) -{ - if (alloc_success) { - zone->compact_considered = 0; - zone->compact_defer_shift = 0; - } - if (order >= zone->compact_order_failed) - zone->compact_order_failed = order + 1; -} - -/* Returns true if restarting compaction after many failures */ -static inline bool compaction_restarting(struct zone *zone, int order) -{ - if (order < zone->compact_order_failed) - return false; - - return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT && - zone->compact_considered >= 1UL << zone->compact_defer_shift; -} +extern void defer_compaction(struct zone *zone, int order); +extern bool compaction_deferred(struct zone *zone, int order); +extern void compaction_defer_reset(struct zone *zone, int order, + bool alloc_success); +extern bool compaction_restarting(struct zone *zone, int order); #else static inline unsigned long try_to_compact_pages(gfp_t gfp_mask, diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index d46535801f63..9a6a3fe0fb51 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -238,6 +238,62 @@ DEFINE_EVENT(mm_compaction_suitable_template, mm_compaction_suitable, TP_ARGS(zone, order, ret) ); +#ifdef CONFIG_COMPACTION +DECLARE_EVENT_CLASS(mm_compaction_defer_template, + + TP_PROTO(struct zone *zone, int order), + + TP_ARGS(zone, order), + + TP_STRUCT__entry( + __field(int, nid) + __field(char *, name) + __field(int, order) + __field(unsigned int, considered) + __field(unsigned int, defer_shift) + __field(int, order_failed) + ), + + TP_fast_assign( + __entry->nid = zone_to_nid(zone); + __entry->name = (char *)zone->name; + __entry->order = order; + __entry->considered = zone->compact_considered; + __entry->defer_shift = zone->compact_defer_shift; + __entry->order_failed = zone->compact_order_failed; + ), + + TP_printk("node=%d zone=%-8s order=%d order_failed=%d consider=%u limit=%lu", + __entry->nid, + __entry->name, + __entry->order, + __entry->order_failed, + __entry->considered, + 1UL << __entry->defer_shift) +); + +DEFINE_EVENT(mm_compaction_defer_template, mm_compaction_deferred, + + TP_PROTO(struct zone *zone, int order), + + TP_ARGS(zone, order) +); + +DEFINE_EVENT(mm_compaction_defer_template, mm_compaction_defer_compaction, + + TP_PROTO(struct zone *zone, int order), + + TP_ARGS(zone, order) +); + +DEFINE_EVENT(mm_compaction_defer_template, mm_compaction_defer_reset, + + TP_PROTO(struct zone *zone, int order), + + TP_ARGS(zone, order) +); +#endif + #endif /* _TRACE_COMPACTION_H */ /* This part must be outside protection */ diff --git a/mm/compaction.c b/mm/compaction.c index b6ede459c1bb..b68736c8a1ce 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -124,6 +124,77 @@ static struct page *pageblock_pfn_to_page(unsigned long start_pfn, } #ifdef CONFIG_COMPACTION + +/* Do not skip compaction more than 64 times */ +#define COMPACT_MAX_DEFER_SHIFT 6 + +/* + * Compaction is deferred when compaction fails to result in a page + * allocation success. 
1 << compact_defer_limit compactions are skipped up + * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT + */ +void defer_compaction(struct zone *zone, int order) +{ + zone->compact_considered = 0; + zone->compact_defer_shift++; + + if (order < zone->compact_order_failed) + zone->compact_order_failed = order; + + if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT) + zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT; + + trace_mm_compaction_defer_compaction(zone, order); +} + +/* Returns true if compaction should be skipped this time */ +bool compaction_deferred(struct zone *zone, int order) +{ + unsigned long defer_limit = 1UL << zone->compact_defer_shift; + + if (order < zone->compact_order_failed) + return false; + + /* Avoid possible overflow */ + if (++zone->compact_considered > defer_limit) + zone->compact_considered = defer_limit; + + if (zone->compact_considered >= defer_limit) + return false; + + trace_mm_compaction_deferred(zone, order); + + return true; +} + +/* + * Update defer tracking counters after successful compaction of given order, + * which means an allocation either succeeded (alloc_success == true) or is + * expected to succeed. + */ +void compaction_defer_reset(struct zone *zone, int order, + bool alloc_success) +{ + if (alloc_success) { + zone->compact_considered = 0; + zone->compact_defer_shift = 0; + } + if (order >= zone->compact_order_failed) + zone->compact_order_failed = order + 1; + + trace_mm_compaction_defer_reset(zone, order); +} + +/* Returns true if restarting compaction after many failures */ +bool compaction_restarting(struct zone *zone, int order) +{ + if (order < zone->compact_order_failed) + return false; + + return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT && + zone->compact_considered >= 1UL << zone->compact_defer_shift; +} + /* Returns true if the pageblock should be scanned for pages to isolate. */ static inline bool isolation_suitable(struct compact_control *cc, struct page *page) -- cgit v1.2.3 From 077fcf116c8c2bd7ee9487b645aa3b50368db7e1 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 11 Feb 2015 15:27:12 -0800 Subject: mm/thp: allocate transparent hugepages on local node This makes sure that we try to allocate hugepages from the local node if allowed by the mempolicy. If we can't, we fall back to small page allocation based on the mempolicy. This is based on the observation that allocating pages on the local node is more beneficial than allocating hugepages on a remote node. With this patch applied we may find transparent huge page allocation failures if the current node doesn't have enough free hugepages. Before this patch such failures resulted in retrying the allocation on other nodes in the numa node mask. [akpm@linux-foundation.org: fix comment, add CONFIG_TRANSPARENT_HUGEPAGE dependency] Signed-off-by: Aneesh Kumar K.V Acked-by: Kirill A.
Shutemov Acked-by: Vlastimil Babka Cc: David Rientjes Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 4 +++ mm/huge_memory.c | 24 +++++++----------- mm/mempolicy.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index b840e3b2770d..60110e06419d 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -335,11 +335,15 @@ alloc_pages(gfp_t gfp_mask, unsigned int order) extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, struct vm_area_struct *vma, unsigned long addr, int node); +extern struct page *alloc_hugepage_vma(gfp_t gfp, struct vm_area_struct *vma, + unsigned long addr, int order); #else #define alloc_pages(gfp_mask, order) \ alloc_pages_node(numa_node_id(), gfp_mask, order) #define alloc_pages_vma(gfp_mask, order, vma, addr, node) \ alloc_pages(gfp_mask, order) +#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ + alloc_pages(gfp_mask, order) #endif #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) #define alloc_page_vma(gfp_mask, vma, addr) \ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 889713180980..0531ea7dd7cf 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -761,15 +761,6 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp; } -static inline struct page *alloc_hugepage_vma(int defrag, - struct vm_area_struct *vma, - unsigned long haddr, int nd, - gfp_t extra_gfp) -{ - return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp), - HPAGE_PMD_ORDER, vma, haddr, nd); -} - /* Caller must hold page table lock. */ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, @@ -790,6 +781,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags) { + gfp_t gfp; struct page *page; unsigned long haddr = address & HPAGE_PMD_MASK; @@ -824,8 +816,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, } return 0; } - page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), - vma, haddr, numa_node_id(), 0); + gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); + page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); if (unlikely(!page)) { count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -1113,10 +1105,12 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, spin_unlock(ptl); alloc: if (transparent_hugepage_enabled(vma) && - !transparent_hugepage_debug_cow()) - new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), - vma, haddr, numa_node_id(), 0); - else + !transparent_hugepage_debug_cow()) { + gfp_t gfp; + + gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); + new_page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); + } else new_page = NULL; if (unlikely(!new_page)) { diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0e0961b8c39c..8a32873fdbf7 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2030,6 +2030,78 @@ retry_cpuset: return page; } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/** + * alloc_hugepage_vma: Allocate a hugepage for a VMA + * @gfp: + * %GFP_USER user allocation. 
+ * %GFP_KERNEL kernel allocations, + * %GFP_HIGHMEM highmem/user allocations, + * %GFP_FS allocation should not call back into a file system. + * %GFP_ATOMIC don't sleep. + * + * @vma: Pointer to VMA or NULL if not available. + * @addr: Virtual Address of the allocation. Must be inside the VMA. + * @order: Order of the hugepage for gfp allocation. + * + * This function allocates a huge page from the kernel page pool and applies + * a NUMA policy associated with the VMA or the current process. + * For policy other than %MPOL_INTERLEAVE, we make sure we allocate hugepage + * only from the current node if the current node is part of the node mask. + * If we can't allocate a hugepage we fail the allocation and don't try to fall back + * to other nodes in the node mask. If the current node is not part of node mask + * or if the NUMA policy is MPOL_INTERLEAVE we use the allocator that can + * fallback to nodes in the policy node mask. + * + * When VMA is not NULL caller must hold down_read on the mmap_sem of the + * mm_struct of the VMA to prevent it from going away. Should be used for + * all allocations for pages that will be mapped into + * user space. Returns NULL when no page can be allocated. + * + * Should be called with vma->vm_mm->mmap_sem held. + * + */ +struct page *alloc_hugepage_vma(gfp_t gfp, struct vm_area_struct *vma, + unsigned long addr, int order) +{ + struct page *page; + nodemask_t *nmask; + struct mempolicy *pol; + int node = numa_node_id(); + unsigned int cpuset_mems_cookie; + +retry_cpuset: + pol = get_vma_policy(vma, addr); + cpuset_mems_cookie = read_mems_allowed_begin(); + /* + * For interleave policy, we don't worry about + * current node. Otherwise if current node is + * in nodemask, try to allocate hugepage from + * the current node. Don't fall back to other nodes + * for THP. + */ + if (unlikely(pol->mode == MPOL_INTERLEAVE)) + goto alloc_with_fallback; + nmask = policy_nodemask(gfp, pol); + if (!nmask || node_isset(node, *nmask)) { + mpol_cond_put(pol); + page = alloc_pages_exact_node(node, gfp, order); + if (unlikely(!page && + read_mems_allowed_retry(cpuset_mems_cookie))) + goto retry_cpuset; + return page; + } +alloc_with_fallback: + mpol_cond_put(pol); + /* + * if current node is not part of node mask, try + * the allocation from any node, and we can do retry + * in that case. + */ + return alloc_pages_vma(gfp, order, vma, addr, node); +} +#endif + /** * alloc_pages_current - Allocate pages. * -- cgit v1.2.3 From be97a41b291e495d6cb767b3ee0f84ed05804892 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 11 Feb 2015 15:27:15 -0800 Subject: mm/mempolicy.c: merge alloc_hugepage_vma to alloc_pages_vma The previous commit ("mm/thp: Allocate transparent hugepages on local node") introduced alloc_hugepage_vma() to mm/mempolicy.c to perform a special policy for THP allocations. The function has the same interface as alloc_pages_vma(), shares a lot of boilerplate code and a long comment. This patch merges the hugepage special case into alloc_pages_vma. The extra if condition should be a cheap enough price to pay. We also prevent a (however unlikely) race with parallel mems_allowed update, which could make hugepage allocation restart only within the fallback call to alloc_hugepage_vma() and not reconsider the special rule in alloc_hugepage_vma(). Also by making sure mpol_cond_put(pol) is always called before the actual allocation attempt, we can use a single exit path within the function; a brief caller-level sketch follows below.
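For context, here is a minimal sketch of what a THP caller looks like after the merge. It mirrors the huge_memory.c and gfp.h hunks in this series rather than adding anything new; vma and haddr are assumed to be in scope in the surrounding fault path, which is elided:

    /* request a THP for a fault at haddr inside vma */
    gfp_t gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
    struct page *page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
    /*
     * alloc_hugepage_vma() is now only a convenience macro which expands to
     * alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr, numa_node_id(), true),
     * so the hugepage special case lives entirely inside alloc_pages_vma().
     */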
Also update the comment for missing node parameter and obsolete reference to mm_sem. Signed-off-by: Vlastimil Babka Cc: Aneesh Kumar K.V Cc: Kirill A. Shutemov Cc: Vlastimil Babka Cc: David Rientjes Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 12 +++--- mm/mempolicy.c | 118 +++++++++++++++------------------------------------- 2 files changed, 39 insertions(+), 91 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 60110e06419d..51bd1e72a917 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -334,22 +334,22 @@ alloc_pages(gfp_t gfp_mask, unsigned int order) } extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, struct vm_area_struct *vma, unsigned long addr, - int node); -extern struct page *alloc_hugepage_vma(gfp_t gfp, struct vm_area_struct *vma, - unsigned long addr, int order); + int node, bool hugepage); +#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ + alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true) #else #define alloc_pages(gfp_mask, order) \ alloc_pages_node(numa_node_id(), gfp_mask, order) -#define alloc_pages_vma(gfp_mask, order, vma, addr, node) \ +#define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\ alloc_pages(gfp_mask, order) #define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ alloc_pages(gfp_mask, order) #endif #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) #define alloc_page_vma(gfp_mask, vma, addr) \ - alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id()) + alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false) #define alloc_page_vma_node(gfp_mask, vma, addr, node) \ - alloc_pages_vma(gfp_mask, 0, vma, addr, node) + alloc_pages_vma(gfp_mask, 0, vma, addr, node, false) extern struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order); extern struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 8a32873fdbf7..acbbf4c821e2 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1988,120 +1988,68 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, * @order:Order of the GFP allocation. * @vma: Pointer to VMA or NULL if not available. * @addr: Virtual Address of the allocation. Must be inside the VMA. + * @node: Which node to prefer for allocation (modulo policy). + * @hugepage: for hugepages try only the preferred node if possible * * This function allocates a page from the kernel page pool and applies * a NUMA policy associated with the VMA or the current process. * When VMA is not NULL caller must hold down_read on the mmap_sem of the * mm_struct of the VMA to prevent it from going away. Should be used for - * all allocations for pages that will be mapped into - * user space. Returns NULL when no page can be allocated. - * - * Should be called with the mm_sem of the vma hold. + * all allocations for pages that will be mapped into user space. Returns + * NULL when no page can be allocated. 
*/ struct page * alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr, int node) + unsigned long addr, int node, bool hugepage) { struct mempolicy *pol; struct page *page; unsigned int cpuset_mems_cookie; + struct zonelist *zl; + nodemask_t *nmask; retry_cpuset: pol = get_vma_policy(vma, addr); cpuset_mems_cookie = read_mems_allowed_begin(); - if (unlikely(pol->mode == MPOL_INTERLEAVE)) { + if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage && + pol->mode != MPOL_INTERLEAVE)) { + /* + * For hugepage allocation and non-interleave policy which + * allows the current node, we only try to allocate from the + * current node and don't fall back to other nodes, as the + * cost of remote accesses would likely offset THP benefits. + * + * If the policy is interleave, or does not allow the current + * node in its nodemask, we allocate the standard way. + */ + nmask = policy_nodemask(gfp, pol); + if (!nmask || node_isset(node, *nmask)) { + mpol_cond_put(pol); + page = alloc_pages_exact_node(node, gfp, order); + goto out; + } + } + + if (pol->mode == MPOL_INTERLEAVE) { unsigned nid; nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); mpol_cond_put(pol); page = alloc_page_interleave(gfp, order, nid); - if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) - goto retry_cpuset; - - return page; + goto out; } - page = __alloc_pages_nodemask(gfp, order, - policy_zonelist(gfp, pol, node), - policy_nodemask(gfp, pol)); + + nmask = policy_nodemask(gfp, pol); + zl = policy_zonelist(gfp, pol, node); mpol_cond_put(pol); + page = __alloc_pages_nodemask(gfp, order, zl, nmask); +out: if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return page; } -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -/** - * alloc_hugepage_vma: Allocate a hugepage for a VMA - * @gfp: - * %GFP_USER user allocation. - * %GFP_KERNEL kernel allocations, - * %GFP_HIGHMEM highmem/user allocations, - * %GFP_FS allocation should not call back into a file system. - * %GFP_ATOMIC don't sleep. - * - * @vma: Pointer to VMA or NULL if not available. - * @addr: Virtual Address of the allocation. Must be inside the VMA. - * @order: Order of the hugepage for gfp allocation. - * - * This functions allocate a huge page from the kernel page pool and applies - * a NUMA policy associated with the VMA or the current process. - * For policy other than %MPOL_INTERLEAVE, we make sure we allocate hugepage - * only from the current node if the current node is part of the node mask. - * If we can't allocate a hugepage we fail the allocation and don' try to fallback - * to other nodes in the node mask. If the current node is not part of node mask - * or if the NUMA policy is MPOL_INTERLEAVE we use the allocator that can - * fallback to nodes in the policy node mask. - * - * When VMA is not NULL caller must hold down_read on the mmap_sem of the - * mm_struct of the VMA to prevent it from going away. Should be used for - * all allocations for pages that will be mapped into - * user space. Returns NULL when no page can be allocated. - * - * Should be called with vma->vm_mm->mmap_sem held. 
- *
- */
-struct page *alloc_hugepage_vma(gfp_t gfp, struct vm_area_struct *vma,
-		unsigned long addr, int order)
-{
-	struct page *page;
-	nodemask_t *nmask;
-	struct mempolicy *pol;
-	int node = numa_node_id();
-	unsigned int cpuset_mems_cookie;
-
-retry_cpuset:
-	pol = get_vma_policy(vma, addr);
-	cpuset_mems_cookie = read_mems_allowed_begin();
-	/*
-	 * For interleave policy, we don't worry about
-	 * current node. Otherwise if current node is
-	 * in nodemask, try to allocate hugepage from
-	 * the current node. Don't fall back to other nodes
-	 * for THP.
-	 */
-	if (unlikely(pol->mode == MPOL_INTERLEAVE))
-		goto alloc_with_fallback;
-	nmask = policy_nodemask(gfp, pol);
-	if (!nmask || node_isset(node, *nmask)) {
-		mpol_cond_put(pol);
-		page = alloc_pages_exact_node(node, gfp, order);
-		if (unlikely(!page &&
-			     read_mems_allowed_retry(cpuset_mems_cookie)))
-			goto retry_cpuset;
-		return page;
-	}
-alloc_with_fallback:
-	mpol_cond_put(pol);
-	/*
-	 * if current node is not part of node mask, try
-	 * the allocation from any node, and we can do retry
-	 * in that case.
-	 */
-	return alloc_pages_vma(gfp, order, vma, addr, node);
-}
-#endif
-
 /**
  * alloc_pages_current - Allocate pages.
  *
--
cgit v1.2.3

From f0818f472d8d527a96ec9cc2c3a56223497f9dd3 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli
Date: Wed, 11 Feb 2015 15:27:17 -0800
Subject: mm: gup: add get_user_pages_locked and get_user_pages_unlocked

FAULT_FLAG_ALLOW_RETRY allows the page fault to drop the mmap_sem for reading to reduce the mmap_sem contention (for writing), like while waiting for I/O completion. The problem is that right now practically no get_user_pages call uses FAULT_FLAG_ALLOW_RETRY, so we're not leveraging that nifty feature.

Andres fixed it for the KVM page fault. However get_user_pages_fast remains uncovered, and 99% of other get_user_pages callers aren't using it either (the only exception being FOLL_NOWAIT in KVM, which is really nonblocking and in fact doesn't even release the mmap_sem).

So this patchset extends the optimization Andres did in the KVM page fault to the whole kernel. It makes the most important places (including gup_fast) use FAULT_FLAG_ALLOW_RETRY to reduce the mmap_sem hold times during I/O. The few places that remain uncovered are drivers like v4l and other exceptions that tend to work on their own memory rather than on random user memory (unlike, for example, O_DIRECT, which uses gup_fast and is fully covered by this patch). A follow-up patch should probably also add a printk_once warning to get_user_pages, which should go obsolete and be phased out eventually.

The "vmas" parameter of get_user_pages makes it fundamentally incompatible with FAULT_FLAG_ALLOW_RETRY (the vmas array becomes meaningless the moment the mmap_sem is released). While this is just an optimization, it becomes an absolute requirement for the userfaultfd feature (http://lwn.net/Articles/615086/). userfaultfd allows blocking the page fault, and in order to do so I need to drop the mmap_sem first. So this patch also ensures that, for all memory where userfaultfd could be registered by KVM, the very first fault (no matter if it is a regular page fault or a get_user_pages) always has FAULT_FLAG_ALLOW_RETRY set. The userfaultfd then blocks and is woken only when the pagetable is already mapped. The second fault attempt after the wakeup doesn't need FAULT_FLAG_ALLOW_RETRY, so it's ok to retry without it.
This patch (of 5): We can leverage the VM_FAULT_RETRY functionality in the page fault paths better by using either get_user_pages_locked or get_user_pages_unlocked. The former allows conversion of get_user_pages invocations that will have to pass a "&locked" parameter to know if the mmap_sem was dropped during the call. Example from: down_read(&mm->mmap_sem); do_something() get_user_pages(tsk, mm, ..., pages, NULL); up_read(&mm->mmap_sem); to: int locked = 1; down_read(&mm->mmap_sem); do_something() get_user_pages_locked(tsk, mm, ..., pages, &locked); if (locked) up_read(&mm->mmap_sem); The latter is suitable only as a drop in replacement of the form: down_read(&mm->mmap_sem); get_user_pages(tsk, mm, ..., pages, NULL); up_read(&mm->mmap_sem); into: get_user_pages_unlocked(tsk, mm, ..., pages); Where tsk, mm, the intermediate "..." paramters and "pages" can be any value as before. Just the last parameter of get_user_pages (vmas) must be NULL for get_user_pages_locked|unlocked to be usable (the latter original form wouldn't have been safe anyway if vmas wasn't null, for the former we just make it explicit by dropping the parameter). If vmas is not NULL these two methods cannot be used. Signed-off-by: Andrea Arcangeli Reviewed-by: Andres Lagar-Cavilla Reviewed-by: Peter Feiner Reviewed-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 7 +++ mm/gup.c | 177 +++++++++++++++++++++++++++++++++++++++++++++++++---- mm/nommu.c | 23 +++++++ 3 files changed, 196 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 644990b83cda..fc499e675475 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1261,6 +1261,13 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages, struct vm_area_struct **vmas); +long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + int *locked); +long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages); int get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages); struct kvec; diff --git a/mm/gup.c b/mm/gup.c index 1a8ab05918e0..71a37738a326 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -575,6 +575,165 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, return 0; } +static __always_inline long __get_user_pages_locked(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, + int write, int force, + struct page **pages, + struct vm_area_struct **vmas, + int *locked, bool notify_drop) +{ + int flags = FOLL_TOUCH; + long ret, pages_done; + bool lock_dropped; + + if (locked) { + /* if VM_FAULT_RETRY can be returned, vmas become invalid */ + BUG_ON(vmas); + /* check caller initialized locked */ + BUG_ON(*locked != 1); + } + + if (pages) + flags |= FOLL_GET; + if (write) + flags |= FOLL_WRITE; + if (force) + flags |= FOLL_FORCE; + + pages_done = 0; + lock_dropped = false; + for (;;) { + ret = __get_user_pages(tsk, mm, start, nr_pages, flags, pages, + vmas, locked); + if (!locked) + /* VM_FAULT_RETRY couldn't trigger, bypass */ + return ret; + + /* VM_FAULT_RETRY cannot return errors */ + if (!*locked) { + BUG_ON(ret < 0); + BUG_ON(ret >= nr_pages); + } + + if 
(!pages) + /* If it's a prefault don't insist harder */ + return ret; + + if (ret > 0) { + nr_pages -= ret; + pages_done += ret; + if (!nr_pages) + break; + } + if (*locked) { + /* VM_FAULT_RETRY didn't trigger */ + if (!pages_done) + pages_done = ret; + break; + } + /* VM_FAULT_RETRY triggered, so seek to the faulting offset */ + pages += ret; + start += ret << PAGE_SHIFT; + + /* + * Repeat on the address that fired VM_FAULT_RETRY + * without FAULT_FLAG_ALLOW_RETRY but with + * FAULT_FLAG_TRIED. + */ + *locked = 1; + lock_dropped = true; + down_read(&mm->mmap_sem); + ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED, + pages, NULL, NULL); + if (ret != 1) { + BUG_ON(ret > 1); + if (!pages_done) + pages_done = ret; + break; + } + nr_pages--; + pages_done++; + if (!nr_pages) + break; + pages++; + start += PAGE_SIZE; + } + if (notify_drop && lock_dropped && *locked) { + /* + * We must let the caller know we temporarily dropped the lock + * and so the critical section protected by it was lost. + */ + up_read(&mm->mmap_sem); + *locked = 0; + } + return pages_done; +} + +/* + * We can leverage the VM_FAULT_RETRY functionality in the page fault + * paths better by using either get_user_pages_locked() or + * get_user_pages_unlocked(). + * + * get_user_pages_locked() is suitable to replace the form: + * + * down_read(&mm->mmap_sem); + * do_something() + * get_user_pages(tsk, mm, ..., pages, NULL); + * up_read(&mm->mmap_sem); + * + * to: + * + * int locked = 1; + * down_read(&mm->mmap_sem); + * do_something() + * get_user_pages_locked(tsk, mm, ..., pages, &locked); + * if (locked) + * up_read(&mm->mmap_sem); + */ +long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + int *locked) +{ + return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, + pages, NULL, locked, true); +} +EXPORT_SYMBOL(get_user_pages_locked); + +/* + * get_user_pages_unlocked() is suitable to replace the form: + * + * down_read(&mm->mmap_sem); + * get_user_pages(tsk, mm, ..., pages, NULL); + * up_read(&mm->mmap_sem); + * + * with: + * + * get_user_pages_unlocked(tsk, mm, ..., pages); + * + * It is functionally equivalent to get_user_pages_fast so + * get_user_pages_fast should be used instead, if the two parameters + * "tsk" and "mm" are respectively equal to current and current->mm, + * or if "force" shall be set to 1 (get_user_pages_fast misses the + * "force" parameter). + */ +long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages) +{ + long ret; + int locked = 1; + down_read(&mm->mmap_sem); + ret = __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, + pages, NULL, &locked, false); + if (locked) + up_read(&mm->mmap_sem); + return ret; +} +EXPORT_SYMBOL(get_user_pages_unlocked); + /* * get_user_pages() - pin user pages in memory * @tsk: the task_struct to use for page fault accounting, or @@ -624,22 +783,18 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, * use the correct cache flushing APIs. * * See also get_user_pages_fast, for performance critical applications. + * + * get_user_pages should be phased out in favor of + * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing + * should use get_user_pages because it cannot pass + * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault. 
*/ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages, struct vm_area_struct **vmas) { - int flags = FOLL_TOUCH; - - if (pages) - flags |= FOLL_GET; - if (write) - flags |= FOLL_WRITE; - if (force) - flags |= FOLL_FORCE; - - return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, - NULL); + return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, + pages, vmas, NULL, false); } EXPORT_SYMBOL(get_user_pages); diff --git a/mm/nommu.c b/mm/nommu.c index 541bed64e348..bfb690b0f986 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -214,6 +214,29 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, } EXPORT_SYMBOL(get_user_pages); +long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + int *locked) +{ + return get_user_pages(tsk, mm, start, nr_pages, write, force, + pages, NULL); +} +EXPORT_SYMBOL(get_user_pages_locked); + +long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages) +{ + long ret; + down_read(&mm->mmap_sem); + ret = get_user_pages(tsk, mm, start, nr_pages, write, force, + pages, NULL); + up_read(&mm->mmap_sem); + return ret; +} +EXPORT_SYMBOL(get_user_pages_unlocked); + /** * follow_pfn - look up PFN at a user virtual address * @vma: memory mapping -- cgit v1.2.3 From 0fd71a56f41d4ffabeda1dae9ff5ed4f34d4e935 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 11 Feb 2015 15:27:20 -0800 Subject: mm: gup: add __get_user_pages_unlocked to customize gup_flags Some callers (like KVM) may want to set the gup_flags like FOLL_HWPOSION to get a proper -EHWPOSION retval instead of -EFAULT to take a more appropriate action if get_user_pages runs into a memory failure. Signed-off-by: Andrea Arcangeli Reviewed-by: Kirill A. 
Shutemov Cc: Andres Lagar-Cavilla Cc: Peter Feiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ++++ mm/gup.c | 44 ++++++++++++++++++++++++++++++++------------ mm/nommu.c | 16 +++++++++++++--- 3 files changed, 49 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index fc499e675475..3696b3bd1d7e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1265,6 +1265,10 @@ long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages, int *locked); +long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + unsigned int gup_flags); long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages); diff --git a/mm/gup.c b/mm/gup.c index 71a37738a326..dad5875fb766 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -582,9 +582,9 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, int write, int force, struct page **pages, struct vm_area_struct **vmas, - int *locked, bool notify_drop) + int *locked, bool notify_drop, + unsigned int flags) { - int flags = FOLL_TOUCH; long ret, pages_done; bool lock_dropped; @@ -698,10 +698,36 @@ long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, int *locked) { return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, - pages, NULL, locked, true); + pages, NULL, locked, true, FOLL_TOUCH); } EXPORT_SYMBOL(get_user_pages_locked); +/* + * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows to + * pass additional gup_flags as last parameter (like FOLL_HWPOISON). + * + * NOTE: here FOLL_TOUCH is not set implicitly and must be set by the + * caller if required (just like with __get_user_pages). "FOLL_GET", + * "FOLL_WRITE" and "FOLL_FORCE" are set implicitly as needed + * according to the parameters "pages", "write", "force" + * respectively. 
+ */ +__always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + unsigned int gup_flags) +{ + long ret; + int locked = 1; + down_read(&mm->mmap_sem); + ret = __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, + pages, NULL, &locked, false, gup_flags); + if (locked) + up_read(&mm->mmap_sem); + return ret; +} +EXPORT_SYMBOL(__get_user_pages_unlocked); + /* * get_user_pages_unlocked() is suitable to replace the form: * @@ -723,14 +749,8 @@ long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages) { - long ret; - int locked = 1; - down_read(&mm->mmap_sem); - ret = __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, - pages, NULL, &locked, false); - if (locked) - up_read(&mm->mmap_sem); - return ret; + return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write, + force, pages, FOLL_TOUCH); } EXPORT_SYMBOL(get_user_pages_unlocked); @@ -794,7 +814,7 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, int force, struct page **pages, struct vm_area_struct **vmas) { return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, - pages, vmas, NULL, false); + pages, vmas, NULL, false, FOLL_TOUCH); } EXPORT_SYMBOL(get_user_pages); diff --git a/mm/nommu.c b/mm/nommu.c index bfb690b0f986..4d1b8a199867 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -224,9 +224,10 @@ long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, } EXPORT_SYMBOL(get_user_pages_locked); -long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages) +long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + unsigned int gup_flags) { long ret; down_read(&mm->mmap_sem); @@ -235,6 +236,15 @@ long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, up_read(&mm->mmap_sem); return ret; } +EXPORT_SYMBOL(__get_user_pages_unlocked); + +long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages) +{ + return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write, + force, pages, 0); +} EXPORT_SYMBOL(get_user_pages_unlocked); /** -- cgit v1.2.3 From 0664e57ff0c68cbca012a45a38288fa277eb6795 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 11 Feb 2015 15:27:28 -0800 Subject: mm: gup: kvm use get_user_pages_unlocked Use the more generic get_user_pages_unlocked which has the additional benefit of passing FAULT_FLAG_ALLOW_RETRY at the very first page fault (which allows the first page fault in an unmapped area to be always able to block indefinitely by being allowed to release the mmap_sem). Signed-off-by: Andrea Arcangeli Reviewed-by: Andres Lagar-Cavilla Reviewed-by: Kirill A. 
Shutemov Cc: Peter Feiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kvm_host.h | 11 ----------- virt/kvm/async_pf.c | 2 +- virt/kvm/kvm_main.c | 50 ++++-------------------------------------------- 3 files changed, 5 insertions(+), 58 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 26f106022c88..d189ee098aa2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -200,17 +200,6 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva, int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); #endif -/* - * Carry out a gup that requires IO. Allow the mm to relinquish the mmap - * semaphore if the filemap/swap has to wait on a page lock. pagep == NULL - * controls whether we retry the gup one more time to completion in that case. - * Typically this is called after a FAULT_FLAG_RETRY_NOWAIT in the main tdp - * handler. - */ -int kvm_get_user_page_io(struct task_struct *tsk, struct mm_struct *mm, - unsigned long addr, bool write_fault, - struct page **pagep); - enum { OUTSIDE_GUEST_MODE, IN_GUEST_MODE, diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 5ff7f7f2689a..44660aee335f 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -80,7 +80,7 @@ static void async_pf_execute(struct work_struct *work) might_sleep(); - kvm_get_user_page_io(NULL, mm, addr, 1, NULL); + get_user_pages_unlocked(NULL, mm, addr, 1, 1, 0, NULL); kvm_async_page_present_sync(vcpu, apf); spin_lock(&vcpu->async_pf.lock); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1cc6e2e19982..458b9b14b15c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1128,43 +1128,6 @@ static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm, return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL); } -int kvm_get_user_page_io(struct task_struct *tsk, struct mm_struct *mm, - unsigned long addr, bool write_fault, - struct page **pagep) -{ - int npages; - int locked = 1; - int flags = FOLL_TOUCH | FOLL_HWPOISON | - (pagep ? FOLL_GET : 0) | - (write_fault ? FOLL_WRITE : 0); - - /* - * If retrying the fault, we get here *not* having allowed the filemap - * to wait on the page lock. We should now allow waiting on the IO with - * the mmap semaphore released. - */ - down_read(&mm->mmap_sem); - npages = __get_user_pages(tsk, mm, addr, 1, flags, pagep, NULL, - &locked); - if (!locked) { - VM_BUG_ON(npages); - - if (!pagep) - return 0; - - /* - * The previous call has now waited on the IO. Now we can - * retry and complete. Pass TRIED to ensure we do not re - * schedule async IO (see e.g. filemap_fault). - */ - down_read(&mm->mmap_sem); - npages = __get_user_pages(tsk, mm, addr, 1, flags | FOLL_TRIED, - pagep, NULL, NULL); - } - up_read(&mm->mmap_sem); - return npages; -} - static inline int check_user_page_hwpoison(unsigned long addr) { int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE; @@ -1227,15 +1190,10 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, npages = get_user_page_nowait(current, current->mm, addr, write_fault, page); up_read(¤t->mm->mmap_sem); - } else { - /* - * By now we have tried gup_fast, and possibly async_pf, and we - * are certainly not atomic. Time to retry the gup, allowing - * mmap semaphore to be relinquished in the case of IO. 
- */ - npages = kvm_get_user_page_io(current, current->mm, addr, - write_fault, page); - } + } else + npages = __get_user_pages_unlocked(current, current->mm, addr, 1, + write_fault, 0, page, + FOLL_TOUCH|FOLL_HWPOISON); if (npages != 1) return npages; -- cgit v1.2.3 From 0b1fbfe50006c41014cc25660c0e735d21c34939 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Feb 2015 15:27:34 -0800 Subject: mm/pagewalk: remove pgd_entry() and pud_entry() Currently no user of page table walker sets ->pgd_entry() or ->pud_entry(), so checking their existence in each loop is just wasting CPU cycle. So let's remove it to reduce overhead. Signed-off-by: Naoya Horiguchi Acked-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Cyrill Gorcunov Cc: Dave Hansen Cc: Kirill A. Shutemov Cc: Pavel Emelyanov Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 6 ------ mm/pagewalk.c | 9 ++------- 2 files changed, 2 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 3696b3bd1d7e..f6106d3f3dab 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1164,8 +1164,6 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, /** * mm_walk - callbacks for walk_page_range - * @pgd_entry: if set, called for each non-empty PGD (top-level) entry - * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry * this handler is required to be able to handle * pmd_trans_huge() pmds. They may simply choose to @@ -1179,10 +1177,6 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, * (see walk_page_range for more details) */ struct mm_walk { - int (*pgd_entry)(pgd_t *pgd, unsigned long addr, - unsigned long next, struct mm_walk *walk); - int (*pud_entry)(pud_t *pud, unsigned long addr, - unsigned long next, struct mm_walk *walk); int (*pmd_entry)(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk); int (*pte_entry)(pte_t *pte, unsigned long addr, diff --git a/mm/pagewalk.c b/mm/pagewalk.c index b264bda46e1b..b793ef149da2 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -86,9 +86,7 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, break; continue; } - if (walk->pud_entry) - err = walk->pud_entry(pud, addr, next, walk); - if (!err && (walk->pmd_entry || walk->pte_entry)) + if (walk->pmd_entry || walk->pte_entry) err = walk_pmd_range(pud, addr, next, walk); if (err) break; @@ -237,10 +235,7 @@ int walk_page_range(unsigned long addr, unsigned long end, pgd++; continue; } - if (walk->pgd_entry) - err = walk->pgd_entry(pgd, addr, next, walk); - if (!err && - (walk->pud_entry || walk->pmd_entry || walk->pte_entry)) + if (walk->pmd_entry || walk->pte_entry) err = walk_pud_range(pgd, addr, next, walk); if (err) break; -- cgit v1.2.3 From fafaa4264eba49fd10695c193a82760558d093f4 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Feb 2015 15:27:37 -0800 Subject: pagewalk: improve vma handling Current implementation of page table walker has a fundamental problem in vma handling, which started when we tried to handle vma(VM_HUGETLB). Because it's done in pgd loop, considering vma boundary makes code complicated and bug-prone. From the users viewpoint, some user checks some vma-related condition to determine whether the user really does page walk over the vma. 
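For illustration, here is a minimal user of the reworked walker (it relies on the ->test_walk() callback and the walk_page_vma() helper introduced by this and the following patch; the PFN-map filter and the counting logic are made up for the example):

/* Count present PTEs in one vma, skipping VM_PFNMAP mappings via
 * ->test_walk() instead of open-coding vma checks in the pgd loop. */
static int count_pte(pte_t *pte, unsigned long addr,
		     unsigned long next, struct mm_walk *walk)
{
	long *count = walk->private;

	if (pte_present(*pte))
		(*count)++;
	return 0;
}

static int skip_pfnmap(unsigned long start, unsigned long end,
		       struct mm_walk *walk)
{
	/* >0 skips this vma, 0 walks it, <0 aborts the whole walk */
	return (walk->vma->vm_flags & VM_PFNMAP) ? 1 : 0;
}

static long count_present_ptes(struct vm_area_struct *vma)
{
	long count = 0;
	struct mm_walk walk = {
		.pte_entry = count_pte,
		.test_walk = skip_pfnmap,
		.mm = vma->vm_mm,
		.private = &count,
	};
	int err;

	/* caller must hold down_read(&vma->vm_mm->mmap_sem) */
	err = walk_page_vma(vma, &walk);
	return err ? err : count;
}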
In order to solve these, this patch moves vma check outside pgd loop and introduce a new callback ->test_walk(). Signed-off-by: Naoya Horiguchi Acked-by: Kirill A. Shutemov Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Cyrill Gorcunov Cc: Dave Hansen Cc: Pavel Emelyanov Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 15 +++- mm/pagewalk.c | 206 ++++++++++++++++++++++++++++++----------------------- 2 files changed, 129 insertions(+), 92 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index f6106d3f3dab..3891a368e5e0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1171,10 +1171,16 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, * @pte_entry: if set, called for each non-empty PTE (4th-level) entry * @pte_hole: if set, called for each hole at all levels * @hugetlb_entry: if set, called for each hugetlb entry - * *Caution*: The caller must hold mmap_sem() if @hugetlb_entry - * is used. + * @test_walk: caller specific callback function to determine whether + * we walk over the current vma or not. A positive returned + * value means "do page table walk over the current vma," + * and a negative one means "abort current page table walk + * right now." 0 means "skip the current vma." + * @mm: mm_struct representing the target process of page table walk + * @vma: vma currently walked (NULL if walking outside vmas) + * @private: private data for callbacks' usage * - * (see walk_page_range for more details) + * (see the comment on walk_page_range() for more details) */ struct mm_walk { int (*pmd_entry)(pmd_t *pmd, unsigned long addr, @@ -1186,7 +1192,10 @@ struct mm_walk { int (*hugetlb_entry)(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long next, struct mm_walk *walk); + int (*test_walk)(unsigned long addr, unsigned long next, + struct mm_walk *walk); struct mm_struct *mm; + struct vm_area_struct *vma; void *private; }; diff --git a/mm/pagewalk.c b/mm/pagewalk.c index b793ef149da2..d9cc3caae802 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -59,7 +59,7 @@ again: continue; split_huge_page_pmd_mm(walk->mm, addr, pmd); - if (pmd_none_or_trans_huge_or_clear_bad(pmd)) + if (pmd_trans_unstable(pmd)) goto again; err = walk_pte_range(pmd, addr, next, walk); if (err) @@ -95,6 +95,32 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, return err; } +static int walk_pgd_range(unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + pgd_t *pgd; + unsigned long next; + int err = 0; + + pgd = pgd_offset(walk->mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) { + if (walk->pte_hole) + err = walk->pte_hole(addr, next, walk); + if (err) + break; + continue; + } + if (walk->pmd_entry || walk->pte_entry) + err = walk_pud_range(pgd, addr, next, walk); + if (err) + break; + } while (pgd++, addr = next, addr != end); + + return err; +} + #ifdef CONFIG_HUGETLB_PAGE static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr, unsigned long end) @@ -103,10 +129,10 @@ static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr, return boundary < end ? 
boundary : end; } -static int walk_hugetlb_range(struct vm_area_struct *vma, - unsigned long addr, unsigned long end, +static int walk_hugetlb_range(unsigned long addr, unsigned long end, struct mm_walk *walk) { + struct vm_area_struct *vma = walk->vma; struct hstate *h = hstate_vma(vma); unsigned long next; unsigned long hmask = huge_page_mask(h); @@ -119,15 +145,14 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, if (pte && walk->hugetlb_entry) err = walk->hugetlb_entry(pte, hmask, addr, next, walk); if (err) - return err; + break; } while (addr = next, addr != end); - return 0; + return err; } #else /* CONFIG_HUGETLB_PAGE */ -static int walk_hugetlb_range(struct vm_area_struct *vma, - unsigned long addr, unsigned long end, +static int walk_hugetlb_range(unsigned long addr, unsigned long end, struct mm_walk *walk) { return 0; @@ -135,112 +160,115 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, #endif /* CONFIG_HUGETLB_PAGE */ +/* + * Decide whether we really walk over the current vma on [@start, @end) + * or skip it via the returned value. Return 0 if we do walk over the + * current vma, and return 1 if we skip the vma. Negative values means + * error, where we abort the current walk. + * + * Default check (only VM_PFNMAP check for now) is used when the caller + * doesn't define test_walk() callback. + */ +static int walk_page_test(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + if (walk->test_walk) + return walk->test_walk(start, end, walk); + + /* + * Do not walk over vma(VM_PFNMAP), because we have no valid struct + * page backing a VM_PFNMAP range. See also commit a9ff785e4437. + */ + if (vma->vm_flags & VM_PFNMAP) + return 1; + return 0; +} + +static int __walk_page_range(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + int err = 0; + struct vm_area_struct *vma = walk->vma; + + if (vma && is_vm_hugetlb_page(vma)) { + if (walk->hugetlb_entry) + err = walk_hugetlb_range(start, end, walk); + } else + err = walk_pgd_range(start, end, walk); + + return err; +} /** - * walk_page_range - walk a memory map's page tables with a callback - * @addr: starting address - * @end: ending address - * @walk: set of callbacks to invoke for each level of the tree - * - * Recursively walk the page table for the memory area in a VMA, - * calling supplied callbacks. Callbacks are called in-order (first - * PGD, first PUD, first PMD, first PTE, second PTE... second PMD, - * etc.). If lower-level callbacks are omitted, walking depth is reduced. + * walk_page_range - walk page table with caller specific callbacks * - * Each callback receives an entry pointer and the start and end of the - * associated range, and a copy of the original mm_walk for access to - * the ->private or ->mm fields. + * Recursively walk the page table tree of the process represented by @walk->mm + * within the virtual address range [@start, @end). During walking, we can do + * some caller-specific works for each entry, by setting up pmd_entry(), + * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these + * callbacks, the associated entries/pages are just ignored. + * The return values of these callbacks are commonly defined like below: + * - 0 : succeeded to handle the current entry, and if you don't reach the + * end address yet, continue to walk. + * - >0 : succeeded to handle the current entry, and return to the caller + * with caller specific value. 
+ * - <0 : failed to handle the current entry, and return to the caller + * with error code. * - * Usually no locks are taken, but splitting transparent huge page may - * take page table lock. And the bottom level iterator will map PTE - * directories from highmem if necessary. + * Before starting to walk page table, some callers want to check whether + * they really want to walk over the current vma, typically by checking + * its vm_flags. walk_page_test() and @walk->test_walk() are used for this + * purpose. * - * If any callback returns a non-zero value, the walk is aborted and - * the return value is propagated back to the caller. Otherwise 0 is returned. + * struct mm_walk keeps current values of some common data like vma and pmd, + * which are useful for the access from callbacks. If you want to pass some + * caller-specific data to callbacks, @walk->private should be helpful. * - * walk->mm->mmap_sem must be held for at least read if walk->hugetlb_entry - * is !NULL. + * Locking: + * Callers of walk_page_range() and walk_page_vma() should hold + * @walk->mm->mmap_sem, because these function traverse vma list and/or + * access to vma's data. */ -int walk_page_range(unsigned long addr, unsigned long end, +int walk_page_range(unsigned long start, unsigned long end, struct mm_walk *walk) { - pgd_t *pgd; - unsigned long next; int err = 0; + unsigned long next; + struct vm_area_struct *vma; - if (addr >= end) - return err; + if (start >= end) + return -EINVAL; if (!walk->mm) return -EINVAL; VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm); - pgd = pgd_offset(walk->mm, addr); + vma = find_vma(walk->mm, start); do { - struct vm_area_struct *vma = NULL; + if (!vma) { /* after the last vma */ + walk->vma = NULL; + next = end; + } else if (start < vma->vm_start) { /* outside vma */ + walk->vma = NULL; + next = min(end, vma->vm_start); + } else { /* inside vma */ + walk->vma = vma; + next = min(end, vma->vm_end); + vma = vma->vm_next; - next = pgd_addr_end(addr, end); - - /* - * This function was not intended to be vma based. - * But there are vma special cases to be handled: - * - hugetlb vma's - * - VM_PFNMAP vma's - */ - vma = find_vma(walk->mm, addr); - if (vma) { - /* - * There are no page structures backing a VM_PFNMAP - * range, so do not allow split_huge_page_pmd(). - */ - if ((vma->vm_start <= addr) && - (vma->vm_flags & VM_PFNMAP)) { - if (walk->pte_hole) - err = walk->pte_hole(addr, next, walk); - if (err) - break; - pgd = pgd_offset(walk->mm, next); + err = walk_page_test(start, next, walk); + if (err > 0) continue; - } - /* - * Handle hugetlb vma individually because pagetable - * walk for the hugetlb page is dependent on the - * architecture and we can't handled it in the same - * manner as non-huge pages. - */ - if (walk->hugetlb_entry && (vma->vm_start <= addr) && - is_vm_hugetlb_page(vma)) { - if (vma->vm_end < next) - next = vma->vm_end; - /* - * Hugepage is very tightly coupled with vma, - * so walk through hugetlb entries within a - * given vma. 
- */ - err = walk_hugetlb_range(vma, addr, next, walk); - if (err) - break; - pgd = pgd_offset(walk->mm, next); - continue; - } - } - - if (pgd_none_or_clear_bad(pgd)) { - if (walk->pte_hole) - err = walk->pte_hole(addr, next, walk); - if (err) + if (err < 0) break; - pgd++; - continue; } - if (walk->pmd_entry || walk->pte_entry) - err = walk_pud_range(pgd, addr, next, walk); + if (walk->vma || walk->pte_hole) + err = __walk_page_range(start, next, walk); if (err) break; - pgd++; - } while (addr = next, addr < end); - + } while (start = next, start < end); return err; } -- cgit v1.2.3 From 900fc5f197b05253ae9433fb9a066c3f37d08f69 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Feb 2015 15:27:40 -0800 Subject: pagewalk: add walk_page_vma() Introduce walk_page_vma(), which is useful for the callers which want to walk over a given vma. It's used by later patches. Signed-off-by: Naoya Horiguchi Acked-by: Kirill A. Shutemov Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Cyrill Gorcunov Cc: Dave Hansen Cc: Pavel Emelyanov Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + mm/pagewalk.c | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 3891a368e5e0..a4d24f3c5430 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1201,6 +1201,7 @@ struct mm_walk { int walk_page_range(unsigned long addr, unsigned long end, struct mm_walk *walk); +int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk); void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); int copy_page_range(struct mm_struct *dst, struct mm_struct *src, diff --git a/mm/pagewalk.c b/mm/pagewalk.c index d9cc3caae802..4c9a653ba563 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -272,3 +272,21 @@ int walk_page_range(unsigned long start, unsigned long end, } while (start = next, start < end); return err; } + +int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk) +{ + int err; + + if (!walk->mm) + return -EINVAL; + + VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); + VM_BUG_ON(!vma); + walk->vma = vma; + err = walk_page_test(vma->vm_start, vma->vm_end, walk); + if (err > 0) + return 0; + if (err < 0) + return err; + return __walk_page_range(vma->vm_start, vma->vm_end, walk); +} -- cgit v1.2.3 From 94f759d62b2c6a9d124b0622077b1ddcfac43fb5 Mon Sep 17 00:00:00 2001 From: Sergei Rogachev Date: Wed, 11 Feb 2015 15:28:34 -0800 Subject: mm/page_owner.c: remove unnecessary stack_trace field Page owner uses the page_ext structure to keep meta-information for every page in the system. The structure also contains a field of type 'struct stack_trace', page owner uses this field during invocation of the function save_stack_trace. It is easy to notice that keeping a copy of this structure for every page in the system is very inefficiently in terms of memory. The patch removes this unnecessary field of page_ext and forces page owner to use a stack_trace structure allocated on the stack. 
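Back-of-the-envelope arithmetic on the saving (assuming a 64-bit build; the struct layout is from <linux/stacktrace.h>, the totals are illustrative and not from the changelog):

/* Per page_ext, before -> after:
 *
 *	struct stack_trace {
 *		unsigned int nr_entries, max_entries;	   8 bytes
 *		unsigned long *entries;			   8 bytes
 *		int skip;				   4 bytes (+4 padding)
 *	};						= 24 bytes dropped
 *
 *	unsigned int nr_entries;			=  4 bytes kept
 *
 * Net: ~20 bytes saved per 4 KiB page. On a 16 GiB machine that is
 * roughly 4M pages * 20 bytes ~= 80 MiB less page_ext memory.
 */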
[akpm@linux-foundation.org: use struct initializers] Signed-off-by: Sergei Rogachev Acked-by: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_ext.h | 2 +- mm/page_owner.c | 26 ++++++++++++++------------ 2 files changed, 15 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index d2a2c84c72d0..c42981cd99aa 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -40,7 +40,7 @@ struct page_ext { #ifdef CONFIG_PAGE_OWNER unsigned int order; gfp_t gfp_mask; - struct stack_trace trace; + unsigned int nr_entries; unsigned long trace_entries[8]; #endif }; diff --git a/mm/page_owner.c b/mm/page_owner.c index 9ab4a9b5bc09..0993f5f36b01 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -59,20 +59,19 @@ void __reset_page_owner(struct page *page, unsigned int order) void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) { - struct page_ext *page_ext; - struct stack_trace *trace; - - page_ext = lookup_page_ext(page); + struct page_ext *page_ext = lookup_page_ext(page); + struct stack_trace trace = { + .nr_entries = 0, + .max_entries = ARRAY_SIZE(page_ext->trace_entries), + .entries = &page_ext->trace_entries[0], + .skip = 3, + }; - trace = &page_ext->trace; - trace->nr_entries = 0; - trace->max_entries = ARRAY_SIZE(page_ext->trace_entries); - trace->entries = &page_ext->trace_entries[0]; - trace->skip = 3; - save_stack_trace(&page_ext->trace); + save_stack_trace(&trace); page_ext->order = order; page_ext->gfp_mask = gfp_mask; + page_ext->nr_entries = trace.nr_entries; __set_bit(PAGE_EXT_OWNER, &page_ext->flags); } @@ -84,6 +83,10 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, int ret; int pageblock_mt, page_mt; char *kbuf; + struct stack_trace trace = { + .nr_entries = page_ext->nr_entries, + .entries = &page_ext->trace_entries[0], + }; kbuf = kmalloc(count, GFP_KERNEL); if (!kbuf) @@ -121,8 +124,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, if (ret >= count) goto err; - ret += snprint_stack_trace(kbuf + ret, count - ret, - &page_ext->trace, 0); + ret += snprint_stack_trace(kbuf + ret, count - ret, &trace, 0); if (ret >= count) goto err; -- cgit v1.2.3 From 54d7e72a758609da5936d7452320d799cfc6a25c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 12 Feb 2015 08:28:12 -0500 Subject: SUNRPC: Fix a compile error when #undef CONFIG_PROC_FS The definition of rpc_count_iostats_metrics() is borked. 
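The breakage is a plain C rule: parameter names may be omitted in a declaration but not in a definition, and an empty inline stub is a definition (a sketch of the failure mode, not the exact compiler output):

/* Fine as a declaration (prototype): */
static inline void rpc_count_iostats_metrics(const struct rpc_task *,
					     struct rpc_iostats *);

/* Invalid as a definition -- this is what failed to build with
 * CONFIG_PROC_FS undefined: */
static inline void rpc_count_iostats_metrics(const struct rpc_task *,
					     struct rpc_iostats *) {}

/* The fix below simply names the parameters: */
static inline void rpc_count_iostats_metrics(const struct rpc_task *task,
					     struct rpc_iostats *stats)
{
}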
Reported by: Jim Davis Fixes: d67ae825a59d6 ("pnfs/flexfiles: Add the FlexFile Layout Driver") Cc: Tom Haynes Signed-off-by: Trond Myklebust --- include/linux/sunrpc/metrics.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/metrics.h b/include/linux/sunrpc/metrics.h index 7e61a17030a4..694eecb2f1b5 100644 --- a/include/linux/sunrpc/metrics.h +++ b/include/linux/sunrpc/metrics.h @@ -89,8 +89,11 @@ void rpc_free_iostats(struct rpc_iostats *); static inline struct rpc_iostats *rpc_alloc_iostats(struct rpc_clnt *clnt) { return NULL; } static inline void rpc_count_iostats(const struct rpc_task *task, struct rpc_iostats *stats) {} -static inline void rpc_count_iostats_metrics(const struct rpc_task *, - struct rpc_iostats *) {} +static inline void rpc_count_iostats_metrics(const struct rpc_task *task, + struct rpc_iostats *stats) +{ +} + static inline void rpc_print_iostats(struct seq_file *seq, struct rpc_clnt *clnt) {} static inline void rpc_free_iostats(struct rpc_iostats *stats) {} -- cgit v1.2.3 From 5d833062139d290adb8b62c093b654a01a353448 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 12 Feb 2015 14:58:16 -0800 Subject: mm: numa: do not dereference pmd outside of the lock during NUMA hinting fault Automatic NUMA balancing depends on being able to protect PTEs to trap a fault and gather reference locality information. Very broadly speaking it would mark PTEs as not present and use another bit to distinguish between NUMA hinting faults and other types of faults. It was universally loved by everybody and caused no problems whatsoever. That last sentence might be a lie. This series is very heavily based on patches from Linus and Aneesh to replace the existing PTE/PMD NUMA helper functions with normal change protections. I did alter and add parts of it but I consider them relatively minor contributions. At their suggestion, acked-bys are in there but I've no problem converting them to Signed-off-by if requested. AFAIK, this has received no testing on ppc64 and I'm depending on Aneesh for that. I tested trinity under kvm-tool and passed and ran a few other basic tests. At the time of writing, only the short-lived tests have completed but testing of V2 indicated that long-term testing had no surprises. In most cases I'm leaving out detail as it's not that interesting. specjbb single JVM: There was negligible performance difference in the benchmark itself for short runs. However, system activity is higher and interrupts are much higher over time -- possibly TLB flushes. Migrations are also higher. Overall, this is more overhead but considering the problems faced with the old approach I think we just have to suck it up and find another way of reducing the overhead. specjbb multi JVM: Negligible performance difference to the actual benchmark but like the single JVM case, the system overhead is noticeably higher. Again, interrupts are a major factor. autonumabench: This was all over the place and about all that can be reasonably concluded is that it's different but not necessarily better or worse. 
autonumabench
                                     3.18.0-rc5            3.18.0-rc5
                                 mmotm-20141119         protnone-v3r3
User    NUMA01                 32380.24 (  0.00%)    21642.92 ( 33.16%)
User    NUMA01_THEADLOCAL      22481.02 (  0.00%)    22283.22 (  0.88%)
User    NUMA02                  3137.00 (  0.00%)     3116.54 (  0.65%)
User    NUMA02_SMT              1614.03 (  0.00%)     1543.53 (  4.37%)
System  NUMA01                   322.97 (  0.00%)     1465.89 (-353.88%)
System  NUMA01_THEADLOCAL         91.87 (  0.00%)       49.32 ( 46.32%)
System  NUMA02                    37.83 (  0.00%)       14.61 ( 61.38%)
System  NUMA02_SMT                 7.36 (  0.00%)        7.45 ( -1.22%)
Elapsed NUMA01                   716.63 (  0.00%)      599.29 ( 16.37%)
Elapsed NUMA01_THEADLOCAL        553.98 (  0.00%)      539.94 (  2.53%)
Elapsed NUMA02                    83.85 (  0.00%)       83.04 (  0.97%)
Elapsed NUMA02_SMT                86.57 (  0.00%)       79.15 (  8.57%)
CPU     NUMA01                  4563.00 (  0.00%)     3855.00 ( 15.52%)
CPU     NUMA01_THEADLOCAL       4074.00 (  0.00%)     4136.00 ( -1.52%)
CPU     NUMA02                  3785.00 (  0.00%)     3770.00 (  0.40%)
CPU     NUMA02_SMT              1872.00 (  0.00%)     1959.00 ( -4.65%)

System CPU usage of NUMA01 is worse but it's an adverse workload on this machine so I'm reluctant to conclude that it's a problem that matters. On the other workloads that are sensible on this machine, system CPU usage is great. Overall time to complete the benchmark is comparable:

                  3.18.0-rc5      3.18.0-rc5
              mmotm-20141119   protnone-v3r3
User            59612.50        48586.44
System            460.22         1537.45
Elapsed          1442.20         1304.29

NUMA alloc hit                 5075182     5743353
NUMA alloc miss                      0           0
NUMA interleave hit                  0           0
NUMA alloc local               5075174     5743339
NUMA base PTE updates        637061448   443106883
NUMA huge PMD updates          1243434      864747
NUMA page range updates     1273699656   885857347
NUMA hint faults               1658116     1214277
NUMA hint local faults          959487      754113
NUMA hint local percent             57          62
NUMA pages migrated            5467056    61676398

The NUMA pages migrated look terrible but when I looked at a graph of the activity over time I see that the massive spike in migration activity was during NUMA01. This correlates with high system CPU usage and could be simply down to bad luck but any modifications that affect that workload would be related to scan rates and migrations, not the protection mechanism. For all other workloads, migration activity was comparable.

Overall, headline performance figures are comparable but the overhead is higher, mostly in interrupts. To some extent, higher overhead from this approach was anticipated but not to this degree. It's going to be necessary to reduce this again with a separate series in the future. It's still worth going ahead with this series though as it's likely to avoid constant headaches with Xen and is probably easier to maintain.

This patch (of 10):

A transhuge NUMA hinting fault may find the page is migrating and should wait until migration completes. The check is race-prone because the pmd is dereferenced outside of the page lock and while the race is tiny, it'll be larger if the PMD is cleared while marking PMDs for hinting faults. This patch closes the race.
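Condensed from the huge_memory.c hunk below, the fix is to resolve the page while the pmd lock is still held, so *pmdp is never dereferenced after the lock is dropped (declarations elided):

if (unlikely(pmd_trans_migrating(*pmdp))) {
	page = pmd_page(*pmdp);		/* deref under the ptl */
	spin_unlock(ptl);
	wait_on_page_locked(page);	/* no further *pmdp access */
	goto out;
}

The old wait_migrate_huge_page(vma->anon_vma, pmdp) helper re-read *pmdp after spin_unlock(ptl), which is the window this patch removes.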
Signed-off-by: Mel Gorman Cc: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Dave Jones Cc: Hugh Dickins Cc: Ingo Molnar Cc: Kirill Shutemov Cc: Linus Torvalds Cc: Paul Mackerras Cc: Rik van Riel Cc: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 4 ---- mm/huge_memory.c | 3 ++- mm/migrate.c | 6 ------ 3 files changed, 2 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index fab9b32ace8e..78baed5f2952 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -67,7 +67,6 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, #ifdef CONFIG_NUMA_BALANCING extern bool pmd_trans_migrating(pmd_t pmd); -extern void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd); extern int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, int node); extern bool migrate_ratelimited(int node); @@ -76,9 +75,6 @@ static inline bool pmd_trans_migrating(pmd_t pmd) { return false; } -static inline void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd) -{ -} static inline int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, int node) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index cb7be110cad3..c6921362c5fc 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1272,8 +1272,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, * check_same as the page may no longer be mapped. */ if (unlikely(pmd_trans_migrating(*pmdp))) { + page = pmd_page(*pmdp); spin_unlock(ptl); - wait_migrate_huge_page(vma->anon_vma, pmdp); + wait_on_page_locked(page); goto out; } diff --git a/mm/migrate.c b/mm/migrate.c index f98067e5d353..5e8f03a8de2a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1654,12 +1654,6 @@ bool pmd_trans_migrating(pmd_t pmd) return PageLocked(page); } -void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd) -{ - struct page *page = pmd_page(*pmd); - wait_on_page_locked(page); -} - /* * Attempt to migrate a misplaced page to the specified destination * node. Caller is expected to have an elevated reference count on -- cgit v1.2.3 From 4d9424669946532be754a6e116618dcb58430cb4 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 12 Feb 2015 14:58:28 -0800 Subject: mm: convert p[te|md]_mknonnuma and remaining page table manipulations With PROT_NONE, the traditional page table manipulation functions are sufficient. 
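Condensed from the hunks below, both directions of the hinting protocol now use ordinary protection-change primitives (a sketch assuming the PAGE_NONE scheme; declarations elided):

/* Marking for a hinting fault is a normal protection change to
 * PAGE_NONE (change_prot_numa() hunk below): */
nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);

/* Making the pte present again once the fault is handled
 * (do_numa_page() hunk below): */
pte = pte_modify(pte, vma->vm_page_prot);
pte = pte_mkyoung(pte);
set_pte_at(mm, addr, ptep, pte);
update_mmu_cache(vma, addr, ptep);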
[andre.przywara@arm.com: fix compiler warning in pmdp_invalidate()] [akpm@linux-foundation.org: fix build with STRICT_MM_TYPECHECKS] Signed-off-by: Mel Gorman Acked-by: Linus Torvalds Acked-by: Aneesh Kumar Tested-by: Sasha Levin Cc: Benjamin Herrenschmidt Cc: Dave Jones Cc: Hugh Dickins Cc: Ingo Molnar Cc: Kirill Shutemov Cc: Paul Mackerras Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/pgtable-3level.h | 5 ++++- include/linux/huge_mm.h | 3 +-- mm/huge_memory.c | 33 +++++++-------------------------- mm/memory.c | 10 ++++++---- mm/mempolicy.c | 2 +- mm/migrate.c | 2 +- mm/mprotect.c | 2 +- mm/pgtable-generic.c | 2 -- 8 files changed, 21 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index 18dbc82f85e5..423a5ac09d3a 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -257,7 +257,10 @@ PMD_BIT_FUNC(mkyoung, |= PMD_SECT_AF); #define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot) /* represent a notpresent pmd by zero, this is used by pmdp_invalidate */ -#define pmd_mknotpresent(pmd) (__pmd(0)) +static inline pmd_t pmd_mknotpresent(pmd_t pmd) +{ + return __pmd(0); +} static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) { diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index f10b20f05159..062bd252e994 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -31,8 +31,7 @@ extern int move_huge_pmd(struct vm_area_struct *vma, unsigned long new_addr, unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd); extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, pgprot_t newprot, - int prot_numa); + unsigned long addr, pgprot_t newprot); enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_FLAG, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 915941c45169..cb9b3e847dac 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1355,9 +1355,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out; clear_pmdnuma: BUG_ON(!PageLocked(page)); - pmd = pmd_mknonnuma(pmd); + pmd = pmd_modify(pmd, vma->vm_page_prot); set_pmd_at(mm, haddr, pmdp, pmd); - VM_BUG_ON(pmd_protnone(*pmdp)); update_mmu_cache_pmd(vma, addr, pmdp); unlock_page(page); out_unlock: @@ -1472,7 +1471,7 @@ out: * - HPAGE_PMD_NR is protections changed and TLB flush necessary */ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, pgprot_t newprot, int prot_numa) + unsigned long addr, pgprot_t newprot) { struct mm_struct *mm = vma->vm_mm; spinlock_t *ptl; @@ -1481,29 +1480,11 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { pmd_t entry; ret = 1; - if (!prot_numa) { - entry = pmdp_get_and_clear_notify(mm, addr, pmd); - if (pmd_protnone(entry)) - entry = pmd_mknonnuma(entry); - entry = pmd_modify(entry, newprot); - ret = HPAGE_PMD_NR; - set_pmd_at(mm, addr, pmd, entry); - BUG_ON(pmd_write(entry)); - } else { - struct page *page = pmd_page(*pmd); - - /* - * Do not trap faults against the zero page. The - * read-only data is likely to be read-cached on the - * local CPU cache and it is less useful to know about - * local vs remote hits on the zero page. 
- */ - if (!is_huge_zero_page(page) && - !pmd_protnone(*pmd)) { - pmdp_set_numa(mm, addr, pmd); - ret = HPAGE_PMD_NR; - } - } + entry = pmdp_get_and_clear_notify(mm, addr, pmd); + entry = pmd_modify(entry, newprot); + ret = HPAGE_PMD_NR; + set_pmd_at(mm, addr, pmd, entry); + BUG_ON(pmd_write(entry)); spin_unlock(ptl); } diff --git a/mm/memory.c b/mm/memory.c index 92e6a6299e86..d7921760cf79 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3018,9 +3018,9 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, * validation through pte_unmap_same(). It's of NUMA type but * the pfn may be screwed if the read is non atomic. * - * ptep_modify_prot_start is not called as this is clearing - * the _PAGE_NUMA bit and it is not really expected that there - * would be concurrent hardware modifications to the PTE. + * We can safely just do a "set_pte_at()", because the old + * page table entry is not accessible, so there would be no + * concurrent hardware modifications to the PTE. */ ptl = pte_lockptr(mm, pmd); spin_lock(ptl); @@ -3029,7 +3029,9 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out; } - pte = pte_mknonnuma(pte); + /* Make it present again */ + pte = pte_modify(pte, vma->vm_page_prot); + pte = pte_mkyoung(pte); set_pte_at(mm, addr, ptep, pte); update_mmu_cache(vma, addr, ptep); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f1bd23803576..c75f4dcec808 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -569,7 +569,7 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, { int nr_updated; - nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); + nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1); if (nr_updated) count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); diff --git a/mm/migrate.c b/mm/migrate.c index 5e8f03a8de2a..85e042686031 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1847,7 +1847,7 @@ out_fail: out_dropref: ptl = pmd_lock(mm, pmd); if (pmd_same(*pmd, entry)) { - entry = pmd_mknonnuma(entry); + entry = pmd_modify(entry, vma->vm_page_prot); set_pmd_at(mm, mmun_start, pmd, entry); update_mmu_cache_pmd(vma, address, &entry); } diff --git a/mm/mprotect.c b/mm/mprotect.c index 44ffa698484d..76824d73380d 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -142,7 +142,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, split_huge_page_pmd(vma, addr, pmd); else { int nr_ptes = change_huge_pmd(vma, pmd, addr, - newprot, prot_numa); + newprot); if (nr_ptes) { if (nr_ptes == HPAGE_PMD_NR) { diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 4b8ad760dde3..c25f94b33811 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -193,8 +193,6 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { pmd_t entry = *pmdp; - if (pmd_protnone(entry)) - entry = pmd_mknonnuma(entry); set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry)); flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); } -- cgit v1.2.3 From 21d9ee3eda7792c45880b2f11bff8e95c9a061fb Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 12 Feb 2015 14:58:32 -0800 Subject: mm: remove remaining references to NUMA hinting bits and helpers This patch removes the NUMA PTE bits and associated helpers. As a side-effect it increases the maximum possible swap space on x86-64. One potential source of problems is races between the marking of PTEs PROT_NONE, NUMA hinting faults and migration. 
It must be guaranteed that a PTE being protected is not faulted in parallel, seen as a pte_none and corrupting memory. The base case is safe, but transhuge has had problems in the past due to a different migration mechanism and a dependence on the page lock to serialise migrations, so it warrants a closer look.

task_work hinting update                        parallel fault
------------------------                       --------------
change_pmd_range
  change_huge_pmd
    __pmd_trans_huge_lock
      pmdp_get_and_clear
                                                __handle_mm_fault
                                                pmd_none
                                                do_huge_pmd_anonymous_page
      read? pmd_lock blocks until hinting complete, fail !pmd_none test
      write? __do_huge_pmd_anonymous_page acquires pmd_lock, checks pmd_none
      pmd_modify
      set_pmd_at

task_work hinting update                        parallel migration
------------------------                       ------------------
change_pmd_range
  change_huge_pmd
    __pmd_trans_huge_lock
      pmdp_get_and_clear
                                                __handle_mm_fault
                                                do_huge_pmd_numa_page
                                                  migrate_misplaced_transhuge_page
                                                pmd_lock waits for updates to complete, recheck pmd_same
      pmd_modify
      set_pmd_at

Both of those are safe and the case where a transhuge page is inserted during a protection update is unchanged. The case where two processes try migrating at the same time is unchanged by this series so should still be ok. I could not find a case where we are accidentally depending on the PTE not being cleared and flushed. If one is missed, it'll manifest as corruption problems that start triggering shortly after this series is merged and only happen when NUMA balancing is enabled. Signed-off-by: Mel Gorman Tested-by: Sasha Levin Cc: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Dave Jones Cc: Hugh Dickins Cc: Ingo Molnar Cc: Kirill Shutemov Cc: Linus Torvalds Cc: Paul Mackerras Cc: Rik van Riel Cc: Mark Brown Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/pgtable.h | 54 +----------- arch/powerpc/include/asm/pte-common.h | 5 -- arch/powerpc/include/asm/pte-hash64.h | 6 -- arch/x86/include/asm/pgtable.h | 22 +---- arch/x86/include/asm/pgtable_64.h | 5 -- arch/x86/include/asm/pgtable_types.h | 41 +-------- include/asm-generic/pgtable.h | 155 ---------------------------- include/linux/swapops.h | 2 +- 8 files changed, 7 insertions(+), 283 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 1146006d3477..79fee2eb8d56 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -55,64 +55,12 @@ static inline int pmd_protnone(pmd_t pmd) { return pte_protnone(pmd_pte(pmd)); } - -static inline int pte_present(pte_t pte) -{ - return pte_val(pte) & _PAGE_NUMA_MASK; -} - -#define pte_present_nonuma pte_present_nonuma -static inline int pte_present_nonuma(pte_t pte) -{ - return pte_val(pte) & (_PAGE_PRESENT); -} - -#define ptep_set_numa ptep_set_numa -static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - if ((pte_val(*ptep) & _PAGE_PRESENT) == 0) - VM_BUG_ON(1); - - pte_update(mm, addr, ptep, _PAGE_PRESENT, _PAGE_NUMA, 0); - return; -} - -#define pmdp_set_numa pmdp_set_numa -static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp) -{ - if ((pmd_val(*pmdp) & _PAGE_PRESENT) == 0) - VM_BUG_ON(1); - - pmd_hugepage_update(mm, addr, pmdp, _PAGE_PRESENT, _PAGE_NUMA); - return; -} - -/* - * Generic NUMA pte helpers expect pteval_t and pmdval_t types to exist - * which was inherited from x86.
For the purposes of powerpc pte_basic_t and - * pmd_t are equivalent - */ -#define pteval_t pte_basic_t -#define pmdval_t pmd_t -static inline pteval_t ptenuma_flags(pte_t pte) -{ - return pte_val(pte) & _PAGE_NUMA_MASK; -} - -static inline pmdval_t pmdnuma_flags(pmd_t pmd) -{ - return pmd_val(pmd) & _PAGE_NUMA_MASK; -} - -# else +#endif /* CONFIG_NUMA_BALANCING */ static inline int pte_present(pte_t pte) { return pte_val(pte) & _PAGE_PRESENT; } -#endif /* CONFIG_NUMA_BALANCING */ /* Conversion functions: convert a page and protection to a page entry, * and a page entry and page directory to the page they refer to. diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h index 2aef9b7a0eb2..c5a755ef7011 100644 --- a/arch/powerpc/include/asm/pte-common.h +++ b/arch/powerpc/include/asm/pte-common.h @@ -104,11 +104,6 @@ extern unsigned long bad_call_to_PMD_PAGE_SIZE(void); _PAGE_USER | _PAGE_ACCESSED | _PAGE_RO | \ _PAGE_RW | _PAGE_HWWRITE | _PAGE_DIRTY | _PAGE_EXEC) -#ifdef CONFIG_NUMA_BALANCING -/* Mask of bits that distinguish present and numa ptes */ -#define _PAGE_NUMA_MASK (_PAGE_NUMA|_PAGE_PRESENT) -#endif - /* * We define 2 sets of base prot bits, one for basic pages (ie, * cacheable kernel and user pages) and one for non cacheable diff --git a/arch/powerpc/include/asm/pte-hash64.h b/arch/powerpc/include/asm/pte-hash64.h index 2505d8eab15c..55aea0caf95e 100644 --- a/arch/powerpc/include/asm/pte-hash64.h +++ b/arch/powerpc/include/asm/pte-hash64.h @@ -27,12 +27,6 @@ #define _PAGE_RW 0x0200 /* software: user write access allowed */ #define _PAGE_BUSY 0x0800 /* software: PTE & hash are busy */ -/* - * Used for tracking numa faults - */ -#define _PAGE_NUMA 0x00000010 /* Gather numa placement stats */ - - /* No separate kernel read-only */ #define _PAGE_KERNEL_RW (_PAGE_RW | _PAGE_DIRTY) /* user access blocked by key */ #define _PAGE_KERNEL_RO _PAGE_KERNEL_RW diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index f519b0b529dd..34d42a7d5595 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -300,7 +300,7 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd) static inline pmd_t pmd_mknotpresent(pmd_t pmd) { - return pmd_clear_flags(pmd, _PAGE_PRESENT); + return pmd_clear_flags(pmd, _PAGE_PRESENT | _PAGE_PROTNONE); } #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY @@ -442,13 +442,6 @@ static inline int pte_same(pte_t a, pte_t b) } static inline int pte_present(pte_t a) -{ - return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE | - _PAGE_NUMA); -} - -#define pte_present_nonuma pte_present_nonuma -static inline int pte_present_nonuma(pte_t a) { return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); } @@ -459,7 +452,7 @@ static inline bool pte_accessible(struct mm_struct *mm, pte_t a) if (pte_flags(a) & _PAGE_PRESENT) return true; - if ((pte_flags(a) & (_PAGE_PROTNONE | _PAGE_NUMA)) && + if ((pte_flags(a) & _PAGE_PROTNONE) && mm_tlb_flush_pending(mm)) return true; @@ -479,8 +472,7 @@ static inline int pmd_present(pmd_t pmd) * the _PAGE_PSE flag will remain set at all times while the * _PAGE_PRESENT bit is clear). 
*/ - return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE | - _PAGE_NUMA); + return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE); } #ifdef CONFIG_NUMA_BALANCING @@ -555,11 +547,6 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address) static inline int pmd_bad(pmd_t pmd) { -#ifdef CONFIG_NUMA_BALANCING - /* pmd_numa check */ - if ((pmd_flags(pmd) & (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA) - return 0; -#endif return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE; } @@ -878,19 +865,16 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline pte_t pte_swp_mksoft_dirty(pte_t pte) { - VM_BUG_ON(pte_present_nonuma(pte)); return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); } static inline int pte_swp_soft_dirty(pte_t pte) { - VM_BUG_ON(pte_present_nonuma(pte)); return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; } static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) { - VM_BUG_ON(pte_present_nonuma(pte)); return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); } #endif diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index e227970f983e..2ee781114d34 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -142,12 +142,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; } /* Encode and de-code a swap entry */ #define SWP_TYPE_BITS 5 -#ifdef CONFIG_NUMA_BALANCING -/* Automatic NUMA balancing needs to be distinguishable from swap entries */ -#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 2) -#else #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) -#endif #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 3e0230c94cff..8c7c10802e9c 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -27,14 +27,6 @@ #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ -/* - * Swap offsets on configurations that allow automatic NUMA balancing use the - * bits after _PAGE_BIT_GLOBAL. To uniquely distinguish NUMA hinting PTEs from - * swap entries, we use the first bit after _PAGE_BIT_GLOBAL and shrink the - * maximum possible swap space from 16TB to 8TB. - */ -#define _PAGE_BIT_NUMA (_PAGE_BIT_GLOBAL+1) - /* If _PAGE_BIT_PRESENT is clear, we use these: */ /* - if the user mapped it with PROT_NONE; pte_present gives true */ #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL @@ -75,21 +67,6 @@ #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) #endif -/* - * _PAGE_NUMA distinguishes between a numa hinting minor fault and a page - * that is not present. The hinting fault gathers numa placement statistics - * (see pte_numa()). The bit is always zero when the PTE is not present. - * - * The bit picked must be always zero when the pmd is present and not - * present, so that we don't lose information when we set it while - * atomically clearing the present bit. - */ -#ifdef CONFIG_NUMA_BALANCING -#define _PAGE_NUMA (_AT(pteval_t, 1) << _PAGE_BIT_NUMA) -#else -#define _PAGE_NUMA (_AT(pteval_t, 0)) -#endif - /* * Tracking soft dirty bit when a page goes to a swap is tricky. 
* We need a bit which can be stored in pte _and_ not conflict @@ -122,8 +99,8 @@ /* Set of bits not changed in pte_modify */ #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \ - _PAGE_SOFT_DIRTY | _PAGE_NUMA) -#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_NUMA) + _PAGE_SOFT_DIRTY) +#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) /* * The cache modes defined here are used to translate between pure SW usage @@ -324,20 +301,6 @@ static inline pteval_t pte_flags(pte_t pte) return native_pte_val(pte) & PTE_FLAGS_MASK; } -#ifdef CONFIG_NUMA_BALANCING -/* Set of bits that distinguishes present, prot_none and numa ptes */ -#define _PAGE_NUMA_MASK (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT) -static inline pteval_t ptenuma_flags(pte_t pte) -{ - return pte_flags(pte) & _PAGE_NUMA_MASK; -} - -static inline pmdval_t pmdnuma_flags(pmd_t pmd) -{ - return pmd_flags(pmd) & _PAGE_NUMA_MASK; -} -#endif /* CONFIG_NUMA_BALANCING */ - #define pgprot_val(x) ((x).pgprot) #define __pgprot(x) ((pgprot_t) { (x) } ) diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 067922c06c29..4d46085c1b90 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -244,10 +244,6 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) # define pte_accessible(mm, pte) ((void)(pte), 1) #endif -#ifndef pte_present_nonuma -#define pte_present_nonuma(pte) pte_present(pte) -#endif - #ifndef flush_tlb_fix_spurious_fault #define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address) #endif @@ -693,157 +689,6 @@ static inline int pmd_protnone(pmd_t pmd) } #endif /* CONFIG_NUMA_BALANCING */ -#ifdef CONFIG_NUMA_BALANCING -/* - * _PAGE_NUMA distinguishes between an unmapped page table entry, an entry that - * is protected for PROT_NONE and a NUMA hinting fault entry. If the - * architecture defines __PAGE_PROTNONE then it should take that into account - * but those that do not can rely on the fact that the NUMA hinting scanner - * skips inaccessible VMAs. - * - * pte/pmd_present() returns true if pte/pmd_numa returns true. Page - * fault triggers on those regions if pte/pmd_numa returns true - * (because _PAGE_PRESENT is not set). - */ -#ifndef pte_numa -static inline int pte_numa(pte_t pte) -{ - return ptenuma_flags(pte) == _PAGE_NUMA; -} -#endif - -#ifndef pmd_numa -static inline int pmd_numa(pmd_t pmd) -{ - return pmdnuma_flags(pmd) == _PAGE_NUMA; -} -#endif - -/* - * pte/pmd_mknuma sets the _PAGE_ACCESSED bitflag automatically - * because they're called by the NUMA hinting minor page fault. If we - * wouldn't set the _PAGE_ACCESSED bitflag here, the TLB miss handler - * would be forced to set it later while filling the TLB after we - * return to userland. That would trigger a second write to memory - * that we optimize away by setting _PAGE_ACCESSED here. 
- */ -#ifndef pte_mknonnuma -static inline pte_t pte_mknonnuma(pte_t pte) -{ - pteval_t val = pte_val(pte); - - val &= ~_PAGE_NUMA; - val |= (_PAGE_PRESENT|_PAGE_ACCESSED); - return __pte(val); -} -#endif - -#ifndef pmd_mknonnuma -static inline pmd_t pmd_mknonnuma(pmd_t pmd) -{ - pmdval_t val = pmd_val(pmd); - - val &= ~_PAGE_NUMA; - val |= (_PAGE_PRESENT|_PAGE_ACCESSED); - - return __pmd(val); -} -#endif - -#ifndef pte_mknuma -static inline pte_t pte_mknuma(pte_t pte) -{ - pteval_t val = pte_val(pte); - - VM_BUG_ON(!(val & _PAGE_PRESENT)); - - val &= ~_PAGE_PRESENT; - val |= _PAGE_NUMA; - - return __pte(val); -} -#endif - -#ifndef ptep_set_numa -static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - pte_t ptent = *ptep; - - ptent = pte_mknuma(ptent); - set_pte_at(mm, addr, ptep, ptent); - return; -} -#endif - -#ifndef pmd_mknuma -static inline pmd_t pmd_mknuma(pmd_t pmd) -{ - pmdval_t val = pmd_val(pmd); - - val &= ~_PAGE_PRESENT; - val |= _PAGE_NUMA; - - return __pmd(val); -} -#endif - -#ifndef pmdp_set_numa -static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp) -{ - pmd_t pmd = *pmdp; - - pmd = pmd_mknuma(pmd); - set_pmd_at(mm, addr, pmdp, pmd); - return; -} -#endif -#else -static inline int pmd_numa(pmd_t pmd) -{ - return 0; -} - -static inline int pte_numa(pte_t pte) -{ - return 0; -} - -static inline pte_t pte_mknonnuma(pte_t pte) -{ - return pte; -} - -static inline pmd_t pmd_mknonnuma(pmd_t pmd) -{ - return pmd; -} - -static inline pte_t pte_mknuma(pte_t pte) -{ - return pte; -} - -static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - return; -} - - -static inline pmd_t pmd_mknuma(pmd_t pmd) -{ - return pmd; -} - -static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp) -{ - return ; -} -#endif /* CONFIG_NUMA_BALANCING */ - #endif /* CONFIG_MMU */ #endif /* !__ASSEMBLY__ */ diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 831a3168ab35..cedf3d3c373f 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -54,7 +54,7 @@ static inline pgoff_t swp_offset(swp_entry_t entry) /* check whether a pte points to a swap entry */ static inline int is_swap_pte(pte_t pte) { - return !pte_none(pte) && !pte_present_nonuma(pte); + return !pte_none(pte) && !pte_present(pte); } #endif -- cgit v1.2.3 From e944fd67b625c02bda4a78ddf85e413c5e401474 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 12 Feb 2015 14:58:35 -0800 Subject: mm: numa: do not trap faults on the huge zero page Faults on the huge zero page are pointless and there is a BUG_ON to catch them during fault time. This patch reintroduces a check that avoids marking the zero page PAGE_NONE. 
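[Editorial illustration.] In sketch form, the reintroduced guard sits at the top of the huge-pmd protection update and simply bails out before a prot_numa change can hit the zero page (compare the change_huge_pmd() hunk below):

	if (prot_numa && is_huge_zero_pmd(*pmd)) {
		spin_unlock(ptl);
		return 0;	/* do not trap faults against the zero page */
	}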
Signed-off-by: Mel Gorman Cc: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Dave Jones Cc: Hugh Dickins Cc: Ingo Molnar Cc: Kirill Shutemov Cc: Linus Torvalds Cc: Paul Mackerras Cc: Rik van Riel Cc: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 3 ++- mm/huge_memory.c | 13 ++++++++++++- mm/memory.c | 1 - mm/mprotect.c | 14 +++++++++++++- 4 files changed, 27 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 062bd252e994..f10b20f05159 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -31,7 +31,8 @@ extern int move_huge_pmd(struct vm_area_struct *vma, unsigned long new_addr, unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd); extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, pgprot_t newprot); + unsigned long addr, pgprot_t newprot, + int prot_numa); enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_FLAG, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index cb9b3e847dac..8e791a3db6b6 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1471,7 +1471,7 @@ out: * - HPAGE_PMD_NR is protections changed and TLB flush necessary */ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, pgprot_t newprot) + unsigned long addr, pgprot_t newprot, int prot_numa) { struct mm_struct *mm = vma->vm_mm; spinlock_t *ptl; @@ -1479,6 +1479,17 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { pmd_t entry; + + /* + * Avoid trapping faults against the zero page. The read-only + * data is likely to be read-cached on the local CPU and + * local/remote hits to the zero page are not interesting. + */ + if (prot_numa && is_huge_zero_pmd(*pmd)) { + spin_unlock(ptl); + return 0; + } + ret = 1; entry = pmdp_get_and_clear_notify(mm, addr, pmd); entry = pmd_modify(entry, newprot); diff --git a/mm/memory.c b/mm/memory.c index d7921760cf79..bf244f56b05a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3040,7 +3040,6 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, pte_unmap_unlock(ptep, ptl); return 0; } - BUG_ON(is_zero_pfn(page_to_pfn(page))); /* * Avoid grouping on DSO/COW pages in specific and RO pages diff --git a/mm/mprotect.c b/mm/mprotect.c index 76824d73380d..dd599fc235c2 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -76,6 +76,18 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (pte_present(oldpte)) { pte_t ptent; + /* + * Avoid trapping faults against the zero or KSM + * pages. See similar comment in change_huge_pmd. + */ + if (prot_numa) { + struct page *page; + + page = vm_normal_page(vma, addr, oldpte); + if (!page || PageKsm(page)) + continue; + } + ptent = ptep_modify_prot_start(mm, addr, pte); ptent = pte_modify(ptent, newprot); @@ -142,7 +154,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, split_huge_page_pmd(vma, addr, pmd); else { int nr_ptes = change_huge_pmd(vma, pmd, addr, - newprot); + newprot, prot_numa); if (nr_ptes) { if (nr_ptes == HPAGE_PMD_NR) { -- cgit v1.2.3 From 503c358cf1925853195ee39ec437e51138bbb7df Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 12 Feb 2015 14:58:47 -0800 Subject: list_lru: introduce list_lru_shrink_{count,walk} Kmem accounting of memcg is unusable now, because it lacks slab shrinker support. That means when we hit the limit we will get ENOMEM w/o any chance to recover. 
What we should do then is to call shrink_slab, which would reclaim old inode/dentry caches from this cgroup. This is what this patch set is intended to do. Basically, it does two things. First, it introduces the notion of a per-memcg slab shrinker. A shrinker that wants to reclaim objects per cgroup should mark itself as SHRINKER_MEMCG_AWARE. Then it will be passed the memory cgroup to scan from in shrink_control->memcg. For such shrinkers shrink_slab iterates over the whole cgroup subtree under the target cgroup and calls the shrinker for each kmem-active memory cgroup. Secondly, this patch set makes the list_lru structure per-memcg. It's done transparently to list_lru users - everything they have to do is to tell list_lru_init that they want a memcg-aware list_lru. Then the list_lru will automatically distribute objects among per-memcg lists based on which cgroup the object is accounted to. This way, to make FS shrinkers (icache, dcache) memcg-aware we only need to make them use a memcg-aware list_lru, and this is what this patch set does. As before, this patch set only enables per-memcg kmem reclaim when the pressure goes from memory.limit, not from memory.kmem.limit. Handling memory.kmem.limit is going to be tricky due to GFP_NOFS allocations, and it is still unclear whether we will have this knob in the unified hierarchy.

This patch (of 9):

NUMA aware slab shrinkers use the list_lru structure to distribute objects coming from different NUMA nodes to different lists. Whenever such a shrinker needs to count or scan objects from a particular node, it issues commands like this:

	count = list_lru_count_node(lru, sc->nid);
	freed = list_lru_walk_node(lru, sc->nid, isolate_func,
				   isolate_arg, &sc->nr_to_scan);

where sc is an instance of the shrink_control structure passed to it from vmscan. To simplify this, let's add special list_lru functions to be used by shrinkers, list_lru_shrink_count() and list_lru_shrink_walk(), which consolidate the nid and nr_to_scan arguments in the shrink_control structure. This will also allow us to avoid patching shrinkers that use list_lru when we make shrink_slab() per-memcg - all we will have to do is extend the shrink_control structure to include the target memcg and make list_lru_shrink_{count,walk} handle this appropriately. Signed-off-by: Vladimir Davydov Suggested-by: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Cc: Greg Thelen Cc: Glauber Costa Cc: Alexander Viro Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dcache.c | 14 ++++++-------- fs/gfs2/quota.c | 6 +++--- fs/inode.c | 7 +++---- fs/internal.h | 7 +++---- fs/super.c | 24 +++++++++++------------- fs/xfs/xfs_buf.c | 7 +++---- fs/xfs/xfs_qm.c | 7 +++---- include/linux/list_lru.h | 16 ++++++++++++++++ mm/workingset.c | 6 +++--- 9 files changed, 51 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/fs/dcache.c b/fs/dcache.c index e368d4f412f9..56c5da89f58a 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -930,24 +930,22 @@ dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) /** * prune_dcache_sb - shrink the dcache * @sb: superblock - * @nr_to_scan : number of entries to try to free - * @nid: which node to scan for freeable entities + * @sc: shrink control, passed to list_lru_shrink_walk() * - * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries.
This is - * done when we need more memory an called from the superblock shrinker + * Attempt to shrink the superblock dcache LRU by @sc->nr_to_scan entries. This + * is done when we need more memory and called from the superblock shrinker * function. * * This function may fail to free any resources if all the dentries are in * use. */ -long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan, - int nid) +long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc) { LIST_HEAD(dispose); long freed; - freed = list_lru_walk_node(&sb->s_dentry_lru, nid, dentry_lru_isolate, - &dispose, &nr_to_scan); + freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc, + dentry_lru_isolate, &dispose); shrink_dentry_list(&dispose); return freed; } diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 3e193cb36996..c15d6b216d0b 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -171,8 +171,8 @@ static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink, if (!(sc->gfp_mask & __GFP_FS)) return SHRINK_STOP; - freed = list_lru_walk_node(&gfs2_qd_lru, sc->nid, gfs2_qd_isolate, - &dispose, &sc->nr_to_scan); + freed = list_lru_shrink_walk(&gfs2_qd_lru, sc, + gfs2_qd_isolate, &dispose); gfs2_qd_dispose(&dispose); @@ -182,7 +182,7 @@ static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink, static unsigned long gfs2_qd_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { - return vfs_pressure_ratio(list_lru_count_node(&gfs2_qd_lru, sc->nid)); + return vfs_pressure_ratio(list_lru_shrink_count(&gfs2_qd_lru, sc)); } struct shrinker gfs2_qd_shrinker = { diff --git a/fs/inode.c b/fs/inode.c index 3a53b1da3fb8..524a32c2b0c6 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -751,14 +751,13 @@ inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) * to trim from the LRU. Inodes to be freed are moved to a temporary list and * then are freed outside inode_lock by dispose_list(). 
*/ -long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan, - int nid) +long prune_icache_sb(struct super_block *sb, struct shrink_control *sc) { LIST_HEAD(freeable); long freed; - freed = list_lru_walk_node(&sb->s_inode_lru, nid, inode_lru_isolate, - &freeable, &nr_to_scan); + freed = list_lru_shrink_walk(&sb->s_inode_lru, sc, + inode_lru_isolate, &freeable); dispose_list(&freeable); return freed; } diff --git a/fs/internal.h b/fs/internal.h index e9a61fe67575..d92c346a793d 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -14,6 +14,7 @@ struct file_system_type; struct linux_binprm; struct path; struct mount; +struct shrink_control; /* * block_dev.c @@ -111,8 +112,7 @@ extern int open_check_o_direct(struct file *f); * inode.c */ extern spinlock_t inode_sb_list_lock; -extern long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan, - int nid); +extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); extern void inode_add_lru(struct inode *inode); /* @@ -129,8 +129,7 @@ extern int invalidate_inodes(struct super_block *, bool); */ extern struct dentry *__d_alloc(struct super_block *, const struct qstr *); extern int d_set_mounted(struct dentry *dentry); -extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan, - int nid); +extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc); /* * read_write.c diff --git a/fs/super.c b/fs/super.c index eae088f6aaae..4554ac257647 100644 --- a/fs/super.c +++ b/fs/super.c @@ -77,8 +77,8 @@ static unsigned long super_cache_scan(struct shrinker *shrink, if (sb->s_op->nr_cached_objects) fs_objects = sb->s_op->nr_cached_objects(sb, sc->nid); - inodes = list_lru_count_node(&sb->s_inode_lru, sc->nid); - dentries = list_lru_count_node(&sb->s_dentry_lru, sc->nid); + inodes = list_lru_shrink_count(&sb->s_inode_lru, sc); + dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc); total_objects = dentries + inodes + fs_objects + 1; if (!total_objects) total_objects = 1; @@ -86,20 +86,20 @@ static unsigned long super_cache_scan(struct shrinker *shrink, /* proportion the scan between the caches */ dentries = mult_frac(sc->nr_to_scan, dentries, total_objects); inodes = mult_frac(sc->nr_to_scan, inodes, total_objects); + fs_objects = mult_frac(sc->nr_to_scan, fs_objects, total_objects); /* * prune the dcache first as the icache is pinned by it, then * prune the icache, followed by the filesystem specific caches */ - freed = prune_dcache_sb(sb, dentries, sc->nid); - freed += prune_icache_sb(sb, inodes, sc->nid); + sc->nr_to_scan = dentries; + freed = prune_dcache_sb(sb, sc); + sc->nr_to_scan = inodes; + freed += prune_icache_sb(sb, sc); - if (fs_objects) { - fs_objects = mult_frac(sc->nr_to_scan, fs_objects, - total_objects); + if (fs_objects) freed += sb->s_op->free_cached_objects(sb, fs_objects, sc->nid); - } drop_super(sb); return freed; @@ -118,17 +118,15 @@ static unsigned long super_cache_count(struct shrinker *shrink, * scalability bottleneck. The counts could get updated * between super_cache_count and super_cache_scan anyway. * Call to super_cache_count with shrinker_rwsem held - * ensures the safety of call to list_lru_count_node() and + * ensures the safety of call to list_lru_shrink_count() and * s_op->nr_cached_objects(). 
*/ if (sb->s_op && sb->s_op->nr_cached_objects) total_objects = sb->s_op->nr_cached_objects(sb, sc->nid); - total_objects += list_lru_count_node(&sb->s_dentry_lru, - sc->nid); - total_objects += list_lru_count_node(&sb->s_inode_lru, - sc->nid); + total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc); + total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc); total_objects = vfs_pressure_ratio(total_objects); return total_objects; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index bb502a391792..15c9d224c721 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1583,10 +1583,9 @@ xfs_buftarg_shrink_scan( struct xfs_buftarg, bt_shrinker); LIST_HEAD(dispose); unsigned long freed; - unsigned long nr_to_scan = sc->nr_to_scan; - freed = list_lru_walk_node(&btp->bt_lru, sc->nid, xfs_buftarg_isolate, - &dispose, &nr_to_scan); + freed = list_lru_shrink_walk(&btp->bt_lru, sc, + xfs_buftarg_isolate, &dispose); while (!list_empty(&dispose)) { struct xfs_buf *bp; @@ -1605,7 +1604,7 @@ xfs_buftarg_shrink_count( { struct xfs_buftarg *btp = container_of(shrink, struct xfs_buftarg, bt_shrinker); - return list_lru_count_node(&btp->bt_lru, sc->nid); + return list_lru_shrink_count(&btp->bt_lru, sc); } void diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 3e8186279541..4f4b1274e144 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -523,7 +523,6 @@ xfs_qm_shrink_scan( struct xfs_qm_isolate isol; unsigned long freed; int error; - unsigned long nr_to_scan = sc->nr_to_scan; if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT)) return 0; @@ -531,8 +530,8 @@ xfs_qm_shrink_scan( INIT_LIST_HEAD(&isol.buffers); INIT_LIST_HEAD(&isol.dispose); - freed = list_lru_walk_node(&qi->qi_lru, sc->nid, xfs_qm_dquot_isolate, &isol, - &nr_to_scan); + freed = list_lru_shrink_walk(&qi->qi_lru, sc, + xfs_qm_dquot_isolate, &isol); error = xfs_buf_delwri_submit(&isol.buffers); if (error) @@ -557,7 +556,7 @@ xfs_qm_shrink_count( struct xfs_quotainfo *qi = container_of(shrink, struct xfs_quotainfo, qi_shrinker); - return list_lru_count_node(&qi->qi_lru, sc->nid); + return list_lru_shrink_count(&qi->qi_lru, sc); } /* diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index f3434533fbf8..f500a2e39b13 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -9,6 +9,7 @@ #include #include +#include /* list_lru_walk_cb has to always return one of those */ enum lru_status { @@ -81,6 +82,13 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item); * Callers that want such a guarantee need to provide an outer lock. 
*/ unsigned long list_lru_count_node(struct list_lru *lru, int nid); + +static inline unsigned long list_lru_shrink_count(struct list_lru *lru, + struct shrink_control *sc) +{ + return list_lru_count_node(lru, sc->nid); +} + static inline unsigned long list_lru_count(struct list_lru *lru) { long count = 0; @@ -119,6 +127,14 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk); +static inline unsigned long +list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc, + list_lru_walk_cb isolate, void *cb_arg) +{ + return list_lru_walk_node(lru, sc->nid, isolate, cb_arg, + &sc->nr_to_scan); +} + static inline unsigned long list_lru_walk(struct list_lru *lru, list_lru_walk_cb isolate, void *cb_arg, unsigned long nr_to_walk) diff --git a/mm/workingset.c b/mm/workingset.c index f7216fa7da27..d4fa7fb10a52 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -275,7 +275,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ local_irq_disable(); - shadow_nodes = list_lru_count_node(&workingset_shadow_nodes, sc->nid); + shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc); local_irq_enable(); pages = node_present_pages(sc->nid); @@ -376,8 +376,8 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker, /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ local_irq_disable(); - ret = list_lru_walk_node(&workingset_shadow_nodes, sc->nid, - shadow_lru_isolate, NULL, &sc->nr_to_scan); + ret = list_lru_shrink_walk(&workingset_shadow_nodes, sc, + shadow_lru_isolate, NULL); local_irq_enable(); return ret; } -- cgit v1.2.3 From 4101b624352fddb5ed72e7a1b6f8be8cffaa20fa Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 12 Feb 2015 14:58:51 -0800 Subject: fs: consolidate {nr,free}_cached_objects args in shrink_control We are going to make FS shrinkers memcg-aware. To achieve that, we will have to pass the memcg to scan to the nr_cached_objects and free_cached_objects VFS methods, which currently take only the NUMA node to scan. Since the shrink_control structure already holds the node, and the memcg to scan will be added to it when we introduce memcg-aware vmscan, let us consolidate the methods' arguments in this structure to keep things clean. 
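[Editorial illustration.] For a filesystem implementing these hooks, the converted methods would look roughly like the following sketch (the foofs_* names and helpers are hypothetical, for illustration only):

	static long foofs_nr_cached_objects(struct super_block *sb,
					    struct shrink_control *sc)
	{
		/* sc carries the node to scan (sc->nid) and, later, the memcg */
		return foofs_count_reclaimable(sb);
	}

	static long foofs_free_cached_objects(struct super_block *sb,
					      struct shrink_control *sc)
	{
		/* reclaim up to sc->nr_to_scan objects */
		return foofs_reclaim(sb, sc->nr_to_scan);
	}

This mirrors the xfs conversion below, where xfs_fs_free_cached_objects() simply reads sc->nr_to_scan.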
Signed-off-by: Vladimir Davydov Suggested-by: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Cc: Greg Thelen Cc: Glauber Costa Cc: Alexander Viro Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/super.c | 12 ++++++------ fs/xfs/xfs_super.c | 7 +++---- include/linux/fs.h | 6 ++++-- 3 files changed, 13 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/fs/super.c b/fs/super.c index 4554ac257647..a2b735a42e74 100644 --- a/fs/super.c +++ b/fs/super.c @@ -75,7 +75,7 @@ static unsigned long super_cache_scan(struct shrinker *shrink, return SHRINK_STOP; if (sb->s_op->nr_cached_objects) - fs_objects = sb->s_op->nr_cached_objects(sb, sc->nid); + fs_objects = sb->s_op->nr_cached_objects(sb, sc); inodes = list_lru_shrink_count(&sb->s_inode_lru, sc); dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc); @@ -97,9 +97,10 @@ static unsigned long super_cache_scan(struct shrinker *shrink, sc->nr_to_scan = inodes; freed += prune_icache_sb(sb, sc); - if (fs_objects) - freed += sb->s_op->free_cached_objects(sb, fs_objects, - sc->nid); + if (fs_objects) { + sc->nr_to_scan = fs_objects; + freed += sb->s_op->free_cached_objects(sb, sc); + } drop_super(sb); return freed; @@ -122,8 +123,7 @@ static unsigned long super_cache_count(struct shrinker *shrink, * s_op->nr_cached_objects(). */ if (sb->s_op && sb->s_op->nr_cached_objects) - total_objects = sb->s_op->nr_cached_objects(sb, - sc->nid); + total_objects = sb->s_op->nr_cached_objects(sb, sc); total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc); total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index f2449fd86926..8fcc4ccc5c79 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1537,7 +1537,7 @@ xfs_fs_mount( static long xfs_fs_nr_cached_objects( struct super_block *sb, - int nid) + struct shrink_control *sc) { return xfs_reclaim_inodes_count(XFS_M(sb)); } @@ -1545,10 +1545,9 @@ xfs_fs_nr_cached_objects( static long xfs_fs_free_cached_objects( struct super_block *sb, - long nr_to_scan, - int nid) + struct shrink_control *sc) { - return xfs_reclaim_inodes_nr(XFS_M(sb), nr_to_scan); + return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan); } static const struct super_operations xfs_super_operations = { diff --git a/include/linux/fs.h b/include/linux/fs.h index cdcb1e9d9613..a20d6586e517 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1635,8 +1635,10 @@ struct super_operations { struct dquot **(*get_dquots)(struct inode *); #endif int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t); - long (*nr_cached_objects)(struct super_block *, int); - long (*free_cached_objects)(struct super_block *, long, int); + long (*nr_cached_objects)(struct super_block *, + struct shrink_control *); + long (*free_cached_objects)(struct super_block *, + struct shrink_control *); }; /* -- cgit v1.2.3 From cb731d6c62bbc2f890b08ea3d0386d5dad887326 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 12 Feb 2015 14:58:54 -0800 Subject: vmscan: per memory cgroup slab shrinkers This patch adds SHRINKER_MEMCG_AWARE flag. If a shrinker has this flag set, it will be called per memory cgroup. The memory cgroup to scan objects from is passed in shrink_control->memcg. If the memory cgroup is NULL, a memcg aware shrinker is supposed to scan objects from the global list. Unaware shrinkers are only called on global pressure with memcg=NULL. 
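[Editorial illustration.] A memcg-aware shrinker would then be declared along the following lines (a sketch against the 3.19-era shrinker API; the foo_* callbacks are hypothetical):

	static unsigned long foo_count_objects(struct shrinker *shrink,
					       struct shrink_control *sc)
	{
		/* sc->memcg is NULL on global pressure */
		return foo_nr_objects(sc->nid, sc->memcg);
	}

	static unsigned long foo_scan_objects(struct shrinker *shrink,
					      struct shrink_control *sc)
	{
		return foo_reclaim(sc->nid, sc->memcg, sc->nr_to_scan);
	}

	static struct shrinker foo_shrinker = {
		.count_objects	= foo_count_objects,
		.scan_objects	= foo_scan_objects,
		.seeks		= DEFAULT_SEEKS,
		.flags		= SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
	};

registered as usual with register_shrinker(&foo_shrinker).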
Signed-off-by: Vladimir Davydov Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Cc: Greg Thelen Cc: Glauber Costa Cc: Alexander Viro Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/drop_caches.c | 14 -------- include/linux/memcontrol.h | 7 ++++ include/linux/mm.h | 5 ++- include/linux/shrinker.h | 6 +++- mm/memcontrol.c | 2 +- mm/memory-failure.c | 11 ++---- mm/vmscan.c | 85 ++++++++++++++++++++++++++++++++++------------ 7 files changed, 80 insertions(+), 50 deletions(-) (limited to 'include/linux') diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 2bc2c87f35e7..5718cb9f7273 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -37,20 +37,6 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) iput(toput_inode); } -static void drop_slab(void) -{ - int nr_objects; - - do { - int nid; - - nr_objects = 0; - for_each_online_node(nid) - nr_objects += shrink_node_slabs(GFP_KERNEL, nid, - 1000, 1000); - } while (nr_objects > 10); -} - int drop_caches_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6cfd934c7c9b..54992fe0959f 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -413,6 +413,8 @@ static inline bool memcg_kmem_enabled(void) return static_key_false(&memcg_kmem_enabled_key); } +bool memcg_kmem_is_active(struct mem_cgroup *memcg); + /* * In general, we'll do everything in our power to not incur in any overhead * for non-memcg users for the kmem functions. Not even a function call, if we @@ -542,6 +544,11 @@ static inline bool memcg_kmem_enabled(void) return false; } +static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg) +{ + return false; +} + static inline bool memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) { diff --git a/include/linux/mm.h b/include/linux/mm.h index a4d24f3c5430..af4ff88a11e0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2168,9 +2168,8 @@ int drop_caches_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); #endif -unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid, - unsigned long nr_scanned, - unsigned long nr_eligible); +void drop_slab(void); +void drop_slab_node(int nid); #ifndef CONFIG_MMU #define randomize_va_space 0 diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index f4aee75f00b1..4fcacd915d45 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -20,6 +20,9 @@ struct shrink_control { /* current node being shrunk (for NUMA aware shrinkers) */ int nid; + + /* current memcg being shrunk (for memcg aware shrinkers) */ + struct mem_cgroup *memcg; }; #define SHRINK_STOP (~0UL) @@ -61,7 +64,8 @@ struct shrinker { #define DEFAULT_SEEKS 2 /* A good number if you don't know better. 
*/ /* Flags */ -#define SHRINKER_NUMA_AWARE (1 << 0) +#define SHRINKER_NUMA_AWARE (1 << 0) +#define SHRINKER_MEMCG_AWARE (1 << 1) extern int register_shrinker(struct shrinker *); extern void unregister_shrinker(struct shrinker *); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 095c1f96fbec..3c2a1a8286ac 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -352,7 +352,7 @@ struct mem_cgroup { }; #ifdef CONFIG_MEMCG_KMEM -static bool memcg_kmem_is_active(struct mem_cgroup *memcg) +bool memcg_kmem_is_active(struct mem_cgroup *memcg) { return memcg->kmemcg_id >= 0; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index feb803bf3443..1a735fad2a13 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -242,15 +242,8 @@ void shake_page(struct page *p, int access) * Only call shrink_node_slabs here (which would also shrink * other caches) if access is not potentially fatal. */ - if (access) { - int nr; - int nid = page_to_nid(p); - do { - nr = shrink_node_slabs(GFP_KERNEL, nid, 1000, 1000); - if (page_count(p) == 1) - break; - } while (nr > 10); - } + if (access) + drop_slab_node(page_to_nid(p)); } EXPORT_SYMBOL_GPL(shake_page); diff --git a/mm/vmscan.c b/mm/vmscan.c index 8e645ee52045..803886b8e353 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -232,10 +232,10 @@ EXPORT_SYMBOL(unregister_shrinker); #define SHRINK_BATCH 128 -static unsigned long shrink_slabs(struct shrink_control *shrinkctl, - struct shrinker *shrinker, - unsigned long nr_scanned, - unsigned long nr_eligible) +static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, + struct shrinker *shrinker, + unsigned long nr_scanned, + unsigned long nr_eligible) { unsigned long freed = 0; unsigned long long delta; @@ -344,9 +344,10 @@ static unsigned long shrink_slabs(struct shrink_control *shrinkctl, } /** - * shrink_node_slabs - shrink slab caches of a given node + * shrink_slab - shrink slab caches * @gfp_mask: allocation context * @nid: node whose slab caches to target + * @memcg: memory cgroup whose slab caches to target * @nr_scanned: pressure numerator * @nr_eligible: pressure denominator * @@ -355,6 +356,12 @@ static unsigned long shrink_slabs(struct shrink_control *shrinkctl, * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set, * unaware shrinkers will receive a node id of 0 instead. * + * @memcg specifies the memory cgroup to target. If it is not NULL, + * only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan + * objects from the memory cgroup specified. Otherwise all shrinkers + * are called, and memcg aware shrinkers are supposed to scan the + * global list then. + * * @nr_scanned and @nr_eligible form a ratio that indicate how much of * the available objects should be scanned. Page reclaim for example * passes the number of pages scanned and the number of pages on the @@ -365,13 +372,17 @@ static unsigned long shrink_slabs(struct shrink_control *shrinkctl, * * Returns the number of reclaimed slab objects. 
*/ -unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid, - unsigned long nr_scanned, - unsigned long nr_eligible) +static unsigned long shrink_slab(gfp_t gfp_mask, int nid, + struct mem_cgroup *memcg, + unsigned long nr_scanned, + unsigned long nr_eligible) { struct shrinker *shrinker; unsigned long freed = 0; + if (memcg && !memcg_kmem_is_active(memcg)) + return 0; + if (nr_scanned == 0) nr_scanned = SWAP_CLUSTER_MAX; @@ -390,12 +401,16 @@ unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid, struct shrink_control sc = { .gfp_mask = gfp_mask, .nid = nid, + .memcg = memcg, }; + if (memcg && !(shrinker->flags & SHRINKER_MEMCG_AWARE)) + continue; + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) sc.nid = 0; - freed += shrink_slabs(&sc, shrinker, nr_scanned, nr_eligible); + freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible); } up_read(&shrinker_rwsem); @@ -404,6 +419,29 @@ out: return freed; } +void drop_slab_node(int nid) +{ + unsigned long freed; + + do { + struct mem_cgroup *memcg = NULL; + + freed = 0; + do { + freed += shrink_slab(GFP_KERNEL, nid, memcg, + 1000, 1000); + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); + } while (freed > 10); +} + +void drop_slab(void) +{ + int nid; + + for_each_online_node(nid) + drop_slab_node(nid); +} + static inline int is_page_cache_freeable(struct page *page) { /* @@ -2276,6 +2314,7 @@ static inline bool should_continue_reclaim(struct zone *zone, static bool shrink_zone(struct zone *zone, struct scan_control *sc, bool is_classzone) { + struct reclaim_state *reclaim_state = current->reclaim_state; unsigned long nr_reclaimed, nr_scanned; bool reclaimable = false; @@ -2294,6 +2333,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, memcg = mem_cgroup_iter(root, NULL, &reclaim); do { unsigned long lru_pages; + unsigned long scanned; struct lruvec *lruvec; int swappiness; @@ -2305,10 +2345,16 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, lruvec = mem_cgroup_zone_lruvec(zone, memcg); swappiness = mem_cgroup_swappiness(memcg); + scanned = sc->nr_scanned; shrink_lruvec(lruvec, swappiness, sc, &lru_pages); zone_lru_pages += lru_pages; + if (memcg && is_classzone) + shrink_slab(sc->gfp_mask, zone_to_nid(zone), + memcg, sc->nr_scanned - scanned, + lru_pages); + /* * Direct reclaim and kswapd have to scan all memory * cgroups to fulfill the overall scan target for the @@ -2330,19 +2376,14 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, * Shrink the slab caches in the same proportion that * the eligible LRU pages were scanned. 
*/ - if (global_reclaim(sc) && is_classzone) { - struct reclaim_state *reclaim_state; - - shrink_node_slabs(sc->gfp_mask, zone_to_nid(zone), - sc->nr_scanned - nr_scanned, - zone_lru_pages); - - reclaim_state = current->reclaim_state; - if (reclaim_state) { - sc->nr_reclaimed += - reclaim_state->reclaimed_slab; - reclaim_state->reclaimed_slab = 0; - } + if (global_reclaim(sc) && is_classzone) + shrink_slab(sc->gfp_mask, zone_to_nid(zone), NULL, + sc->nr_scanned - nr_scanned, + zone_lru_pages); + + if (reclaim_state) { + sc->nr_reclaimed += reclaim_state->reclaimed_slab; + reclaim_state->reclaimed_slab = 0; } vmpressure(sc->gfp_mask, sc->target_mem_cgroup, -- cgit v1.2.3 From dbcf73e26cd0b3d66e6db65ab595e664a55e58ff Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 12 Feb 2015 14:58:57 -0800 Subject: memcg: rename some cache id related variables memcg_limited_groups_array_size, which defines the size of memcg_caches arrays, sounds rather cumbersome. Also, the name gives no hint that it is related to the kmem caches. So let's rename it to memcg_nr_cache_ids. It's concise and points us directly to memcg_cache_id. Also, rename kmem_limited_groups to memcg_cache_ida. Signed-off-by: Vladimir Davydov Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Cc: Greg Thelen Cc: Glauber Costa Cc: Alexander Viro Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 4 ++-- mm/memcontrol.c | 19 +++++++++---------- mm/slab_common.c | 4 ++-- 3 files changed, 13 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 54992fe0959f..2607c91230af 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -398,7 +398,7 @@ static inline void sock_release_memcg(struct sock *sk) #ifdef CONFIG_MEMCG_KMEM extern struct static_key memcg_kmem_enabled_key; -extern int memcg_limited_groups_array_size; +extern int memcg_nr_cache_ids; /* * Helper macro to loop through all memcg-specific caches. Callers must still @@ -406,7 +406,7 @@ extern int memcg_limited_groups_array_size; * the slab_mutex must be held when looping through those caches */ #define for_each_memcg_cache_index(_idx) \ - for ((_idx) = 0; (_idx) < memcg_limited_groups_array_size; (_idx)++) + for ((_idx) = 0; (_idx) < memcg_nr_cache_ids; (_idx)++) static inline bool memcg_kmem_enabled(void) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3c2a1a8286ac..8608fa543b84 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -538,12 +538,11 @@ static void disarm_sock_keys(struct mem_cgroup *memcg) * memcgs, and none but the 200th is kmem-limited, we'd have to have a * 200 entry array for that. * - * The current size of the caches array is stored in - * memcg_limited_groups_array_size. It will double each time we have to - * increase it. + * The current size of the caches array is stored in memcg_nr_cache_ids. It + * will double each time we have to increase it. */ -static DEFINE_IDA(kmem_limited_groups); -int memcg_limited_groups_array_size; +static DEFINE_IDA(memcg_cache_ida); +int memcg_nr_cache_ids; /* * MIN_SIZE is different than 1, because we would like to avoid going through @@ -2538,12 +2537,12 @@ static int memcg_alloc_cache_id(void) int id, size; int err; - id = ida_simple_get(&kmem_limited_groups, + id = ida_simple_get(&memcg_cache_ida, 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); if (id < 0) return id; - if (id < memcg_limited_groups_array_size) + if (id < memcg_nr_cache_ids) return id; /* @@ -2559,7 +2558,7 @@ err = memcg_update_all_caches(size); if (err) { - ida_simple_remove(&kmem_limited_groups, id); + ida_simple_remove(&memcg_cache_ida, id); return err; } return id; @@ -2567,7 +2566,7 @@ static void memcg_free_cache_id(int id) { - ida_simple_remove(&kmem_limited_groups, id); + ida_simple_remove(&memcg_cache_ida, id); } /* @@ -2577,7 +2576,7 @@ static void memcg_free_cache_id(int id) */ void memcg_update_array_size(int num) { - memcg_limited_groups_array_size = num; + memcg_nr_cache_ids = num; } struct memcg_kmem_cache_create_work { diff --git a/mm/slab_common.c b/mm/slab_common.c index 6e1e4cf65836..f8899eedab68 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -116,7 +116,7 @@ static int memcg_alloc_cache_params(struct mem_cgroup *memcg, if (!memcg) { size = offsetof(struct memcg_cache_params, memcg_caches); - size += memcg_limited_groups_array_size * sizeof(void *); + size += memcg_nr_cache_ids * sizeof(void *); } else size = sizeof(struct memcg_cache_params); @@ -154,7 +154,7 @@ static int memcg_update_cache_params(struct kmem_cache *s, int num_memcgs) cur_params = s->memcg_params; memcpy(new_params->memcg_caches, cur_params->memcg_caches, - memcg_limited_groups_array_size * sizeof(void *)); + memcg_nr_cache_ids * sizeof(void *)); new_params->is_root_cache = true; -- cgit v1.2.3 From 05257a1a3dcc196c197714b5c9a8dd35b7f6aefc Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 12 Feb 2015 14:59:01 -0800 Subject: memcg: add rwsem to synchronize against memcg_caches arrays relocation We need a stable value of memcg_nr_cache_ids in kmem_cache_create() (memcg_alloc_cache_params() wants it for root caches), where we only hold the slab_mutex and no memcg-related locks. As a result, we have to update memcg_nr_cache_ids under the slab_mutex, which we can only take on the slab's side (see memcg_update_array_size). This looks awkward and will become even worse when per-memcg list_lru is introduced, which also wants stable access to memcg_nr_cache_ids. To get rid of this dependency between the memcg_nr_cache_ids and the slab_mutex, this patch introduces a special rwsem. The rwsem is held for writing during memcg_caches arrays relocation and memcg_nr_cache_ids updates. Therefore one can take it for reading to get stable access to the memcg_caches arrays and/or memcg_nr_cache_ids. Currently the semaphore is taken for reading only from kmem_cache_create, right before taking the slab_mutex, so right now there is not much point in using an rwsem instead of a mutex. However, once list_lru is made per-memcg it will allow list_lru initializations to proceed concurrently.
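[Editorial illustration.] The reader side is then just a bracketing pair around the section that needs the value to be stable (sketch; this is the pattern kmem_cache_create() adopts in the hunk below):

	memcg_get_cache_ids();		/* down_read(&memcg_cache_ids_sem) */
	/* memcg_nr_cache_ids and the memcg_caches arrays are stable here,
	 * e.g. while sizing a per-memcg array of cache pointers */
	memcg_put_cache_ids();		/* up_read(&memcg_cache_ids_sem) */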
Signed-off-by: Vladimir Davydov Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Cc: Greg Thelen Cc: Glauber Costa Cc: Alexander Viro Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 12 ++++++++++-- mm/memcontrol.c | 29 +++++++++++++++++++---------- mm/slab_common.c | 9 ++++----- 3 files changed, 33 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 2607c91230af..dbc4baa3619c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -399,6 +399,8 @@ static inline void sock_release_memcg(struct sock *sk) extern struct static_key memcg_kmem_enabled_key; extern int memcg_nr_cache_ids; +extern void memcg_get_cache_ids(void); +extern void memcg_put_cache_ids(void); /* * Helper macro to loop through all memcg-specific caches. Callers must still @@ -434,8 +436,6 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order); int memcg_cache_id(struct mem_cgroup *memcg); -void memcg_update_array_size(int num_groups); - struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep); void __memcg_kmem_put_cache(struct kmem_cache *cachep); @@ -569,6 +569,14 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg) return -1; } +static inline void memcg_get_cache_ids(void) +{ +} + +static inline void memcg_put_cache_ids(void) +{ +} + static inline struct kmem_cache * memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8608fa543b84..6706e5fa5ac0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -544,6 +544,19 @@ static void disarm_sock_keys(struct mem_cgroup *memcg) static DEFINE_IDA(memcg_cache_ida); int memcg_nr_cache_ids; +/* Protects memcg_nr_cache_ids */ +static DECLARE_RWSEM(memcg_cache_ids_sem); + +void memcg_get_cache_ids(void) +{ + down_read(&memcg_cache_ids_sem); +} + +void memcg_put_cache_ids(void) +{ + up_read(&memcg_cache_ids_sem); +} + /* * MIN_SIZE is different than 1, because we would like to avoid going through * the alloc/free process all the time. In a small machine, 4 kmem-limited @@ -2549,6 +2562,7 @@ static int memcg_alloc_cache_id(void) * There's no space for the new id in memcg_caches arrays, * so we have to grow them. */ + down_write(&memcg_cache_ids_sem); size = 2 * (id + 1); if (size < MEMCG_CACHES_MIN_SIZE) @@ -2557,6 +2571,11 @@ static int memcg_alloc_cache_id(void) size = MEMCG_CACHES_MAX_SIZE; err = memcg_update_all_caches(size); + if (!err) + memcg_nr_cache_ids = size; + + up_write(&memcg_cache_ids_sem); + if (err) { ida_simple_remove(&memcg_cache_ida, id); return err; @@ -2569,16 +2588,6 @@ static void memcg_free_cache_id(int id) ida_simple_remove(&memcg_cache_ida, id); } -/* - * We should update the current array size iff all caches updates succeed. This - * can only be done from the slab side. The slab mutex needs to be held when - * calling this. 
- */ -void memcg_update_array_size(int num) -{ - memcg_nr_cache_ids = num; -} - struct memcg_kmem_cache_create_work { struct mem_cgroup *memcg; struct kmem_cache *cachep; diff --git a/mm/slab_common.c b/mm/slab_common.c index f8899eedab68..23f5fcde6043 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -169,8 +169,8 @@ int memcg_update_all_caches(int num_memcgs) { struct kmem_cache *s; int ret = 0; - mutex_lock(&slab_mutex); + mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) { if (!is_root_cache(s)) continue; @@ -181,11 +181,8 @@ int memcg_update_all_caches(int num_memcgs) * up to this point in an updated state. */ if (ret) - goto out; + break; } - - memcg_update_array_size(num_memcgs); -out: mutex_unlock(&slab_mutex); return ret; } @@ -369,6 +366,7 @@ kmem_cache_create(const char *name, size_t size, size_t align, get_online_cpus(); get_online_mems(); + memcg_get_cache_ids(); mutex_lock(&slab_mutex); @@ -407,6 +405,7 @@ kmem_cache_create(const char *name, size_t size, size_t align, out_unlock: mutex_unlock(&slab_mutex); + memcg_put_cache_ids(); put_online_mems(); put_online_cpus(); -- cgit v1.2.3 From ff0b67ef5b1687692bc1fd3ce4bc3d1ff83587c7 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 12 Feb 2015 14:59:04 -0800 Subject: list_lru: get rid of ->active_nodes The active_nodes mask allows us to skip empty nodes when walking over list_lru items from all nodes in list_lru_count/walk. However, these functions are never called from hot paths, so it doesn't seem we need that kind of optimization there. OTOH, removing the mask will make it easier to make list_lru per-memcg. Signed-off-by: Vladimir Davydov Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Cc: Greg Thelen Cc: Glauber Costa Cc: Alexander Viro Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/list_lru.h | 5 ++--- mm/list_lru.c | 10 +++------- 2 files changed, 5 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index f500a2e39b13..53c1d6b78270 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -31,7 +31,6 @@ struct list_lru_node { struct list_lru { struct list_lru_node *node; - nodemask_t active_nodes; }; void list_lru_destroy(struct list_lru *lru); @@ -94,7 +93,7 @@ static inline unsigned long list_lru_count(struct list_lru *lru) long count = 0; int nid; - for_each_node_mask(nid, lru->active_nodes) + for_each_node_state(nid, N_NORMAL_MEMORY) count += list_lru_count_node(lru, nid); return count; @@ -142,7 +141,7 @@ list_lru_walk(struct list_lru *lru, list_lru_walk_cb isolate, long isolated = 0; int nid; - for_each_node_mask(nid, lru->active_nodes) { + for_each_node_state(nid, N_NORMAL_MEMORY) { isolated += list_lru_walk_node(lru, nid, isolate, cb_arg, &nr_to_walk); if (nr_to_walk <= 0) diff --git a/mm/list_lru.c b/mm/list_lru.c index f1a0db194173..07e198c77888 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -19,8 +19,7 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item) WARN_ON_ONCE(nlru->nr_items < 0); if (list_empty(item)) { list_add_tail(item, &nlru->list); - if (nlru->nr_items++ == 0) - node_set(nid, lru->active_nodes); + nlru->nr_items++; spin_unlock(&nlru->lock); return true; } @@ -37,8 +36,7 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item) spin_lock(&nlru->lock); if (!list_empty(item)) { list_del_init(item); - if (--nlru->nr_items == 0) -
node_clear(nid, lru->active_nodes); + nlru->nr_items--; WARN_ON_ONCE(nlru->nr_items < 0); spin_unlock(&nlru->lock); return true; @@ -90,8 +88,7 @@ restart: case LRU_REMOVED_RETRY: assert_spin_locked(&nlru->lock); case LRU_REMOVED: - if (--nlru->nr_items == 0) - node_clear(nid, lru->active_nodes); + nlru->nr_items--; WARN_ON_ONCE(nlru->nr_items < 0); isolated++; /* @@ -133,7 +130,6 @@ int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key) if (!lru->node) return -ENOMEM; - nodes_clear(lru->active_nodes); for (i = 0; i < nr_node_ids; i++) { spin_lock_init(&lru->node[i].lock); if (key) -- cgit v1.2.3 From c0a5b560938a0f2fd2fbf66ddc446c7c2b41383a Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 12 Feb 2015 14:59:07 -0800 Subject: list_lru: organize all list_lrus to list To make list_lru memcg aware, we need all list_lrus to be kept on a list protected by a mutex, so that we can sleep while walking over the list. Therefore after this change list_lru_destroy may sleep. Fortunately, there is only one user that calls it from an atomic context - it's put_super - and we can easily fix it by calling list_lru_destroy before put_super in deactivate_locked_super - anyway we no longer need the lrus by that time. Another point that should be noted is that list_lru_destroy is allowed to be called on an uninitialized zeroed-out object, in which case it is a no-op. Before this patch this was guaranteed by kfree, but now we need an explicit check there. Signed-off-by: Vladimir Davydov Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Cc: Greg Thelen Cc: Glauber Costa Cc: Alexander Viro Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/super.c | 8 ++++++++ include/linux/list_lru.h | 3 +++ mm/list_lru.c | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 45 insertions(+) (limited to 'include/linux') diff --git a/fs/super.c b/fs/super.c index a2b735a42e74..b027849d92d2 100644 --- a/fs/super.c +++ b/fs/super.c @@ -282,6 +282,14 @@ void deactivate_locked_super(struct super_block *s) unregister_shrinker(&s->s_shrink); fs->kill_sb(s); + /* + * Since list_lru_destroy() may sleep, we cannot call it from + * put_super(), where we hold the sb_lock. Therefore we destroy + * the lru lists right now.
+ */ + list_lru_destroy(&s->s_dentry_lru); + list_lru_destroy(&s->s_inode_lru); + put_filesystem(fs); put_super(s); } else { diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 53c1d6b78270..ee9486ac0621 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -31,6 +31,9 @@ struct list_lru_node { struct list_lru { struct list_lru_node *node; +#ifdef CONFIG_MEMCG_KMEM + struct list_head list; +#endif }; void list_lru_destroy(struct list_lru *lru); diff --git a/mm/list_lru.c b/mm/list_lru.c index 07e198c77888..a9021cb3ccde 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -9,6 +9,34 @@ #include #include #include +#include + +#ifdef CONFIG_MEMCG_KMEM +static LIST_HEAD(list_lrus); +static DEFINE_MUTEX(list_lrus_mutex); + +static void list_lru_register(struct list_lru *lru) +{ + mutex_lock(&list_lrus_mutex); + list_add(&lru->list, &list_lrus); + mutex_unlock(&list_lrus_mutex); +} + +static void list_lru_unregister(struct list_lru *lru) +{ + mutex_lock(&list_lrus_mutex); + list_del(&lru->list); + mutex_unlock(&list_lrus_mutex); +} +#else +static void list_lru_register(struct list_lru *lru) +{ +} + +static void list_lru_unregister(struct list_lru *lru) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ bool list_lru_add(struct list_lru *lru, struct list_head *item) { @@ -137,12 +165,18 @@ int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key) INIT_LIST_HEAD(&lru->node[i].list); lru->node[i].nr_items = 0; } + list_lru_register(lru); return 0; } EXPORT_SYMBOL_GPL(list_lru_init_key); void list_lru_destroy(struct list_lru *lru) { + /* Already destroyed or not yet initialized? */ + if (!lru->node) + return; + list_lru_unregister(lru); kfree(lru->node); + lru->node = NULL; } EXPORT_SYMBOL_GPL(list_lru_destroy); -- cgit v1.2.3 From 60d3fd32a7a9da4c8c93a9f89cfda22a0b4c65ce Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 12 Feb 2015 14:59:10 -0800 Subject: list_lru: introduce per-memcg lists There are several FS shrinkers, including super_block::s_shrink, that keep reclaimable objects in the list_lru structure. Hence, to turn them into memcg-aware shrinkers, it is enough to make list_lru per-memcg. This patch does the trick. It adds an array of lru lists to the list_lru_node structure (per-node part of the list_lru), one for each kmem-active memcg, and dispatches every item addition or removal to the list corresponding to the memcg which the item is accounted to. So now the list_lru structure is not just per node, but per node and per memcg. Not all list_lrus need this feature, so this patch also adds a new method, list_lru_init_memcg, which initializes a list_lru as memcg aware. Otherwise (i.e. if initialized with the old list_lru_init), the list_lru won't have per memcg lists. Just like the per-memcg caches arrays, the arrays of per-memcg lists are indexed by memcg_cache_id, so we must grow them whenever memcg_nr_cache_ids is increased. So we introduce a callback, memcg_update_all_list_lrus, invoked by memcg_alloc_cache_id if the id space is full. The locking is implemented in a manner similar to lruvecs, i.e. we have one lock per node that protects all lists (both global and per cgroup) on the node.
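The dispatch described above boils down to a lookup of the following shape (an illustrative condensation of the list_lru_from_memcg_idx()/list_lru_from_kmem() helpers in the diff below; lru_list_for() itself is a made-up name):

	static struct list_lru_one *
	lru_list_for(struct list_lru_node *nlru, struct mem_cgroup *memcg)
	{
		int idx = memcg_cache_id(memcg);	/* -1 for the root cgroup */

		/* memcg-aware lru and a real cgroup: use its private list */
		if (nlru->memcg_lrus && idx >= 0)
			return nlru->memcg_lrus->lru[idx];

		/* otherwise fall back to the node's global list */
		return &nlru->lru;
	}

The lookup, and any use of the list it returns, must happen under nlru->lock, because the memcg_lrus array may be relocated whenever the id space grows.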
Signed-off-by: Vladimir Davydov Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Cc: Greg Thelen Cc: Glauber Costa Cc: Alexander Viro Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/list_lru.h | 52 +++++-- include/linux/memcontrol.h | 14 ++ mm/list_lru.c | 374 ++++++++++++++++++++++++++++++++++++++++++--- mm/memcontrol.c | 20 +++ 4 files changed, 424 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index ee9486ac0621..305b598abac2 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -11,6 +11,8 @@ #include #include +struct mem_cgroup; + /* list_lru_walk_cb has to always return one of those */ enum lru_status { LRU_REMOVED, /* item removed from list */ @@ -22,11 +24,26 @@ enum lru_status { internally, but has to return locked. */ }; -struct list_lru_node { - spinlock_t lock; +struct list_lru_one { struct list_head list; /* kept as signed so we can catch imbalance bugs */ long nr_items; +}; + +struct list_lru_memcg { + /* array of per cgroup lists, indexed by memcg_cache_id */ + struct list_lru_one *lru[0]; +}; + +struct list_lru_node { + /* protects all lists on the node, including per cgroup */ + spinlock_t lock; + /* global list, used for the root cgroup in cgroup aware lrus */ + struct list_lru_one lru; +#ifdef CONFIG_MEMCG_KMEM + /* for cgroup aware lrus points to per cgroup lists, otherwise NULL */ + struct list_lru_memcg *memcg_lrus; +#endif } ____cacheline_aligned_in_smp; struct list_lru { @@ -37,11 +54,14 @@ struct list_lru { }; void list_lru_destroy(struct list_lru *lru); -int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key); -static inline int list_lru_init(struct list_lru *lru) -{ - return list_lru_init_key(lru, NULL); -} +int __list_lru_init(struct list_lru *lru, bool memcg_aware, + struct lock_class_key *key); + +#define list_lru_init(lru) __list_lru_init((lru), false, NULL) +#define list_lru_init_key(lru, key) __list_lru_init((lru), false, (key)) +#define list_lru_init_memcg(lru) __list_lru_init((lru), true, NULL) + +int memcg_update_all_list_lrus(int num_memcgs); /** * list_lru_add: add an element to the lru list's tail @@ -75,20 +95,23 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item); bool list_lru_del(struct list_lru *lru, struct list_head *item); /** - * list_lru_count_node: return the number of objects currently held by @lru + * list_lru_count_one: return the number of objects currently held by @lru * @lru: the lru pointer. * @nid: the node id to count from. + * @memcg: the cgroup to count from. * * Always return a non-negative number, 0 for empty lists. There is no * guarantee that the list is not updated while the count is being computed. * Callers that want such a guarantee need to provide an outer lock. 
*/ +unsigned long list_lru_count_one(struct list_lru *lru, + int nid, struct mem_cgroup *memcg); unsigned long list_lru_count_node(struct list_lru *lru, int nid); static inline unsigned long list_lru_shrink_count(struct list_lru *lru, struct shrink_control *sc) { - return list_lru_count_node(lru, sc->nid); + return list_lru_count_one(lru, sc->nid, sc->memcg); } static inline unsigned long list_lru_count(struct list_lru *lru) @@ -105,9 +128,10 @@ static inline unsigned long list_lru_count(struct list_lru *lru) typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item, spinlock_t *lock, void *cb_arg); /** - * list_lru_walk_node: walk a list_lru, isolating and disposing freeable items. + * list_lru_walk_one: walk a list_lru, isolating and disposing freeable items. * @lru: the lru pointer. * @nid: the node id to scan from. + * @memcg: the cgroup to scan from. * @isolate: callback function that is resposible for deciding what to do with * the item currently being scanned * @cb_arg: opaque type that will be passed to @isolate @@ -125,6 +149,10 @@ typedef enum lru_status * * Return value: the number of objects effectively removed from the LRU. */ +unsigned long list_lru_walk_one(struct list_lru *lru, + int nid, struct mem_cgroup *memcg, + list_lru_walk_cb isolate, void *cb_arg, + unsigned long *nr_to_walk); unsigned long list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk); @@ -133,8 +161,8 @@ static inline unsigned long list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc, list_lru_walk_cb isolate, void *cb_arg) { - return list_lru_walk_node(lru, sc->nid, isolate, cb_arg, - &sc->nr_to_scan); + return list_lru_walk_one(lru, sc->nid, sc->memcg, isolate, cb_arg, + &sc->nr_to_scan); } static inline unsigned long diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index dbc4baa3619c..72dff5fb0d0c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -439,6 +439,8 @@ int memcg_cache_id(struct mem_cgroup *memcg); struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep); void __memcg_kmem_put_cache(struct kmem_cache *cachep); +struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr); + int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, unsigned long nr_pages); void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages); @@ -535,6 +537,13 @@ static __always_inline void memcg_kmem_put_cache(struct kmem_cache *cachep) if (memcg_kmem_enabled()) __memcg_kmem_put_cache(cachep); } + +static __always_inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr) +{ + if (!memcg_kmem_enabled()) + return NULL; + return __mem_cgroup_from_kmem(ptr); +} #else #define for_each_memcg_cache_index(_idx) \ for (; NULL; ) @@ -586,6 +595,11 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) static inline void memcg_kmem_put_cache(struct kmem_cache *cachep) { } + +static inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr) +{ + return NULL; +} #endif /* CONFIG_MEMCG_KMEM */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/mm/list_lru.c b/mm/list_lru.c index a9021cb3ccde..79aee70c3b9d 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -10,6 +10,7 @@ #include #include #include +#include #ifdef CONFIG_MEMCG_KMEM static LIST_HEAD(list_lrus); @@ -38,16 +39,71 @@ static void list_lru_unregister(struct list_lru *lru) } #endif /* CONFIG_MEMCG_KMEM */ +#ifdef CONFIG_MEMCG_KMEM +static inline bool list_lru_memcg_aware(struct list_lru *lru) +{ + return 
!!lru->node[0].memcg_lrus; +} + +static inline struct list_lru_one * +list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx) +{ + /* + * The lock protects the array of per cgroup lists from relocation + * (see memcg_update_list_lru_node). + */ + lockdep_assert_held(&nlru->lock); + if (nlru->memcg_lrus && idx >= 0) + return nlru->memcg_lrus->lru[idx]; + + return &nlru->lru; +} + +static inline struct list_lru_one * +list_lru_from_kmem(struct list_lru_node *nlru, void *ptr) +{ + struct mem_cgroup *memcg; + + if (!nlru->memcg_lrus) + return &nlru->lru; + + memcg = mem_cgroup_from_kmem(ptr); + if (!memcg) + return &nlru->lru; + + return list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg)); +} +#else +static inline bool list_lru_memcg_aware(struct list_lru *lru) +{ + return false; +} + +static inline struct list_lru_one * +list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx) +{ + return &nlru->lru; +} + +static inline struct list_lru_one * +list_lru_from_kmem(struct list_lru_node *nlru, void *ptr) +{ + return &nlru->lru; +} +#endif /* CONFIG_MEMCG_KMEM */ + bool list_lru_add(struct list_lru *lru, struct list_head *item) { int nid = page_to_nid(virt_to_page(item)); struct list_lru_node *nlru = &lru->node[nid]; + struct list_lru_one *l; spin_lock(&nlru->lock); - WARN_ON_ONCE(nlru->nr_items < 0); + l = list_lru_from_kmem(nlru, item); + WARN_ON_ONCE(l->nr_items < 0); if (list_empty(item)) { - list_add_tail(item, &nlru->list); - nlru->nr_items++; + list_add_tail(item, &l->list); + l->nr_items++; spin_unlock(&nlru->lock); return true; } @@ -60,12 +116,14 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item) { int nid = page_to_nid(virt_to_page(item)); struct list_lru_node *nlru = &lru->node[nid]; + struct list_lru_one *l; spin_lock(&nlru->lock); + l = list_lru_from_kmem(nlru, item); if (!list_empty(item)) { list_del_init(item); - nlru->nr_items--; - WARN_ON_ONCE(nlru->nr_items < 0); + l->nr_items--; + WARN_ON_ONCE(l->nr_items < 0); spin_unlock(&nlru->lock); return true; } @@ -74,33 +132,58 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item) } EXPORT_SYMBOL_GPL(list_lru_del); -unsigned long -list_lru_count_node(struct list_lru *lru, int nid) +static unsigned long __list_lru_count_one(struct list_lru *lru, + int nid, int memcg_idx) { - unsigned long count = 0; struct list_lru_node *nlru = &lru->node[nid]; + struct list_lru_one *l; + unsigned long count; spin_lock(&nlru->lock); - WARN_ON_ONCE(nlru->nr_items < 0); - count += nlru->nr_items; + l = list_lru_from_memcg_idx(nlru, memcg_idx); + WARN_ON_ONCE(l->nr_items < 0); + count = l->nr_items; spin_unlock(&nlru->lock); return count; } + +unsigned long list_lru_count_one(struct list_lru *lru, + int nid, struct mem_cgroup *memcg) +{ + return __list_lru_count_one(lru, nid, memcg_cache_id(memcg)); +} +EXPORT_SYMBOL_GPL(list_lru_count_one); + +unsigned long list_lru_count_node(struct list_lru *lru, int nid) +{ + long count = 0; + int memcg_idx; + + count += __list_lru_count_one(lru, nid, -1); + if (list_lru_memcg_aware(lru)) { + for_each_memcg_cache_index(memcg_idx) + count += __list_lru_count_one(lru, nid, memcg_idx); + } + return count; +} EXPORT_SYMBOL_GPL(list_lru_count_node); -unsigned long -list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate, - void *cb_arg, unsigned long *nr_to_walk) +static unsigned long +__list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx, + list_lru_walk_cb isolate, void *cb_arg, + unsigned long *nr_to_walk) { - struct list_lru_node *nlru = &lru->node[nid]; 
+ struct list_lru_node *nlru = &lru->node[nid]; + struct list_lru_one *l; struct list_head *item, *n; unsigned long isolated = 0; spin_lock(&nlru->lock); + l = list_lru_from_memcg_idx(nlru, memcg_idx); restart: - list_for_each_safe(item, n, &nlru->list) { + list_for_each_safe(item, n, &l->list) { enum lru_status ret; /* @@ -116,8 +199,8 @@ restart: case LRU_REMOVED_RETRY: assert_spin_locked(&nlru->lock); case LRU_REMOVED: - nlru->nr_items--; - WARN_ON_ONCE(nlru->nr_items < 0); + l->nr_items--; + WARN_ON_ONCE(l->nr_items < 0); isolated++; /* * If the lru lock has been dropped, our list @@ -128,7 +211,7 @@ restart: goto restart; break; case LRU_ROTATE: - list_move_tail(item, &nlru->list); + list_move_tail(item, &l->list); break; case LRU_SKIP: break; @@ -147,36 +230,279 @@ restart: spin_unlock(&nlru->lock); return isolated; } + +unsigned long +list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, + list_lru_walk_cb isolate, void *cb_arg, + unsigned long *nr_to_walk) +{ + return __list_lru_walk_one(lru, nid, memcg_cache_id(memcg), + isolate, cb_arg, nr_to_walk); +} +EXPORT_SYMBOL_GPL(list_lru_walk_one); + +unsigned long list_lru_walk_node(struct list_lru *lru, int nid, + list_lru_walk_cb isolate, void *cb_arg, + unsigned long *nr_to_walk) +{ + long isolated = 0; + int memcg_idx; + + isolated += __list_lru_walk_one(lru, nid, -1, isolate, cb_arg, + nr_to_walk); + if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) { + for_each_memcg_cache_index(memcg_idx) { + isolated += __list_lru_walk_one(lru, nid, memcg_idx, + isolate, cb_arg, nr_to_walk); + if (*nr_to_walk <= 0) + break; + } + } + return isolated; +} EXPORT_SYMBOL_GPL(list_lru_walk_node); -int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key) +static void init_one_lru(struct list_lru_one *l) +{ + INIT_LIST_HEAD(&l->list); + l->nr_items = 0; +} + +#ifdef CONFIG_MEMCG_KMEM +static void __memcg_destroy_list_lru_node(struct list_lru_memcg *memcg_lrus, + int begin, int end) +{ + int i; + + for (i = begin; i < end; i++) + kfree(memcg_lrus->lru[i]); +} + +static int __memcg_init_list_lru_node(struct list_lru_memcg *memcg_lrus, + int begin, int end) +{ + int i; + + for (i = begin; i < end; i++) { + struct list_lru_one *l; + + l = kmalloc(sizeof(struct list_lru_one), GFP_KERNEL); + if (!l) + goto fail; + + init_one_lru(l); + memcg_lrus->lru[i] = l; + } + return 0; +fail: + __memcg_destroy_list_lru_node(memcg_lrus, begin, i - 1); + return -ENOMEM; +} + +static int memcg_init_list_lru_node(struct list_lru_node *nlru) +{ + int size = memcg_nr_cache_ids; + + nlru->memcg_lrus = kmalloc(size * sizeof(void *), GFP_KERNEL); + if (!nlru->memcg_lrus) + return -ENOMEM; + + if (__memcg_init_list_lru_node(nlru->memcg_lrus, 0, size)) { + kfree(nlru->memcg_lrus); + return -ENOMEM; + } + + return 0; +} + +static void memcg_destroy_list_lru_node(struct list_lru_node *nlru) +{ + __memcg_destroy_list_lru_node(nlru->memcg_lrus, 0, memcg_nr_cache_ids); + kfree(nlru->memcg_lrus); +} + +static int memcg_update_list_lru_node(struct list_lru_node *nlru, + int old_size, int new_size) +{ + struct list_lru_memcg *old, *new; + + BUG_ON(old_size > new_size); + + old = nlru->memcg_lrus; + new = kmalloc(new_size * sizeof(void *), GFP_KERNEL); + if (!new) + return -ENOMEM; + + if (__memcg_init_list_lru_node(new, old_size, new_size)) { + kfree(new); + return -ENOMEM; + } + + memcpy(new, old, old_size * sizeof(void *)); + + /* + * The lock guarantees that we won't race with a reader + * (see list_lru_from_memcg_idx). 
+ * + * Since list_lru_{add,del} may be called under an IRQ-safe lock, + * we have to use IRQ-safe primitives here to avoid deadlock. + */ + spin_lock_irq(&nlru->lock); + nlru->memcg_lrus = new; + spin_unlock_irq(&nlru->lock); + + kfree(old); + return 0; +} + +static void memcg_cancel_update_list_lru_node(struct list_lru_node *nlru, + int old_size, int new_size) +{ + /* do not bother shrinking the array back to the old size, because we + * cannot handle allocation failures here */ + __memcg_destroy_list_lru_node(nlru->memcg_lrus, old_size, new_size); +} + +static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) +{ + int i; + + for (i = 0; i < nr_node_ids; i++) { + if (!memcg_aware) + lru->node[i].memcg_lrus = NULL; + else if (memcg_init_list_lru_node(&lru->node[i])) + goto fail; + } + return 0; +fail: + for (i = i - 1; i >= 0; i--) + memcg_destroy_list_lru_node(&lru->node[i]); + return -ENOMEM; +} + +static void memcg_destroy_list_lru(struct list_lru *lru) +{ + int i; + + if (!list_lru_memcg_aware(lru)) + return; + + for (i = 0; i < nr_node_ids; i++) + memcg_destroy_list_lru_node(&lru->node[i]); +} + +static int memcg_update_list_lru(struct list_lru *lru, + int old_size, int new_size) +{ + int i; + + if (!list_lru_memcg_aware(lru)) + return 0; + + for (i = 0; i < nr_node_ids; i++) { + if (memcg_update_list_lru_node(&lru->node[i], + old_size, new_size)) + goto fail; + } + return 0; +fail: + for (i = i - 1; i >= 0; i--) + memcg_cancel_update_list_lru_node(&lru->node[i], + old_size, new_size); + return -ENOMEM; +} + +static void memcg_cancel_update_list_lru(struct list_lru *lru, + int old_size, int new_size) +{ + int i; + + if (!list_lru_memcg_aware(lru)) + return; + + for (i = 0; i < nr_node_ids; i++) + memcg_cancel_update_list_lru_node(&lru->node[i], + old_size, new_size); +} + +int memcg_update_all_list_lrus(int new_size) +{ + int ret = 0; + struct list_lru *lru; + int old_size = memcg_nr_cache_ids; + + mutex_lock(&list_lrus_mutex); + list_for_each_entry(lru, &list_lrus, list) { + ret = memcg_update_list_lru(lru, old_size, new_size); + if (ret) + goto fail; + } +out: + mutex_unlock(&list_lrus_mutex); + return ret; +fail: + list_for_each_entry_continue_reverse(lru, &list_lrus, list) + memcg_cancel_update_list_lru(lru, old_size, new_size); + goto out; +} +#else +static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) +{ + return 0; +} + +static void memcg_destroy_list_lru(struct list_lru *lru) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ + +int __list_lru_init(struct list_lru *lru, bool memcg_aware, + struct lock_class_key *key) { int i; size_t size = sizeof(*lru->node) * nr_node_ids; + int err = -ENOMEM; + + memcg_get_cache_ids(); lru->node = kzalloc(size, GFP_KERNEL); if (!lru->node) - return -ENOMEM; + goto out; for (i = 0; i < nr_node_ids; i++) { spin_lock_init(&lru->node[i].lock); if (key) lockdep_set_class(&lru->node[i].lock, key); - INIT_LIST_HEAD(&lru->node[i].list); - lru->node[i].nr_items = 0; + init_one_lru(&lru->node[i].lru); + } + + err = memcg_init_list_lru(lru, memcg_aware); + if (err) { + kfree(lru->node); + goto out; } + list_lru_register(lru); - return 0; +out: + memcg_put_cache_ids(); + return err; } -EXPORT_SYMBOL_GPL(list_lru_init_key); +EXPORT_SYMBOL_GPL(__list_lru_init); void list_lru_destroy(struct list_lru *lru) { /* Already destroyed or not yet initialized? 
*/ if (!lru->node) return; + + memcg_get_cache_ids(); + list_lru_unregister(lru); + + memcg_destroy_list_lru(lru); kfree(lru->node); lru->node = NULL; + + memcg_put_cache_ids(); } EXPORT_SYMBOL_GPL(list_lru_destroy); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6706e5fa5ac0..afa55bb38cbd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2571,6 +2571,8 @@ static int memcg_alloc_cache_id(void) size = MEMCG_CACHES_MAX_SIZE; err = memcg_update_all_caches(size); + if (!err) + err = memcg_update_all_list_lrus(size); if (!err) memcg_nr_cache_ids = size; @@ -2765,6 +2767,24 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order) memcg_uncharge_kmem(memcg, 1 << order); page->mem_cgroup = NULL; } + +struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr) +{ + struct mem_cgroup *memcg = NULL; + struct kmem_cache *cachep; + struct page *page; + + page = virt_to_head_page(ptr); + if (PageSlab(page)) { + cachep = page->slab_cache; + if (!is_root_cache(cachep)) + memcg = cachep->memcg_params->memcg; + } else + /* page allocated by alloc_kmem_pages */ + memcg = page->mem_cgroup; + + return memcg; +} #endif /* CONFIG_MEMCG_KMEM */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE -- cgit v1.2.3 From f7ce3190c4a35bf887adb7a1aa1ba899b679872d Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 12 Feb 2015 14:59:20 -0800 Subject: slab: embed memcg_cache_params to kmem_cache Currently, kmem_cache stores a pointer to struct memcg_cache_params instead of embedding it. The rationale is to save memory when kmem accounting is disabled. However, the memcg_cache_params has shrivelled drastically since it was first introduced: * Initially: struct memcg_cache_params { bool is_root_cache; union { struct kmem_cache *memcg_caches[0]; struct { struct mem_cgroup *memcg; struct list_head list; struct kmem_cache *root_cache; bool dead; atomic_t nr_pages; struct work_struct destroy; }; }; }; * Now: struct memcg_cache_params { bool is_root_cache; union { struct { struct rcu_head rcu_head; struct kmem_cache *memcg_caches[0]; }; struct { struct mem_cgroup *memcg; struct kmem_cache *root_cache; }; }; }; So the memory saving does not seem to be a clear win anymore. OTOH, keeping a pointer to memcg_cache_params struct instead of embedding it results in touching one more cache line on kmem alloc/free hot paths. Besides, it makes linking kmem caches in a list chained by a field of struct memcg_cache_params really painful due to a level of indirection, while I want to make them linked in the following patch. That said, let us embed it. 
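The hot-path cost referred to above is easiest to see side by side (an illustrative sketch, not code from this patch):

	/* before: kmem_cache holds a pointer - one extra dependent load */
	memcg = s->memcg_params->memcg;

	/* after: the params are embedded in kmem_cache itself */
	memcg = s->memcg_params.memcg;

The second form touches no cache line beyond the kmem_cache the allocator has already pulled in.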
Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Dave Chinner Cc: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 17 +++---- include/linux/slab_def.h | 2 +- include/linux/slub_def.h | 2 +- mm/memcontrol.c | 11 ++-- mm/slab.h | 48 +++++++++--------- mm/slab_common.c | 129 ++++++++++++++++++++++++++--------------------- mm/slub.c | 5 +- 7 files changed, 111 insertions(+), 103 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 2e3b448cfa2d..1e03c11bbfbd 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -473,14 +473,14 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) #ifndef ARCH_SLAB_MINALIGN #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) #endif + +struct memcg_cache_array { + struct rcu_head rcu; + struct kmem_cache *entries[0]; +}; + /* * This is the main placeholder for memcg-related information in kmem caches. - * struct kmem_cache will hold a pointer to it, so the memory cost while - * disabled is 1 pointer. The runtime cost while enabled, gets bigger than it - * would otherwise be if that would be bundled in kmem_cache: we'll need an - * extra pointer chase. But the trade off clearly lays in favor of not - * penalizing non-users. - * * Both the root cache and the child caches will have it. For the root cache, * this will hold a dynamically allocated array large enough to hold * information about the currently limited memcgs in the system. To allow the @@ -495,10 +495,7 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) struct memcg_cache_params { bool is_root_cache; union { - struct { - struct rcu_head rcu_head; - struct kmem_cache *memcg_caches[0]; - }; + struct memcg_cache_array __rcu *memcg_caches; struct { struct mem_cgroup *memcg; struct kmem_cache *root_cache; diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index b869d1662ba3..33d049066c3d 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -70,7 +70,7 @@ struct kmem_cache { int obj_offset; #endif /* CONFIG_DEBUG_SLAB */ #ifdef CONFIG_MEMCG_KMEM - struct memcg_cache_params *memcg_params; + struct memcg_cache_params memcg_params; #endif struct kmem_cache_node *node[MAX_NUMNODES]; diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index d82abd40a3c0..9abf04ed0999 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -85,7 +85,7 @@ struct kmem_cache { struct kobject kobj; /* For sysfs */ #endif #ifdef CONFIG_MEMCG_KMEM - struct memcg_cache_params *memcg_params; + struct memcg_cache_params memcg_params; int max_attr_size; /* for propagation, maximum size of a stored attr */ #ifdef CONFIG_SYSFS struct kset *memcg_kset; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index afa55bb38cbd..6f3c0fcd7a2d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -332,7 +332,7 @@ struct mem_cgroup { struct cg_proto tcp_mem; #endif #if defined(CONFIG_MEMCG_KMEM) - /* Index in the kmem_cache->memcg_params->memcg_caches array */ + /* Index in the kmem_cache->memcg_params.memcg_caches array */ int kmemcg_id; #endif @@ -531,7 +531,7 @@ static void disarm_sock_keys(struct mem_cgroup *memcg) #ifdef CONFIG_MEMCG_KMEM /* - * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. + * This will be the memcg's index in each cache's ->memcg_params.memcg_caches. 
* The main reason for not using cgroup id for this: * this works better in sparse environments, where we have a lot of memcgs, * but only a few kmem-limited. Or also, if we have, for instance, 200 @@ -2667,8 +2667,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) struct mem_cgroup *memcg; struct kmem_cache *memcg_cachep; - VM_BUG_ON(!cachep->memcg_params); - VM_BUG_ON(!cachep->memcg_params->is_root_cache); + VM_BUG_ON(!is_root_cache(cachep)); if (current->memcg_kmem_skip_account) return cachep; @@ -2702,7 +2701,7 @@ out: void __memcg_kmem_put_cache(struct kmem_cache *cachep) { if (!is_root_cache(cachep)) - css_put(&cachep->memcg_params->memcg->css); + css_put(&cachep->memcg_params.memcg->css); } /* @@ -2778,7 +2777,7 @@ struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr) if (PageSlab(page)) { cachep = page->slab_cache; if (!is_root_cache(cachep)) - memcg = cachep->memcg_params->memcg; + memcg = cachep->memcg_params.memcg; } else /* page allocated by alloc_kmem_pages */ memcg = page->mem_cgroup; diff --git a/mm/slab.h b/mm/slab.h index 90430d6f665e..53a623f85931 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -86,8 +86,6 @@ extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size, extern void create_boot_cache(struct kmem_cache *, const char *name, size_t size, unsigned long flags); -struct mem_cgroup; - int slab_unmergeable(struct kmem_cache *s); struct kmem_cache *find_mergeable(size_t size, size_t align, unsigned long flags, const char *name, void (*ctor)(void *)); @@ -167,14 +165,13 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, #ifdef CONFIG_MEMCG_KMEM static inline bool is_root_cache(struct kmem_cache *s) { - return !s->memcg_params || s->memcg_params->is_root_cache; + return s->memcg_params.is_root_cache; } static inline bool slab_equal_or_root(struct kmem_cache *s, - struct kmem_cache *p) + struct kmem_cache *p) { - return (p == s) || - (s->memcg_params && (p == s->memcg_params->root_cache)); + return p == s || p == s->memcg_params.root_cache; } /* @@ -185,37 +182,30 @@ static inline bool slab_equal_or_root(struct kmem_cache *s, static inline const char *cache_name(struct kmem_cache *s) { if (!is_root_cache(s)) - return s->memcg_params->root_cache->name; + s = s->memcg_params.root_cache; return s->name; } /* * Note, we protect with RCU only the memcg_caches array, not per-memcg caches. - * That said the caller must assure the memcg's cache won't go away. Since once - * created a memcg's cache is destroyed only along with the root cache, it is - * true if we are going to allocate from the cache or hold a reference to the - * root cache by other means. Otherwise, we should hold either the slab_mutex - * or the memcg's slab_caches_mutex while calling this function and accessing - * the returned value. + * That said the caller must assure the memcg's cache won't go away by either + * taking a css reference to the owner cgroup, or holding the slab_mutex. */ static inline struct kmem_cache * cache_from_memcg_idx(struct kmem_cache *s, int idx) { struct kmem_cache *cachep; - struct memcg_cache_params *params; - - if (!s->memcg_params) - return NULL; + struct memcg_cache_array *arr; rcu_read_lock(); - params = rcu_dereference(s->memcg_params); + arr = rcu_dereference(s->memcg_params.memcg_caches); /* * Make sure we will access the up-to-date value. The code updating * memcg_caches issues a write barrier to match this (see - * memcg_register_cache()). + * memcg_create_kmem_cache()). 
*/ - cachep = lockless_dereference(params->memcg_caches[idx]); + cachep = lockless_dereference(arr->entries[idx]); rcu_read_unlock(); return cachep; @@ -225,7 +215,7 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) { if (is_root_cache(s)) return s; - return s->memcg_params->root_cache; + return s->memcg_params.root_cache; } static __always_inline int memcg_charge_slab(struct kmem_cache *s, @@ -235,7 +225,7 @@ static __always_inline int memcg_charge_slab(struct kmem_cache *s, return 0; if (is_root_cache(s)) return 0; - return memcg_charge_kmem(s->memcg_params->memcg, gfp, 1 << order); + return memcg_charge_kmem(s->memcg_params.memcg, gfp, 1 << order); } static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) @@ -244,9 +234,13 @@ static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) return; if (is_root_cache(s)) return; - memcg_uncharge_kmem(s->memcg_params->memcg, 1 << order); + memcg_uncharge_kmem(s->memcg_params.memcg, 1 << order); } -#else + +extern void slab_init_memcg_params(struct kmem_cache *); + +#else /* !CONFIG_MEMCG_KMEM */ + static inline bool is_root_cache(struct kmem_cache *s) { return true; @@ -282,7 +276,11 @@ static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order) static inline void memcg_uncharge_slab(struct kmem_cache *s, int order) { } -#endif + +static inline void slab_init_memcg_params(struct kmem_cache *s) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) { diff --git a/mm/slab_common.c b/mm/slab_common.c index 23f5fcde6043..7cc32cf126ef 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -106,62 +106,66 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size) #endif #ifdef CONFIG_MEMCG_KMEM -static int memcg_alloc_cache_params(struct mem_cgroup *memcg, - struct kmem_cache *s, struct kmem_cache *root_cache) +void slab_init_memcg_params(struct kmem_cache *s) { - size_t size; + s->memcg_params.is_root_cache = true; + RCU_INIT_POINTER(s->memcg_params.memcg_caches, NULL); +} + +static int init_memcg_params(struct kmem_cache *s, + struct mem_cgroup *memcg, struct kmem_cache *root_cache) +{ + struct memcg_cache_array *arr; - if (!memcg_kmem_enabled()) + if (memcg) { + s->memcg_params.is_root_cache = false; + s->memcg_params.memcg = memcg; + s->memcg_params.root_cache = root_cache; return 0; + } - if (!memcg) { - size = offsetof(struct memcg_cache_params, memcg_caches); - size += memcg_nr_cache_ids * sizeof(void *); - } else - size = sizeof(struct memcg_cache_params); + slab_init_memcg_params(s); - s->memcg_params = kzalloc(size, GFP_KERNEL); - if (!s->memcg_params) - return -ENOMEM; + if (!memcg_nr_cache_ids) + return 0; - if (memcg) { - s->memcg_params->memcg = memcg; - s->memcg_params->root_cache = root_cache; - } else - s->memcg_params->is_root_cache = true; + arr = kzalloc(sizeof(struct memcg_cache_array) + + memcg_nr_cache_ids * sizeof(void *), + GFP_KERNEL); + if (!arr) + return -ENOMEM; + RCU_INIT_POINTER(s->memcg_params.memcg_caches, arr); return 0; } -static void memcg_free_cache_params(struct kmem_cache *s) +static void destroy_memcg_params(struct kmem_cache *s) { - kfree(s->memcg_params); + if (is_root_cache(s)) + kfree(rcu_access_pointer(s->memcg_params.memcg_caches)); } -static int memcg_update_cache_params(struct kmem_cache *s, int num_memcgs) +static int update_memcg_params(struct kmem_cache *s, int new_array_size) { - int size; - struct memcg_cache_params *new_params, 
*cur_params; + struct memcg_cache_array *old, *new; - BUG_ON(!is_root_cache(s)); - - size = offsetof(struct memcg_cache_params, memcg_caches); - size += num_memcgs * sizeof(void *); + if (!is_root_cache(s)) + return 0; - new_params = kzalloc(size, GFP_KERNEL); - if (!new_params) + new = kzalloc(sizeof(struct memcg_cache_array) + + new_array_size * sizeof(void *), GFP_KERNEL); + if (!new) return -ENOMEM; - cur_params = s->memcg_params; - memcpy(new_params->memcg_caches, cur_params->memcg_caches, - memcg_nr_cache_ids * sizeof(void *)); - - new_params->is_root_cache = true; - - rcu_assign_pointer(s->memcg_params, new_params); - if (cur_params) - kfree_rcu(cur_params, rcu_head); + old = rcu_dereference_protected(s->memcg_params.memcg_caches, + lockdep_is_held(&slab_mutex)); + if (old) + memcpy(new->entries, old->entries, + memcg_nr_cache_ids * sizeof(void *)); + rcu_assign_pointer(s->memcg_params.memcg_caches, new); + if (old) + kfree_rcu(old, rcu); return 0; } @@ -172,10 +176,7 @@ int memcg_update_all_caches(int num_memcgs) mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) { - if (!is_root_cache(s)) - continue; - - ret = memcg_update_cache_params(s, num_memcgs); + ret = update_memcg_params(s, num_memcgs); /* * Instead of freeing the memory, we'll just leave the caches * up to this point in an updated state. @@ -187,13 +188,13 @@ int memcg_update_all_caches(int num_memcgs) return ret; } #else -static inline int memcg_alloc_cache_params(struct mem_cgroup *memcg, - struct kmem_cache *s, struct kmem_cache *root_cache) +static inline int init_memcg_params(struct kmem_cache *s, + struct mem_cgroup *memcg, struct kmem_cache *root_cache) { return 0; } -static inline void memcg_free_cache_params(struct kmem_cache *s) +static inline void destroy_memcg_params(struct kmem_cache *s) { } #endif /* CONFIG_MEMCG_KMEM */ @@ -311,7 +312,7 @@ do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align, s->align = align; s->ctor = ctor; - err = memcg_alloc_cache_params(memcg, s, root_cache); + err = init_memcg_params(s, memcg, root_cache); if (err) goto out_free_cache; @@ -327,7 +328,7 @@ out: return s; out_free_cache: - memcg_free_cache_params(s); + destroy_memcg_params(s); kmem_cache_free(kmem_cache, s); goto out; } @@ -439,11 +440,15 @@ static int do_kmem_cache_shutdown(struct kmem_cache *s, #ifdef CONFIG_MEMCG_KMEM if (!is_root_cache(s)) { - struct kmem_cache *root_cache = s->memcg_params->root_cache; - int memcg_id = memcg_cache_id(s->memcg_params->memcg); - - BUG_ON(root_cache->memcg_params->memcg_caches[memcg_id] != s); - root_cache->memcg_params->memcg_caches[memcg_id] = NULL; + int idx; + struct memcg_cache_array *arr; + + idx = memcg_cache_id(s->memcg_params.memcg); + arr = rcu_dereference_protected(s->memcg_params.root_cache-> + memcg_params.memcg_caches, + lockdep_is_held(&slab_mutex)); + BUG_ON(arr->entries[idx] != s); + arr->entries[idx] = NULL; } #endif list_move(&s->list, release); @@ -481,27 +486,32 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, struct kmem_cache *root_cache) { static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */ - int memcg_id = memcg_cache_id(memcg); + struct memcg_cache_array *arr; struct kmem_cache *s = NULL; char *cache_name; + int idx; get_online_cpus(); get_online_mems(); mutex_lock(&slab_mutex); + idx = memcg_cache_id(memcg); + arr = rcu_dereference_protected(root_cache->memcg_params.memcg_caches, + lockdep_is_held(&slab_mutex)); + /* * Since per-memcg caches are created asynchronously on first * 
allocation (see memcg_kmem_get_cache()), several threads can try to * create the same cache, but only one of them may succeed. */ - if (cache_from_memcg_idx(root_cache, memcg_id)) + if (arr->entries[idx]) goto out_unlock; cgroup_name(mem_cgroup_css(memcg)->cgroup, memcg_name_buf, sizeof(memcg_name_buf)); cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, - memcg_cache_id(memcg), memcg_name_buf); + idx, memcg_name_buf); if (!cache_name) goto out_unlock; @@ -525,7 +535,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, * initialized. */ smp_wmb(); - root_cache->memcg_params->memcg_caches[memcg_id] = s; + arr->entries[idx] = s; out_unlock: mutex_unlock(&slab_mutex); @@ -545,7 +555,7 @@ void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) mutex_lock(&slab_mutex); list_for_each_entry_safe(s, s2, &slab_caches, list) { - if (is_root_cache(s) || s->memcg_params->memcg != memcg) + if (is_root_cache(s) || s->memcg_params.memcg != memcg) continue; /* * The cgroup is about to be freed and therefore has no charges @@ -564,7 +574,7 @@ void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) void slab_kmem_cache_release(struct kmem_cache *s) { - memcg_free_cache_params(s); + destroy_memcg_params(s); kfree(s->name); kmem_cache_free(kmem_cache, s); } @@ -640,6 +650,9 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t siz s->name = name; s->size = s->object_size = size; s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size); + + slab_init_memcg_params(s); + err = __kmem_cache_create(s, flags); if (err) @@ -980,7 +993,7 @@ int memcg_slab_show(struct seq_file *m, void *p) if (p == slab_caches.next) print_slabinfo_header(m); - if (!is_root_cache(s) && s->memcg_params->memcg == memcg) + if (!is_root_cache(s) && s->memcg_params.memcg == memcg) cache_show(s, m); return 0; } diff --git a/mm/slub.c b/mm/slub.c index 8b8508adf9c2..75d55fdfe3a1 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3577,6 +3577,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) p->slab_cache = s; #endif } + slab_init_memcg_params(s); list_add(&s->list, &slab_caches); return s; } @@ -4964,7 +4965,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) if (is_root_cache(s)) return; - root_cache = s->memcg_params->root_cache; + root_cache = s->memcg_params.root_cache; /* * This mean this cache had no attribute written. Therefore, no point @@ -5044,7 +5045,7 @@ static inline struct kset *cache_kset(struct kmem_cache *s) { #ifdef CONFIG_MEMCG_KMEM if (!is_root_cache(s)) - return s->memcg_params->root_cache->memcg_kset; + return s->memcg_params.root_cache->memcg_kset; #endif return slab_kset; } -- cgit v1.2.3 From 426589f571f7d6d5ab2ca33ece73164149279ca1 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 12 Feb 2015 14:59:23 -0800 Subject: slab: link memcg caches of the same kind into a list Sometimes, we need to iterate over all memcg copies of a particular root kmem cache. Currently, we use memcg_cache_params->memcg_caches array for that, because it contains all existing memcg caches. However, it is bad practice to keep all caches, including those that belong to offline cgroups, in this array, because it would then grow without bound. I'm going to wipe away dead caches from it to save space. To still be able to perform iterations over all memcg caches of the same kind, let us link them into a list.
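With the list in place, visiting every memcg copy of a root cache becomes a plain list walk instead of a scan over a sparse, ever-growing array. For illustration (a sketch assuming the for_each_memcg_cache() iteration added below; walk_memcg_caches() itself is made up):

	static void walk_memcg_caches(struct kmem_cache *root)
	{
		struct kmem_cache *c;

		/* the list is protected by slab_mutex */
		lockdep_assert_held(&slab_mutex);
		list_for_each_entry(c, &root->memcg_params.list,
				    memcg_params.list)
			pr_debug("memcg cache: %s\n", c->name);
	}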
Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 4 ++++ mm/slab.c | 13 +++++-------- mm/slab.h | 17 +++++++++++++++++ mm/slab_common.c | 21 ++++++++++----------- mm/slub.c | 19 +++++-------------- 5 files changed, 41 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 1e03c11bbfbd..26d99f41b410 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -491,9 +491,13 @@ struct memcg_cache_array { * * @memcg: pointer to the memcg this cache belongs to * @root_cache: pointer to the global, root cache, this cache was derived from + * + * Both root and child caches of the same kind are linked into a list chained + * through @list. */ struct memcg_cache_params { bool is_root_cache; + struct list_head list; union { struct memcg_cache_array __rcu *memcg_caches; struct { diff --git a/mm/slab.c b/mm/slab.c index 65b5dcb6f671..7894017bc160 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3708,8 +3708,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount, int shared, gfp_t gfp) { int ret; - struct kmem_cache *c = NULL; - int i = 0; + struct kmem_cache *c; ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp); @@ -3719,12 +3718,10 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, if ((ret < 0) || !is_root_cache(cachep)) return ret; - VM_BUG_ON(!mutex_is_locked(&slab_mutex)); - for_each_memcg_cache_index(i) { - c = cache_from_memcg_idx(cachep, i); - if (c) - /* return value determined by the parent cache only */ - __do_tune_cpucache(c, limit, batchcount, shared, gfp); + lockdep_assert_held(&slab_mutex); + for_each_memcg_cache(c, cachep) { + /* return value determined by the root cache only */ + __do_tune_cpucache(c, limit, batchcount, shared, gfp); } return ret; diff --git a/mm/slab.h b/mm/slab.h index 53a623f85931..0a56d76ac0e9 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -163,6 +163,18 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos); #ifdef CONFIG_MEMCG_KMEM +/* + * Iterate over all memcg caches of the given root cache. The caller must hold + * slab_mutex. 
+ */ +#define for_each_memcg_cache(iter, root) \ + list_for_each_entry(iter, &(root)->memcg_params.list, \ + memcg_params.list) + +#define for_each_memcg_cache_safe(iter, tmp, root) \ + list_for_each_entry_safe(iter, tmp, &(root)->memcg_params.list, \ + memcg_params.list) + static inline bool is_root_cache(struct kmem_cache *s) { return s->memcg_params.is_root_cache; @@ -241,6 +253,11 @@ extern void slab_init_memcg_params(struct kmem_cache *); #else /* !CONFIG_MEMCG_KMEM */ +#define for_each_memcg_cache(iter, root) \ + for ((void)(iter), (void)(root); 0; ) +#define for_each_memcg_cache_safe(iter, tmp, root) \ + for ((void)(iter), (void)(tmp), (void)(root); 0; ) + static inline bool is_root_cache(struct kmem_cache *s) { return true; diff --git a/mm/slab_common.c b/mm/slab_common.c index 7cc32cf126ef..989784bd88be 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -109,6 +109,7 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size) void slab_init_memcg_params(struct kmem_cache *s) { s->memcg_params.is_root_cache = true; + INIT_LIST_HEAD(&s->memcg_params.list); RCU_INIT_POINTER(s->memcg_params.memcg_caches, NULL); } @@ -449,6 +450,7 @@ static int do_kmem_cache_shutdown(struct kmem_cache *s, lockdep_is_held(&slab_mutex)); BUG_ON(arr->entries[idx] != s); arr->entries[idx] = NULL; + list_del(&s->memcg_params.list); } #endif list_move(&s->list, release); @@ -529,6 +531,8 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, goto out_unlock; } + list_add(&s->memcg_params.list, &root_cache->memcg_params.list); + /* * Since readers won't lock (see cache_from_memcg_idx()), we need a * barrier here to ensure nobody will see the kmem_cache partially @@ -581,11 +585,13 @@ void slab_kmem_cache_release(struct kmem_cache *s) void kmem_cache_destroy(struct kmem_cache *s) { - int i; + struct kmem_cache *c, *c2; LIST_HEAD(release); bool need_rcu_barrier = false; bool busy = false; + BUG_ON(!is_root_cache(s)); + get_online_cpus(); get_online_mems(); @@ -595,10 +601,8 @@ void kmem_cache_destroy(struct kmem_cache *s) if (s->refcount) goto out_unlock; - for_each_memcg_cache_index(i) { - struct kmem_cache *c = cache_from_memcg_idx(s, i); - - if (c && do_kmem_cache_shutdown(c, &release, &need_rcu_barrier)) + for_each_memcg_cache_safe(c, c2, s) { + if (do_kmem_cache_shutdown(c, &release, &need_rcu_barrier)) busy = true; } @@ -932,16 +936,11 @@ memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info) { struct kmem_cache *c; struct slabinfo sinfo; - int i; if (!is_root_cache(s)) return; - for_each_memcg_cache_index(i) { - c = cache_from_memcg_idx(s, i); - if (!c) - continue; - + for_each_memcg_cache(c, s) { memset(&sinfo, 0, sizeof(sinfo)); get_slabinfo(c, &sinfo); diff --git a/mm/slub.c b/mm/slub.c index 75d55fdfe3a1..1e5a4636cb23 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3636,13 +3636,10 @@ struct kmem_cache * __kmem_cache_alias(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)) { - struct kmem_cache *s; + struct kmem_cache *s, *c; s = find_mergeable(size, align, flags, name, ctor); if (s) { - int i; - struct kmem_cache *c; - s->refcount++; /* @@ -3652,10 +3649,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, s->object_size = max(s->object_size, (int)size); s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); - for_each_memcg_cache_index(i) { - c = cache_from_memcg_idx(s, i); - if (!c) - continue; + for_each_memcg_cache(c, s) { c->object_size = s->object_size; c->inuse = max_t(int, c->inuse, ALIGN(size, 
sizeof(void *))); @@ -4921,7 +4915,7 @@ static ssize_t slab_attr_store(struct kobject *kobj, err = attribute->store(s, buf, len); #ifdef CONFIG_MEMCG_KMEM if (slab_state >= FULL && err >= 0 && is_root_cache(s)) { - int i; + struct kmem_cache *c; mutex_lock(&slab_mutex); if (s->max_attr_size < len) @@ -4944,11 +4938,8 @@ static ssize_t slab_attr_store(struct kobject *kobj, * directly either failed or succeeded, in which case we loop * through the descendants with best-effort propagation. */ - for_each_memcg_cache_index(i) { - struct kmem_cache *c = cache_from_memcg_idx(s, i); - if (c) - attribute->store(c, buf, len); - } + for_each_memcg_cache(c, s) + attribute->store(c, buf, len); mutex_unlock(&slab_mutex); } #endif -- cgit v1.2.3 From 2a4db7eb9391a544ff58f4fa11d35246e87c87af Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 12 Feb 2015 14:59:32 -0800 Subject: memcg: free memcg_caches slot on css offline We need to look up a kmem_cache in ->memcg_params.memcg_caches arrays only on allocations, so there is no need to have the array entries set until css free - we can clear them on css offline. This will allow us to reuse array entries more efficiently and avoid costly array relocations. Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 10 +++++----- mm/memcontrol.c | 38 ++++++++++++++++++++++++++++++++------ mm/slab_common.c | 39 ++++++++++++++++++++++++++++----------- 3 files changed, 65 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 26d99f41b410..ed2ffaab59ea 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -115,13 +115,12 @@ int slab_is_available(void); struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, unsigned long, void (*)(void *)); -#ifdef CONFIG_MEMCG_KMEM -void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *); -void memcg_destroy_kmem_caches(struct mem_cgroup *); -#endif void kmem_cache_destroy(struct kmem_cache *); int kmem_cache_shrink(struct kmem_cache *); -void kmem_cache_free(struct kmem_cache *, void *); + +void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *); +void memcg_deactivate_kmem_caches(struct mem_cgroup *); +void memcg_destroy_kmem_caches(struct mem_cgroup *); /* * Please use this macro to create slab caches. 
Simply specify the @@ -288,6 +287,7 @@ static __always_inline int kmalloc_index(size_t size) void *__kmalloc(size_t size, gfp_t flags); void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags); +void kmem_cache_free(struct kmem_cache *, void *); #ifdef CONFIG_NUMA void *__kmalloc_node(size_t size, gfp_t flags, int node); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6f3c0fcd7a2d..abfe0135bfdc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -334,6 +334,7 @@ struct mem_cgroup { #if defined(CONFIG_MEMCG_KMEM) /* Index in the kmem_cache->memcg_params.memcg_caches array */ int kmemcg_id; + bool kmem_acct_active; #endif int last_scanned_node; @@ -354,7 +355,7 @@ struct mem_cgroup { #ifdef CONFIG_MEMCG_KMEM bool memcg_kmem_is_active(struct mem_cgroup *memcg) { - return memcg->kmemcg_id >= 0; + return memcg->kmem_acct_active; } #endif @@ -585,7 +586,7 @@ static void memcg_free_cache_id(int id); static void disarm_kmem_keys(struct mem_cgroup *memcg) { - if (memcg_kmem_is_active(memcg)) { + if (memcg->kmemcg_id >= 0) { static_key_slow_dec(&memcg_kmem_enabled_key); memcg_free_cache_id(memcg->kmemcg_id); } @@ -2666,6 +2667,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) { struct mem_cgroup *memcg; struct kmem_cache *memcg_cachep; + int kmemcg_id; VM_BUG_ON(!is_root_cache(cachep)); @@ -2673,10 +2675,11 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) return cachep; memcg = get_mem_cgroup_from_mm(current->mm); - if (!memcg_kmem_is_active(memcg)) + kmemcg_id = ACCESS_ONCE(memcg->kmemcg_id); + if (kmemcg_id < 0) goto out; - memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); + memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id); if (likely(memcg_cachep)) return memcg_cachep; @@ -3318,8 +3321,8 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg, int err = 0; int memcg_id; - if (memcg_kmem_is_active(memcg)) - return 0; + BUG_ON(memcg->kmemcg_id >= 0); + BUG_ON(memcg->kmem_acct_active); /* * For simplicity, we won't allow this to be disabled. It also can't @@ -3362,6 +3365,7 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg, * patched. */ memcg->kmemcg_id = memcg_id; + memcg->kmem_acct_active = true; out: return err; } @@ -4041,6 +4045,22 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) return mem_cgroup_sockets_init(memcg, ss); } +static void memcg_deactivate_kmem(struct mem_cgroup *memcg) +{ + if (!memcg->kmem_acct_active) + return; + + /* + * Clear the 'active' flag before clearing memcg_caches arrays entries. + * Since we take the slab_mutex in memcg_deactivate_kmem_caches(), it + * guarantees no cache will be created for this cgroup after we are + * done (see memcg_create_kmem_cache()). 
+ */ + memcg->kmem_acct_active = false; + + memcg_deactivate_kmem_caches(memcg); +} + static void memcg_destroy_kmem(struct mem_cgroup *memcg) { memcg_destroy_kmem_caches(memcg); @@ -4052,6 +4072,10 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) return 0; } +static void memcg_deactivate_kmem(struct mem_cgroup *memcg) +{ +} + static void memcg_destroy_kmem(struct mem_cgroup *memcg) { } @@ -4608,6 +4632,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) spin_unlock(&memcg->event_list_lock); vmpressure_cleanup(&memcg->vmpressure); + + memcg_deactivate_kmem(memcg); } static void mem_cgroup_css_free(struct cgroup_subsys_state *css) diff --git a/mm/slab_common.c b/mm/slab_common.c index 6087b1f9a385..0873bcc61c7a 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -440,18 +440,8 @@ static int do_kmem_cache_shutdown(struct kmem_cache *s, *need_rcu_barrier = true; #ifdef CONFIG_MEMCG_KMEM - if (!is_root_cache(s)) { - int idx; - struct memcg_cache_array *arr; - - idx = memcg_cache_id(s->memcg_params.memcg); - arr = rcu_dereference_protected(s->memcg_params.root_cache-> - memcg_params.memcg_caches, - lockdep_is_held(&slab_mutex)); - BUG_ON(arr->entries[idx] != s); - arr->entries[idx] = NULL; + if (!is_root_cache(s)) list_del(&s->memcg_params.list); - } #endif list_move(&s->list, release); return 0; @@ -499,6 +489,13 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, mutex_lock(&slab_mutex); + /* + * The memory cgroup could have been deactivated while the cache + * creation work was pending. + */ + if (!memcg_kmem_is_active(memcg)) + goto out_unlock; + idx = memcg_cache_id(memcg); arr = rcu_dereference_protected(root_cache->memcg_params.memcg_caches, lockdep_is_held(&slab_mutex)); @@ -548,6 +545,26 @@ out_unlock: put_online_cpus(); } +void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) +{ + int idx; + struct memcg_cache_array *arr; + struct kmem_cache *s; + + idx = memcg_cache_id(memcg); + + mutex_lock(&slab_mutex); + list_for_each_entry(s, &slab_caches, list) { + if (!is_root_cache(s)) + continue; + + arr = rcu_dereference_protected(s->memcg_params.memcg_caches, + lockdep_is_held(&slab_mutex)); + arr->entries[idx] = NULL; + } + mutex_unlock(&slab_mutex); +} + void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) { LIST_HEAD(release); -- cgit v1.2.3 From 3f97b163207c67a3b35931494ad3db1de66356f0 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 12 Feb 2015 14:59:35 -0800 Subject: list_lru: add helpers to isolate items Currently, the isolate callback passed to the list_lru_walk family of functions is supposed to just delete an item from the list upon returning LRU_REMOVED or LRU_REMOVED_RETRY, while nr_items counter is fixed by __list_lru_walk_one after the callback returns. Since the callback is allowed to drop the lock after removing an item (it has to return LRU_REMOVED_RETRY then), the nr_items can be less than the actual number of elements on the list even if we check them under the lock. This makes it difficult to move items from one list_lru_one to another, which is required for per-memcg list_lru reparenting - we can't just splice the lists, we have to move entries one by one. This patch therefore introduces helpers that must be used by callback functions to isolate items instead of raw list_del/list_move. These are list_lru_isolate and list_lru_isolate_move. 
They not only remove the entry from the list, but also fix the nr_items counter, making sure nr_items always reflects the actual number of elements on the list if checked under the appropriate lock. Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dcache.c | 21 +++++++++++---------- fs/gfs2/quota.c | 5 +++-- fs/inode.c | 8 ++++---- fs/xfs/xfs_buf.c | 6 ++++-- fs/xfs/xfs_qm.c | 5 +++-- include/linux/list_lru.h | 9 +++++++-- mm/list_lru.c | 19 ++++++++++++++++--- mm/workingset.c | 3 ++- 8 files changed, 50 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/fs/dcache.c b/fs/dcache.c index 56c5da89f58a..d04be762b216 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -400,19 +400,20 @@ static void d_shrink_add(struct dentry *dentry, struct list_head *list) * LRU lists entirely, while shrink_move moves it to the indicated * private list. */ -static void d_lru_isolate(struct dentry *dentry) +static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry) { D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags &= ~DCACHE_LRU_LIST; this_cpu_dec(nr_dentry_unused); - list_del_init(&dentry->d_lru); + list_lru_isolate(lru, &dentry->d_lru); } -static void d_lru_shrink_move(struct dentry *dentry, struct list_head *list) +static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry, + struct list_head *list) { D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags |= DCACHE_SHRINK_LIST; - list_move_tail(&dentry->d_lru, list); + list_lru_isolate_move(lru, &dentry->d_lru, list); } /* @@ -869,8 +870,8 @@ static void shrink_dentry_list(struct list_head *list) } } -static enum lru_status -dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) +static enum lru_status dentry_lru_isolate(struct list_head *item, + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) { struct list_head *freeable = arg; struct dentry *dentry = container_of(item, struct dentry, d_lru); @@ -890,7 +891,7 @@ dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) * another pass through the LRU. 
*/ if (dentry->d_lockref.count) { - d_lru_isolate(dentry); + d_lru_isolate(lru, dentry); spin_unlock(&dentry->d_lock); return LRU_REMOVED; } @@ -921,7 +922,7 @@ dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) return LRU_ROTATE; } - d_lru_shrink_move(dentry, freeable); + d_lru_shrink_move(lru, dentry, freeable); spin_unlock(&dentry->d_lock); return LRU_REMOVED; @@ -951,7 +952,7 @@ long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc) } static enum lru_status dentry_lru_isolate_shrink(struct list_head *item, - spinlock_t *lru_lock, void *arg) + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) { struct list_head *freeable = arg; struct dentry *dentry = container_of(item, struct dentry, d_lru); @@ -964,7 +965,7 @@ static enum lru_status dentry_lru_isolate_shrink(struct list_head *item, if (!spin_trylock(&dentry->d_lock)) return LRU_SKIP; - d_lru_shrink_move(dentry, freeable); + d_lru_shrink_move(lru, dentry, freeable); spin_unlock(&dentry->d_lock); return LRU_REMOVED; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index c15d6b216d0b..3aa17d4d1cfc 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -145,7 +145,8 @@ static void gfs2_qd_dispose(struct list_head *list) } -static enum lru_status gfs2_qd_isolate(struct list_head *item, spinlock_t *lock, void *arg) +static enum lru_status gfs2_qd_isolate(struct list_head *item, + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) { struct list_head *dispose = arg; struct gfs2_quota_data *qd = list_entry(item, struct gfs2_quota_data, qd_lru); @@ -155,7 +156,7 @@ static enum lru_status gfs2_qd_isolate(struct list_head *item, spinlock_t *lock, if (qd->qd_lockref.count == 0) { lockref_mark_dead(&qd->qd_lockref); - list_move(&qd->qd_lru, dispose); + list_lru_isolate_move(lru, &qd->qd_lru, dispose); } spin_unlock(&qd->qd_lockref.lock); diff --git a/fs/inode.c b/fs/inode.c index 524a32c2b0c6..86c612b92c6f 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -685,8 +685,8 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) * LRU does not have strict ordering. Hence we don't want to reclaim inodes * with this flag set because they are the inodes that are out of order. 
*/ -static enum lru_status -inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) +static enum lru_status inode_lru_isolate(struct list_head *item, + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) { struct list_head *freeable = arg; struct inode *inode = container_of(item, struct inode, i_lru); @@ -704,7 +704,7 @@ inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) */ if (atomic_read(&inode->i_count) || (inode->i_state & ~I_REFERENCED)) { - list_del_init(&inode->i_lru); + list_lru_isolate(lru, &inode->i_lru); spin_unlock(&inode->i_lock); this_cpu_dec(nr_unused); return LRU_REMOVED; @@ -738,7 +738,7 @@ inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; - list_move(&inode->i_lru, freeable); + list_lru_isolate_move(lru, &inode->i_lru, freeable); spin_unlock(&inode->i_lock); this_cpu_dec(nr_unused); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 15c9d224c721..1790b00bea7a 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1488,6 +1488,7 @@ xfs_buf_iomove( static enum lru_status xfs_buftarg_wait_rele( struct list_head *item, + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) @@ -1509,7 +1510,7 @@ xfs_buftarg_wait_rele( */ atomic_set(&bp->b_lru_ref, 0); bp->b_state |= XFS_BSTATE_DISPOSE; - list_move(item, dispose); + list_lru_isolate_move(lru, item, dispose); spin_unlock(&bp->b_lock); return LRU_REMOVED; } @@ -1546,6 +1547,7 @@ xfs_wait_buftarg( static enum lru_status xfs_buftarg_isolate( struct list_head *item, + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) { @@ -1569,7 +1571,7 @@ xfs_buftarg_isolate( } bp->b_state |= XFS_BSTATE_DISPOSE; - list_move(item, dispose); + list_lru_isolate_move(lru, item, dispose); spin_unlock(&bp->b_lock); return LRU_REMOVED; } diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 4f4b1274e144..53cc2aaf8d2b 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -430,6 +430,7 @@ struct xfs_qm_isolate { static enum lru_status xfs_qm_dquot_isolate( struct list_head *item, + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) __releases(lru_lock) __acquires(lru_lock) @@ -450,7 +451,7 @@ xfs_qm_dquot_isolate( XFS_STATS_INC(xs_qm_dqwants); trace_xfs_dqreclaim_want(dqp); - list_del_init(&dqp->q_lru); + list_lru_isolate(lru, &dqp->q_lru); XFS_STATS_DEC(xs_qm_dquot_unused); return LRU_REMOVED; } @@ -494,7 +495,7 @@ xfs_qm_dquot_isolate( xfs_dqunlock(dqp); ASSERT(dqp->q_nrefs == 0); - list_move_tail(&dqp->q_lru, &isol->dispose); + list_lru_isolate_move(lru, &dqp->q_lru, &isol->dispose); XFS_STATS_DEC(xs_qm_dquot_unused); trace_xfs_dqreclaim_done(dqp); XFS_STATS_INC(xs_qm_dqreclaims); diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 305b598abac2..7edf9c9ab9eb 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -125,8 +125,13 @@ static inline unsigned long list_lru_count(struct list_lru *lru) return count; } -typedef enum lru_status -(*list_lru_walk_cb)(struct list_head *item, spinlock_t *lock, void *cb_arg); +void list_lru_isolate(struct list_lru_one *list, struct list_head *item); +void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, + struct list_head *head); + +typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item, + struct list_lru_one *list, spinlock_t *lock, void *cb_arg); + /** * list_lru_walk_one: walk a list_lru, isolating and disposing freeable items. * @lru: the lru pointer. 
diff --git a/mm/list_lru.c b/mm/list_lru.c index 79aee70c3b9d..8d9d168c6c38 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -132,6 +132,21 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item) } EXPORT_SYMBOL_GPL(list_lru_del); +void list_lru_isolate(struct list_lru_one *list, struct list_head *item) +{ + list_del_init(item); + list->nr_items--; +} +EXPORT_SYMBOL_GPL(list_lru_isolate); + +void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, + struct list_head *head) +{ + list_move(item, head); + list->nr_items--; +} +EXPORT_SYMBOL_GPL(list_lru_isolate_move); + static unsigned long __list_lru_count_one(struct list_lru *lru, int nid, int memcg_idx) { @@ -194,13 +209,11 @@ restart: break; --*nr_to_walk; - ret = isolate(item, &nlru->lock, cb_arg); + ret = isolate(item, l, &nlru->lock, cb_arg); switch (ret) { case LRU_REMOVED_RETRY: assert_spin_locked(&nlru->lock); case LRU_REMOVED: - l->nr_items--; - WARN_ON_ONCE(l->nr_items < 0); isolated++; /* * If the lru lock has been dropped, our list diff --git a/mm/workingset.c b/mm/workingset.c index d4fa7fb10a52..aa017133744b 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -302,6 +302,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, } static enum lru_status shadow_lru_isolate(struct list_head *item, + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) { @@ -332,7 +333,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, goto out; } - list_del_init(item); + list_lru_isolate(lru, item); spin_unlock(lru_lock); /* -- cgit v1.2.3 From 2788cf0c401c268b4819c5407493a8769b7007aa Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 12 Feb 2015 14:59:38 -0800 Subject: memcg: reparent list_lrus and free kmemcg_id on css offline Now, the only reason to keep kmemcg_id till css free is list_lru, which uses it to distribute elements between per-memcg lists. However, it can be easily sorted out - we only need to change kmemcg_id of an offline cgroup to its parent's id, making further list_lru_add()'s add elements to the parent's list, and then move all elements from the offline cgroup's list to the one of its parent. It will work, because a racing list_lru_del() does not need to know the list it is deleting the element from. It can decrement the wrong nr_items counter though, but the ongoing reparenting will fix it. After list_lru reparenting is done we are free to release kmemcg_id saving a valuable slot in a per-memcg array for new cgroups. 
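The drain itself is a per-node splice under the list_lru lock, implemented in the mm/list_lru.c hunk of this patch. A condensed sketch of that step, paraphrasing memcg_drain_list_lru_node() from the diff below (the real code uses the IRQ-safe spin_lock_irq() variant, and the helper name here is illustrative only):

	static void drain_one_node(struct list_lru_node *nlru,
				   int src_idx, int dst_idx)
	{
		struct list_lru_one *src, *dst;

		spin_lock(&nlru->lock);
		src = list_lru_from_memcg_idx(nlru, src_idx);
		dst = list_lru_from_memcg_idx(nlru, dst_idx);

		/*
		 * Splice all entries at once. A racing list_lru_del()
		 * may have decremented the destination's counter for an
		 * element still sitting on the source list, so summing
		 * the two counters during the splice is what makes the
		 * totals consistent again.
		 */
		list_splice_init(&src->list, &dst->list);
		dst->nr_items += src->nr_items;
		src->nr_items = 0;
		spin_unlock(&nlru->lock);
	}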
Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/list_lru.h | 3 ++- mm/list_lru.c | 46 +++++++++++++++++++++++++++++++++++++++++++--- mm/memcontrol.c | 39 ++++++++++++++++++++++++++++++++++----- 3 files changed, 79 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 7edf9c9ab9eb..2a6b9947aaa3 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -26,7 +26,7 @@ enum lru_status { struct list_lru_one { struct list_head list; - /* kept as signed so we can catch imbalance bugs */ + /* may become negative during memcg reparenting */ long nr_items; }; @@ -62,6 +62,7 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, #define list_lru_init_memcg(lru) __list_lru_init((lru), true, NULL) int memcg_update_all_list_lrus(int num_memcgs); +void memcg_drain_all_list_lrus(int src_idx, int dst_idx); /** * list_lru_add: add an element to the lru list's tail diff --git a/mm/list_lru.c b/mm/list_lru.c index 8d9d168c6c38..909eca2c820e 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -100,7 +100,6 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item) spin_lock(&nlru->lock); l = list_lru_from_kmem(nlru, item); - WARN_ON_ONCE(l->nr_items < 0); if (list_empty(item)) { list_add_tail(item, &l->list); l->nr_items++; @@ -123,7 +122,6 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item) if (!list_empty(item)) { list_del_init(item); l->nr_items--; - WARN_ON_ONCE(l->nr_items < 0); spin_unlock(&nlru->lock); return true; } @@ -156,7 +154,6 @@ static unsigned long __list_lru_count_one(struct list_lru *lru, spin_lock(&nlru->lock); l = list_lru_from_memcg_idx(nlru, memcg_idx); - WARN_ON_ONCE(l->nr_items < 0); count = l->nr_items; spin_unlock(&nlru->lock); @@ -458,6 +455,49 @@ fail: memcg_cancel_update_list_lru(lru, old_size, new_size); goto out; } + +static void memcg_drain_list_lru_node(struct list_lru_node *nlru, + int src_idx, int dst_idx) +{ + struct list_lru_one *src, *dst; + + /* + * Since list_lru_{add,del} may be called under an IRQ-safe lock, + * we have to use IRQ-safe primitives here to avoid deadlock. 
+ */ + spin_lock_irq(&nlru->lock); + + src = list_lru_from_memcg_idx(nlru, src_idx); + dst = list_lru_from_memcg_idx(nlru, dst_idx); + + list_splice_init(&src->list, &dst->list); + dst->nr_items += src->nr_items; + src->nr_items = 0; + + spin_unlock_irq(&nlru->lock); +} + +static void memcg_drain_list_lru(struct list_lru *lru, + int src_idx, int dst_idx) +{ + int i; + + if (!list_lru_memcg_aware(lru)) + return; + + for (i = 0; i < nr_node_ids; i++) + memcg_drain_list_lru_node(&lru->node[i], src_idx, dst_idx); +} + +void memcg_drain_all_list_lrus(int src_idx, int dst_idx) +{ + struct list_lru *lru; + + mutex_lock(&list_lrus_mutex); + list_for_each_entry(lru, &list_lrus, list) + memcg_drain_list_lru(lru, src_idx, dst_idx); + mutex_unlock(&list_lrus_mutex); +} #else static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index abfe0135bfdc..419c06b1794a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -334,6 +334,7 @@ struct mem_cgroup { #if defined(CONFIG_MEMCG_KMEM) /* Index in the kmem_cache->memcg_params.memcg_caches array */ int kmemcg_id; + bool kmem_acct_activated; bool kmem_acct_active; #endif @@ -582,14 +583,10 @@ void memcg_put_cache_ids(void) struct static_key memcg_kmem_enabled_key; EXPORT_SYMBOL(memcg_kmem_enabled_key); -static void memcg_free_cache_id(int id); - static void disarm_kmem_keys(struct mem_cgroup *memcg) { - if (memcg->kmemcg_id >= 0) { + if (memcg->kmem_acct_activated) static_key_slow_dec(&memcg_kmem_enabled_key); - memcg_free_cache_id(memcg->kmemcg_id); - } /* * This check can't live in kmem destruction function, * since the charges will outlive the cgroup @@ -3322,6 +3319,7 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg, int memcg_id; BUG_ON(memcg->kmemcg_id >= 0); + BUG_ON(memcg->kmem_acct_activated); BUG_ON(memcg->kmem_acct_active); /* @@ -3365,6 +3363,7 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg, * patched. */ memcg->kmemcg_id = memcg_id; + memcg->kmem_acct_activated = true; memcg->kmem_acct_active = true; out: return err; @@ -4047,6 +4046,10 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) static void memcg_deactivate_kmem(struct mem_cgroup *memcg) { + struct cgroup_subsys_state *css; + struct mem_cgroup *parent, *child; + int kmemcg_id; + if (!memcg->kmem_acct_active) return; @@ -4059,6 +4062,32 @@ static void memcg_deactivate_kmem(struct mem_cgroup *memcg) memcg->kmem_acct_active = false; memcg_deactivate_kmem_caches(memcg); + + kmemcg_id = memcg->kmemcg_id; + BUG_ON(kmemcg_id < 0); + + parent = parent_mem_cgroup(memcg); + if (!parent) + parent = root_mem_cgroup; + + /* + * Change kmemcg_id of this cgroup and all its descendants to the + * parent's id, and then move all entries from this cgroup's list_lrus + * to ones of the parent. After we have finished, all list_lrus + * corresponding to this cgroup are guaranteed to remain empty. The + * ordering is imposed by list_lru_node->lock taken by + * memcg_drain_all_list_lrus(). + */ + css_for_each_descendant_pre(css, &memcg->css) { + child = mem_cgroup_from_css(css); + BUG_ON(child->kmemcg_id != kmemcg_id); + child->kmemcg_id = parent->kmemcg_id; + if (!memcg->use_hierarchy) + break; + } + memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id); + + memcg_free_cache_id(kmemcg_id); } static void memcg_destroy_kmem(struct mem_cgroup *memcg) -- cgit v1.2.3 From 2d2f5119b8bb057595e18f5b2f07aa097ea1b233 Mon Sep 17 00:00:00 2001 From: "Kirill A. 
Shutemov" Date: Thu, 12 Feb 2015 14:59:59 -0800 Subject: mm: do not use mm->nr_pmds on !MMU configurations mm->nr_pmds doesn't make sense on !MMU configurations Signed-off-by: Kirill A. Shutemov Cc: Guenter Roeck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 9 ++++++++- kernel/fork.c | 4 +--- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index af4ff88a11e0..bd52e2f14027 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1447,13 +1447,15 @@ static inline int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address); #endif -#ifdef __PAGETABLE_PMD_FOLDED +#if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU) static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { return 0; } +static inline void mm_nr_pmds_init(struct mm_struct *mm) {} + static inline unsigned long mm_nr_pmds(struct mm_struct *mm) { return 0; @@ -1465,6 +1467,11 @@ static inline void mm_dec_nr_pmds(struct mm_struct *mm) {} #else int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); +static inline void mm_nr_pmds_init(struct mm_struct *mm) +{ + atomic_long_set(&mm->nr_pmds, 0); +} + static inline unsigned long mm_nr_pmds(struct mm_struct *mm) { return atomic_long_read(&mm->nr_pmds); diff --git a/kernel/fork.c b/kernel/fork.c index 66e19c251581..cf65139615a0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -555,9 +555,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; atomic_long_set(&mm->nr_ptes, 0); -#ifndef __PAGETABLE_PMD_FOLDED - atomic_long_set(&mm->nr_pmds, 0); -#endif + mm_nr_pmds_init(mm); mm->map_count = 0; mm->locked_vm = 0; mm->pinned_vm = 0; -- cgit v1.2.3 From 3eba0c6a56c04f2b017b43641a821f1ebfb7fb4c Mon Sep 17 00:00:00 2001 From: Ganesh Mahendran Date: Thu, 12 Feb 2015 15:00:51 -0800 Subject: mm/zpool: add name argument to create zpool Currently the underlay of zpool: zsmalloc/zbud, do not know who creates them. There is not a method to let zsmalloc/zbud find which caller they belong to. Now we want to add statistics collection in zsmalloc. We need to name the debugfs dir for each pool created. The way suggested by Minchan Kim is to use a name passed by caller(such as zram) to create the zsmalloc pool. /sys/kernel/debug/zsmalloc/zram0 This patch adds an argument `name' to zs_create_pool() and other related functions. 
Signed-off-by: Ganesh Mahendran Acked-by: Minchan Kim Cc: Seth Jennings Cc: Nitin Gupta Cc: Dan Streetman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 8 +++++--- include/linux/zpool.h | 5 +++-- include/linux/zsmalloc.h | 2 +- mm/zbud.c | 3 ++- mm/zpool.c | 6 ++++-- mm/zsmalloc.c | 6 +++--- mm/zswap.c | 5 +++-- 7 files changed, 21 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index eca4b67274c1..8e233edd7a09 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -327,9 +327,10 @@ static void zram_meta_free(struct zram_meta *meta, u64 disksize) kfree(meta); } -static struct zram_meta *zram_meta_alloc(u64 disksize) +static struct zram_meta *zram_meta_alloc(int device_id, u64 disksize) { size_t num_pages; + char pool_name[8]; struct zram_meta *meta = kmalloc(sizeof(*meta), GFP_KERNEL); if (!meta) @@ -342,7 +343,8 @@ static struct zram_meta *zram_meta_alloc(u64 disksize) goto out_error; } - meta->mem_pool = zs_create_pool(GFP_NOIO | __GFP_HIGHMEM); + snprintf(pool_name, sizeof(pool_name), "zram%d", device_id); + meta->mem_pool = zs_create_pool(pool_name, GFP_NOIO | __GFP_HIGHMEM); if (!meta->mem_pool) { pr_err("Error creating memory pool\n"); goto out_error; @@ -783,7 +785,7 @@ static ssize_t disksize_store(struct device *dev, return -EINVAL; disksize = PAGE_ALIGN(disksize); - meta = zram_meta_alloc(disksize); + meta = zram_meta_alloc(zram->disk->first_minor, disksize); if (!meta) return -ENOMEM; diff --git a/include/linux/zpool.h b/include/linux/zpool.h index f14bd75f08b3..56529b34dc63 100644 --- a/include/linux/zpool.h +++ b/include/linux/zpool.h @@ -36,7 +36,8 @@ enum zpool_mapmode { ZPOOL_MM_DEFAULT = ZPOOL_MM_RW }; -struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops); +struct zpool *zpool_create_pool(char *type, char *name, + gfp_t gfp, struct zpool_ops *ops); char *zpool_get_type(struct zpool *pool); @@ -80,7 +81,7 @@ struct zpool_driver { atomic_t refcount; struct list_head list; - void *(*create)(gfp_t gfp, struct zpool_ops *ops); + void *(*create)(char *name, gfp_t gfp, struct zpool_ops *ops); void (*destroy)(void *pool); int (*malloc)(void *pool, size_t size, gfp_t gfp, diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 05c214760977..3283c6a55425 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -36,7 +36,7 @@ enum zs_mapmode { struct zs_pool; -struct zs_pool *zs_create_pool(gfp_t flags); +struct zs_pool *zs_create_pool(char *name, gfp_t flags); void zs_destroy_pool(struct zs_pool *pool); unsigned long zs_malloc(struct zs_pool *pool, size_t size); diff --git a/mm/zbud.c b/mm/zbud.c index 4e387bea702e..2ee4e4520493 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -130,7 +130,8 @@ static struct zbud_ops zbud_zpool_ops = { .evict = zbud_zpool_evict }; -static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) +static void *zbud_zpool_create(char *name, gfp_t gfp, + struct zpool_ops *zpool_ops) { return zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL); } diff --git a/mm/zpool.c b/mm/zpool.c index 739cdf0d183a..bacdab6e47de 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -129,6 +129,7 @@ static void zpool_put_driver(struct zpool_driver *driver) /** * zpool_create_pool() - Create a new zpool * @type The type of the zpool to create (e.g. zbud, zsmalloc) + * @name The name of the zpool (e.g. 
zram0, zswap) * @gfp The GFP flags to use when allocating the pool. * @ops The optional ops callback. * @@ -140,7 +141,8 @@ static void zpool_put_driver(struct zpool_driver *driver) * * Returns: New zpool on success, NULL on failure. */ -struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops) +struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp, + struct zpool_ops *ops) { struct zpool_driver *driver; struct zpool *zpool; @@ -168,7 +170,7 @@ struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops) zpool->type = driver->type; zpool->driver = driver; - zpool->pool = driver->create(gfp, ops); + zpool->pool = driver->create(name, gfp, ops); zpool->ops = ops; if (!zpool->pool) { diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index b72403927aa4..2359e61b02bf 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -246,9 +246,9 @@ struct mapping_area { #ifdef CONFIG_ZPOOL -static void *zs_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) +static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops) { - return zs_create_pool(gfp); + return zs_create_pool(name, gfp); } static void zs_zpool_destroy(void *pool) @@ -1148,7 +1148,7 @@ EXPORT_SYMBOL_GPL(zs_free); * On success, a pointer to the newly created pool is returned, * otherwise NULL. */ -struct zs_pool *zs_create_pool(gfp_t flags) +struct zs_pool *zs_create_pool(char *name, gfp_t flags) { int i; struct zs_pool *pool; diff --git a/mm/zswap.c b/mm/zswap.c index 0cfce9bc51e4..4249e82ff934 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -906,11 +906,12 @@ static int __init init_zswap(void) pr_info("loading zswap\n"); - zswap_pool = zpool_create_pool(zswap_zpool_type, gfp, &zswap_zpool_ops); + zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp, + &zswap_zpool_ops); if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) { pr_info("%s zpool not available\n", zswap_zpool_type); zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; - zswap_pool = zpool_create_pool(zswap_zpool_type, gfp, + zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp, &zswap_zpool_ops); } if (!zswap_pool) { -- cgit v1.2.3 From 695f055936938c674473ea071ca7359a863551e7 Mon Sep 17 00:00:00 2001 From: Petr Cermak Date: Thu, 12 Feb 2015 15:01:00 -0800 Subject: fs/proc/task_mmu.c: add user-space support for resetting mm->hiwater_rss (peak RSS) Peak resident size of a process can be reset back to the process's current rss value by writing "5" to /proc/pid/clear_refs. The driving use-case for this would be getting the peak RSS value, which can be retrieved from the VmHWM field in /proc/pid/status, per benchmark iteration or test scenario. 
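From user space, the per-iteration sequence is a single write followed by a status read. A self-contained sketch (the clear_refs value 5 and the VmHWM field come from this patch; the helper names are illustrative, and the writer needs the usual ptrace-style permission over the target PID):

	#include <stdio.h>

	/* Reset the target's peak RSS to its current RSS. */
	static int reset_peak_rss(int pid)
	{
		char path[64];
		FILE *f;

		snprintf(path, sizeof(path), "/proc/%d/clear_refs", pid);
		f = fopen(path, "w");
		if (!f)
			return -1;
		fputs("5", f);	/* CLEAR_REFS_MM_HIWATER_RSS */
		return fclose(f);
	}

	/* After the workload has run, read the new high-water mark
	 * (in kB) from /proc/PID/status. */
	static long read_peak_rss_kb(int pid)
	{
		char path[64], line[256];
		long hwm_kb = -1;
		FILE *f;

		snprintf(path, sizeof(path), "/proc/%d/status", pid);
		f = fopen(path, "r");
		if (!f)
			return -1;
		while (fgets(line, sizeof(line), f))
			if (sscanf(line, "VmHWM: %ld kB", &hwm_kb) == 1)
				break;
		fclose(f);
		return hwm_kb;
	}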
[akpm@linux-foundation.org: clarify behaviour in documentation] Signed-off-by: Petr Cermak Cc: Bjorn Helgaas Cc: Primiano Tucci Cc: Petr Cermak Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 4 ++++ fs/proc/task_mmu.c | 14 ++++++++++++++ include/linux/mm.h | 5 +++++ 3 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index cf8fc2f0b34b..0b3448dba9ec 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -489,6 +489,10 @@ To clear the bits for the file mapped pages associated with the process To clear the soft-dirty bit > echo 4 > /proc/PID/clear_refs +To reset the peak resident set size ("high water mark") to the process's +current value: + > echo 5 > /proc/PID/clear_refs + Any other value written to /proc/PID/clear_refs will have no effect. The /proc/pid/pagemap gives the PFN, which can be used to find the pageflags diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 0e36c1e49fe3..1359a911d194 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -732,6 +732,7 @@ enum clear_refs_types { CLEAR_REFS_ANON, CLEAR_REFS_MAPPED, CLEAR_REFS_SOFT_DIRTY, + CLEAR_REFS_MM_HIWATER_RSS, CLEAR_REFS_LAST, }; @@ -907,6 +908,18 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, .mm = mm, .private = &cp, }; + + if (type == CLEAR_REFS_MM_HIWATER_RSS) { + /* + * Writing 5 to /proc/pid/clear_refs resets the peak + * resident set size to this mm's current rss value. + */ + down_write(&mm->mmap_sem); + reset_mm_hiwater_rss(mm); + up_write(&mm->mmap_sem); + goto out_mm; + } + down_read(&mm->mmap_sem); if (type == CLEAR_REFS_SOFT_DIRTY) { for (vma = mm->mmap; vma; vma = vma->vm_next) { @@ -928,6 +941,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, mmu_notifier_invalidate_range_end(mm, 0, -1); flush_tlb_mm(mm); up_read(&mm->mmap_sem); +out_mm: mmput(mm); } put_task_struct(task); diff --git a/include/linux/mm.h b/include/linux/mm.h index bd52e2f14027..9bee7ec0c31f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1408,6 +1408,11 @@ static inline void update_hiwater_vm(struct mm_struct *mm) mm->hiwater_vm = mm->total_vm; } +static inline void reset_mm_hiwater_rss(struct mm_struct *mm) +{ + mm->hiwater_rss = get_mm_rss(mm); +} + static inline void setmax_mm_hiwater_rss(unsigned long *maxrss, struct mm_struct *mm) { -- cgit v1.2.3 From f56141e3e2d9aabf7e6b89680ab572c2cdbb2a24 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 12 Feb 2015 15:01:14 -0800 Subject: all arches, signal: move restart_block to struct task_struct If an attacker can cause a controlled kernel stack overflow, overwriting the restart block is a very juicy exploit target. This is because the restart_block is held in the same memory allocation as the kernel stack. Moving the restart block to struct task_struct prevents this exploit by making the restart_block harder to locate. Note that there are other fields in thread_info that are also easy targets, at least on some architectures. It's also a decent simplification, since the restart code is more or less identical on all architectures. [james.hogan@imgtec.com: metag: align thread_info::supervisor_stack] Signed-off-by: Andy Lutomirski Cc: Thomas Gleixner Cc: Al Viro Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Kees Cook Cc: David Miller Acked-by: Richard Weinberger Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Vineet Gupta Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Steven Miao Cc: Mark Salter Cc: Aurelien Jacquiot Cc: Mikael Starvik Cc: Jesper Nilsson Cc: David Howells Cc: Richard Kuo Cc: "Luck, Tony" Cc: Geert Uytterhoeven Cc: Michal Simek Cc: Ralf Baechle Cc: Jonas Bonn Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Acked-by: Michael Ellerman (powerpc) Tested-by: Michael Ellerman (powerpc) Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Chen Liqin Cc: Lennox Wu Cc: Chris Metcalf Cc: Guan Xuetao Cc: Chris Zankel Cc: Max Filippov Cc: Oleg Nesterov Cc: Guenter Roeck Signed-off-by: James Hogan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/thread_info.h | 5 ----- arch/alpha/kernel/signal.c | 2 +- arch/arc/include/asm/thread_info.h | 4 ---- arch/arc/kernel/signal.c | 2 +- arch/arm/include/asm/thread_info.h | 4 ---- arch/arm/kernel/signal.c | 4 ++-- arch/arm64/include/asm/thread_info.h | 4 ---- arch/arm64/kernel/signal.c | 2 +- arch/arm64/kernel/signal32.c | 4 ++-- arch/avr32/include/asm/thread_info.h | 4 ---- arch/avr32/kernel/asm-offsets.c | 1 - arch/avr32/kernel/signal.c | 2 +- arch/blackfin/include/asm/thread_info.h | 4 ---- arch/blackfin/kernel/signal.c | 2 +- arch/c6x/include/asm/thread_info.h | 4 ---- arch/c6x/kernel/signal.c | 2 +- arch/cris/arch-v10/kernel/signal.c | 2 +- arch/cris/arch-v32/kernel/signal.c | 2 +- arch/cris/include/asm/thread_info.h | 4 ---- arch/frv/include/asm/thread_info.h | 4 ---- arch/frv/kernel/asm-offsets.c | 1 - arch/frv/kernel/signal.c | 2 +- arch/hexagon/include/asm/thread_info.h | 4 ---- arch/hexagon/kernel/signal.c | 2 +- arch/ia64/include/asm/thread_info.h | 4 ---- arch/ia64/kernel/signal.c | 2 +- arch/m32r/include/asm/thread_info.h | 5 ----- arch/m32r/kernel/signal.c | 2 +- arch/m68k/include/asm/thread_info.h | 4 ---- arch/m68k/kernel/signal.c | 4 ++-- arch/metag/include/asm/thread_info.h | 6 +----- arch/metag/kernel/signal.c | 2 +- arch/microblaze/include/asm/thread_info.h | 4 ---- arch/microblaze/kernel/signal.c | 2 +- arch/mips/include/asm/thread_info.h | 4 ---- arch/mips/kernel/asm-offsets.c | 1 - arch/mips/kernel/signal.c | 2 +- arch/mips/kernel/signal32.c | 2 +- arch/mn10300/include/asm/thread_info.h | 4 ---- arch/mn10300/kernel/asm-offsets.c | 1 - arch/mn10300/kernel/signal.c | 2 +- arch/openrisc/include/asm/thread_info.h | 4 ---- arch/openrisc/kernel/signal.c | 2 +- arch/parisc/include/asm/thread_info.h | 4 ---- arch/parisc/kernel/signal.c | 2 +- arch/powerpc/include/asm/thread_info.h | 4 ---- arch/powerpc/kernel/signal_32.c | 4 ++-- arch/powerpc/kernel/signal_64.c | 2 +- arch/s390/include/asm/thread_info.h | 4 ---- arch/s390/kernel/compat_signal.c | 2 +- arch/s390/kernel/signal.c | 2 +- arch/score/include/asm/thread_info.h | 4 ---- arch/score/kernel/asm-offsets.c | 1 - arch/score/kernel/signal.c | 2 +- arch/sh/include/asm/thread_info.h | 4 ---- arch/sh/kernel/asm-offsets.c | 1 - arch/sh/kernel/signal_32.c | 4 ++-- arch/sh/kernel/signal_64.c | 4 ++-- arch/sparc/include/asm/thread_info_32.h | 6 ------ arch/sparc/include/asm/thread_info_64.h | 12 +++--------- arch/sparc/kernel/signal32.c | 4 ++-- arch/sparc/kernel/signal_32.c | 2 +- arch/sparc/kernel/signal_64.c | 2 +- arch/sparc/kernel/traps_64.c | 2 -- arch/tile/include/asm/thread_info.h | 4 ---- 
arch/tile/kernel/signal.c | 2 +- arch/um/include/asm/thread_info.h | 4 ---- arch/unicore32/include/asm/thread_info.h | 4 ---- arch/unicore32/kernel/signal.c | 2 +- arch/x86/ia32/ia32_signal.c | 2 +- arch/x86/include/asm/thread_info.h | 4 ---- arch/x86/kernel/signal.c | 2 +- arch/x86/um/signal.c | 2 +- arch/xtensa/include/asm/thread_info.h | 5 ----- arch/xtensa/kernel/signal.c | 2 +- fs/select.c | 2 +- include/linux/init_task.h | 3 +++ include/linux/sched.h | 2 ++ kernel/compat.c | 5 ++--- kernel/futex.c | 2 +- kernel/signal.c | 2 +- kernel/time/alarmtimer.c | 2 +- kernel/time/hrtimer.c | 2 +- kernel/time/posix-cpu-timers.c | 3 +-- 84 files changed, 62 insertions(+), 194 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/include/asm/thread_info.h b/arch/alpha/include/asm/thread_info.h index 48bbea6898b3..d5b98ab514bb 100644 --- a/arch/alpha/include/asm/thread_info.h +++ b/arch/alpha/include/asm/thread_info.h @@ -27,8 +27,6 @@ struct thread_info { int bpt_nsaved; unsigned long bpt_addr[2]; /* breakpoint handling */ unsigned int bpt_insn[2]; - - struct restart_block restart_block; }; /* @@ -40,9 +38,6 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .addr_limit = KERNEL_DS, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c index 6cec2881acbf..8dbfb15f1745 100644 --- a/arch/alpha/kernel/signal.c +++ b/arch/alpha/kernel/signal.c @@ -150,7 +150,7 @@ restore_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs) struct switch_stack *sw = (struct switch_stack *)regs - 1; long i, err = __get_user(regs->pc, &sc->sc_pc); - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; sw->r26 = (unsigned long) ret_from_sys_call; diff --git a/arch/arc/include/asm/thread_info.h b/arch/arc/include/asm/thread_info.h index 02bc5ec0fb2e..1163a1838ac1 100644 --- a/arch/arc/include/asm/thread_info.h +++ b/arch/arc/include/asm/thread_info.h @@ -46,7 +46,6 @@ struct thread_info { struct exec_domain *exec_domain;/* execution domain */ __u32 cpu; /* current CPU */ unsigned long thr_ptr; /* TLS ptr */ - struct restart_block restart_block; }; /* @@ -62,9 +61,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/arc/kernel/signal.c b/arch/arc/kernel/signal.c index cb3142a2d40b..114234e83caa 100644 --- a/arch/arc/kernel/signal.c +++ b/arch/arc/kernel/signal.c @@ -104,7 +104,7 @@ SYSCALL_DEFINE0(rt_sigreturn) struct pt_regs *regs = current_pt_regs(); /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* Since we stacked the signal on a word boundary, * then 'sp' should be word aligned here. 
If it's diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h index d890e41f5520..72812a1f3d1c 100644 --- a/arch/arm/include/asm/thread_info.h +++ b/arch/arm/include/asm/thread_info.h @@ -68,7 +68,6 @@ struct thread_info { #ifdef CONFIG_ARM_THUMBEE unsigned long thumbee_state; /* ThumbEE Handler Base register */ #endif - struct restart_block restart_block; }; #define INIT_THREAD_INFO(tsk) \ @@ -81,9 +80,6 @@ struct thread_info { .cpu_domain = domain_val(DOMAIN_USER, DOMAIN_MANAGER) | \ domain_val(DOMAIN_KERNEL, DOMAIN_MANAGER) | \ domain_val(DOMAIN_IO, DOMAIN_CLIENT), \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 8aa6f1b87c9e..023ac905e4c3 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -191,7 +191,7 @@ asmlinkage int sys_sigreturn(struct pt_regs *regs) struct sigframe __user *frame; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* * Since we stacked the signal on a 64-bit boundary, @@ -221,7 +221,7 @@ asmlinkage int sys_rt_sigreturn(struct pt_regs *regs) struct rt_sigframe __user *frame; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* * Since we stacked the signal on a 64-bit boundary, diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 459bf8e53208..702e1e6a0d80 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -48,7 +48,6 @@ struct thread_info { mm_segment_t addr_limit; /* address limit */ struct task_struct *task; /* main task structure */ struct exec_domain *exec_domain; /* execution domain */ - struct restart_block restart_block; int preempt_count; /* 0 => preemptable, <0 => bug */ int cpu; /* cpu */ }; @@ -60,9 +59,6 @@ struct thread_info { .flags = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 6fa792137eda..660ccf9f7524 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -131,7 +131,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) struct rt_sigframe __user *frame; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* * Since we stacked the signal on a 128-bit boundary, then 'sp' should diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index e299de396e9b..c20a300e2213 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -347,7 +347,7 @@ asmlinkage int compat_sys_sigreturn(struct pt_regs *regs) struct compat_sigframe __user *frame; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* * Since we stacked the signal on a 64-bit boundary, @@ -381,7 +381,7 @@ asmlinkage int compat_sys_rt_sigreturn(struct pt_regs *regs) struct compat_rt_sigframe __user 
*frame; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* * Since we stacked the signal on a 64-bit boundary, diff --git a/arch/avr32/include/asm/thread_info.h b/arch/avr32/include/asm/thread_info.h index a978f3fe7c25..d56afa99a514 100644 --- a/arch/avr32/include/asm/thread_info.h +++ b/arch/avr32/include/asm/thread_info.h @@ -30,7 +30,6 @@ struct thread_info { saved by debug handler when setting up trampoline */ - struct restart_block restart_block; __u8 supervisor_stack[0]; }; @@ -41,9 +40,6 @@ struct thread_info { .flags = 0, \ .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .restart_block = { \ - .fn = do_no_restart_syscall \ - } \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/avr32/kernel/asm-offsets.c b/arch/avr32/kernel/asm-offsets.c index d6a8193a1d2f..e41c84516e5d 100644 --- a/arch/avr32/kernel/asm-offsets.c +++ b/arch/avr32/kernel/asm-offsets.c @@ -18,7 +18,6 @@ void foo(void) OFFSET(TI_preempt_count, thread_info, preempt_count); OFFSET(TI_rar_saved, thread_info, rar_saved); OFFSET(TI_rsr_saved, thread_info, rsr_saved); - OFFSET(TI_restart_block, thread_info, restart_block); BLANK(); OFFSET(TSK_active_mm, task_struct, active_mm); BLANK(); diff --git a/arch/avr32/kernel/signal.c b/arch/avr32/kernel/signal.c index d309fbcc3bd6..8f1c63b9b983 100644 --- a/arch/avr32/kernel/signal.c +++ b/arch/avr32/kernel/signal.c @@ -69,7 +69,7 @@ asmlinkage int sys_rt_sigreturn(struct pt_regs *regs) sigset_t set; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; frame = (struct rt_sigframe __user *)regs->sp; pr_debug("SIG return: frame = %p\n", frame); diff --git a/arch/blackfin/include/asm/thread_info.h b/arch/blackfin/include/asm/thread_info.h index 55f473bdad36..57c3a8bd583d 100644 --- a/arch/blackfin/include/asm/thread_info.h +++ b/arch/blackfin/include/asm/thread_info.h @@ -42,7 +42,6 @@ struct thread_info { int cpu; /* cpu we're on */ int preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; /* address limit */ - struct restart_block restart_block; #ifndef CONFIG_SMP struct l1_scratch_task_info l1_task_info; #endif @@ -58,9 +57,6 @@ struct thread_info { .flags = 0, \ .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) #define init_stack (init_thread_union.stack) diff --git a/arch/blackfin/kernel/signal.c b/arch/blackfin/kernel/signal.c index ef275571d885..f2a8b5493bd3 100644 --- a/arch/blackfin/kernel/signal.c +++ b/arch/blackfin/kernel/signal.c @@ -44,7 +44,7 @@ rt_restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *p int err = 0; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; #define RESTORE(x) err |= __get_user(regs->x, &sc->sc_##x) diff --git a/arch/c6x/include/asm/thread_info.h b/arch/c6x/include/asm/thread_info.h index d4e9ef87076d..584e253f3217 100644 --- a/arch/c6x/include/asm/thread_info.h +++ b/arch/c6x/include/asm/thread_info.h @@ -45,7 +45,6 @@ struct thread_info { int cpu; /* cpu we're on */ int preempt_count; /* 0 = preemptable, <0 = BUG */ mm_segment_t addr_limit; /* 
thread address space */ - struct restart_block restart_block; }; /* @@ -61,9 +60,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/c6x/kernel/signal.c b/arch/c6x/kernel/signal.c index fe68226f6c4d..3c4bb5a5c382 100644 --- a/arch/c6x/kernel/signal.c +++ b/arch/c6x/kernel/signal.c @@ -68,7 +68,7 @@ asmlinkage int do_rt_sigreturn(struct pt_regs *regs) sigset_t set; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* * Since we stacked the signal on a dword boundary, diff --git a/arch/cris/arch-v10/kernel/signal.c b/arch/cris/arch-v10/kernel/signal.c index 9b32d338838b..74d7ba35120d 100644 --- a/arch/cris/arch-v10/kernel/signal.c +++ b/arch/cris/arch-v10/kernel/signal.c @@ -67,7 +67,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) unsigned long old_usp; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* restore the regs from &sc->regs (same as sc, since regs is first) * (sc is already checked for VERIFY_READ since the sigframe was diff --git a/arch/cris/arch-v32/kernel/signal.c b/arch/cris/arch-v32/kernel/signal.c index 78ce3b1c9bcb..870e3e069318 100644 --- a/arch/cris/arch-v32/kernel/signal.c +++ b/arch/cris/arch-v32/kernel/signal.c @@ -59,7 +59,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) unsigned long old_usp; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* * Restore the registers from &sc->regs. 
sc is already checked diff --git a/arch/cris/include/asm/thread_info.h b/arch/cris/include/asm/thread_info.h index 55dede18c032..7286db5ed90e 100644 --- a/arch/cris/include/asm/thread_info.h +++ b/arch/cris/include/asm/thread_info.h @@ -38,7 +38,6 @@ struct thread_info { 0-0xBFFFFFFF for user-thead 0-0xFFFFFFFF for kernel-thread */ - struct restart_block restart_block; __u8 supervisor_stack[0]; }; @@ -56,9 +55,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/frv/include/asm/thread_info.h b/arch/frv/include/asm/thread_info.h index af29e17c0181..6b917f1c2955 100644 --- a/arch/frv/include/asm/thread_info.h +++ b/arch/frv/include/asm/thread_info.h @@ -41,7 +41,6 @@ struct thread_info { * 0-0xBFFFFFFF for user-thead * 0-0xFFFFFFFF for kernel-thread */ - struct restart_block restart_block; __u8 supervisor_stack[0]; }; @@ -65,9 +64,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/frv/kernel/asm-offsets.c b/arch/frv/kernel/asm-offsets.c index 9de96843a278..446e89d500cc 100644 --- a/arch/frv/kernel/asm-offsets.c +++ b/arch/frv/kernel/asm-offsets.c @@ -40,7 +40,6 @@ void foo(void) OFFSET(TI_CPU, thread_info, cpu); OFFSET(TI_PREEMPT_COUNT, thread_info, preempt_count); OFFSET(TI_ADDR_LIMIT, thread_info, addr_limit); - OFFSET(TI_RESTART_BLOCK, thread_info, restart_block); BLANK(); /* offsets into register file storage */ diff --git a/arch/frv/kernel/signal.c b/arch/frv/kernel/signal.c index dc3d59de0870..336713ab4745 100644 --- a/arch/frv/kernel/signal.c +++ b/arch/frv/kernel/signal.c @@ -62,7 +62,7 @@ static int restore_sigcontext(struct sigcontext __user *sc, int *_gr8) unsigned long tbr, psr; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; tbr = user->i.tbr; psr = user->i.psr; diff --git a/arch/hexagon/include/asm/thread_info.h b/arch/hexagon/include/asm/thread_info.h index a59dad3b3695..bacd3d6030c5 100644 --- a/arch/hexagon/include/asm/thread_info.h +++ b/arch/hexagon/include/asm/thread_info.h @@ -56,7 +56,6 @@ struct thread_info { * used for syscalls somehow; * seems to have a function pointer and four arguments */ - struct restart_block restart_block; /* Points to the current pt_regs frame */ struct pt_regs *regs; /* @@ -83,9 +82,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = 1, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ .sp = 0, \ .regs = NULL, \ } diff --git a/arch/hexagon/kernel/signal.c b/arch/hexagon/kernel/signal.c index eadd70e47e7e..b039a624c170 100644 --- a/arch/hexagon/kernel/signal.c +++ b/arch/hexagon/kernel/signal.c @@ -239,7 +239,7 @@ asmlinkage int sys_rt_sigreturn(void) sigset_t blocked; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; frame = (struct rt_sigframe __user *)pt_psp(regs); if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h index 5b17418b4223..c16f21a068ff 100644 --- 
a/arch/ia64/include/asm/thread_info.h +++ b/arch/ia64/include/asm/thread_info.h @@ -27,7 +27,6 @@ struct thread_info { __u32 status; /* Thread synchronous flags */ mm_segment_t addr_limit; /* user-level address space limit */ int preempt_count; /* 0=premptable, <0=BUG; will also serve as bh-counter */ - struct restart_block restart_block; #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE __u64 ac_stamp; __u64 ac_leave; @@ -46,9 +45,6 @@ struct thread_info { .cpu = 0, \ .addr_limit = KERNEL_DS, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #ifndef ASM_OFFSETS_C diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c index 6d92170be457..b3a124da71e5 100644 --- a/arch/ia64/kernel/signal.c +++ b/arch/ia64/kernel/signal.c @@ -46,7 +46,7 @@ restore_sigcontext (struct sigcontext __user *sc, struct sigscratch *scr) long err; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* restore scratch that always needs gets updated during signal delivery: */ err = __get_user(flags, &sc->sc_flags); diff --git a/arch/m32r/include/asm/thread_info.h b/arch/m32r/include/asm/thread_info.h index 00171703402f..32422d0211c3 100644 --- a/arch/m32r/include/asm/thread_info.h +++ b/arch/m32r/include/asm/thread_info.h @@ -34,7 +34,6 @@ struct thread_info { 0-0xBFFFFFFF for user-thread 0-0xFFFFFFFF for kernel-thread */ - struct restart_block restart_block; __u8 supervisor_stack[0]; }; @@ -49,7 +48,6 @@ struct thread_info { #define TI_CPU 0x00000010 #define TI_PRE_COUNT 0x00000014 #define TI_ADDR_LIMIT 0x00000018 -#define TI_RESTART_BLOCK 0x000001C #endif @@ -68,9 +66,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/m32r/kernel/signal.c b/arch/m32r/kernel/signal.c index 95408b8f130a..7736c6660a15 100644 --- a/arch/m32r/kernel/signal.c +++ b/arch/m32r/kernel/signal.c @@ -48,7 +48,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned int err = 0; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; #define COPY(x) err |= __get_user(regs->x, &sc->sc_##x) COPY(r4); diff --git a/arch/m68k/include/asm/thread_info.h b/arch/m68k/include/asm/thread_info.h index 21a4784ca5a1..c54256e69e64 100644 --- a/arch/m68k/include/asm/thread_info.h +++ b/arch/m68k/include/asm/thread_info.h @@ -31,7 +31,6 @@ struct thread_info { int preempt_count; /* 0 => preemptable, <0 => BUG */ __u32 cpu; /* should always be 0 on m68k */ unsigned long tp_value; /* thread pointer */ - struct restart_block restart_block; }; #endif /* __ASSEMBLY__ */ @@ -41,9 +40,6 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .addr_limit = KERNEL_DS, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_stack (init_thread_union.stack) diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c index 967a8b7e1527..d7179281e74a 100644 --- a/arch/m68k/kernel/signal.c +++ b/arch/m68k/kernel/signal.c @@ -655,7 +655,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *usc, void __u int err = 0; /* Always make any pending 
restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* get previous context */ if (copy_from_user(&context, usc, sizeof(context))) @@ -693,7 +693,7 @@ rt_restore_ucontext(struct pt_regs *regs, struct switch_stack *sw, int err; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; err = __get_user(temp, &uc->uc_mcontext.version); if (temp != MCONTEXT_VERSION) diff --git a/arch/metag/include/asm/thread_info.h b/arch/metag/include/asm/thread_info.h index 47711336119e..afb3ca4776d1 100644 --- a/arch/metag/include/asm/thread_info.h +++ b/arch/metag/include/asm/thread_info.h @@ -35,9 +35,8 @@ struct thread_info { int preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; /* thread address space */ - struct restart_block restart_block; - u8 supervisor_stack[0]; + u8 supervisor_stack[0] __aligned(8); }; #else /* !__ASSEMBLY__ */ @@ -74,9 +73,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/metag/kernel/signal.c b/arch/metag/kernel/signal.c index 0d100d5c1407..ce49d429c74a 100644 --- a/arch/metag/kernel/signal.c +++ b/arch/metag/kernel/signal.c @@ -48,7 +48,7 @@ static int restore_sigcontext(struct pt_regs *regs, int err; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; err = metag_gp_regs_copyin(regs, 0, sizeof(struct user_gp_regs), NULL, &sc->regs); diff --git a/arch/microblaze/include/asm/thread_info.h b/arch/microblaze/include/asm/thread_info.h index 8c9d36591a03..b699fbd7de4a 100644 --- a/arch/microblaze/include/asm/thread_info.h +++ b/arch/microblaze/include/asm/thread_info.h @@ -71,7 +71,6 @@ struct thread_info { __u32 cpu; /* current CPU */ __s32 preempt_count; /* 0 => preemptable,< 0 => BUG*/ mm_segment_t addr_limit; /* thread address space */ - struct restart_block restart_block; struct cpu_context cpu_context; }; @@ -87,9 +86,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/microblaze/kernel/signal.c b/arch/microblaze/kernel/signal.c index 235706055b7f..a1cbaf90e2ea 100644 --- a/arch/microblaze/kernel/signal.c +++ b/arch/microblaze/kernel/signal.c @@ -89,7 +89,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) int rval; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; diff --git a/arch/mips/include/asm/thread_info.h b/arch/mips/include/asm/thread_info.h index e4440f92b366..9e1295f874f0 100644 --- a/arch/mips/include/asm/thread_info.h +++ b/arch/mips/include/asm/thread_info.h @@ -34,7 +34,6 @@ struct thread_info { * 0x7fffffff for user-thead * 0xffffffff for kernel-thread */ - struct restart_block restart_block; struct pt_regs *regs; long syscall; /* syscall number */ }; @@ -50,9 +49,6 @@ struct 
thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c index b1d84bd4efb3..3b2dfdb4865f 100644 --- a/arch/mips/kernel/asm-offsets.c +++ b/arch/mips/kernel/asm-offsets.c @@ -98,7 +98,6 @@ void output_thread_info_defines(void) OFFSET(TI_CPU, thread_info, cpu); OFFSET(TI_PRE_COUNT, thread_info, preempt_count); OFFSET(TI_ADDR_LIMIT, thread_info, addr_limit); - OFFSET(TI_RESTART_BLOCK, thread_info, restart_block); OFFSET(TI_REGS, thread_info, regs); DEFINE(_THREAD_SIZE, THREAD_SIZE); DEFINE(_THREAD_MASK, THREAD_MASK); diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c index 545bf11bd2ed..6a28c792d862 100644 --- a/arch/mips/kernel/signal.c +++ b/arch/mips/kernel/signal.c @@ -243,7 +243,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) int i; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; err |= __get_user(regs->cp0_epc, &sc->sc_pc); diff --git a/arch/mips/kernel/signal32.c b/arch/mips/kernel/signal32.c index d69179c0d49d..19a7705f2a01 100644 --- a/arch/mips/kernel/signal32.c +++ b/arch/mips/kernel/signal32.c @@ -220,7 +220,7 @@ static int restore_sigcontext32(struct pt_regs *regs, int i; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; err |= __get_user(regs->cp0_epc, &sc->sc_pc); err |= __get_user(regs->hi, &sc->sc_mdhi); diff --git a/arch/mn10300/include/asm/thread_info.h b/arch/mn10300/include/asm/thread_info.h index bf280eaccd36..c1c374f0ec12 100644 --- a/arch/mn10300/include/asm/thread_info.h +++ b/arch/mn10300/include/asm/thread_info.h @@ -50,7 +50,6 @@ struct thread_info { 0-0xBFFFFFFF for user-thead 0-0xFFFFFFFF for kernel-thread */ - struct restart_block restart_block; __u8 supervisor_stack[0]; }; @@ -80,9 +79,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/mn10300/kernel/asm-offsets.c b/arch/mn10300/kernel/asm-offsets.c index 47b3bb0c04ff..d780670cbaf3 100644 --- a/arch/mn10300/kernel/asm-offsets.c +++ b/arch/mn10300/kernel/asm-offsets.c @@ -28,7 +28,6 @@ void foo(void) OFFSET(TI_cpu, thread_info, cpu); OFFSET(TI_preempt_count, thread_info, preempt_count); OFFSET(TI_addr_limit, thread_info, addr_limit); - OFFSET(TI_restart_block, thread_info, restart_block); BLANK(); OFFSET(REG_D0, pt_regs, d0); diff --git a/arch/mn10300/kernel/signal.c b/arch/mn10300/kernel/signal.c index a6c0858592c3..8609845f12c5 100644 --- a/arch/mn10300/kernel/signal.c +++ b/arch/mn10300/kernel/signal.c @@ -40,7 +40,7 @@ static int restore_sigcontext(struct pt_regs *regs, unsigned int err = 0; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; if (is_using_fpu(current)) fpu_kill_state(current); diff --git a/arch/openrisc/include/asm/thread_info.h b/arch/openrisc/include/asm/thread_info.h index d797acc901e4..875f0845a707 100644 --- 
a/arch/openrisc/include/asm/thread_info.h +++ b/arch/openrisc/include/asm/thread_info.h @@ -57,7 +57,6 @@ struct thread_info { 0-0x7FFFFFFF for user-thead 0-0xFFFFFFFF for kernel-thread */ - struct restart_block restart_block; __u8 supervisor_stack[0]; /* saved context data */ @@ -79,9 +78,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = 1, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ .ksp = 0, \ } diff --git a/arch/openrisc/kernel/signal.c b/arch/openrisc/kernel/signal.c index 7d1b8235bf90..4112175bf803 100644 --- a/arch/openrisc/kernel/signal.c +++ b/arch/openrisc/kernel/signal.c @@ -46,7 +46,7 @@ static int restore_sigcontext(struct pt_regs *regs, int err = 0; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* * Restore the regs from &sc->regs. diff --git a/arch/parisc/include/asm/thread_info.h b/arch/parisc/include/asm/thread_info.h index a84611835549..fb13e3865563 100644 --- a/arch/parisc/include/asm/thread_info.h +++ b/arch/parisc/include/asm/thread_info.h @@ -14,7 +14,6 @@ struct thread_info { mm_segment_t addr_limit; /* user-level address space limit */ __u32 cpu; /* current CPU */ int preempt_count; /* 0=premptable, <0=BUG; will also serve as bh-counter */ - struct restart_block restart_block; }; #define INIT_THREAD_INFO(tsk) \ @@ -25,9 +24,6 @@ struct thread_info { .cpu = 0, \ .addr_limit = KERNEL_DS, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .restart_block = { \ - .fn = do_no_restart_syscall \ - } \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c index 012d4fa63d97..9b910a0251b8 100644 --- a/arch/parisc/kernel/signal.c +++ b/arch/parisc/kernel/signal.c @@ -99,7 +99,7 @@ sys_rt_sigreturn(struct pt_regs *regs, int in_syscall) sigframe_size = PARISC_RT_SIGFRAME_SIZE32; #endif - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* Unwind the user stack to get the rt_sigframe structure. 
*/ frame = (struct rt_sigframe __user *) diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index e8abc83e699f..72489799cf02 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -43,7 +43,6 @@ struct thread_info { int cpu; /* cpu we're on */ int preempt_count; /* 0 => preemptable, <0 => BUG */ - struct restart_block restart_block; unsigned long local_flags; /* private flags for thread */ /* low level flags - has atomic operations done on it */ @@ -59,9 +58,6 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ .flags = 0, \ } diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index b171001698ff..d3a831ac0f92 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -1231,7 +1231,7 @@ long sys_rt_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8, int tm_restore = 0; #endif /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; rt_sf = (struct rt_sigframe __user *) (regs->gpr[1] + __SIGNAL_FRAMESIZE + 16); @@ -1504,7 +1504,7 @@ long sys_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8, #endif /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; sf = (struct sigframe __user *)(regs->gpr[1] + __SIGNAL_FRAMESIZE); sc = &sf->sctx; diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index 2cb0c94cafa5..c7c24d2e2bdb 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -666,7 +666,7 @@ int sys_rt_sigreturn(unsigned long r3, unsigned long r4, unsigned long r5, #endif /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; if (!access_ok(VERIFY_READ, uc, sizeof(*uc))) goto badframe; diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h index 4d62fd5b56e5..ef1df718642d 100644 --- a/arch/s390/include/asm/thread_info.h +++ b/arch/s390/include/asm/thread_info.h @@ -39,7 +39,6 @@ struct thread_info { unsigned long sys_call_table; /* System call table address */ unsigned int cpu; /* current CPU */ int preempt_count; /* 0 => preemptable, <0 => BUG */ - struct restart_block restart_block; unsigned int system_call; __u64 user_timer; __u64 system_timer; @@ -56,9 +55,6 @@ struct thread_info { .flags = 0, \ .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c index 34d5fa7b01b5..bc1df12dd4f8 100644 --- a/arch/s390/kernel/compat_signal.c +++ b/arch/s390/kernel/compat_signal.c @@ -209,7 +209,7 @@ static int restore_sigregs32(struct pt_regs *regs,_sigregs32 __user *sregs) int i; /* Alwys make any pending restarted system call return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; if (__copy_from_user(&user_sregs, &sregs->regs, sizeof(user_sregs))) return -EFAULT; diff --git 
a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index 6a2ac257d98f..b3ae6f70c6d6 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -162,7 +162,7 @@ static int restore_sigregs(struct pt_regs *regs, _sigregs __user *sregs) _sigregs user_sregs; /* Alwys make any pending restarted system call return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; if (__copy_from_user(&user_sregs, sregs, sizeof(user_sregs))) return -EFAULT; diff --git a/arch/score/include/asm/thread_info.h b/arch/score/include/asm/thread_info.h index 656b7ada9326..33864fa2a8d4 100644 --- a/arch/score/include/asm/thread_info.h +++ b/arch/score/include/asm/thread_info.h @@ -42,7 +42,6 @@ struct thread_info { * 0-0xFFFFFFFF for kernel-thread */ mm_segment_t addr_limit; - struct restart_block restart_block; struct pt_regs *regs; }; @@ -58,9 +57,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = 1, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/score/kernel/asm-offsets.c b/arch/score/kernel/asm-offsets.c index 57788f44c6fb..b4d5214a7a7e 100644 --- a/arch/score/kernel/asm-offsets.c +++ b/arch/score/kernel/asm-offsets.c @@ -106,7 +106,6 @@ void output_thread_info_defines(void) OFFSET(TI_CPU, thread_info, cpu); OFFSET(TI_PRE_COUNT, thread_info, preempt_count); OFFSET(TI_ADDR_LIMIT, thread_info, addr_limit); - OFFSET(TI_RESTART_BLOCK, thread_info, restart_block); OFFSET(TI_REGS, thread_info, regs); DEFINE(KERNEL_STACK_SIZE, THREAD_SIZE); DEFINE(KERNEL_STACK_MASK, THREAD_MASK); diff --git a/arch/score/kernel/signal.c b/arch/score/kernel/signal.c index 1651807774ad..e381c8c4ff65 100644 --- a/arch/score/kernel/signal.c +++ b/arch/score/kernel/signal.c @@ -141,7 +141,7 @@ score_rt_sigreturn(struct pt_regs *regs) int sig; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; frame = (struct rt_sigframe __user *) regs->regs[0]; if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) diff --git a/arch/sh/include/asm/thread_info.h b/arch/sh/include/asm/thread_info.h index ad27ffa65e2e..657c03919627 100644 --- a/arch/sh/include/asm/thread_info.h +++ b/arch/sh/include/asm/thread_info.h @@ -33,7 +33,6 @@ struct thread_info { __u32 cpu; int preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; /* thread address space */ - struct restart_block restart_block; unsigned long previous_sp; /* sp of previous stack in case of nested IRQ stacks */ __u8 supervisor_stack[0]; @@ -63,9 +62,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/sh/kernel/asm-offsets.c b/arch/sh/kernel/asm-offsets.c index 08a2be775b6c..542225fedb11 100644 --- a/arch/sh/kernel/asm-offsets.c +++ b/arch/sh/kernel/asm-offsets.c @@ -25,7 +25,6 @@ int main(void) DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); DEFINE(TI_PRE_COUNT, offsetof(struct thread_info, preempt_count)); - DEFINE(TI_RESTART_BLOCK,offsetof(struct thread_info, restart_block)); DEFINE(TI_SIZE, sizeof(struct thread_info)); #ifdef CONFIG_HIBERNATION diff --git a/arch/sh/kernel/signal_32.c 
b/arch/sh/kernel/signal_32.c index 2f002b24fb92..0b34f2a704fe 100644 --- a/arch/sh/kernel/signal_32.c +++ b/arch/sh/kernel/signal_32.c @@ -156,7 +156,7 @@ asmlinkage int sys_sigreturn(void) int r0; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; @@ -186,7 +186,7 @@ asmlinkage int sys_rt_sigreturn(void) int r0; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; diff --git a/arch/sh/kernel/signal_64.c b/arch/sh/kernel/signal_64.c index 897abe7b871e..71993c6a7d94 100644 --- a/arch/sh/kernel/signal_64.c +++ b/arch/sh/kernel/signal_64.c @@ -260,7 +260,7 @@ asmlinkage int sys_sigreturn(unsigned long r2, unsigned long r3, long long ret; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; @@ -294,7 +294,7 @@ asmlinkage int sys_rt_sigreturn(unsigned long r2, unsigned long r3, long long ret; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; diff --git a/arch/sparc/include/asm/thread_info_32.h b/arch/sparc/include/asm/thread_info_32.h index 025c98446b1e..fd7bd0a440ca 100644 --- a/arch/sparc/include/asm/thread_info_32.h +++ b/arch/sparc/include/asm/thread_info_32.h @@ -47,8 +47,6 @@ struct thread_info { struct reg_window32 reg_window[NSWINS]; /* align for ldd! 
*/ unsigned long rwbuf_stkptrs[NSWINS]; unsigned long w_saved; - - struct restart_block restart_block; }; /* @@ -62,9 +60,6 @@ struct thread_info { .flags = 0, \ .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) @@ -103,7 +98,6 @@ register struct thread_info *current_thread_info_reg asm("g6"); #define TI_REG_WINDOW 0x30 #define TI_RWIN_SPTRS 0x230 #define TI_W_SAVED 0x250 -/* #define TI_RESTART_BLOCK 0x25n */ /* Nobody cares */ /* * thread information flag bit numbers diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h index 798f0279a4b5..ff455164732a 100644 --- a/arch/sparc/include/asm/thread_info_64.h +++ b/arch/sparc/include/asm/thread_info_64.h @@ -58,8 +58,6 @@ struct thread_info { unsigned long gsr[7]; unsigned long xfsr[7]; - struct restart_block restart_block; - struct pt_regs *kern_una_regs; unsigned int kern_una_insn; @@ -92,10 +90,9 @@ struct thread_info { #define TI_RWIN_SPTRS 0x000003c8 #define TI_GSR 0x00000400 #define TI_XFSR 0x00000438 -#define TI_RESTART_BLOCK 0x00000470 -#define TI_KUNA_REGS 0x000004a0 -#define TI_KUNA_INSN 0x000004a8 -#define TI_FPREGS 0x000004c0 +#define TI_KUNA_REGS 0x00000470 +#define TI_KUNA_INSN 0x00000478 +#define TI_FPREGS 0x00000480 /* We embed this in the uppermost byte of thread_info->flags */ #define FAULT_CODE_WRITE 0x01 /* Write access, implies D-TLB */ @@ -124,9 +121,6 @@ struct thread_info { .current_ds = ASI_P, \ .exec_domain = &default_exec_domain, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c index 62deba7be1a9..4eed773a7735 100644 --- a/arch/sparc/kernel/signal32.c +++ b/arch/sparc/kernel/signal32.c @@ -150,7 +150,7 @@ void do_sigreturn32(struct pt_regs *regs) int err, i; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; synchronize_user_stack(); @@ -235,7 +235,7 @@ asmlinkage void do_rt_sigreturn32(struct pt_regs *regs) int err, i; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; synchronize_user_stack(); regs->u_regs[UREG_FP] &= 0x00000000ffffffffUL; diff --git a/arch/sparc/kernel/signal_32.c b/arch/sparc/kernel/signal_32.c index 9ee72fc8e0e4..52aa5e4ce5e7 100644 --- a/arch/sparc/kernel/signal_32.c +++ b/arch/sparc/kernel/signal_32.c @@ -70,7 +70,7 @@ asmlinkage void do_sigreturn(struct pt_regs *regs) int err; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; synchronize_user_stack(); diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c index 1a6999868031..d88beff47bab 100644 --- a/arch/sparc/kernel/signal_64.c +++ b/arch/sparc/kernel/signal_64.c @@ -254,7 +254,7 @@ void do_rt_sigreturn(struct pt_regs *regs) int err; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; synchronize_user_stack (); sf = (struct rt_signal_frame 
__user *) diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c index 981a769b9558..a27651e866e7 100644 --- a/arch/sparc/kernel/traps_64.c +++ b/arch/sparc/kernel/traps_64.c @@ -2730,8 +2730,6 @@ void __init trap_init(void) TI_NEW_CHILD != offsetof(struct thread_info, new_child) || TI_CURRENT_DS != offsetof(struct thread_info, current_ds) || - TI_RESTART_BLOCK != offsetof(struct thread_info, - restart_block) || TI_KUNA_REGS != offsetof(struct thread_info, kern_una_regs) || TI_KUNA_INSN != offsetof(struct thread_info, diff --git a/arch/tile/include/asm/thread_info.h b/arch/tile/include/asm/thread_info.h index 48e4fd0f38e4..96c14c1430d8 100644 --- a/arch/tile/include/asm/thread_info.h +++ b/arch/tile/include/asm/thread_info.h @@ -36,7 +36,6 @@ struct thread_info { mm_segment_t addr_limit; /* thread address space (KERNEL_DS or USER_DS) */ - struct restart_block restart_block; struct single_step_state *step_state; /* single step state (if non-zero) */ int align_ctl; /* controls unaligned access */ @@ -57,9 +56,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ .step_state = NULL, \ .align_ctl = 0, \ } diff --git a/arch/tile/kernel/signal.c b/arch/tile/kernel/signal.c index bb0a9ce7ae23..8a524e332c1a 100644 --- a/arch/tile/kernel/signal.c +++ b/arch/tile/kernel/signal.c @@ -48,7 +48,7 @@ int restore_sigcontext(struct pt_regs *regs, int err; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* * Enforce that sigcontext is like pt_regs, and doesn't mess diff --git a/arch/um/include/asm/thread_info.h b/arch/um/include/asm/thread_info.h index 1c5b2a83046a..e04114c4fcd9 100644 --- a/arch/um/include/asm/thread_info.h +++ b/arch/um/include/asm/thread_info.h @@ -22,7 +22,6 @@ struct thread_info { mm_segment_t addr_limit; /* thread address space: 0-0xBFFFFFFF for user 0-0xFFFFFFFF for kernel */ - struct restart_block restart_block; struct thread_info *real_thread; /* Points to non-IRQ stack */ }; @@ -34,9 +33,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ .real_thread = NULL, \ } diff --git a/arch/unicore32/include/asm/thread_info.h b/arch/unicore32/include/asm/thread_info.h index af36d8eabdf1..63e2839dfeb8 100644 --- a/arch/unicore32/include/asm/thread_info.h +++ b/arch/unicore32/include/asm/thread_info.h @@ -79,7 +79,6 @@ struct thread_info { #ifdef CONFIG_UNICORE_FPU_F64 struct fp_state fpstate __attribute__((aligned(8))); #endif - struct restart_block restart_block; }; #define INIT_THREAD_INFO(tsk) \ @@ -89,9 +88,6 @@ struct thread_info { .flags = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/unicore32/kernel/signal.c b/arch/unicore32/kernel/signal.c index 7c8fb7018dc6..d329f85766cc 100644 --- a/arch/unicore32/kernel/signal.c +++ b/arch/unicore32/kernel/signal.c @@ -105,7 +105,7 @@ asmlinkage int __sys_rt_sigreturn(struct pt_regs *regs) struct rt_sigframe __user *frame; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; 
/* * Since we stacked the signal on a 64-bit boundary, diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index f9e181aaba97..d0165c9a2932 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -169,7 +169,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, u32 tmp; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; get_user_try { /* diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index e82e95abc92b..1d4e4f279a32 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -31,7 +31,6 @@ struct thread_info { __u32 cpu; /* current CPU */ int saved_preempt_count; mm_segment_t addr_limit; - struct restart_block restart_block; void __user *sysenter_return; unsigned int sig_on_uaccess_error:1; unsigned int uaccess_err:1; /* uaccess failed */ @@ -45,9 +44,6 @@ struct thread_info { .cpu = 0, \ .saved_preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 2a33c8f68319..e5042463c1bc 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -69,7 +69,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned int err = 0; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; get_user_try { diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index 79d824551c1a..0c8c32bfd792 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c @@ -157,7 +157,7 @@ static int copy_sc_from_user(struct pt_regs *regs, int err, pid; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; err = copy_from_user(&sc, from, sizeof(sc)); if (err) diff --git a/arch/xtensa/include/asm/thread_info.h b/arch/xtensa/include/asm/thread_info.h index 470153e8547c..a9b5d3ba196c 100644 --- a/arch/xtensa/include/asm/thread_info.h +++ b/arch/xtensa/include/asm/thread_info.h @@ -51,7 +51,6 @@ struct thread_info { __s32 preempt_count; /* 0 => preemptable,< 0 => BUG*/ mm_segment_t addr_limit; /* thread address space */ - struct restart_block restart_block; unsigned long cpenable; @@ -72,7 +71,6 @@ struct thread_info { #define TI_CPU 0x00000010 #define TI_PRE_COUNT 0x00000014 #define TI_ADDR_LIMIT 0x00000018 -#define TI_RESTART_BLOCK 0x000001C #endif @@ -90,9 +88,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c index 4612321c73cc..3d733ba16f28 100644 --- a/arch/xtensa/kernel/signal.c +++ b/arch/xtensa/kernel/signal.c @@ -245,7 +245,7 @@ asmlinkage long xtensa_rt_sigreturn(long a0, long a1, long a2, long a3, int ret; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; if (regs->depc > 64) 
panic("rt_sigreturn in double exception!\n"); diff --git a/fs/select.c b/fs/select.c index 467bb1cb3ea5..f684c750e08a 100644 --- a/fs/select.c +++ b/fs/select.c @@ -971,7 +971,7 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, if (ret == -EINTR) { struct restart_block *restart_block; - restart_block = ¤t_thread_info()->restart_block; + restart_block = ¤t->restart_block; restart_block->fn = do_restart_poll; restart_block->poll.ufds = ufds; restart_block->poll.nfds = nfds; diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 3037fc085e8e..d3d43ecf148c 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -193,6 +193,9 @@ extern struct task_group root_task_group; .nr_cpus_allowed= NR_CPUS, \ .mm = NULL, \ .active_mm = &init_mm, \ + .restart_block = { \ + .fn = do_no_restart_syscall, \ + }, \ .se = { \ .group_node = LIST_HEAD_INIT(tsk.se.group_node), \ }, \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 8db31ef98d2f..22ee0d5d7f8c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1370,6 +1370,8 @@ struct task_struct { unsigned long atomic_flags; /* Flags needing atomic access. */ + struct restart_block restart_block; + pid_t pid; pid_t tgid; diff --git a/kernel/compat.c b/kernel/compat.c index ebb3c369d03d..24f00610c575 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -276,8 +276,7 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, * core implementation decides to return random nonsense. */ if (ret == -ERESTART_RESTARTBLOCK) { - struct restart_block *restart - = ¤t_thread_info()->restart_block; + struct restart_block *restart = ¤t->restart_block; restart->fn = compat_nanosleep_restart; restart->nanosleep.compat_rmtp = rmtp; @@ -860,7 +859,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, return -EFAULT; if (err == -ERESTART_RESTARTBLOCK) { - restart = ¤t_thread_info()->restart_block; + restart = ¤t->restart_block; restart->fn = compat_clock_nanosleep_restart; restart->nanosleep.compat_rmtp = rmtp; } diff --git a/kernel/futex.c b/kernel/futex.c index 4eeb63de7e54..2a5e3830e953 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2217,7 +2217,7 @@ retry: if (!abs_time) goto out; - restart = ¤t_thread_info()->restart_block; + restart = ¤t->restart_block; restart->fn = futex_wait_restart; restart->futex.uaddr = uaddr; restart->futex.val = val; diff --git a/kernel/signal.c b/kernel/signal.c index 16a305295256..33a52759cc0e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2501,7 +2501,7 @@ EXPORT_SYMBOL(unblock_all_signals); */ SYSCALL_DEFINE0(restart_syscall) { - struct restart_block *restart = ¤t_thread_info()->restart_block; + struct restart_block *restart = ¤t->restart_block; return restart->fn(restart); } diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index a7077d3ae52f..1b001ed1edb9 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -788,7 +788,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, goto out; } - restart = ¤t_thread_info()->restart_block; + restart = ¤t->restart_block; restart->fn = alarm_timer_nsleep_restart; restart->nanosleep.clockid = type; restart->nanosleep.expires = exp.tv64; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 3f5e183c3d97..bee0c1f78091 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1583,7 +1583,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, goto out; } - restart = 
&current_thread_info()->restart_block; + restart = &current->restart_block; restart->fn = hrtimer_nanosleep_restart; restart->nanosleep.clockid = t.timer.base->clockid; restart->nanosleep.rmtp = rmtp; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index a16b67859e2a..0075da74abf0 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1334,8 +1334,7 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block); static int posix_cpu_nsleep(const clockid_t which_clock, int flags, struct timespec *rqtp, struct timespec __user *rmtp) { - struct restart_block *restart_block = - &current_thread_info()->restart_block; + struct restart_block *restart_block = &current->restart_block; struct itimerspec it; int error; -- cgit v1.2.3 From dd4a5c1e6531b9c6ba6ee1f3cb11f0aeea25881e Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 12 Feb 2015 15:01:22 -0800 Subject: linux/types.h: Always use unsigned long for pgoff_t Everybody uses unsigned long for pgoff_t, and no one ever overrode the definition of pgoff_t. Keep it that way, and remove the option of overriding it. Signed-off-by: Geert Uytterhoeven Cc: Randy Dunlap Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/types.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/types.h b/include/linux/types.h index 62323825cff9..6747247e3f9f 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -135,12 +135,9 @@ typedef unsigned long blkcnt_t; #endif /* - * The type of an index into the pagecache. Use a #define so asm/types.h - * can override it. + * The type of an index into the pagecache. */ -#ifndef pgoff_t #define pgoff_t unsigned long -#endif /* A dma_addr_t can hold any valid DMA or bus address for the platform */ #ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT -- cgit v1.2.3 From 545a2bf742fb41f17d03486dd8a8c74ad511dec2 Mon Sep 17 00:00:00 2001 From: Cyril Bur Date: Thu, 12 Feb 2015 15:01:24 -0800 Subject: kernel/sched/clock.c: add another clock for use with the soft lockup watchdog When the hypervisor pauses a virtualised kernel, the kernel will observe a jump in timebase; this can cause spurious messages from the softlockup detector. Whilst these messages are harmless, they are accompanied by a stack trace which causes undue concern; more problematically, the stack trace in the guest has nothing to do with the observed problem and can only be misleading. Furthermore, on POWER8 this is completely avoidable with the introduction of the Virtual Time Base (VTB) register. This patch (of 2): This permits the use of arch-specific clocks with which virtualised kernels can use their notion of 'running' time, not the elapsed wall time, which will include host execution time.
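A rough sketch of how an architecture could override the weak hook introduced below (hypothetical, not part of this patch: stolen_time_ns() is a made-up helper standing in for whatever the platform actually provides, e.g. VTB-derived accounting on POWER8; local_clock() is the existing scheduler clock):

u64 running_clock(void)
{
	/*
	 * Report only time the guest actually ran: subtract the time
	 * the hypervisor kept the guest paused (stolen_time_ns() is a
	 * made-up stand-in) from the local scheduler clock.
	 */
	return local_clock() - stolen_time_ns();
}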
Signed-off-by: Cyril Bur Cc: Michael Ellerman Cc: Andrew Jones Acked-by: Don Zickus Cc: Ingo Molnar Cc: Ulrich Obergfell Cc: chai wen Cc: Fabian Frederick Cc: Aaron Tomlin Cc: Ben Zhang Cc: Martin Schwidefsky Cc: John Stultz Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + kernel/sched/clock.c | 13 +++++++++++++ kernel/watchdog.c | 2 +- 3 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 22ee0d5d7f8c..048b91b983ed 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2147,6 +2147,7 @@ extern unsigned long long notrace sched_clock(void); */ extern u64 cpu_clock(int cpu); extern u64 local_clock(void); +extern u64 running_clock(void); extern u64 sched_clock_cpu(int cpu); diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index c27e4f8f4879..c0a205101c23 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -420,3 +420,16 @@ u64 local_clock(void) EXPORT_SYMBOL_GPL(cpu_clock); EXPORT_SYMBOL_GPL(local_clock); + +/* + * Running clock - returns the time that has elapsed while a guest has been + * running. + * On a guest this value should be local_clock minus the time the guest was + * suspended by the hypervisor (for any reason). + * On bare metal this function should return the same as local_clock. + * Architectures and sub-architectures can override this. + */ +u64 __weak running_clock(void) +{ + return local_clock(); +} diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 70bf11815f84..3174bf8e3538 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -154,7 +154,7 @@ static int get_softlockup_thresh(void) */ static unsigned long get_timestamp(void) { - return local_clock() >> 30LL; /* 2^30 ~= 10^9 */ + return running_clock() >> 30LL; /* 2^30 ~= 10^9 */ } static void set_sample_period(void) -- cgit v1.2.3 From 02f1f2170d2831b3233e91091c60a66622f29e82 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 12 Feb 2015 15:01:31 -0800 Subject: kernel.h: remove ancient __FUNCTION__ hack __FUNCTION__ hasn't been treated as a string literal since gcc 3.4, so this only helps people who only test-compile using 3.3 (compiler-gcc3.h barks at anything older than that). Besides, there are almost no occurrences of __FUNCTION__ left in the tree. 
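For reference, a minimal sketch of the idiom the tree-wide conversion settles on (demo_probe() is a made-up function; __func__ is the standard C99 name, __FUNCTION__ was only ever a GCC spelling of the same identifier):

static int demo_probe(int id)
{
	if (id < 0) {
		/* __func__ expands to "demo_probe" here */
		pr_err("%s: invalid id %d\n", __func__, id);
		return -EINVAL;
	}
	return 0;
}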
[akpm@linux-foundation.org: convert remaining __FUNCTION__ references] Signed-off-by: Rasmus Villemoes Cc: Michal Nazarewicz Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/hpet.c | 2 +- arch/x86/kernel/rtc.c | 4 ++-- arch/x86/platform/intel-mid/intel_mid_vrtc.c | 2 +- drivers/acpi/acpica/utdebug.c | 4 ++-- drivers/block/xen-blkfront.c | 2 +- include/acpi/acoutput.h | 6 +++--- include/linux/kernel.h | 3 --- 7 files changed, 10 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 319bcb9372fe..3acbff4716b0 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -168,7 +168,7 @@ static void _hpet_print_config(const char *function, int line) #define hpet_print_config() \ do { \ if (hpet_verbose) \ - _hpet_print_config(__FUNCTION__, __LINE__); \ + _hpet_print_config(__func__, __LINE__); \ } while (0) /* diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index fe3dbfe0c4a5..cd9685235df9 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -49,11 +49,11 @@ int mach_set_rtc_mmss(const struct timespec *now) retval = set_rtc_time(&tm); if (retval) printk(KERN_ERR "%s: RTC write failed with error %d\n", - __FUNCTION__, retval); + __func__, retval); } else { printk(KERN_ERR "%s: Invalid RTC value: write of %lx to RTC failed\n", - __FUNCTION__, nowtime); + __func__, nowtime); retval = -EINVAL; } return retval; diff --git a/arch/x86/platform/intel-mid/intel_mid_vrtc.c b/arch/x86/platform/intel-mid/intel_mid_vrtc.c index 4762cff7facd..32947ba0f62d 100644 --- a/arch/x86/platform/intel-mid/intel_mid_vrtc.c +++ b/arch/x86/platform/intel-mid/intel_mid_vrtc.c @@ -110,7 +110,7 @@ int vrtc_set_mmss(const struct timespec *now) spin_unlock_irqrestore(&rtc_lock, flags); } else { pr_err("%s: Invalid vRTC value: write of %lx to vRTC failed\n", - __FUNCTION__, now->tv_sec); + __func__, now->tv_sec); retval = -EINVAL; } return retval; diff --git a/drivers/acpi/acpica/utdebug.c b/drivers/acpi/acpica/utdebug.c index 57078e3ea9b7..4f3f888d33bb 100644 --- a/drivers/acpi/acpica/utdebug.c +++ b/drivers/acpi/acpica/utdebug.c @@ -111,8 +111,8 @@ void acpi_ut_track_stack_ptr(void) * RETURN: Updated pointer to the function name * * DESCRIPTION: Remove the "Acpi" prefix from the function name, if present. - * This allows compiler macros such as __FUNCTION__ to be used - * with no change to the debug output. + * This allows compiler macros such as __func__ to be used with no + * change to the debug output. * ******************************************************************************/ diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 2236c6f31608..d2cae5fc211a 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -1391,7 +1391,7 @@ static int blkfront_probe(struct xenbus_device *dev, if (major != XENVBD_MAJOR) { printk(KERN_INFO "%s: HVM does not support vbd %d as xen block device\n", - __FUNCTION__, vdevice); + __func__, vdevice); return -ENODEV; } } diff --git a/include/acpi/acoutput.h b/include/acpi/acoutput.h index 9318a87ee39a..a8f344363e77 100644 --- a/include/acpi/acoutput.h +++ b/include/acpi/acoutput.h @@ -240,7 +240,7 @@ /* * If ACPI_GET_FUNCTION_NAME was not defined in the compiler-dependent header, * define it now. This is the case where there the compiler does not support - * a __FUNCTION__ macro or equivalent. + * a __func__ macro or equivalent. 
*/ #ifndef ACPI_GET_FUNCTION_NAME #define ACPI_GET_FUNCTION_NAME _acpi_function_name @@ -249,12 +249,12 @@ * The Name parameter should be the procedure name as a quoted string. * The function name is also used by the function exit macros below. * Note: (const char) is used to be compatible with the debug interfaces - * and macros such as __FUNCTION__. + * and macros such as __func__. */ #define ACPI_FUNCTION_NAME(name) static const char _acpi_function_name[] = #name; #else -/* Compiler supports __FUNCTION__ (or equivalent) -- Ignore this macro */ +/* Compiler supports __func__ (or equivalent) -- Ignore this macro */ #define ACPI_FUNCTION_NAME(name) #endif /* ACPI_GET_FUNCTION_NAME */ diff --git a/include/linux/kernel.h b/include/linux/kernel.h index e42e7dc34c68..d6d630d31ef3 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -800,9 +800,6 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } const typeof( ((type *)0)->member ) *__mptr = (ptr); \ (type *)( (char *)__mptr - offsetof(type,member) );}) -/* Trap pasters of __FUNCTION__ at compile-time */ -#define __FUNCTION__ (__func__) - /* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */ #ifdef CONFIG_FTRACE_MCOUNT_RECORD # define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD -- cgit v1.2.3 From d1214c65c02d503330ce86bd38e344a36599e055 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 12 Feb 2015 15:01:50 -0800 Subject: libstring_helpers.c:string_get_size(): return void string_get_size() was documented to return an error, but in fact always returned 0. Since the output always fits in 9 bytes, just document that and let callers do what they do now: pass a small stack buffer and ignore the return value. Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/string_helpers.h | 4 ++-- lib/string_helpers.c | 10 ++++------ 2 files changed, 6 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h index 6eb567ac56bc..657571817260 100644 --- a/include/linux/string_helpers.h +++ b/include/linux/string_helpers.h @@ -10,8 +10,8 @@ enum string_size_units { STRING_UNITS_2, /* use binary powers of 2^10 */ }; -int string_get_size(u64 size, enum string_size_units units, - char *buf, int len); +void string_get_size(u64 size, enum string_size_units units, + char *buf, int len); #define UNESCAPE_SPACE 0x01 #define UNESCAPE_OCTAL 0x02 diff --git a/lib/string_helpers.c b/lib/string_helpers.c index 2b3757f84b3b..8f8c4417f228 100644 --- a/lib/string_helpers.c +++ b/lib/string_helpers.c @@ -20,12 +20,12 @@ * @len: length of buffer * * This function returns a string formatted to 3 significant figures - * giving the size in the required units. Returns 0 on success or - * error on failure. @buf is always zero terminated. + * giving the size in the required units. @buf should have room for + * at least 9 bytes and will always be zero terminated. 
* */ -int string_get_size(u64 size, const enum string_size_units units, - char *buf, int len) +void string_get_size(u64 size, const enum string_size_units units, + char *buf, int len) { static const char *const units_10[] = { "B", "kB", "MB", "GB", "TB", "PB", "EB" @@ -67,8 +67,6 @@ int string_get_size(u64 size, const enum string_size_units units, snprintf(buf, len, "%u%s %s", (u32)size, tmp, units_str[units][i]); - - return 0; } EXPORT_SYMBOL(string_get_size); -- cgit v1.2.3 From 8b4daad52fee7731ea4bae22a99ece2d4ba7ba43 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 12 Feb 2015 15:01:53 -0800 Subject: lib/bitmap.c: more signed->unsigned conversions For consistency with the other bitmap_* functions, also make the nbits parameter of bitmap_zero, bitmap_fill and bitmap_copy unsigned. Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 202e4034fe26..1406d5453781 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -185,33 +185,33 @@ extern int bitmap_print_to_pagebuf(bool list, char *buf, #define small_const_nbits(nbits) \ (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG) -static inline void bitmap_zero(unsigned long *dst, int nbits) +static inline void bitmap_zero(unsigned long *dst, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = 0UL; else { - int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); + unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); memset(dst, 0, len); } } -static inline void bitmap_fill(unsigned long *dst, int nbits) +static inline void bitmap_fill(unsigned long *dst, unsigned int nbits) { - size_t nlongs = BITS_TO_LONGS(nbits); + unsigned int nlongs = BITS_TO_LONGS(nbits); if (!small_const_nbits(nbits)) { - int len = (nlongs - 1) * sizeof(unsigned long); + unsigned int len = (nlongs - 1) * sizeof(unsigned long); memset(dst, 0xff, len); } dst[nlongs - 1] = BITMAP_LAST_WORD_MASK(nbits); } static inline void bitmap_copy(unsigned long *dst, const unsigned long *src, - int nbits) + unsigned int nbits) { if (small_const_nbits(nbits)) *dst = *src; else { - int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); + unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); memcpy(dst, src, len); } } -- cgit v1.2.3 From 33c4fa8c6763f1ba9f4ea64079882eaa6d7957b7 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 12 Feb 2015 15:01:56 -0800 Subject: linux/nodemask.h: update bitmap wrappers to take unsigned int Since the various bitmap_* functions now take an unsigned int as nbits parameter, it makes sense to also update the various wrappers. 
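Call sites are unaffected by the new signatures, since the nodes_*() macros always pass MAX_NUMNODES, a non-negative compile-time constant. A minimal usage sketch (nodemask_demo() is a made-up function):

static void nodemask_demo(void)
{
	nodemask_t mask;

	nodes_clear(mask);	/* expands to __nodes_clear(&mask, MAX_NUMNODES) */
	node_set(0, mask);
	if (!nodes_empty(mask))
		pr_info("%d node(s) set\n", nodes_weight(mask));
}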
Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/nodemask.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 83a6aeda899d..21cef483dc1b 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -120,13 +120,13 @@ static inline void __node_clear(int node, volatile nodemask_t *dstp) } #define nodes_setall(dst) __nodes_setall(&(dst), MAX_NUMNODES) -static inline void __nodes_setall(nodemask_t *dstp, int nbits) +static inline void __nodes_setall(nodemask_t *dstp, unsigned int nbits) { bitmap_fill(dstp->bits, nbits); } #define nodes_clear(dst) __nodes_clear(&(dst), MAX_NUMNODES) -static inline void __nodes_clear(nodemask_t *dstp, int nbits) +static inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits) { bitmap_zero(dstp->bits, nbits); } @@ -144,7 +144,7 @@ static inline int __node_test_and_set(int node, nodemask_t *addr) #define nodes_and(dst, src1, src2) \ __nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES) static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, - const nodemask_t *src2p, int nbits) + const nodemask_t *src2p, unsigned int nbits) { bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits); } @@ -152,7 +152,7 @@ static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, #define nodes_or(dst, src1, src2) \ __nodes_or(&(dst), &(src1), &(src2), MAX_NUMNODES) static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p, - const nodemask_t *src2p, int nbits) + const nodemask_t *src2p, unsigned int nbits) { bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits); } @@ -160,7 +160,7 @@ static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p, #define nodes_xor(dst, src1, src2) \ __nodes_xor(&(dst), &(src1), &(src2), MAX_NUMNODES) static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p, - const nodemask_t *src2p, int nbits) + const nodemask_t *src2p, unsigned int nbits) { bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits); } @@ -168,7 +168,7 @@ static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p, #define nodes_andnot(dst, src1, src2) \ __nodes_andnot(&(dst), &(src1), &(src2), MAX_NUMNODES) static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, - const nodemask_t *src2p, int nbits) + const nodemask_t *src2p, unsigned int nbits) { bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits); } @@ -176,7 +176,7 @@ static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, #define nodes_complement(dst, src) \ __nodes_complement(&(dst), &(src), MAX_NUMNODES) static inline void __nodes_complement(nodemask_t *dstp, - const nodemask_t *srcp, int nbits) + const nodemask_t *srcp, unsigned int nbits) { bitmap_complement(dstp->bits, srcp->bits, nbits); } @@ -184,7 +184,7 @@ static inline void __nodes_complement(nodemask_t *dstp, #define nodes_equal(src1, src2) \ __nodes_equal(&(src1), &(src2), MAX_NUMNODES) static inline int __nodes_equal(const nodemask_t *src1p, - const nodemask_t *src2p, int nbits) + const nodemask_t *src2p, unsigned int nbits) { return bitmap_equal(src1p->bits, src2p->bits, nbits); } @@ -192,7 +192,7 @@ static inline int __nodes_equal(const nodemask_t *src1p, #define nodes_intersects(src1, src2) \ __nodes_intersects(&(src1), &(src2), MAX_NUMNODES) static inline int __nodes_intersects(const nodemask_t *src1p, - const nodemask_t 
*src2p, int nbits) + const nodemask_t *src2p, unsigned int nbits) { return bitmap_intersects(src1p->bits, src2p->bits, nbits); } @@ -200,25 +200,25 @@ static inline int __nodes_intersects(const nodemask_t *src1p, #define nodes_subset(src1, src2) \ __nodes_subset(&(src1), &(src2), MAX_NUMNODES) static inline int __nodes_subset(const nodemask_t *src1p, - const nodemask_t *src2p, int nbits) + const nodemask_t *src2p, unsigned int nbits) { return bitmap_subset(src1p->bits, src2p->bits, nbits); } #define nodes_empty(src) __nodes_empty(&(src), MAX_NUMNODES) -static inline int __nodes_empty(const nodemask_t *srcp, int nbits) +static inline int __nodes_empty(const nodemask_t *srcp, unsigned int nbits) { return bitmap_empty(srcp->bits, nbits); } #define nodes_full(nodemask) __nodes_full(&(nodemask), MAX_NUMNODES) -static inline int __nodes_full(const nodemask_t *srcp, int nbits) +static inline int __nodes_full(const nodemask_t *srcp, unsigned int nbits) { return bitmap_full(srcp->bits, nbits); } #define nodes_weight(nodemask) __nodes_weight(&(nodemask), MAX_NUMNODES) -static inline int __nodes_weight(const nodemask_t *srcp, int nbits) +static inline int __nodes_weight(const nodemask_t *srcp, unsigned int nbits) { return bitmap_weight(srcp->bits, nbits); } -- cgit v1.2.3 From f5ac1f55204fec9d9c63644bc1de0ab6a59af9f1 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 12 Feb 2015 15:01:59 -0800 Subject: linux/cpumask.h: update bitmap wrappers to take unsigned int Since the various bitmap_* functions now take an unsigned int as nbits parameter, it makes sense to also update the various wrappers, even though they're marked as obsolete. Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpumask.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index b950e9d6008b..ff9044286d88 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -905,13 +905,13 @@ static inline void __cpu_clear(int cpu, volatile cpumask_t *dstp) } #define cpus_setall(dst) __cpus_setall(&(dst), NR_CPUS) -static inline void __cpus_setall(cpumask_t *dstp, int nbits) +static inline void __cpus_setall(cpumask_t *dstp, unsigned int nbits) { bitmap_fill(dstp->bits, nbits); } #define cpus_clear(dst) __cpus_clear(&(dst), NR_CPUS) -static inline void __cpus_clear(cpumask_t *dstp, int nbits) +static inline void __cpus_clear(cpumask_t *dstp, unsigned int nbits) { bitmap_zero(dstp->bits, nbits); } @@ -927,21 +927,21 @@ static inline int __cpu_test_and_set(int cpu, cpumask_t *addr) #define cpus_and(dst, src1, src2) __cpus_and(&(dst), &(src1), &(src2), NR_CPUS) static inline int __cpus_and(cpumask_t *dstp, const cpumask_t *src1p, - const cpumask_t *src2p, int nbits) + const cpumask_t *src2p, unsigned int nbits) { return bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits); } #define cpus_or(dst, src1, src2) __cpus_or(&(dst), &(src1), &(src2), NR_CPUS) static inline void __cpus_or(cpumask_t *dstp, const cpumask_t *src1p, - const cpumask_t *src2p, int nbits) + const cpumask_t *src2p, unsigned int nbits) { bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits); } #define cpus_xor(dst, src1, src2) __cpus_xor(&(dst), &(src1), &(src2), NR_CPUS) static inline void __cpus_xor(cpumask_t *dstp, const cpumask_t *src1p, - const cpumask_t *src2p, int nbits) + const cpumask_t *src2p, unsigned int nbits) { bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits); } 
@@ -949,40 +949,40 @@ static inline void __cpus_xor(cpumask_t *dstp, const cpumask_t *src1p, #define cpus_andnot(dst, src1, src2) \ __cpus_andnot(&(dst), &(src1), &(src2), NR_CPUS) static inline int __cpus_andnot(cpumask_t *dstp, const cpumask_t *src1p, - const cpumask_t *src2p, int nbits) + const cpumask_t *src2p, unsigned int nbits) { return bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits); } #define cpus_equal(src1, src2) __cpus_equal(&(src1), &(src2), NR_CPUS) static inline int __cpus_equal(const cpumask_t *src1p, - const cpumask_t *src2p, int nbits) + const cpumask_t *src2p, unsigned int nbits) { return bitmap_equal(src1p->bits, src2p->bits, nbits); } #define cpus_intersects(src1, src2) __cpus_intersects(&(src1), &(src2), NR_CPUS) static inline int __cpus_intersects(const cpumask_t *src1p, - const cpumask_t *src2p, int nbits) + const cpumask_t *src2p, unsigned int nbits) { return bitmap_intersects(src1p->bits, src2p->bits, nbits); } #define cpus_subset(src1, src2) __cpus_subset(&(src1), &(src2), NR_CPUS) static inline int __cpus_subset(const cpumask_t *src1p, - const cpumask_t *src2p, int nbits) + const cpumask_t *src2p, unsigned int nbits) { return bitmap_subset(src1p->bits, src2p->bits, nbits); } #define cpus_empty(src) __cpus_empty(&(src), NR_CPUS) -static inline int __cpus_empty(const cpumask_t *srcp, int nbits) +static inline int __cpus_empty(const cpumask_t *srcp, unsigned int nbits) { return bitmap_empty(srcp->bits, nbits); } #define cpus_weight(cpumask) __cpus_weight(&(cpumask), NR_CPUS) -static inline int __cpus_weight(const cpumask_t *srcp, int nbits) +static inline int __cpus_weight(const cpumask_t *srcp, unsigned int nbits) { return bitmap_weight(srcp->bits, nbits); } -- cgit v1.2.3 From eb5698837881687841c9e477e4162ac3387c6b59 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 12 Feb 2015 15:02:01 -0800 Subject: lib/bitmap.c: update bitmap_onto to unsigned Change the nbits parameter of bitmap_onto to unsigned int for consistency with other bitmap_* functions. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 2 +- lib/bitmap.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 1406d5453781..d0c6214eb190 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -164,7 +164,7 @@ extern void bitmap_remap(unsigned long *dst, const unsigned long *src, extern int bitmap_bitremap(int oldbit, const unsigned long *old, const unsigned long *new, int bits); extern void bitmap_onto(unsigned long *dst, const unsigned long *orig, - const unsigned long *relmap, int bits); + const unsigned long *relmap, unsigned int bits); extern void bitmap_fold(unsigned long *dst, const unsigned long *orig, int sz, int bits); extern int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order); diff --git a/lib/bitmap.c b/lib/bitmap.c index 324ea9eab8c1..ee598a496895 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -1006,9 +1006,9 @@ EXPORT_SYMBOL(bitmap_bitremap); * All bits in @dst not set by the above rule are cleared. 
*/ void bitmap_onto(unsigned long *dst, const unsigned long *orig, - const unsigned long *relmap, int bits) + const unsigned long *relmap, unsigned int bits) { - int n, m; /* same meaning as in above comment */ + unsigned int n, m; /* same meaning as in above comment */ if (dst == orig) /* following doesn't handle inplace mappings */ return; -- cgit v1.2.3 From b26ad5836c3a0a9d456eb60b9f841ca15403ee59 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 12 Feb 2015 15:02:04 -0800 Subject: lib/bitmap.c: change parameters of bitmap_fold to unsigned Change the sz and nbits parameters of bitmap_fold to unsigned int for consistency with other bitmap_* functions, and to save another few bytes in the generated code. [akpm@linux-foundation.org: fix kerneldoc] Signed-off-by: Rasmus Villemoes Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 2 +- lib/bitmap.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index d0c6214eb190..95dcd2f76e1a 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -166,7 +166,7 @@ extern int bitmap_bitremap(int oldbit, extern void bitmap_onto(unsigned long *dst, const unsigned long *orig, const unsigned long *relmap, unsigned int bits); extern void bitmap_fold(unsigned long *dst, const unsigned long *orig, - int sz, int bits); + unsigned int sz, unsigned int nbits); extern int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order); extern void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order); extern int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order); diff --git a/lib/bitmap.c b/lib/bitmap.c index ee598a496895..b0d3e823dab3 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -1039,22 +1039,22 @@ EXPORT_SYMBOL(bitmap_onto); * @dst: resulting smaller bitmap * @orig: original larger bitmap * @sz: specified size - * @bits: number of bits in each of these bitmaps + * @nbits: number of bits in each of these bitmaps * * For each bit oldbit in @orig, set bit oldbit mod @sz in @dst. * Clear all other bits in @dst. See further the comment and * Example [2] for bitmap_onto() for why and how to use this. */ void bitmap_fold(unsigned long *dst, const unsigned long *orig, - int sz, int bits) + unsigned int sz, unsigned int nbits) { - int oldbit; + unsigned int oldbit; if (dst == orig) /* following doesn't handle inplace mappings */ return; - bitmap_zero(dst, bits); + bitmap_zero(dst, nbits); - for_each_set_bit(oldbit, orig, bits) + for_each_set_bit(oldbit, orig, nbits) set_bit(oldbit % sz, dst); } EXPORT_SYMBOL(bitmap_fold); -- cgit v1.2.3 From f6a1f5db8d7a7a94ff07251996959d27daba4ee7 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 12 Feb 2015 15:02:10 -0800 Subject: lib/bitmap.c: simplify bitmap_ord_to_pos Make the return value and the ord and nbits parameters of bitmap_ord_to_pos unsigned. Also, simplify the implementation and as a side effect make the result fully defined, returning nbits for ord >= weight, in analogy with what find_{first,next}_bit does. This is a better sentinel than the former ("unofficial") 0. No current users are affected by this change. 
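A sketch of the new sentinel behaviour (ord_to_pos_demo() is a made-up function): with bits 4 through 7 set, ordinals 0 through 3 map to positions 4 through 7, and any larger ordinal now yields nbits, mirroring how find_first_bit()/find_next_bit() return the bitmap size when no bit is found:

static void ord_to_pos_demo(void)
{
	DECLARE_BITMAP(map, 16);

	bitmap_zero(map, 16);
	bitmap_set(map, 4, 4);				/* set bits 4..7 */

	WARN_ON(bitmap_ord_to_pos(map, 3, 16) != 7);	/* 3rd set bit */
	WARN_ON(bitmap_ord_to_pos(map, 4, 16) != 16);	/* ord >= weight */
}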
Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 2 +- lib/bitmap.c | 28 +++++++++++----------------- 2 files changed, 12 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 95dcd2f76e1a..1e74fe7aa167 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -171,7 +171,7 @@ extern int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int extern void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order); extern int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order); extern void bitmap_copy_le(void *dst, const unsigned long *src, int nbits); -extern int bitmap_ord_to_pos(const unsigned long *bitmap, int n, int bits); +extern unsigned int bitmap_ord_to_pos(const unsigned long *bitmap, unsigned int ord, unsigned int nbits); extern int bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp, int nmaskbits); diff --git a/lib/bitmap.c b/lib/bitmap.c index 84d20b5c6bf1..e8a38bde7af9 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -771,34 +771,28 @@ static int bitmap_pos_to_ord(const unsigned long *buf, unsigned int pos, unsigne * bitmap_ord_to_pos - find position of n-th set bit in bitmap * @buf: pointer to bitmap * @ord: ordinal bit position (n-th set bit, n >= 0) - * @bits: number of valid bit positions in @buf + * @nbits: number of valid bit positions in @buf * * Map the ordinal offset of bit @ord in @buf to its position in @buf. - * Value of @ord should be in range 0 <= @ord < weight(buf), else - * results are undefined. + * Value of @ord should be in range 0 <= @ord < weight(buf). If @ord + * >= weight(buf), returns @nbits. * * If for example, just bits 4 through 7 are set in @buf, then @ord * values 0 through 3 will get mapped to 4 through 7, respectively, - * and all other @ord values return undefined values. When @ord value 3 + * and all other @ord values returns @nbits. When @ord value 3 * gets mapped to (returns) @pos value 7 in this example, that means * that the 3rd set bit (starting with 0th) is at position 7 in @buf. * - * The bit positions 0 through @bits are valid positions in @buf. + * The bit positions 0 through @nbits-1 are valid positions in @buf. */ -int bitmap_ord_to_pos(const unsigned long *buf, int ord, int bits) +unsigned int bitmap_ord_to_pos(const unsigned long *buf, unsigned int ord, unsigned int nbits) { - int pos = 0; + unsigned int pos; - if (ord >= 0 && ord < bits) { - int i; - - for (i = find_first_bit(buf, bits); - i < bits && ord > 0; - i = find_next_bit(buf, bits, i + 1)) - ord--; - if (i < bits && ord == 0) - pos = i; - } + for (pos = find_first_bit(buf, nbits); + pos < nbits && ord; + pos = find_next_bit(buf, nbits, pos + 1)) + ord--; return pos; } -- cgit v1.2.3 From 9814ec135dedb70a9daa41e68798d540d7ba71f2 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 12 Feb 2015 15:02:13 -0800 Subject: lib/bitmap.c: make the bits parameter of bitmap_remap unsigned Also, rename bits to nbits. Both changes for consistency with other bitmap_* functions. 
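For readers unfamiliar with the remap semantics being preserved here, a minimal sketch (hypothetical caller, assuming linux/bitmap.h; illustrative only):

static void remap_example(void)
{
        DECLARE_BITMAP(src, 8);
        DECLARE_BITMAP(dst, 8);
        DECLARE_BITMAP(old, 8);
        DECLARE_BITMAP(new, 8);

        bitmap_zero(src, 8);
        bitmap_zero(old, 8);
        bitmap_zero(new, 8);
        set_bit(1, src);        /* src has only bit 1 set */
        set_bit(1, old);        /* bit 1 is the 0th set bit of old... */
        set_bit(5, old);
        set_bit(2, new);        /* ...so it maps to bit 2, the 0th set bit of new */
        set_bit(7, new);

        bitmap_remap(dst, src, old, new, 8);    /* dst ends up with only bit 2 set */
}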
Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 2 +- lib/bitmap.c | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 1e74fe7aa167..5f5c00de39f0 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -160,7 +160,7 @@ extern int bitmap_parselist(const char *buf, unsigned long *maskp, extern int bitmap_parselist_user(const char __user *ubuf, unsigned int ulen, unsigned long *dst, int nbits); extern void bitmap_remap(unsigned long *dst, const unsigned long *src, - const unsigned long *old, const unsigned long *new, int bits); + const unsigned long *old, const unsigned long *new, unsigned int nbits); extern int bitmap_bitremap(int oldbit, const unsigned long *old, const unsigned long *new, int bits); extern void bitmap_onto(unsigned long *dst, const unsigned long *orig, diff --git a/lib/bitmap.c b/lib/bitmap.c index e8a38bde7af9..ad161a6c82db 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -803,7 +803,7 @@ unsigned int bitmap_ord_to_pos(const unsigned long *buf, unsigned int ord, unsig * @src: subset to be remapped * @old: defines domain of map * @new: defines range of map - * @bits: number of bits in each of these bitmaps + * @nbits: number of bits in each of these bitmaps * * Let @old and @new define a mapping of bit positions, such that * whatever position is held by the n-th set bit in @old is mapped @@ -831,22 +831,22 @@ unsigned int bitmap_ord_to_pos(const unsigned long *buf, unsigned int ord, unsig */ void bitmap_remap(unsigned long *dst, const unsigned long *src, const unsigned long *old, const unsigned long *new, - int bits) + unsigned int nbits) { - int oldbit, w; + unsigned int oldbit, w; if (dst == src) /* following doesn't handle inplace remaps */ return; - bitmap_zero(dst, bits); + bitmap_zero(dst, nbits); - w = bitmap_weight(new, bits); - for_each_set_bit(oldbit, src, bits) { - int n = bitmap_pos_to_ord(old, oldbit, bits); + w = bitmap_weight(new, nbits); + for_each_set_bit(oldbit, src, nbits) { + int n = bitmap_pos_to_ord(old, oldbit, nbits); if (n < 0 || w == 0) set_bit(oldbit, dst); /* identity map */ else - set_bit(bitmap_ord_to_pos(new, n % w, bits), dst); + set_bit(bitmap_ord_to_pos(new, n % w, nbits), dst); } } EXPORT_SYMBOL(bitmap_remap); -- cgit v1.2.3 From af3cd13501eb04ca61d017ff4406f1cbffafdc04 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 12 Feb 2015 15:02:15 -0800 Subject: lib/string.c: remove strnicmp() Now that all in-tree users of strnicmp have been converted to strncasecmp, the wrapper can be removed. 
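Out-of-tree code still using the wrapper converts mechanically; a hypothetical example caller (assuming linux/string.h):

/* before: return strnicmp(opt, "ro", 2) == 0; */
static bool opt_is_readonly(const char *opt)
{
        return strncasecmp(opt, "ro", 2) == 0;  /* direct, case-insensitive replacement */
}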
Signed-off-by: Rasmus Villemoes Cc: David Howells Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/frv/include/asm/string.h | 1 - arch/s390/include/asm/string.h | 1 - include/linux/string.h | 3 --- lib/string.c | 8 -------- 4 files changed, 13 deletions(-) (limited to 'include/linux') diff --git a/arch/frv/include/asm/string.h b/arch/frv/include/asm/string.h index 5ed310f64b7e..1f6c35990439 100644 --- a/arch/frv/include/asm/string.h +++ b/arch/frv/include/asm/string.h @@ -33,7 +33,6 @@ extern void *memcpy(void *, const void *, __kernel_size_t); #define __HAVE_ARCH_STRNCAT 1 #define __HAVE_ARCH_STRCMP 1 #define __HAVE_ARCH_STRNCMP 1 -#define __HAVE_ARCH_STRNICMP 1 #define __HAVE_ARCH_STRCHR 1 #define __HAVE_ARCH_STRRCHR 1 #define __HAVE_ARCH_STRSTR 1 diff --git a/arch/s390/include/asm/string.h b/arch/s390/include/asm/string.h index 7e2dcd7c57ef..8662f5c8e17f 100644 --- a/arch/s390/include/asm/string.h +++ b/arch/s390/include/asm/string.h @@ -44,7 +44,6 @@ extern char *strstr(const char *, const char *); #undef __HAVE_ARCH_STRCHR #undef __HAVE_ARCH_STRNCHR #undef __HAVE_ARCH_STRNCMP -#undef __HAVE_ARCH_STRNICMP #undef __HAVE_ARCH_STRPBRK #undef __HAVE_ARCH_STRSEP #undef __HAVE_ARCH_STRSPN diff --git a/include/linux/string.h b/include/linux/string.h index 2e22a2e58f3a..b9bc9a5d9e21 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -40,9 +40,6 @@ extern int strcmp(const char *,const char *); #ifndef __HAVE_ARCH_STRNCMP extern int strncmp(const char *,const char *,__kernel_size_t); #endif -#ifndef __HAVE_ARCH_STRNICMP -#define strnicmp strncasecmp -#endif #ifndef __HAVE_ARCH_STRCASECMP extern int strcasecmp(const char *s1, const char *s2); #endif diff --git a/lib/string.c b/lib/string.c index 10063300b830..3206d0178296 100644 --- a/lib/string.c +++ b/lib/string.c @@ -58,14 +58,6 @@ int strncasecmp(const char *s1, const char *s2, size_t len) } EXPORT_SYMBOL(strncasecmp); #endif -#ifndef __HAVE_ARCH_STRNICMP -#undef strnicmp -int strnicmp(const char *s1, const char *s2, size_t len) -{ - return strncasecmp(s1, s2, len); -} -EXPORT_SYMBOL(strnicmp); -#endif #ifndef __HAVE_ARCH_STRCASECMP int strcasecmp(const char *s1, const char *s2) -- cgit v1.2.3 From 114fc1afb2de7dec40da137dc2a55cd38fc220f2 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 12 Feb 2015 15:02:29 -0800 Subject: hexdump: make it return number of bytes placed in buffer This patch makes hexdump return the number of bytes placed in the buffer excluding trailing NUL. In the case of overflow it returns the desired amount of bytes to produce the entire dump. Thus, it mimics snprintf(). This will be useful for users that would like to repeat with a bigger buffer. 
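A plausible use of the snprintf()-like contract is the usual try/grow/retry pattern; a sketch assuming a sleeping allocation context (the helper name is made up, and linux/slab.h is assumed):

static char *dump_row(const void *buf, size_t len)
{
        int need, size = 64;
        char *line = kmalloc(size, GFP_KERNEL);

        if (!line)
                return NULL;
        need = hex_dump_to_buffer(buf, len, 16, 1, line, size, true);
        if (need >= size) {             /* truncated: grow and redo */
                char *big = krealloc(line, need + 1, GFP_KERNEL);

                if (!big) {
                        kfree(line);
                        return NULL;
                }
                line = big;
                size = need + 1;
                hex_dump_to_buffer(buf, len, 16, 1, line, size, true);
        }
        return line;
}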
[akpm@linux-foundation.org: fix printk warning] Signed-off-by: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 6 ++--- lib/hexdump.c | 73 +++++++++++++++++++++++++++++++++++++------------- lib/test-hexdump.c | 45 +++++++++++++++++++++++++++++++ 3 files changed, 103 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index 4d5bf5726578..baa3f97d8ce8 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -417,9 +417,9 @@ enum { DUMP_PREFIX_ADDRESS, DUMP_PREFIX_OFFSET }; -extern void hex_dump_to_buffer(const void *buf, size_t len, - int rowsize, int groupsize, - char *linebuf, size_t linebuflen, bool ascii); +extern int hex_dump_to_buffer(const void *buf, size_t len, int rowsize, + int groupsize, char *linebuf, size_t linebuflen, + bool ascii); #ifdef CONFIG_PRINTK extern void print_hex_dump(const char *level, const char *prefix_str, int prefix_type, int rowsize, int groupsize, diff --git a/lib/hexdump.c b/lib/hexdump.c index 4af53f73c7cc..7ea09699855d 100644 --- a/lib/hexdump.c +++ b/lib/hexdump.c @@ -97,22 +97,26 @@ EXPORT_SYMBOL(bin2hex); * * example output buffer: * 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f @ABCDEFGHIJKLMNO + * + * Return: + * The amount of bytes placed in the buffer without terminating NUL. If the + * output was truncated, then the return value is the number of bytes + * (excluding the terminating NUL) which would have been written to the final + * string if enough space had been available. */ -void hex_dump_to_buffer(const void *buf, size_t len, int rowsize, - int groupsize, char *linebuf, size_t linebuflen, - bool ascii) +int hex_dump_to_buffer(const void *buf, size_t len, int rowsize, int groupsize, + char *linebuf, size_t linebuflen, bool ascii) { const u8 *ptr = buf; int ngroups; u8 ch; int j, lx = 0; int ascii_column; + int ret; if (rowsize != 16 && rowsize != 32) rowsize = 16; - if (!len) - goto nil; if (len > rowsize) /* limit to one line at a time */ len = rowsize; if (!is_power_of_2(groupsize) || groupsize > 8) @@ -122,27 +126,50 @@ void hex_dump_to_buffer(const void *buf, size_t len, int rowsize, ngroups = len / groupsize; ascii_column = rowsize * 2 + rowsize / groupsize + 1; + + if (!linebuflen) + goto overflow1; + + if (!len) + goto nil; + if (groupsize == 8) { const u64 *ptr8 = buf; - for (j = 0; j < ngroups; j++) - lx += scnprintf(linebuf + lx, linebuflen - lx, - "%s%16.16llx", j ? " " : "", - (unsigned long long)*(ptr8 + j)); + for (j = 0; j < ngroups; j++) { + ret = snprintf(linebuf + lx, linebuflen - lx, + "%s%16.16llx", j ? " " : "", + (unsigned long long)*(ptr8 + j)); + if (ret >= linebuflen - lx) + goto overflow1; + lx += ret; + } } else if (groupsize == 4) { const u32 *ptr4 = buf; - for (j = 0; j < ngroups; j++) - lx += scnprintf(linebuf + lx, linebuflen - lx, - "%s%8.8x", j ? " " : "", *(ptr4 + j)); + for (j = 0; j < ngroups; j++) { + ret = snprintf(linebuf + lx, linebuflen - lx, + "%s%8.8x", j ? " " : "", + *(ptr4 + j)); + if (ret >= linebuflen - lx) + goto overflow1; + lx += ret; + } } else if (groupsize == 2) { const u16 *ptr2 = buf; - for (j = 0; j < ngroups; j++) - lx += scnprintf(linebuf + lx, linebuflen - lx, - "%s%4.4x", j ? " " : "", *(ptr2 + j)); + for (j = 0; j < ngroups; j++) { + ret = snprintf(linebuf + lx, linebuflen - lx, + "%s%4.4x", j ? 
" " : "", + *(ptr2 + j)); + if (ret >= linebuflen - lx) + goto overflow1; + lx += ret; + } } else { - for (j = 0; (j < len) && (lx + 3) <= linebuflen; j++) { + for (j = 0; j < len; j++) { + if (linebuflen < lx + 3) + goto overflow2; ch = ptr[j]; linebuf[lx++] = hex_asc_hi(ch); linebuf[lx++] = hex_asc_lo(ch); @@ -154,14 +181,24 @@ void hex_dump_to_buffer(const void *buf, size_t len, int rowsize, if (!ascii) goto nil; - while (lx < (linebuflen - 1) && lx < ascii_column) + while (lx < ascii_column) { + if (linebuflen < lx + 2) + goto overflow2; linebuf[lx++] = ' '; - for (j = 0; (j < len) && (lx + 2) < linebuflen; j++) { + } + for (j = 0; j < len; j++) { + if (linebuflen < lx + 2) + goto overflow2; ch = ptr[j]; linebuf[lx++] = (isascii(ch) && isprint(ch)) ? ch : '.'; } nil: + linebuf[lx] = '\0'; + return lx; +overflow2: linebuf[lx++] = '\0'; +overflow1: + return ascii ? ascii_column + len : (groupsize * 2 + 1) * ngroups - 1; } EXPORT_SYMBOL(hex_dump_to_buffer); diff --git a/lib/test-hexdump.c b/lib/test-hexdump.c index 9d3bd1e9ae48..daf29a390a89 100644 --- a/lib/test-hexdump.c +++ b/lib/test-hexdump.c @@ -114,6 +114,45 @@ static void __init test_hexdump_set(int rowsize, bool ascii) test_hexdump(len, rowsize, 1, ascii); } +static void __init test_hexdump_overflow(bool ascii) +{ + char buf[56]; + const char *t = test_data_1_le[0]; + size_t l = get_random_int() % sizeof(buf); + bool a; + int e, r; + + memset(buf, ' ', sizeof(buf)); + + r = hex_dump_to_buffer(data_b, 1, 16, 1, buf, l, ascii); + + if (ascii) + e = 50; + else + e = 2; + buf[e + 2] = '\0'; + + if (!l) { + a = r == e && buf[0] == ' '; + } else if (l < 3) { + a = r == e && buf[0] == '\0'; + } else if (l < 4) { + a = r == e && !strcmp(buf, t); + } else if (ascii) { + if (l < 51) + a = r == e && buf[l - 1] == '\0' && buf[l - 2] == ' '; + else + a = r == e && buf[50] == '\0' && buf[49] == '.'; + } else { + a = r == e && buf[e] == '\0'; + } + + if (!a) { + pr_err("Len: %zu rc: %u strlen: %zu\n", l, r, strlen(buf)); + pr_err("Result: '%s'\n", buf); + } +} + static int __init test_hexdump_init(void) { unsigned int i; @@ -129,6 +168,12 @@ static int __init test_hexdump_init(void) for (i = 0; i < 16; i++) test_hexdump_set(rowsize, true); + for (i = 0; i < 16; i++) + test_hexdump_overflow(false); + + for (i = 0; i < 16; i++) + test_hexdump_overflow(true); + return -EINVAL; } module_init(test_hexdump_init); -- cgit v1.2.3 From 3248340d3f10871d2ae2d1df9e323f284fa1fdb6 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 12 Feb 2015 15:02:40 -0800 Subject: lib/halfmd4.c: simplify includes We only need EXPORT_SYMBOL, so compiler.h and export.h suffice. This means linux/types.h is no longer implicitly included, so add an include of uapi/linux/types.h to linux/cryptohash.h for __u32. Other users of cryptohash.h cannot be affected, since they must already have been including uapi/linux/types.h in order for gcc not to complain about unknown types. 
Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cryptohash.h | 2 ++ lib/halfmd4.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cryptohash.h b/include/linux/cryptohash.h index 2cd9f1cf9fa3..f4754282c9c2 100644 --- a/include/linux/cryptohash.h +++ b/include/linux/cryptohash.h @@ -1,6 +1,8 @@ #ifndef __CRYPTOHASH_H #define __CRYPTOHASH_H +#include + #define SHA_DIGEST_WORDS 5 #define SHA_MESSAGE_BYTES (512 /*bits*/ / 8) #define SHA_WORKSPACE_WORDS 16 diff --git a/lib/halfmd4.c b/lib/halfmd4.c index 66d0ee8b7776..a8fe6274a13c 100644 --- a/lib/halfmd4.c +++ b/lib/halfmd4.c @@ -1,4 +1,4 @@ -#include +#include #include #include -- cgit v1.2.3 From 3810631332465d967ba5e27ea2c7dff2c9afac6c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 12 Feb 2015 23:33:15 +0100 Subject: PM / sleep: Re-implement suspend-to-idle handling In preparation for adding support for quiescing timers in the final stage of suspend-to-idle transitions, rework the freeze_enter() function making the system wait on a wakeup event, the freeze_wake() function terminating the suspend-to-idle loop and the mechanism by which deep idle states are entered during suspend-to-idle. First of all, introduce a simple state machine for suspend-to-idle and make the code in question use it. Second, prevent freeze_enter() from losing wakeup events due to race conditions and ensure that the number of online CPUs won't change while it is being executed. In addition to that, make it force all of the CPUs re-enter the idle loop in case they are in idle states already (so they can enter deeper idle states if possible). Next, drop cpuidle_use_deepest_state() and replace use_deepest_state checks in cpuidle_select() and cpuidle_reflect() with a single suspend-to-idle state check in cpuidle_idle_call(). Finally, introduce cpuidle_enter_freeze() that will simply find the deepest idle state available to the given CPU and enter it using cpuidle_enter(). Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) --- drivers/cpuidle/cpuidle.c | 49 +++++++++++++++++++++++++---------------------- include/linux/cpuidle.h | 4 ++-- include/linux/suspend.h | 16 ++++++++++++++++ kernel/power/suspend.c | 43 ++++++++++++++++++++++++++++++++++------- kernel/sched/idle.c | 16 ++++++++++++++++ 5 files changed, 96 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 125150dc6e81..23a8d6cc8d30 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include "cpuidle.h" @@ -32,7 +33,6 @@ LIST_HEAD(cpuidle_detected_devices); static int enabled_devices; static int off __read_mostly; static int initialized __read_mostly; -static bool use_deepest_state __read_mostly; int cpuidle_disabled(void) { @@ -66,24 +66,9 @@ int cpuidle_play_dead(void) } /** - * cpuidle_use_deepest_state - Enable/disable the "deepest idle" mode. - * @enable: Whether enable or disable the feature. - * - * If the "deepest idle" mode is enabled, cpuidle will ignore the governor and - * always use the state with the greatest exit latency (out of the states that - * are not disabled). - * - * This function can only be called after cpuidle_pause() to avoid races. 
- */ -void cpuidle_use_deepest_state(bool enable) -{ - use_deepest_state = enable; -} - -/** - * cpuidle_find_deepest_state - Find the state of the greatest exit latency. - * @drv: cpuidle driver for a given CPU. - * @dev: cpuidle device for a given CPU. + * cpuidle_find_deepest_state - Find deepest state meeting specific conditions. + * @drv: cpuidle driver for the given CPU. + * @dev: cpuidle device for the given CPU. */ static int cpuidle_find_deepest_state(struct cpuidle_driver *drv, struct cpuidle_device *dev) @@ -104,6 +89,27 @@ static int cpuidle_find_deepest_state(struct cpuidle_driver *drv, return ret; } +/** + * cpuidle_enter_freeze - Enter an idle state suitable for suspend-to-idle. + * + * Find the deepest state available and enter it. + */ +void cpuidle_enter_freeze(void) +{ + struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); + struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); + int index; + + index = cpuidle_find_deepest_state(drv, dev); + if (index >= 0) + cpuidle_enter(drv, dev, index); + else + arch_cpu_idle(); + + /* Interrupts are enabled again here. */ + local_irq_disable(); +} + /** * cpuidle_enter_state - enter the state and update stats * @dev: cpuidle device for this cpu @@ -166,9 +172,6 @@ int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) if (!drv || !dev || !dev->enabled) return -EBUSY; - if (unlikely(use_deepest_state)) - return cpuidle_find_deepest_state(drv, dev); - return cpuidle_curr_governor->select(drv, dev); } @@ -200,7 +203,7 @@ int cpuidle_enter(struct cpuidle_driver *drv, struct cpuidle_device *dev, */ void cpuidle_reflect(struct cpuidle_device *dev, int index) { - if (cpuidle_curr_governor->reflect && !unlikely(use_deepest_state)) + if (cpuidle_curr_governor->reflect) cpuidle_curr_governor->reflect(dev, index); } diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index ab70f3bc44ad..f63aabf4ee90 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -141,7 +141,7 @@ extern void cpuidle_resume(void); extern int cpuidle_enable_device(struct cpuidle_device *dev); extern void cpuidle_disable_device(struct cpuidle_device *dev); extern int cpuidle_play_dead(void); -extern void cpuidle_use_deepest_state(bool enable); +extern void cpuidle_enter_freeze(void); extern struct cpuidle_driver *cpuidle_get_cpu_driver(struct cpuidle_device *dev); #else @@ -174,7 +174,7 @@ static inline int cpuidle_enable_device(struct cpuidle_device *dev) {return -ENODEV; } static inline void cpuidle_disable_device(struct cpuidle_device *dev) { } static inline int cpuidle_play_dead(void) {return -ENODEV; } -static inline void cpuidle_use_deepest_state(bool enable) {} +static inline void cpuidle_enter_freeze(void) { } static inline struct cpuidle_driver *cpuidle_get_cpu_driver( struct cpuidle_device *dev) {return NULL; } #endif diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 3388c1b6f7d8..5efe743ce1e8 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -201,6 +201,21 @@ struct platform_freeze_ops { */ extern void suspend_set_ops(const struct platform_suspend_ops *ops); extern int suspend_valid_only_mem(suspend_state_t state); + +/* Suspend-to-idle state machnine. */ +enum freeze_state { + FREEZE_STATE_NONE, /* Not suspended/suspending. */ + FREEZE_STATE_ENTER, /* Enter suspend-to-idle. */ + FREEZE_STATE_WAKE, /* Wake up from suspend-to-idle. 
*/ +}; + +extern enum freeze_state __read_mostly suspend_freeze_state; + +static inline bool idle_should_freeze(void) +{ + return unlikely(suspend_freeze_state == FREEZE_STATE_ENTER); +} + extern void freeze_set_ops(const struct platform_freeze_ops *ops); extern void freeze_wake(void); @@ -228,6 +243,7 @@ extern int pm_suspend(suspend_state_t state); static inline void suspend_set_ops(const struct platform_suspend_ops *ops) {} static inline int pm_suspend(suspend_state_t state) { return -ENOSYS; } +static inline bool idle_should_freeze(void) { return false; } static inline void freeze_set_ops(const struct platform_freeze_ops *ops) {} static inline void freeze_wake(void) {} #endif /* !CONFIG_SUSPEND */ diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index c347e3ce3a55..b7d6b3a721b1 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -37,7 +37,9 @@ const char *pm_states[PM_SUSPEND_MAX]; static const struct platform_suspend_ops *suspend_ops; static const struct platform_freeze_ops *freeze_ops; static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); -static bool suspend_freeze_wake; + +enum freeze_state __read_mostly suspend_freeze_state; +static DEFINE_SPINLOCK(suspend_freeze_lock); void freeze_set_ops(const struct platform_freeze_ops *ops) { @@ -48,22 +50,49 @@ void freeze_set_ops(const struct platform_freeze_ops *ops) static void freeze_begin(void) { - suspend_freeze_wake = false; + suspend_freeze_state = FREEZE_STATE_NONE; } static void freeze_enter(void) { - cpuidle_use_deepest_state(true); + spin_lock_irq(&suspend_freeze_lock); + if (pm_wakeup_pending()) + goto out; + + suspend_freeze_state = FREEZE_STATE_ENTER; + spin_unlock_irq(&suspend_freeze_lock); + + get_online_cpus(); cpuidle_resume(); - wait_event(suspend_freeze_wait_head, suspend_freeze_wake); + + /* Push all the CPUs into the idle loop. */ + wake_up_all_idle_cpus(); + pr_debug("PM: suspend-to-idle\n"); + /* Make the current CPU wait so it can enter the idle loop too. */ + wait_event(suspend_freeze_wait_head, + suspend_freeze_state == FREEZE_STATE_WAKE); + pr_debug("PM: resume from suspend-to-idle\n"); + cpuidle_pause(); - cpuidle_use_deepest_state(false); + put_online_cpus(); + + spin_lock_irq(&suspend_freeze_lock); + + out: + suspend_freeze_state = FREEZE_STATE_NONE; + spin_unlock_irq(&suspend_freeze_lock); } void freeze_wake(void) { - suspend_freeze_wake = true; - wake_up(&suspend_freeze_wait_head); + unsigned long flags; + + spin_lock_irqsave(&suspend_freeze_lock, flags); + if (suspend_freeze_state > FREEZE_STATE_NONE) { + suspend_freeze_state = FREEZE_STATE_WAKE; + wake_up(&suspend_freeze_wait_head); + } + spin_unlock_irqrestore(&suspend_freeze_lock, flags); } EXPORT_SYMBOL_GPL(freeze_wake); diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index aaf1c1d5cf5d..94b2d7b88a27 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -7,6 +7,7 @@ #include #include #include +#include #include @@ -104,6 +105,21 @@ static void cpuidle_idle_call(void) */ rcu_idle_enter(); + /* + * Suspend-to-idle ("freeze") is a system state in which all user space + * has been frozen, all I/O devices have been suspended and the only + * activity happens here and in iterrupts (if any). In that case bypass + * the cpuidle governor and go stratight for the deepest idle state + * available. Possibly also suspend the local tick and the entire + * timekeeping to prevent timer interrupts from kicking us out of idle + * until a proper wakeup interrupt happens. 
+ */ + if (idle_should_freeze()) { + cpuidle_enter_freeze(); + local_irq_enable(); + goto exit_idle; + } + /* * Ask the cpuidle framework to choose a convenient idle state. * Fall back to the default arch idle method on errors. -- cgit v1.2.3 From f4086a3d789dbe18949862276d83b8f49fce6d2f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 13 Feb 2015 21:03:16 -0500 Subject: NFS: struct nfs_commit_info.lock must always point to inode->i_lock Commit 411a99adffb4f (nfs: clear_request_commit while holding i_lock) assumes that the nfs_commit_info always points to the inode->i_lock. For historical reasons, that is not the case for O_DIRECT writes. Cc: Weston Andros Adamson Fixes: 411a99adffb4f ("nfs: clear_request_commit while holding i_lock") Cc: stable@vger.kernel.org # 3.17.x Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 2 +- include/linux/nfs_xdr.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 7077521acdf4..e907c8cf732e 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -283,7 +283,7 @@ static void nfs_direct_release_pages(struct page **pages, unsigned int npages) void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, struct nfs_direct_req *dreq) { - cinfo->lock = &dreq->lock; + cinfo->lock = &dreq->inode->i_lock; cinfo->mds = &dreq->mds_cinfo; cinfo->ds = &dreq->ds_cinfo; cinfo->dreq = dreq; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 38d96ba935c2..9a39132fda49 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1351,7 +1351,7 @@ struct nfs_commit_completion_ops { }; struct nfs_commit_info { - spinlock_t *lock; + spinlock_t *lock; /* inode->i_lock */ struct nfs_mds_commit_info *mds; struct pnfs_ds_commit_info *ds; struct nfs_direct_req *dreq; /* O_DIRECT request */ -- cgit v1.2.3 From bf40e5561fd288a505d5d8d8bf45eef96fe7253d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 13 Feb 2015 21:40:27 -0500 Subject: NFSv4: Kill unused nfs_inode->delegation_state field Signed-off-by: Trond Myklebust --- fs/nfs/delegation.c | 4 ---- fs/nfs/inode.c | 1 - include/linux/nfs_fs.h | 1 - 3 files changed, 6 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 16b754ee0d09..4464eb06b0b6 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -175,7 +175,6 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, delegation->cred = get_rpccred(cred); clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); - NFS_I(inode)->delegation_state = delegation->type; spin_unlock(&delegation->lock); put_rpccred(oldcred); rcu_read_unlock(); @@ -270,7 +269,6 @@ nfs_detach_delegation_locked(struct nfs_inode *nfsi, set_bit(NFS_DELEGATION_RETURNING, &delegation->flags); list_del_rcu(&delegation->super_list); delegation->inode = NULL; - nfsi->delegation_state = 0; rcu_assign_pointer(nfsi->delegation, NULL); spin_unlock(&delegation->lock); return delegation; @@ -350,7 +348,6 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct &delegation->stateid)) { nfs_update_inplace_delegation(old_delegation, delegation); - nfsi->delegation_state = old_delegation->type; goto out; } /* @@ -374,7 +371,6 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct goto out; } list_add_rcu(&delegation->super_list, &server->delegations); - nfsi->delegation_state = delegation->type; rcu_assign_pointer(nfsi->delegation, delegation); delegation = 
NULL; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index d2398c193bda..e211f975a69a 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1776,7 +1776,6 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi) #if IS_ENABLED(CONFIG_NFS_V4) INIT_LIST_HEAD(&nfsi->open_states); nfsi->delegation = NULL; - nfsi->delegation_state = 0; init_rwsem(&nfsi->rwsem); nfsi->layout = NULL; #endif diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 6d627b92df53..2f77e0c651c8 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -180,7 +180,6 @@ struct nfs_inode { /* NFSv4 state */ struct list_head open_states; struct nfs_delegation __rcu *delegation; - fmode_t delegation_state; struct rw_semaphore rwsem; /* pNFS layout information */ -- cgit v1.2.3 From 9b6c2d2e2ba5280649eb043cbc7e3483c77e5d69 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 13 Feb 2015 14:35:57 -0800 Subject: lib/bitmap.c: change prototype of bitmap_copy_le Make the prototype of bitmap_copy_le the same as bitmap_copy's. All other bitmap_* functions take unsigned long* parameters; there's no reason this should be special. The only current user is the static inline uwb_mas_bm_copy_le, which already does the void* laundering, so the end users can pass their u8 or __le32 buffers without a cast. Furthermore, this allows us to simply let bitmap_copy_le be an alias for bitmap_copy on little-endian; see next patch. Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 2 +- lib/bitmap.c | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 5f5c00de39f0..334fe32d8f0e 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -170,7 +170,7 @@ extern void bitmap_fold(unsigned long *dst, const unsigned long *orig, extern int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order); extern void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order); extern int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order); -extern void bitmap_copy_le(void *dst, const unsigned long *src, int nbits); +extern void bitmap_copy_le(unsigned long *dst, const unsigned long *src, unsigned int nbits); extern unsigned int bitmap_ord_to_pos(const unsigned long *bitmap, unsigned int ord, unsigned int nbits); extern int bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp, int nmaskbits); diff --git a/lib/bitmap.c b/lib/bitmap.c index ad161a6c82db..e4ac20bec76c 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -1191,16 +1191,15 @@ EXPORT_SYMBOL(bitmap_allocate_region); * * Require nbits % BITS_PER_LONG == 0. */ -void bitmap_copy_le(void *dst, const unsigned long *src, int nbits) +void bitmap_copy_le(unsigned long *dst, const unsigned long *src, unsigned int nbits) { - unsigned long *d = dst; - int i; + unsigned int i; for (i = 0; i < nbits/BITS_PER_LONG; i++) { if (BITS_PER_LONG == 64) - d[i] = cpu_to_le64(src[i]); + dst[i] = cpu_to_le64(src[i]); else - d[i] = cpu_to_le32(src[i]); + dst[i] = cpu_to_le32(src[i]); } } EXPORT_SYMBOL(bitmap_copy_le); -- cgit v1.2.3 From e8f24278329dc31b3b8223c83a5465c9df153d9d Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 13 Feb 2015 14:36:00 -0800 Subject: lib/bitmap.c: elide bitmap_copy_le on little-endian On little-endian, there's no reason to have an extra, presumably less efficient, way of copying a bitmap. 
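Conceptually (editor's sketch, hypothetical helper assuming linux/bitmap.h), a caller exporting a bitmap in little-endian layout no longer cares which variant it gets:

static void copy_bitmap_for_le_device(unsigned long *dst,
                                      const unsigned long *src,
                                      unsigned int nbits)
{
        /*
         * Byte-swaps each word on big-endian; after the next patch this
         * is literally bitmap_copy() on little-endian, where cpu_to_le64()
         * and cpu_to_le32() are identity operations.
         */
        bitmap_copy_le(dst, src, nbits);        /* requires nbits % BITS_PER_LONG == 0 */
}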
Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 4 ++++ lib/bitmap.c | 2 ++ 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 334fe32d8f0e..cffc89c23c02 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -170,7 +170,11 @@ extern void bitmap_fold(unsigned long *dst, const unsigned long *orig, extern int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order); extern void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order); extern int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order); +#ifdef __BIG_ENDIAN extern void bitmap_copy_le(unsigned long *dst, const unsigned long *src, unsigned int nbits); +#else +#define bitmap_copy_le bitmap_copy +#endif extern unsigned int bitmap_ord_to_pos(const unsigned long *bitmap, unsigned int ord, unsigned int nbits); extern int bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp, int nmaskbits); diff --git a/lib/bitmap.c b/lib/bitmap.c index e4ac20bec76c..d2cd50cd4f5d 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -1191,6 +1191,7 @@ EXPORT_SYMBOL(bitmap_allocate_region); * * Require nbits % BITS_PER_LONG == 0. */ +#ifdef __BIG_ENDIAN void bitmap_copy_le(unsigned long *dst, const unsigned long *src, unsigned int nbits) { unsigned int i; @@ -1203,3 +1204,4 @@ void bitmap_copy_le(unsigned long *dst, const unsigned long *src, unsigned int n } } EXPORT_SYMBOL(bitmap_copy_le); +#endif -- cgit v1.2.3 From 2fbad29917c9852fa018d572cd3d43a13465d0f8 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 13 Feb 2015 14:36:02 -0800 Subject: lib: bitmap: change bitmap_shift_right to take unsigned parameters I've previously changed the nbits parameter of most bitmap_* functions to unsigned; now it is bitmap_shift_{left,right}'s turn. This alone saves some .text, but while at it I found that there were a few other things one could do. The end result of these seven patches is $ scripts/bloat-o-meter /tmp/bitmap.o.{old,new} add/remove: 0/0 grow/shrink: 0/2 up/down: 0/-328 (-328) function old new delta __bitmap_shift_right 384 226 -158 __bitmap_shift_left 306 136 -170 and less importantly also a smaller stack footprint $ stack-o-meter.pl master bitmap file function old new delta lib/bitmap.o __bitmap_shift_right 24 8 -16 lib/bitmap.o __bitmap_shift_left 24 0 -24 For each pair of 0 <= shift <= nbits <= 256 I've tested the end result with a few randomly filled src buffers (including garbage beyond nbits), in each case verifying that the shift {left,right}-most bits of dst are zero and the remaining nbits-shift bits correspond to src, so I'm fairly confident I didn't screw up. That hasn't stopped me from being wrong before, though. This patch (of 7): gcc can generate slightly better code for stuff like "nbits % BITS_PER_LONG" when it knows nbits is not negative. Since negative size bitmaps or shift amounts don't make sense, change these parameters of bitmap_shift_right to unsigned. The expressions involving "lim - 1" are still ok, since if lim is 0 the loop is never executed. Also use "shift" and "nbits" consistently for the parameter names. 
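The verification described above might look roughly like this (hypothetical test harness, not the author's actual test code; assumes shift <= nbits <= 256 and linux/random.h):

static void check_shift_right(unsigned int shift, unsigned int nbits)
{
        DECLARE_BITMAP(src, 256);
        DECLARE_BITMAP(dst, 256);
        unsigned int i;

        get_random_bytes(src, sizeof(src));     /* garbage beyond nbits is fine */
        bitmap_shift_right(dst, src, shift, nbits);

        for (i = 0; i < nbits - shift; i++)     /* surviving bits match src */
                WARN_ON(!!test_bit(i, dst) != !!test_bit(i + shift, src));
        for (i = nbits - shift; i < nbits; i++) /* vacated MS bits are zero */
                WARN_ON(test_bit(i, dst));
}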
Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 12 ++++++------ lib/bitmap.c | 10 +++++----- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index cffc89c23c02..c168a807ab9a 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -96,8 +96,8 @@ extern int __bitmap_equal(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); extern void __bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned int nbits); -extern void __bitmap_shift_right(unsigned long *dst, - const unsigned long *src, int shift, int bits); +extern void __bitmap_shift_right(unsigned long *dst, const unsigned long *src, + unsigned int shift, unsigned int nbits); extern void __bitmap_shift_left(unsigned long *dst, const unsigned long *src, int shift, int bits); extern int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, @@ -313,13 +313,13 @@ static inline int bitmap_weight(const unsigned long *src, unsigned int nbits) return __bitmap_weight(src, nbits); } -static inline void bitmap_shift_right(unsigned long *dst, - const unsigned long *src, int n, int nbits) +static inline void bitmap_shift_right(unsigned long *dst, const unsigned long *src, + unsigned int shift, int nbits) { if (small_const_nbits(nbits)) - *dst = (*src & BITMAP_LAST_WORD_MASK(nbits)) >> n; + *dst = (*src & BITMAP_LAST_WORD_MASK(nbits)) >> shift; else - __bitmap_shift_right(dst, src, n, nbits); + __bitmap_shift_right(dst, src, shift, nbits); } static inline void bitmap_shift_left(unsigned long *dst, diff --git a/lib/bitmap.c b/lib/bitmap.c index d2cd50cd4f5d..45e7d14ebdfd 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -104,17 +104,17 @@ EXPORT_SYMBOL(__bitmap_complement); * @dst : destination bitmap * @src : source bitmap * @shift : shift by this many bits - * @bits : bitmap size, in bits + * @nbits : bitmap size, in bits * * Shifting right (dividing) means moving bits in the MS -> LS bit * direction. Zeros are fed into the vacated MS positions and the * LS bits shifted off the bottom are lost. */ -void __bitmap_shift_right(unsigned long *dst, - const unsigned long *src, int shift, int bits) +void __bitmap_shift_right(unsigned long *dst, const unsigned long *src, + unsigned shift, unsigned nbits) { - int k, lim = BITS_TO_LONGS(bits), left = bits % BITS_PER_LONG; - int off = shift/BITS_PER_LONG, rem = shift % BITS_PER_LONG; + unsigned k, lim = BITS_TO_LONGS(nbits), left = nbits % BITS_PER_LONG; + unsigned off = shift/BITS_PER_LONG, rem = shift % BITS_PER_LONG; unsigned long mask = (1UL << left) - 1; for (k = 0; off + k < lim; ++k) { unsigned long upper, lower; -- cgit v1.2.3 From dba94c2553da1928303c2a6c6410247c88cafc1d Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 13 Feb 2015 14:36:13 -0800 Subject: lib: bitmap: change bitmap_shift_left to take unsigned parameters gcc can generate slightly better code for stuff like "nbits % BITS_PER_LONG" when it knows nbits is not negative. Since negative size bitmaps or shift amounts don't make sense, change these parameters of bitmap_shift_right to unsigned. If off >= lim (which requires shift >= nbits), k is initialized with a large positive value, but since I've let k continue to be signed, the loop will never run and dst will be zeroed as expected. 
Inside the loop, k is guaranteed to be non-negative, so the fact that it is promoted to unsigned in the various expressions it appears in is harmless. Also use "shift" and "nbits" consistently for the parameter names. Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 12 ++++++------ lib/bitmap.c | 11 ++++++----- 2 files changed, 12 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index c168a807ab9a..5e7f75a6d7d0 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -98,8 +98,8 @@ extern void __bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned int nbits); extern void __bitmap_shift_right(unsigned long *dst, const unsigned long *src, unsigned int shift, unsigned int nbits); -extern void __bitmap_shift_left(unsigned long *dst, - const unsigned long *src, int shift, int bits); +extern void __bitmap_shift_left(unsigned long *dst, const unsigned long *src, + unsigned int shift, unsigned int nbits); extern int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); extern void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, @@ -322,13 +322,13 @@ static inline void bitmap_shift_right(unsigned long *dst, const unsigned long *s __bitmap_shift_right(dst, src, shift, nbits); } -static inline void bitmap_shift_left(unsigned long *dst, - const unsigned long *src, int n, int nbits) +static inline void bitmap_shift_left(unsigned long *dst, const unsigned long *src, + unsigned int shift, unsigned int nbits) { if (small_const_nbits(nbits)) - *dst = (*src << n) & BITMAP_LAST_WORD_MASK(nbits); + *dst = (*src << shift) & BITMAP_LAST_WORD_MASK(nbits); else - __bitmap_shift_left(dst, src, n, nbits); + __bitmap_shift_left(dst, src, shift, nbits); } static inline int bitmap_parse(const char *buf, unsigned int buflen, diff --git a/lib/bitmap.c b/lib/bitmap.c index db88512c3451..74bdf3601245 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -148,18 +148,19 @@ EXPORT_SYMBOL(__bitmap_shift_right); * @dst : destination bitmap * @src : source bitmap * @shift : shift by this many bits - * @bits : bitmap size, in bits + * @nbits : bitmap size, in bits * * Shifting left (multiplying) means moving bits in the LS -> MS * direction. Zeros are fed into the vacated LS bit positions * and those MS bits shifted off the top are lost. */ -void __bitmap_shift_left(unsigned long *dst, - const unsigned long *src, int shift, int bits) +void __bitmap_shift_left(unsigned long *dst, const unsigned long *src, + unsigned int shift, unsigned int nbits) { - int k, lim = BITS_TO_LONGS(bits), left = bits % BITS_PER_LONG; - int off = shift/BITS_PER_LONG, rem = shift % BITS_PER_LONG; + int k; + unsigned int lim = BITS_TO_LONGS(nbits), left = nbits % BITS_PER_LONG; + unsigned int off = shift/BITS_PER_LONG, rem = shift % BITS_PER_LONG; for (k = lim - off - 1; k >= 0; --k) { unsigned long upper, lower; -- cgit v1.2.3 From a4bb1e43e22d3cade8f942fc6f95920248eb2fd0 Mon Sep 17 00:00:00 2001 From: Andrzej Hajda Date: Fri, 13 Feb 2015 14:36:24 -0800 Subject: mm/util: add kstrdup_const kstrdup() is often used to duplicate strings where neither the source nor the destination will ever be modified. In such cases we can just reuse the source instead of duplicating it. The problem is that we must be sure that the source is non-modifiable and its life-time is long enough.
I suspect the good candidates for such strings are strings located in the kernel .rodata section; they cannot be modified because the section is read-only and their life-time is equal to the kernel life-time. This small patchset proposes an alternative version of kstrdup - kstrdup_const, which returns the source string if it is located in .rodata and otherwise falls back to kstrdup. To verify if the source is in .rodata, the function checks if the address is between the sentinels __start_rodata, __end_rodata. I guess it should work with all architectures. The main patch is accompanied by four patches constifying kstrdup for cases where the situation described above happens frequently. I have tested the patchset on a mobile platform (exynos4210-trats) and it saves 3272 string allocations. Since the minimal allocation is 32 or 64 bytes depending on Kconfig options, the patchset saves respectively about 100KB or 200KB of memory. Stats from the tested platform show that the main offender is sysfs: By caller: 2260 __kernfs_new_node 631 clk_register+0xc8/0x1b8 318 clk_register+0x34/0x1b8 51 kmem_cache_create 12 alloc_vfsmnt By string (with count >= 5): 883 power 876 subsystem 135 parameters 132 device 61 iommu_group ... This patch (of 5): Add an alternative version of kstrdup which returns a pointer to a constant char array. The function checks if the input string is in a persistent and read-only memory section; if yes it returns the input string, otherwise it falls back to kstrdup. kstrdup_const is accompanied by kfree_const, which performs conditional memory deallocation of the string. Signed-off-by: Andrzej Hajda Cc: Marek Szyprowski Cc: Kyungmin Park Cc: Mike Turquette Cc: Alexander Viro Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Tejun Heo Cc: Greg KH Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/string.h | 3 +++ mm/util.c | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index b9bc9a5d9e21..e40099e585c9 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -112,7 +112,10 @@ extern void * memchr(const void *,int,__kernel_size_t); #endif void *memchr_inv(const void *s, int c, size_t n); +extern void kfree_const(const void *x); + extern char *kstrdup(const char *s, gfp_t gfp); +extern const char *kstrdup_const(const char *s, gfp_t gfp); extern char *kstrndup(const char *s, size_t len, gfp_t gfp); extern void *kmemdup(const void *src, size_t len, gfp_t gfp); diff --git a/mm/util.c b/mm/util.c index f3ef639c4857..3981ae9d1b15 100644 --- a/mm/util.c +++ b/mm/util.c @@ -12,10 +12,30 @@ #include #include +#include #include #include "internal.h" +static inline int is_kernel_rodata(unsigned long addr) +{ + return addr >= (unsigned long)__start_rodata && + addr < (unsigned long)__end_rodata; +} + +/** + * kfree_const - conditionally free memory + * @x: pointer to the memory + * + * Function calls kfree only if @x is not in .rodata section.
+ */ +void kfree_const(const void *x) +{ + if (!is_kernel_rodata((unsigned long)x)) + kfree(x); +} +EXPORT_SYMBOL(kfree_const); + /** * kstrdup - allocate space for and copy an existing string * @s: the string to duplicate @@ -37,6 +57,24 @@ char *kstrdup(const char *s, gfp_t gfp) } EXPORT_SYMBOL(kstrdup); +/** + * kstrdup_const - conditionally duplicate an existing const string + * @s: the string to duplicate + * @gfp: the GFP mask used in the kmalloc() call when allocating memory + * + * Function returns source string if it is in .rodata section otherwise it + * fallbacks to kstrdup. + * Strings allocated by kstrdup_const should be freed by kfree_const. + */ +const char *kstrdup_const(const char *s, gfp_t gfp) +{ + if (is_kernel_rodata((unsigned long)s)) + return s; + + return kstrdup(s, gfp); +} +EXPORT_SYMBOL(kstrdup_const); + /** * kstrndup - allocate space for and copy an existing string * @s: the string to duplicate -- cgit v1.2.3 From dfeb0750b630b72b5d4fb2461bc7179eceb54666 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 13 Feb 2015 14:36:31 -0800 Subject: kernfs: remove KERNFS_STATIC_NAME When a new kernfs node is created, KERNFS_STATIC_NAME is used to avoid making a separate copy of its name. It's currently only used for sysfs attributes whose filenames are required to stay accessible and unchanged. There are rare exceptions where these names are allocated and formatted dynamically but for the vast majority of cases they're consts in the rodata section. Now that kernfs is converted to use kstrdup_const() and kfree_const(), there's little point in keeping KERNFS_STATIC_NAME around. Remove it. Signed-off-by: Tejun Heo Cc: Andrzej Hajda Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/kernfs/dir.c | 20 ++++++++------------ fs/kernfs/file.c | 4 ---- fs/sysfs/file.c | 2 +- include/linux/kernfs.h | 7 ++----- kernel/cgroup.c | 2 +- 5 files changed, 12 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 35e40879860a..6acc9648f986 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -411,8 +411,9 @@ void kernfs_put(struct kernfs_node *kn) if (kernfs_type(kn) == KERNFS_LINK) kernfs_put(kn->symlink.target_kn); - if (!(kn->flags & KERNFS_STATIC_NAME)) - kfree_const(kn->name); + + kfree_const(kn->name); + if (kn->iattr) { if (kn->iattr->ia_secdata) security_release_secctx(kn->iattr->ia_secdata, @@ -506,15 +507,12 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, const char *name, umode_t mode, unsigned flags) { - const char *dup_name = NULL; struct kernfs_node *kn; int ret; - if (!(flags & KERNFS_STATIC_NAME)) { - name = dup_name = kstrdup_const(name, GFP_KERNEL); - if (!name) - return NULL; - } + name = kstrdup_const(name, GFP_KERNEL); + if (!name) + return NULL; kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL); if (!kn) @@ -538,7 +536,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, err_out2: kmem_cache_free(kernfs_node_cache, kn); err_out1: - kfree_const(dup_name); + kfree_const(name); return NULL; } @@ -1285,9 +1283,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, kn->ns = new_ns; if (new_name) { - if (!(kn->flags & KERNFS_STATIC_NAME)) - old_name = kn->name; - kn->flags &= ~KERNFS_STATIC_NAME; + old_name = kn->name; kn->name = new_name; } diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index ddc9f9612f16..b684e8a132e6 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -901,7 +901,6 @@ const struct 
file_operations kernfs_file_fops = { * @ops: kernfs operations for the file * @priv: private data for the file * @ns: optional namespace tag of the file - * @name_is_static: don't copy file name * @key: lockdep key for the file's active_ref, %NULL to disable lockdep * * Returns the created node on success, ERR_PTR() value on error. @@ -911,7 +910,6 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, umode_t mode, loff_t size, const struct kernfs_ops *ops, void *priv, const void *ns, - bool name_is_static, struct lock_class_key *key) { struct kernfs_node *kn; @@ -919,8 +917,6 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, int rc; flags = KERNFS_FILE; - if (name_is_static) - flags |= KERNFS_STATIC_NAME; kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, flags); if (!kn) diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index dfe928a9540f..7c2867b44141 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -295,7 +295,7 @@ int sysfs_add_file_mode_ns(struct kernfs_node *parent, key = attr->key ?: (struct lock_class_key *)&attr->skey; #endif kn = __kernfs_create_file(parent, attr->name, mode & 0777, size, ops, - (void *)attr, ns, true, key); + (void *)attr, ns, key); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(parent, attr->name); diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index d4e01b358341..71ecdab1671b 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -43,7 +43,6 @@ enum kernfs_node_flag { KERNFS_HAS_SEQ_SHOW = 0x0040, KERNFS_HAS_MMAP = 0x0080, KERNFS_LOCKDEP = 0x0100, - KERNFS_STATIC_NAME = 0x0200, KERNFS_SUICIDAL = 0x0400, KERNFS_SUICIDED = 0x0800, }; @@ -291,7 +290,6 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, umode_t mode, loff_t size, const struct kernfs_ops *ops, void *priv, const void *ns, - bool name_is_static, struct lock_class_key *key); struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, const char *name, @@ -369,8 +367,7 @@ kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, static inline struct kernfs_node * __kernfs_create_file(struct kernfs_node *parent, const char *name, umode_t mode, loff_t size, const struct kernfs_ops *ops, - void *priv, const void *ns, bool name_is_static, - struct lock_class_key *key) + void *priv, const void *ns, struct lock_class_key *key) { return ERR_PTR(-ENOSYS); } static inline struct kernfs_node * @@ -439,7 +436,7 @@ kernfs_create_file_ns(struct kernfs_node *parent, const char *name, key = (struct lock_class_key *)&ops->lockdep_key; #endif return __kernfs_create_file(parent, name, mode, size, ops, priv, ns, - false, key); + key); } static inline struct kernfs_node * diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d5f6ec251fb2..29a7b2cc593e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3077,7 +3077,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) #endif kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name), cgroup_file_mode(cft), 0, cft->kf_ops, cft, - NULL, false, key); + NULL, key); if (IS_ERR(kn)) return PTR_ERR(kn); -- cgit v1.2.3 From 513e3d2d11c9f05db1edc70deb18a82555cf9309 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 13 Feb 2015 14:36:50 -0800 Subject: cpumask: always use nr_cpu_ids in formatting and parsing functions bitmap implements two variants of scnprintf functions to format a bitmap into a string and cpumask and nodemask wrap them to provide equivalent interfaces. 
The scnprintf family of functions requires a string buffer as an output target, which complicates code paths that just want to print out the mask through printk for informational or debug purposes, as they have to worry about how large the buffer should be and whether it's too large to allocate on stack. Neither cpumask nor nodemask provides a guideline on how large the target buffer should be, forcing users to come up with their own solutions - some allocate an arbitrarily sized buffer which is small enough to allocate on stack but may be too short in corner cases, others come up with a custom upper limit calculation considering the output format, some allocate the buffer dynamically, while one resorted to using a lock to synchronize access to a static buffer. This is an artificial problem which is being solved repeatedly for no benefit. In a lot of cases, the output area already exists and can be targeted directly, making the intermediate buffer unnecessary. This patchset teaches the printf family of functions how to format bitmaps and replaces the dedicated formatting functions with it. Pointer formatting is extended to cover bitmap formatting. It uses the field width for the number of bits instead of the precision. The format used is '%*pb[l]', with the optional trailing 'l' specifying list format instead of hex masks. For more details, please see 0002. This patch (of 31): Currently, the formatting and parsing functions in cpumask.h use nr_cpumask_bits like other cpumask functions; however, nr_cpumask_bits is either NR_CPUS or nr_cpu_ids depending on CONFIG_CPUMASK_OFFSTACK. This leads to inconsistent behaviors. With CONFIG_NR_CPUS=512 and !CONFIG_CPUMASK_OFFSTACK # cat /sys/devices/virtual/net/lo/queues/rx-0/rps_cpus 00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000 # cat /proc/self/status | grep Cpus_allowed: Cpus_allowed: f With CONFIG_NR_CPUS=1024 and CONFIG_CPUMASK_OFFSTACK (fedora default) # cat /sys/devices/virtual/net/lo/queues/rx-0/rps_cpus 0 # cat /proc/self/status | grep Cpus_allowed: Cpus_allowed: f Note that /proc/self/status always uses nr_cpu_ids regardless of config. This is because the seq cpumask formatting functions always use nr_cpu_ids. Given that the same output fields may switch between the two forms, always converging on nr_cpu_ids isn't too likely to surprise userland. This patch updates the formatting and parsing functions in cpumask.h to always use nr_cpu_ids. There's no point in dealing with CPUs which aren't even possible on the machine. Signed-off-by: Tejun Heo Cc: "David S. Miller" Cc: "James E.J. Bottomley" Cc: "John W. Linville" Cc: "Paul E.
McKenney" Cc: Benjamin Herrenschmidt Cc: Chris Metcalf Cc: Chris Zankel Cc: Christoph Lameter Cc: Dmitry Torokhov Cc: Fenghua Yu Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Li Zefan Cc: Max Filippov Cc: Mike Travis Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Russell King Acked-by: Rusty Russell Cc: Steffen Klassert Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpumask.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index ff9044286d88..ee9acb0ce542 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -550,7 +550,7 @@ static inline void cpumask_copy(struct cpumask *dstp, static inline int cpumask_scnprintf(char *buf, int len, const struct cpumask *srcp) { - return bitmap_scnprintf(buf, len, cpumask_bits(srcp), nr_cpumask_bits); + return bitmap_scnprintf(buf, len, cpumask_bits(srcp), nr_cpu_ids); } /** @@ -564,7 +564,7 @@ static inline int cpumask_scnprintf(char *buf, int len, static inline int cpumask_parse_user(const char __user *buf, int len, struct cpumask *dstp) { - return bitmap_parse_user(buf, len, cpumask_bits(dstp), nr_cpumask_bits); + return bitmap_parse_user(buf, len, cpumask_bits(dstp), nr_cpu_ids); } /** @@ -579,7 +579,7 @@ static inline int cpumask_parselist_user(const char __user *buf, int len, struct cpumask *dstp) { return bitmap_parselist_user(buf, len, cpumask_bits(dstp), - nr_cpumask_bits); + nr_cpu_ids); } /** @@ -595,7 +595,7 @@ static inline int cpulist_scnprintf(char *buf, int len, const struct cpumask *srcp) { return bitmap_scnlistprintf(buf, len, cpumask_bits(srcp), - nr_cpumask_bits); + nr_cpu_ids); } /** @@ -610,7 +610,7 @@ static inline int cpumask_parse(const char *buf, struct cpumask *dstp) char *nl = strchr(buf, '\n'); unsigned int len = nl ? (unsigned int)(nl - buf) : strlen(buf); - return bitmap_parse(buf, len, cpumask_bits(dstp), nr_cpumask_bits); + return bitmap_parse(buf, len, cpumask_bits(dstp), nr_cpu_ids); } /** @@ -622,7 +622,7 @@ static inline int cpumask_parse(const char *buf, struct cpumask *dstp) */ static inline int cpulist_parse(const char *buf, struct cpumask *dstp) { - return bitmap_parselist(buf, cpumask_bits(dstp), nr_cpumask_bits); + return bitmap_parselist(buf, cpumask_bits(dstp), nr_cpu_ids); } /** @@ -817,7 +817,7 @@ static inline ssize_t cpumap_print_to_pagebuf(bool list, char *buf, const struct cpumask *mask) { return bitmap_print_to_pagebuf(list, buf, cpumask_bits(mask), - nr_cpumask_bits); + nr_cpu_ids); } /* -- cgit v1.2.3 From f1bbc032e45106400905ebb47550983af4690b0b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 13 Feb 2015 14:36:57 -0800 Subject: cpumask, nodemask: implement cpumask/nodemask_pr_args() printf family of functions can now format bitmaps using '%*pb[l]' and all cpumask and nodemask formatting will be converted to use it. To ease printing these masks with '%*pb[l]' which require two params - the number of bits and the actual bitmap, this patch implement cpumask_pr_args() and nodemask_pr_args() which can be used to provide arguments for '%*pb[l]' Signed-off-by: Tejun Heo Cc: Rusty Russell Cc: "David S. Miller" Cc: "James E.J. Bottomley" Cc: "John W. Linville" Cc: "Paul E. 
McKenney" Cc: Benjamin Herrenschmidt Cc: Chris Metcalf Cc: Chris Zankel Cc: Christoph Lameter Cc: Dmitry Torokhov Cc: Fenghua Yu Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Li Zefan Cc: Max Filippov Cc: Mike Travis Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Russell King Cc: Steffen Klassert Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpumask.h | 8 ++++++++ include/linux/nodemask.h | 8 ++++++++ 2 files changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index ee9acb0ce542..a9b3d00915a0 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -22,6 +22,14 @@ typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t; */ #define cpumask_bits(maskp) ((maskp)->bits) +/** + * cpumask_pr_args - printf args to output a cpumask + * @maskp: cpumask to be printed + * + * Can be used to provide arguments for '%*pb[l]' when printing a cpumask. + */ +#define cpumask_pr_args(maskp) nr_cpu_ids, cpumask_bits(maskp) + #if NR_CPUS == 1 #define nr_cpu_ids 1 #else diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 21cef483dc1b..10f8e556ba07 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -98,6 +98,14 @@ typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t; extern nodemask_t _unused_nodemask_arg_; +/** + * nodemask_pr_args - printf args to output a nodemask + * @maskp: nodemask to be printed + * + * Can be used to provide arguments for '%*pb[l]' when printing a nodemask. + */ +#define nodemask_pr_args(maskp) MAX_NUMNODES, (maskp)->bits + /* * The inline keyword gives the compiler room to decide to inline, or * not inline a function as it sees best. However, as these functions -- cgit v1.2.3 From 46385326cc1577587ed3e7432c2425cf6d3e4308 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 13 Feb 2015 14:38:15 -0800 Subject: bitmap, cpumask, nodemask: remove dedicated formatting functions Now that all bitmap formatting usages have been converted to '%*pb[l]', the separate formatting functions are unnecessary. The following functions are removed. 
* bitmap_scn[list]printf() * cpumask_scnprintf(), cpulist_scnprintf() * [__]nodemask_scnprintf(), [__]nodelist_scnprintf() * seq_bitmap[_list](), seq_cpumask[_list](), seq_nodemask[_list]() * seq_buf_bitmask() Signed-off-by: Tejun Heo Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/seq_file.c | 32 -------------------------------- include/linux/bitmap.h | 7 ------- include/linux/cpumask.h | 31 ------------------------------- include/linux/nodemask.h | 33 +++++++-------------------------- include/linux/seq_buf.h | 3 --- include/linux/seq_file.h | 25 ------------------------- lib/bitmap.c | 41 ----------------------------------------- lib/seq_buf.c | 36 ------------------------------------ 8 files changed, 7 insertions(+), 201 deletions(-) (limited to 'include/linux') diff --git a/fs/seq_file.c b/fs/seq_file.c index dbf3a59c86bb..555f82155be8 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -539,38 +539,6 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, const char *esc) return res; } -int seq_bitmap(struct seq_file *m, const unsigned long *bits, - unsigned int nr_bits) -{ - if (m->count < m->size) { - int len = bitmap_scnprintf(m->buf + m->count, - m->size - m->count, bits, nr_bits); - if (m->count + len < m->size) { - m->count += len; - return 0; - } - } - seq_set_overflow(m); - return -1; -} -EXPORT_SYMBOL(seq_bitmap); - -int seq_bitmap_list(struct seq_file *m, const unsigned long *bits, - unsigned int nr_bits) -{ - if (m->count < m->size) { - int len = bitmap_scnlistprintf(m->buf + m->count, - m->size - m->count, bits, nr_bits); - if (m->count + len < m->size) { - m->count += len; - return 0; - } - } - seq_set_overflow(m); - return -1; -} -EXPORT_SYMBOL(seq_bitmap_list); - static void *single_start(struct seq_file *p, loff_t *pos) { return NULL + (*pos == 0); diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 5e7f75a6d7d0..dbfbf4990005 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -52,16 +52,13 @@ * bitmap_bitremap(oldbit, old, new, nbits) newbit = map(old, new)(oldbit) * bitmap_onto(dst, orig, relmap, nbits) *dst = orig relative to relmap * bitmap_fold(dst, orig, sz, nbits) dst bits = orig bits mod sz - * bitmap_scnprintf(buf, len, src, nbits) Print bitmap src to buf * bitmap_parse(buf, buflen, dst, nbits) Parse bitmap dst from kernel buf * bitmap_parse_user(ubuf, ulen, dst, nbits) Parse bitmap dst from user buf - * bitmap_scnlistprintf(buf, len, src, nbits) Print bitmap src as list to buf * bitmap_parselist(buf, dst, nbits) Parse bitmap dst from kernel buf * bitmap_parselist_user(buf, dst, nbits) Parse bitmap dst from user buf * bitmap_find_free_region(bitmap, bits, order) Find and allocate bit region * bitmap_release_region(bitmap, pos, order) Free specified bit region * bitmap_allocate_region(bitmap, pos, order) Allocate specified bit region - * bitmap_print_to_pagebuf(list, buf, mask, nbits) Print bitmap src as list/hex */ /* @@ -147,14 +144,10 @@ bitmap_find_next_zero_area(unsigned long *map, align_mask, 0); } -extern int bitmap_scnprintf(char *buf, unsigned int len, - const unsigned long *src, int nbits); extern int __bitmap_parse(const char *buf, unsigned int buflen, int is_user, unsigned long *dst, int nbits); extern int bitmap_parse_user(const char __user *ubuf, unsigned int ulen, unsigned long *dst, int nbits); -extern int bitmap_scnlistprintf(char *buf, unsigned int len, - const unsigned long *src, int nbits); extern int bitmap_parselist(const char *buf, unsigned long *maskp, int nmaskbits); 
extern int bitmap_parselist_user(const char __user *ubuf, unsigned int ulen, diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index a9b3d00915a0..086549a665e2 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -546,21 +546,6 @@ static inline void cpumask_copy(struct cpumask *dstp, */ #define cpumask_of(cpu) (get_cpu_mask(cpu)) -/** - * cpumask_scnprintf - print a cpumask into a string as comma-separated hex - * @buf: the buffer to sprintf into - * @len: the length of the buffer - * @srcp: the cpumask to print - * - * If len is zero, returns zero. Otherwise returns the length of the - * (nul-terminated) @buf string. - */ -static inline int cpumask_scnprintf(char *buf, int len, - const struct cpumask *srcp) -{ - return bitmap_scnprintf(buf, len, cpumask_bits(srcp), nr_cpu_ids); -} - /** * cpumask_parse_user - extract a cpumask from a user string * @buf: the buffer to extract from @@ -590,22 +575,6 @@ static inline int cpumask_parselist_user(const char __user *buf, int len, nr_cpu_ids); } -/** - * cpulist_scnprintf - print a cpumask into a string as comma-separated list - * @buf: the buffer to sprintf into - * @len: the length of the buffer - * @srcp: the cpumask to print - * - * If len is zero, returns zero. Otherwise returns the length of the - * (nul-terminated) @buf string. - */ -static inline int cpulist_scnprintf(char *buf, int len, - const struct cpumask *srcp) -{ - return bitmap_scnlistprintf(buf, len, cpumask_bits(srcp), - nr_cpu_ids); -} - /** * cpumask_parse - extract a cpumask from from a string * @buf: the buffer to extract from diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 10f8e556ba07..6e85889cf9ab 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -8,14 +8,13 @@ * See detailed comments in the file linux/bitmap.h describing the * data type on which these nodemasks are based. * - * For details of nodemask_scnprintf() and nodemask_parse_user(), - * see bitmap_scnprintf() and bitmap_parse_user() in lib/bitmap.c. - * For details of nodelist_scnprintf() and nodelist_parse(), see - * bitmap_scnlistprintf() and bitmap_parselist(), also in bitmap.c. - * For details of node_remap(), see bitmap_bitremap in lib/bitmap.c. - * For details of nodes_remap(), see bitmap_remap in lib/bitmap.c. - * For details of nodes_onto(), see bitmap_onto in lib/bitmap.c. - * For details of nodes_fold(), see bitmap_fold in lib/bitmap.c. + * For details of nodemask_parse_user(), see bitmap_parse_user() in + * lib/bitmap.c. For details of nodelist_parse(), see bitmap_parselist(), + * also in bitmap.c. For details of node_remap(), see bitmap_bitremap in + * lib/bitmap.c. For details of nodes_remap(), see bitmap_remap in + * lib/bitmap.c. For details of nodes_onto(), see bitmap_onto in + * lib/bitmap.c. For details of nodes_fold(), see bitmap_fold in + * lib/bitmap.c. 
* * The available nodemask operations are: * @@ -52,9 +51,7 @@ * NODE_MASK_NONE Initializer - no bits set * unsigned long *nodes_addr(mask) Array of unsigned long's in mask * - * int nodemask_scnprintf(buf, len, mask) Format nodemask for printing * int nodemask_parse_user(ubuf, ulen, mask) Parse ascii string as nodemask - * int nodelist_scnprintf(buf, len, mask) Format nodemask as list for printing * int nodelist_parse(buf, map) Parse ascii string as nodelist * int node_remap(oldbit, old, new) newbit = map(old, new)(oldbit) * void nodes_remap(dst, src, old, new) *dst = map(old, new)(src) @@ -312,14 +309,6 @@ static inline int __first_unset_node(const nodemask_t *maskp) #define nodes_addr(src) ((src).bits) -#define nodemask_scnprintf(buf, len, src) \ - __nodemask_scnprintf((buf), (len), &(src), MAX_NUMNODES) -static inline int __nodemask_scnprintf(char *buf, int len, - const nodemask_t *srcp, int nbits) -{ - return bitmap_scnprintf(buf, len, srcp->bits, nbits); -} - #define nodemask_parse_user(ubuf, ulen, dst) \ __nodemask_parse_user((ubuf), (ulen), &(dst), MAX_NUMNODES) static inline int __nodemask_parse_user(const char __user *buf, int len, @@ -328,14 +317,6 @@ static inline int __nodemask_parse_user(const char __user *buf, int len, return bitmap_parse_user(buf, len, dstp->bits, nbits); } -#define nodelist_scnprintf(buf, len, src) \ - __nodelist_scnprintf((buf), (len), &(src), MAX_NUMNODES) -static inline int __nodelist_scnprintf(char *buf, int len, - const nodemask_t *srcp, int nbits) -{ - return bitmap_scnlistprintf(buf, len, srcp->bits, nbits); -} - #define nodelist_parse(buf, dst) __nodelist_parse((buf), &(dst), MAX_NUMNODES) static inline int __nodelist_parse(const char *buf, nodemask_t *dstp, int nbits) { diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h index 9aafe0e24c68..fb7eb9ccb1cd 100644 --- a/include/linux/seq_buf.h +++ b/include/linux/seq_buf.h @@ -125,9 +125,6 @@ extern int seq_buf_putmem_hex(struct seq_buf *s, const void *mem, unsigned int len); extern int seq_buf_path(struct seq_buf *s, const struct path *path, const char *esc); -extern int seq_buf_bitmask(struct seq_buf *s, const unsigned long *maskp, - int nmaskbits); - #ifdef CONFIG_BINARY_PRINTF extern int seq_buf_bprintf(struct seq_buf *s, const char *fmt, const u32 *binary); diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index cf6a9daaaf6d..afbb1fd77c77 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -126,31 +126,6 @@ int seq_path(struct seq_file *, const struct path *, const char *); int seq_dentry(struct seq_file *, struct dentry *, const char *); int seq_path_root(struct seq_file *m, const struct path *path, const struct path *root, const char *esc); -int seq_bitmap(struct seq_file *m, const unsigned long *bits, - unsigned int nr_bits); -static inline int seq_cpumask(struct seq_file *m, const struct cpumask *mask) -{ - return seq_bitmap(m, cpumask_bits(mask), nr_cpu_ids); -} - -static inline int seq_nodemask(struct seq_file *m, nodemask_t *mask) -{ - return seq_bitmap(m, mask->bits, MAX_NUMNODES); -} - -int seq_bitmap_list(struct seq_file *m, const unsigned long *bits, - unsigned int nr_bits); - -static inline int seq_cpumask_list(struct seq_file *m, - const struct cpumask *mask) -{ - return seq_bitmap_list(m, cpumask_bits(mask), nr_cpu_ids); -} - -static inline int seq_nodemask_list(struct seq_file *m, nodemask_t *mask) -{ - return seq_bitmap_list(m, mask->bits, MAX_NUMNODES); -} int single_open(struct file *, int (*)(struct seq_file *, void *), void *); int 
single_open_size(struct file *, int (*)(struct seq_file *, void *), void *, size_t); diff --git a/lib/bitmap.c b/lib/bitmap.c index 088adbdcbad9..d456f4c15a9f 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -369,24 +369,6 @@ EXPORT_SYMBOL(bitmap_find_next_zero_area_off); #define nbits_to_hold_value(val) fls(val) #define BASEDEC 10 /* fancier cpuset lists input in decimal */ -/** - * bitmap_scnprintf - convert bitmap to an ASCII hex string. - * @buf: byte buffer into which string is placed - * @buflen: reserved size of @buf, in bytes - * @maskp: pointer to bitmap to convert - * @nmaskbits: size of bitmap, in bits - * - * Exactly @nmaskbits bits are displayed. Hex digits are grouped into - * comma-separated sets of eight digits per set. Returns the number of - * characters which were written to *buf, excluding the trailing \0. - */ -int bitmap_scnprintf(char *buf, unsigned int buflen, - const unsigned long *maskp, int nmaskbits) -{ - return scnprintf(buf, buflen, "%*pb", nmaskbits, maskp); -} -EXPORT_SYMBOL(bitmap_scnprintf); - /** * __bitmap_parse - convert an ASCII hex string into a bitmap. * @buf: pointer to buffer containing string. @@ -500,29 +482,6 @@ int bitmap_parse_user(const char __user *ubuf, } EXPORT_SYMBOL(bitmap_parse_user); -/** - * bitmap_scnlistprintf - convert bitmap to list format ASCII string - * @buf: byte buffer into which string is placed - * @buflen: reserved size of @buf, in bytes - * @maskp: pointer to bitmap to convert - * @nmaskbits: size of bitmap, in bits - * - * Output format is a comma-separated list of decimal numbers and - * ranges. Consecutively set bits are shown as two hyphen-separated - * decimal numbers, the smallest and largest bit numbers set in - * the range. Output format is compatible with the format - * accepted as input by bitmap_parselist(). - * - * The return value is the number of characters which were written to *buf - * excluding the trailing '\0', as per ISO C99's scnprintf. - */ -int bitmap_scnlistprintf(char *buf, unsigned int buflen, - const unsigned long *maskp, int nmaskbits) -{ - return scnprintf(buf, buflen, "%*pbl", nmaskbits, maskp); -} -EXPORT_SYMBOL(bitmap_scnlistprintf); - /** * bitmap_print_to_pagebuf - convert bitmap to list or hex format ASCII string * @list: indicates whether the bitmap must be list diff --git a/lib/seq_buf.c b/lib/seq_buf.c index 4eedfedb9e31..88c0854bd752 100644 --- a/lib/seq_buf.c +++ b/lib/seq_buf.c @@ -91,42 +91,6 @@ int seq_buf_printf(struct seq_buf *s, const char *fmt, ...) return ret; } -/** - * seq_buf_bitmask - write a bitmask array in its ASCII representation - * @s: seq_buf descriptor - * @maskp: points to an array of unsigned longs that represent a bitmask - * @nmaskbits: The number of bits that are valid in @maskp - * - * Writes a ASCII representation of a bitmask string into @s. - * - * Returns zero on success, -1 on overflow. - */ -int seq_buf_bitmask(struct seq_buf *s, const unsigned long *maskp, - int nmaskbits) -{ - unsigned int len = seq_buf_buffer_left(s); - int ret; - - WARN_ON(s->size == 0); - - /* - * Note, because bitmap_scnprintf() only returns the number of bytes - * written and not the number that would be written, we use the last - * byte of the buffer to let us know if we overflowed. There's a small - * chance that the bitmap could have fit exactly inside the buffer, but - * it's not that critical if that does happen. 
- */ - if (len > 1) { - ret = bitmap_scnprintf(s->buffer + s->len, len, maskp, nmaskbits); - if (ret < len) { - s->len += ret; - return 0; - } - } - seq_buf_set_overflow(s); - return -1; -} - #ifdef CONFIG_BINARY_PRINTF /** * seq_buf_bprintf - Write the printf string from binary arguments -- cgit v1.2.3

From cb4188ac8e5779f66b9f55888ac2c75b391cde44 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 13 Feb 2015 14:39:14 -0800 Subject: compiler: introduce __alias(symbol) shortcut

To be consistent with other compiler attributes, introduce an __alias(symbol) macro expanding into __attribute__((alias(#symbol))). Signed-off-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Konstantin Serebryany Cc: Dmitry Chernenkov Signed-off-by: Andrey Konovalov Cc: Yuri Gribov Cc: Konstantin Khlebnikov Cc: Sasha Levin Cc: Christoph Lameter Cc: Joonsoo Kim Cc: Dave Hansen Cc: Andi Kleen Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compiler-gcc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 02ae99e8e6d3..cdf13ca7cac3 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -66,6 +66,7 @@ #define __deprecated __attribute__((deprecated)) #define __packed __attribute__((packed)) #define __weak __attribute__((weak)) +#define __alias(symbol) __attribute__((alias(#symbol))) /* * it doesn't make sense on ARM (currently the only user of __naked) to trace -- cgit v1.2.3

From 0b24becc810dc3be6e3f94103a866f214c282394 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 13 Feb 2015 14:39:17 -0800 Subject: kasan: add kernel address sanitizer infrastructure

Kernel Address sanitizer (KASan) is a dynamic memory error detector. It provides a fast and comprehensive solution for finding use-after-free and out-of-bounds bugs. KASAN uses compile-time instrumentation for checking every memory access, therefore a GCC version newer than v4.9.2 is required. v4.9.2 almost works, but has issues with putting symbol aliases into the wrong section, which breaks kasan instrumentation of globals. This patch only adds the infrastructure for the kernel address sanitizer. It's not available for use yet. The idea and some code were borrowed from [1].

Basic idea: The main idea of KASAN is to use shadow memory to record whether each byte of memory is safe to access or not, and to use the compiler's instrumentation to check the shadow memory on each memory access. Address sanitizer uses 1/8 of the memory addressable in the kernel for shadow memory and uses direct mapping with a scale and offset to translate a memory address to its corresponding shadow address. Here is the function that translates an address to its corresponding shadow address: unsigned long kasan_mem_to_shadow(unsigned long addr) { return (addr >> KASAN_SHADOW_SCALE_SHIFT) + KASAN_SHADOW_OFFSET; } where KASAN_SHADOW_SCALE_SHIFT = 3. So for every 8 bytes there is one corresponding byte of shadow memory. The following encoding is used for each shadow byte: 0 means that all 8 bytes of the corresponding memory region are valid for access; k (1 <= k <= 7) means that the first k bytes are valid for access, and the other (8 - k) bytes are not; any negative value indicates that the entire 8-byte region is inaccessible. Different negative values are used to distinguish between different kinds of inaccessible memory (redzones, freed memory) (see mm/kasan/kasan.h).
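To make the encoding concrete, the check the runtime performs for a 1-byte access looks roughly like the following sketch (it mirrors memory_is_poisoned_1() added by this patch; the helper name here is hypothetical):

    /* Sketch: is a 1-byte access at 'addr' poisoned? */
    static bool kasan_byte_poisoned(unsigned long addr)
    {
        s8 shadow = *(s8 *)kasan_mem_to_shadow((void *)addr);

        /* 0: whole 8-byte granule accessible; 1..7: only the first
         * 'shadow' bytes accessible; negative: entire granule poisoned */
        return shadow && (s8)(addr & KASAN_SHADOW_MASK) >= shadow;
    }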
To be able to detect accesses to bad memory we need a special compiler. Such a compiler inserts specific function calls (__asan_load*(addr), __asan_store*(addr)) before each memory access of size 1, 2, 4, 8 or 16. These functions check whether the memory region is valid to access by checking the corresponding shadow memory. If an access is not valid, an error is printed.

Historical background of the address sanitizer from Dmitry Vyukov: "We've developed the set of tools, AddressSanitizer (Asan), ThreadSanitizer and MemorySanitizer, for user space. We actively use them for testing inside of Google (continuous testing, fuzzing, running prod services). To date the tools have found more than 10'000 scary bugs in Chromium, Google internal codebase and various open-source projects (Firefox, OpenSSL, gcc, clang, ffmpeg, MySQL and lots of others): [2] [3] [4]. The tools are part of both gcc and clang compilers. We have not yet done massive testing under the Kernel AddressSanitizer (it's kind of a chicken and egg problem, you need it to be upstream to start applying it extensively). To date it has found about 50 bugs. Bugs that we've found in the upstream kernel are listed in [5]. We've also found ~20 bugs in our internal version of the kernel. Also people from Samsung and Oracle have found some. [...] As others noted, the main feature of AddressSanitizer is its performance due to inline compiler instrumentation and simple linear shadow memory. User-space Asan has ~2x slowdown on computational programs and ~2x memory consumption increase. Taking into account that the kernel usually consumes only a small fraction of CPU and memory when running real user-space programs, I would expect that kernel Asan will have ~10-30% slowdown and similar memory consumption increase (when we finish all tuning). I agree that Asan can well replace kmemcheck. We have plans to start working on Kernel MemorySanitizer that finds uses of uninitialized memory. Asan+Msan will provide feature-parity with kmemcheck. As others noted, Asan will unlikely replace debug slab and pagealloc that can be enabled at runtime. Asan uses compiler instrumentation, so even if it is disabled, it still incurs visible overheads. Asan technology is easily portable to other architectures. Compiler instrumentation is fully portable. The runtime has some arch-dependent parts like shadow mapping and atomic operation interception. They are relatively easy to port."

Comparison with other debugging features: ========================================

KMEMCHECK: - KASan can do almost everything that kmemcheck can. KASan uses compile-time instrumentation, which makes it significantly faster than kmemcheck. The only advantage of kmemcheck over KASan is detection of uninitialized memory reads. Some brief performance testing showed that kasan could be x500-x600 times faster than kmemcheck:

 $ netperf -l 30
 MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to localhost (127.0.0.1) port 0 AF_INET

                 Recv    Send    Send
                 Socket  Socket  Message  Elapsed
                 Size    Size    Size     Time     Throughput
                 bytes   bytes   bytes    secs.    10^6bits/sec

 no debug:       87380   16384   16384    30.00    41624.72
 kasan inline:   87380   16384   16384    30.00    12870.54
 kasan outline:  87380   16384   16384    30.00    10586.39
 kmemcheck:      87380   16384   16384    30.03       20.23

- Also kmemcheck couldn't work on several CPUs. It always sets the number of CPUs to 1. KASan doesn't have such a limitation.

DEBUG_PAGEALLOC: - KASan is slower than DEBUG_PAGEALLOC, but KASan works at sub-page granularity, so it is able to find more bugs.
SLUB_DEBUG (poisoning, redzones): - SLUB_DEBUG has lower overhead than KASan. - SLUB_DEBUG is in most cases not able to detect bad reads; KASan is able to detect both reads and writes. - In some cases (e.g. a redzone overwrite) SLUB_DEBUG detects bugs only on allocation/freeing of an object. KASan catches a bug right at the moment it happens, so we always know the exact place of the first bad read/write.

[1] https://code.google.com/p/address-sanitizer/wiki/AddressSanitizerForKernel [2] https://code.google.com/p/address-sanitizer/wiki/FoundBugs [3] https://code.google.com/p/thread-sanitizer/wiki/FoundBugs [4] https://code.google.com/p/memory-sanitizer/wiki/FoundBugs [5] https://code.google.com/p/address-sanitizer/wiki/AddressSanitizerForKernel#Trophies

Based on work by Andrey Konovalov. Signed-off-by: Andrey Ryabinin Acked-by: Michal Marek Signed-off-by: Andrey Konovalov Cc: Dmitry Vyukov Cc: Konstantin Serebryany Cc: Dmitry Chernenkov Cc: Yuri Gribov Cc: Konstantin Khlebnikov Cc: Sasha Levin Cc: Christoph Lameter Cc: Joonsoo Kim Cc: Dave Hansen Cc: Andi Kleen Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kasan.txt | 170 +++++++++++++++++++ Makefile | 3 +- drivers/firmware/efi/libstub/Makefile | 1 + include/linux/kasan.h | 46 ++++++ include/linux/sched.h | 3 + lib/Kconfig.debug | 2 + lib/Kconfig.kasan | 43 +++++ mm/Makefile | 1 + mm/kasan/Makefile | 8 + mm/kasan/kasan.c | 302 ++++++++++++++++++++++++++++++++++ mm/kasan/kasan.h | 34 ++++ mm/kasan/report.c | 209 +++++++++++++++++++++++ scripts/Makefile.kasan | 24 +++ scripts/Makefile.lib | 10 ++ 14 files changed, 855 insertions(+), 1 deletion(-) create mode 100644 Documentation/kasan.txt create mode 100644 include/linux/kasan.h create mode 100644 lib/Kconfig.kasan create mode 100644 mm/kasan/Makefile create mode 100644 mm/kasan/kasan.c create mode 100644 mm/kasan/kasan.h create mode 100644 mm/kasan/report.c create mode 100644 scripts/Makefile.kasan (limited to 'include/linux') diff --git a/Documentation/kasan.txt b/Documentation/kasan.txt new file mode 100644 index 000000000000..f0645a8a992f --- /dev/null +++ b/Documentation/kasan.txt @@ -0,0 +1,170 @@ +Kernel address sanitizer +================ + +0. Overview +=========== + +Kernel Address sanitizer (KASan) is a dynamic memory error detector. It provides +a fast and comprehensive solution for finding use-after-free and out-of-bounds +bugs. + +KASan uses compile-time instrumentation for checking every memory access, +therefore you will need GCC version 4.9.2 or newer. + +Currently KASan is supported only for the x86_64 architecture and requires that the +kernel be built with the SLUB allocator. + +1. Usage +========= + +To enable KASAN, configure the kernel with: + + CONFIG_KASAN = y + +and choose between CONFIG_KASAN_OUTLINE and CONFIG_KASAN_INLINE. Outline and inline +are the two compiler instrumentation types. The former produces a smaller binary, while the +latter is 1.1 - 2 times faster. Inline instrumentation requires GCC 5.0 or +later. + +Currently KASAN works only with the SLUB memory allocator. +For better bug detection and nicer reports, enable CONFIG_STACKTRACE and put +at least 'slub_debug=U' in the boot cmdline. + +To disable instrumentation for specific files or directories, add a line +similar to the following to the respective kernel Makefile: + + For a single file (e.g. 
main.o): + KASAN_SANITIZE_main.o := n + + For all files in one directory: + KASAN_SANITIZE := n + +1.1 Error reports +========== + +A typical out of bounds access report looks like this: + +================================================================== +BUG: AddressSanitizer: out of bounds access in kmalloc_oob_right+0x65/0x75 [test_kasan] at addr ffff8800693bc5d3 +Write of size 1 by task modprobe/1689 +============================================================================= +BUG kmalloc-128 (Not tainted): kasan error +----------------------------------------------------------------------------- + +Disabling lock debugging due to kernel taint +INFO: Allocated in kmalloc_oob_right+0x3d/0x75 [test_kasan] age=0 cpu=0 pid=1689 + __slab_alloc+0x4b4/0x4f0 + kmem_cache_alloc_trace+0x10b/0x190 + kmalloc_oob_right+0x3d/0x75 [test_kasan] + init_module+0x9/0x47 [test_kasan] + do_one_initcall+0x99/0x200 + load_module+0x2cb3/0x3b20 + SyS_finit_module+0x76/0x80 + system_call_fastpath+0x12/0x17 +INFO: Slab 0xffffea0001a4ef00 objects=17 used=7 fp=0xffff8800693bd728 flags=0x100000000004080 +INFO: Object 0xffff8800693bc558 @offset=1368 fp=0xffff8800693bc720 + +Bytes b4 ffff8800693bc548: 00 00 00 00 00 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a ........ZZZZZZZZ +Object ffff8800693bc558: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkkkkkkkkkkkkk +Object ffff8800693bc568: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkkkkkkkkkkkkk +Object ffff8800693bc578: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkkkkkkkkkkkkk +Object ffff8800693bc588: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkkkkkkkkkkkkk +Object ffff8800693bc598: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkkkkkkkkkkkkk +Object ffff8800693bc5a8: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkkkkkkkkkkkkk +Object ffff8800693bc5b8: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkkkkkkkkkkkkk +Object ffff8800693bc5c8: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b a5 kkkkkkkkkkkkkkk. +Redzone ffff8800693bc5d8: cc cc cc cc cc cc cc cc ........ +Padding ffff8800693bc718: 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZ +CPU: 0 PID: 1689 Comm: modprobe Tainted: G B 3.18.0-rc1-mm1+ #98 +Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014 + ffff8800693bc000 0000000000000000 ffff8800693bc558 ffff88006923bb78 + ffffffff81cc68ae 00000000000000f3 ffff88006d407600 ffff88006923bba8 + ffffffff811fd848 ffff88006d407600 ffffea0001a4ef00 ffff8800693bc558 +Call Trace: + [] dump_stack+0x46/0x58 + [] print_trailer+0xf8/0x160 + [] ? kmem_cache_oob+0xc3/0xc3 [test_kasan] + [] object_err+0x35/0x40 + [] ? kmalloc_oob_right+0x65/0x75 [test_kasan] + [] kasan_report_error+0x38a/0x3f0 + [] ? kasan_poison_shadow+0x2f/0x40 + [] ? kasan_unpoison_shadow+0x14/0x40 + [] ? kasan_poison_shadow+0x2f/0x40 + [] ? kmem_cache_oob+0xc3/0xc3 [test_kasan] + [] __asan_store1+0x75/0xb0 + [] ? kmem_cache_oob+0x1d/0xc3 [test_kasan] + [] ? kmalloc_oob_right+0x65/0x75 [test_kasan] + [] kmalloc_oob_right+0x65/0x75 [test_kasan] + [] init_module+0x9/0x47 [test_kasan] + [] do_one_initcall+0x99/0x200 + [] ? __vunmap+0xec/0x160 + [] load_module+0x2cb3/0x3b20 + [] ? 
m_show+0x240/0x240 + [] SyS_finit_module+0x76/0x80 + [] system_call_fastpath+0x12/0x17 +Memory state around the buggy address: + ffff8800693bc300: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc + ffff8800693bc380: fc fc 00 00 00 00 00 00 00 00 00 00 00 00 00 fc + ffff8800693bc400: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc + ffff8800693bc480: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc + ffff8800693bc500: fc fc fc fc fc fc fc fc fc fc fc 00 00 00 00 00 +>ffff8800693bc580: 00 00 00 00 00 00 00 00 00 00 03 fc fc fc fc fc + ^ + ffff8800693bc600: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc + ffff8800693bc680: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc + ffff8800693bc700: fc fc fc fc fb fb fb fb fb fb fb fb fb fb fb fb + ffff8800693bc780: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb + ffff8800693bc800: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb +================================================================== + +The first sections describe the slub object where the bad access happened. +See the 'SLUB Debug output' section in Documentation/vm/slub.txt for details. + +In the last section the report shows the memory state around the accessed address. +Reading this part requires some more understanding of how KASAN works. + +Each 8 bytes of memory are encoded in one shadow byte as accessible, +partially accessible, freed or they can be part of a redzone. +We use the following encoding for each shadow byte: 0 means that all 8 bytes +of the corresponding memory region are accessible; number N (1 <= N <= 7) means +that the first N bytes are accessible, and the other (8 - N) bytes are not; +any negative value indicates that the entire 8-byte word is inaccessible. +We use different negative values to distinguish between different kinds of +inaccessible memory like redzones or freed memory (see mm/kasan/kasan.h). + +In the report above the arrows point to the shadow byte 03, which means that +the accessed address is partially accessible. + + +2. Implementation details +======================== + +From a high level, our approach to memory error detection is similar to that +of kmemcheck: use shadow memory to record whether each byte of memory is safe +to access, and use compile-time instrumentation to check shadow memory on each +memory access. + +AddressSanitizer dedicates 1/8 of kernel memory to its shadow memory +(e.g. 16TB to cover 128TB on x86_64) and uses direct mapping with a scale and +offset to translate a memory address to its corresponding shadow address. + +Here is the function which translates an address to its corresponding shadow +address: + +static inline void *kasan_mem_to_shadow(const void *addr) +{ + return (void *)((unsigned long)addr >> KASAN_SHADOW_SCALE_SHIFT) + + KASAN_SHADOW_OFFSET; +} + +where KASAN_SHADOW_SCALE_SHIFT = 3. + +Compile-time instrumentation is used for checking memory accesses. The compiler inserts +function calls (__asan_load*(addr), __asan_store*(addr)) before each memory +access of size 1, 2, 4, 8 or 16. These functions check whether a memory access is +valid by checking the corresponding shadow memory. + +GCC 5.0 can perform inline instrumentation. Instead of making +function calls, GCC directly inserts the code to check the shadow memory. +This option significantly enlarges the kernel but gives an x1.1-x2 performance +boost over an outline-instrumented kernel.
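As a rough sketch of the difference between the two modes (illustrative pseudo-code, not the exact instruction sequence GCC emits), a 4-byte store to 'addr' becomes approximately:

    /* outline (CONFIG_KASAN_OUTLINE): a call into the kasan runtime */
    __asan_store4(addr);
    *(u32 *)addr = val;

    /* inline (CONFIG_KASAN_INLINE): the shadow check is open-coded */
    s8 shadow = *(s8 *)kasan_mem_to_shadow((void *)addr);
    if (unlikely(shadow && (s8)((addr & 7) + 3) >= shadow))
        __asan_report_store4_noabort(addr);
    *(u32 *)addr = val;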
diff --git a/Makefile b/Makefile index 5fa2e3035509..33cb15efd257 100644 --- a/Makefile +++ b/Makefile @@ -423,7 +423,7 @@ export MAKE AWK GENKSYMS INSTALLKERNEL PERL PYTHON UTS_MACHINE export HOSTCXX HOSTCXXFLAGS LDFLAGS_MODULE CHECK CHECKFLAGS export KBUILD_CPPFLAGS NOSTDINC_FLAGS LINUXINCLUDE OBJCOPYFLAGS LDFLAGS -export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE CFLAGS_GCOV +export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE CFLAGS_GCOV CFLAGS_KASAN export KBUILD_AFLAGS AFLAGS_KERNEL AFLAGS_MODULE export KBUILD_AFLAGS_MODULE KBUILD_CFLAGS_MODULE KBUILD_LDFLAGS_MODULE export KBUILD_AFLAGS_KERNEL KBUILD_CFLAGS_KERNEL @@ -781,6 +781,7 @@ ifeq ($(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-goto.sh $(CC)), y) KBUILD_CFLAGS += -DCC_HAVE_ASM_GOTO endif +include $(srctree)/scripts/Makefile.kasan include $(srctree)/scripts/Makefile.extrawarn # Add user supplied CPPFLAGS, AFLAGS and CFLAGS as the last assignments diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index 8902f52e0998..280bc0a63365 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -19,6 +19,7 @@ KBUILD_CFLAGS := $(cflags-y) \ $(call cc-option,-fno-stack-protector) GCOV_PROFILE := n +KASAN_SANITIZE := n lib-y := efi-stub-helper.o lib-$(CONFIG_EFI_ARMSTUB) += arm-stub.o fdt.o diff --git a/include/linux/kasan.h b/include/linux/kasan.h new file mode 100644 index 000000000000..9102fda60def --- /dev/null +++ b/include/linux/kasan.h @@ -0,0 +1,46 @@ +#ifndef _LINUX_KASAN_H +#define _LINUX_KASAN_H + +#include + +struct kmem_cache; +struct page; + +#ifdef CONFIG_KASAN + +#define KASAN_SHADOW_SCALE_SHIFT 3 +#define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL) + +#include +#include + +static inline void *kasan_mem_to_shadow(const void *addr) +{ + return (void *)((unsigned long)addr >> KASAN_SHADOW_SCALE_SHIFT) + + KASAN_SHADOW_OFFSET; +} + +/* Enable reporting bugs after kasan_disable_current() */ +static inline void kasan_enable_current(void) +{ + current->kasan_depth++; +} + +/* Disable reporting bugs for current task */ +static inline void kasan_disable_current(void) +{ + current->kasan_depth--; +} + +void kasan_unpoison_shadow(const void *address, size_t size); + +#else /* CONFIG_KASAN */ + +static inline void kasan_unpoison_shadow(const void *address, size_t size) {} + +static inline void kasan_enable_current(void) {} +static inline void kasan_disable_current(void) {} + +#endif /* CONFIG_KASAN */ + +#endif /* LINUX_KASAN_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 048b91b983ed..41c60e5302d7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1664,6 +1664,9 @@ struct task_struct { unsigned long timer_slack_ns; unsigned long default_timer_slack_ns; +#ifdef CONFIG_KASAN + unsigned int kasan_depth; +#endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* Index of current stored address in ret_stack */ int curr_ret_stack; diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 79a9bb67aeaf..ecb3516f6546 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -651,6 +651,8 @@ config DEBUG_STACKOVERFLOW source "lib/Kconfig.kmemcheck" +source "lib/Kconfig.kasan" + endmenu # "Memory Debugging" config DEBUG_SHIRQ diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan new file mode 100644 index 000000000000..e5b3fbe5560f --- /dev/null +++ b/lib/Kconfig.kasan @@ -0,0 +1,43 @@ +config HAVE_ARCH_KASAN + bool + +if HAVE_ARCH_KASAN + +config KASAN + bool "KASan: runtime memory debugger" + help + Enables kernel address sanitizer 
- runtime memory debugger, + designed to find out-of-bounds accesses and use-after-free bugs. + This is strictly a debugging feature. It consumes about 1/8 + of available memory and brings about a ~x3 performance slowdown. + For better error detection enable CONFIG_STACKTRACE, + and add slub_debug=U to the boot cmdline. + +config KASAN_SHADOW_OFFSET + hex + +choice + prompt "Instrumentation type" + depends on KASAN + default KASAN_OUTLINE + +config KASAN_OUTLINE + bool "Outline instrumentation" + help + Before every memory access the compiler inserts a function call + (__asan_load*/__asan_store*). These functions perform a check + of the shadow memory. This is slower than inline instrumentation, + however it doesn't bloat the size of the kernel's .text section as + much as inline does. + +config KASAN_INLINE + bool "Inline instrumentation" + help + The compiler directly inserts code checking the shadow memory before + memory accesses. This is faster than outline (in some workloads + it gives about an x2 boost over outline instrumentation), but + makes the kernel's .text size much bigger. + +endchoice + +endif diff --git a/mm/Makefile b/mm/Makefile index 3548460ab7b6..930b52df4aca 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -49,6 +49,7 @@ obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o obj-$(CONFIG_KMEMCHECK) += kmemcheck.o +obj-$(CONFIG_KASAN) += kasan/ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile new file mode 100644 index 000000000000..bd837b8c2f41 --- /dev/null +++ b/mm/kasan/Makefile @@ -0,0 +1,8 @@ +KASAN_SANITIZE := n + +CFLAGS_REMOVE_kasan.o = -pg +# Function splitter causes unnecessary splits in __asan_load1/__asan_store1 +# see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533 +CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) + +obj-y := kasan.o report.o diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c new file mode 100644 index 000000000000..6dc1aa7cefcc --- /dev/null +++ b/mm/kasan/kasan.c @@ -0,0 +1,302 @@ +/* + * This file contains shadow memory manipulation code. + * + * Copyright (c) 2014 Samsung Electronics Co., Ltd. + * Author: Andrey Ryabinin + * + * Some of the code was borrowed from https://github.com/xairy/linux by + * Andrey Konovalov + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#define DISABLE_BRANCH_PROFILING + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kasan.h" + +/* + * Poisons the shadow memory for 'size' bytes starting from 'addr'. + * Memory addresses should be aligned to KASAN_SHADOW_SCALE_SIZE. 
+ */ +static void kasan_poison_shadow(const void *address, size_t size, u8 value) +{ + void *shadow_start, *shadow_end; + + shadow_start = kasan_mem_to_shadow(address); + shadow_end = kasan_mem_to_shadow(address + size); + + memset(shadow_start, value, shadow_end - shadow_start); +} + +void kasan_unpoison_shadow(const void *address, size_t size) +{ + kasan_poison_shadow(address, size, 0); + + if (size & KASAN_SHADOW_MASK) { + u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size); + *shadow = size & KASAN_SHADOW_MASK; + } +} + + +/* + * All functions below are always inlined so the compiler can + * perform better optimizations in each of __asan_loadX/__asan_storeX + * depending on the memory access size X. + */ + +static __always_inline bool memory_is_poisoned_1(unsigned long addr) +{ + s8 shadow_value = *(s8 *)kasan_mem_to_shadow((void *)addr); + + if (unlikely(shadow_value)) { + s8 last_accessible_byte = addr & KASAN_SHADOW_MASK; + return unlikely(last_accessible_byte >= shadow_value); + } + + return false; +} + +static __always_inline bool memory_is_poisoned_2(unsigned long addr) +{ + u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr); + + if (unlikely(*shadow_addr)) { + if (memory_is_poisoned_1(addr + 1)) + return true; + + if (likely(((addr + 1) & KASAN_SHADOW_MASK) != 0)) + return false; + + return unlikely(*(u8 *)shadow_addr); + } + + return false; +} + +static __always_inline bool memory_is_poisoned_4(unsigned long addr) +{ + u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr); + + if (unlikely(*shadow_addr)) { + if (memory_is_poisoned_1(addr + 3)) + return true; + + if (likely(((addr + 3) & KASAN_SHADOW_MASK) >= 3)) + return false; + + return unlikely(*(u8 *)shadow_addr); + } + + return false; +} + +static __always_inline bool memory_is_poisoned_8(unsigned long addr) +{ + u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr); + + if (unlikely(*shadow_addr)) { + if (memory_is_poisoned_1(addr + 7)) + return true; + + if (likely(((addr + 7) & KASAN_SHADOW_MASK) >= 7)) + return false; + + return unlikely(*(u8 *)shadow_addr); + } + + return false; +} + +static __always_inline bool memory_is_poisoned_16(unsigned long addr) +{ + u32 *shadow_addr = (u32 *)kasan_mem_to_shadow((void *)addr); + + if (unlikely(*shadow_addr)) { + u16 shadow_first_bytes = *(u16 *)shadow_addr; + s8 last_byte = (addr + 15) & KASAN_SHADOW_MASK; + + if (unlikely(shadow_first_bytes)) + return true; + + if (likely(!last_byte)) + return false; + + return memory_is_poisoned_1(addr + 15); + } + + return false; +} + +static __always_inline unsigned long bytes_is_zero(const u8 *start, + size_t size) +{ + while (size) { + if (unlikely(*start)) + return (unsigned long)start; + start++; + size--; + } + + return 0; +} + +static __always_inline unsigned long memory_is_zero(const void *start, + const void *end) +{ + unsigned int words; + unsigned long ret; + unsigned int prefix = (unsigned long)start % 8; + + if (end - start <= 16) + return bytes_is_zero(start, end - start); + + if (prefix) { + prefix = 8 - prefix; + ret = bytes_is_zero(start, prefix); + if (unlikely(ret)) + return ret; + start += prefix; + } + + words = (end - start) / 8; + while (words) { + if (unlikely(*(u64 *)start)) + return bytes_is_zero(start, 8); + start += 8; + words--; + } + + return bytes_is_zero(start, (end - start) % 8); +} + +static __always_inline bool memory_is_poisoned_n(unsigned long addr, + size_t size) +{ + unsigned long ret; + + ret = memory_is_zero(kasan_mem_to_shadow((void *)addr), + kasan_mem_to_shadow((void *)addr + size - 
1) + 1); + + if (unlikely(ret)) { + unsigned long last_byte = addr + size - 1; + s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte); + + if (unlikely(ret != (unsigned long)last_shadow || + ((last_byte & KASAN_SHADOW_MASK) >= *last_shadow))) + return true; + } + return false; +} + +static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size) +{ + if (__builtin_constant_p(size)) { + switch (size) { + case 1: + return memory_is_poisoned_1(addr); + case 2: + return memory_is_poisoned_2(addr); + case 4: + return memory_is_poisoned_4(addr); + case 8: + return memory_is_poisoned_8(addr); + case 16: + return memory_is_poisoned_16(addr); + default: + BUILD_BUG(); + } + } + + return memory_is_poisoned_n(addr, size); +} + + +static __always_inline void check_memory_region(unsigned long addr, + size_t size, bool write) +{ + struct kasan_access_info info; + + if (unlikely(size == 0)) + return; + + if (unlikely((void *)addr < + kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { + info.access_addr = (void *)addr; + info.access_size = size; + info.is_write = write; + info.ip = _RET_IP_; + kasan_report_user_access(&info); + return; + } + + if (likely(!memory_is_poisoned(addr, size))) + return; + + kasan_report(addr, size, write, _RET_IP_); +} + +#define DEFINE_ASAN_LOAD_STORE(size) \ + void __asan_load##size(unsigned long addr) \ + { \ + check_memory_region(addr, size, false); \ + } \ + EXPORT_SYMBOL(__asan_load##size); \ + __alias(__asan_load##size) \ + void __asan_load##size##_noabort(unsigned long); \ + EXPORT_SYMBOL(__asan_load##size##_noabort); \ + void __asan_store##size(unsigned long addr) \ + { \ + check_memory_region(addr, size, true); \ + } \ + EXPORT_SYMBOL(__asan_store##size); \ + __alias(__asan_store##size) \ + void __asan_store##size##_noabort(unsigned long); \ + EXPORT_SYMBOL(__asan_store##size##_noabort) + +DEFINE_ASAN_LOAD_STORE(1); +DEFINE_ASAN_LOAD_STORE(2); +DEFINE_ASAN_LOAD_STORE(4); +DEFINE_ASAN_LOAD_STORE(8); +DEFINE_ASAN_LOAD_STORE(16); + +void __asan_loadN(unsigned long addr, size_t size) +{ + check_memory_region(addr, size, false); +} +EXPORT_SYMBOL(__asan_loadN); + +__alias(__asan_loadN) +void __asan_loadN_noabort(unsigned long, size_t); +EXPORT_SYMBOL(__asan_loadN_noabort); + +void __asan_storeN(unsigned long addr, size_t size) +{ + check_memory_region(addr, size, true); +} +EXPORT_SYMBOL(__asan_storeN); + +__alias(__asan_storeN) +void __asan_storeN_noabort(unsigned long, size_t); +EXPORT_SYMBOL(__asan_storeN_noabort); + +/* to shut up compiler complaints */ +void __asan_handle_no_return(void) {} +EXPORT_SYMBOL(__asan_handle_no_return); diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h new file mode 100644 index 000000000000..648b9c006f3f --- /dev/null +++ b/mm/kasan/kasan.h @@ -0,0 +1,34 @@ +#ifndef __MM_KASAN_KASAN_H +#define __MM_KASAN_KASAN_H + +#include + +#define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT) +#define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1) + +struct kasan_access_info { + const void *access_addr; + const void *first_bad_addr; + size_t access_size; + bool is_write; + unsigned long ip; +}; + +void kasan_report_error(struct kasan_access_info *info); +void kasan_report_user_access(struct kasan_access_info *info); + +static inline const void *kasan_shadow_to_mem(const void *shadow_addr) +{ + return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET) + << KASAN_SHADOW_SCALE_SHIFT); +} + +static inline bool kasan_enabled(void) +{ + return !current->kasan_depth; +} + +void kasan_report(unsigned long addr, 
size_t size, + bool is_write, unsigned long ip); + +#endif diff --git a/mm/kasan/report.c b/mm/kasan/report.c new file mode 100644 index 000000000000..5835d69563f5 --- /dev/null +++ b/mm/kasan/report.c @@ -0,0 +1,209 @@ +/* + * This file contains error reporting code. + * + * Copyright (c) 2014 Samsung Electronics Co., Ltd. + * Author: Andrey Ryabinin + * + * Some of code borrowed from https://github.com/xairy/linux by + * Andrey Konovalov + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kasan.h" + +/* Shadow layout customization. */ +#define SHADOW_BYTES_PER_BLOCK 1 +#define SHADOW_BLOCKS_PER_ROW 16 +#define SHADOW_BYTES_PER_ROW (SHADOW_BLOCKS_PER_ROW * SHADOW_BYTES_PER_BLOCK) +#define SHADOW_ROWS_AROUND_ADDR 2 + +static const void *find_first_bad_addr(const void *addr, size_t size) +{ + u8 shadow_val = *(u8 *)kasan_mem_to_shadow(addr); + const void *first_bad_addr = addr; + + while (!shadow_val && first_bad_addr < addr + size) { + first_bad_addr += KASAN_SHADOW_SCALE_SIZE; + shadow_val = *(u8 *)kasan_mem_to_shadow(first_bad_addr); + } + return first_bad_addr; +} + +static void print_error_description(struct kasan_access_info *info) +{ + const char *bug_type = "unknown crash"; + u8 shadow_val; + + info->first_bad_addr = find_first_bad_addr(info->access_addr, + info->access_size); + + shadow_val = *(u8 *)kasan_mem_to_shadow(info->first_bad_addr); + + switch (shadow_val) { + case 0 ... KASAN_SHADOW_SCALE_SIZE - 1: + bug_type = "out of bounds access"; + break; + } + + pr_err("BUG: KASan: %s in %pS at addr %p\n", + bug_type, (void *)info->ip, + info->access_addr); + pr_err("%s of size %zu by task %s/%d\n", + info->is_write ? "Write" : "Read", + info->access_size, current->comm, task_pid_nr(current)); +} + +static void print_address_description(struct kasan_access_info *info) +{ + dump_stack(); +} + +static bool row_is_guilty(const void *row, const void *guilty) +{ + return (row <= guilty) && (guilty < row + SHADOW_BYTES_PER_ROW); +} + +static int shadow_pointer_offset(const void *row, const void *shadow) +{ + /* The length of ">ff00ff00ff00ff00: " is + * 3 + (BITS_PER_LONG/8)*2 chars. + */ + return 3 + (BITS_PER_LONG/8)*2 + (shadow - row)*2 + + (shadow - row) / SHADOW_BYTES_PER_BLOCK + 1; +} + +static void print_shadow_for_address(const void *addr) +{ + int i; + const void *shadow = kasan_mem_to_shadow(addr); + const void *shadow_row; + + shadow_row = (void *)round_down((unsigned long)shadow, + SHADOW_BYTES_PER_ROW) + - SHADOW_ROWS_AROUND_ADDR * SHADOW_BYTES_PER_ROW; + + pr_err("Memory state around the buggy address:\n"); + + for (i = -SHADOW_ROWS_AROUND_ADDR; i <= SHADOW_ROWS_AROUND_ADDR; i++) { + const void *kaddr = kasan_shadow_to_mem(shadow_row); + char buffer[4 + (BITS_PER_LONG/8)*2]; + + snprintf(buffer, sizeof(buffer), + (i == 0) ? 
">%p: " : " %p: ", kaddr); + + kasan_disable_current(); + print_hex_dump(KERN_ERR, buffer, + DUMP_PREFIX_NONE, SHADOW_BYTES_PER_ROW, 1, + shadow_row, SHADOW_BYTES_PER_ROW, 0); + kasan_enable_current(); + + if (row_is_guilty(shadow_row, shadow)) + pr_err("%*c\n", + shadow_pointer_offset(shadow_row, shadow), + '^'); + + shadow_row += SHADOW_BYTES_PER_ROW; + } +} + +static DEFINE_SPINLOCK(report_lock); + +void kasan_report_error(struct kasan_access_info *info) +{ + unsigned long flags; + + spin_lock_irqsave(&report_lock, flags); + pr_err("=================================" + "=================================\n"); + print_error_description(info); + print_address_description(info); + print_shadow_for_address(info->first_bad_addr); + pr_err("=================================" + "=================================\n"); + spin_unlock_irqrestore(&report_lock, flags); +} + +void kasan_report_user_access(struct kasan_access_info *info) +{ + unsigned long flags; + + spin_lock_irqsave(&report_lock, flags); + pr_err("=================================" + "=================================\n"); + pr_err("BUG: KASan: user-memory-access on address %p\n", + info->access_addr); + pr_err("%s of size %zu by task %s/%d\n", + info->is_write ? "Write" : "Read", + info->access_size, current->comm, task_pid_nr(current)); + dump_stack(); + pr_err("=================================" + "=================================\n"); + spin_unlock_irqrestore(&report_lock, flags); +} + +void kasan_report(unsigned long addr, size_t size, + bool is_write, unsigned long ip) +{ + struct kasan_access_info info; + + if (likely(!kasan_enabled())) + return; + + info.access_addr = (void *)addr; + info.access_size = size; + info.is_write = is_write; + info.ip = ip; + kasan_report_error(&info); +} + + +#define DEFINE_ASAN_REPORT_LOAD(size) \ +void __asan_report_load##size##_noabort(unsigned long addr) \ +{ \ + kasan_report(addr, size, false, _RET_IP_); \ +} \ +EXPORT_SYMBOL(__asan_report_load##size##_noabort) + +#define DEFINE_ASAN_REPORT_STORE(size) \ +void __asan_report_store##size##_noabort(unsigned long addr) \ +{ \ + kasan_report(addr, size, true, _RET_IP_); \ +} \ +EXPORT_SYMBOL(__asan_report_store##size##_noabort) + +DEFINE_ASAN_REPORT_LOAD(1); +DEFINE_ASAN_REPORT_LOAD(2); +DEFINE_ASAN_REPORT_LOAD(4); +DEFINE_ASAN_REPORT_LOAD(8); +DEFINE_ASAN_REPORT_LOAD(16); +DEFINE_ASAN_REPORT_STORE(1); +DEFINE_ASAN_REPORT_STORE(2); +DEFINE_ASAN_REPORT_STORE(4); +DEFINE_ASAN_REPORT_STORE(8); +DEFINE_ASAN_REPORT_STORE(16); + +void __asan_report_load_n_noabort(unsigned long addr, size_t size) +{ + kasan_report(addr, size, false, _RET_IP_); +} +EXPORT_SYMBOL(__asan_report_load_n_noabort); + +void __asan_report_store_n_noabort(unsigned long addr, size_t size) +{ + kasan_report(addr, size, true, _RET_IP_); +} +EXPORT_SYMBOL(__asan_report_store_n_noabort); diff --git a/scripts/Makefile.kasan b/scripts/Makefile.kasan new file mode 100644 index 000000000000..7acd6faa0335 --- /dev/null +++ b/scripts/Makefile.kasan @@ -0,0 +1,24 @@ +ifdef CONFIG_KASAN +ifdef CONFIG_KASAN_INLINE + call_threshold := 10000 +else + call_threshold := 0 +endif + +CFLAGS_KASAN_MINIMAL := -fsanitize=kernel-address + +CFLAGS_KASAN := $(call cc-option, -fsanitize=kernel-address \ + -fasan-shadow-offset=$(CONFIG_KASAN_SHADOW_OFFSET) \ + --param asan-instrumentation-with-call-threshold=$(call_threshold)) + +ifeq ($(call cc-option, $(CFLAGS_KASAN_MINIMAL) -Werror),) + $(warning Cannot use CONFIG_KASAN: \ + -fsanitize=kernel-address is not supported by compiler) +else + ifeq 
($(CFLAGS_KASAN),) + $(warning CONFIG_KASAN: compiler does not support all options.\ + Trying minimal configuration) + CFLAGS_KASAN := $(CFLAGS_KASAN_MINIMAL) + endif +endif +endif diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 511755200634..044eb4f89a91 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -119,6 +119,16 @@ _c_flags += $(if $(patsubst n%,, \ $(CFLAGS_GCOV)) endif +# +# Enable address sanitizer flags for kernel except some files or directories +# we don't want to check (depends on variables KASAN_SANITIZE_obj.o, KASAN_SANITIZE) +# +ifeq ($(CONFIG_KASAN),y) +_c_flags += $(if $(patsubst n%,, \ + $(KASAN_SANITIZE_$(basetarget).o)$(KASAN_SANITIZE)y), \ + $(CFLAGS_KASAN)) +endif + # If building the kernel in a separate objtree expand all occurrences # of -Idir to -I$(srctree)/dir except for absolute paths (starting with '/'). -- cgit v1.2.3 From b8c73fc2493d42517be95cf2c89659fc6c6f4d02 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 13 Feb 2015 14:39:28 -0800 Subject: mm: page_alloc: add kasan hooks on alloc and free paths Add kernel address sanitizer hooks to mark allocated page's addresses as accessible in corresponding shadow region. Mark freed pages as inaccessible. Signed-off-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Konstantin Serebryany Cc: Dmitry Chernenkov Signed-off-by: Andrey Konovalov Cc: Yuri Gribov Cc: Konstantin Khlebnikov Cc: Sasha Levin Cc: Christoph Lameter Cc: Joonsoo Kim Cc: Dave Hansen Cc: Andi Kleen Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 6 ++++++ mm/compaction.c | 2 ++ mm/kasan/kasan.c | 14 ++++++++++++++ mm/kasan/kasan.h | 2 ++ mm/kasan/report.c | 11 +++++++++++ mm/page_alloc.c | 3 +++ 6 files changed, 38 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 9102fda60def..f00c15c41235 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -34,6 +34,9 @@ static inline void kasan_disable_current(void) void kasan_unpoison_shadow(const void *address, size_t size); +void kasan_alloc_pages(struct page *page, unsigned int order); +void kasan_free_pages(struct page *page, unsigned int order); + #else /* CONFIG_KASAN */ static inline void kasan_unpoison_shadow(const void *address, size_t size) {} @@ -41,6 +44,9 @@ static inline void kasan_unpoison_shadow(const void *address, size_t size) {} static inline void kasan_enable_current(void) {} static inline void kasan_disable_current(void) {} +static inline void kasan_alloc_pages(struct page *page, unsigned int order) {} +static inline void kasan_free_pages(struct page *page, unsigned int order) {} + #endif /* CONFIG_KASAN */ #endif /* LINUX_KASAN_H */ diff --git a/mm/compaction.c b/mm/compaction.c index d50d6de6f1b6..8c0d9459b54a 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "internal.h" #ifdef CONFIG_COMPACTION @@ -72,6 +73,7 @@ static void map_pages(struct list_head *list) list_for_each_entry(page, list, lru) { arch_alloc_page(page, 0); kernel_map_pages(page, 1, 1); + kasan_alloc_pages(page, 0); } } diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index def81104772f..b516eb8632b9 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -254,6 +254,20 @@ static __always_inline void check_memory_region(unsigned long addr, kasan_report(addr, size, write, _RET_IP_); } +void 
kasan_alloc_pages(struct page *page, unsigned int order) +{ + if (likely(!PageHighMem(page))) + kasan_unpoison_shadow(page_address(page), PAGE_SIZE << order); +} + +void kasan_free_pages(struct page *page, unsigned int order) +{ + if (likely(!PageHighMem(page))) + kasan_poison_shadow(page_address(page), + PAGE_SIZE << order, + KASAN_FREE_PAGE); +} + #define DEFINE_ASAN_LOAD_STORE(size) \ void __asan_load##size(unsigned long addr) \ { \ diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 648b9c006f3f..d3c90d5dd97a 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -6,6 +6,8 @@ #define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT) #define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1) +#define KASAN_FREE_PAGE 0xFF /* page was freed */ + struct kasan_access_info { const void *access_addr; const void *first_bad_addr; diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 5835d69563f5..fab8e7882ff1 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -54,6 +54,9 @@ static void print_error_description(struct kasan_access_info *info) shadow_val = *(u8 *)kasan_mem_to_shadow(info->first_bad_addr); switch (shadow_val) { + case KASAN_FREE_PAGE: + bug_type = "use after free"; + break; case 0 ... KASAN_SHADOW_SCALE_SIZE - 1: bug_type = "out of bounds access"; break; @@ -69,6 +72,14 @@ static void print_error_description(struct kasan_access_info *info) static void print_address_description(struct kasan_access_info *info) { + const void *addr = info->access_addr; + + if ((addr >= (void *)PAGE_OFFSET) && + (addr < high_memory)) { + struct page *page = virt_to_head_page(addr); + dump_page(page, "kasan: bad access detected"); + } + dump_stack(); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cb4758263f6b..a47f0b229a1a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -787,6 +788,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order) trace_mm_page_free(page, order); kmemcheck_free_shadow(page, order); + kasan_free_pages(page, order); if (PageAnon(page)) page->mapping = NULL; @@ -970,6 +972,7 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, arch_alloc_page(page, order); kernel_map_pages(page, 1 << order, 1); + kasan_alloc_pages(page, order); if (gfp_flags & __GFP_ZERO) prep_zero_page(page, order, gfp_flags); -- cgit v1.2.3 From 912f5fbf1d3060f25d6994aed0265c55b974b2e9 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 13 Feb 2015 14:39:31 -0800 Subject: mm: slub: introduce virt_to_obj function virt_to_obj takes kmem_cache address, address of slab page, address x pointing somewhere inside slab object, and returns address of the beginning of object. Signed-off-by: Andrey Ryabinin Acked-by: Christoph Lameter Cc: Dmitry Vyukov Cc: Konstantin Serebryany Cc: Dmitry Chernenkov Signed-off-by: Andrey Konovalov Cc: Yuri Gribov Cc: Konstantin Khlebnikov Cc: Sasha Levin Cc: Christoph Lameter Cc: Joonsoo Kim Cc: Dave Hansen Cc: Andi Kleen Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slub_def.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 9abf04ed0999..db7d5de00c5f 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -110,4 +110,20 @@ static inline void sysfs_slab_remove(struct kmem_cache *s) } #endif + +/** + * virt_to_obj - returns address of the beginning of object. + * @s: object's kmem_cache + * @slab_page: address of slab page + * @x: address within object memory range + * + * Returns address of the beginning of object + */ +static inline void *virt_to_obj(struct kmem_cache *s, + const void *slab_page, + const void *x) +{ + return (void *)x - ((x - slab_page) % s->size); +} + #endif /* _LINUX_SLUB_DEF_H */ -- cgit v1.2.3 From 75c66def8d815201aa0386ecc7c66a5c8dbca1ee Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 13 Feb 2015 14:39:35 -0800 Subject: mm: slub: share object_err function Remove static and add function declarations to linux/slub_def.h so it could be used by kernel address sanitizer. Signed-off-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Konstantin Serebryany Cc: Dmitry Chernenkov Signed-off-by: Andrey Konovalov Cc: Yuri Gribov Cc: Konstantin Khlebnikov Cc: Sasha Levin Cc: Christoph Lameter Cc: Joonsoo Kim Cc: Dave Hansen Cc: Andi Kleen Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slub_def.h | 3 +++ mm/slub.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index db7d5de00c5f..33885118523c 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -126,4 +126,7 @@ static inline void *virt_to_obj(struct kmem_cache *s, return (void *)x - ((x - slab_page) % s->size); } +void object_err(struct kmem_cache *s, struct page *page, + u8 *object, char *reason); + #endif /* _LINUX_SLUB_DEF_H */ diff --git a/mm/slub.c b/mm/slub.c index 783505ba2052..6833b73ef6b3 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -629,7 +629,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) dump_stack(); } -static void object_err(struct kmem_cache *s, struct page *page, +void object_err(struct kmem_cache *s, struct page *page, u8 *object, char *reason) { slab_bug(s, "%s", reason); -- cgit v1.2.3 From 0316bec22ec95ea2faca6406437b0b5950553b7c Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 13 Feb 2015 14:39:42 -0800 Subject: mm: slub: add kernel address sanitizer support for slub allocator With this patch kasan will be able to catch bugs in memory allocated by slub. Initially all objects in newly allocated slab page, marked as redzone. Later, when allocation of slub object happens, requested by caller number of bytes marked as accessible, and the rest of the object (including slub's metadata) marked as redzone (inaccessible). We also mark object as accessible if ksize was called for this object. There is some places in kernel where ksize function is called to inquire size of really allocated area. Such callers could validly access whole allocated memory, so it should be marked as accessible. Code in slub.c and slab_common.c files could validly access to object's metadata, so instrumentation for this files are disabled. 
Signed-off-by: Andrey Ryabinin Signed-off-by: Dmitry Chernenkov Cc: Dmitry Vyukov Cc: Konstantin Serebryany Signed-off-by: Andrey Konovalov Cc: Yuri Gribov Cc: Konstantin Khlebnikov Cc: Sasha Levin Cc: Christoph Lameter Cc: Joonsoo Kim Cc: Dave Hansen Cc: Andi Kleen Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 27 ++++++++++++++ include/linux/slab.h | 11 ++++-- lib/Kconfig.kasan | 1 + mm/Makefile | 3 ++ mm/kasan/kasan.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++ mm/kasan/kasan.h | 5 +++ mm/kasan/report.c | 21 +++++++++++ mm/slab_common.c | 5 ++- mm/slub.c | 31 ++++++++++++++-- 9 files changed, 197 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index f00c15c41235..d5310eef3e38 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -37,6 +37,18 @@ void kasan_unpoison_shadow(const void *address, size_t size); void kasan_alloc_pages(struct page *page, unsigned int order); void kasan_free_pages(struct page *page, unsigned int order); +void kasan_poison_slab(struct page *page); +void kasan_unpoison_object_data(struct kmem_cache *cache, void *object); +void kasan_poison_object_data(struct kmem_cache *cache, void *object); + +void kasan_kmalloc_large(const void *ptr, size_t size); +void kasan_kfree_large(const void *ptr); +void kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size); +void kasan_krealloc(const void *object, size_t new_size); + +void kasan_slab_alloc(struct kmem_cache *s, void *object); +void kasan_slab_free(struct kmem_cache *s, void *object); + #else /* CONFIG_KASAN */ static inline void kasan_unpoison_shadow(const void *address, size_t size) {} @@ -47,6 +59,21 @@ static inline void kasan_disable_current(void) {} static inline void kasan_alloc_pages(struct page *page, unsigned int order) {} static inline void kasan_free_pages(struct page *page, unsigned int order) {} +static inline void kasan_poison_slab(struct page *page) {} +static inline void kasan_unpoison_object_data(struct kmem_cache *cache, + void *object) {} +static inline void kasan_poison_object_data(struct kmem_cache *cache, + void *object) {} + +static inline void kasan_kmalloc_large(void *ptr, size_t size) {} +static inline void kasan_kfree_large(const void *ptr) {} +static inline void kasan_kmalloc(struct kmem_cache *s, const void *object, + size_t size) {} +static inline void kasan_krealloc(const void *object, size_t new_size) {} + +static inline void kasan_slab_alloc(struct kmem_cache *s, void *object) {} +static inline void kasan_slab_free(struct kmem_cache *s, void *object) {} + #endif /* CONFIG_KASAN */ #endif /* LINUX_KASAN_H */ diff --git a/include/linux/slab.h b/include/linux/slab.h index ed2ffaab59ea..76f1feeabd38 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -104,6 +104,7 @@ (unsigned long)ZERO_SIZE_PTR) #include +#include struct mem_cgroup; /* @@ -325,7 +326,10 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s, static __always_inline void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t flags, size_t size) { - return kmem_cache_alloc(s, flags); + void *ret = kmem_cache_alloc(s, flags); + + kasan_kmalloc(s, ret, size); + return ret; } static __always_inline void * @@ -333,7 +337,10 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, int node, size_t size) { - return kmem_cache_alloc_node(s, gfpflags, node); 
+ void *ret = kmem_cache_alloc_node(s, gfpflags, node); + + kasan_kmalloc(s, ret, size); + return ret; } #endif /* CONFIG_TRACING */ diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index 0052b1b9aadd..a11ac0234452 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -5,6 +5,7 @@ if HAVE_ARCH_KASAN config KASAN bool "KASan: runtime memory debugger" + depends on SLUB_DEBUG help Enables kernel address sanitizer - runtime memory debugger, designed to find out-of-bounds accesses and use-after-free bugs. diff --git a/mm/Makefile b/mm/Makefile index 930b52df4aca..088c68e9ec35 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -2,6 +2,9 @@ # Makefile for the linux memory manager. # +KASAN_SANITIZE_slab_common.o := n +KASAN_SANITIZE_slub.o := n + mmu-y := nommu.o mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index b516eb8632b9..dc83f070edb6 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -31,6 +31,7 @@ #include #include "kasan.h" +#include "../slab.h" /* * Poisons the shadow memory for 'size' bytes starting from 'addr'. @@ -268,6 +269,103 @@ void kasan_free_pages(struct page *page, unsigned int order) KASAN_FREE_PAGE); } +void kasan_poison_slab(struct page *page) +{ + kasan_poison_shadow(page_address(page), + PAGE_SIZE << compound_order(page), + KASAN_KMALLOC_REDZONE); +} + +void kasan_unpoison_object_data(struct kmem_cache *cache, void *object) +{ + kasan_unpoison_shadow(object, cache->object_size); +} + +void kasan_poison_object_data(struct kmem_cache *cache, void *object) +{ + kasan_poison_shadow(object, + round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE), + KASAN_KMALLOC_REDZONE); +} + +void kasan_slab_alloc(struct kmem_cache *cache, void *object) +{ + kasan_kmalloc(cache, object, cache->object_size); +} + +void kasan_slab_free(struct kmem_cache *cache, void *object) +{ + unsigned long size = cache->object_size; + unsigned long rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE); + + /* RCU slabs could be legally used after free within the RCU period */ + if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) + return; + + kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); +} + +void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size) +{ + unsigned long redzone_start; + unsigned long redzone_end; + + if (unlikely(object == NULL)) + return; + + redzone_start = round_up((unsigned long)(object + size), + KASAN_SHADOW_SCALE_SIZE); + redzone_end = round_up((unsigned long)object + cache->object_size, + KASAN_SHADOW_SCALE_SIZE); + + kasan_unpoison_shadow(object, size); + kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, + KASAN_KMALLOC_REDZONE); +} +EXPORT_SYMBOL(kasan_kmalloc); + +void kasan_kmalloc_large(const void *ptr, size_t size) +{ + struct page *page; + unsigned long redzone_start; + unsigned long redzone_end; + + if (unlikely(ptr == NULL)) + return; + + page = virt_to_page(ptr); + redzone_start = round_up((unsigned long)(ptr + size), + KASAN_SHADOW_SCALE_SIZE); + redzone_end = (unsigned long)ptr + (PAGE_SIZE << compound_order(page)); + + kasan_unpoison_shadow(ptr, size); + kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, + KASAN_PAGE_REDZONE); +} + +void kasan_krealloc(const void *object, size_t size) +{ + struct page *page; + + if (unlikely(object == ZERO_SIZE_PTR)) + return; + + page = virt_to_head_page(object); + + if (unlikely(!PageSlab(page))) + kasan_kmalloc_large(object, size); + 
else + kasan_kmalloc(page->slab_cache, object, size); +} + +void kasan_kfree_large(const void *ptr) +{ + struct page *page = virt_to_page(ptr); + + kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page), + KASAN_FREE_PAGE); +} + #define DEFINE_ASAN_LOAD_STORE(size) \ void __asan_load##size(unsigned long addr) \ { \ diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index d3c90d5dd97a..5b052ab40cf9 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -7,6 +7,11 @@ #define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1) #define KASAN_FREE_PAGE 0xFF /* page was freed */ +#define KASAN_FREE_PAGE 0xFF /* page was freed */ +#define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */ +#define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */ +#define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */ + struct kasan_access_info { const void *access_addr; diff --git a/mm/kasan/report.c b/mm/kasan/report.c index fab8e7882ff1..2760edb4d0a8 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -24,6 +24,7 @@ #include #include "kasan.h" +#include "../slab.h" /* Shadow layout customization. */ #define SHADOW_BYTES_PER_BLOCK 1 @@ -55,8 +56,11 @@ static void print_error_description(struct kasan_access_info *info) switch (shadow_val) { case KASAN_FREE_PAGE: + case KASAN_KMALLOC_FREE: bug_type = "use after free"; break; + case KASAN_PAGE_REDZONE: + case KASAN_KMALLOC_REDZONE: case 0 ... KASAN_SHADOW_SCALE_SIZE - 1: bug_type = "out of bounds access"; break; @@ -77,6 +81,23 @@ static void print_address_description(struct kasan_access_info *info) if ((addr >= (void *)PAGE_OFFSET) && (addr < high_memory)) { struct page *page = virt_to_head_page(addr); + + if (PageSlab(page)) { + void *object; + struct kmem_cache *cache = page->slab_cache; + void *last_object; + + object = virt_to_obj(cache, page_address(page), addr); + last_object = page_address(page) + + page->objects * cache->size; + + if (unlikely(object > last_object)) + object = last_object; /* we hit into padding */ + + object_err(cache, page, object, + "kasan: bad access detected"); + return; + } dump_page(page, "kasan: bad access detected"); } diff --git a/mm/slab_common.c b/mm/slab_common.c index 429a4506b382..999bb3424d44 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -898,6 +898,7 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) page = alloc_kmem_pages(flags, order); ret = page ? 
page_address(page) : NULL; kmemleak_alloc(ret, size, 1, flags); + kasan_kmalloc_large(ret, size); return ret; } EXPORT_SYMBOL(kmalloc_order); @@ -1077,8 +1078,10 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size, if (p) ks = ksize(p); - if (ks >= new_size) + if (ks >= new_size) { + kasan_krealloc((void *)p, new_size); return (void *)p; + } ret = kmalloc_track_caller(new_size, flags); if (ret && p) diff --git a/mm/slub.c b/mm/slub.c index 37555ad8894d..6832c4eab104 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1251,11 +1251,13 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node, static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) { kmemleak_alloc(ptr, size, 1, flags); + kasan_kmalloc_large(ptr, size); } static inline void kfree_hook(const void *x) { kmemleak_free(x); + kasan_kfree_large(x); } static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, @@ -1278,6 +1280,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); memcg_kmem_put_cache(s); + kasan_slab_alloc(s, object); } static inline void slab_free_hook(struct kmem_cache *s, void *x) @@ -1301,6 +1304,8 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) #endif if (!(s->flags & SLAB_DEBUG_OBJECTS)) debug_check_no_obj_freed(x, s->object_size); + + kasan_slab_free(s, x); } /* @@ -1395,8 +1400,11 @@ static void setup_object(struct kmem_cache *s, struct page *page, void *object) { setup_object_debug(s, page, object); - if (unlikely(s->ctor)) + if (unlikely(s->ctor)) { + kasan_unpoison_object_data(s, object); s->ctor(object); + kasan_poison_object_data(s, object); + } } static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) @@ -1429,6 +1437,8 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) if (unlikely(s->flags & SLAB_POISON)) memset(start, POISON_INUSE, PAGE_SIZE << order); + kasan_poison_slab(page); + for_each_object_idx(p, idx, s, start, page->objects) { setup_object(s, page, p); if (likely(idx < page->objects)) @@ -2522,6 +2532,7 @@ void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) { void *ret = slab_alloc(s, gfpflags, _RET_IP_); trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); + kasan_kmalloc(s, ret, size); return ret; } EXPORT_SYMBOL(kmem_cache_alloc_trace); @@ -2548,6 +2559,8 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s, trace_kmalloc_node(_RET_IP_, ret, size, s->size, gfpflags, node); + + kasan_kmalloc(s, ret, size); return ret; } EXPORT_SYMBOL(kmem_cache_alloc_node_trace); @@ -2933,6 +2946,7 @@ static void early_kmem_cache_node_alloc(int node) init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); init_tracking(kmem_cache_node, n); #endif + kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node)); init_kmem_cache_node(n); inc_slabs_node(kmem_cache_node, node, page->objects); @@ -3305,6 +3319,8 @@ void *__kmalloc(size_t size, gfp_t flags) trace_kmalloc(_RET_IP_, ret, size, s->size, flags); + kasan_kmalloc(s, ret, size); + return ret; } EXPORT_SYMBOL(__kmalloc); @@ -3348,12 +3364,14 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); + kasan_kmalloc(s, ret, size); + return ret; } EXPORT_SYMBOL(__kmalloc_node); #endif -size_t ksize(const void *object) +static size_t __ksize(const void *object) { struct page *page; @@ -3369,6 
+3387,15 @@ size_t ksize(const void *object) return slab_ksize(page->slab_cache); } + +size_t ksize(const void *object) +{ + size_t size = __ksize(object); + /* We assume that ksize callers could use whole allocated area, + so we need unpoison this area. */ + kasan_krealloc(object, size); + return size; +} EXPORT_SYMBOL(ksize); void kfree(const void *x) -- cgit v1.2.3 From c420f167db8c799d69fe43a801c58a7f02e9d57c Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 13 Feb 2015 14:39:59 -0800 Subject: kasan: enable stack instrumentation Stack instrumentation makes it possible to detect out-of-bounds memory accesses to variables allocated on the stack. The compiler adds redzones around every variable on the stack and poisons them in the function's prologue. Such an approach significantly increases stack usage, so all in-kernel stack sizes were doubled. Signed-off-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Konstantin Serebryany Cc: Dmitry Chernenkov Signed-off-by: Andrey Konovalov Cc: Yuri Gribov Cc: Konstantin Khlebnikov Cc: Sasha Levin Cc: Christoph Lameter Cc: Joonsoo Kim Cc: Dave Hansen Cc: Andi Kleen Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/page_64_types.h | 12 +++++++++--- arch/x86/kernel/Makefile | 2 ++ arch/x86/mm/kasan_init_64.c | 11 +++++++++-- include/linux/init_task.h | 8 ++++++++ mm/kasan/kasan.h | 9 +++++++++ mm/kasan/report.c | 6 ++++++ scripts/Makefile.kasan | 1 + 7 files changed, 44 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 75450b2c7be4..4edd53b79a81 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -1,17 +1,23 @@ #ifndef _ASM_X86_PAGE_64_DEFS_H #define _ASM_X86_PAGE_64_DEFS_H -#define THREAD_SIZE_ORDER 2 +#ifdef CONFIG_KASAN +#define KASAN_STACK_ORDER 1 +#else +#define KASAN_STACK_ORDER 0 +#endif + +#define THREAD_SIZE_ORDER (2 + KASAN_STACK_ORDER) #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) #define CURRENT_MASK (~(THREAD_SIZE - 1)) -#define EXCEPTION_STACK_ORDER 0 +#define EXCEPTION_STACK_ORDER (0 + KASAN_STACK_ORDER) #define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER) #define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1) #define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER) -#define IRQ_STACK_ORDER 2 +#define IRQ_STACK_ORDER (2 + KASAN_STACK_ORDER) #define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER) #define DOUBLEFAULT_STACK 1 diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index b13b70634124..cdb1b70ddad0 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -17,6 +17,8 @@ CFLAGS_REMOVE_early_printk.o = -pg endif KASAN_SANITIZE_head$(BITS).o := n +KASAN_SANITIZE_dumpstack.o := n +KASAN_SANITIZE_dumpstack_$(BITS).o := n CFLAGS_irq.o := -I$(src)/../include/asm/trace diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 3e4d9a1a39fa..53508708b7aa 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -189,11 +189,18 @@ void __init kasan_init(void) if (map_range(&pfn_mapped[i])) panic("kasan: unable to allocate shadow!"); } - populate_zero_shadow(kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), - (void *)KASAN_SHADOW_END); + kasan_mem_to_shadow((void *)__START_KERNEL_map)); + + vmemmap_populate((unsigned long)kasan_mem_to_shadow(_stext), + (unsigned long)kasan_mem_to_shadow(_end), +
NUMA_NO_NODE); + + populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_VADDR), + (void *)KASAN_SHADOW_END); memset(kasan_zero_page, 0, PAGE_SIZE); load_cr3(init_level4_pgt); + init_task.kasan_depth = 0; } diff --git a/include/linux/init_task.h b/include/linux/init_task.h index d3d43ecf148c..696d22312b31 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -175,6 +175,13 @@ extern struct task_group root_task_group; # define INIT_NUMA_BALANCING(tsk) #endif +#ifdef CONFIG_KASAN +# define INIT_KASAN(tsk) \ + .kasan_depth = 1, +#else +# define INIT_KASAN(tsk) +#endif + /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. Base=0, limit=0x1fffff (=2MB) @@ -250,6 +257,7 @@ extern struct task_group root_task_group; INIT_RT_MUTEXES(tsk) \ INIT_VTIME(tsk) \ INIT_NUMA_BALANCING(tsk) \ + INIT_KASAN(tsk) \ } diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 5b052ab40cf9..1fcc1d81a9cf 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -12,6 +12,15 @@ #define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */ #define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */ +/* + * Stack redzone shadow values + * (Those are compiler's ABI, don't change them) + */ +#define KASAN_STACK_LEFT 0xF1 +#define KASAN_STACK_MID 0xF2 +#define KASAN_STACK_RIGHT 0xF3 +#define KASAN_STACK_PARTIAL 0xF4 + struct kasan_access_info { const void *access_addr; diff --git a/mm/kasan/report.c b/mm/kasan/report.c index fab8e7882ff1..866732ef3db3 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -64,6 +64,12 @@ static void print_error_description(struct kasan_access_info *info) case 0 ... KASAN_SHADOW_SCALE_SIZE - 1: bug_type = "out of bounds access"; break; + case KASAN_STACK_LEFT: + case KASAN_STACK_MID: + case KASAN_STACK_RIGHT: + case KASAN_STACK_PARTIAL: + bug_type = "out of bounds on stack"; + break; } pr_err("BUG: KASan: %s in %pS at addr %p\n", diff --git a/scripts/Makefile.kasan b/scripts/Makefile.kasan index 7acd6faa0335..2163b8cc446e 100644 --- a/scripts/Makefile.kasan +++ b/scripts/Makefile.kasan @@ -9,6 +9,7 @@ CFLAGS_KASAN_MINIMAL := -fsanitize=kernel-address CFLAGS_KASAN := $(call cc-option, -fsanitize=kernel-address \ -fasan-shadow-offset=$(CONFIG_KASAN_SHADOW_OFFSET) \ + --param asan-stack=1 \ --param asan-instrumentation-with-call-threshold=$(call_threshold)) ifeq ($(call cc-option, $(CFLAGS_KASAN_MINIMAL) -Werror),) -- cgit v1.2.3 From 71394fe50146202f2c8d92cf50f5ebc761acf254 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 13 Feb 2015 14:40:03 -0800 Subject: mm: vmalloc: add flag preventing guard hole allocation To instrument global variables, KASan needs shadow memory backing the memory used for modules. So on module load we will need to allocate shadow memory and map it at the address in the shadow region that corresponds to the address returned by module_alloc(). __vmalloc_node_range() could be used for this purpose, except that it puts a guard hole after the allocated area. A guard hole in shadow memory would be a problem, because at some future point we might need shadow memory at the address occupied by that guard hole, so we could fail to allocate shadow for module_alloc(). Add a new vm_struct flag 'VM_NO_GUARD' indicating that the vm area doesn't have a guard hole.
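[Editor's note: an illustrative sketch of the size accounting this flag changes, not part of the patch; 4 KiB pages are assumed, and the flag value is copied from the hunk below so the example is stand-alone.]

#include <stdio.h>

#define PAGE_SIZE	4096UL		/* assumed 4 KiB pages */
#define VM_NO_GUARD	0x00000040UL	/* value from the patch below */

/* With a guard page, a request occupies size + PAGE_SIZE of address
 * space; with VM_NO_GUARD it occupies exactly size, which is what lets
 * shadow areas for adjacent module mappings tile without holes. */
static unsigned long footprint(unsigned long size, unsigned long flags)
{
	return (flags & VM_NO_GUARD) ? size : size + PAGE_SIZE;
}

int main(void)
{
	printf("guarded:   %lu bytes\n", footprint(16 * PAGE_SIZE, 0));
	printf("unguarded: %lu bytes\n", footprint(16 * PAGE_SIZE, VM_NO_GUARD));
	return 0;
}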
Signed-off-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Konstantin Serebryany Cc: Dmitry Chernenkov Signed-off-by: Andrey Konovalov Cc: Yuri Gribov Cc: Konstantin Khlebnikov Cc: Sasha Levin Cc: Christoph Lameter Cc: Joonsoo Kim Cc: Dave Hansen Cc: Andi Kleen Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 9 +++++++-- mm/vmalloc.c | 6 ++---- 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index b87696fdf06a..1526fe712ca0 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -16,6 +16,7 @@ struct vm_area_struct; /* vma defining user mapping in mm_types.h */ #define VM_USERMAP 0x00000008 /* suitable for remap_vmalloc_range */ #define VM_VPAGES 0x00000010 /* buffer for pages was vmalloc'ed */ #define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */ +#define VM_NO_GUARD 0x00000040 /* don't add guard page */ /* bits [20..32] reserved for arch specific ioremap internals */ /* @@ -96,8 +97,12 @@ void vmalloc_sync_all(void); static inline size_t get_vm_area_size(const struct vm_struct *area) { - /* return actual size without guard page */ - return area->size - PAGE_SIZE; + if (!(area->flags & VM_NO_GUARD)) + /* return actual size without guard page */ + return area->size - PAGE_SIZE; + else + return area->size; + } extern struct vm_struct *get_vm_area(unsigned long size, unsigned long flags); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 39c338896416..2e74e99d4cfe 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1324,10 +1324,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, if (unlikely(!area)) return NULL; - /* - * We always allocate a guard page. - */ - size += PAGE_SIZE; + if (!(flags & VM_NO_GUARD)) + size += PAGE_SIZE; va = alloc_vmap_area(size, align, start, end, node, gfp_mask); if (IS_ERR(va)) { -- cgit v1.2.3 From cb9e3c292d0115499c660028ad35ac5501d722b5 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 13 Feb 2015 14:40:07 -0800 Subject: mm: vmalloc: pass additional vm_flags to __vmalloc_node_range() To instrument global variables, KASan needs shadow memory backing the memory used for modules. So on module load we will need to allocate shadow memory and map it at the address in the shadow region that corresponds to the address returned by module_alloc(). __vmalloc_node_range() could be used for this purpose, except that it puts a guard hole after the allocated area. A guard hole in shadow memory would be a problem, because at some future point we might need shadow memory at the address occupied by that guard hole, so we could fail to allocate shadow for module_alloc(). Now we have the VM_NO_GUARD flag for disabling the guard page, so we need to pass it into __vmalloc_node_range(). Add a new parameter 'vm_flags' to the __vmalloc_node_range() function. Signed-off-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Konstantin Serebryany Cc: Dmitry Chernenkov Signed-off-by: Andrey Konovalov Cc: Yuri Gribov Cc: Konstantin Khlebnikov Cc: Sasha Levin Cc: Christoph Lameter Cc: Joonsoo Kim Cc: Dave Hansen Cc: Andi Kleen Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H.
Peter Anvin" Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/kernel/module.c | 2 +- arch/arm64/kernel/module.c | 4 ++-- arch/mips/kernel/module.c | 2 +- arch/parisc/kernel/module.c | 2 +- arch/s390/kernel/module.c | 2 +- arch/sparc/kernel/module.c | 2 +- arch/unicore32/kernel/module.c | 2 +- arch/x86/kernel/module.c | 2 +- include/linux/vmalloc.h | 4 +++- mm/vmalloc.c | 10 ++++++---- 10 files changed, 18 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c index bea7db9e5b80..2e11961f65ae 100644 --- a/arch/arm/kernel/module.c +++ b/arch/arm/kernel/module.c @@ -41,7 +41,7 @@ void *module_alloc(unsigned long size) { return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, - GFP_KERNEL, PAGE_KERNEL_EXEC, NUMA_NO_NODE, + GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, __builtin_return_address(0)); } #endif diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index 9b6f71db2709..67bf4107f6ef 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -35,8 +35,8 @@ void *module_alloc(unsigned long size) { return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, - GFP_KERNEL, PAGE_KERNEL_EXEC, NUMA_NO_NODE, - __builtin_return_address(0)); + GFP_KERNEL, PAGE_KERNEL_EXEC, 0, + NUMA_NO_NODE, __builtin_return_address(0)); } enum aarch64_reloc_op { diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c index 2a52568dbcd6..1833f5171ccd 100644 --- a/arch/mips/kernel/module.c +++ b/arch/mips/kernel/module.c @@ -47,7 +47,7 @@ static DEFINE_SPINLOCK(dbe_lock); void *module_alloc(unsigned long size) { return __vmalloc_node_range(size, 1, MODULE_START, MODULE_END, - GFP_KERNEL, PAGE_KERNEL, NUMA_NO_NODE, + GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE, __builtin_return_address(0)); } #endif diff --git a/arch/parisc/kernel/module.c b/arch/parisc/kernel/module.c index 5822e8e200e6..3c63a820fcda 100644 --- a/arch/parisc/kernel/module.c +++ b/arch/parisc/kernel/module.c @@ -219,7 +219,7 @@ void *module_alloc(unsigned long size) * init_data correctly */ return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, GFP_KERNEL | __GFP_HIGHMEM, - PAGE_KERNEL_RWX, NUMA_NO_NODE, + PAGE_KERNEL_RWX, 0, NUMA_NO_NODE, __builtin_return_address(0)); } diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c index 409d152585be..36154a2f1814 100644 --- a/arch/s390/kernel/module.c +++ b/arch/s390/kernel/module.c @@ -50,7 +50,7 @@ void *module_alloc(unsigned long size) if (PAGE_ALIGN(size) > MODULES_LEN) return NULL; return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, - GFP_KERNEL, PAGE_KERNEL, NUMA_NO_NODE, + GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE, __builtin_return_address(0)); } #endif diff --git a/arch/sparc/kernel/module.c b/arch/sparc/kernel/module.c index 97655e0fd243..192a617a32f3 100644 --- a/arch/sparc/kernel/module.c +++ b/arch/sparc/kernel/module.c @@ -29,7 +29,7 @@ static void *module_map(unsigned long size) if (PAGE_ALIGN(size) > MODULES_LEN) return NULL; return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, - GFP_KERNEL, PAGE_KERNEL, NUMA_NO_NODE, + GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE, __builtin_return_address(0)); } #else diff --git a/arch/unicore32/kernel/module.c b/arch/unicore32/kernel/module.c index dc41f6dfedb6..e191b3448bd3 100644 --- a/arch/unicore32/kernel/module.c +++ b/arch/unicore32/kernel/module.c @@ -25,7 +25,7 @@ void *module_alloc(unsigned long size) 
{ return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, - GFP_KERNEL, PAGE_KERNEL_EXEC, NUMA_NO_NODE, + GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, __builtin_return_address(0)); } diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index e69f9882bf95..e830e61aae05 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -88,7 +88,7 @@ void *module_alloc(unsigned long size) return __vmalloc_node_range(size, 1, MODULES_VADDR + get_module_load_offset(), MODULES_END, GFP_KERNEL | __GFP_HIGHMEM, - PAGE_KERNEL_EXEC, NUMA_NO_NODE, + PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, __builtin_return_address(0)); } diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 1526fe712ca0..7d7acb35603d 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -76,7 +76,9 @@ extern void *vmalloc_32_user(unsigned long size); extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); extern void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, - pgprot_t prot, int node, const void *caller); + pgprot_t prot, unsigned long vm_flags, int node, + const void *caller); + extern void vfree(const void *addr); extern void *vmap(struct page **pages, unsigned int count, diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 2e74e99d4cfe..35b25e1340ca 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1619,6 +1619,7 @@ fail: * @end: vm area range end * @gfp_mask: flags for the page level allocator * @prot: protection mask for the allocated pages + * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD) * @node: node to use for allocation or NUMA_NO_NODE * @caller: caller's return address * @@ -1628,7 +1629,8 @@ fail: */ void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, - pgprot_t prot, int node, const void *caller) + pgprot_t prot, unsigned long vm_flags, int node, + const void *caller) { struct vm_struct *area; void *addr; @@ -1638,8 +1640,8 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, if (!size || (size >> PAGE_SHIFT) > totalram_pages) goto fail; - area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED, - start, end, node, gfp_mask, caller); + area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED | + vm_flags, start, end, node, gfp_mask, caller); if (!area) goto fail; @@ -1688,7 +1690,7 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, int node, const void *caller) { return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, - gfp_mask, prot, node, caller); + gfp_mask, prot, 0, node, caller); } void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) -- cgit v1.2.3 From 6301939d97d079f0d3dbe71e750f4daf5d39fc33 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 13 Feb 2015 14:40:13 -0800 Subject: module: fix types of device tables aliases MODULE_DEVICE_TABLE() macro used to create aliases to device tables. Normally alias should have the same type as aliased symbol. Device tables are arrays, so they have 'struct type##_device_id[x]' types. Alias created by MODULE_DEVICE_TABLE() will have non-array type - 'struct type##_device_id'. This inconsistency confuses compiler, it could make a wrong assumption about variable's size which leads KASan to produce a false positive report about out of bounds access. 
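[Editor's note: the mismatch described above can be reproduced in isolation; a hedged GCC-only sketch with made-up names follows, showing why sizing the object from the alias's declared type goes wrong and how typeof() avoids it. It is not part of the patch.]

/* 'table' is a 3-element array, but the old-style alias is declared
 * with the scalar element type, so sizeof() of the alias's type
 * disagrees with the object it names. A tool that sizes the object
 * from the alias's type (as the globals instrumentation does) sees
 * one element instead of three. typeof() keeps the array type. */
struct id { int vendor, device; };

const struct id table[3] = { { 1, 2 }, { 3, 4 }, { 5, 6 } };

/* old style: sizeof of the alias's type == sizeof(struct id) */
extern const struct id alias_bad
	__attribute__((unused, alias("table")));

/* fixed style: sizeof of the alias's type == sizeof(table) */
extern const typeof(table) alias_good
	__attribute__((unused, alias("table")));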
For every global variable the compiler calls __asan_register_globals(), passing information about the variable (address, size, size with redzone, name ...). __asan_register_globals() poisons the symbol's redzone to detect possible out-of-bounds accesses. When a symbol has an alias, __asan_register_globals() will be called both for the symbol and for the alias. The compiler determines the size of a variable by the size of the variable's type. The alias and the symbol have the same address, so if the alias has the wrong size, part of the memory that actually belongs to the symbol could be poisoned as the redzone of the alias. By fixing the type of the alias symbol we fix its size, so __asan_register_globals() will no longer poison valid memory. Signed-off-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Konstantin Serebryany Cc: Dmitry Chernenkov Signed-off-by: Andrey Konovalov Cc: Yuri Gribov Cc: Konstantin Khlebnikov Cc: Sasha Levin Cc: Christoph Lameter Cc: Joonsoo Kim Cc: Dave Hansen Cc: Andi Kleen Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/module.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index b653d7c0a05a..42999fe2dbd0 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -135,7 +135,7 @@ void trim_init_extable(struct module *m); #ifdef MODULE /* Creates an alias so file2alias.c can find device table. */ #define MODULE_DEVICE_TABLE(type, name) \ - extern const struct type##_device_id __mod_##type##__##name##_device_table \ +extern const typeof(name) __mod_##type##__##name##_device_table \ __attribute__ ((unused, alias(__stringify(name)))) #else /* !MODULE */ #define MODULE_DEVICE_TABLE(type, name) -- cgit v1.2.3 From bebf56a1b176c2e1c9efe44e7e6915532cc682cf Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 13 Feb 2015 14:40:17 -0800 Subject: kasan: enable instrumentation of global variables This feature lets us detect out-of-bounds accesses to global variables. It works both for globals in the kernel image and for globals in modules. Currently it won't work for symbols in user-specified sections (e.g. __init, __read_mostly, ...). The idea is simple. The compiler enlarges each global variable by the redzone size and adds constructors invoking the __asan_register_globals() function. Information about each global variable (address, size, size with redzone ...) is passed to __asan_register_globals() so we can poison the variable's redzone. This patch also forces module_alloc() to return 8*PAGE_SIZE aligned addresses, making shadow memory handling ( kasan_module_alloc()/kasan_module_free() ) simpler. Such alignment guarantees that each shadow page backing the modules' address space corresponds to only one module_alloc() allocation. Signed-off-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Konstantin Serebryany Cc: Dmitry Chernenkov Signed-off-by: Andrey Konovalov Cc: Yuri Gribov Cc: Konstantin Khlebnikov Cc: Sasha Levin Cc: Christoph Lameter Cc: Joonsoo Kim Cc: Dave Hansen Cc: Andi Kleen Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H.
Peter Anvin" Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kasan.txt | 2 +- arch/x86/kernel/module.c | 12 +++++++++- arch/x86/mm/kasan_init_64.c | 2 +- include/linux/compiler-gcc4.h | 4 ++++ include/linux/compiler-gcc5.h | 2 ++ include/linux/kasan.h | 10 +++++++++ kernel/module.c | 2 ++ lib/Kconfig.kasan | 1 + mm/kasan/kasan.c | 52 +++++++++++++++++++++++++++++++++++++++++++ mm/kasan/kasan.h | 25 +++++++++++++++++++++ mm/kasan/report.c | 22 ++++++++++++++++++ scripts/Makefile.kasan | 2 +- 12 files changed, 132 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/Documentation/kasan.txt b/Documentation/kasan.txt index f0645a8a992f..092fc10961fe 100644 --- a/Documentation/kasan.txt +++ b/Documentation/kasan.txt @@ -9,7 +9,7 @@ a fast and comprehensive solution for finding use-after-free and out-of-bounds bugs. KASan uses compile-time instrumentation for checking every memory access, -therefore you will need a certain version of GCC >= 4.9.2 +therefore you will need a certain version of GCC > 4.9.2 Currently KASan is supported only for x86_64 architecture and requires that the kernel be built with the SLUB allocator. diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index e830e61aae05..d1ac80b72c72 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -83,13 +84,22 @@ static unsigned long int get_module_load_offset(void) void *module_alloc(unsigned long size) { + void *p; + if (PAGE_ALIGN(size) > MODULES_LEN) return NULL; - return __vmalloc_node_range(size, 1, + + p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR + get_module_load_offset(), MODULES_END, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, __builtin_return_address(0)); + if (p && (kasan_module_alloc(p, size) < 0)) { + vfree(p); + return NULL; + } + + return p; } #ifdef CONFIG_X86_32 diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 53508708b7aa..4860906c6b9f 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -196,7 +196,7 @@ void __init kasan_init(void) (unsigned long)kasan_mem_to_shadow(_end), NUMA_NO_NODE); - populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_VADDR), + populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), (void *)KASAN_SHADOW_END); memset(kasan_zero_page, 0, PAGE_SIZE); diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h index d1a558239b1a..769e19864632 100644 --- a/include/linux/compiler-gcc4.h +++ b/include/linux/compiler-gcc4.h @@ -85,3 +85,7 @@ #define __HAVE_BUILTIN_BSWAP16__ #endif #endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */ + +#if GCC_VERSION >= 40902 +#define KASAN_ABI_VERSION 3 +#endif diff --git a/include/linux/compiler-gcc5.h b/include/linux/compiler-gcc5.h index c8c565952548..efee493714eb 100644 --- a/include/linux/compiler-gcc5.h +++ b/include/linux/compiler-gcc5.h @@ -63,3 +63,5 @@ #define __HAVE_BUILTIN_BSWAP64__ #define __HAVE_BUILTIN_BSWAP16__ #endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */ + +#define KASAN_ABI_VERSION 4 diff --git a/include/linux/kasan.h b/include/linux/kasan.h index d5310eef3e38..72ba725ddf9c 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -49,8 +49,15 @@ void kasan_krealloc(const void *object, size_t new_size); void kasan_slab_alloc(struct kmem_cache *s, void *object); void kasan_slab_free(struct kmem_cache *s, void *object); 
+#define MODULE_ALIGN (PAGE_SIZE << KASAN_SHADOW_SCALE_SHIFT) + +int kasan_module_alloc(void *addr, size_t size); +void kasan_module_free(void *addr); + #else /* CONFIG_KASAN */ +#define MODULE_ALIGN 1 + static inline void kasan_unpoison_shadow(const void *address, size_t size) {} static inline void kasan_enable_current(void) {} @@ -74,6 +81,9 @@ static inline void kasan_krealloc(const void *object, size_t new_size) {} static inline void kasan_slab_alloc(struct kmem_cache *s, void *object) {} static inline void kasan_slab_free(struct kmem_cache *s, void *object) {} +static inline int kasan_module_alloc(void *addr, size_t size) { return 0; } +static inline void kasan_module_free(void *addr) {} + #endif /* CONFIG_KASAN */ #endif /* LINUX_KASAN_H */ diff --git a/kernel/module.c b/kernel/module.c index 82dc1f899e6d..8426ad48362c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include #include @@ -1813,6 +1814,7 @@ static void unset_module_init_ro_nx(struct module *mod) { } void __weak module_memfree(void *module_region) { vfree(module_region); + kasan_module_free(module_region); } void __weak module_arch_cleanup(struct module *mod) diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index 4d47d874335c..4fecaedc80a2 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -6,6 +6,7 @@ if HAVE_ARCH_KASAN config KASAN bool "KASan: runtime memory debugger" depends on SLUB_DEBUG + select CONSTRUCTORS help Enables kernel address sanitizer - runtime memory debugger, designed to find out-of-bounds accesses and use-after-free bugs. diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 799c52b9826c..78fee632a7ee 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -395,6 +396,57 @@ void kasan_kfree_large(const void *ptr) KASAN_FREE_PAGE); } +int kasan_module_alloc(void *addr, size_t size) +{ + void *ret; + size_t shadow_size; + unsigned long shadow_start; + + shadow_start = (unsigned long)kasan_mem_to_shadow(addr); + shadow_size = round_up(size >> KASAN_SHADOW_SCALE_SHIFT, + PAGE_SIZE); + + if (WARN_ON(!PAGE_ALIGNED(shadow_start))) + return -EINVAL; + + ret = __vmalloc_node_range(shadow_size, 1, shadow_start, + shadow_start + shadow_size, + GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, + PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE, + __builtin_return_address(0)); + return ret ? 
0 : -ENOMEM; +} + +void kasan_module_free(void *addr) +{ + vfree(kasan_mem_to_shadow(addr)); +} + +static void register_global(struct kasan_global *global) +{ + size_t aligned_size = round_up(global->size, KASAN_SHADOW_SCALE_SIZE); + + kasan_unpoison_shadow(global->beg, global->size); + + kasan_poison_shadow(global->beg + aligned_size, + global->size_with_redzone - aligned_size, + KASAN_GLOBAL_REDZONE); +} + +void __asan_register_globals(struct kasan_global *globals, size_t size) +{ + int i; + + for (i = 0; i < size; i++) + register_global(&globals[i]); +} +EXPORT_SYMBOL(__asan_register_globals); + +void __asan_unregister_globals(struct kasan_global *globals, size_t size) +{ +} +EXPORT_SYMBOL(__asan_unregister_globals); + #define DEFINE_ASAN_LOAD_STORE(size) \ void __asan_load##size(unsigned long addr) \ { \ diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 1fcc1d81a9cf..4986b0acab21 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -11,6 +11,7 @@ #define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */ #define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */ #define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */ +#define KASAN_GLOBAL_REDZONE 0xFA /* redzone for global variable */ /* * Stack redzone shadow values @@ -21,6 +22,10 @@ #define KASAN_STACK_RIGHT 0xF3 #define KASAN_STACK_PARTIAL 0xF4 +/* Don't break randconfig/all*config builds */ +#ifndef KASAN_ABI_VERSION +#define KASAN_ABI_VERSION 1 +#endif struct kasan_access_info { const void *access_addr; @@ -30,6 +35,26 @@ struct kasan_access_info { unsigned long ip; }; +/* The layout of struct dictated by compiler */ +struct kasan_source_location { + const char *filename; + int line_no; + int column_no; +}; + +/* The layout of struct dictated by compiler */ +struct kasan_global { + const void *beg; /* Address of the beginning of the global variable. */ + size_t size; /* Size of the global variable. */ + size_t size_with_redzone; /* Size of the variable + size of the red zone. 32 bytes aligned */ + const void *name; + const void *module_name; /* Name of the module where the global variable is declared. */ + unsigned long has_dynamic_init; /* This needed for C++ */ +#if KASAN_ABI_VERSION >= 4 + struct kasan_source_location *location; +#endif +}; + void kasan_report_error(struct kasan_access_info *info); void kasan_report_user_access(struct kasan_access_info *info); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 866732ef3db3..680ceedf810a 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -23,6 +23,8 @@ #include #include +#include + #include "kasan.h" #include "../slab.h" @@ -61,6 +63,7 @@ static void print_error_description(struct kasan_access_info *info) break; case KASAN_PAGE_REDZONE: case KASAN_KMALLOC_REDZONE: + case KASAN_GLOBAL_REDZONE: case 0 ... 
KASAN_SHADOW_SCALE_SIZE - 1: bug_type = "out of bounds access"; break; @@ -80,6 +83,20 @@ static void print_error_description(struct kasan_access_info *info) info->access_size, current->comm, task_pid_nr(current)); } +static inline bool kernel_or_module_addr(const void *addr) +{ + return (addr >= (void *)_stext && addr < (void *)_end) + || (addr >= (void *)MODULES_VADDR + && addr < (void *)MODULES_END); +} + +static inline bool init_task_stack_addr(const void *addr) +{ + return addr >= (void *)&init_thread_union.stack && + (addr <= (void *)&init_thread_union.stack + + sizeof(init_thread_union.stack)); +} + static void print_address_description(struct kasan_access_info *info) { const void *addr = info->access_addr; @@ -107,6 +124,11 @@ static void print_address_description(struct kasan_access_info *info) dump_page(page, "kasan: bad access detected"); } + if (kernel_or_module_addr(addr)) { + if (!init_task_stack_addr(addr)) + pr_err("Address belongs to variable %pS\n", addr); + } + dump_stack(); } diff --git a/scripts/Makefile.kasan b/scripts/Makefile.kasan index 2163b8cc446e..631619b2b118 100644 --- a/scripts/Makefile.kasan +++ b/scripts/Makefile.kasan @@ -9,7 +9,7 @@ CFLAGS_KASAN_MINIMAL := -fsanitize=kernel-address CFLAGS_KASAN := $(call cc-option, -fsanitize=kernel-address \ -fasan-shadow-offset=$(CONFIG_KASAN_SHADOW_OFFSET) \ - --param asan-stack=1 \ + --param asan-stack=1 --param asan-globals=1 \ --param asan-instrumentation-with-call-threshold=$(call_threshold)) ifeq ($(call cc-option, $(CFLAGS_KASAN_MINIMAL) -Werror),) -- cgit v1.2.3 From d347efeb16d3d5150cb7f8d50b05f388b572840e Mon Sep 17 00:00:00 2001 From: Adrien Schildknecht Date: Thu, 12 Feb 2015 14:01:37 +0100 Subject: mutex: remove unused field "name" in debug mode This field is unused and uninitialized since commit 9a11b49a8056 ("[PATCH] lockdep: better lock debugging") Signed-off-by: Adrien Schildknecht Signed-off-by: Linus Torvalds --- include/linux/mutex.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index cc31498fc526..2cb7531e7d7a 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -59,7 +59,6 @@ struct mutex { struct optimistic_spin_queue osq; /* Spinner MCS lock */ #endif #ifdef CONFIG_DEBUG_MUTEXES - const char *name; void *magic; #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC -- cgit v1.2.3 From 124cf9117c5f93cc5b324530b7e105b09c729d5d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 13 Feb 2015 23:50:43 +0100 Subject: PM / sleep: Make it possible to quiesce timers during suspend-to-idle The efficiency of suspend-to-idle depends on being able to keep CPUs in the deepest available idle states for as much time as possible. Ideally, they should only be brought out of idle by system wakeup interrupts. However, timer interrupts occurring periodically prevent that from happening and it is not practical to chase all of the "misbehaving" timers in a whack-a-mole fashion. A much more effective approach is to suspend the local ticks for all CPUs and the entire timekeeping along the lines of what is done during full suspend, which also helps to keep suspend-to-idle and full suspend reasonably similar. The idea is to suspend the local tick on each CPU executing cpuidle_enter_freeze() and to make the last of them suspend the entire timekeeping. That should prevent timer interrupts from triggering until an IO interrupt wakes up one of the CPUs. 
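[Editor's note: the "last CPU in suspends timekeeping, first CPU out resumes it" rule described above reduces to a counting pattern; a minimal user-space sketch with illustrative names (not kernel API) follows. The real tick_freeze()/tick_unfreeze() below do this under tick_freeze_lock with interrupts disabled.]

#include <stdio.h>

static int freeze_depth;	/* stands in for tick_freeze_depth */

static void freeze(int num_online)
{
	if (++freeze_depth == num_online)
		printf("last CPU in: suspend timekeeping\n");
	else
		printf("suspend local tick (depth %d)\n", freeze_depth);
}

static void unfreeze(int num_online)
{
	/* compares the pre-decrement value, as tick_unfreeze() does */
	if (freeze_depth-- == num_online)
		printf("first CPU out: resume timekeeping\n");
	else
		printf("resume local tick (depth %d)\n", freeze_depth);
}

int main(void)
{
	freeze(2);	/* CPU 0 enters: local tick suspended */
	freeze(2);	/* CPU 1 enters last: timekeeping suspended */
	unfreeze(2);	/* CPU 1 leaves first: timekeeping resumed */
	unfreeze(2);	/* CPU 0 leaves: local tick resumed */
	return 0;
}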
It needs to be done with interrupts disabled on all of the CPUs, though, because otherwise the suspended clocksource might be accessed by an interrupt handler which might lead to fatal consequences. Unfortunately, the existing ->enter callbacks provided by cpuidle drivers generally cannot be used for implementing that, because some of them re-enable interrupts temporarily and some idle entry methods cause interrupts to be re-enabled automatically on exit. Also some of these callbacks manipulate local clock event devices of the CPUs which really shouldn't be done after suspending their ticks. To overcome that difficulty, introduce a new cpuidle state callback, ->enter_freeze, that will be guaranteed (1) to keep interrupts disabled all the time (and return with interrupts disabled) and (2) not to touch the CPU timer devices. Modify cpuidle_enter_freeze() to look for the deepest available idle state with ->enter_freeze present and to make the CPU execute that callback with suspended tick (and the last of the online CPUs to execute it with suspended timekeeping). Suggested-by: Thomas Gleixner Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) --- drivers/cpuidle/cpuidle.c | 49 +++++++++++++++++++++++++++++++++++++++++----- include/linux/cpuidle.h | 9 +++++++++ include/linux/tick.h | 6 +++++- kernel/time/tick-common.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++ kernel/time/timekeeping.c | 4 ++-- kernel/time/timekeeping.h | 2 ++ 6 files changed, 112 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 23a8d6cc8d30..4d534582514e 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include "cpuidle.h" @@ -69,18 +70,20 @@ int cpuidle_play_dead(void) * cpuidle_find_deepest_state - Find deepest state meeting specific conditions. * @drv: cpuidle driver for the given CPU. * @dev: cpuidle device for the given CPU. + * @freeze: Whether or not the state should be suitable for suspend-to-idle. */ static int cpuidle_find_deepest_state(struct cpuidle_driver *drv, - struct cpuidle_device *dev) + struct cpuidle_device *dev, bool freeze) { unsigned int latency_req = 0; - int i, ret = CPUIDLE_DRIVER_STATE_START - 1; + int i, ret = freeze ? -1 : CPUIDLE_DRIVER_STATE_START - 1; for (i = CPUIDLE_DRIVER_STATE_START; i < drv->state_count; i++) { struct cpuidle_state *s = &drv->states[i]; struct cpuidle_state_usage *su = &dev->states_usage[i]; - if (s->disabled || su->disable || s->exit_latency <= latency_req) + if (s->disabled || su->disable || s->exit_latency <= latency_req + || (freeze && !s->enter_freeze)) continue; latency_req = s->exit_latency; @@ -89,10 +92,31 @@ static int cpuidle_find_deepest_state(struct cpuidle_driver *drv, return ret; } +static void enter_freeze_proper(struct cpuidle_driver *drv, + struct cpuidle_device *dev, int index) +{ + tick_freeze(); + /* + * The state used here cannot be a "coupled" one, because the "coupled" + * cpuidle mechanism enables interrupts and doing that with timekeeping + * suspended is generally unsafe. + */ + drv->states[index].enter_freeze(dev, drv, index); + WARN_ON(!irqs_disabled()); + /* + * timekeeping_resume() that will be called by tick_unfreeze() for the + * last CPU executing it calls functions containing RCU read-side + * critical sections, so tell RCU about that. 
+ */ + RCU_NONIDLE(tick_unfreeze()); +} + /** * cpuidle_enter_freeze - Enter an idle state suitable for suspend-to-idle. * - * Find the deepest state available and enter it. + * If there are states with the ->enter_freeze callback, find the deepest of + * them and enter it with frozen tick. Otherwise, find the deepest state + * available and enter it normally. */ void cpuidle_enter_freeze(void) { @@ -100,7 +124,22 @@ void cpuidle_enter_freeze(void) struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); int index; - index = cpuidle_find_deepest_state(drv, dev); + /* + * Find the deepest state with ->enter_freeze present, which guarantees + * that interrupts won't be enabled when it exits and allows the tick to + * be frozen safely. + */ + index = cpuidle_find_deepest_state(drv, dev, true); + if (index >= 0) { + enter_freeze_proper(drv, dev, index); + return; + } + + /* + * It is not safe to freeze the tick, find the deepest state available + * at all and try to enter it normally. + */ + index = cpuidle_find_deepest_state(drv, dev, false); if (index >= 0) cpuidle_enter(drv, dev, index); else diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index f63aabf4ee90..f551a9299ac9 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -50,6 +50,15 @@ struct cpuidle_state { int index); int (*enter_dead) (struct cpuidle_device *dev, int index); + + /* + * CPUs execute ->enter_freeze with the local tick or entire timekeeping + * suspended, so it must not re-enable interrupts at any point (even + * temporarily) or attempt to change states of clock event devices. + */ + void (*enter_freeze) (struct cpuidle_device *dev, + struct cpuidle_driver *drv, + int index); }; /* Idle State Flags */ diff --git a/include/linux/tick.h b/include/linux/tick.h index eda850ca757a..9c085dc12ae9 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -79,6 +79,9 @@ extern void __init tick_init(void); extern int tick_is_oneshot_available(void); extern struct tick_device *tick_get_device(int cpu); +extern void tick_freeze(void); +extern void tick_unfreeze(void); + # ifdef CONFIG_HIGH_RES_TIMERS extern int tick_init_highres(void); extern int tick_program_event(ktime_t expires, int force); @@ -119,6 +122,8 @@ static inline int tick_oneshot_mode_active(void) { return 0; } #else /* CONFIG_GENERIC_CLOCKEVENTS */ static inline void tick_init(void) { } +static inline void tick_freeze(void) { } +static inline void tick_unfreeze(void) { } static inline void tick_cancel_sched_timer(int cpu) { } static inline void tick_clock_notify(void) { } static inline int tick_check_oneshot_change(int allow_nohz) { return 0; } @@ -226,5 +231,4 @@ static inline void tick_nohz_task_switch(struct task_struct *tsk) __tick_nohz_task_switch(tsk); } - #endif diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 7efeedf53ebd..f7c515595b42 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -394,6 +394,56 @@ void tick_resume(void) } } +static DEFINE_RAW_SPINLOCK(tick_freeze_lock); +static unsigned int tick_freeze_depth; + +/** + * tick_freeze - Suspend the local tick and (possibly) timekeeping. + * + * Check if this is the last online CPU executing the function and if so, + * suspend timekeeping. Otherwise suspend the local tick. + * + * Call with interrupts disabled. Must be balanced with %tick_unfreeze(). + * Interrupts must not be enabled before the subsequent %tick_unfreeze(). 
+ */ +void tick_freeze(void) +{ + raw_spin_lock(&tick_freeze_lock); + + tick_freeze_depth++; + if (tick_freeze_depth == num_online_cpus()) { + timekeeping_suspend(); + } else { + tick_suspend(); + tick_suspend_broadcast(); + } + + raw_spin_unlock(&tick_freeze_lock); +} + +/** + * tick_unfreeze - Resume the local tick and (possibly) timekeeping. + * + * Check if this is the first CPU executing the function and if so, resume + * timekeeping. Otherwise resume the local tick. + * + * Call with interrupts disabled. Must be balanced with %tick_freeze(). + * Interrupts must not be enabled after the preceding %tick_freeze(). + */ +void tick_unfreeze(void) +{ + raw_spin_lock(&tick_freeze_lock); + + if (tick_freeze_depth == num_online_cpus()) + timekeeping_resume(); + else + tick_resume(); + + tick_freeze_depth--; + + raw_spin_unlock(&tick_freeze_lock); +} + /** * tick_init - initialize the tick control */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index aef5dc722abf..91db94136c10 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1197,7 +1197,7 @@ void timekeeping_inject_sleeptime64(struct timespec64 *delta) * xtime/wall_to_monotonic/jiffies/etc are * still managed by arch specific suspend/resume code. */ -static void timekeeping_resume(void) +void timekeeping_resume(void) { struct timekeeper *tk = &tk_core.timekeeper; struct clocksource *clock = tk->tkr.clock; @@ -1278,7 +1278,7 @@ static void timekeeping_resume(void) hrtimers_resume(); } -static int timekeeping_suspend(void) +int timekeeping_suspend(void) { struct timekeeper *tk = &tk_core.timekeeper; unsigned long flags; diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index adc1fc98bde3..1d91416055d5 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -16,5 +16,7 @@ extern int timekeeping_inject_offset(struct timespec *ts); extern s32 timekeeping_get_tai_offset(void); extern void timekeeping_set_tai_offset(s32 tai_offset); extern void timekeeping_clocktai(struct timespec *ts); +extern int timekeeping_suspend(void); +extern void timekeeping_resume(void); #endif -- cgit v1.2.3 From 1ea74014aba7cd3ddcc3cf3eef92270d2e8429e8 Mon Sep 17 00:00:00 2001 From: Sonic Zhang Date: Tue, 3 Feb 2015 17:12:17 -0800 Subject: Input: bfin_rotary - move platform header to linux/platform_data The platform data definition of the rotary driver should be generic for all architectures. Signed-off-by: Sonic Zhang Signed-off-by: Dmitry Torokhov --- arch/blackfin/include/asm/bfin_rotary.h | 116 --------------------------- arch/blackfin/mach-bf527/boards/ad7160eval.c | 2 +- arch/blackfin/mach-bf527/boards/ezkit.c | 2 +- arch/blackfin/mach-bf548/boards/ezkit.c | 2 +- arch/blackfin/mach-bf609/boards/ezkit.c | 2 +- drivers/input/misc/bfin_rotary.c | 2 +- include/linux/platform_data/bfin_rotary.h | 116 +++++++++++++++++++++++++++ 7 files changed, 121 insertions(+), 121 deletions(-) delete mode 100644 arch/blackfin/include/asm/bfin_rotary.h create mode 100644 include/linux/platform_data/bfin_rotary.h (limited to 'include/linux') diff --git a/arch/blackfin/include/asm/bfin_rotary.h b/arch/blackfin/include/asm/bfin_rotary.h deleted file mode 100644 index 8895a750c70c..000000000000 --- a/arch/blackfin/include/asm/bfin_rotary.h +++ /dev/null @@ -1,116 +0,0 @@ -/* - * board initialization should put one of these structures into platform_data - * and place the bfin-rotary onto platform_bus named "bfin-rotary". - * - * Copyright 2008-2010 Analog Devices Inc. 
- * - * Licensed under the GPL-2 or later. - */ - -#ifndef _BFIN_ROTARY_H -#define _BFIN_ROTARY_H - -/* mode bitmasks */ -#define ROT_QUAD_ENC CNTMODE_QUADENC /* quadrature/grey code encoder mode */ -#define ROT_BIN_ENC CNTMODE_BINENC /* binary encoder mode */ -#define ROT_UD_CNT CNTMODE_UDCNT /* rotary counter mode */ -#define ROT_DIR_CNT CNTMODE_DIRCNT /* direction counter mode */ - -#define ROT_DEBE DEBE /* Debounce Enable */ - -#define ROT_CDGINV CDGINV /* CDG Pin Polarity Invert */ -#define ROT_CUDINV CUDINV /* CUD Pin Polarity Invert */ -#define ROT_CZMINV CZMINV /* CZM Pin Polarity Invert */ - -struct bfin_rotary_platform_data { - /* set rotary UP KEY_### or BTN_### in case you prefer - * bfin-rotary to send EV_KEY otherwise set 0 - */ - unsigned int rotary_up_key; - /* set rotary DOWN KEY_### or BTN_### in case you prefer - * bfin-rotary to send EV_KEY otherwise set 0 - */ - unsigned int rotary_down_key; - /* set rotary BUTTON KEY_### or BTN_### */ - unsigned int rotary_button_key; - /* set rotary Relative Axis REL_### in case you prefer - * bfin-rotary to send EV_REL otherwise set 0 - */ - unsigned int rotary_rel_code; - unsigned short debounce; /* 0..17 */ - unsigned short mode; - unsigned short pm_wakeup; -}; - -/* CNT_CONFIG bitmasks */ -#define CNTE (1 << 0) /* Counter Enable */ -#define DEBE (1 << 1) /* Debounce Enable */ -#define CDGINV (1 << 4) /* CDG Pin Polarity Invert */ -#define CUDINV (1 << 5) /* CUD Pin Polarity Invert */ -#define CZMINV (1 << 6) /* CZM Pin Polarity Invert */ -#define CNTMODE_SHIFT 8 -#define CNTMODE (0x7 << CNTMODE_SHIFT) /* Counter Operating Mode */ -#define ZMZC (1 << 1) /* CZM Zeroes Counter Enable */ -#define BNDMODE_SHIFT 12 -#define BNDMODE (0x3 << BNDMODE_SHIFT) /* Boundary register Mode */ -#define INPDIS (1 << 15) /* CUG and CDG Input Disable */ - -#define CNTMODE_QUADENC (0 << CNTMODE_SHIFT) /* quadrature encoder mode */ -#define CNTMODE_BINENC (1 << CNTMODE_SHIFT) /* binary encoder mode */ -#define CNTMODE_UDCNT (2 << CNTMODE_SHIFT) /* up/down counter mode */ -#define CNTMODE_DIRCNT (4 << CNTMODE_SHIFT) /* direction counter mode */ -#define CNTMODE_DIRTMR (5 << CNTMODE_SHIFT) /* direction timer mode */ - -#define BNDMODE_COMP (0 << BNDMODE_SHIFT) /* boundary compare mode */ -#define BNDMODE_ZERO (1 << BNDMODE_SHIFT) /* boundary compare and zero mode */ -#define BNDMODE_CAPT (2 << BNDMODE_SHIFT) /* boundary capture mode */ -#define BNDMODE_AEXT (3 << BNDMODE_SHIFT) /* boundary auto-extend mode */ - -/* CNT_IMASK bitmasks */ -#define ICIE (1 << 0) /* Illegal Gray/Binary Code Interrupt Enable */ -#define UCIE (1 << 1) /* Up count Interrupt Enable */ -#define DCIE (1 << 2) /* Down count Interrupt Enable */ -#define MINCIE (1 << 3) /* Min Count Interrupt Enable */ -#define MAXCIE (1 << 4) /* Max Count Interrupt Enable */ -#define COV31IE (1 << 5) /* Bit 31 Overflow Interrupt Enable */ -#define COV15IE (1 << 6) /* Bit 15 Overflow Interrupt Enable */ -#define CZEROIE (1 << 7) /* Count to Zero Interrupt Enable */ -#define CZMIE (1 << 8) /* CZM Pin Interrupt Enable */ -#define CZMEIE (1 << 9) /* CZM Error Interrupt Enable */ -#define CZMZIE (1 << 10) /* CZM Zeroes Counter Interrupt Enable */ - -/* CNT_STATUS bitmasks */ -#define ICII (1 << 0) /* Illegal Gray/Binary Code Interrupt Identifier */ -#define UCII (1 << 1) /* Up count Interrupt Identifier */ -#define DCII (1 << 2) /* Down count Interrupt Identifier */ -#define MINCII (1 << 3) /* Min Count Interrupt Identifier */ -#define MAXCII (1 << 4) /* Max Count Interrupt Identifier */ -#define 
COV31II (1 << 5) /* Bit 31 Overflow Interrupt Identifier */ -#define COV15II (1 << 6) /* Bit 15 Overflow Interrupt Identifier */ -#define CZEROII (1 << 7) /* Count to Zero Interrupt Identifier */ -#define CZMII (1 << 8) /* CZM Pin Interrupt Identifier */ -#define CZMEII (1 << 9) /* CZM Error Interrupt Identifier */ -#define CZMZII (1 << 10) /* CZM Zeroes Counter Interrupt Identifier */ - -/* CNT_COMMAND bitmasks */ -#define W1LCNT 0xf /* Load Counter Register */ -#define W1LMIN 0xf0 /* Load Min Register */ -#define W1LMAX 0xf00 /* Load Max Register */ -#define W1ZMONCE (1 << 12) /* Enable CZM Clear Counter Once */ - -#define W1LCNT_ZERO (1 << 0) /* write 1 to load CNT_COUNTER with zero */ -#define W1LCNT_MIN (1 << 2) /* write 1 to load CNT_COUNTER from CNT_MIN */ -#define W1LCNT_MAX (1 << 3) /* write 1 to load CNT_COUNTER from CNT_MAX */ - -#define W1LMIN_ZERO (1 << 4) /* write 1 to load CNT_MIN with zero */ -#define W1LMIN_CNT (1 << 5) /* write 1 to load CNT_MIN from CNT_COUNTER */ -#define W1LMIN_MAX (1 << 7) /* write 1 to load CNT_MIN from CNT_MAX */ - -#define W1LMAX_ZERO (1 << 8) /* write 1 to load CNT_MAX with zero */ -#define W1LMAX_CNT (1 << 9) /* write 1 to load CNT_MAX from CNT_COUNTER */ -#define W1LMAX_MIN (1 << 10) /* write 1 to load CNT_MAX from CNT_MIN */ - -/* CNT_DEBOUNCE bitmasks */ -#define DPRESCALE 0xf /* Load Counter Register */ - -#endif diff --git a/arch/blackfin/mach-bf527/boards/ad7160eval.c b/arch/blackfin/mach-bf527/boards/ad7160eval.c index 9501bd8d9cd1..beb011b6d2b3 100644 --- a/arch/blackfin/mach-bf527/boards/ad7160eval.c +++ b/arch/blackfin/mach-bf527/boards/ad7160eval.c @@ -666,7 +666,7 @@ static struct platform_device bfin_sport1_uart_device = { #endif #if IS_ENABLED(CONFIG_INPUT_BFIN_ROTARY) -#include +#include static struct bfin_rotary_platform_data bfin_rotary_data = { /*.rotary_up_key = KEY_UP,*/ diff --git a/arch/blackfin/mach-bf527/boards/ezkit.c b/arch/blackfin/mach-bf527/boards/ezkit.c index d64f565dc2a0..728cda469952 100644 --- a/arch/blackfin/mach-bf527/boards/ezkit.c +++ b/arch/blackfin/mach-bf527/boards/ezkit.c @@ -1092,7 +1092,7 @@ static struct platform_device bfin_device_gpiokeys = { #endif #if IS_ENABLED(CONFIG_INPUT_BFIN_ROTARY) -#include +#include static struct bfin_rotary_platform_data bfin_rotary_data = { /*.rotary_up_key = KEY_UP,*/ diff --git a/arch/blackfin/mach-bf548/boards/ezkit.c b/arch/blackfin/mach-bf548/boards/ezkit.c index 1fe7ff286619..8f70f83d0a42 100644 --- a/arch/blackfin/mach-bf548/boards/ezkit.c +++ b/arch/blackfin/mach-bf548/boards/ezkit.c @@ -159,7 +159,7 @@ static struct platform_device bf54x_kpad_device = { #endif #if IS_ENABLED(CONFIG_INPUT_BFIN_ROTARY) -#include +#include static struct bfin_rotary_platform_data bfin_rotary_data = { /*.rotary_up_key = KEY_UP,*/ diff --git a/arch/blackfin/mach-bf609/boards/ezkit.c b/arch/blackfin/mach-bf609/boards/ezkit.c index e2c0b024ce88..f9dc64d00d0b 100644 --- a/arch/blackfin/mach-bf609/boards/ezkit.c +++ b/arch/blackfin/mach-bf609/boards/ezkit.c @@ -75,7 +75,7 @@ static struct platform_device bfin_isp1760_device = { #endif #if IS_ENABLED(CONFIG_INPUT_BFIN_ROTARY) -#include +#include static struct bfin_rotary_platform_data bfin_rotary_data = { /*.rotary_up_key = KEY_UP,*/ diff --git a/drivers/input/misc/bfin_rotary.c b/drivers/input/misc/bfin_rotary.c index 994f7b0119fe..35ead69521a0 100644 --- a/drivers/input/misc/bfin_rotary.c +++ b/drivers/input/misc/bfin_rotary.c @@ -12,9 +12,9 @@ #include #include #include +#include #include -#include static const u16 per_cnt[] = { 
P_CNT_CUD, diff --git a/include/linux/platform_data/bfin_rotary.h b/include/linux/platform_data/bfin_rotary.h new file mode 100644 index 000000000000..8895a750c70c --- /dev/null +++ b/include/linux/platform_data/bfin_rotary.h @@ -0,0 +1,116 @@ +/* + * board initialization should put one of these structures into platform_data + * and place the bfin-rotary onto platform_bus named "bfin-rotary". + * + * Copyright 2008-2010 Analog Devices Inc. + * + * Licensed under the GPL-2 or later. + */ + +#ifndef _BFIN_ROTARY_H +#define _BFIN_ROTARY_H + +/* mode bitmasks */ +#define ROT_QUAD_ENC CNTMODE_QUADENC /* quadrature/grey code encoder mode */ +#define ROT_BIN_ENC CNTMODE_BINENC /* binary encoder mode */ +#define ROT_UD_CNT CNTMODE_UDCNT /* rotary counter mode */ +#define ROT_DIR_CNT CNTMODE_DIRCNT /* direction counter mode */ + +#define ROT_DEBE DEBE /* Debounce Enable */ + +#define ROT_CDGINV CDGINV /* CDG Pin Polarity Invert */ +#define ROT_CUDINV CUDINV /* CUD Pin Polarity Invert */ +#define ROT_CZMINV CZMINV /* CZM Pin Polarity Invert */ + +struct bfin_rotary_platform_data { + /* set rotary UP KEY_### or BTN_### in case you prefer + * bfin-rotary to send EV_KEY otherwise set 0 + */ + unsigned int rotary_up_key; + /* set rotary DOWN KEY_### or BTN_### in case you prefer + * bfin-rotary to send EV_KEY otherwise set 0 + */ + unsigned int rotary_down_key; + /* set rotary BUTTON KEY_### or BTN_### */ + unsigned int rotary_button_key; + /* set rotary Relative Axis REL_### in case you prefer + * bfin-rotary to send EV_REL otherwise set 0 + */ + unsigned int rotary_rel_code; + unsigned short debounce; /* 0..17 */ + unsigned short mode; + unsigned short pm_wakeup; +}; + +/* CNT_CONFIG bitmasks */ +#define CNTE (1 << 0) /* Counter Enable */ +#define DEBE (1 << 1) /* Debounce Enable */ +#define CDGINV (1 << 4) /* CDG Pin Polarity Invert */ +#define CUDINV (1 << 5) /* CUD Pin Polarity Invert */ +#define CZMINV (1 << 6) /* CZM Pin Polarity Invert */ +#define CNTMODE_SHIFT 8 +#define CNTMODE (0x7 << CNTMODE_SHIFT) /* Counter Operating Mode */ +#define ZMZC (1 << 1) /* CZM Zeroes Counter Enable */ +#define BNDMODE_SHIFT 12 +#define BNDMODE (0x3 << BNDMODE_SHIFT) /* Boundary register Mode */ +#define INPDIS (1 << 15) /* CUG and CDG Input Disable */ + +#define CNTMODE_QUADENC (0 << CNTMODE_SHIFT) /* quadrature encoder mode */ +#define CNTMODE_BINENC (1 << CNTMODE_SHIFT) /* binary encoder mode */ +#define CNTMODE_UDCNT (2 << CNTMODE_SHIFT) /* up/down counter mode */ +#define CNTMODE_DIRCNT (4 << CNTMODE_SHIFT) /* direction counter mode */ +#define CNTMODE_DIRTMR (5 << CNTMODE_SHIFT) /* direction timer mode */ + +#define BNDMODE_COMP (0 << BNDMODE_SHIFT) /* boundary compare mode */ +#define BNDMODE_ZERO (1 << BNDMODE_SHIFT) /* boundary compare and zero mode */ +#define BNDMODE_CAPT (2 << BNDMODE_SHIFT) /* boundary capture mode */ +#define BNDMODE_AEXT (3 << BNDMODE_SHIFT) /* boundary auto-extend mode */ + +/* CNT_IMASK bitmasks */ +#define ICIE (1 << 0) /* Illegal Gray/Binary Code Interrupt Enable */ +#define UCIE (1 << 1) /* Up count Interrupt Enable */ +#define DCIE (1 << 2) /* Down count Interrupt Enable */ +#define MINCIE (1 << 3) /* Min Count Interrupt Enable */ +#define MAXCIE (1 << 4) /* Max Count Interrupt Enable */ +#define COV31IE (1 << 5) /* Bit 31 Overflow Interrupt Enable */ +#define COV15IE (1 << 6) /* Bit 15 Overflow Interrupt Enable */ +#define CZEROIE (1 << 7) /* Count to Zero Interrupt Enable */ +#define CZMIE (1 << 8) /* CZM Pin Interrupt Enable */ +#define CZMEIE (1 << 9) /* CZM Error 
Interrupt Enable */
+#define CZMZIE (1 << 10) /* CZM Zeroes Counter Interrupt Enable */
+
+/* CNT_STATUS bitmasks */
+#define ICII (1 << 0) /* Illegal Gray/Binary Code Interrupt Identifier */
+#define UCII (1 << 1) /* Up count Interrupt Identifier */
+#define DCII (1 << 2) /* Down count Interrupt Identifier */
+#define MINCII (1 << 3) /* Min Count Interrupt Identifier */
+#define MAXCII (1 << 4) /* Max Count Interrupt Identifier */
+#define COV31II (1 << 5) /* Bit 31 Overflow Interrupt Identifier */
+#define COV15II (1 << 6) /* Bit 15 Overflow Interrupt Identifier */
+#define CZEROII (1 << 7) /* Count to Zero Interrupt Identifier */
+#define CZMII (1 << 8) /* CZM Pin Interrupt Identifier */
+#define CZMEII (1 << 9) /* CZM Error Interrupt Identifier */
+#define CZMZII (1 << 10) /* CZM Zeroes Counter Interrupt Identifier */
+
+/* CNT_COMMAND bitmasks */
+#define W1LCNT 0xf /* Load Counter Register */
+#define W1LMIN 0xf0 /* Load Min Register */
+#define W1LMAX 0xf00 /* Load Max Register */
+#define W1ZMONCE (1 << 12) /* Enable CZM Clear Counter Once */
+
+#define W1LCNT_ZERO (1 << 0) /* write 1 to load CNT_COUNTER with zero */
+#define W1LCNT_MIN (1 << 2) /* write 1 to load CNT_COUNTER from CNT_MIN */
+#define W1LCNT_MAX (1 << 3) /* write 1 to load CNT_COUNTER from CNT_MAX */
+
+#define W1LMIN_ZERO (1 << 4) /* write 1 to load CNT_MIN with zero */
+#define W1LMIN_CNT (1 << 5) /* write 1 to load CNT_MIN from CNT_COUNTER */
+#define W1LMIN_MAX (1 << 7) /* write 1 to load CNT_MIN from CNT_MAX */
+
+#define W1LMAX_ZERO (1 << 8) /* write 1 to load CNT_MAX with zero */
+#define W1LMAX_CNT (1 << 9) /* write 1 to load CNT_MAX from CNT_COUNTER */
+#define W1LMAX_MIN (1 << 10) /* write 1 to load CNT_MAX from CNT_MIN */
+
+/* CNT_DEBOUNCE bitmasks */
+#define DPRESCALE 0xf /* Load Counter Register */
+
+#endif
--
cgit v1.2.3


From 5ea0699a7b452c9e1ecdc81e354fa91dfe8da4f6 Mon Sep 17 00:00:00 2001
From: Sonic Zhang
Date: Thu, 5 Feb 2015 21:23:10 -0800
Subject: Input: bfin_rotary - move pin lists into platform data

Newer Blackfin boards use the pinctrl API to manage pins, so the legacy
peripheral lists are not useful on them. Let's move the pin lists into
platform data so that older boards can still use them and newer boards
can use the modern API.
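For illustration, a legacy board entry would now carry its own zero-terminated
pin list in platform data, while pinctrl-based boards simply leave .pin_list
NULL so the driver skips peripheral_request_list() entirely. A minimal sketch
follows; the board_* names and the REL_WHEEL/debounce choices are assumptions
for this example (P_CNT_* come from the board's Blackfin portmux headers),
not part of the series:

#include <linux/input.h>			/* REL_*, KEY_* event codes */
#include <linux/platform_data/bfin_rotary.h>

/* Zero-terminated, as peripheral_request_list() expects. */
static u16 board_rotary_pins[] = {
	P_CNT_CUD,	/* count up / direction */
	P_CNT_CDG,	/* count down / gate */
	P_CNT_CZM,	/* counter zero marker */
	0
};

static struct bfin_rotary_platform_data board_rotary_data = {
	.rotary_rel_code = REL_WHEEL,		/* report EV_REL events */
	.debounce	 = 10,			/* 0..17 */
	.mode		 = ROT_QUAD_ENC | ROT_DEBE,
	.pin_list	 = board_rotary_pins,	/* omit on pinctrl boards */
};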
Signed-off-by: Sonic Zhang Signed-off-by: Dmitry Torokhov --- arch/blackfin/mach-bf527/boards/ad7160eval.c | 8 ++++++++ arch/blackfin/mach-bf527/boards/ezkit.c | 8 ++++++++ drivers/input/misc/bfin_rotary.c | 30 +++++++++++++++------------- include/linux/platform_data/bfin_rotary.h | 1 + 4 files changed, 33 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/arch/blackfin/mach-bf527/boards/ad7160eval.c b/arch/blackfin/mach-bf527/boards/ad7160eval.c index beb011b6d2b3..029a0506b669 100644 --- a/arch/blackfin/mach-bf527/boards/ad7160eval.c +++ b/arch/blackfin/mach-bf527/boards/ad7160eval.c @@ -668,6 +668,13 @@ static struct platform_device bfin_sport1_uart_device = { #if IS_ENABLED(CONFIG_INPUT_BFIN_ROTARY) #include +static const u16 per_cnt[] = { + P_CNT_CUD, + P_CNT_CDG, + P_CNT_CZM, + 0 +}; + static struct bfin_rotary_platform_data bfin_rotary_data = { /*.rotary_up_key = KEY_UP,*/ /*.rotary_down_key = KEY_DOWN,*/ @@ -676,6 +683,7 @@ static struct bfin_rotary_platform_data bfin_rotary_data = { .debounce = 10, /* 0..17 */ .mode = ROT_QUAD_ENC | ROT_DEBE, .pm_wakeup = 1, + .pin_list = per_cnt, }; static struct resource bfin_rotary_resources[] = { diff --git a/arch/blackfin/mach-bf527/boards/ezkit.c b/arch/blackfin/mach-bf527/boards/ezkit.c index 728cda469952..cc80c5ba9f67 100644 --- a/arch/blackfin/mach-bf527/boards/ezkit.c +++ b/arch/blackfin/mach-bf527/boards/ezkit.c @@ -1094,6 +1094,13 @@ static struct platform_device bfin_device_gpiokeys = { #if IS_ENABLED(CONFIG_INPUT_BFIN_ROTARY) #include +static const u16 per_cnt[] = { + P_CNT_CUD, + P_CNT_CDG, + P_CNT_CZM, + 0 +}; + static struct bfin_rotary_platform_data bfin_rotary_data = { /*.rotary_up_key = KEY_UP,*/ /*.rotary_down_key = KEY_DOWN,*/ @@ -1102,6 +1109,7 @@ static struct bfin_rotary_platform_data bfin_rotary_data = { .debounce = 10, /* 0..17 */ .mode = ROT_QUAD_ENC | ROT_DEBE, .pm_wakeup = 1, + .pin_list = per_cnt, }; static struct resource bfin_rotary_resources[] = { diff --git a/drivers/input/misc/bfin_rotary.c b/drivers/input/misc/bfin_rotary.c index 35ead69521a0..8312b29ab18d 100644 --- a/drivers/input/misc/bfin_rotary.c +++ b/drivers/input/misc/bfin_rotary.c @@ -16,13 +16,6 @@ #include -static const u16 per_cnt[] = { - P_CNT_CUD, - P_CNT_CDG, - P_CNT_CZM, - 0 -}; - struct bfin_rot { struct input_dev *input; int irq; @@ -90,7 +83,8 @@ static irqreturn_t bfin_rotary_isr(int irq, void *dev_id) static int bfin_rotary_probe(struct platform_device *pdev) { - struct bfin_rotary_platform_data *pdata = dev_get_platdata(&pdev->dev); + const struct bfin_rotary_platform_data *pdata = + dev_get_platdata(&pdev->dev); struct bfin_rot *rotary; struct input_dev *input; int error; @@ -101,10 +95,13 @@ static int bfin_rotary_probe(struct platform_device *pdev) return -EINVAL; } - error = peripheral_request_list(per_cnt, dev_name(&pdev->dev)); - if (error) { - dev_err(&pdev->dev, "requesting peripherals failed\n"); - return error; + if (pdata->pin_list) { + error = peripheral_request_list(pdata->pin_list, + dev_name(&pdev->dev)); + if (error) { + dev_err(&pdev->dev, "requesting peripherals failed\n"); + return error; + } } rotary = kzalloc(sizeof(struct bfin_rot), GFP_KERNEL); @@ -189,13 +186,16 @@ out2: out1: input_free_device(input); kfree(rotary); - peripheral_free_list(per_cnt); + if (pdata->pin_list) + peripheral_free_list(pdata->pin_list); return error; } static int bfin_rotary_remove(struct platform_device *pdev) { + const struct bfin_rotary_platform_data *pdata = + dev_get_platdata(&pdev->dev); struct bfin_rot *rotary = 
platform_get_drvdata(pdev); bfin_write_CNT_CONFIG(0); @@ -203,7 +203,9 @@ static int bfin_rotary_remove(struct platform_device *pdev) free_irq(rotary->irq, rotary); input_unregister_device(rotary->input); - peripheral_free_list(per_cnt); + + if (pdata->pin_list) + peripheral_free_list(pdata->pin_list); kfree(rotary); diff --git a/include/linux/platform_data/bfin_rotary.h b/include/linux/platform_data/bfin_rotary.h index 8895a750c70c..98829370fee2 100644 --- a/include/linux/platform_data/bfin_rotary.h +++ b/include/linux/platform_data/bfin_rotary.h @@ -40,6 +40,7 @@ struct bfin_rotary_platform_data { unsigned short debounce; /* 0..17 */ unsigned short mode; unsigned short pm_wakeup; + unsigned short *pin_list; }; /* CNT_CONFIG bitmasks */ -- cgit v1.2.3 From e084c1bd40926938ff8d26af3bde34396dd4d06d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 16 Feb 2015 14:32:03 -0500 Subject: Revert "locks: keep a count of locks on the flctx lists" This reverts commit 9bd0f45b7037fcfa8b575c7e27d0431d6e6dc3bb. Linus rightly pointed out that I failed to initialize the counters when adding them, so they don't work as expected. Just revert this patch for now. Reported-by: Linus Torvalds Signed-off-by: Jeff Layton --- fs/ceph/locks.c | 9 +++++++-- fs/cifs/file.c | 14 ++++++++++---- fs/locks.c | 45 ++++++++++++++++----------------------------- include/linux/fs.h | 3 --- 4 files changed, 33 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 06ea5cd05cd9..4347039ecc18 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -245,6 +245,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) */ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) { + struct file_lock *lock; struct file_lock_context *ctx; *fcntl_count = 0; @@ -252,8 +253,12 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) ctx = inode->i_flctx; if (ctx) { - *fcntl_count = ctx->flc_posix_cnt; - *flock_count = ctx->flc_flock_cnt; + spin_lock(&ctx->flc_lock); + list_for_each_entry(lock, &ctx->flc_posix, fl_list) + ++(*fcntl_count); + list_for_each_entry(lock, &ctx->flc_flock, fl_list) + ++(*flock_count); + spin_unlock(&ctx->flc_lock); } dout("counted %d flock locks and %d fcntl locks", *flock_count, *fcntl_count); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 8fe1f7a21b3e..a94b3e673182 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1129,7 +1129,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct file_lock *flock; struct file_lock_context *flctx = inode->i_flctx; - unsigned int i; + unsigned int count = 0, i; int rc = 0, xid, type; struct list_head locks_to_send, *el; struct lock_to_push *lck, *tmp; @@ -1140,14 +1140,20 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) if (!flctx) goto out; + spin_lock(&flctx->flc_lock); + list_for_each(el, &flctx->flc_posix) { + count++; + } + spin_unlock(&flctx->flc_lock); + INIT_LIST_HEAD(&locks_to_send); /* - * Allocating flc_posix_cnt locks is enough because no FL_POSIX locks - * can be added to the list while we are holding cinode->lock_sem that + * Allocating count locks is enough because no FL_POSIX locks can be + * added to the list while we are holding cinode->lock_sem that * protects locking operations of this inode. 
*/ - for (i = 0; i < flctx->flc_posix_cnt; i++) { + for (i = 0; i < count; i++) { lck = kmalloc(sizeof(struct lock_to_push), GFP_KERNEL); if (!lck) { rc = -ENOMEM; diff --git a/fs/locks.c b/fs/locks.c index 4753218f308e..7998f670812c 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -681,21 +681,18 @@ static void locks_wake_up_blocks(struct file_lock *blocker) } static void -locks_insert_lock_ctx(struct file_lock *fl, int *counter, - struct list_head *before) +locks_insert_lock_ctx(struct file_lock *fl, struct list_head *before) { fl->fl_nspid = get_pid(task_tgid(current)); list_add_tail(&fl->fl_list, before); - ++*counter; locks_insert_global_locks(fl); } static void -locks_unlink_lock_ctx(struct file_lock *fl, int *counter) +locks_unlink_lock_ctx(struct file_lock *fl) { locks_delete_global_locks(fl); list_del_init(&fl->fl_list); - --*counter; if (fl->fl_nspid) { put_pid(fl->fl_nspid); fl->fl_nspid = NULL; @@ -704,10 +701,9 @@ locks_unlink_lock_ctx(struct file_lock *fl, int *counter) } static void -locks_delete_lock_ctx(struct file_lock *fl, int *counter, - struct list_head *dispose) +locks_delete_lock_ctx(struct file_lock *fl, struct list_head *dispose) { - locks_unlink_lock_ctx(fl, counter); + locks_unlink_lock_ctx(fl); if (dispose) list_add(&fl->fl_list, dispose); else @@ -895,7 +891,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request) if (request->fl_type == fl->fl_type) goto out; found = true; - locks_delete_lock_ctx(fl, &ctx->flc_flock_cnt, &dispose); + locks_delete_lock_ctx(fl, &dispose); break; } @@ -929,7 +925,7 @@ find_conflict: if (request->fl_flags & FL_ACCESS) goto out; locks_copy_lock(new_fl, request); - locks_insert_lock_ctx(new_fl, &ctx->flc_flock_cnt, &ctx->flc_flock); + locks_insert_lock_ctx(new_fl, &ctx->flc_flock); new_fl = NULL; error = 0; @@ -1046,8 +1042,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str else request->fl_end = fl->fl_end; if (added) { - locks_delete_lock_ctx(fl, &ctx->flc_posix_cnt, - &dispose); + locks_delete_lock_ctx(fl, &dispose); continue; } request = fl; @@ -1076,8 +1071,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str * one (This may happen several times). 
*/ if (added) { - locks_delete_lock_ctx(fl, - &ctx->flc_posix_cnt, &dispose); + locks_delete_lock_ctx(fl, &dispose); continue; } /* @@ -1093,10 +1087,8 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str locks_copy_lock(new_fl, request); request = new_fl; new_fl = NULL; - locks_insert_lock_ctx(request, - &ctx->flc_posix_cnt, &fl->fl_list); - locks_delete_lock_ctx(fl, - &ctx->flc_posix_cnt, &dispose); + locks_insert_lock_ctx(request, &fl->fl_list); + locks_delete_lock_ctx(fl, &dispose); added = true; } } @@ -1124,8 +1116,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str goto out; } locks_copy_lock(new_fl, request); - locks_insert_lock_ctx(new_fl, &ctx->flc_posix_cnt, - &fl->fl_list); + locks_insert_lock_ctx(new_fl, &fl->fl_list); new_fl = NULL; } if (right) { @@ -1136,8 +1127,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str left = new_fl2; new_fl2 = NULL; locks_copy_lock(left, right); - locks_insert_lock_ctx(left, &ctx->flc_posix_cnt, - &fl->fl_list); + locks_insert_lock_ctx(left, &fl->fl_list); } right->fl_start = request->fl_end + 1; locks_wake_up_blocks(right); @@ -1321,7 +1311,6 @@ static void lease_clear_pending(struct file_lock *fl, int arg) /* We already had a lease on this file; just change its type */ int lease_modify(struct file_lock *fl, int arg, struct list_head *dispose) { - struct file_lock_context *flctx; int error = assign_type(fl, arg); if (error) @@ -1331,7 +1320,6 @@ int lease_modify(struct file_lock *fl, int arg, struct list_head *dispose) if (arg == F_UNLCK) { struct file *filp = fl->fl_file; - flctx = file_inode(filp)->i_flctx; f_delown(filp); filp->f_owner.signum = 0; fasync_helper(0, fl->fl_file, 0, &fl->fl_fasync); @@ -1339,7 +1327,7 @@ int lease_modify(struct file_lock *fl, int arg, struct list_head *dispose) printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync); fl->fl_fasync = NULL; } - locks_delete_lock_ctx(fl, &flctx->flc_lease_cnt, dispose); + locks_delete_lock_ctx(fl, dispose); } return 0; } @@ -1456,8 +1444,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) fl->fl_downgrade_time = break_time; } if (fl->fl_lmops->lm_break(fl)) - locks_delete_lock_ctx(fl, &ctx->flc_lease_cnt, - &dispose); + locks_delete_lock_ctx(fl, &dispose); } if (list_empty(&ctx->flc_lease)) @@ -1697,7 +1684,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr if (!leases_enable) goto out; - locks_insert_lock_ctx(lease, &ctx->flc_lease_cnt, &ctx->flc_lease); + locks_insert_lock_ctx(lease, &ctx->flc_lease); /* * The check in break_lease() is lockless. 
It's possible for another * open to race in after we did the earlier check for a conflicting @@ -1710,7 +1697,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr smp_mb(); error = check_conflicting_open(dentry, arg, lease->fl_flags); if (error) { - locks_unlink_lock_ctx(lease, &ctx->flc_lease_cnt); + locks_unlink_lock_ctx(lease); goto out; } diff --git a/include/linux/fs.h b/include/linux/fs.h index e49f10cc8a73..a5a303e8a33c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -969,9 +969,6 @@ struct file_lock_context { struct list_head flc_flock; struct list_head flc_posix; struct list_head flc_lease; - int flc_flock_cnt; - int flc_posix_cnt; - int flc_lease_cnt; }; /* The following constant reflects the upper bound of the file/locking space */ -- cgit v1.2.3 From 2e4cdab0584fa884e0a81c4f45b93ce875c9fcaa Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 16 Feb 2015 15:58:50 -0800 Subject: mm: allow page fault handlers to perform the COW Currently COW of an XIP file is done by first bringing in a read-only mapping, then retrying the fault and copying the page. It is much more efficient to tell the fault handler that a COW is being attempted (by passing in the pre-allocated page in the vm_fault structure), and allow the handler to perform the COW operation itself. The handler cannot insert the page itself if there is already a read-only mapping at that address, so allow the handler to return VM_FAULT_LOCKED and set the fault_page to be NULL. This indicates to the MM code that the i_mmap_lock is held instead of the page lock. Signed-off-by: Matthew Wilcox Acked-by: Kirill A. Shutemov Cc: Andreas Dilger Cc: Boaz Harrosh Cc: Christoph Hellwig Cc: Dave Chinner Cc: Jan Kara Cc: Jens Axboe Cc: Mathieu Desnoyers Cc: Randy Dunlap Cc: Ross Zwisler Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + mm/memory.c | 41 ++++++++++++++++++++++++++++++++--------- 2 files changed, 33 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9bee7ec0c31f..47a93928b90f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -224,6 +224,7 @@ struct vm_fault { pgoff_t pgoff; /* Logical page offset based on vma */ void __user *virtual_address; /* Faulting virtual address */ + struct page *cow_page; /* Handler may choose to COW */ struct page *page; /* ->fault handlers should return a * page here, unless VM_FAULT_NOPAGE * is set (which is also implied by diff --git a/mm/memory.c b/mm/memory.c index 1b04e13b9993..8068893697bb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1965,6 +1965,7 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, vmf.pgoff = page->index; vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; vmf.page = page; + vmf.cow_page = NULL; ret = vma->vm_ops->page_mkwrite(vma, &vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) @@ -2639,7 +2640,8 @@ oom: * See filemap_fault() and __lock_page_retry(). 
*/ static int __do_fault(struct vm_area_struct *vma, unsigned long address, - pgoff_t pgoff, unsigned int flags, struct page **page) + pgoff_t pgoff, unsigned int flags, + struct page *cow_page, struct page **page) { struct vm_fault vmf; int ret; @@ -2648,10 +2650,13 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, vmf.pgoff = pgoff; vmf.flags = flags; vmf.page = NULL; + vmf.cow_page = cow_page; ret = vma->vm_ops->fault(vma, &vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; + if (!vmf.page) + goto out; if (unlikely(PageHWPoison(vmf.page))) { if (ret & VM_FAULT_LOCKED) @@ -2665,6 +2670,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, else VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page); + out: *page = vmf.page; return ret; } @@ -2835,7 +2841,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_unmap_unlock(pte, ptl); } - ret = __do_fault(vma, address, pgoff, flags, &fault_page); + ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; @@ -2875,26 +2881,43 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_OOM; } - ret = __do_fault(vma, address, pgoff, flags, &fault_page); + ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; - copy_user_highpage(new_page, fault_page, address, vma); + if (fault_page) + copy_user_highpage(new_page, fault_page, address, vma); __SetPageUptodate(new_page); pte = pte_offset_map_lock(mm, pmd, address, &ptl); if (unlikely(!pte_same(*pte, orig_pte))) { pte_unmap_unlock(pte, ptl); - unlock_page(fault_page); - page_cache_release(fault_page); + if (fault_page) { + unlock_page(fault_page); + page_cache_release(fault_page); + } else { + /* + * The fault handler has no page to lock, so it holds + * i_mmap_lock for read to protect against truncate. + */ + i_mmap_unlock_read(vma->vm_file->f_mapping); + } goto uncharge_out; } do_set_pte(vma, address, new_page, pte, true, true); mem_cgroup_commit_charge(new_page, memcg, false); lru_cache_add_active_or_unevictable(new_page, vma); pte_unmap_unlock(pte, ptl); - unlock_page(fault_page); - page_cache_release(fault_page); + if (fault_page) { + unlock_page(fault_page); + page_cache_release(fault_page); + } else { + /* + * The fault handler has no page to lock, so it holds + * i_mmap_lock for read to protect against truncate. + */ + i_mmap_unlock_read(vma->vm_file->f_mapping); + } return ret; uncharge_out: mem_cgroup_cancel_charge(new_page, memcg); @@ -2913,7 +2936,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, int dirtied = 0; int ret, tmp; - ret = __do_fault(vma, address, pgoff, flags, &fault_page); + ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; -- cgit v1.2.3 From fbbbad4bc2101e452b24e6e65d3d5e11314a0b5f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 16 Feb 2015 15:58:53 -0800 Subject: vfs,ext2: introduce IS_DAX(inode) Use an inode flag to tag inodes which should avoid using the page cache. Convert ext2 to use it instead of mapping_is_xip(). Prevent I/Os to files tagged with the DAX flag from falling back to buffered I/O. 
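As a sketch of the intended usage pattern (the myfs_* names here are
hypothetical, not part of this patch): a filesystem derives S_DAX once in its
inode-flags helper, and any path that could fall back to buffered I/O tests
IS_DAX() first.

#include <linux/fs.h>

/* Modeled on ext2_set_inode_flags() in the diff below. */
static void myfs_set_inode_flags(struct inode *inode, bool dax_capable)
{
	inode->i_flags &= ~S_DAX;
	if (dax_capable)
		inode->i_flags |= S_DAX;	/* bypass the page cache */
}

static bool myfs_may_use_page_cache(struct inode *inode)
{
	/*
	 * DAX inodes must never pick up dirty page-cache pages, so
	 * buffered fallbacks are skipped whenever IS_DAX() is true.
	 */
	return !IS_DAX(inode);
}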
Signed-off-by: Matthew Wilcox Reviewed-by: Jan Kara Reviewed-by: Mathieu Desnoyers Cc: Andreas Dilger Cc: Boaz Harrosh Cc: Christoph Hellwig Cc: Dave Chinner Cc: Jens Axboe Cc: Kirill A. Shutemov Cc: Randy Dunlap Cc: Ross Zwisler Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext2/inode.c | 9 ++++++--- fs/ext2/xip.h | 2 -- include/linux/fs.h | 6 ++++++ mm/filemap.c | 19 ++++++++++++------- 4 files changed, 24 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 36d35c36311d..0cb04486577d 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -731,7 +731,7 @@ static int ext2_get_blocks(struct inode *inode, goto cleanup; } - if (ext2_use_xip(inode->i_sb)) { + if (IS_DAX(inode)) { /* * we need to clear the block */ @@ -1201,7 +1201,7 @@ static int ext2_setsize(struct inode *inode, loff_t newsize) inode_dio_wait(inode); - if (mapping_is_xip(inode->i_mapping)) + if (IS_DAX(inode)) error = xip_truncate_page(inode->i_mapping, newsize); else if (test_opt(inode->i_sb, NOBH)) error = nobh_truncate_page(inode->i_mapping, @@ -1273,7 +1273,8 @@ void ext2_set_inode_flags(struct inode *inode) { unsigned int flags = EXT2_I(inode)->i_flags; - inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | + S_DIRSYNC | S_DAX); if (flags & EXT2_SYNC_FL) inode->i_flags |= S_SYNC; if (flags & EXT2_APPEND_FL) @@ -1284,6 +1285,8 @@ void ext2_set_inode_flags(struct inode *inode) inode->i_flags |= S_NOATIME; if (flags & EXT2_DIRSYNC_FL) inode->i_flags |= S_DIRSYNC; + if (test_opt(inode->i_sb, XIP)) + inode->i_flags |= S_DAX; } /* Propagate flags from i_flags to EXT2_I(inode)->i_flags */ diff --git a/fs/ext2/xip.h b/fs/ext2/xip.h index 18b34d2f31b3..29be73781419 100644 --- a/fs/ext2/xip.h +++ b/fs/ext2/xip.h @@ -16,9 +16,7 @@ static inline int ext2_use_xip (struct super_block *sb) } int ext2_get_xip_mem(struct address_space *, pgoff_t, int, void **, unsigned long *); -#define mapping_is_xip(map) unlikely(map->a_ops->get_xip_mem) #else -#define mapping_is_xip(map) 0 #define ext2_xip_verify_sb(sb) do { } while (0) #define ext2_use_xip(sb) 0 #define ext2_clear_xip_target(inode, chain) 0 diff --git a/include/linux/fs.h b/include/linux/fs.h index e49f10cc8a73..fb373bb5cf03 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1677,6 +1677,11 @@ struct super_operations { #define S_IMA 1024 /* Inode has an associated IMA struct */ #define S_AUTOMOUNT 2048 /* Automount/referral quasi-directory */ #define S_NOSEC 4096 /* no suid or xattr security attributes */ +#ifdef CONFIG_FS_XIP +#define S_DAX 8192 /* Direct Access, avoiding the page cache */ +#else +#define S_DAX 0 /* Make all the DAX code disappear */ +#endif /* * Note that nosuid etc flags are inode-specific: setting some file-system @@ -1714,6 +1719,7 @@ struct super_operations { #define IS_IMA(inode) ((inode)->i_flags & S_IMA) #define IS_AUTOMOUNT(inode) ((inode)->i_flags & S_AUTOMOUNT) #define IS_NOSEC(inode) ((inode)->i_flags & S_NOSEC) +#define IS_DAX(inode) ((inode)->i_flags & S_DAX) #define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \ (inode)->i_rdev == WHITEOUT_DEV) diff --git a/mm/filemap.c b/mm/filemap.c index d9f5336552d7..1578c224285e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1723,9 +1723,11 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) * we've already read everything we wanted to, or if * there was a short read because we hit EOF, go ahead * and 
return. Otherwise fallthrough to buffered io for - * the rest of the read. + * the rest of the read. Buffered reads will not work for + * DAX files, so don't bother trying. */ - if (retval < 0 || !iov_iter_count(iter) || *ppos >= size) { + if (retval < 0 || !iov_iter_count(iter) || *ppos >= size || + IS_DAX(inode)) { file_accessed(file); goto out; } @@ -2587,13 +2589,16 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) loff_t endbyte; written = generic_file_direct_write(iocb, from, pos); - if (written < 0 || written == count) - goto out; - /* - * direct-io write to a hole: fall through to buffered I/O - * for completing the rest of the request. + * If the write stopped short of completing, fall back to + * buffered writes. Some filesystems do this for writes to + * holes, for example. For DAX files, a buffered write will + * not succeed (even if it did, DAX does not handle dirty + * page-cache pages correctly). */ + if (written < 0 || written == count || IS_DAX(inode)) + goto out; + pos += written; count -= written; -- cgit v1.2.3 From d475c6346a38aef3058eba96867bfa726a3cc940 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 16 Feb 2015 15:58:56 -0800 Subject: dax,ext2: replace XIP read and write with DAX I/O Use the generic AIO infrastructure instead of custom read and write methods. In addition to giving us support for AIO, this adds the missing locking between read() and truncate(). Signed-off-by: Matthew Wilcox Reviewed-by: Ross Zwisler Reviewed-by: Jan Kara Cc: Andreas Dilger Cc: Boaz Harrosh Cc: Christoph Hellwig Cc: Dave Chinner Cc: Jens Axboe Cc: Kirill A. Shutemov Cc: Mathieu Desnoyers Cc: Randy Dunlap Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 6 ++ fs/Makefile | 1 + fs/dax.c | 186 ++++++++++++++++++++++++++++++++++++++++++ fs/ext2/file.c | 6 +- fs/ext2/inode.c | 8 +- include/linux/fs.h | 12 ++- mm/filemap.c | 6 +- mm/filemap_xip.c | 234 ----------------------------------------------------- 8 files changed, 214 insertions(+), 245 deletions(-) create mode 100644 fs/dax.c (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 348f5c16ef50..8670c224c833 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3151,6 +3151,12 @@ L: linux-i2c@vger.kernel.org S: Maintained F: drivers/i2c/busses/i2c-diolan-u2c.c +DIRECT ACCESS (DAX) +M: Matthew Wilcox +L: linux-fsdevel@vger.kernel.org +S: Supported +F: fs/dax.c + DIRECTORY NOTIFICATION (DNOTIFY) M: Eric Paris S: Maintained diff --git a/fs/Makefile b/fs/Makefile index bedff48e8fdc..0534444e257c 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -28,6 +28,7 @@ obj-$(CONFIG_SIGNALFD) += signalfd.o obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o obj-$(CONFIG_AIO) += aio.o +obj-$(CONFIG_FS_XIP) += dax.o obj-$(CONFIG_FILE_LOCKING) += locks.o obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o diff --git a/fs/dax.c b/fs/dax.c new file mode 100644 index 000000000000..1a2bdbfa3ea9 --- /dev/null +++ b/fs/dax.c @@ -0,0 +1,186 @@ +/* + * fs/dax.c - Direct Access filesystem code + * Copyright (c) 2013-2014 Intel Corporation + * Author: Matthew Wilcox + * Author: Ross Zwisler + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include +#include +#include +#include +#include +#include +#include + +static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits) +{ + unsigned long pfn; + sector_t sector = bh->b_blocknr << (blkbits - 9); + return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size); +} + +static void dax_new_buf(void *addr, unsigned size, unsigned first, loff_t pos, + loff_t end) +{ + loff_t final = end - pos + first; /* The final byte of the buffer */ + + if (first > 0) + memset(addr, 0, first); + if (final < size) + memset(addr + final, 0, size - final); +} + +static bool buffer_written(struct buffer_head *bh) +{ + return buffer_mapped(bh) && !buffer_unwritten(bh); +} + +/* + * When ext4 encounters a hole, it returns without modifying the buffer_head + * which means that we can't trust b_size. To cope with this, we set b_state + * to 0 before calling get_block and, if any bit is set, we know we can trust + * b_size. Unfortunate, really, since ext4 knows precisely how long a hole is + * and would save us time calling get_block repeatedly. + */ +static bool buffer_size_valid(struct buffer_head *bh) +{ + return bh->b_state != 0; +} + +static ssize_t dax_io(int rw, struct inode *inode, struct iov_iter *iter, + loff_t start, loff_t end, get_block_t get_block, + struct buffer_head *bh) +{ + ssize_t retval = 0; + loff_t pos = start; + loff_t max = start; + loff_t bh_max = start; + void *addr; + bool hole = false; + + if (rw != WRITE) + end = min(end, i_size_read(inode)); + + while (pos < end) { + unsigned len; + if (pos == max) { + unsigned blkbits = inode->i_blkbits; + sector_t block = pos >> blkbits; + unsigned first = pos - (block << blkbits); + long size; + + if (pos == bh_max) { + bh->b_size = PAGE_ALIGN(end - pos); + bh->b_state = 0; + retval = get_block(inode, block, bh, + rw == WRITE); + if (retval) + break; + if (!buffer_size_valid(bh)) + bh->b_size = 1 << blkbits; + bh_max = pos - first + bh->b_size; + } else { + unsigned done = bh->b_size - + (bh_max - (pos - first)); + bh->b_blocknr += done >> blkbits; + bh->b_size -= done; + } + + hole = (rw != WRITE) && !buffer_written(bh); + if (hole) { + addr = NULL; + size = bh->b_size - first; + } else { + retval = dax_get_addr(bh, &addr, blkbits); + if (retval < 0) + break; + if (buffer_unwritten(bh) || buffer_new(bh)) + dax_new_buf(addr, retval, first, pos, + end); + addr += first; + size = retval - first; + } + max = min(pos + size, end); + } + + if (rw == WRITE) + len = copy_from_iter(addr, max - pos, iter); + else if (!hole) + len = copy_to_iter(addr, max - pos, iter); + else + len = iov_iter_zero(max - pos, iter); + + if (!len) + break; + + pos += len; + addr += len; + } + + return (pos == start) ? 
retval : pos - start; +} + +/** + * dax_do_io - Perform I/O to a DAX file + * @rw: READ to read or WRITE to write + * @iocb: The control block for this I/O + * @inode: The file which the I/O is directed at + * @iter: The addresses to do I/O from or to + * @pos: The file offset where the I/O starts + * @get_block: The filesystem method used to translate file offsets to blocks + * @end_io: A filesystem callback for I/O completion + * @flags: See below + * + * This function uses the same locking scheme as do_blockdev_direct_IO: + * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the + * caller for writes. For reads, we take and release the i_mutex ourselves. + * If DIO_LOCKING is not set, the filesystem takes care of its own locking. + * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O + * is in progress. + */ +ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode *inode, + struct iov_iter *iter, loff_t pos, + get_block_t get_block, dio_iodone_t end_io, int flags) +{ + struct buffer_head bh; + ssize_t retval = -EINVAL; + loff_t end = pos + iov_iter_count(iter); + + memset(&bh, 0, sizeof(bh)); + + if ((flags & DIO_LOCKING) && (rw == READ)) { + struct address_space *mapping = inode->i_mapping; + mutex_lock(&inode->i_mutex); + retval = filemap_write_and_wait_range(mapping, pos, end - 1); + if (retval) { + mutex_unlock(&inode->i_mutex); + goto out; + } + } + + /* Protects against truncate */ + atomic_inc(&inode->i_dio_count); + + retval = dax_io(rw, inode, iter, pos, end, get_block, &bh); + + if ((flags & DIO_LOCKING) && (rw == READ)) + mutex_unlock(&inode->i_mutex); + + if ((retval > 0) && end_io) + end_io(iocb, pos, retval, bh.b_private); + + inode_dio_done(inode); + out: + return retval; +} +EXPORT_SYMBOL_GPL(dax_do_io); diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 7c87b22a7228..a247123fd798 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -81,8 +81,10 @@ const struct file_operations ext2_file_operations = { #ifdef CONFIG_EXT2_FS_XIP const struct file_operations ext2_xip_file_operations = { .llseek = generic_file_llseek, - .read = xip_file_read, - .write = xip_file_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, .unlocked_ioctl = ext2_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext2_compat_ioctl, diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 0cb04486577d..3ccd5fd47d66 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -859,7 +859,12 @@ ext2_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, size_t count = iov_iter_count(iter); ssize_t ret; - ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, ext2_get_block); + if (IS_DAX(inode)) + ret = dax_do_io(rw, iocb, inode, iter, offset, ext2_get_block, + NULL, DIO_LOCKING); + else + ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, + ext2_get_block); if (ret < 0 && (rw & WRITE)) ext2_write_failed(mapping, offset + count); return ret; @@ -888,6 +893,7 @@ const struct address_space_operations ext2_aops = { const struct address_space_operations ext2_aops_xip = { .bmap = ext2_bmap, .get_xip_mem = ext2_get_xip_mem, + .direct_IO = ext2_direct_IO, }; const struct address_space_operations ext2_nobh_aops = { diff --git a/include/linux/fs.h b/include/linux/fs.h index fb373bb5cf03..241c3c030fb5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2587,12 +2587,11 @@ extern loff_t fixed_size_llseek(struct file *file, loff_t offset, extern int generic_file_open(struct inode * 
inode, struct file * filp); extern int nonseekable_open(struct inode * inode, struct file * filp); +ssize_t dax_do_io(int rw, struct kiocb *, struct inode *, struct iov_iter *, + loff_t, get_block_t, dio_iodone_t, int flags); + #ifdef CONFIG_FS_XIP -extern ssize_t xip_file_read(struct file *filp, char __user *buf, size_t len, - loff_t *ppos); extern int xip_file_mmap(struct file * file, struct vm_area_struct * vma); -extern ssize_t xip_file_write(struct file *filp, const char __user *buf, - size_t len, loff_t *ppos); extern int xip_truncate_page(struct address_space *mapping, loff_t from); #else static inline int xip_truncate_page(struct address_space *mapping, loff_t from) @@ -2756,6 +2755,11 @@ extern int generic_show_options(struct seq_file *m, struct dentry *root); extern void save_mount_options(struct super_block *sb, char *options); extern void replace_mount_options(struct super_block *sb, char *options); +static inline bool io_is_direct(struct file *filp) +{ + return (filp->f_flags & O_DIRECT) || IS_DAX(file_inode(filp)); +} + static inline ino_t parent_ino(struct dentry *dentry) { ino_t res; diff --git a/mm/filemap.c b/mm/filemap.c index 1578c224285e..ad7242043bdb 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1695,8 +1695,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) loff_t *ppos = &iocb->ki_pos; loff_t pos = *ppos; - /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ - if (file->f_flags & O_DIRECT) { + if (io_is_direct(file)) { struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; size_t count = iov_iter_count(iter); @@ -2584,8 +2583,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (err) goto out; - /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ - if (unlikely(file->f_flags & O_DIRECT)) { + if (io_is_direct(file)) { loff_t endbyte; written = generic_file_direct_write(iocb, from, pos); diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 59e1c5585748..9c869f402c07 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -42,119 +42,6 @@ static struct page *xip_sparse_page(void) return __xip_sparse_page; } -/* - * This is a file read routine for execute in place files, and uses - * the mapping->a_ops->get_xip_mem() function for the actual low-level - * stuff. - * - * Note the struct file* is not used at all. It may be NULL. 
- */ -static ssize_t -do_xip_mapping_read(struct address_space *mapping, - struct file_ra_state *_ra, - struct file *filp, - char __user *buf, - size_t len, - loff_t *ppos) -{ - struct inode *inode = mapping->host; - pgoff_t index, end_index; - unsigned long offset; - loff_t isize, pos; - size_t copied = 0, error = 0; - - BUG_ON(!mapping->a_ops->get_xip_mem); - - pos = *ppos; - index = pos >> PAGE_CACHE_SHIFT; - offset = pos & ~PAGE_CACHE_MASK; - - isize = i_size_read(inode); - if (!isize) - goto out; - - end_index = (isize - 1) >> PAGE_CACHE_SHIFT; - do { - unsigned long nr, left; - void *xip_mem; - unsigned long xip_pfn; - int zero = 0; - - /* nr is the maximum number of bytes to copy from this page */ - nr = PAGE_CACHE_SIZE; - if (index >= end_index) { - if (index > end_index) - goto out; - nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; - if (nr <= offset) { - goto out; - } - } - nr = nr - offset; - if (nr > len - copied) - nr = len - copied; - - error = mapping->a_ops->get_xip_mem(mapping, index, 0, - &xip_mem, &xip_pfn); - if (unlikely(error)) { - if (error == -ENODATA) { - /* sparse */ - zero = 1; - } else - goto out; - } - - /* If users can be writing to this page using arbitrary - * virtual addresses, take care about potential aliasing - * before reading the page on the kernel side. - */ - if (mapping_writably_mapped(mapping)) - /* address based flush */ ; - - /* - * Ok, we have the mem, so now we can copy it to user space... - * - * The actor routine returns how many bytes were actually used.. - * NOTE! This may not be the same as how much of a user buffer - * we filled up (we may be padding etc), so we can only update - * "pos" here (the actor routine has to update the user buffer - * pointers and the remaining count). - */ - if (!zero) - left = __copy_to_user(buf+copied, xip_mem+offset, nr); - else - left = __clear_user(buf + copied, nr); - - if (left) { - error = -EFAULT; - goto out; - } - - copied += (nr - left); - offset += (nr - left); - index += offset >> PAGE_CACHE_SHIFT; - offset &= ~PAGE_CACHE_MASK; - } while (copied < len); - -out: - *ppos = pos + copied; - if (filp) - file_accessed(filp); - - return (copied ? 
copied : error); -} - -ssize_t -xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) -{ - if (!access_ok(VERIFY_WRITE, buf, len)) - return -EFAULT; - - return do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp, - buf, len, ppos); -} -EXPORT_SYMBOL_GPL(xip_file_read); - /* * __xip_unmap is invoked from xip_unmap and xip_write * @@ -341,127 +228,6 @@ int xip_file_mmap(struct file * file, struct vm_area_struct * vma) } EXPORT_SYMBOL_GPL(xip_file_mmap); -static ssize_t -__xip_file_write(struct file *filp, const char __user *buf, - size_t count, loff_t pos, loff_t *ppos) -{ - struct address_space * mapping = filp->f_mapping; - const struct address_space_operations *a_ops = mapping->a_ops; - struct inode *inode = mapping->host; - long status = 0; - size_t bytes; - ssize_t written = 0; - - BUG_ON(!mapping->a_ops->get_xip_mem); - - do { - unsigned long index; - unsigned long offset; - size_t copied; - void *xip_mem; - unsigned long xip_pfn; - - offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ - index = pos >> PAGE_CACHE_SHIFT; - bytes = PAGE_CACHE_SIZE - offset; - if (bytes > count) - bytes = count; - - status = a_ops->get_xip_mem(mapping, index, 0, - &xip_mem, &xip_pfn); - if (status == -ENODATA) { - /* we allocate a new page unmap it */ - mutex_lock(&xip_sparse_mutex); - status = a_ops->get_xip_mem(mapping, index, 1, - &xip_mem, &xip_pfn); - mutex_unlock(&xip_sparse_mutex); - if (!status) - /* unmap page at pgoff from all other vmas */ - __xip_unmap(mapping, index); - } - - if (status) - break; - - copied = bytes - - __copy_from_user_nocache(xip_mem + offset, buf, bytes); - - if (likely(copied > 0)) { - status = copied; - - if (status >= 0) { - written += status; - count -= status; - pos += status; - buf += status; - } - } - if (unlikely(copied != bytes)) - if (status >= 0) - status = -EFAULT; - if (status < 0) - break; - } while (count); - *ppos = pos; - /* - * No need to use i_size_read() here, the i_size - * cannot change under us because we hold i_mutex. - */ - if (pos > inode->i_size) { - i_size_write(inode, pos); - mark_inode_dirty(inode); - } - - return written ? 
written : status; -} - -ssize_t -xip_file_write(struct file *filp, const char __user *buf, size_t len, - loff_t *ppos) -{ - struct address_space *mapping = filp->f_mapping; - struct inode *inode = mapping->host; - size_t count; - loff_t pos; - ssize_t ret; - - mutex_lock(&inode->i_mutex); - - if (!access_ok(VERIFY_READ, buf, len)) { - ret=-EFAULT; - goto out_up; - } - - pos = *ppos; - count = len; - - /* We can write back this queue in page reclaim */ - current->backing_dev_info = inode_to_bdi(inode); - - ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode)); - if (ret) - goto out_backing; - if (count == 0) - goto out_backing; - - ret = file_remove_suid(filp); - if (ret) - goto out_backing; - - ret = file_update_time(filp); - if (ret) - goto out_backing; - - ret = __xip_file_write (filp, buf, count, pos, ppos); - - out_backing: - current->backing_dev_info = NULL; - out_up: - mutex_unlock(&inode->i_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(xip_file_write); - /* * truncate a page used for execute in place * functionality is analog to block_truncate_page but does use get_xip_mem -- cgit v1.2.3 From 289c6aedac981533331428bc933fff21ae332c9e Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 16 Feb 2015 15:58:59 -0800 Subject: dax,ext2: replace ext2_clear_xip_target with dax_clear_blocks This is practically generic code; other filesystems will want to call it from other places, but there's nothing ext2-specific about it. Make it a little more generic by allowing it to take a count of the number of bytes to zero rather than fixing it to a single page. Thanks to Dave Hansen for suggesting that I need to call cond_resched() if zeroing more than one page. Signed-off-by: Matthew Wilcox Cc: Andreas Dilger Cc: Boaz Harrosh Cc: Christoph Hellwig Cc: Dave Chinner Cc: Jan Kara Cc: Jens Axboe Cc: Kirill A. 
Shutemov Cc: Mathieu Desnoyers Cc: Randy Dunlap Cc: Ross Zwisler Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 37 +++++++++++++++++++++++++++++++++++++ fs/ext2/inode.c | 8 +++++--- fs/ext2/xip.c | 14 -------------- fs/ext2/xip.h | 3 --- include/linux/fs.h | 1 + 5 files changed, 43 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/fs/dax.c b/fs/dax.c index 1a2bdbfa3ea9..69c3126a05b4 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -20,8 +20,45 @@ #include #include #include +#include #include +int dax_clear_blocks(struct inode *inode, sector_t block, long size) +{ + struct block_device *bdev = inode->i_sb->s_bdev; + sector_t sector = block << (inode->i_blkbits - 9); + + might_sleep(); + do { + void *addr; + unsigned long pfn; + long count; + + count = bdev_direct_access(bdev, sector, &addr, &pfn, size); + if (count < 0) + return count; + BUG_ON(size < count); + while (count > 0) { + unsigned pgsz = PAGE_SIZE - offset_in_page(addr); + if (pgsz > count) + pgsz = count; + if (pgsz < PAGE_SIZE) + memset(addr, 0, pgsz); + else + clear_page(addr); + addr += pgsz; + size -= pgsz; + count -= pgsz; + BUG_ON(pgsz & 511); + sector += pgsz / 512; + cond_resched(); + } + } while (size); + + return 0; +} +EXPORT_SYMBOL_GPL(dax_clear_blocks); + static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits) { unsigned long pfn; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 3ccd5fd47d66..52978b853226 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -733,10 +733,12 @@ static int ext2_get_blocks(struct inode *inode, if (IS_DAX(inode)) { /* - * we need to clear the block + * block must be initialised before we put it in the tree + * so that it's not found by another thread before it's + * initialised */ - err = ext2_clear_xip_target (inode, - le32_to_cpu(chain[depth-1].key)); + err = dax_clear_blocks(inode, le32_to_cpu(chain[depth-1].key), + 1 << inode->i_blkbits); if (err) { mutex_unlock(&ei->truncate_mutex); goto cleanup; diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c index bbc5fec6ff7f..8cfca3a4cd58 100644 --- a/fs/ext2/xip.c +++ b/fs/ext2/xip.c @@ -42,20 +42,6 @@ __ext2_get_block(struct inode *inode, pgoff_t pgoff, int create, return rc; } -int -ext2_clear_xip_target(struct inode *inode, sector_t block) -{ - void *kaddr; - unsigned long pfn; - long size; - - size = __inode_direct_access(inode, block, &kaddr, &pfn, PAGE_SIZE); - if (size < 0) - return size; - clear_page(kaddr); - return 0; -} - void ext2_xip_verify_sb(struct super_block *sb) { struct ext2_sb_info *sbi = EXT2_SB(sb); diff --git a/fs/ext2/xip.h b/fs/ext2/xip.h index 29be73781419..b2592f2f3c9d 100644 --- a/fs/ext2/xip.h +++ b/fs/ext2/xip.h @@ -7,8 +7,6 @@ #ifdef CONFIG_EXT2_FS_XIP extern void ext2_xip_verify_sb (struct super_block *); -extern int ext2_clear_xip_target (struct inode *, sector_t); - static inline int ext2_use_xip (struct super_block *sb) { struct ext2_sb_info *sbi = EXT2_SB(sb); @@ -19,6 +17,5 @@ int ext2_get_xip_mem(struct address_space *, pgoff_t, int, #else #define ext2_xip_verify_sb(sb) do { } while (0) #define ext2_use_xip(sb) 0 -#define ext2_clear_xip_target(inode, chain) 0 #define ext2_get_xip_mem NULL #endif diff --git a/include/linux/fs.h b/include/linux/fs.h index 241c3c030fb5..8084934a5676 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2589,6 +2589,7 @@ extern int nonseekable_open(struct inode * inode, struct file * filp); ssize_t dax_do_io(int rw, struct kiocb *, struct inode *, struct iov_iter *, loff_t, get_block_t, 
dio_iodone_t, int flags); +int dax_clear_blocks(struct inode *, sector_t block, long size); #ifdef CONFIG_FS_XIP extern int xip_file_mmap(struct file * file, struct vm_area_struct * vma); -- cgit v1.2.3 From f7ca90b160307d63aaedab8bd451c24a182db20f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 16 Feb 2015 15:59:02 -0800 Subject: dax,ext2: replace the XIP page fault handler with the DAX page fault handler Instead of calling aops->get_xip_mem from the fault handler, the filesystem passes a get_block_t that is used to find the appropriate blocks. This requires that all architectures implement copy_user_page(). At the time of writing, mips and arm do not. Patches exist and are in progress. [akpm@linux-foundation.org: remap_file_pages went away] Signed-off-by: Matthew Wilcox Reviewed-by: Jan Kara Cc: Andreas Dilger Cc: Boaz Harrosh Cc: Christoph Hellwig Cc: Dave Chinner Cc: Jens Axboe Cc: Kirill A. Shutemov Cc: Mathieu Desnoyers Cc: Randy Dunlap Cc: Ross Zwisler Cc: Theodore Ts'o Cc: Russell King Cc: Ralf Baechle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 241 +++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/ext2/file.c | 34 +++++++- include/linux/fs.h | 4 +- mm/filemap_xip.c | 206 --------------------------------------------- 4 files changed, 276 insertions(+), 209 deletions(-) (limited to 'include/linux') diff --git a/fs/dax.c b/fs/dax.c index 69c3126a05b4..553e55b93495 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -19,9 +19,13 @@ #include #include #include +#include +#include +#include #include #include #include +#include int dax_clear_blocks(struct inode *inode, sector_t block, long size) { @@ -221,3 +225,240 @@ ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode *inode, return retval; } EXPORT_SYMBOL_GPL(dax_do_io); + +/* + * The user has performed a load from a hole in the file. Allocating + * a new page in the file would cause excessive storage usage for + * workloads with sparse files. We allocate a page cache page instead. + * We'll kick it out of the page cache if it's ever written to, + * otherwise it will simply fall out of the page cache under memory + * pressure without ever having been dirtied. + */ +static int dax_load_hole(struct address_space *mapping, struct page *page, + struct vm_fault *vmf) +{ + unsigned long size; + struct inode *inode = mapping->host; + if (!page) + page = find_or_create_page(mapping, vmf->pgoff, + GFP_KERNEL | __GFP_ZERO); + if (!page) + return VM_FAULT_OOM; + /* Recheck i_size under page lock to avoid truncate race */ + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (vmf->pgoff >= size) { + unlock_page(page); + page_cache_release(page); + return VM_FAULT_SIGBUS; + } + + vmf->page = page; + return VM_FAULT_LOCKED; +} + +static int copy_user_bh(struct page *to, struct buffer_head *bh, + unsigned blkbits, unsigned long vaddr) +{ + void *vfrom, *vto; + if (dax_get_addr(bh, &vfrom, blkbits) < 0) + return -EIO; + vto = kmap_atomic(to); + copy_user_page(vto, vfrom, vaddr, to); + kunmap_atomic(vto); + return 0; +} + +static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, + struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct address_space *mapping = inode->i_mapping; + sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9); + unsigned long vaddr = (unsigned long)vmf->virtual_address; + void *addr; + unsigned long pfn; + pgoff_t size; + int error; + + i_mmap_lock_read(mapping); + + /* + * Check truncate didn't happen while we were allocating a block. 
+ * If it did, this block may or may not be still allocated to the + * file. We can't tell the filesystem to free it because we can't + * take i_mutex here. In the worst case, the file still has blocks + * allocated past the end of the file. + */ + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (unlikely(vmf->pgoff >= size)) { + error = -EIO; + goto out; + } + + error = bdev_direct_access(bh->b_bdev, sector, &addr, &pfn, bh->b_size); + if (error < 0) + goto out; + if (error < PAGE_SIZE) { + error = -EIO; + goto out; + } + + if (buffer_unwritten(bh) || buffer_new(bh)) + clear_page(addr); + + error = vm_insert_mixed(vma, vaddr, pfn); + + out: + i_mmap_unlock_read(mapping); + + if (bh->b_end_io) + bh->b_end_io(bh, 1); + + return error; +} + +static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, + get_block_t get_block) +{ + struct file *file = vma->vm_file; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct page *page; + struct buffer_head bh; + unsigned long vaddr = (unsigned long)vmf->virtual_address; + unsigned blkbits = inode->i_blkbits; + sector_t block; + pgoff_t size; + int error; + int major = 0; + + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (vmf->pgoff >= size) + return VM_FAULT_SIGBUS; + + memset(&bh, 0, sizeof(bh)); + block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits); + bh.b_size = PAGE_SIZE; + + repeat: + page = find_get_page(mapping, vmf->pgoff); + if (page) { + if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { + page_cache_release(page); + return VM_FAULT_RETRY; + } + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + page_cache_release(page); + goto repeat; + } + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (unlikely(vmf->pgoff >= size)) { + /* + * We have a struct page covering a hole in the file + * from a read fault and we've raced with a truncate + */ + error = -EIO; + goto unlock_page; + } + } + + error = get_block(inode, block, &bh, 0); + if (!error && (bh.b_size < PAGE_SIZE)) + error = -EIO; /* fs corruption? 
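the filesystem handed back a mapping shorter than the page we asked for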
*/ + if (error) + goto unlock_page; + + if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) { + if (vmf->flags & FAULT_FLAG_WRITE) { + error = get_block(inode, block, &bh, 1); + count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + major = VM_FAULT_MAJOR; + if (!error && (bh.b_size < PAGE_SIZE)) + error = -EIO; + if (error) + goto unlock_page; + } else { + return dax_load_hole(mapping, page, vmf); + } + } + + if (vmf->cow_page) { + struct page *new_page = vmf->cow_page; + if (buffer_written(&bh)) + error = copy_user_bh(new_page, &bh, blkbits, vaddr); + else + clear_user_highpage(new_page, vaddr); + if (error) + goto unlock_page; + vmf->page = page; + if (!page) { + i_mmap_lock_read(mapping); + /* Check we didn't race with truncate */ + size = (i_size_read(inode) + PAGE_SIZE - 1) >> + PAGE_SHIFT; + if (vmf->pgoff >= size) { + i_mmap_unlock_read(mapping); + error = -EIO; + goto out; + } + } + return VM_FAULT_LOCKED; + } + + /* Check we didn't race with a read fault installing a new page */ + if (!page && major) + page = find_lock_page(mapping, vmf->pgoff); + + if (page) { + unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, + PAGE_CACHE_SIZE, 0); + delete_from_page_cache(page); + unlock_page(page); + page_cache_release(page); + } + + error = dax_insert_mapping(inode, &bh, vma, vmf); + + out: + if (error == -ENOMEM) + return VM_FAULT_OOM | major; + /* -EBUSY is fine, somebody else faulted on the same PTE */ + if ((error < 0) && (error != -EBUSY)) + return VM_FAULT_SIGBUS | major; + return VM_FAULT_NOPAGE | major; + + unlock_page: + if (page) { + unlock_page(page); + page_cache_release(page); + } + goto out; +} + +/** + * dax_fault - handle a page fault on a DAX file + * @vma: The virtual memory area where the fault occurred + * @vmf: The description of the fault + * @get_block: The filesystem method used to translate file offsets to blocks + * + * When a page fault occurs, filesystems may call this helper in their + * fault handler for DAX files. + */ +int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, + get_block_t get_block) +{ + int result; + struct super_block *sb = file_inode(vma->vm_file)->i_sb; + + if (vmf->flags & FAULT_FLAG_WRITE) { + sb_start_pagefault(sb); + file_update_time(vma->vm_file); + } + result = do_dax_fault(vma, vmf, get_block); + if (vmf->flags & FAULT_FLAG_WRITE) + sb_end_pagefault(sb); + + return result; +} +EXPORT_SYMBOL_GPL(dax_fault); diff --git a/fs/ext2/file.c b/fs/ext2/file.c index a247123fd798..a61c93fd9dce 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -25,6 +25,36 @@ #include "xattr.h" #include "acl.h" +#ifdef CONFIG_EXT2_FS_XIP +static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return dax_fault(vma, vmf, ext2_get_block); +} + +static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return dax_mkwrite(vma, vmf, ext2_get_block); +} + +static const struct vm_operations_struct ext2_dax_vm_ops = { + .fault = ext2_dax_fault, + .page_mkwrite = ext2_dax_mkwrite, +}; + +static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + if (!IS_DAX(file_inode(file))) + return generic_file_mmap(file, vma); + + file_accessed(file); + vma->vm_ops = &ext2_dax_vm_ops; + vma->vm_flags |= VM_MIXEDMAP; + return 0; +} +#else +#define ext2_file_mmap generic_file_mmap +#endif + /* * Called when filp is released. This happens when all file descriptors * for a single struct file are closed. 
Note that different open() calls @@ -70,7 +100,7 @@ const struct file_operations ext2_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = ext2_compat_ioctl, #endif - .mmap = generic_file_mmap, + .mmap = ext2_file_mmap, .open = dquot_file_open, .release = ext2_release_file, .fsync = ext2_fsync, @@ -89,7 +119,7 @@ const struct file_operations ext2_xip_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = ext2_compat_ioctl, #endif - .mmap = xip_file_mmap, + .mmap = ext2_file_mmap, .open = dquot_file_open, .release = ext2_release_file, .fsync = ext2_fsync, diff --git a/include/linux/fs.h b/include/linux/fs.h index 8084934a5676..6bad6d4c579b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -51,6 +51,7 @@ struct swap_info_struct; struct seq_file; struct workqueue_struct; struct iov_iter; +struct vm_fault; extern void __init inode_init(void); extern void __init inode_init_early(void); @@ -2590,9 +2591,10 @@ extern int nonseekable_open(struct inode * inode, struct file * filp); ssize_t dax_do_io(int rw, struct kiocb *, struct inode *, struct iov_iter *, loff_t, get_block_t, dio_iodone_t, int flags); int dax_clear_blocks(struct inode *, sector_t block, long size); +int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); +#define dax_mkwrite(vma, vmf, gb) dax_fault(vma, vmf, gb) #ifdef CONFIG_FS_XIP -extern int xip_file_mmap(struct file * file, struct vm_area_struct * vma); extern int xip_truncate_page(struct address_space *mapping, loff_t from); #else static inline int xip_truncate_page(struct address_space *mapping, loff_t from) diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 9c869f402c07..59fb387b2238 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -22,212 +22,6 @@ #include #include -/* - * We do use our own empty page to avoid interference with other users - * of ZERO_PAGE(), such as /dev/zero - */ -static DEFINE_MUTEX(xip_sparse_mutex); -static seqcount_t xip_sparse_seq = SEQCNT_ZERO(xip_sparse_seq); -static struct page *__xip_sparse_page; - -/* called under xip_sparse_mutex */ -static struct page *xip_sparse_page(void) -{ - if (!__xip_sparse_page) { - struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO); - - if (page) - __xip_sparse_page = page; - } - return __xip_sparse_page; -} - -/* - * __xip_unmap is invoked from xip_unmap and xip_write - * - * This function walks all vmas of the address_space and unmaps the - * __xip_sparse_page when found at pgoff. - */ -static void __xip_unmap(struct address_space * mapping, unsigned long pgoff) -{ - struct vm_area_struct *vma; - struct page *page; - unsigned count; - int locked = 0; - - count = read_seqcount_begin(&xip_sparse_seq); - - page = __xip_sparse_page; - if (!page) - return; - -retry: - i_mmap_lock_read(mapping); - vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { - pte_t *pte, pteval; - spinlock_t *ptl; - struct mm_struct *mm = vma->vm_mm; - unsigned long address = vma->vm_start + - ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); - - BUG_ON(address < vma->vm_start || address >= vma->vm_end); - pte = page_check_address(page, mm, address, &ptl, 1); - if (pte) { - /* Nuke the page table entry. 
*/ - flush_cache_page(vma, address, pte_pfn(*pte)); - pteval = ptep_clear_flush(vma, address, pte); - page_remove_rmap(page); - dec_mm_counter(mm, MM_FILEPAGES); - BUG_ON(pte_dirty(pteval)); - pte_unmap_unlock(pte, ptl); - /* must invalidate_page _before_ freeing the page */ - mmu_notifier_invalidate_page(mm, address); - page_cache_release(page); - } - } - i_mmap_unlock_read(mapping); - - if (locked) { - mutex_unlock(&xip_sparse_mutex); - } else if (read_seqcount_retry(&xip_sparse_seq, count)) { - mutex_lock(&xip_sparse_mutex); - locked = 1; - goto retry; - } -} - -/* - * xip_fault() is invoked via the vma operations vector for a - * mapped memory region to read in file data during a page fault. - * - * This function is derived from filemap_fault, but used for execute in place - */ -static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct file *file = vma->vm_file; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - pgoff_t size; - void *xip_mem; - unsigned long xip_pfn; - struct page *page; - int error; - - /* XXX: are VM_FAULT_ codes OK? */ -again: - size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (vmf->pgoff >= size) - return VM_FAULT_SIGBUS; - - error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0, - &xip_mem, &xip_pfn); - if (likely(!error)) - goto found; - if (error != -ENODATA) - return VM_FAULT_OOM; - - /* sparse block */ - if ((vma->vm_flags & (VM_WRITE | VM_MAYWRITE)) && - (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) && - (!(mapping->host->i_sb->s_flags & MS_RDONLY))) { - int err; - - /* maybe shared writable, allocate new block */ - mutex_lock(&xip_sparse_mutex); - error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1, - &xip_mem, &xip_pfn); - mutex_unlock(&xip_sparse_mutex); - if (error) - return VM_FAULT_SIGBUS; - /* unmap sparse mappings at pgoff from all other vmas */ - __xip_unmap(mapping, vmf->pgoff); - -found: - /* - * We must recheck i_size under i_mmap_rwsem to prevent races - * with truncation - */ - i_mmap_lock_read(mapping); - size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; - if (unlikely(vmf->pgoff >= size)) { - i_mmap_unlock_read(mapping); - return VM_FAULT_SIGBUS; - } - err = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, - xip_pfn); - i_mmap_unlock_read(mapping); - if (err == -ENOMEM) - return VM_FAULT_OOM; - /* - * err == -EBUSY is fine, we've raced against another thread - * that faulted-in the same page - */ - if (err != -EBUSY) - BUG_ON(err); - return VM_FAULT_NOPAGE; - } else { - int err, ret = VM_FAULT_OOM; - - mutex_lock(&xip_sparse_mutex); - write_seqcount_begin(&xip_sparse_seq); - error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0, - &xip_mem, &xip_pfn); - if (unlikely(!error)) { - write_seqcount_end(&xip_sparse_seq); - mutex_unlock(&xip_sparse_mutex); - goto again; - } - if (error != -ENODATA) - goto out; - - /* - * We must recheck i_size under i_mmap_rwsem to prevent races - * with truncation - */ - i_mmap_lock_read(mapping); - size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; - if (unlikely(vmf->pgoff >= size)) { - ret = VM_FAULT_SIGBUS; - goto unlock; - } - /* not shared and writable, use xip_sparse_page() */ - page = xip_sparse_page(); - if (!page) - goto unlock; - err = vm_insert_page(vma, (unsigned long)vmf->virtual_address, - page); - if (err == -ENOMEM) - goto unlock; - - ret = VM_FAULT_NOPAGE; -unlock: - i_mmap_unlock_read(mapping); -out: - 
write_seqcount_end(&xip_sparse_seq); - mutex_unlock(&xip_sparse_mutex); - - return ret; - } -} - -static const struct vm_operations_struct xip_file_vm_ops = { - .fault = xip_file_fault, - .page_mkwrite = filemap_page_mkwrite, -}; - -int xip_file_mmap(struct file * file, struct vm_area_struct * vma) -{ - BUG_ON(!file->f_mapping->a_ops->get_xip_mem); - - file_accessed(file); - vma->vm_ops = &xip_file_vm_ops; - vma->vm_flags |= VM_MIXEDMAP; - return 0; -} -EXPORT_SYMBOL_GPL(xip_file_mmap); - /* * truncate a page used for execute in place * functionality is analog to block_truncate_page but does use get_xip_mem -- cgit v1.2.3 From 4c0ccfef2e9f7418a6eb0bf07a2fc8f216365b18 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 16 Feb 2015 15:59:06 -0800 Subject: dax,ext2: replace xip_truncate_page with dax_truncate_page It takes a get_block parameter just like nobh_truncate_page() and block_truncate_page() Signed-off-by: Matthew Wilcox Reviewed-by: Mathieu Desnoyers Cc: Andreas Dilger Cc: Boaz Harrosh Cc: Christoph Hellwig Cc: Dave Chinner Cc: Jan Kara Cc: Jens Axboe Cc: Kirill A. Shutemov Cc: Randy Dunlap Cc: Ross Zwisler Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ fs/ext2/inode.c | 2 +- include/linux/fs.h | 10 +--------- mm/filemap_xip.c | 40 ---------------------------------------- 4 files changed, 46 insertions(+), 50 deletions(-) (limited to 'include/linux') diff --git a/fs/dax.c b/fs/dax.c index 553e55b93495..ebf9b2231b0f 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -462,3 +462,47 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, return result; } EXPORT_SYMBOL_GPL(dax_fault); + +/** + * dax_truncate_page - handle a partial page being truncated in a DAX file + * @inode: The file being truncated + * @from: The file offset that is being truncated to + * @get_block: The filesystem method used to translate file offsets to blocks + * + * Similar to block_truncate_page(), this function can be called by a + * filesystem when it is truncating an DAX file to handle the partial page. + * + * We work in terms of PAGE_CACHE_SIZE here for commonality with + * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem + * took care of disposing of the unnecessary blocks. Even if the filesystem + * block size is smaller than PAGE_SIZE, we have to zero the rest of the page + * since the file might be mmaped. + */ +int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block) +{ + struct buffer_head bh; + pgoff_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned length = PAGE_CACHE_ALIGN(from) - from; + int err; + + /* Block boundary? 
Nothing to do */ + if (!length) + return 0; + + memset(&bh, 0, sizeof(bh)); + bh.b_size = PAGE_CACHE_SIZE; + err = get_block(inode, index, &bh, 0); + if (err < 0) + return err; + if (buffer_written(&bh)) { + void *addr; + err = dax_get_addr(&bh, &addr, inode->i_blkbits); + if (err < 0) + return err; + memset(addr + offset, 0, length); + } + + return 0; +} +EXPORT_SYMBOL_GPL(dax_truncate_page); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 52978b853226..5ac0a3461b3c 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1210,7 +1210,7 @@ static int ext2_setsize(struct inode *inode, loff_t newsize) inode_dio_wait(inode); if (IS_DAX(inode)) - error = xip_truncate_page(inode->i_mapping, newsize); + error = dax_truncate_page(inode, newsize, ext2_get_block); else if (test_opt(inode->i_sb, NOBH)) error = nobh_truncate_page(inode->i_mapping, newsize, ext2_get_block); diff --git a/include/linux/fs.h b/include/linux/fs.h index 6bad6d4c579b..2c8f9055af38 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2591,18 +2591,10 @@ extern int nonseekable_open(struct inode * inode, struct file * filp); ssize_t dax_do_io(int rw, struct kiocb *, struct inode *, struct iov_iter *, loff_t, get_block_t, dio_iodone_t, int flags); int dax_clear_blocks(struct inode *, sector_t block, long size); +int dax_truncate_page(struct inode *, loff_t from, get_block_t); int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); #define dax_mkwrite(vma, vmf, gb) dax_fault(vma, vmf, gb) -#ifdef CONFIG_FS_XIP -extern int xip_truncate_page(struct address_space *mapping, loff_t from); -#else -static inline int xip_truncate_page(struct address_space *mapping, loff_t from) -{ - return 0; -} -#endif - #ifdef CONFIG_BLOCK typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode, loff_t file_offset); diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 59fb387b2238..8e3f99b61959 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -22,43 +22,3 @@ #include #include -/* - * truncate a page used for execute in place - * functionality is analog to block_truncate_page but does use get_xip_mem - * to get the page instead of page cache - */ -int -xip_truncate_page(struct address_space *mapping, loff_t from) -{ - pgoff_t index = from >> PAGE_CACHE_SHIFT; - unsigned offset = from & (PAGE_CACHE_SIZE-1); - unsigned blocksize; - unsigned length; - void *xip_mem; - unsigned long xip_pfn; - int err; - - BUG_ON(!mapping->a_ops->get_xip_mem); - - blocksize = 1 << mapping->host->i_blkbits; - length = offset & (blocksize - 1); - - /* Block boundary? Nothing to do */ - if (!length) - return 0; - - length = blocksize - length; - - err = mapping->a_ops->get_xip_mem(mapping, index, 0, - &xip_mem, &xip_pfn); - if (unlikely(err)) { - if (err == -ENODATA) - /* Hole? No need to truncate */ - return 0; - else - return err; - } - memset(xip_mem + offset, 0, length); - return 0; -} -EXPORT_SYMBOL_GPL(xip_truncate_page); -- cgit v1.2.3 From e748dcd095ddee50e7a7deda2e26247715318a2e Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 16 Feb 2015 15:59:12 -0800 Subject: vfs: remove get_xip_mem All callers of get_xip_mem() are now gone. Remove checks for it, initialisers of it, documentation of it and the only implementation of it. Also remove mm/filemap_xip.c as it is now empty. Also remove documentation of the long-gone get_xip_page(). Signed-off-by: Matthew Wilcox Cc: Andreas Dilger Cc: Boaz Harrosh Cc: Christoph Hellwig Cc: Dave Chinner Cc: Jan Kara Cc: Jens Axboe Cc: Kirill A. 
Shutemov Cc: Mathieu Desnoyers Cc: Randy Dunlap Cc: Ross Zwisler Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/Locking | 3 --- Documentation/filesystems/vfs.txt | 7 ------ fs/exofs/inode.c | 1 - fs/ext2/inode.c | 1 - fs/ext2/xip.c | 45 --------------------------------------- fs/ext2/xip.h | 3 --- fs/open.c | 5 +---- include/linux/fs.h | 2 -- include/linux/rmap.h | 2 +- mm/Makefile | 1 - mm/fadvise.c | 6 ++++-- mm/filemap_xip.c | 24 --------------------- mm/madvise.c | 2 +- 13 files changed, 7 insertions(+), 95 deletions(-) delete mode 100644 mm/filemap_xip.c (limited to 'include/linux') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index b30753cbf431..2ca3d17eee56 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -199,8 +199,6 @@ prototypes: int (*releasepage) (struct page *, int); void (*freepage)(struct page *); int (*direct_IO)(int, struct kiocb *, struct iov_iter *iter, loff_t offset); - int (*get_xip_mem)(struct address_space *, pgoff_t, int, void **, - unsigned long *); int (*migratepage)(struct address_space *, struct page *, struct page *); int (*launder_page)(struct page *); int (*is_partially_uptodate)(struct page *, unsigned long, unsigned long); @@ -225,7 +223,6 @@ invalidatepage: yes releasepage: yes freepage: yes direct_IO: -get_xip_mem: maybe migratepage: yes (both) launder_page: yes is_partially_uptodate: yes diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 43ce0507ee25..966b22829f3b 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -591,8 +591,6 @@ struct address_space_operations { int (*releasepage) (struct page *, int); void (*freepage)(struct page *); ssize_t (*direct_IO)(int, struct kiocb *, struct iov_iter *iter, loff_t offset); - struct page* (*get_xip_page)(struct address_space *, sector_t, - int); /* migrate the contents of a page to the specified target */ int (*migratepage) (struct page *, struct page *); int (*launder_page) (struct page *); @@ -748,11 +746,6 @@ struct address_space_operations { and transfer data directly between the storage and the application's address space. - get_xip_page: called by the VM to translate a block number to a page. - The page is valid until the corresponding filesystem is unmounted. - Filesystems that want to use execute-in-place (XIP) need to implement - it. An example implementation can be found in fs/ext2/xip.c. - migrate_page: This is used to compact the physical memory usage. 
If the VM wants to relocate a page (maybe off a memory card that is signalling imminent failure) it will pass a new page diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 6fc91df99ff8..a198e94813fe 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -985,7 +985,6 @@ const struct address_space_operations exofs_aops = { .direct_IO = exofs_direct_IO, /* With these NULL has special meaning or default is not exported */ - .get_xip_mem = NULL, .migratepage = NULL, .launder_page = NULL, .is_partially_uptodate = NULL, diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 5ac0a3461b3c..59d6c7d43740 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -894,7 +894,6 @@ const struct address_space_operations ext2_aops = { const struct address_space_operations ext2_aops_xip = { .bmap = ext2_bmap, - .get_xip_mem = ext2_get_xip_mem, .direct_IO = ext2_direct_IO, }; diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c index 8cfca3a4cd58..132d4daf98c4 100644 --- a/fs/ext2/xip.c +++ b/fs/ext2/xip.c @@ -13,35 +13,6 @@ #include "ext2.h" #include "xip.h" -static inline long __inode_direct_access(struct inode *inode, sector_t block, - void **kaddr, unsigned long *pfn, long size) -{ - struct block_device *bdev = inode->i_sb->s_bdev; - sector_t sector = block * (PAGE_SIZE / 512); - return bdev_direct_access(bdev, sector, kaddr, pfn, size); -} - -static inline int -__ext2_get_block(struct inode *inode, pgoff_t pgoff, int create, - sector_t *result) -{ - struct buffer_head tmp; - int rc; - - memset(&tmp, 0, sizeof(struct buffer_head)); - tmp.b_size = 1 << inode->i_blkbits; - rc = ext2_get_block(inode, pgoff, &tmp, create); - *result = tmp.b_blocknr; - - /* did we get a sparse block (hole in the file)? */ - if (!tmp.b_blocknr && !rc) { - BUG_ON(create); - rc = -ENODATA; - } - - return rc; -} - void ext2_xip_verify_sb(struct super_block *sb) { struct ext2_sb_info *sbi = EXT2_SB(sb); @@ -54,19 +25,3 @@ void ext2_xip_verify_sb(struct super_block *sb) "not supported by bdev"); } } - -int ext2_get_xip_mem(struct address_space *mapping, pgoff_t pgoff, int create, - void **kmem, unsigned long *pfn) -{ - long rc; - sector_t block; - - /* first, retrieve the sector number */ - rc = __ext2_get_block(mapping->host, pgoff, create, &block); - if (rc) - return rc; - - /* retrieve address of the target data */ - rc = __inode_direct_access(mapping->host, block, kmem, pfn, PAGE_SIZE); - return (rc < 0) ? 
rc : 0; -} diff --git a/fs/ext2/xip.h b/fs/ext2/xip.h index b2592f2f3c9d..e7b9f0a2cc54 100644 --- a/fs/ext2/xip.h +++ b/fs/ext2/xip.h @@ -12,10 +12,7 @@ static inline int ext2_use_xip (struct super_block *sb) struct ext2_sb_info *sbi = EXT2_SB(sb); return (sbi->s_mount_opt & EXT2_MOUNT_XIP); } -int ext2_get_xip_mem(struct address_space *, pgoff_t, int, - void **, unsigned long *); #else #define ext2_xip_verify_sb(sb) do { } while (0) #define ext2_use_xip(sb) 0 -#define ext2_get_xip_mem NULL #endif diff --git a/fs/open.c b/fs/open.c index 813be037b412..a293c2020676 100644 --- a/fs/open.c +++ b/fs/open.c @@ -667,11 +667,8 @@ int open_check_o_direct(struct file *f) { /* NB: we're sure to have correct a_ops only after f_op->open */ if (f->f_flags & O_DIRECT) { - if (!f->f_mapping->a_ops || - ((!f->f_mapping->a_ops->direct_IO) && - (!f->f_mapping->a_ops->get_xip_mem))) { + if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO) return -EINVAL; - } } return 0; } diff --git a/include/linux/fs.h b/include/linux/fs.h index 2c8f9055af38..9772d655f444 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -362,8 +362,6 @@ struct address_space_operations { int (*releasepage) (struct page *, gfp_t); void (*freepage)(struct page *); ssize_t (*direct_IO)(int, struct kiocb *, struct iov_iter *iter, loff_t offset); - int (*get_xip_mem)(struct address_space *, pgoff_t, int, - void **, unsigned long *); /* * migrate the contents of a page to the specified target. If * migrate_mode is MIGRATE_ASYNC, it must not block. diff --git a/include/linux/rmap.h b/include/linux/rmap.h index b38f559130d5..c4c559a45dc8 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -198,7 +198,7 @@ int page_referenced(struct page *, int is_locked, int try_to_unmap(struct page *, enum ttu_flags flags); /* - * Called from mm/filemap_xip.c to unmap empty zero page + * Used by uprobes to replace a userspace page safely */ pte_t *__page_check_address(struct page *, struct mm_struct *, unsigned long, spinlock_t **, int); diff --git a/mm/Makefile b/mm/Makefile index 088c68e9ec35..3c1caa2693bd 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -55,7 +55,6 @@ obj-$(CONFIG_KMEMCHECK) += kmemcheck.o obj-$(CONFIG_KASAN) += kasan/ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o -obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o diff --git a/mm/fadvise.c b/mm/fadvise.c index fac23ecf8d72..4a3907cf79f8 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -28,6 +28,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) { struct fd f = fdget(fd); + struct inode *inode; struct address_space *mapping; struct backing_dev_info *bdi; loff_t endbyte; /* inclusive */ @@ -39,7 +40,8 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) if (!f.file) return -EBADF; - if (S_ISFIFO(file_inode(f.file)->i_mode)) { + inode = file_inode(f.file); + if (S_ISFIFO(inode->i_mode)) { ret = -ESPIPE; goto out; } @@ -50,7 +52,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) goto out; } - if (mapping->a_ops->get_xip_mem) { + if (IS_DAX(inode)) { switch (advice) { case POSIX_FADV_NORMAL: case POSIX_FADV_RANDOM: diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c deleted file mode 100644 index 8e3f99b61959..000000000000 --- a/mm/filemap_xip.c +++ /dev/null @@ -1,24 +0,0 @@ -/* - * linux/mm/filemap_xip.c - * - * Copyright 
(C) 2005 IBM Corporation - * Author: Carsten Otte - * - * derived from linux/mm/filemap.c - Copyright (C) Linus Torvalds - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - diff --git a/mm/madvise.c b/mm/madvise.c index 1077cbdc8b52..d551475517bf 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -239,7 +239,7 @@ static long madvise_willneed(struct vm_area_struct *vma, return -EBADF; #endif - if (file->f_mapping->a_ops->get_xip_mem) { + if (IS_DAX(file_inode(file))) { /* no bad return value, but ignore advice */ return 0; } -- cgit v1.2.3 From 6cd176a51e52e5218b1aa97e1ec916bac25a9b7e Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 16 Feb 2015 15:59:25 -0800 Subject: vfs,ext2: remove CONFIG_EXT2_FS_XIP and rename CONFIG_FS_XIP to CONFIG_FS_DAX The fewer Kconfig options we have the better. Use the generic CONFIG_FS_DAX to enable XIP support in ext2 as well as in the core. Signed-off-by: Matthew Wilcox Cc: Andreas Dilger Cc: Boaz Harrosh Cc: Christoph Hellwig Cc: Dave Chinner Cc: Jan Kara Cc: Jens Axboe Cc: Kirill A. Shutemov Cc: Mathieu Desnoyers Cc: Randy Dunlap Cc: Ross Zwisler Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Kconfig | 21 ++++++++++++++------- fs/Makefile | 2 +- fs/ext2/Kconfig | 11 ----------- fs/ext2/ext2.h | 2 +- fs/ext2/file.c | 4 ++-- fs/ext2/super.c | 4 ++-- include/linux/fs.h | 2 +- scripts/diffconfig | 1 - 8 files changed, 21 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/fs/Kconfig b/fs/Kconfig index a6bb530b1ec5..5331497d5b25 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -13,13 +13,6 @@ if BLOCK source "fs/ext2/Kconfig" source "fs/ext3/Kconfig" source "fs/ext4/Kconfig" - -config FS_XIP -# execute in place - bool - depends on EXT2_FS_XIP - default y - source "fs/jbd/Kconfig" source "fs/jbd2/Kconfig" @@ -40,6 +33,20 @@ source "fs/ocfs2/Kconfig" source "fs/btrfs/Kconfig" source "fs/nilfs2/Kconfig" +config FS_DAX + bool "Direct Access (DAX) support" + depends on MMU + help + Direct Access (DAX) can be used on memory-backed block devices. + If the block device supports DAX and the filesystem supports DAX, + then you can avoid using the pagecache to buffer I/Os. Turning + on this option will compile in support for DAX; you will need to + mount the filesystem using the -o dax option. + + If you do not have a block device that is capable of using this, + or if unsure, say N. Saying Y will increase the size of the kernel + by about 5kB. + endif # BLOCK # Posix ACL utility routines diff --git a/fs/Makefile b/fs/Makefile index 0534444e257c..0f4635f7c49c 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -28,7 +28,7 @@ obj-$(CONFIG_SIGNALFD) += signalfd.o obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o obj-$(CONFIG_AIO) += aio.o -obj-$(CONFIG_FS_XIP) += dax.o +obj-$(CONFIG_FS_DAX) += dax.o obj-$(CONFIG_FILE_LOCKING) += locks.o obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig index 14a6780fd034..c634874e12d9 100644 --- a/fs/ext2/Kconfig +++ b/fs/ext2/Kconfig @@ -42,14 +42,3 @@ config EXT2_FS_SECURITY If you are not using a security module that requires using extended attributes for file security labels, say N. - -config EXT2_FS_XIP - bool "Ext2 execute in place support" - depends on EXT2_FS && MMU - help - Execute in place can be used on memory-backed block devices. 
If you - enable this option, you can select to mount block devices which are - capable of this feature without using the page cache. - - If you do not use a block device that is capable of using this, - or if unsure, say N. diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 30604c4d70e6..6854038c09ae 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -380,7 +380,7 @@ struct ext2_inode { #define EXT2_MOUNT_NO_UID32 0x000200 /* Disable 32-bit UIDs */ #define EXT2_MOUNT_XATTR_USER 0x004000 /* Extended user attributes */ #define EXT2_MOUNT_POSIX_ACL 0x008000 /* POSIX Access Control Lists */ -#ifdef CONFIG_FS_XIP +#ifdef CONFIG_FS_DAX #define EXT2_MOUNT_XIP 0x010000 /* Execute in place */ #else #define EXT2_MOUNT_XIP 0 diff --git a/fs/ext2/file.c b/fs/ext2/file.c index a61c93fd9dce..de8174d1e973 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -25,7 +25,7 @@ #include "xattr.h" #include "acl.h" -#ifdef CONFIG_EXT2_FS_XIP +#ifdef CONFIG_FS_DAX static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { return dax_fault(vma, vmf, ext2_get_block); @@ -108,7 +108,7 @@ const struct file_operations ext2_file_operations = { .splice_write = iter_file_splice_write, }; -#ifdef CONFIG_EXT2_FS_XIP +#ifdef CONFIG_FS_DAX const struct file_operations ext2_xip_file_operations = { .llseek = generic_file_llseek, .read = new_sync_read, diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 50342583db1f..5f029d8c3a02 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -291,7 +291,7 @@ static int ext2_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",grpquota"); #endif -#if defined(CONFIG_EXT2_FS_XIP) +#ifdef CONFIG_FS_DAX if (sbi->s_mount_opt & EXT2_MOUNT_XIP) seq_puts(seq, ",xip"); #endif @@ -558,7 +558,7 @@ static int parse_options(char *options, struct super_block *sb) break; #endif case Opt_xip: -#ifdef CONFIG_EXT2_FS_XIP +#ifdef CONFIG_FS_DAX set_opt (sbi->s_mount_opt, XIP); #else ext2_msg(sb, KERN_INFO, "xip option not supported"); diff --git a/include/linux/fs.h b/include/linux/fs.h index 9772d655f444..d46f8fe6a0ea 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1676,7 +1676,7 @@ struct super_operations { #define S_IMA 1024 /* Inode has an associated IMA struct */ #define S_AUTOMOUNT 2048 /* Automount/referral quasi-directory */ #define S_NOSEC 4096 /* no suid or xattr security attributes */ -#ifdef CONFIG_FS_XIP +#ifdef CONFIG_FS_DAX #define S_DAX 8192 /* Direct Access, avoiding the page cache */ #else #define S_DAX 0 /* Make all the DAX code disappear */ diff --git a/scripts/diffconfig b/scripts/diffconfig index 6d672836e187..0db267d0adc9 100755 --- a/scripts/diffconfig +++ b/scripts/diffconfig @@ -28,7 +28,6 @@ If no config files are specified, .config and .config.old are used. Example usage: $ diffconfig .config config-with-some-changes -EXT2_FS_XATTR n --EXT2_FS_XIP n CRAMFS n -> y EXT2_FS y -> n LOG_BUF_SHIFT 14 -> 16 -- cgit v1.2.3 From 25726bc15731d42112b579cf73f30edbc43d3973 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 16 Feb 2015 15:59:35 -0800 Subject: dax: add dax_zero_page_range This new function allows us to support hole-punch for DAX files by zeroing a partial page, as opposed to the dax_truncate_page() function which can only truncate to the end of the page. Reimplement dax_truncate_page() to call dax_zero_page_range(). 
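As a sketch of the intended hole-punch use, assuming a hypothetical fs_punch_hole() caller (nothing in this series implements hole punch yet) and borrowing ext2_get_block to stand in for the filesystem's get_block callback: dax_zero_page_range() stays within one page per call, so the two unaligned edges of the punched range are zeroed separately, and the whole pages in between are left to the filesystem's own block deallocator.

	/* Hypothetical caller; illustrative only, not part of this patch. */
	static int fs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
	{
		loff_t end = offset + len;
		unsigned partial = PAGE_CACHE_ALIGN(offset) - offset;
		int err = 0;

		/* Zero from 'offset' up to the next page boundary or 'end',
		 * whichever comes first. */
		if (partial) {
			if (partial > len)
				partial = len;
			err = dax_zero_page_range(inode, offset, partial,
						  ext2_get_block);
		}

		/* Zero the head of the last page, if 'end' is unaligned and
		 * lies beyond the page containing 'offset'. */
		if (!err && end > PAGE_CACHE_ALIGN(offset) &&
		    (end & (PAGE_CACHE_SIZE - 1)))
			err = dax_zero_page_range(inode,
					end & ~((loff_t)PAGE_CACHE_SIZE - 1),
					end & (PAGE_CACHE_SIZE - 1),
					ext2_get_block);

		/* Whole pages in between are freed by the filesystem. */
		return err;
	}
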
[ross.zwisler@linux.intel.com: ported to 3.13-rc2] [akpm@linux-foundation.org: fix typos in comments] Signed-off-by: Matthew Wilcox Signed-off-by: Ross Zwisler Cc: Andreas Dilger Cc: Boaz Harrosh Cc: Christoph Hellwig Cc: Dave Chinner Cc: Jan Kara Cc: Jens Axboe Cc: Kirill A. Shutemov Cc: Mathieu Desnoyers Cc: Randy Dunlap Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/dax.txt | 1 + fs/dax.c | 38 ++++++++++++++++++++++++++++++++------ include/linux/fs.h | 1 + 3 files changed, 34 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt index 635adaa1e425..ebcd97f03654 100644 --- a/Documentation/filesystems/dax.txt +++ b/Documentation/filesystems/dax.txt @@ -62,6 +62,7 @@ Filesystem support consists of for fault and page_mkwrite (which should probably call dax_fault() and dax_mkwrite(), passing the appropriate get_block() callback) - calling dax_truncate_page() instead of block_truncate_page() for DAX files +- calling dax_zero_page_range() instead of zero_user() for DAX files - ensuring that there is sufficient locking between reads, writes, truncates and page faults diff --git a/fs/dax.c b/fs/dax.c index ebf9b2231b0f..ed1619ec6537 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -464,31 +464,35 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, EXPORT_SYMBOL_GPL(dax_fault); /** - * dax_truncate_page - handle a partial page being truncated in a DAX file + * dax_zero_page_range - zero a range within a page of a DAX file * @inode: The file being truncated * @from: The file offset that is being truncated to + * @length: The number of bytes to zero * @get_block: The filesystem method used to translate file offsets to blocks * - * Similar to block_truncate_page(), this function can be called by a - * filesystem when it is truncating an DAX file to handle the partial page. + * This function can be called by a filesystem when it is zeroing part of a + * page in a DAX file. This is intended for hole-punch operations. If + * you are truncating a file, the helper function dax_truncate_page() may be + * more convenient. * * We work in terms of PAGE_CACHE_SIZE here for commonality with * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem * took care of disposing of the unnecessary blocks. Even if the filesystem * block size is smaller than PAGE_SIZE, we have to zero the rest of the page - * since the file might be mmaped. + * since the file might be mmapped. */ -int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block) +int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length, + get_block_t get_block) { struct buffer_head bh; pgoff_t index = from >> PAGE_CACHE_SHIFT; unsigned offset = from & (PAGE_CACHE_SIZE-1); - unsigned length = PAGE_CACHE_ALIGN(from) - from; int err; /* Block boundary? 
Nothing to do */ if (!length) return 0; + BUG_ON((offset + length) > PAGE_CACHE_SIZE); memset(&bh, 0, sizeof(bh)); bh.b_size = PAGE_CACHE_SIZE; @@ -505,4 +509,26 @@ int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block) return 0; } +EXPORT_SYMBOL_GPL(dax_zero_page_range); + +/** + * dax_truncate_page - handle a partial page being truncated in a DAX file + * @inode: The file being truncated + * @from: The file offset that is being truncated to + * @get_block: The filesystem method used to translate file offsets to blocks + * + * Similar to block_truncate_page(), this function can be called by a + * filesystem when it is truncating a DAX file to handle the partial page. + * + * We work in terms of PAGE_CACHE_SIZE here for commonality with + * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem + * took care of disposing of the unnecessary blocks. Even if the filesystem + * block size is smaller than PAGE_SIZE, we have to zero the rest of the page + * since the file might be mmapped. + */ +int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block) +{ + unsigned length = PAGE_CACHE_ALIGN(from) - from; + return dax_zero_page_range(inode, from, length, get_block); +} EXPORT_SYMBOL_GPL(dax_truncate_page); diff --git a/include/linux/fs.h b/include/linux/fs.h index d46f8fe6a0ea..ed5a0900b94d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2589,6 +2589,7 @@ extern int nonseekable_open(struct inode * inode, struct file * filp); ssize_t dax_do_io(int rw, struct kiocb *, struct inode *, struct iov_iter *, loff_t, get_block_t, dio_iodone_t, int flags); int dax_clear_blocks(struct inode *, sector_t block, long size); +int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t); int dax_truncate_page(struct inode *, loff_t from, get_block_t); int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); #define dax_mkwrite(vma, vmf, gb) dax_fault(vma, vmf, gb) -- cgit v1.2.3 From aaaf5fbf56f16c81a653713cc333b18ad6e25ea9 Mon Sep 17 00:00:00 2001 From: Joshua Kinard Date: Mon, 16 Feb 2015 16:00:26 -0800 Subject: rtc: add driver for DS1685 family of real time clocks This adds a driver for the Dallas/Maxim DS1685-family of RTC chips. It supports the DS1685/DS1687, DS1688/DS1691, DS1689/DS1693, DS17285/DS17287, DS17485/DS17487, and DS17885/DS17887 RTC chips. These chips are commonly found in SGI O2 and SGI Octane systems. It was originally derived from a driver patch submitted by Matthias Fuchs many years ago for use in EPPC-405-UC modules, which also used these RTCs. In addition to the time-keeping functions, this RTC also handles the shutdown mechanism of the O2 and Octane and acts as a partial NVRAM for the boot PROMS in these systems. Verified on both an SGI O2 and an SGI Octane. 
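The register accessors in the driver below are memory-mapped defaults; a platform may supply its own read/write hooks through struct ds1685_priv ("Standard read/write functions if platform does not provide overrides"). As a sketch only — the names and the indirect register layout here are assumptions, not something this patch defines — an override pair for a board that exposes the chip through an address/data register pair could look like:

	/* Illustrative only: indirect address/data access at rtc->regs. */
	static u8 my_plat_read(struct ds1685_priv *rtc, int reg)
	{
		writeb(reg, (u8 __iomem *)rtc->regs);	/* select register */
		return readb((u8 __iomem *)rtc->regs + 1);
	}

	static void my_plat_write(struct ds1685_priv *rtc, int reg, u8 value)
	{
		writeb(reg, (u8 __iomem *)rtc->regs);	/* select register */
		writeb(value, (u8 __iomem *)rtc->regs + 1);
	}
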
Signed-off-by: Joshua Kinard Cc: Ralf Baechle Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 6 + drivers/rtc/Kconfig | 90 ++ drivers/rtc/Makefile | 1 + drivers/rtc/rtc-ds1685.c | 2252 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/rtc/ds1685.h | 375 ++++++++ 5 files changed, 2724 insertions(+) create mode 100644 drivers/rtc/rtc-ds1685.c create mode 100644 include/linux/rtc/ds1685.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 6f2c849e6da3..d5ff4c3cdbac 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2963,6 +2963,12 @@ S: Supported F: drivers/input/touchscreen/cyttsp* F: include/linux/input/cyttsp.h +DALLAS/MAXIM DS1685-FAMILY REAL TIME CLOCK +M: Joshua Kinard +S: Maintained +F: drivers/rtc/rtc-ds1685.c +F: include/linux/rtc/ds1685.h + DAMA SLAVE for AX.25 M: Joerg Reuter W: http://yaina.de/jreuter/ diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index 3bc9ddbe5cf7..0cf2e1d9cb17 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -801,6 +801,96 @@ config RTC_DRV_DS1553 This driver can also be built as a module. If so, the module will be called rtc-ds1553. +config RTC_DRV_DS1685_FAMILY + tristate "Dallas/Maxim DS1685 Family" + help + If you say yes here you get support for the Dallas/Maxim DS1685 + family of real time chips. This family includes the DS1685/DS1687, + DS1689/DS1693, DS17285/DS17287, DS17485/DS17487, and + DS17885/DS17887 chips. + + This driver can also be built as a module. If so, the module + will be called rtc-ds1685. + +choice + prompt "Subtype" + depends on RTC_DRV_DS1685_FAMILY + default RTC_DRV_DS1685 + +config RTC_DRV_DS1685 + bool "DS1685/DS1687" + help + This enables support for the Dallas/Maxim DS1685/DS1687 real time + clock chip. + + This chip is commonly found in SGI O2 (IP32) and SGI Octane (IP30) + systems, as well as EPPC-405-UC modules by electronic system design + GmbH. + +config RTC_DRV_DS1689 + bool "DS1689/DS1693" + help + This enables support for the Dallas/Maxim DS1689/DS1693 real time + clock chip. + + This is an older RTC chip, supplanted by the DS1685/DS1687 above, + which supports a few minor features such as Vcc, Vbat, and Power + Cycle counters, plus a customer-specific, 8-byte ROM/Serial number. + + It also works for the even older DS1688/DS1691 RTC chips, which are + virtually the same and carry the same model number. Both chips + have 114 bytes of user NVRAM. + +config RTC_DRV_DS17285 + bool "DS17285/DS17287" + help + This enables support for the Dallas/Maxim DS17285/DS17287 real time + clock chip. + + This chip features 2kb of extended NV-SRAM. It may possibly be + found in some SGI O2 systems (rare). + +config RTC_DRV_DS17485 + bool "DS17485/DS17487" + help + This enables support for the Dallas/Maxim DS17485/DS17487 real time + clock chip. + + This chip features 4kb of extended NV-SRAM. + +config RTC_DRV_DS17885 + bool "DS17885/DS17887" + help + This enables support for the Dallas/Maxim DS17885/DS17887 real time + clock chip. + + This chip features 8kb of extended NV-SRAM. + +endchoice + +config RTC_DS1685_PROC_REGS + bool "Display register values in /proc" + depends on RTC_DRV_DS1685_FAMILY && PROC_FS + help + Enable this to display a readout of all of the RTC registers in + /proc/drivers/rtc. Keep in mind that this can potentially lead + to lost interrupts, as reading Control Register C will clear + all pending IRQ flags. + + Unless you are debugging this driver, choose N. 
+ +config RTC_DS1685_SYSFS_REGS + bool "SysFS access to RTC register bits" + depends on RTC_DRV_DS1685_FAMILY && SYSFS + help + Enable this to provide access to the RTC control register bits + in /sys. Some of the bits are read-write, others are read-only. + + Keep in mind that reading Control C's bits automatically clears + all pending IRQ flags - this can cause lost interrupts. + + If you know that you need access to these bits, choose Y, Else N. + config RTC_DRV_DS1742 tristate "Maxim/Dallas DS1742/1743" depends on HAS_IOMEM diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile index 99ded8b75e95..69c87062b098 100644 --- a/drivers/rtc/Makefile +++ b/drivers/rtc/Makefile @@ -54,6 +54,7 @@ obj-$(CONFIG_RTC_DRV_DS1390) += rtc-ds1390.o obj-$(CONFIG_RTC_DRV_DS1511) += rtc-ds1511.o obj-$(CONFIG_RTC_DRV_DS1553) += rtc-ds1553.o obj-$(CONFIG_RTC_DRV_DS1672) += rtc-ds1672.o +obj-$(CONFIG_RTC_DRV_DS1685_FAMILY) += rtc-ds1685.o obj-$(CONFIG_RTC_DRV_DS1742) += rtc-ds1742.o obj-$(CONFIG_RTC_DRV_DS2404) += rtc-ds2404.o obj-$(CONFIG_RTC_DRV_DS3232) += rtc-ds3232.o diff --git a/drivers/rtc/rtc-ds1685.c b/drivers/rtc/rtc-ds1685.c new file mode 100644 index 000000000000..8c3bfcb115b7 --- /dev/null +++ b/drivers/rtc/rtc-ds1685.c @@ -0,0 +1,2252 @@ +/* + * An rtc driver for the Dallas/Maxim DS1685/DS1687 and related real-time + * chips. + * + * Copyright (C) 2011-2014 Joshua Kinard . + * Copyright (C) 2009 Matthias Fuchs . + * + * References: + * DS1685/DS1687 3V/5V Real-Time Clocks, 19-5215, Rev 4/10. + * DS17x85/DS17x87 3V/5V Real-Time Clocks, 19-5222, Rev 4/10. + * DS1689/DS1693 3V/5V Serialized Real-Time Clocks, Rev 112105. + * Application Note 90, Using the Multiplex Bus RTC Extended Features. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef CONFIG_PROC_FS +#include +#endif + +#define DRV_VERSION "0.42.0" + + +/* ----------------------------------------------------------------------- */ +/* Standard read/write functions if platform does not provide overrides */ + +/** + * ds1685_read - read a value from an rtc register. + * @rtc: pointer to the ds1685 rtc structure. + * @reg: the register address to read. + */ +static u8 +ds1685_read(struct ds1685_priv *rtc, int reg) +{ + return readb((u8 __iomem *)rtc->regs + + (reg * rtc->regstep)); +} + +/** + * ds1685_write - write a value to an rtc register. + * @rtc: pointer to the ds1685 rtc structure. + * @reg: the register address to write. + * @value: value to write to the register. + */ +static void +ds1685_write(struct ds1685_priv *rtc, int reg, u8 value) +{ + writeb(value, ((u8 __iomem *)rtc->regs + + (reg * rtc->regstep))); +} +/* ----------------------------------------------------------------------- */ + + +/* ----------------------------------------------------------------------- */ +/* Inlined functions */ + +/** + * ds1685_rtc_bcd2bin - bcd2bin wrapper in case platform doesn't support BCD. + * @rtc: pointer to the ds1685 rtc structure. + * @val: u8 time value to consider converting. + * @bcd_mask: u8 mask value if BCD mode is used. + * @bin_mask: u8 mask value if BIN mode is used. + * + * Returns the value, converted to BIN if originally in BCD and bcd_mode TRUE. 
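+ * Example: a raw seconds value of 0x45 converts to 45 decimal when
+ * bcd_mode is TRUE; when FALSE the value is already binary and is only
+ * masked with bin_mask.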
+ */ +static inline u8 +ds1685_rtc_bcd2bin(struct ds1685_priv *rtc, u8 val, u8 bcd_mask, u8 bin_mask) +{ + if (rtc->bcd_mode) + return (bcd2bin(val) & bcd_mask); + + return (val & bin_mask); +} + +/** + * ds1685_rtc_bin2bcd - bin2bcd wrapper in case platform doesn't support BCD. + * @rtc: pointer to the ds1685 rtc structure. + * @val: u8 time value to consider converting. + * @bin_mask: u8 mask value if BIN mode is used. + * @bcd_mask: u8 mask value if BCD mode is used. + * + * Returns the value, converted to BCD if originally in BIN and bcd_mode TRUE. + */ +static inline u8 +ds1685_rtc_bin2bcd(struct ds1685_priv *rtc, u8 val, u8 bin_mask, u8 bcd_mask) +{ + if (rtc->bcd_mode) + return (bin2bcd(val) & bcd_mask); + + return (val & bin_mask); +} + +/** + * ds1685_rtc_switch_to_bank0 - switch the rtc to bank 0. + * @rtc: pointer to the ds1685 rtc structure. + */ +static inline void +ds1685_rtc_switch_to_bank0(struct ds1685_priv *rtc) +{ + rtc->write(rtc, RTC_CTRL_A, + (rtc->read(rtc, RTC_CTRL_A) & ~(RTC_CTRL_A_DV0))); +} + +/** + * ds1685_rtc_switch_to_bank1 - switch the rtc to bank 1. + * @rtc: pointer to the ds1685 rtc structure. + */ +static inline void +ds1685_rtc_switch_to_bank1(struct ds1685_priv *rtc) +{ + rtc->write(rtc, RTC_CTRL_A, + (rtc->read(rtc, RTC_CTRL_A) | RTC_CTRL_A_DV0)); +} + +/** + * ds1685_rtc_begin_data_access - prepare the rtc for data access. + * @rtc: pointer to the ds1685 rtc structure. + * + * This takes several steps to prepare the rtc for access to get/set time + * and alarm values from the rtc registers: + * - Sets the SET bit in Control Register B. + * - Reads Ext Control Register 4A and checks the INCR bit. + * - If INCR is active, a short delay is added before Ext Control Register 4A + * is read again in a loop until INCR is inactive. + * - Switches the rtc to bank 1. This allows access to all relevant + * data for normal rtc operation, as bank 0 contains only the nvram. + */ +static inline void +ds1685_rtc_begin_data_access(struct ds1685_priv *rtc) +{ + /* Set the SET bit in Ctrl B */ + rtc->write(rtc, RTC_CTRL_B, + (rtc->read(rtc, RTC_CTRL_B) | RTC_CTRL_B_SET)); + + /* Read Ext Ctrl 4A and check the INCR bit to avoid a lockout. */ + while (rtc->read(rtc, RTC_EXT_CTRL_4A) & RTC_CTRL_4A_INCR) + cpu_relax(); + + /* Switch to Bank 1 */ + ds1685_rtc_switch_to_bank1(rtc); +} + +/** + * ds1685_rtc_end_data_access - end data access on the rtc. + * @rtc: pointer to the ds1685 rtc structure. + * + * This ends what was started by ds1685_rtc_begin_data_access: + * - Switches the rtc back to bank 0. + * - Clears the SET bit in Control Register B. + */ +static inline void +ds1685_rtc_end_data_access(struct ds1685_priv *rtc) +{ + /* Switch back to Bank 0 */ + ds1685_rtc_switch_to_bank1(rtc); + + /* Clear the SET bit in Ctrl B */ + rtc->write(rtc, RTC_CTRL_B, + (rtc->read(rtc, RTC_CTRL_B) & ~(RTC_CTRL_B_SET))); +} + +/** + * ds1685_rtc_begin_ctrl_access - prepare the rtc for ctrl access. + * @rtc: pointer to the ds1685 rtc structure. + * @flags: irq flags variable for spin_lock_irqsave. + * + * This takes several steps to prepare the rtc for access to read just the + * control registers: + * - Sets a spinlock on the rtc IRQ. + * - Switches the rtc to bank 1. This allows access to the two extended + * control registers. + * + * Only use this where you are certain another lock will not be held. 
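+ * A typical caller brackets a short read/modify/write of the control
+ * registers, e.g.:
+ *
+ *	ds1685_rtc_begin_ctrl_access(rtc, flags);
+ *	... read/modify/write RTC_CTRL_* or RTC_EXT_CTRL_* registers ...
+ *	ds1685_rtc_end_ctrl_access(rtc, flags);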
+ */ +static inline void +ds1685_rtc_begin_ctrl_access(struct ds1685_priv *rtc, unsigned long flags) +{ + spin_lock_irqsave(&rtc->lock, flags); + ds1685_rtc_switch_to_bank1(rtc); +} + +/** + * ds1685_rtc_end_ctrl_access - end ctrl access on the rtc. + * @rtc: pointer to the ds1685 rtc structure. + * @flags: irq flags variable for spin_unlock_irqrestore. + * + * This ends what was started by ds1685_rtc_begin_ctrl_access: + * - Switches the rtc back to bank 0. + * - Unsets the spinlock on the rtc IRQ. + */ +static inline void +ds1685_rtc_end_ctrl_access(struct ds1685_priv *rtc, unsigned long flags) +{ + ds1685_rtc_switch_to_bank0(rtc); + spin_unlock_irqrestore(&rtc->lock, flags); +} + +/** + * ds1685_rtc_get_ssn - retrieve the silicon serial number. + * @rtc: pointer to the ds1685 rtc structure. + * @ssn: u8 array to hold the bits of the silicon serial number. + * + * This number starts at 0x40, and is 8-bytes long, ending at 0x47. The + * first byte is the model number, the next six bytes are the serial number + * digits, and the final byte is a CRC check byte. Together, they form the + * silicon serial number. + * + * These values are stored in bank1, so ds1685_rtc_switch_to_bank1 must be + * called first before calling this function, else data will be read out of + * the bank0 NVRAM. Be sure to call ds1685_rtc_switch_to_bank0 when done. + */ +static inline void +ds1685_rtc_get_ssn(struct ds1685_priv *rtc, u8 *ssn) +{ + ssn[0] = rtc->read(rtc, RTC_BANK1_SSN_MODEL); + ssn[1] = rtc->read(rtc, RTC_BANK1_SSN_BYTE_1); + ssn[2] = rtc->read(rtc, RTC_BANK1_SSN_BYTE_2); + ssn[3] = rtc->read(rtc, RTC_BANK1_SSN_BYTE_3); + ssn[4] = rtc->read(rtc, RTC_BANK1_SSN_BYTE_4); + ssn[5] = rtc->read(rtc, RTC_BANK1_SSN_BYTE_5); + ssn[6] = rtc->read(rtc, RTC_BANK1_SSN_BYTE_6); + ssn[7] = rtc->read(rtc, RTC_BANK1_SSN_CRC); +} +/* ----------------------------------------------------------------------- */ + + +/* ----------------------------------------------------------------------- */ +/* Read/Set Time & Alarm functions */ + +/** + * ds1685_rtc_read_time - reads the time registers. + * @dev: pointer to device structure. + * @tm: pointer to rtc_time structure. + */ +static int +ds1685_rtc_read_time(struct device *dev, struct rtc_time *tm) +{ + struct platform_device *pdev = to_platform_device(dev); + struct ds1685_priv *rtc = platform_get_drvdata(pdev); + u8 ctrlb, century; + u8 seconds, minutes, hours, wday, mday, month, years; + + /* Fetch the time info from the RTC registers. */ + ds1685_rtc_begin_data_access(rtc); + seconds = rtc->read(rtc, RTC_SECS); + minutes = rtc->read(rtc, RTC_MINS); + hours = rtc->read(rtc, RTC_HRS); + wday = rtc->read(rtc, RTC_WDAY); + mday = rtc->read(rtc, RTC_MDAY); + month = rtc->read(rtc, RTC_MONTH); + years = rtc->read(rtc, RTC_YEAR); + century = rtc->read(rtc, RTC_CENTURY); + ctrlb = rtc->read(rtc, RTC_CTRL_B); + ds1685_rtc_end_data_access(rtc); + + /* bcd2bin if needed, perform fixups, and store to rtc_time. 
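The chip stores weekday and month 1-based and the year as two digits plus a century byte, hence the -1 adjustments and the -1900 below.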
*/ + years = ds1685_rtc_bcd2bin(rtc, years, RTC_YEAR_BCD_MASK, + RTC_YEAR_BIN_MASK); + century = ds1685_rtc_bcd2bin(rtc, century, RTC_CENTURY_MASK, + RTC_CENTURY_MASK); + tm->tm_sec = ds1685_rtc_bcd2bin(rtc, seconds, RTC_SECS_BCD_MASK, + RTC_SECS_BIN_MASK); + tm->tm_min = ds1685_rtc_bcd2bin(rtc, minutes, RTC_MINS_BCD_MASK, + RTC_MINS_BIN_MASK); + tm->tm_hour = ds1685_rtc_bcd2bin(rtc, hours, RTC_HRS_24_BCD_MASK, + RTC_HRS_24_BIN_MASK); + tm->tm_wday = (ds1685_rtc_bcd2bin(rtc, wday, RTC_WDAY_MASK, + RTC_WDAY_MASK) - 1); + tm->tm_mday = ds1685_rtc_bcd2bin(rtc, mday, RTC_MDAY_BCD_MASK, + RTC_MDAY_BIN_MASK); + tm->tm_mon = (ds1685_rtc_bcd2bin(rtc, month, RTC_MONTH_BCD_MASK, + RTC_MONTH_BIN_MASK) - 1); + tm->tm_year = ((years + (century * 100)) - 1900); + tm->tm_yday = rtc_year_days(tm->tm_mday, tm->tm_mon, tm->tm_year); + tm->tm_isdst = 0; /* RTC has hardcoded timezone, so don't use. */ + + return rtc_valid_tm(tm); +} + +/** + * ds1685_rtc_set_time - sets the time registers. + * @dev: pointer to device structure. + * @tm: pointer to rtc_time structure. + */ +static int +ds1685_rtc_set_time(struct device *dev, struct rtc_time *tm) +{ + struct platform_device *pdev = to_platform_device(dev); + struct ds1685_priv *rtc = platform_get_drvdata(pdev); + u8 ctrlb, seconds, minutes, hours, wday, mday, month, years, century; + + /* Fetch the time info from rtc_time. */ + seconds = ds1685_rtc_bin2bcd(rtc, tm->tm_sec, RTC_SECS_BIN_MASK, + RTC_SECS_BCD_MASK); + minutes = ds1685_rtc_bin2bcd(rtc, tm->tm_min, RTC_MINS_BIN_MASK, + RTC_MINS_BCD_MASK); + hours = ds1685_rtc_bin2bcd(rtc, tm->tm_hour, RTC_HRS_24_BIN_MASK, + RTC_HRS_24_BCD_MASK); + wday = ds1685_rtc_bin2bcd(rtc, (tm->tm_wday + 1), RTC_WDAY_MASK, + RTC_WDAY_MASK); + mday = ds1685_rtc_bin2bcd(rtc, tm->tm_mday, RTC_MDAY_BIN_MASK, + RTC_MDAY_BCD_MASK); + month = ds1685_rtc_bin2bcd(rtc, (tm->tm_mon + 1), RTC_MONTH_BIN_MASK, + RTC_MONTH_BCD_MASK); + years = ds1685_rtc_bin2bcd(rtc, (tm->tm_year % 100), + RTC_YEAR_BIN_MASK, RTC_YEAR_BCD_MASK); + century = ds1685_rtc_bin2bcd(rtc, ((tm->tm_year + 1900) / 100), + RTC_CENTURY_MASK, RTC_CENTURY_MASK); + + /* + * Perform Sanity Checks: + * - Months: !> 12, Month Day != 0. + * - Month Day !> Max days in current month. + * - Hours !>= 24, Mins !>= 60, Secs !>= 60, & Weekday !> 7. + */ + if ((tm->tm_mon > 11) || (mday == 0)) + return -EDOM; + + if (tm->tm_mday > rtc_month_days(tm->tm_mon, tm->tm_year)) + return -EDOM; + + if ((tm->tm_hour >= 24) || (tm->tm_min >= 60) || + (tm->tm_sec >= 60) || (wday > 7)) + return -EDOM; + + /* + * Set the data mode to use and store the time values in the + * RTC registers. + */ + ds1685_rtc_begin_data_access(rtc); + ctrlb = rtc->read(rtc, RTC_CTRL_B); + if (rtc->bcd_mode) + ctrlb &= ~(RTC_CTRL_B_DM); + else + ctrlb |= RTC_CTRL_B_DM; + rtc->write(rtc, RTC_CTRL_B, ctrlb); + rtc->write(rtc, RTC_SECS, seconds); + rtc->write(rtc, RTC_MINS, minutes); + rtc->write(rtc, RTC_HRS, hours); + rtc->write(rtc, RTC_WDAY, wday); + rtc->write(rtc, RTC_MDAY, mday); + rtc->write(rtc, RTC_MONTH, month); + rtc->write(rtc, RTC_YEAR, years); + rtc->write(rtc, RTC_CENTURY, century); + ds1685_rtc_end_data_access(rtc); + + return 0; +} + +/** + * ds1685_rtc_read_alarm - reads the alarm registers. + * @dev: pointer to device structure. + * @alrm: pointer to rtc_wkalrm structure. + * + * There are three primary alarm registers: seconds, minutes, and hours. + * A fourth alarm register for the month date is also available in bank1 for + * kickstart/wakeup features. 
The DS1685/DS1687 manual states that a + * "don't care" value ranging from 0xc0 to 0xff may be written into one or + * more of the three alarm bytes to act as a wildcard value. The fourth + * byte doesn't support a "don't care" value. + */ +static int +ds1685_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) +{ + struct platform_device *pdev = to_platform_device(dev); + struct ds1685_priv *rtc = platform_get_drvdata(pdev); + u8 seconds, minutes, hours, mday, ctrlb, ctrlc; + + /* Fetch the alarm info from the RTC alarm registers. */ + ds1685_rtc_begin_data_access(rtc); + seconds = rtc->read(rtc, RTC_SECS_ALARM); + minutes = rtc->read(rtc, RTC_MINS_ALARM); + hours = rtc->read(rtc, RTC_HRS_ALARM); + mday = rtc->read(rtc, RTC_MDAY_ALARM); + ctrlb = rtc->read(rtc, RTC_CTRL_B); + ctrlc = rtc->read(rtc, RTC_CTRL_C); + ds1685_rtc_end_data_access(rtc); + + /* Check month date. */ + if (!(mday >= 1) && (mday <= 31)) + return -EDOM; + + /* + * Check the three alarm bytes. + * + * The Linux RTC system doesn't support the "don't care" capability + * of this RTC chip. We check for it anyways in case support is + * added in the future. + */ + if (unlikely((seconds >= 0xc0) && (seconds <= 0xff))) + alrm->time.tm_sec = -1; + else + alrm->time.tm_sec = ds1685_rtc_bcd2bin(rtc, seconds, + RTC_SECS_BCD_MASK, + RTC_SECS_BIN_MASK); + + if (unlikely((minutes >= 0xc0) && (minutes <= 0xff))) + alrm->time.tm_min = -1; + else + alrm->time.tm_min = ds1685_rtc_bcd2bin(rtc, minutes, + RTC_MINS_BCD_MASK, + RTC_MINS_BIN_MASK); + + if (unlikely((hours >= 0xc0) && (hours <= 0xff))) + alrm->time.tm_hour = -1; + else + alrm->time.tm_hour = ds1685_rtc_bcd2bin(rtc, hours, + RTC_HRS_24_BCD_MASK, + RTC_HRS_24_BIN_MASK); + + /* Write the data to rtc_wkalrm. */ + alrm->time.tm_mday = ds1685_rtc_bcd2bin(rtc, mday, RTC_MDAY_BCD_MASK, + RTC_MDAY_BIN_MASK); + alrm->time.tm_mon = -1; + alrm->time.tm_year = -1; + alrm->time.tm_wday = -1; + alrm->time.tm_yday = -1; + alrm->time.tm_isdst = -1; + alrm->enabled = !!(ctrlb & RTC_CTRL_B_AIE); + alrm->pending = !!(ctrlc & RTC_CTRL_C_AF); + + return 0; +} + +/** + * ds1685_rtc_set_alarm - sets the alarm in registers. + * @dev: pointer to device structure. + * @alrm: pointer to rtc_wkalrm structure. + */ +static int +ds1685_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) +{ + struct platform_device *pdev = to_platform_device(dev); + struct ds1685_priv *rtc = platform_get_drvdata(pdev); + u8 ctrlb, seconds, minutes, hours, mday; + + /* Fetch the alarm info and convert to BCD. */ + seconds = ds1685_rtc_bin2bcd(rtc, alrm->time.tm_sec, + RTC_SECS_BIN_MASK, + RTC_SECS_BCD_MASK); + minutes = ds1685_rtc_bin2bcd(rtc, alrm->time.tm_min, + RTC_MINS_BIN_MASK, + RTC_MINS_BCD_MASK); + hours = ds1685_rtc_bin2bcd(rtc, alrm->time.tm_hour, + RTC_HRS_24_BIN_MASK, + RTC_HRS_24_BCD_MASK); + mday = ds1685_rtc_bin2bcd(rtc, alrm->time.tm_mday, + RTC_MDAY_BIN_MASK, + RTC_MDAY_BCD_MASK); + + /* Check the month date for validity. */ + if (!(mday >= 1) && (mday <= 31)) + return -EDOM; + + /* + * Check the three alarm bytes. + * + * The Linux RTC system doesn't support the "don't care" capability + * of this RTC chip because rtc_valid_tm tries to validate every + * field, and we only support four fields. We put the support + * here anyways for the future. 
+ */ + if (unlikely((seconds >= 0xc0) && (seconds <= 0xff))) + seconds = 0xff; + + if (unlikely((minutes >= 0xc0) && (minutes <= 0xff))) + minutes = 0xff; + + if (unlikely((hours >= 0xc0) && (hours <= 0xff))) + hours = 0xff; + + alrm->time.tm_mon = -1; + alrm->time.tm_year = -1; + alrm->time.tm_wday = -1; + alrm->time.tm_yday = -1; + alrm->time.tm_isdst = -1; + + /* Disable the alarm interrupt first. */ + ds1685_rtc_begin_data_access(rtc); + ctrlb = rtc->read(rtc, RTC_CTRL_B); + rtc->write(rtc, RTC_CTRL_B, (ctrlb & ~(RTC_CTRL_B_AIE))); + + /* Read ctrlc to clear RTC_CTRL_C_AF. */ + rtc->read(rtc, RTC_CTRL_C); + + /* + * Set the data mode to use and store the time values in the + * RTC registers. + */ + ctrlb = rtc->read(rtc, RTC_CTRL_B); + if (rtc->bcd_mode) + ctrlb &= ~(RTC_CTRL_B_DM); + else + ctrlb |= RTC_CTRL_B_DM; + rtc->write(rtc, RTC_CTRL_B, ctrlb); + rtc->write(rtc, RTC_SECS_ALARM, seconds); + rtc->write(rtc, RTC_MINS_ALARM, minutes); + rtc->write(rtc, RTC_HRS_ALARM, hours); + rtc->write(rtc, RTC_MDAY_ALARM, mday); + + /* Re-enable the alarm if needed. */ + if (alrm->enabled) { + ctrlb = rtc->read(rtc, RTC_CTRL_B); + ctrlb |= RTC_CTRL_B_AIE; + rtc->write(rtc, RTC_CTRL_B, ctrlb); + } + + /* Done! */ + ds1685_rtc_end_data_access(rtc); + + return 0; +} +/* ----------------------------------------------------------------------- */ + + +/* ----------------------------------------------------------------------- */ +/* /dev/rtcX Interface functions */ + +#ifdef CONFIG_RTC_INTF_DEV +/** + * ds1685_rtc_alarm_irq_enable - replaces ioctl() RTC_AIE on/off. + * @dev: pointer to device structure. + * @enabled: flag indicating whether to enable or disable. + */ +static int +ds1685_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) +{ + struct ds1685_priv *rtc = dev_get_drvdata(dev); + unsigned long flags = 0; + + /* Enable/disable the Alarm IRQ-Enable flag. */ + spin_lock_irqsave(&rtc->lock, flags); + + /* Flip the requisite interrupt-enable bit. */ + if (enabled) + rtc->write(rtc, RTC_CTRL_B, (rtc->read(rtc, RTC_CTRL_B) | + RTC_CTRL_B_AIE)); + else + rtc->write(rtc, RTC_CTRL_B, (rtc->read(rtc, RTC_CTRL_B) & + ~(RTC_CTRL_B_AIE))); + + /* Read Control C to clear all the flag bits. */ + rtc->read(rtc, RTC_CTRL_C); + spin_unlock_irqrestore(&rtc->lock, flags); + + return 0; +} +#endif +/* ----------------------------------------------------------------------- */ + + +/* ----------------------------------------------------------------------- */ +/* IRQ handler & workqueue. */ + +/** + * ds1685_rtc_irq_handler - IRQ handler. + * @irq: IRQ number. + * @dev_id: platform device pointer. + */ +static irqreturn_t +ds1685_rtc_irq_handler(int irq, void *dev_id) +{ + struct platform_device *pdev = dev_id; + struct ds1685_priv *rtc = platform_get_drvdata(pdev); + u8 ctrlb, ctrlc; + unsigned long events = 0; + u8 num_irqs = 0; + + /* Abort early if the device isn't ready yet (i.e., DEBUG_SHIRQ). */ + if (unlikely(!rtc)) + return IRQ_HANDLED; + + /* Ctrlb holds the interrupt-enable bits and ctrlc the flag bits. */ + spin_lock(&rtc->lock); + ctrlb = rtc->read(rtc, RTC_CTRL_B); + ctrlc = rtc->read(rtc, RTC_CTRL_C); + + /* Is the IRQF bit set? */ + if (likely(ctrlc & RTC_CTRL_C_IRQF)) { + /* + * We need to determine if it was one of the standard + * events: PF, AF, or UF. If so, we handle them and + * update the RTC core. + */ + if (likely(ctrlc & RTC_CTRL_B_PAU_MASK)) { + events = RTC_IRQF; + + /* Check for a periodic interrupt. 
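+			 * Both the enable bit in ctrlb and the matching
+			 * flag bit in ctrlc must be set for an event to
+			 * be reported to the RTC core.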
*/ + if ((ctrlb & RTC_CTRL_B_PIE) && + (ctrlc & RTC_CTRL_C_PF)) { + events |= RTC_PF; + num_irqs++; + } + + /* Check for an alarm interrupt. */ + if ((ctrlb & RTC_CTRL_B_AIE) && + (ctrlc & RTC_CTRL_C_AF)) { + events |= RTC_AF; + num_irqs++; + } + + /* Check for an update interrupt. */ + if ((ctrlb & RTC_CTRL_B_UIE) && + (ctrlc & RTC_CTRL_C_UF)) { + events |= RTC_UF; + num_irqs++; + } + + rtc_update_irq(rtc->dev, num_irqs, events); + } else { + /* + * One of the "extended" interrupts was received that + * is not recognized by the RTC core. These need to + * be handled in task context as they can call other + * functions and the time spent in irq context needs + * to be minimized. Schedule them into a workqueue + * and inform the RTC core that the IRQs were handled. + */ + spin_unlock(&rtc->lock); + schedule_work(&rtc->work); + rtc_update_irq(rtc->dev, 0, 0); + return IRQ_HANDLED; + } + } + spin_unlock(&rtc->lock); + + return events ? IRQ_HANDLED : IRQ_NONE; +} + +/** + * ds1685_rtc_work_queue - work queue handler. + * @work: work_struct containing data to work on in task context. + */ +static void +ds1685_rtc_work_queue(struct work_struct *work) +{ + struct ds1685_priv *rtc = container_of(work, + struct ds1685_priv, work); + struct platform_device *pdev = to_platform_device(&rtc->dev->dev); + struct mutex *rtc_mutex = &rtc->dev->ops_lock; + u8 ctrl4a, ctrl4b; + + mutex_lock(rtc_mutex); + + ds1685_rtc_switch_to_bank1(rtc); + ctrl4a = rtc->read(rtc, RTC_EXT_CTRL_4A); + ctrl4b = rtc->read(rtc, RTC_EXT_CTRL_4B); + + /* + * Check for a kickstart interrupt. With Vcc applied, this + * typically means that the power button was pressed, so we + * begin the shutdown sequence. + */ + if ((ctrl4b & RTC_CTRL_4B_KSE) && (ctrl4a & RTC_CTRL_4A_KF)) { + /* Briefly disable kickstarts to debounce button presses. */ + rtc->write(rtc, RTC_EXT_CTRL_4B, + (rtc->read(rtc, RTC_EXT_CTRL_4B) & + ~(RTC_CTRL_4B_KSE))); + + /* Clear the kickstart flag. */ + rtc->write(rtc, RTC_EXT_CTRL_4A, + (ctrl4a & ~(RTC_CTRL_4A_KF))); + + + /* + * Sleep 500ms before re-enabling kickstarts. This allows + * adequate time to avoid reading signal jitter as additional + * button presses. + */ + msleep(500); + rtc->write(rtc, RTC_EXT_CTRL_4B, + (rtc->read(rtc, RTC_EXT_CTRL_4B) | + RTC_CTRL_4B_KSE)); + + /* Call the platform pre-poweroff function. Else, shutdown. */ + if (rtc->prepare_poweroff != NULL) + rtc->prepare_poweroff(); + else + ds1685_rtc_poweroff(pdev); + } + + /* + * Check for a wake-up interrupt. With Vcc applied, this is + * essentially a second alarm interrupt, except it takes into + * account the 'date' register in bank1 in addition to the + * standard three alarm registers. + */ + if ((ctrl4b & RTC_CTRL_4B_WIE) && (ctrl4a & RTC_CTRL_4A_WF)) { + rtc->write(rtc, RTC_EXT_CTRL_4A, + (ctrl4a & ~(RTC_CTRL_4A_WF))); + + /* Call the platform wake_alarm function if defined. */ + if (rtc->wake_alarm != NULL) + rtc->wake_alarm(); + else + dev_warn(&pdev->dev, + "Wake Alarm IRQ just occurred!\n"); + } + + /* + * Check for a ram-clear interrupt. This happens if RIE=1 and RF=0 + * when RCE=1 in 4B. This clears all NVRAM bytes in bank0 by setting + * each byte to a logic 1. This has no effect on any extended + * NV-SRAM that might be present, nor on the time/calendar/alarm + * registers. After a ram-clear is completed, there is a minimum + * recovery time of ~150ms in which all reads/writes are locked out. + * NOTE: A ram-clear can still occur if RCE=1 and RIE=0. We cannot + * catch this scenario. 
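+	 * In that case the bank0 NVRAM is still wiped silently, and the RF
+	 * flag in Ctrl 4A (if polled manually) may be the only evidence
+	 * that it happened.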
+ */ + if ((ctrl4b & RTC_CTRL_4B_RIE) && (ctrl4a & RTC_CTRL_4A_RF)) { + rtc->write(rtc, RTC_EXT_CTRL_4A, + (ctrl4a & ~(RTC_CTRL_4A_RF))); + msleep(150); + + /* Call the platform post_ram_clear function if defined. */ + if (rtc->post_ram_clear != NULL) + rtc->post_ram_clear(); + else + dev_warn(&pdev->dev, + "RAM-Clear IRQ just occurred!\n"); + } + ds1685_rtc_switch_to_bank0(rtc); + + mutex_unlock(rtc_mutex); +} +/* ----------------------------------------------------------------------- */ + + +/* ----------------------------------------------------------------------- */ +/* ProcFS interface */ + +#ifdef CONFIG_PROC_FS +#define NUM_REGS 6 /* Num of control registers. */ +#define NUM_BITS 8 /* Num bits per register. */ +#define NUM_SPACES 4 /* Num spaces between each bit. */ + +/* + * Periodic Interrupt Rates. + */ +static const char *ds1685_rtc_pirq_rate[16] = { + "none", "3.90625ms", "7.8125ms", "0.122070ms", "0.244141ms", + "0.488281ms", "0.9765625ms", "1.953125ms", "3.90625ms", "7.8125ms", + "15.625ms", "31.25ms", "62.5ms", "125ms", "250ms", "500ms" +}; + +/* + * Square-Wave Output Frequencies. + */ +static const char *ds1685_rtc_sqw_freq[16] = { + "none", "256Hz", "128Hz", "8192Hz", "4096Hz", "2048Hz", "1024Hz", + "512Hz", "256Hz", "128Hz", "64Hz", "32Hz", "16Hz", "8Hz", "4Hz", "2Hz" +}; + +#ifdef CONFIG_RTC_DS1685_PROC_REGS +/** + * ds1685_rtc_print_regs - helper function to print register values. + * @hex: hex byte to convert into binary bits. + * @dest: destination char array. + * + * This is basically a hex->binary function, just with extra spacing between + * the digits. It only works on 1-byte values (8 bits). + */ +static char* +ds1685_rtc_print_regs(u8 hex, char *dest) +{ + u32 i, j; + char *tmp = dest; + + for (i = 0; i < NUM_BITS; i++) { + *tmp++ = ((hex & 0x80) != 0 ? '1' : '0'); + for (j = 0; j < NUM_SPACES; j++) + *tmp++ = ' '; + hex <<= 1; + } + *tmp++ = '\0'; + + return dest; +} +#endif + +/** + * ds1685_rtc_proc - procfs access function. + * @dev: pointer to device structure. + * @seq: pointer to seq_file structure. + */ +static int +ds1685_rtc_proc(struct device *dev, struct seq_file *seq) +{ + struct platform_device *pdev = to_platform_device(dev); + struct ds1685_priv *rtc = platform_get_drvdata(pdev); + u8 ctrla, ctrlb, ctrlc, ctrld, ctrl4a, ctrl4b, ssn[8]; + char *model = '\0'; +#ifdef CONFIG_RTC_DS1685_PROC_REGS + char bits[NUM_REGS][(NUM_BITS * NUM_SPACES) + NUM_BITS + 1]; +#endif + + /* Read all the relevant data from the control registers. */ + ds1685_rtc_switch_to_bank1(rtc); + ds1685_rtc_get_ssn(rtc, ssn); + ctrla = rtc->read(rtc, RTC_CTRL_A); + ctrlb = rtc->read(rtc, RTC_CTRL_B); + ctrlc = rtc->read(rtc, RTC_CTRL_C); + ctrld = rtc->read(rtc, RTC_CTRL_D); + ctrl4a = rtc->read(rtc, RTC_EXT_CTRL_4A); + ctrl4b = rtc->read(rtc, RTC_EXT_CTRL_4B); + ds1685_rtc_switch_to_bank0(rtc); + + /* Determine the RTC model. */ + switch (ssn[0]) { + case RTC_MODEL_DS1685: + model = "DS1685/DS1687\0"; + break; + case RTC_MODEL_DS1689: + model = "DS1689/DS1693\0"; + break; + case RTC_MODEL_DS17285: + model = "DS17285/DS17287\0"; + break; + case RTC_MODEL_DS17485: + model = "DS17485/DS17487\0"; + break; + case RTC_MODEL_DS17885: + model = "DS17885/DS17887\0"; + break; + default: + model = "Unknown\0"; + break; + } + + /* Print out the information. 
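+	 * The argument list below must stay in lock-step with the
+	 * CONFIG_RTC_DS1685_PROC_REGS #ifdef blocks in the format string.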
*/ + seq_printf(seq, + "Model\t\t: %s\n" + "Oscillator\t: %s\n" + "12/24hr\t\t: %s\n" + "DST\t\t: %s\n" + "Data mode\t: %s\n" + "Battery\t\t: %s\n" + "Aux batt\t: %s\n" + "Update IRQ\t: %s\n" + "Periodic IRQ\t: %s\n" + "Periodic Rate\t: %s\n" + "SQW Freq\t: %s\n" +#ifdef CONFIG_RTC_DS1685_PROC_REGS + "Serial #\t: %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n" + "Register Status\t:\n" + " Ctrl A\t: UIP DV2 DV1 DV0 RS3 RS2 RS1 RS0\n" + "\t\t: %s\n" + " Ctrl B\t: SET PIE AIE UIE SQWE DM 2412 DSE\n" + "\t\t: %s\n" + " Ctrl C\t: IRQF PF AF UF --- --- --- ---\n" + "\t\t: %s\n" + " Ctrl D\t: VRT --- --- --- --- --- --- ---\n" + "\t\t: %s\n" +#if !defined(CONFIG_RTC_DRV_DS1685) && !defined(CONFIG_RTC_DRV_DS1689) + " Ctrl 4A\t: VRT2 INCR BME --- PAB RF WF KF\n" +#else + " Ctrl 4A\t: VRT2 INCR --- --- PAB RF WF KF\n" +#endif + "\t\t: %s\n" + " Ctrl 4B\t: ABE E32k CS RCE PRS RIE WIE KSE\n" + "\t\t: %s\n", +#else + "Serial #\t: %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n", +#endif + model, + ((ctrla & RTC_CTRL_A_DV1) ? "enabled" : "disabled"), + ((ctrlb & RTC_CTRL_B_2412) ? "24-hour" : "12-hour"), + ((ctrlb & RTC_CTRL_B_DSE) ? "enabled" : "disabled"), + ((ctrlb & RTC_CTRL_B_DM) ? "binary" : "BCD"), + ((ctrld & RTC_CTRL_D_VRT) ? "ok" : "exhausted or n/a"), + ((ctrl4a & RTC_CTRL_4A_VRT2) ? "ok" : "exhausted or n/a"), + ((ctrlb & RTC_CTRL_B_UIE) ? "yes" : "no"), + ((ctrlb & RTC_CTRL_B_PIE) ? "yes" : "no"), + (!(ctrl4b & RTC_CTRL_4B_E32K) ? + ds1685_rtc_pirq_rate[(ctrla & RTC_CTRL_A_RS_MASK)] : "none"), + (!((ctrl4b & RTC_CTRL_4B_E32K)) ? + ds1685_rtc_sqw_freq[(ctrla & RTC_CTRL_A_RS_MASK)] : "32768Hz"), +#ifdef CONFIG_RTC_DS1685_PROC_REGS + ssn[0], ssn[1], ssn[2], ssn[3], ssn[4], ssn[5], ssn[6], ssn[7], + ds1685_rtc_print_regs(ctrla, bits[0]), + ds1685_rtc_print_regs(ctrlb, bits[1]), + ds1685_rtc_print_regs(ctrlc, bits[2]), + ds1685_rtc_print_regs(ctrld, bits[3]), + ds1685_rtc_print_regs(ctrl4a, bits[4]), + ds1685_rtc_print_regs(ctrl4b, bits[5])); +#else + ssn[0], ssn[1], ssn[2], ssn[3], ssn[4], ssn[5], ssn[6], ssn[7]); +#endif + return 0; +} +#else +#define ds1685_rtc_proc NULL +#endif /* CONFIG_PROC_FS */ +/* ----------------------------------------------------------------------- */ + + +/* ----------------------------------------------------------------------- */ +/* RTC Class operations */ + +static const struct rtc_class_ops +ds1685_rtc_ops = { + .proc = ds1685_rtc_proc, + .read_time = ds1685_rtc_read_time, + .set_time = ds1685_rtc_set_time, + .read_alarm = ds1685_rtc_read_alarm, + .set_alarm = ds1685_rtc_set_alarm, + .alarm_irq_enable = ds1685_rtc_alarm_irq_enable, +}; +/* ----------------------------------------------------------------------- */ + + +/* ----------------------------------------------------------------------- */ +/* SysFS interface */ + +#ifdef CONFIG_SYSFS +/** + * ds1685_rtc_sysfs_nvram_read - reads rtc nvram via sysfs. + * @file: pointer to file structure. + * @kobj: pointer to kobject structure. + * @bin_attr: pointer to bin_attribute structure. + * @buf: pointer to char array to hold the output. + * @pos: current file position pointer. + * @size: size of the data to read. 
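+ *
+ * Returns the number of bytes actually read from NVRAM.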
+ */
+static ssize_t
+ds1685_rtc_sysfs_nvram_read(struct file *filp, struct kobject *kobj,
+			    struct bin_attribute *bin_attr, char *buf,
+			    loff_t pos, size_t size)
+{
+	struct platform_device *pdev =
+		to_platform_device(container_of(kobj, struct device, kobj));
+	struct ds1685_priv *rtc = platform_get_drvdata(pdev);
+	ssize_t count;
+	unsigned long flags = 0;
+
+	spin_lock_irqsave(&rtc->lock, flags);
+	ds1685_rtc_switch_to_bank0(rtc);
+
+	/* Read NVRAM in time and bank0 registers. */
+	for (count = 0; size > 0 && pos < NVRAM_TOTAL_SZ_BANK0;
+	     count++, size--) {
+		if (count < NVRAM_SZ_TIME)
+			*buf++ = rtc->read(rtc, (NVRAM_TIME_BASE + pos++));
+		else
+			*buf++ = rtc->read(rtc, (NVRAM_BANK0_BASE + pos++));
+	}
+
+#ifndef CONFIG_RTC_DRV_DS1689
+	if (size > 0) {
+		ds1685_rtc_switch_to_bank1(rtc);
+
+#ifndef CONFIG_RTC_DRV_DS1685
+		/* Enable burst-mode on DS17x85/DS17x87 */
+		rtc->write(rtc, RTC_EXT_CTRL_4A,
+			   (rtc->read(rtc, RTC_EXT_CTRL_4A) |
+			    RTC_CTRL_4A_BME));
+
+		/* We need one write to RTC_BANK1_RAM_ADDR_LSB to start
+		 * reading with burst-mode */
+		rtc->write(rtc, RTC_BANK1_RAM_ADDR_LSB,
+			   (pos - NVRAM_TOTAL_SZ_BANK0));
+#endif
+
+		/* Read NVRAM in bank1 registers. */
+		for (count = 0; size > 0 && pos < NVRAM_TOTAL_SZ;
+		     count++, size--) {
+#ifdef CONFIG_RTC_DRV_DS1685
+			/* DS1685/DS1687 has to write to RTC_BANK1_RAM_ADDR
+			 * before each read. */
+			rtc->write(rtc, RTC_BANK1_RAM_ADDR,
+				   (pos - NVRAM_TOTAL_SZ_BANK0));
+#endif
+			*buf++ = rtc->read(rtc, RTC_BANK1_RAM_DATA_PORT);
+			pos++;
+		}
+
+#ifndef CONFIG_RTC_DRV_DS1685
+		/* Disable burst-mode on DS17x85/DS17x87 */
+		rtc->write(rtc, RTC_EXT_CTRL_4A,
+			   (rtc->read(rtc, RTC_EXT_CTRL_4A) &
+			    ~(RTC_CTRL_4A_BME)));
+#endif
+		ds1685_rtc_switch_to_bank0(rtc);
+	}
+#endif /* !CONFIG_RTC_DRV_DS1689 */
+	spin_unlock_irqrestore(&rtc->lock, flags);
+
+	/*
+	 * XXX: Bug? this appears to cause the function to get executed
+	 * several times in succession.  But it's the only way to actually
+	 * get data written out to a file.
+	 */
+	return count;
+}
+
+/**
+ * ds1685_rtc_sysfs_nvram_write - writes rtc nvram via sysfs.
+ * @file: pointer to file structure.
+ * @kobj: pointer to kobject structure.
+ * @bin_attr: pointer to bin_attribute structure.
+ * @buf: pointer to char array holding the input.
+ * @pos: current file position pointer.
+ * @size: size of the data to write.
+ */
+static ssize_t
+ds1685_rtc_sysfs_nvram_write(struct file *filp, struct kobject *kobj,
+			     struct bin_attribute *bin_attr, char *buf,
+			     loff_t pos, size_t size)
+{
+	struct platform_device *pdev =
+		to_platform_device(container_of(kobj, struct device, kobj));
+	struct ds1685_priv *rtc = platform_get_drvdata(pdev);
+	ssize_t count;
+	unsigned long flags = 0;
+
+	spin_lock_irqsave(&rtc->lock, flags);
+	ds1685_rtc_switch_to_bank0(rtc);
+
+	/* Write NVRAM in time and bank0 registers. */
+	for (count = 0; size > 0 && pos < NVRAM_TOTAL_SZ_BANK0;
+	     count++, size--) {
+		if (count < NVRAM_SZ_TIME)
+			rtc->write(rtc, (NVRAM_TIME_BASE + pos++),
+				   *buf++);
+		else
+			rtc->write(rtc, (NVRAM_BANK0_BASE + pos++), *buf++);
+	}
+
+#ifndef CONFIG_RTC_DRV_DS1689
+	if (size > 0) {
+		ds1685_rtc_switch_to_bank1(rtc);
+
+#ifndef CONFIG_RTC_DRV_DS1685
+		/* Enable burst-mode on DS17x85/DS17x87 */
+		rtc->write(rtc, RTC_EXT_CTRL_4A,
+			   (rtc->read(rtc, RTC_EXT_CTRL_4A) |
+			    RTC_CTRL_4A_BME));
+
+		/* We need one write to RTC_BANK1_RAM_ADDR_LSB to start
+		 * writing with burst-mode */
+		rtc->write(rtc, RTC_BANK1_RAM_ADDR_LSB,
+			   (pos - NVRAM_TOTAL_SZ_BANK0));
+#endif
+
+		/* Write NVRAM in bank1 registers.
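+		 * This mirrors the bank1 read loop above: burst mode
+		 * auto-increments the address on the DS17x85/DS17x87,
+		 * while the DS1685/DS1687 needs the address rewritten
+		 * before each byte.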
+		 */
+		for (count = 0; size > 0 && pos < NVRAM_TOTAL_SZ;
+		     count++, size--) {
+#ifdef CONFIG_RTC_DRV_DS1685
+			/* DS1685/DS1687 has to write to RTC_BANK1_RAM_ADDR
+			 * before each write. */
+			rtc->write(rtc, RTC_BANK1_RAM_ADDR,
+				   (pos - NVRAM_TOTAL_SZ_BANK0));
+#endif
+			rtc->write(rtc, RTC_BANK1_RAM_DATA_PORT, *buf++);
+			pos++;
+		}
+
+#ifndef CONFIG_RTC_DRV_DS1685
+		/* Disable burst-mode on DS17x85/DS17x87 */
+		rtc->write(rtc, RTC_EXT_CTRL_4A,
+			   (rtc->read(rtc, RTC_EXT_CTRL_4A) &
+			    ~(RTC_CTRL_4A_BME)));
+#endif
+		ds1685_rtc_switch_to_bank0(rtc);
+	}
+#endif /* !CONFIG_RTC_DRV_DS1689 */
+	spin_unlock_irqrestore(&rtc->lock, flags);
+
+	return count;
+}
+
+/**
+ * struct ds1685_rtc_sysfs_nvram_attr - sysfs attributes for rtc nvram.
+ * @attr: nvram attributes.
+ * @read: nvram read function.
+ * @write: nvram write function.
+ * @size: nvram total size (bank0 + extended).
+ */
+static struct bin_attribute
+ds1685_rtc_sysfs_nvram_attr = {
+	.attr = {
+		.name = "nvram",
+		.mode = S_IRUGO | S_IWUSR,
+	},
+	.read = ds1685_rtc_sysfs_nvram_read,
+	.write = ds1685_rtc_sysfs_nvram_write,
+	.size = NVRAM_TOTAL_SZ
+};
+
+/**
+ * ds1685_rtc_sysfs_battery_show - sysfs file for main battery status.
+ * @dev: pointer to device structure.
+ * @attr: pointer to device_attribute structure.
+ * @buf: pointer to char array to hold the output.
+ */
+static ssize_t
+ds1685_rtc_sysfs_battery_show(struct device *dev,
+			      struct device_attribute *attr, char *buf)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	struct ds1685_priv *rtc = platform_get_drvdata(pdev);
+	u8 ctrld;
+
+	ctrld = rtc->read(rtc, RTC_CTRL_D);
+
+	/* "not ok or N/A\n" needs 15 bytes including the NUL terminator. */
+	return snprintf(buf, 15, "%s\n",
+			(ctrld & RTC_CTRL_D_VRT) ? "ok" : "not ok or N/A");
+}
+static DEVICE_ATTR(battery, S_IRUGO, ds1685_rtc_sysfs_battery_show, NULL);
+
+/**
+ * ds1685_rtc_sysfs_auxbatt_show - sysfs file for aux battery status.
+ * @dev: pointer to device structure.
+ * @attr: pointer to device_attribute structure.
+ * @buf: pointer to char array to hold the output.
+ */
+static ssize_t
+ds1685_rtc_sysfs_auxbatt_show(struct device *dev,
+			      struct device_attribute *attr, char *buf)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	struct ds1685_priv *rtc = platform_get_drvdata(pdev);
+	u8 ctrl4a;
+
+	ds1685_rtc_switch_to_bank1(rtc);
+	ctrl4a = rtc->read(rtc, RTC_EXT_CTRL_4A);
+	ds1685_rtc_switch_to_bank0(rtc);
+
+	return snprintf(buf, 15, "%s\n",
+			(ctrl4a & RTC_CTRL_4A_VRT2) ? "ok" : "not ok or N/A");
+}
+static DEVICE_ATTR(auxbatt, S_IRUGO, ds1685_rtc_sysfs_auxbatt_show, NULL);
+
+/**
+ * ds1685_rtc_sysfs_serial_show - sysfs file for silicon serial number.
+ * @dev: pointer to device structure.
+ * @attr: pointer to device_attribute structure.
+ * @buf: pointer to char array to hold the output.
+ */
+static ssize_t
+ds1685_rtc_sysfs_serial_show(struct device *dev,
+			     struct device_attribute *attr, char *buf)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	struct ds1685_priv *rtc = platform_get_drvdata(pdev);
+	u8 ssn[8];
+
+	ds1685_rtc_switch_to_bank1(rtc);
+	ds1685_rtc_get_ssn(rtc, ssn);
+	ds1685_rtc_switch_to_bank0(rtc);
+
+	/* 23 hex/colon characters plus "\n" need 25 bytes with the NUL. */
+	return snprintf(buf, 25, "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n",
+			ssn[0], ssn[1], ssn[2], ssn[3], ssn[4], ssn[5],
+			ssn[6], ssn[7]);
+}
+static DEVICE_ATTR(serial, S_IRUGO, ds1685_rtc_sysfs_serial_show, NULL);
+
+/**
+ * struct ds1685_rtc_sysfs_misc_attrs - list for misc RTC features.
+ */ +static struct attribute* +ds1685_rtc_sysfs_misc_attrs[] = { + &dev_attr_battery.attr, + &dev_attr_auxbatt.attr, + &dev_attr_serial.attr, + NULL, +}; + +/** + * struct ds1685_rtc_sysfs_misc_grp - attr group for misc RTC features. + */ +static const struct attribute_group +ds1685_rtc_sysfs_misc_grp = { + .name = "misc", + .attrs = ds1685_rtc_sysfs_misc_attrs, +}; + +#ifdef CONFIG_RTC_DS1685_SYSFS_REGS +/** + * struct ds1685_rtc_ctrl_regs. + * @name: char pointer for the bit name. + * @reg: control register the bit is in. + * @bit: the bit's offset in the register. + */ +struct ds1685_rtc_ctrl_regs { + const char *name; + const u8 reg; + const u8 bit; +}; + +/* + * Ctrl register bit lookup table. + */ +static const struct ds1685_rtc_ctrl_regs +ds1685_ctrl_regs_table[] = { + { "uip", RTC_CTRL_A, RTC_CTRL_A_UIP }, + { "dv2", RTC_CTRL_A, RTC_CTRL_A_DV2 }, + { "dv1", RTC_CTRL_A, RTC_CTRL_A_DV1 }, + { "dv0", RTC_CTRL_A, RTC_CTRL_A_DV0 }, + { "rs3", RTC_CTRL_A, RTC_CTRL_A_RS3 }, + { "rs2", RTC_CTRL_A, RTC_CTRL_A_RS2 }, + { "rs1", RTC_CTRL_A, RTC_CTRL_A_RS1 }, + { "rs0", RTC_CTRL_A, RTC_CTRL_A_RS0 }, + { "set", RTC_CTRL_B, RTC_CTRL_B_SET }, + { "pie", RTC_CTRL_B, RTC_CTRL_B_PIE }, + { "aie", RTC_CTRL_B, RTC_CTRL_B_AIE }, + { "uie", RTC_CTRL_B, RTC_CTRL_B_UIE }, + { "sqwe", RTC_CTRL_B, RTC_CTRL_B_SQWE }, + { "dm", RTC_CTRL_B, RTC_CTRL_B_DM }, + { "2412", RTC_CTRL_B, RTC_CTRL_B_2412 }, + { "dse", RTC_CTRL_B, RTC_CTRL_B_DSE }, + { "irqf", RTC_CTRL_C, RTC_CTRL_C_IRQF }, + { "pf", RTC_CTRL_C, RTC_CTRL_C_PF }, + { "af", RTC_CTRL_C, RTC_CTRL_C_AF }, + { "uf", RTC_CTRL_C, RTC_CTRL_C_UF }, + { "vrt", RTC_CTRL_D, RTC_CTRL_D_VRT }, + { "vrt2", RTC_EXT_CTRL_4A, RTC_CTRL_4A_VRT2 }, + { "incr", RTC_EXT_CTRL_4A, RTC_CTRL_4A_INCR }, + { "pab", RTC_EXT_CTRL_4A, RTC_CTRL_4A_PAB }, + { "rf", RTC_EXT_CTRL_4A, RTC_CTRL_4A_RF }, + { "wf", RTC_EXT_CTRL_4A, RTC_CTRL_4A_WF }, + { "kf", RTC_EXT_CTRL_4A, RTC_CTRL_4A_KF }, +#if !defined(CONFIG_RTC_DRV_DS1685) && !defined(CONFIG_RTC_DRV_DS1689) + { "bme", RTC_EXT_CTRL_4A, RTC_CTRL_4A_BME }, +#endif + { "abe", RTC_EXT_CTRL_4B, RTC_CTRL_4B_ABE }, + { "e32k", RTC_EXT_CTRL_4B, RTC_CTRL_4B_E32K }, + { "cs", RTC_EXT_CTRL_4B, RTC_CTRL_4B_CS }, + { "rce", RTC_EXT_CTRL_4B, RTC_CTRL_4B_RCE }, + { "prs", RTC_EXT_CTRL_4B, RTC_CTRL_4B_PRS }, + { "rie", RTC_EXT_CTRL_4B, RTC_CTRL_4B_RIE }, + { "wie", RTC_EXT_CTRL_4B, RTC_CTRL_4B_WIE }, + { "kse", RTC_EXT_CTRL_4B, RTC_CTRL_4B_KSE }, + { NULL, 0, 0 }, +}; + +/** + * ds1685_rtc_sysfs_ctrl_regs_lookup - ctrl register bit lookup function. + * @name: ctrl register bit to look up in ds1685_ctrl_regs_table. + */ +static const struct ds1685_rtc_ctrl_regs* +ds1685_rtc_sysfs_ctrl_regs_lookup(const char *name) +{ + const struct ds1685_rtc_ctrl_regs *p = ds1685_ctrl_regs_table; + + for (; p->name != NULL; ++p) + if (strcmp(p->name, name) == 0) + return p; + + return NULL; +} + +/** + * ds1685_rtc_sysfs_ctrl_regs_show - reads a ctrl register bit via sysfs. + * @dev: pointer to device structure. + * @attr: pointer to device_attribute structure. + * @buf: pointer to char array to hold the output. + */ +static ssize_t +ds1685_rtc_sysfs_ctrl_regs_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + u8 tmp; + struct ds1685_priv *rtc = dev_get_drvdata(dev); + const struct ds1685_rtc_ctrl_regs *reg_info = + ds1685_rtc_sysfs_ctrl_regs_lookup(attr->attr.name); + + /* Make sure we actually matched something. */ + if (!reg_info) + return -EINVAL; + + /* No spinlock during a read -- mutex is already held. 
*/ + ds1685_rtc_switch_to_bank1(rtc); + tmp = rtc->read(rtc, reg_info->reg) & reg_info->bit; + ds1685_rtc_switch_to_bank0(rtc); + + return snprintf(buf, 2, "%d\n", (tmp ? 1 : 0)); +} + +/** + * ds1685_rtc_sysfs_ctrl_regs_store - writes a ctrl register bit via sysfs. + * @dev: pointer to device structure. + * @attr: pointer to device_attribute structure. + * @buf: pointer to char array to hold the output. + * @count: number of bytes written. + */ +static ssize_t +ds1685_rtc_sysfs_ctrl_regs_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct ds1685_priv *rtc = dev_get_drvdata(dev); + u8 reg = 0, bit = 0, tmp; + unsigned long flags = 0; + long int val = 0; + const struct ds1685_rtc_ctrl_regs *reg_info = + ds1685_rtc_sysfs_ctrl_regs_lookup(attr->attr.name); + + /* We only accept numbers. */ + if (kstrtol(buf, 10, &val) < 0) + return -EINVAL; + + /* bits are binary, 0 or 1 only. */ + if ((val != 0) && (val != 1)) + return -ERANGE; + + /* Make sure we actually matched something. */ + if (!reg_info) + return -EINVAL; + + reg = reg_info->reg; + bit = reg_info->bit; + + /* Safe to spinlock during a write. */ + ds1685_rtc_begin_ctrl_access(rtc, flags); + tmp = rtc->read(rtc, reg); + rtc->write(rtc, reg, (val ? (tmp | bit) : (tmp & ~(bit)))); + ds1685_rtc_end_ctrl_access(rtc, flags); + + return count; +} + +/** + * DS1685_RTC_SYSFS_CTRL_REG_RO - device_attribute for read-only register bit. + * @bit: bit to read. + */ +#define DS1685_RTC_SYSFS_CTRL_REG_RO(bit) \ + static DEVICE_ATTR(bit, S_IRUGO, \ + ds1685_rtc_sysfs_ctrl_regs_show, NULL) + +/** + * DS1685_RTC_SYSFS_CTRL_REG_RW - device_attribute for read-write register bit. + * @bit: bit to read or write. + */ +#define DS1685_RTC_SYSFS_CTRL_REG_RW(bit) \ + static DEVICE_ATTR(bit, S_IRUGO | S_IWUSR, \ + ds1685_rtc_sysfs_ctrl_regs_show, \ + ds1685_rtc_sysfs_ctrl_regs_store) + +/* + * Control Register A bits. + */ +DS1685_RTC_SYSFS_CTRL_REG_RO(uip); +DS1685_RTC_SYSFS_CTRL_REG_RW(dv2); +DS1685_RTC_SYSFS_CTRL_REG_RW(dv1); +DS1685_RTC_SYSFS_CTRL_REG_RO(dv0); +DS1685_RTC_SYSFS_CTRL_REG_RW(rs3); +DS1685_RTC_SYSFS_CTRL_REG_RW(rs2); +DS1685_RTC_SYSFS_CTRL_REG_RW(rs1); +DS1685_RTC_SYSFS_CTRL_REG_RW(rs0); + +static struct attribute* +ds1685_rtc_sysfs_ctrla_attrs[] = { + &dev_attr_uip.attr, + &dev_attr_dv2.attr, + &dev_attr_dv1.attr, + &dev_attr_dv0.attr, + &dev_attr_rs3.attr, + &dev_attr_rs2.attr, + &dev_attr_rs1.attr, + &dev_attr_rs0.attr, + NULL, +}; + +static const struct attribute_group +ds1685_rtc_sysfs_ctrla_grp = { + .name = "ctrla", + .attrs = ds1685_rtc_sysfs_ctrla_attrs, +}; + + +/* + * Control Register B bits. + */ +DS1685_RTC_SYSFS_CTRL_REG_RO(set); +DS1685_RTC_SYSFS_CTRL_REG_RW(pie); +DS1685_RTC_SYSFS_CTRL_REG_RW(aie); +DS1685_RTC_SYSFS_CTRL_REG_RW(uie); +DS1685_RTC_SYSFS_CTRL_REG_RW(sqwe); +DS1685_RTC_SYSFS_CTRL_REG_RO(dm); +DS1685_RTC_SYSFS_CTRL_REG_RO(2412); +DS1685_RTC_SYSFS_CTRL_REG_RO(dse); + +static struct attribute* +ds1685_rtc_sysfs_ctrlb_attrs[] = { + &dev_attr_set.attr, + &dev_attr_pie.attr, + &dev_attr_aie.attr, + &dev_attr_uie.attr, + &dev_attr_sqwe.attr, + &dev_attr_dm.attr, + &dev_attr_2412.attr, + &dev_attr_dse.attr, + NULL, +}; + +static const struct attribute_group +ds1685_rtc_sysfs_ctrlb_grp = { + .name = "ctrlb", + .attrs = ds1685_rtc_sysfs_ctrlb_attrs, +}; + +/* + * Control Register C bits. + * + * Reading Control C clears these bits! Reading them individually can + * possibly cause an interrupt to be missed. 
Use the /proc interface
+ * to see all the bits in this register simultaneously.
+ */
+DS1685_RTC_SYSFS_CTRL_REG_RO(irqf);
+DS1685_RTC_SYSFS_CTRL_REG_RO(pf);
+DS1685_RTC_SYSFS_CTRL_REG_RO(af);
+DS1685_RTC_SYSFS_CTRL_REG_RO(uf);
+
+static struct attribute*
+ds1685_rtc_sysfs_ctrlc_attrs[] = {
+	&dev_attr_irqf.attr,
+	&dev_attr_pf.attr,
+	&dev_attr_af.attr,
+	&dev_attr_uf.attr,
+	NULL,
+};
+
+static const struct attribute_group
+ds1685_rtc_sysfs_ctrlc_grp = {
+	.name = "ctrlc",
+	.attrs = ds1685_rtc_sysfs_ctrlc_attrs,
+};
+
+/*
+ * Control Register D bits.
+ */
+DS1685_RTC_SYSFS_CTRL_REG_RO(vrt);
+
+static struct attribute*
+ds1685_rtc_sysfs_ctrld_attrs[] = {
+	&dev_attr_vrt.attr,
+	NULL,
+};
+
+static const struct attribute_group
+ds1685_rtc_sysfs_ctrld_grp = {
+	.name = "ctrld",
+	.attrs = ds1685_rtc_sysfs_ctrld_attrs,
+};
+
+/*
+ * Control Register 4A bits.
+ */
+DS1685_RTC_SYSFS_CTRL_REG_RO(vrt2);
+DS1685_RTC_SYSFS_CTRL_REG_RO(incr);
+DS1685_RTC_SYSFS_CTRL_REG_RW(pab);
+DS1685_RTC_SYSFS_CTRL_REG_RW(rf);
+DS1685_RTC_SYSFS_CTRL_REG_RW(wf);
+DS1685_RTC_SYSFS_CTRL_REG_RW(kf);
+#if !defined(CONFIG_RTC_DRV_DS1685) && !defined(CONFIG_RTC_DRV_DS1689)
+DS1685_RTC_SYSFS_CTRL_REG_RO(bme);
+#endif
+
+static struct attribute*
+ds1685_rtc_sysfs_ctrl4a_attrs[] = {
+	&dev_attr_vrt2.attr,
+	&dev_attr_incr.attr,
+	&dev_attr_pab.attr,
+	&dev_attr_rf.attr,
+	&dev_attr_wf.attr,
+	&dev_attr_kf.attr,
+#if !defined(CONFIG_RTC_DRV_DS1685) && !defined(CONFIG_RTC_DRV_DS1689)
+	&dev_attr_bme.attr,
+#endif
+	NULL,
+};
+
+static const struct attribute_group
+ds1685_rtc_sysfs_ctrl4a_grp = {
+	.name = "ctrl4a",
+	.attrs = ds1685_rtc_sysfs_ctrl4a_attrs,
+};
+
+/*
+ * Control Register 4B bits.
+ */
+DS1685_RTC_SYSFS_CTRL_REG_RW(abe);
+DS1685_RTC_SYSFS_CTRL_REG_RW(e32k);
+DS1685_RTC_SYSFS_CTRL_REG_RO(cs);
+DS1685_RTC_SYSFS_CTRL_REG_RW(rce);
+DS1685_RTC_SYSFS_CTRL_REG_RW(prs);
+DS1685_RTC_SYSFS_CTRL_REG_RW(rie);
+DS1685_RTC_SYSFS_CTRL_REG_RW(wie);
+DS1685_RTC_SYSFS_CTRL_REG_RW(kse);
+
+static struct attribute*
+ds1685_rtc_sysfs_ctrl4b_attrs[] = {
+	&dev_attr_abe.attr,
+	&dev_attr_e32k.attr,
+	&dev_attr_cs.attr,
+	&dev_attr_rce.attr,
+	&dev_attr_prs.attr,
+	&dev_attr_rie.attr,
+	&dev_attr_wie.attr,
+	&dev_attr_kse.attr,
+	NULL,
+};
+
+static const struct attribute_group
+ds1685_rtc_sysfs_ctrl4b_grp = {
+	.name = "ctrl4b",
+	.attrs = ds1685_rtc_sysfs_ctrl4b_attrs,
+};
+
+
+/**
+ * struct ds1685_rtc_time_regs.
+ * @name: char pointer for the register name.
+ * @reg: time/date register the value is stored in.
+ * @mask: bit mask of the valid value bits in the register.
+ * @min: minimum acceptable value.
+ * @max: maximum acceptable value.
+ */
+struct ds1685_rtc_time_regs {
+	const char *name;
+	const u8 reg;
+	const u8 mask;
+	const u8 min;
+	const u8 max;
+};
+
+/*
+ * Time/Date register lookup tables.
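+ *
+ * The BCD and BIN tables must stay index-aligned: the sysfs show/store
+ * callbacks look up the same attribute name in both tables and assume
+ * that the reg fields match.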
+ */
+static const struct ds1685_rtc_time_regs
+ds1685_time_regs_bcd_table[] = {
+	{ "seconds",       RTC_SECS,       RTC_SECS_BCD_MASK,   0, 59 },
+	{ "minutes",       RTC_MINS,       RTC_MINS_BCD_MASK,   0, 59 },
+	{ "hours",         RTC_HRS,        RTC_HRS_24_BCD_MASK, 0, 23 },
+	{ "wday",          RTC_WDAY,       RTC_WDAY_MASK,       1,  7 },
+	{ "mday",          RTC_MDAY,       RTC_MDAY_BCD_MASK,   1, 31 },
+	{ "month",         RTC_MONTH,      RTC_MONTH_BCD_MASK,  1, 12 },
+	{ "year",          RTC_YEAR,       RTC_YEAR_BCD_MASK,   0, 99 },
+	{ "century",       RTC_CENTURY,    RTC_CENTURY_MASK,    0, 99 },
+	{ "alarm_seconds", RTC_SECS_ALARM, RTC_SECS_BCD_MASK,   0, 59 },
+	{ "alarm_minutes", RTC_MINS_ALARM, RTC_MINS_BCD_MASK,   0, 59 },
+	{ "alarm_hours",   RTC_HRS_ALARM,  RTC_HRS_24_BCD_MASK, 0, 23 },
+	{ "alarm_mday",    RTC_MDAY_ALARM, RTC_MDAY_ALARM_MASK, 1, 31 },
+	{ NULL, 0, 0, 0, 0 },
+};
+
+static const struct ds1685_rtc_time_regs
+ds1685_time_regs_bin_table[] = {
+	{ "seconds",       RTC_SECS,       RTC_SECS_BIN_MASK,   0x00, 0x3b },
+	{ "minutes",       RTC_MINS,       RTC_MINS_BIN_MASK,   0x00, 0x3b },
+	{ "hours",         RTC_HRS,        RTC_HRS_24_BIN_MASK, 0x00, 0x17 },
+	{ "wday",          RTC_WDAY,       RTC_WDAY_MASK,       0x01, 0x07 },
+	{ "mday",          RTC_MDAY,       RTC_MDAY_BIN_MASK,   0x01, 0x1f },
+	{ "month",         RTC_MONTH,      RTC_MONTH_BIN_MASK,  0x01, 0x0c },
+	{ "year",          RTC_YEAR,       RTC_YEAR_BIN_MASK,   0x00, 0x63 },
+	{ "century",       RTC_CENTURY,    RTC_CENTURY_MASK,    0x00, 0x63 },
+	{ "alarm_seconds", RTC_SECS_ALARM, RTC_SECS_BIN_MASK,   0x00, 0x3b },
+	{ "alarm_minutes", RTC_MINS_ALARM, RTC_MINS_BIN_MASK,   0x00, 0x3b },
+	{ "alarm_hours",   RTC_HRS_ALARM,  RTC_HRS_24_BIN_MASK, 0x00, 0x17 },
+	{ "alarm_mday",    RTC_MDAY_ALARM, RTC_MDAY_ALARM_MASK, 0x01, 0x1f },
+	{ NULL, 0, 0, 0x00, 0x00 },
+};
+
+/**
+ * ds1685_rtc_sysfs_time_regs_lookup - time/date register lookup function.
+ * @name: register name to look up in the BCD or BIN table.
+ * @bcd_mode: true to search the BCD table, false for the BIN table.
+ */
+static const struct ds1685_rtc_time_regs*
+ds1685_rtc_sysfs_time_regs_lookup(const char *name, bool bcd_mode)
+{
+	const struct ds1685_rtc_time_regs *p;
+
+	if (bcd_mode)
+		p = ds1685_time_regs_bcd_table;
+	else
+		p = ds1685_time_regs_bin_table;
+
+	for (; p->name != NULL; ++p)
+		if (strcmp(p->name, name) == 0)
+			return p;
+
+	return NULL;
+}
+
+/**
+ * ds1685_rtc_sysfs_time_regs_show - reads a time/date register via sysfs.
+ * @dev: pointer to device structure.
+ * @attr: pointer to device_attribute structure.
+ * @buf: pointer to char array to hold the output.
+ */
+static ssize_t
+ds1685_rtc_sysfs_time_regs_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	u8 tmp;
+	struct ds1685_priv *rtc = dev_get_drvdata(dev);
+	const struct ds1685_rtc_time_regs *bcd_reg_info =
+		ds1685_rtc_sysfs_time_regs_lookup(attr->attr.name, true);
+	const struct ds1685_rtc_time_regs *bin_reg_info =
+		ds1685_rtc_sysfs_time_regs_lookup(attr->attr.name, false);
+
+	/* Make sure we actually matched something. */
+	if (!bcd_reg_info && !bin_reg_info)
+		return -EINVAL;
+
+	/* bcd_reg_info->reg == bin_reg_info->reg. */
+	ds1685_rtc_begin_data_access(rtc);
+	tmp = rtc->read(rtc, bcd_reg_info->reg);
+	ds1685_rtc_end_data_access(rtc);
+
+	tmp = ds1685_rtc_bcd2bin(rtc, tmp, bcd_reg_info->mask,
+				 bin_reg_info->mask);
+
+	return snprintf(buf, 4, "%d\n", tmp);
+}
+
+/**
+ * ds1685_rtc_sysfs_time_regs_store - writes a time/date register via sysfs.
+ * @dev: pointer to device structure.
+ * @attr: pointer to device_attribute structure.
+ * @buf: pointer to char array holding the input value.
+ * @count: number of bytes in the input.
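+ *
+ * Returns count on success, or a negative errno on failure.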
+ */
+static ssize_t
+ds1685_rtc_sysfs_time_regs_store(struct device *dev,
+				 struct device_attribute *attr,
+				 const char *buf, size_t count)
+{
+	long val = 0;
+	struct ds1685_priv *rtc = dev_get_drvdata(dev);
+	const struct ds1685_rtc_time_regs *bcd_reg_info =
+		ds1685_rtc_sysfs_time_regs_lookup(attr->attr.name, true);
+	const struct ds1685_rtc_time_regs *bin_reg_info =
+		ds1685_rtc_sysfs_time_regs_lookup(attr->attr.name, false);
+
+	/* We only accept numbers. */
+	if (kstrtol(buf, 10, &val) < 0)
+		return -EINVAL;
+
+	/* Make sure we actually matched something. */
+	if (!bcd_reg_info && !bin_reg_info)
+		return -EINVAL;
+
+	/* Check for a valid range. */
+	if (rtc->bcd_mode) {
+		if ((val < bcd_reg_info->min) || (val > bcd_reg_info->max))
+			return -ERANGE;
+	} else {
+		if ((val < bin_reg_info->min) || (val > bin_reg_info->max))
+			return -ERANGE;
+	}
+
+	val = ds1685_rtc_bin2bcd(rtc, val, bin_reg_info->mask,
+				 bcd_reg_info->mask);
+
+	/* bcd_reg_info->reg == bin_reg_info->reg. */
+	ds1685_rtc_begin_data_access(rtc);
+	rtc->write(rtc, bcd_reg_info->reg, val);
+	ds1685_rtc_end_data_access(rtc);
+
+	return count;
+}
+
+/**
+ * DS1685_RTC_SYSFS_TIME_REG_RW - device_attribute for a read-write time register.
+ * @reg: time/date register to read or write.
+ */
+#define DS1685_RTC_SYSFS_TIME_REG_RW(reg) \
+	static DEVICE_ATTR(reg, S_IRUGO | S_IWUSR, \
+			   ds1685_rtc_sysfs_time_regs_show, \
+			   ds1685_rtc_sysfs_time_regs_store)
+
+/*
+ * Time/Date Register bits.
+ */
+DS1685_RTC_SYSFS_TIME_REG_RW(seconds);
+DS1685_RTC_SYSFS_TIME_REG_RW(minutes);
+DS1685_RTC_SYSFS_TIME_REG_RW(hours);
+DS1685_RTC_SYSFS_TIME_REG_RW(wday);
+DS1685_RTC_SYSFS_TIME_REG_RW(mday);
+DS1685_RTC_SYSFS_TIME_REG_RW(month);
+DS1685_RTC_SYSFS_TIME_REG_RW(year);
+DS1685_RTC_SYSFS_TIME_REG_RW(century);
+DS1685_RTC_SYSFS_TIME_REG_RW(alarm_seconds);
+DS1685_RTC_SYSFS_TIME_REG_RW(alarm_minutes);
+DS1685_RTC_SYSFS_TIME_REG_RW(alarm_hours);
+DS1685_RTC_SYSFS_TIME_REG_RW(alarm_mday);
+
+static struct attribute*
+ds1685_rtc_sysfs_time_attrs[] = {
+	&dev_attr_seconds.attr,
+	&dev_attr_minutes.attr,
+	&dev_attr_hours.attr,
+	&dev_attr_wday.attr,
+	&dev_attr_mday.attr,
+	&dev_attr_month.attr,
+	&dev_attr_year.attr,
+	&dev_attr_century.attr,
+	NULL,
+};
+
+static const struct attribute_group
+ds1685_rtc_sysfs_time_grp = {
+	.name = "datetime",
+	.attrs = ds1685_rtc_sysfs_time_attrs,
+};
+
+static struct attribute*
+ds1685_rtc_sysfs_alarm_attrs[] = {
+	&dev_attr_alarm_seconds.attr,
+	&dev_attr_alarm_minutes.attr,
+	&dev_attr_alarm_hours.attr,
+	&dev_attr_alarm_mday.attr,
+	NULL,
+};
+
+static const struct attribute_group
+ds1685_rtc_sysfs_alarm_grp = {
+	.name = "alarm",
+	.attrs = ds1685_rtc_sysfs_alarm_attrs,
+};
+#endif /* CONFIG_RTC_DS1685_SYSFS_REGS */
+
+
+/**
+ * ds1685_rtc_sysfs_register - register sysfs files.
+ * @dev: pointer to device structure.
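+ *
+ * Returns 0 on success, or a negative errno if creating any of the
+ * attribute files or groups fails.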
+ */ +static int +ds1685_rtc_sysfs_register(struct device *dev) +{ + int ret = 0; + + sysfs_bin_attr_init(&ds1685_rtc_sysfs_nvram_attr); + ret = sysfs_create_bin_file(&dev->kobj, &ds1685_rtc_sysfs_nvram_attr); + if (ret) + return ret; + + ret = sysfs_create_group(&dev->kobj, &ds1685_rtc_sysfs_misc_grp); + if (ret) + return ret; + +#ifdef CONFIG_RTC_DS1685_SYSFS_REGS + ret = sysfs_create_group(&dev->kobj, &ds1685_rtc_sysfs_ctrla_grp); + if (ret) + return ret; + + ret = sysfs_create_group(&dev->kobj, &ds1685_rtc_sysfs_ctrlb_grp); + if (ret) + return ret; + + ret = sysfs_create_group(&dev->kobj, &ds1685_rtc_sysfs_ctrlc_grp); + if (ret) + return ret; + + ret = sysfs_create_group(&dev->kobj, &ds1685_rtc_sysfs_ctrld_grp); + if (ret) + return ret; + + ret = sysfs_create_group(&dev->kobj, &ds1685_rtc_sysfs_ctrl4a_grp); + if (ret) + return ret; + + ret = sysfs_create_group(&dev->kobj, &ds1685_rtc_sysfs_ctrl4b_grp); + if (ret) + return ret; + + ret = sysfs_create_group(&dev->kobj, &ds1685_rtc_sysfs_time_grp); + if (ret) + return ret; + + ret = sysfs_create_group(&dev->kobj, &ds1685_rtc_sysfs_alarm_grp); + if (ret) + return ret; +#endif + return 0; +} + +/** + * ds1685_rtc_sysfs_unregister - unregister sysfs files. + * @dev: pointer to device structure. + */ +static int +ds1685_rtc_sysfs_unregister(struct device *dev) +{ + sysfs_remove_bin_file(&dev->kobj, &ds1685_rtc_sysfs_nvram_attr); + sysfs_remove_group(&dev->kobj, &ds1685_rtc_sysfs_misc_grp); + +#ifdef CONFIG_RTC_DS1685_SYSFS_REGS + sysfs_remove_group(&dev->kobj, &ds1685_rtc_sysfs_ctrla_grp); + sysfs_remove_group(&dev->kobj, &ds1685_rtc_sysfs_ctrlb_grp); + sysfs_remove_group(&dev->kobj, &ds1685_rtc_sysfs_ctrlc_grp); + sysfs_remove_group(&dev->kobj, &ds1685_rtc_sysfs_ctrld_grp); + sysfs_remove_group(&dev->kobj, &ds1685_rtc_sysfs_ctrl4a_grp); + sysfs_remove_group(&dev->kobj, &ds1685_rtc_sysfs_ctrl4b_grp); + sysfs_remove_group(&dev->kobj, &ds1685_rtc_sysfs_time_grp); + sysfs_remove_group(&dev->kobj, &ds1685_rtc_sysfs_alarm_grp); +#endif + + return 0; +} +#endif /* CONFIG_SYSFS */ + + + +/* ----------------------------------------------------------------------- */ +/* Driver Probe/Removal */ + +/** + * ds1685_rtc_probe - initializes rtc driver. + * @pdev: pointer to platform_device structure. + */ +static int +ds1685_rtc_probe(struct platform_device *pdev) +{ + struct rtc_device *rtc_dev; + struct resource *res; + struct ds1685_priv *rtc; + struct ds1685_rtc_platform_data *pdata; + u8 ctrla, ctrlb, hours; + unsigned char am_pm; + int ret = 0; + + /* Get the platform data. */ + pdata = (struct ds1685_rtc_platform_data *) pdev->dev.platform_data; + if (!pdata) + return -ENODEV; + + /* Allocate memory for the rtc device. */ + rtc = devm_kzalloc(&pdev->dev, sizeof(*rtc), GFP_KERNEL); + if (!rtc) + return -ENOMEM; + + /* + * Allocate/setup any IORESOURCE_MEM resources, if required. Not all + * platforms put the RTC in an easy-access place. Like the SGI Octane, + * which attaches the RTC to a "ByteBus", hooked to a SuperIO chip + * that sits behind the IOC3 PCI metadevice. + */ + if (pdata->alloc_io_resources) { + /* Get the platform resources. */ + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) + return -ENXIO; + rtc->size = resource_size(res); + + /* Request a memory region. */ + /* XXX: mmio-only for now. */ + if (!devm_request_mem_region(&pdev->dev, res->start, rtc->size, + pdev->name)) + return -EBUSY; + + /* + * Set the base address for the rtc, and ioremap its + * registers. 
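+		 * devm_ioremap() ties the mapping's lifetime to the
+		 * device, so no explicit iounmap() is needed on removal.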
+		 */
+		rtc->baseaddr = res->start;
+		rtc->regs = devm_ioremap(&pdev->dev, res->start, rtc->size);
+		if (!rtc->regs)
+			return -ENOMEM;
+	}
+	rtc->alloc_io_resources = pdata->alloc_io_resources;
+
+	/* Get the register step size. */
+	if (pdata->regstep > 0)
+		rtc->regstep = pdata->regstep;
+	else
+		rtc->regstep = 1;
+
+	/* Platform read function, else the mmio default if set up. */
+	if (pdata->plat_read)
+		rtc->read = pdata->plat_read;
+	else if (pdata->alloc_io_resources)
+		rtc->read = ds1685_read;
+	else
+		return -ENXIO;
+
+	/* Platform write function, else the mmio default if set up. */
+	if (pdata->plat_write)
+		rtc->write = pdata->plat_write;
+	else if (pdata->alloc_io_resources)
+		rtc->write = ds1685_write;
+	else
+		return -ENXIO;
+
+	/* Platform pre-shutdown function, if defined. */
+	if (pdata->plat_prepare_poweroff)
+		rtc->prepare_poweroff = pdata->plat_prepare_poweroff;
+
+	/* Platform wake_alarm function, if defined. */
+	if (pdata->plat_wake_alarm)
+		rtc->wake_alarm = pdata->plat_wake_alarm;
+
+	/* Platform post_ram_clear function, if defined. */
+	if (pdata->plat_post_ram_clear)
+		rtc->post_ram_clear = pdata->plat_post_ram_clear;
+
+	/* Init the spinlock, workqueue, & set the driver data. */
+	spin_lock_init(&rtc->lock);
+	INIT_WORK(&rtc->work, ds1685_rtc_work_queue);
+	platform_set_drvdata(pdev, rtc);
+
+	/* Turn the oscillator on if it is not already on (DV1 = 1). */
+	ctrla = rtc->read(rtc, RTC_CTRL_A);
+	if (!(ctrla & RTC_CTRL_A_DV1))
+		ctrla |= RTC_CTRL_A_DV1;
+
+	/* Enable the countdown chain (DV2 = 0). */
+	ctrla &= ~(RTC_CTRL_A_DV2);
+
+	/* Clear RS3-RS0 in Control A. */
+	ctrla &= ~(RTC_CTRL_A_RS_MASK);
+
+	/*
+	 * All done with Control A.  Switch to Bank 1 for the remainder of
+	 * the RTC setup so we have access to the extended functions.
+	 */
+	ctrla |= RTC_CTRL_A_DV0;
+	rtc->write(rtc, RTC_CTRL_A, ctrla);
+
+	/* Default to 32.768kHz output on the SQW pin (E32K = 1). */
+	rtc->write(rtc, RTC_EXT_CTRL_4B,
+		   (rtc->read(rtc, RTC_EXT_CTRL_4B) | RTC_CTRL_4B_E32K));
+
+	/* Set the SET bit in Control B so we can do some housekeeping. */
+	rtc->write(rtc, RTC_CTRL_B,
+		   (rtc->read(rtc, RTC_CTRL_B) | RTC_CTRL_B_SET));
+
+	/* Read Ext Ctrl 4A and check the INCR bit to avoid a lockout. */
+	while (rtc->read(rtc, RTC_EXT_CTRL_4A) & RTC_CTRL_4A_INCR)
+		cpu_relax();
+
+	/*
+	 * If the platform supports BCD mode, then set DM=0 in Control B.
+	 * Otherwise, set DM=1 for BIN mode.
+	 */
+	ctrlb = rtc->read(rtc, RTC_CTRL_B);
+	if (pdata->bcd_mode)
+		ctrlb &= ~(RTC_CTRL_B_DM);
+	else
+		ctrlb |= RTC_CTRL_B_DM;
+	rtc->bcd_mode = pdata->bcd_mode;
+
+	/*
+	 * Disable Daylight Savings Time (DSE = 0).
+	 * The RTC's built-in DST rules are hardcoded and long obsolete,
+	 * so we let the OS handle DST settings instead.
+	 */
+	if (ctrlb & RTC_CTRL_B_DSE)
+		ctrlb &= ~(RTC_CTRL_B_DSE);
+
+	/* Force 24-hour mode (2412 = 1). */
+	if (!(ctrlb & RTC_CTRL_B_2412)) {
+		/* Reinitialize the time hours. */
+		hours = rtc->read(rtc, RTC_HRS);
+		am_pm = hours & RTC_HRS_AMPM_MASK;
+		hours = ds1685_rtc_bcd2bin(rtc, hours, RTC_HRS_12_BCD_MASK,
+					   RTC_HRS_12_BIN_MASK);
+		/* 12AM maps to 00, 12PM stays 12, 1-11PM gain 12 hours. */
+		hours = ((hours % 12) + (am_pm ? 12 : 0));
+
+		/* Enable 24-hour mode. */
+		ctrlb |= RTC_CTRL_B_2412;
+
+		/* Write back to Control B, including DM & DSE bits. */
+		rtc->write(rtc, RTC_CTRL_B, ctrlb);
+
+		/* Write the time hours back. */
+		rtc->write(rtc, RTC_HRS,
+			   ds1685_rtc_bin2bcd(rtc, hours,
+					      RTC_HRS_24_BIN_MASK,
+					      RTC_HRS_24_BCD_MASK));
+
+		/* Reinitialize the alarm hours.
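+		 * The alarm hours receive the same 12-hour to 24-hour
+		 * conversion that the time hours received above.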
+		 */
+		hours = rtc->read(rtc, RTC_HRS_ALARM);
+		am_pm = hours & RTC_HRS_AMPM_MASK;
+		hours = ds1685_rtc_bcd2bin(rtc, hours, RTC_HRS_12_BCD_MASK,
+					   RTC_HRS_12_BIN_MASK);
+		hours = ((hours % 12) + (am_pm ? 12 : 0));
+
+		/* Write the alarm hours back. */
+		rtc->write(rtc, RTC_HRS_ALARM,
+			   ds1685_rtc_bin2bcd(rtc, hours,
+					      RTC_HRS_24_BIN_MASK,
+					      RTC_HRS_24_BCD_MASK));
+	} else {
+		/* 24-hour mode is already set, so write Control B back. */
+		rtc->write(rtc, RTC_CTRL_B, ctrlb);
+	}
+
+	/* Unset the SET bit in Control B so the RTC can update. */
+	rtc->write(rtc, RTC_CTRL_B,
+		   (rtc->read(rtc, RTC_CTRL_B) & ~(RTC_CTRL_B_SET)));
+
+	/* Check the main battery. */
+	if (!(rtc->read(rtc, RTC_CTRL_D) & RTC_CTRL_D_VRT))
+		dev_warn(&pdev->dev,
+			 "Main battery is exhausted! RTC may be invalid!\n");
+
+	/* Check the auxiliary battery.  It is optional. */
+	if (!(rtc->read(rtc, RTC_EXT_CTRL_4A) & RTC_CTRL_4A_VRT2))
+		dev_warn(&pdev->dev,
+			 "Aux battery is exhausted or not available.\n");
+
+	/* Read Ctrl B and clear PIE/AIE/UIE. */
+	rtc->write(rtc, RTC_CTRL_B,
+		   (rtc->read(rtc, RTC_CTRL_B) & ~(RTC_CTRL_B_PAU_MASK)));
+
+	/* Reading Ctrl C auto-clears PF/AF/UF. */
+	rtc->read(rtc, RTC_CTRL_C);
+
+	/* Read Ctrl 4B and clear RIE/WIE/KSE. */
+	rtc->write(rtc, RTC_EXT_CTRL_4B,
+		   (rtc->read(rtc, RTC_EXT_CTRL_4B) & ~(RTC_CTRL_4B_RWK_MASK)));
+
+	/* Clear RF/WF/KF in Ctrl 4A. */
+	rtc->write(rtc, RTC_EXT_CTRL_4A,
+		   (rtc->read(rtc, RTC_EXT_CTRL_4A) & ~(RTC_CTRL_4A_RWK_MASK)));
+
+	/*
+	 * Re-enable KSE to handle power button events.  We do not enable
+	 * WIE or RIE by default.
+	 */
+	rtc->write(rtc, RTC_EXT_CTRL_4B,
+		   (rtc->read(rtc, RTC_EXT_CTRL_4B) | RTC_CTRL_4B_KSE));
+
+	/*
+	 * Fetch the IRQ and setup the interrupt handler.
+	 *
+	 * Not all platforms have the IRQF pin tied to something.  If not,
+	 * the RTC will still set the *IE / *F flags and raise IRQF in
+	 * ctrlc, but there won't be an automatic way of notifying the
+	 * kernel about it, unless ctrlc is explicitly polled.
+	 */
+	if (!pdata->no_irq) {
+		ret = platform_get_irq(pdev, 0);
+		if (ret > 0) {
+			rtc->irq_num = ret;
+
+			/* Request an IRQ. */
+			ret = devm_request_irq(&pdev->dev, rtc->irq_num,
+					       ds1685_rtc_irq_handler,
+					       IRQF_SHARED, pdev->name, pdev);
+
+			/* Check to see if something came back. */
+			if (unlikely(ret)) {
+				dev_warn(&pdev->dev,
+					 "RTC interrupt not available\n");
+				rtc->irq_num = 0;
+			}
+		} else {
+			return ret;
+		}
+	}
+	rtc->no_irq = pdata->no_irq;
+
+	/* Setup complete. */
+	ds1685_rtc_switch_to_bank0(rtc);
+
+	/* Register the device as an RTC. */
+	rtc_dev = rtc_device_register(pdev->name, &pdev->dev,
+				      &ds1685_rtc_ops, THIS_MODULE);
+
+	/* Success? */
+	if (IS_ERR(rtc_dev))
+		return PTR_ERR(rtc_dev);
+
+	/* Maximum periodic rate is 8192Hz (0.122070ms). */
+	rtc_dev->max_user_freq = RTC_MAX_USER_FREQ;
+
+	/* See if the platform doesn't support UIE. */
+	if (pdata->uie_unsupported)
+		rtc_dev->uie_unsupported = 1;
+	rtc->uie_unsupported = pdata->uie_unsupported;
+
+	rtc->dev = rtc_dev;
+
+#ifdef CONFIG_SYSFS
+	ret = ds1685_rtc_sysfs_register(&pdev->dev);
+	if (ret)
+		rtc_device_unregister(rtc->dev);
+#endif
+
+	/* Done! */
+	return ret;
+}
+
+/**
+ * ds1685_rtc_remove - removes rtc driver.
+ * @pdev: pointer to platform_device structure.
+ */
+static int
+ds1685_rtc_remove(struct platform_device *pdev)
+{
+	struct ds1685_priv *rtc = platform_get_drvdata(pdev);
+
+#ifdef CONFIG_SYSFS
+	ds1685_rtc_sysfs_unregister(&pdev->dev);
+#endif
+
+	rtc_device_unregister(rtc->dev);
+
+	/* Read Ctrl B and clear PIE/AIE/UIE.
*/ + rtc->write(rtc, RTC_CTRL_B, + (rtc->read(rtc, RTC_CTRL_B) & + ~(RTC_CTRL_B_PAU_MASK))); + + /* Reading Ctrl C auto-clears PF/AF/UF. */ + rtc->read(rtc, RTC_CTRL_C); + + /* Read Ctrl 4B and clear RIE/WIE/KSE. */ + rtc->write(rtc, RTC_EXT_CTRL_4B, + (rtc->read(rtc, RTC_EXT_CTRL_4B) & + ~(RTC_CTRL_4B_RWK_MASK))); + + /* Manually clear RF/WF/KF in Ctrl 4A. */ + rtc->write(rtc, RTC_EXT_CTRL_4A, + (rtc->read(rtc, RTC_EXT_CTRL_4A) & + ~(RTC_CTRL_4A_RWK_MASK))); + + cancel_work_sync(&rtc->work); + + return 0; +} + +/** + * ds1685_rtc_driver - rtc driver properties. + */ +static struct platform_driver ds1685_rtc_driver = { + .driver = { + .name = "rtc-ds1685", + .owner = THIS_MODULE, + }, + .probe = ds1685_rtc_probe, + .remove = ds1685_rtc_remove, +}; + +/** + * ds1685_rtc_init - rtc module init. + */ +static int __init +ds1685_rtc_init(void) +{ + return platform_driver_register(&ds1685_rtc_driver); +} + +/** + * ds1685_rtc_exit - rtc module exit. + */ +static void __exit +ds1685_rtc_exit(void) +{ + platform_driver_unregister(&ds1685_rtc_driver); +} + +module_init(ds1685_rtc_init); +module_exit(ds1685_rtc_exit); +/* ----------------------------------------------------------------------- */ + + +/* ----------------------------------------------------------------------- */ +/* Poweroff function */ + +/** + * ds1685_rtc_poweroff - uses the RTC chip to power the system off. + * @pdev: pointer to platform_device structure. + */ +extern void __noreturn +ds1685_rtc_poweroff(struct platform_device *pdev) +{ + u8 ctrla, ctrl4a, ctrl4b; + struct ds1685_priv *rtc; + + /* Check for valid RTC data, else, spin forever. */ + if (unlikely(!pdev)) { + pr_emerg("rtc-ds1685: platform device data not available, spinning forever ...\n"); + unreachable(); + } else { + /* Get the rtc data. */ + rtc = platform_get_drvdata(pdev); + + /* + * Disable our IRQ. We're powering down, so we're not + * going to worry about cleaning up. Most of that should + * have been taken care of by the shutdown scripts and this + * is the final function call. + */ + if (!rtc->no_irq) + disable_irq_nosync(rtc->irq_num); + + /* Oscillator must be on and the countdown chain enabled. */ + ctrla = rtc->read(rtc, RTC_CTRL_A); + ctrla |= RTC_CTRL_A_DV1; + ctrla &= ~(RTC_CTRL_A_DV2); + rtc->write(rtc, RTC_CTRL_A, ctrla); + + /* + * Read Control 4A and check the status of the auxillary + * battery. This must be present and working (VRT2 = 1) + * for wakeup and kickstart functionality to be useful. + */ + ds1685_rtc_switch_to_bank1(rtc); + ctrl4a = rtc->read(rtc, RTC_EXT_CTRL_4A); + if (ctrl4a & RTC_CTRL_4A_VRT2) { + /* Clear all of the interrupt flags on Control 4A. */ + ctrl4a &= ~(RTC_CTRL_4A_RWK_MASK); + rtc->write(rtc, RTC_EXT_CTRL_4A, ctrl4a); + + /* + * The auxillary battery is present and working. + * Enable extended functions (ABE=1), enable + * wake-up (WIE=1), and enable kickstart (KSE=1) + * in Control 4B. + */ + ctrl4b = rtc->read(rtc, RTC_EXT_CTRL_4B); + ctrl4b |= (RTC_CTRL_4B_ABE | RTC_CTRL_4B_WIE | + RTC_CTRL_4B_KSE); + rtc->write(rtc, RTC_EXT_CTRL_4B, ctrl4b); + } + + /* Set PAB to 1 in Control 4A to power the system down. */ + dev_warn(&pdev->dev, "Powerdown.\n"); + msleep(20); + rtc->write(rtc, RTC_EXT_CTRL_4A, + (ctrl4a | RTC_CTRL_4A_PAB)); + + /* Spin ... we do not switch back to bank0. 
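+		 * Setting PAB should have dropped system power, so if
+		 * execution ever reaches this point there is nothing
+		 * sane left to do but spin.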
*/ + unreachable(); + } +} +EXPORT_SYMBOL(ds1685_rtc_poweroff); +/* ----------------------------------------------------------------------- */ + + +MODULE_AUTHOR("Joshua Kinard "); +MODULE_AUTHOR("Matthias Fuchs "); +MODULE_DESCRIPTION("Dallas/Maxim DS1685/DS1687-series RTC driver"); +MODULE_LICENSE("GPL"); +MODULE_VERSION(DRV_VERSION); +MODULE_ALIAS("platform:rtc-ds1685"); diff --git a/include/linux/rtc/ds1685.h b/include/linux/rtc/ds1685.h new file mode 100644 index 000000000000..e6337a56d741 --- /dev/null +++ b/include/linux/rtc/ds1685.h @@ -0,0 +1,375 @@ +/* + * Definitions for the registers, addresses, and platform data of the + * DS1685/DS1687-series RTC chips. + * + * This Driver also works for the DS17X85/DS17X87 RTC chips. Functionally + * similar to the DS1685/DS1687, they support a few extra features which + * include larger, battery-backed NV-SRAM, burst-mode access, and an RTC + * write counter. + * + * Copyright (C) 2011-2014 Joshua Kinard . + * Copyright (C) 2009 Matthias Fuchs . + * + * References: + * DS1685/DS1687 3V/5V Real-Time Clocks, 19-5215, Rev 4/10. + * DS17x85/DS17x87 3V/5V Real-Time Clocks, 19-5222, Rev 4/10. + * DS1689/DS1693 3V/5V Serialized Real-Time Clocks, Rev 112105. + * Application Note 90, Using the Multiplex Bus RTC Extended Features. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _LINUX_RTC_DS1685_H_ +#define _LINUX_RTC_DS1685_H_ + +#include +#include +#include + +/** + * struct ds1685_priv - DS1685 private data structure. + * @dev: pointer to the rtc_device structure. + * @regs: iomapped base address pointer of the RTC registers. + * @regstep: padding/step size between registers (optional). + * @baseaddr: base address of the RTC device. + * @size: resource size. + * @lock: private lock variable for spin locking/unlocking. + * @work: private workqueue. + * @irq: IRQ number assigned to the RTC device. + * @prepare_poweroff: pointer to platform pre-poweroff function. + * @wake_alarm: pointer to platform wake alarm function. + * @post_ram_clear: pointer to platform post ram-clear function. + */ +struct ds1685_priv { + struct rtc_device *dev; + void __iomem *regs; + u32 regstep; + resource_size_t baseaddr; + size_t size; + spinlock_t lock; + struct work_struct work; + int irq_num; + bool bcd_mode; + bool no_irq; + bool uie_unsupported; + bool alloc_io_resources; + u8 (*read)(struct ds1685_priv *, int); + void (*write)(struct ds1685_priv *, int, u8); + void (*prepare_poweroff)(void); + void (*wake_alarm)(void); + void (*post_ram_clear)(void); +}; + + +/** + * struct ds1685_rtc_platform_data - platform data structure. + * @plat_prepare_poweroff: platform-specific pre-poweroff function. + * @plat_wake_alarm: platform-specific wake alarm function. + * @plat_post_ram_clear: platform-specific post ram-clear function. + * + * If your platform needs to use a custom padding/step size between + * registers, or uses one or more of the extended interrupts and needs special + * handling, then include this header file in your platform definition and + * set regstep and the plat_* pointers as appropriate. 
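+ *
+ * A minimal, hypothetical board-file sketch (the foo_* names are
+ * illustrative only and not part of this interface):
+ *
+ *	static struct ds1685_rtc_platform_data foo_rtc_pdata = {
+ *		.regstep		= 4,
+ *		.bcd_mode		= true,
+ *		.alloc_io_resources	= true,
+ *	};
+ *
+ * registered together with a "rtc-ds1685" platform device whose
+ * IORESOURCE_MEM resource covers the chip's register window.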
+ */ +struct ds1685_rtc_platform_data { + const u32 regstep; + const bool bcd_mode; + const bool no_irq; + const bool uie_unsupported; + const bool alloc_io_resources; + u8 (*plat_read)(struct ds1685_priv *, int); + void (*plat_write)(struct ds1685_priv *, int, u8); + void (*plat_prepare_poweroff)(void); + void (*plat_wake_alarm)(void); + void (*plat_post_ram_clear)(void); +}; + + +/* + * Time Registers. + */ +#define RTC_SECS 0x00 /* Seconds 00-59 */ +#define RTC_SECS_ALARM 0x01 /* Alarm Seconds 00-59 */ +#define RTC_MINS 0x02 /* Minutes 00-59 */ +#define RTC_MINS_ALARM 0x03 /* Alarm Minutes 00-59 */ +#define RTC_HRS 0x04 /* Hours 01-12 AM/PM || 00-23 */ +#define RTC_HRS_ALARM 0x05 /* Alarm Hours 01-12 AM/PM || 00-23 */ +#define RTC_WDAY 0x06 /* Day of Week 01-07 */ +#define RTC_MDAY 0x07 /* Day of Month 01-31 */ +#define RTC_MONTH 0x08 /* Month 01-12 */ +#define RTC_YEAR 0x09 /* Year 00-99 */ +#define RTC_CENTURY 0x48 /* Century 00-99 */ +#define RTC_MDAY_ALARM 0x49 /* Alarm Day of Month 01-31 */ + + +/* + * Bit masks for the Time registers in BCD Mode (DM = 0). + */ +#define RTC_SECS_BCD_MASK 0x7f /* - x x x x x x x */ +#define RTC_MINS_BCD_MASK 0x7f /* - x x x x x x x */ +#define RTC_HRS_12_BCD_MASK 0x1f /* - - - x x x x x */ +#define RTC_HRS_24_BCD_MASK 0x3f /* - - x x x x x x */ +#define RTC_MDAY_BCD_MASK 0x3f /* - - x x x x x x */ +#define RTC_MONTH_BCD_MASK 0x1f /* - - - x x x x x */ +#define RTC_YEAR_BCD_MASK 0xff /* x x x x x x x x */ + +/* + * Bit masks for the Time registers in BIN Mode (DM = 1). + */ +#define RTC_SECS_BIN_MASK 0x3f /* - - x x x x x x */ +#define RTC_MINS_BIN_MASK 0x3f /* - - x x x x x x */ +#define RTC_HRS_12_BIN_MASK 0x0f /* - - - - x x x x */ +#define RTC_HRS_24_BIN_MASK 0x1f /* - - - x x x x x */ +#define RTC_MDAY_BIN_MASK 0x1f /* - - - x x x x x */ +#define RTC_MONTH_BIN_MASK 0x0f /* - - - - x x x x */ +#define RTC_YEAR_BIN_MASK 0x7f /* - x x x x x x x */ + +/* + * Bit masks common for the Time registers in BCD or BIN Mode. + */ +#define RTC_WDAY_MASK 0x07 /* - - - - - x x x */ +#define RTC_CENTURY_MASK 0xff /* x x x x x x x x */ +#define RTC_MDAY_ALARM_MASK 0xff /* x x x x x x x x */ +#define RTC_HRS_AMPM_MASK BIT(7) /* Mask for the AM/PM bit */ + + + +/* + * Control Registers. + */ +#define RTC_CTRL_A 0x0a /* Control Register A */ +#define RTC_CTRL_B 0x0b /* Control Register B */ +#define RTC_CTRL_C 0x0c /* Control Register C */ +#define RTC_CTRL_D 0x0d /* Control Register D */ +#define RTC_EXT_CTRL_4A 0x4a /* Extended Control Register 4A */ +#define RTC_EXT_CTRL_4B 0x4b /* Extended Control Register 4B */ + + +/* + * Bit names in Control Register A. + */ +#define RTC_CTRL_A_UIP BIT(7) /* Update In Progress */ +#define RTC_CTRL_A_DV2 BIT(6) /* Countdown Chain */ +#define RTC_CTRL_A_DV1 BIT(5) /* Oscillator Enable */ +#define RTC_CTRL_A_DV0 BIT(4) /* Bank Select */ +#define RTC_CTRL_A_RS2 BIT(2) /* Rate-Selection Bit 2 */ +#define RTC_CTRL_A_RS3 BIT(3) /* Rate-Selection Bit 3 */ +#define RTC_CTRL_A_RS1 BIT(1) /* Rate-Selection Bit 1 */ +#define RTC_CTRL_A_RS0 BIT(0) /* Rate-Selection Bit 0 */ +#define RTC_CTRL_A_RS_MASK 0x0f /* RS3 + RS2 + RS1 + RS0 */ + +/* + * Bit names in Control Register B. 
+ */
+#define RTC_CTRL_B_SET		BIT(7)	/* SET Bit */
+#define RTC_CTRL_B_PIE		BIT(6)	/* Periodic-Interrupt Enable */
+#define RTC_CTRL_B_AIE		BIT(5)	/* Alarm-Interrupt Enable */
+#define RTC_CTRL_B_UIE		BIT(4)	/* Update-Ended Interrupt-Enable */
+#define RTC_CTRL_B_SQWE		BIT(3)	/* Square-Wave Enable */
+#define RTC_CTRL_B_DM		BIT(2)	/* Data Mode */
+#define RTC_CTRL_B_2412		BIT(1)	/* 12-Hr/24-Hr Mode */
+#define RTC_CTRL_B_DSE		BIT(0)	/* Daylight Savings Enable */
+#define RTC_CTRL_B_PAU_MASK	0x70	/* PIE + AIE + UIE */
+
+
+/*
+ * Bit names in Control Register C.
+ *
+ * BIT(0), BIT(1), BIT(2), & BIT(3) are unused, always return 0, and cannot
+ * be written to.
+ */
+#define RTC_CTRL_C_IRQF		BIT(7)	/* Interrupt-Request Flag */
+#define RTC_CTRL_C_PF		BIT(6)	/* Periodic-Interrupt Flag */
+#define RTC_CTRL_C_AF		BIT(5)	/* Alarm-Interrupt Flag */
+#define RTC_CTRL_C_UF		BIT(4)	/* Update-Ended Interrupt Flag */
+#define RTC_CTRL_C_PAU_MASK	0x70	/* PF + AF + UF */
+
+
+/*
+ * Bit names in Control Register D.
+ *
+ * BIT(0) through BIT(6) are unused, always return 0, and cannot
+ * be written to.
+ */
+#define RTC_CTRL_D_VRT		BIT(7)	/* Valid RAM and Time */
+
+
+/*
+ * Bit names in Extended Control Register 4A.
+ *
+ * On the DS1685/DS1687/DS1689/DS1693, BIT(4) and BIT(5) are reserved for
+ * future use.  They can be read from and written to, but have no effect
+ * on the RTC's operation.
+ *
+ * On the DS17x85/DS17x87, BIT(5) is Burst-Mode Enable (BME), and allows
+ * access to the extended NV-SRAM by automatically incrementing the address
+ * register when they are read from or written to.
+ */
+#define RTC_CTRL_4A_VRT2	BIT(7)	/* Auxiliary Battery Status */
+#define RTC_CTRL_4A_INCR	BIT(6)	/* Increment-in-Progress Status */
+#define RTC_CTRL_4A_PAB		BIT(3)	/* Power-Active Bar Control */
+#define RTC_CTRL_4A_RF		BIT(2)	/* RAM-Clear Flag */
+#define RTC_CTRL_4A_WF		BIT(1)	/* Wake-Up Alarm Flag */
+#define RTC_CTRL_4A_KF		BIT(0)	/* Kickstart Flag */
+#if !defined(CONFIG_RTC_DRV_DS1685) && !defined(CONFIG_RTC_DRV_DS1689)
+#define RTC_CTRL_4A_BME		BIT(5)	/* Burst-Mode Enable */
+#endif
+#define RTC_CTRL_4A_RWK_MASK	0x07	/* RF + WF + KF */
+
+
+/*
+ * Bit names in Extended Control Register 4B.
+ */
+#define RTC_CTRL_4B_ABE		BIT(7)	/* Auxiliary Battery Enable */
+#define RTC_CTRL_4B_E32K	BIT(6)	/* Enable 32.768kHz on SQW Pin */
+#define RTC_CTRL_4B_CS		BIT(5)	/* Crystal Select */
+#define RTC_CTRL_4B_RCE		BIT(4)	/* RAM Clear-Enable */
+#define RTC_CTRL_4B_PRS		BIT(3)	/* PAB Reset-Select */
+#define RTC_CTRL_4B_RIE		BIT(2)	/* RAM Clear-Interrupt Enable */
+#define RTC_CTRL_4B_WIE		BIT(1)	/* Wake-Up Alarm-Interrupt Enable */
+#define RTC_CTRL_4B_KSE		BIT(0)	/* Kickstart Interrupt-Enable */
+#define RTC_CTRL_4B_RWK_MASK	0x07	/* RIE + WIE + KSE */
+
+
+/*
+ * Misc register names in Bank 1.
+ *
+ * The DV0 bit in Control Register A must be set to 1 for these registers
+ * to become available, including Extended Control Registers 4A & 4B.
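+ * (DV0 is the Bank Select bit defined above.)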
+ */ +#define RTC_BANK1_SSN_MODEL 0x40 /* Model Number */ +#define RTC_BANK1_SSN_BYTE_1 0x41 /* 1st Byte of Serial Number */ +#define RTC_BANK1_SSN_BYTE_2 0x42 /* 2nd Byte of Serial Number */ +#define RTC_BANK1_SSN_BYTE_3 0x43 /* 3rd Byte of Serial Number */ +#define RTC_BANK1_SSN_BYTE_4 0x44 /* 4th Byte of Serial Number */ +#define RTC_BANK1_SSN_BYTE_5 0x45 /* 5th Byte of Serial Number */ +#define RTC_BANK1_SSN_BYTE_6 0x46 /* 6th Byte of Serial Number */ +#define RTC_BANK1_SSN_CRC 0x47 /* Serial CRC Byte */ +#define RTC_BANK1_RAM_DATA_PORT 0x53 /* Extended RAM Data Port */ + + +/* + * Model-specific registers in Bank 1. + * + * The addresses below differ depending on the model of the RTC chip + * selected in the kernel configuration. Not all of these features are + * supported in the main driver at present. + * + * DS1685/DS1687 - Extended NV-SRAM address (LSB only). + * DS1689/DS1693 - Vcc, Vbat, Pwr Cycle Counters & Customer-specific S/N. + * DS17x85/DS17x87 - Extended NV-SRAM addresses (MSB & LSB) & Write counter. + */ +#if defined(CONFIG_RTC_DRV_DS1685) +#define RTC_BANK1_RAM_ADDR 0x50 /* NV-SRAM Addr */ +#elif defined(CONFIG_RTC_DRV_DS1689) +#define RTC_BANK1_VCC_CTR_LSB 0x54 /* Vcc Counter Addr (LSB) */ +#define RTC_BANK1_VCC_CTR_MSB 0x57 /* Vcc Counter Addr (MSB) */ +#define RTC_BANK1_VBAT_CTR_LSB 0x58 /* Vbat Counter Addr (LSB) */ +#define RTC_BANK1_VBAT_CTR_MSB 0x5b /* Vbat Counter Addr (MSB) */ +#define RTC_BANK1_PWR_CTR_LSB 0x5c /* Pwr Cycle Counter Addr (LSB) */ +#define RTC_BANK1_PWR_CTR_MSB 0x5d /* Pwr Cycle Counter Addr (MSB) */ +#define RTC_BANK1_UNIQ_SN 0x60 /* Customer-specific S/N */ +#else /* DS17x85/DS17x87 */ +#define RTC_BANK1_RAM_ADDR_LSB 0x50 /* NV-SRAM Addr (LSB) */ +#define RTC_BANK1_RAM_ADDR_MSB 0x51 /* NV-SRAM Addr (MSB) */ +#define RTC_BANK1_WRITE_CTR 0x5e /* RTC Write Counter */ +#endif + + +/* + * Model numbers. + * + * The DS1688/DS1691 and DS1689/DS1693 chips share the same model number + * and the manual doesn't indicate any major differences. As such, they + * are regarded as the same chip in this driver. + */ +#define RTC_MODEL_DS1685 0x71 /* DS1685/DS1687 */ +#define RTC_MODEL_DS17285 0x72 /* DS17285/DS17287 */ +#define RTC_MODEL_DS1689 0x73 /* DS1688/DS1691/DS1689/DS1693 */ +#define RTC_MODEL_DS17485 0x74 /* DS17485/DS17487 */ +#define RTC_MODEL_DS17885 0x78 /* DS17885/DS17887 */ + + +/* + * Periodic Interrupt Rates / Square-Wave Output Frequency + * + * Periodic rates are selected by setting the RS3-RS0 bits in Control + * Register A and enabled via either the E32K bit in Extended Control + * Register 4B or the SQWE bit in Control Register B. + * + * E32K overrides the settings of RS3-RS0 and outputs a frequency of 32768Hz + * on the SQW pin of the RTC chip. While there are 16 possible selections, + * the 1-of-16 decoder is only able to divide the base 32768Hz signal into 13 + * smaller frequencies. The values 0x01 and 0x02 are not used and are + * synonymous with 0x08 and 0x09, respectively. + * + * When E32K is set to a logic 1, periodic interrupts are disabled and reading + * /dev/rtc will return -EINVAL. This also applies if the periodic interrupt + * frequency is set to 0Hz. + * + * Not currently used by the rtc-ds1685 driver because the RTC core removed + * support for hardware-generated periodic-interrupts in favour of + * hrtimer-generated interrupts. But these defines are kept around for use + * in userland, as documentation to the hardware, and possible future use if + * hardware-generated periodic interrupts are ever added back. 
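+ *
+ * As can be read off the table below, every selection from 0x03 through
+ * 0x0f works out to a frequency of (65536 >> RS); for example, a setting
+ * of 0x06 gives 65536 >> 6 = 1024Hz.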
+ */ + /* E32K RS3 RS2 RS1 RS0 */ +#define RTC_SQW_8192HZ 0x03 /* 0 0 0 1 1 */ +#define RTC_SQW_4096HZ 0x04 /* 0 0 1 0 0 */ +#define RTC_SQW_2048HZ 0x05 /* 0 0 1 0 1 */ +#define RTC_SQW_1024HZ 0x06 /* 0 0 1 1 0 */ +#define RTC_SQW_512HZ 0x07 /* 0 0 1 1 1 */ +#define RTC_SQW_256HZ 0x08 /* 0 1 0 0 0 */ +#define RTC_SQW_128HZ 0x09 /* 0 1 0 0 1 */ +#define RTC_SQW_64HZ 0x0a /* 0 1 0 1 0 */ +#define RTC_SQW_32HZ 0x0b /* 0 1 0 1 1 */ +#define RTC_SQW_16HZ 0x0c /* 0 1 1 0 0 */ +#define RTC_SQW_8HZ 0x0d /* 0 1 1 0 1 */ +#define RTC_SQW_4HZ 0x0e /* 0 1 1 1 0 */ +#define RTC_SQW_2HZ 0x0f /* 0 1 1 1 1 */ +#define RTC_SQW_0HZ 0x00 /* 0 0 0 0 0 */ +#define RTC_SQW_32768HZ 32768 /* 1 - - - - */ +#define RTC_MAX_USER_FREQ 8192 + + +/* + * NVRAM data & addresses: + * - 50 bytes of NVRAM are available just past the clock registers. + * - 64 additional bytes are available in Bank0. + * + * Extended, battery-backed NV-SRAM: + * - DS1685/DS1687 - 128 bytes. + * - DS1689/DS1693 - 0 bytes. + * - DS17285/DS17287 - 2048 bytes. + * - DS17485/DS17487 - 4096 bytes. + * - DS17885/DS17887 - 8192 bytes. + */ +#define NVRAM_TIME_BASE 0x0e /* NVRAM Addr in Time regs */ +#define NVRAM_BANK0_BASE 0x40 /* NVRAM Addr in Bank0 regs */ +#define NVRAM_SZ_TIME 50 +#define NVRAM_SZ_BANK0 64 +#if defined(CONFIG_RTC_DRV_DS1685) +# define NVRAM_SZ_EXTND 128 +#elif defined(CONFIG_RTC_DRV_DS1689) +# define NVRAM_SZ_EXTND 0 +#elif defined(CONFIG_RTC_DRV_DS17285) +# define NVRAM_SZ_EXTND 2048 +#elif defined(CONFIG_RTC_DRV_DS17485) +# define NVRAM_SZ_EXTND 4096 +#elif defined(CONFIG_RTC_DRV_DS17885) +# define NVRAM_SZ_EXTND 8192 +#endif +#define NVRAM_TOTAL_SZ_BANK0 (NVRAM_SZ_TIME + NVRAM_SZ_BANK0) +#define NVRAM_TOTAL_SZ (NVRAM_TOTAL_SZ_BANK0 + NVRAM_SZ_EXTND) + + +/* + * Function Prototypes. + */ +extern void __noreturn +ds1685_rtc_poweroff(struct platform_device *pdev); + +#endif /* _LINUX_RTC_DS1685_H_ */ -- cgit v1.2.3 From f9d904acb3e7e9edb470cd824fe2a21bb0df86b8 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Wed, 7 Jan 2015 10:14:43 -0800 Subject: HID: hid-sensor-hub: Correct documentation During changes to the interface, some documentation field comments were missed. Added missing comments. Signed-off-by: Srinivas Pandruvada Signed-off-by: Jiri Kosina --- include/linux/hid-sensor-hub.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hid-sensor-hub.h b/include/linux/hid-sensor-hub.h index 51f7ccadf923..4173a8fdad9e 100644 --- a/include/linux/hid-sensor-hub.h +++ b/include/linux/hid-sensor-hub.h @@ -33,6 +33,8 @@ * @units: Measurment unit for this attribute. * @unit_expo: Exponent used in the data. * @size: Size in bytes for data size. + * @logical_minimum: Logical minimum value for this attribute. + * @logical_maximum: Logical maximum value for this attribute. */ struct hid_sensor_hub_attribute_info { u32 usage_id; @@ -146,6 +148,7 @@ int sensor_hub_input_get_attribute_info(struct hid_sensor_hub_device *hsdev, /** * sensor_hub_input_attr_get_raw_value() - Synchronous read request +* @hsdev: Hub device instance. * @usage_id: Attribute usage id of parent physical device as per spec * @attr_usage_id: Attribute usage id as per spec * @report_id: Report id to look for @@ -160,6 +163,7 @@ int sensor_hub_input_attr_get_raw_value(struct hid_sensor_hub_device *hsdev, u32 attr_usage_id, u32 report_id); /** * sensor_hub_set_feature() - Feature set request +* @hsdev: Hub device instance. 
* @report_id: Report id to look for * @field_index: Field index inside a report * @value: Value to set @@ -172,6 +176,7 @@ int sensor_hub_set_feature(struct hid_sensor_hub_device *hsdev, u32 report_id, /** * sensor_hub_get_feature() - Feature get request +* @hsdev: Hub device instance. * @report_id: Report id to look for * @field_index: Field index inside a report * @value: Place holder for return value -- cgit v1.2.3 From e59b4e9187bd5175b9845dc10fedb0879b7efbfd Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 21 Jan 2015 20:03:40 +0000 Subject: debugfs: Provide a file creation function that also takes an initial size Provide a file creation function that also takes an initial size so that the caller doesn't have to set i_size, thus meaning that we don't have to deal with ->d_inode in the callers. Signed-off-by: David Howells Signed-off-by: Al Viro --- drivers/infiniband/hw/cxgb4/device.c | 35 ++++++------------- drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c | 9 ++--- drivers/scsi/csiostor/csio_init.c | 9 ++--- drivers/usb/gadget/udc/atmel_usba_udc.c | 15 ++++---- fs/debugfs/inode.c | 40 ++++++++++++++++++++++ include/linux/debugfs.h | 13 +++++++ 6 files changed, 79 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c index eb5df4e62703..391309a55360 100644 --- a/drivers/infiniband/hw/cxgb4/device.c +++ b/drivers/infiniband/hw/cxgb4/device.c @@ -700,37 +700,24 @@ static const struct file_operations ep_debugfs_fops = { static int setup_debugfs(struct c4iw_dev *devp) { - struct dentry *de; - if (!devp->debugfs_root) return -1; - de = debugfs_create_file("qps", S_IWUSR, devp->debugfs_root, - (void *)devp, &qp_debugfs_fops); - if (de && de->d_inode) - de->d_inode->i_size = 4096; + debugfs_create_file_size("qps", S_IWUSR, devp->debugfs_root, + (void *)devp, &qp_debugfs_fops, 4096); - de = debugfs_create_file("stags", S_IWUSR, devp->debugfs_root, - (void *)devp, &stag_debugfs_fops); - if (de && de->d_inode) - de->d_inode->i_size = 4096; + debugfs_create_file_size("stags", S_IWUSR, devp->debugfs_root, - (void *)devp, &stag_debugfs_fops, 4096); - de = debugfs_create_file("stats", S_IWUSR, devp->debugfs_root, - (void *)devp, &stats_debugfs_fops); - if (de && de->d_inode) - de->d_inode->i_size = 4096; + debugfs_create_file_size("stats", S_IWUSR, devp->debugfs_root, + (void *)devp, &stats_debugfs_fops, 4096); - de = debugfs_create_file("eps", S_IWUSR, devp->debugfs_root, - (void *)devp, &ep_debugfs_fops); - if (de && de->d_inode) - de->d_inode->i_size = 4096; + debugfs_create_file_size("eps", S_IWUSR, devp->debugfs_root, + (void *)devp, &ep_debugfs_fops, 4096); - if (c4iw_wr_log) { - de = debugfs_create_file("wr_log", S_IWUSR, devp->debugfs_root, - (void *)devp, &wr_log_debugfs_fops); - if (de && de->d_inode) - de->d_inode->i_size = 4096; - } + if (c4iw_wr_log) + debugfs_create_file_size("wr_log", S_IWUSR, devp->debugfs_root, + (void *)devp, &wr_log_debugfs_fops, 4096); return 0; } diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c index c98a350d857e..4c7fe447e407 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c @@ -91,12 +91,9 @@ static const struct file_operations mem_debugfs_fops = { static void add_debugfs_mem(struct adapter *adap, const char *name, unsigned int idx, unsigned int size_mb) { - struct dentry *de; - - de = debugfs_create_file(name,
S_IRUSR, adap->debugfs_root, - (void *)adap + idx, &mem_debugfs_fops); - if (de && de->d_inode) - de->d_inode->i_size = size_mb << 20; + debugfs_create_file_size(name, S_IRUSR, adap->debugfs_root, + (void *)adap + idx, &mem_debugfs_fops, + size_mb << 20); } /* Add an array of Debug FS files. diff --git a/drivers/scsi/csiostor/csio_init.c b/drivers/scsi/csiostor/csio_init.c index 34d20cc3e110..2617f15eaeeb 100644 --- a/drivers/scsi/csiostor/csio_init.c +++ b/drivers/scsi/csiostor/csio_init.c @@ -113,12 +113,9 @@ static const struct file_operations csio_mem_debugfs_fops = { void csio_add_debugfs_mem(struct csio_hw *hw, const char *name, unsigned int idx, unsigned int size_mb) { - struct dentry *de; - - de = debugfs_create_file(name, S_IRUSR, hw->debugfs_root, - (void *)hw + idx, &csio_mem_debugfs_fops); - if (de && de->d_inode) - de->d_inode->i_size = size_mb << 20; + debugfs_create_file_size(name, S_IRUSR, hw->debugfs_root, + (void *)hw + idx, &csio_mem_debugfs_fops, + size_mb << 20); } static int csio_setup_debugfs(struct csio_hw *hw) diff --git a/drivers/usb/gadget/udc/atmel_usba_udc.c b/drivers/usb/gadget/udc/atmel_usba_udc.c index 9f93bed42052..b2239a2d4941 100644 --- a/drivers/usb/gadget/udc/atmel_usba_udc.c +++ b/drivers/usb/gadget/udc/atmel_usba_udc.c @@ -264,14 +264,17 @@ static void usba_init_debugfs(struct usba_udc *udc) goto err_root; udc->debugfs_root = root; - regs = debugfs_create_file("regs", 0400, root, udc, ®s_dbg_fops); - if (!regs) - goto err_regs; - regs_resource = platform_get_resource(udc->pdev, IORESOURCE_MEM, CTRL_IOMEM_ID); - regs->d_inode->i_size = resource_size(regs_resource); - udc->debugfs_regs = regs; + + if (regs_resource) { + regs = debugfs_create_file_size("regs", 0400, root, udc, + ®s_dbg_fops, + resource_size(regs_resource)); + if (!regs) + goto err_regs; + udc->debugfs_regs = regs; + } usba_ep_init_debugfs(udc, to_usba_ep(udc->gadget.ep0)); diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 957c40ce09f6..45b18a5e225c 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -337,6 +337,46 @@ struct dentry *debugfs_create_file(const char *name, umode_t mode, } EXPORT_SYMBOL_GPL(debugfs_create_file); +/** + * debugfs_create_file_size - create a file in the debugfs filesystem + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have. + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is NULL, then the + * file will be created in the root of the debugfs filesystem. + * @data: a pointer to something that the caller will want to get to later + * on. The inode.i_private pointer will point to this value on + * the open() call. + * @fops: a pointer to a struct file_operations that should be used for + * this file. + * @file_size: initial file size + * + * This is the basic "create a file" function for debugfs. It allows for a + * wide range of flexibility in creating a file, or a directory (if you want + * to create a directory, the debugfs_create_dir() function is + * recommended to be used instead.) + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the debugfs_remove() function when the file is + * to be removed (no automatic cleanup happens if your module is unloaded, + * you are responsible here.) If an error occurs, %NULL will be returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. 
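+ *
+ * As an example, the cxgb4 conversion earlier in this patch reduces the
+ * old create-then-set-i_size sequence to a single call:
+ *
+ *	debugfs_create_file_size("qps", S_IWUSR, devp->debugfs_root,
+ *				 (void *)devp, &qp_debugfs_fops, 4096);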
+ */ +struct dentry *debugfs_create_file_size(const char *name, umode_t mode, + struct dentry *parent, void *data, + const struct file_operations *fops, + loff_t file_size) +{ + struct dentry *de = debugfs_create_file(name, mode, parent, data, fops); + + if (de) + de->d_inode->i_size = file_size; + return de; +} +EXPORT_SYMBOL_GPL(debugfs_create_file_size); + /** * debugfs_create_dir - create a directory in the debugfs filesystem * @name: a pointer to a string containing the name of the directory to diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index ea149a24a1f2..cb25af461054 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -51,6 +51,11 @@ struct dentry *debugfs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops); +struct dentry *debugfs_create_file_size(const char *name, umode_t mode, + struct dentry *parent, void *data, + const struct file_operations *fops, + loff_t file_size); + struct dentry *debugfs_create_dir(const char *name, struct dentry *parent); struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, @@ -129,6 +134,14 @@ static inline struct dentry *debugfs_create_file(const char *name, umode_t mode, return ERR_PTR(-ENODEV); } +static inline struct dentry *debugfs_create_file_size(const char *name, umode_t mode, + struct dentry *parent, void *data, + const struct file_operations *fops, + loff_t file_size) +{ + return ERR_PTR(-ENODEV); +} + static inline struct dentry *debugfs_create_dir(const char *name, struct dentry *parent) { -- cgit v1.2.3 From 1cc7495c60879eeeda52385a70c99c4cbaace7ef Mon Sep 17 00:00:00 2001 From: Rafał Miłecki Date: Sun, 25 Jan 2015 11:40:57 +0100 Subject: watchdog: bcm47xx_wdt.c: add restart handler support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Just like in the case of other watchdog drivers, use the new kernel core API to provide restart support. Signed-off-by: Rafał Miłecki Reviewed-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/bcm47xx_wdt.c | 21 ++++++++++++++++++++- include/linux/bcm47xx_wdt.h | 1 + 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/watchdog/bcm47xx_wdt.c b/drivers/watchdog/bcm47xx_wdt.c index 9816485f6825..b28a072abf78 100644 --- a/drivers/watchdog/bcm47xx_wdt.c +++ b/drivers/watchdog/bcm47xx_wdt.c @@ -169,6 +169,17 @@ static int bcm47xx_wdt_notify_sys(struct notifier_block *this, return NOTIFY_DONE; } +static int bcm47xx_wdt_restart(struct notifier_block *this, unsigned long mode, + void *cmd) +{ + struct bcm47xx_wdt *wdt; + + wdt = container_of(this, struct bcm47xx_wdt, restart_handler); + wdt->timer_set(wdt, 1); + + return NOTIFY_DONE; +} + static struct watchdog_ops bcm47xx_wdt_soft_ops = { .owner = THIS_MODULE, .start = bcm47xx_wdt_soft_start, @@ -209,15 +220,23 @@ static int bcm47xx_wdt_probe(struct platform_device *pdev) if (ret) goto err_timer; - ret = watchdog_register_device(&wdt->wdd); + wdt->restart_handler.notifier_call = &bcm47xx_wdt_restart; + wdt->restart_handler.priority = 64; + ret = register_restart_handler(&wdt->restart_handler); if (ret) goto err_notifier; + ret = watchdog_register_device(&wdt->wdd); + if (ret) + goto err_handler; + dev_info(&pdev->dev, "BCM47xx Watchdog Timer enabled (%d seconds%s%s)\n", timeout, nowayout ? ", nowayout" : "", soft ?
", Software Timer" : ""); return 0; +err_handler: + unregister_restart_handler(&wdt->restart_handler); err_notifier: unregister_reboot_notifier(&wdt->notifier); err_timer: diff --git a/include/linux/bcm47xx_wdt.h b/include/linux/bcm47xx_wdt.h index b708786d4cbf..5582c211f594 100644 --- a/include/linux/bcm47xx_wdt.h +++ b/include/linux/bcm47xx_wdt.h @@ -16,6 +16,7 @@ struct bcm47xx_wdt { struct watchdog_device wdd; struct notifier_block notifier; + struct notifier_block restart_handler; struct timer_list soft_timer; atomic_t soft_ticks; -- cgit v1.2.3 From 73d7e3eac01da3cef32ab25cbc6a36a6202c4ea6 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Tue, 17 Feb 2015 13:45:44 -0800 Subject: kexec: remove never used member destination in kimage struct kimage has a member destination which is used to store the real destination address of each page when load segment from user space buffer to kernel. But we never retrieve the value stored in kimage->destination, so this member variable in kimage and its assignment operation are redundent code. I guess for_each_kimage_entry just does the work that kimage->destination is expected to do. So in this patch just make a cleanup to remove it. Signed-off-by: Baoquan He Cc: "Eric W. Biederman" Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kexec.h | 2 -- kernel/kexec.c | 4 ---- 2 files changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 9d957b7ae095..10da8e246317 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -122,8 +122,6 @@ struct kimage { kimage_entry_t *entry; kimage_entry_t *last_entry; - unsigned long destination; - unsigned long start; struct page *control_code_page; struct page *swap_page; diff --git a/kernel/kexec.c b/kernel/kexec.c index c85277639b34..35dcac4b5c1c 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -856,8 +856,6 @@ static int kimage_set_destination(struct kimage *image, destination &= PAGE_MASK; result = kimage_add_entry(image, destination | IND_DESTINATION); - if (result == 0) - image->destination = destination; return result; } @@ -869,8 +867,6 @@ static int kimage_add_page(struct kimage *image, unsigned long page) page &= PAGE_MASK; result = kimage_add_entry(image, page | IND_SOURCE); - if (result == 0) - image->destination += PAGE_SIZE; return result; } -- cgit v1.2.3 From cf2df6396ba78014289f322839a5cc785f09e1fd Mon Sep 17 00:00:00 2001 From: Geoff Levand Date: Tue, 17 Feb 2015 13:45:56 -0800 Subject: kexec: add bit definitions for kimage entry flags Define new kexec preprocessor macros IND_*_BIT that define the bit position of the kimage entry flags. Change the existing IND_* flag macros to be defined as bit shifts of the corresponding IND_*_BIT macros. Also wrap all C language code in kexec.h with #if !defined(__ASSEMBLY__) so assembly files can include kexec.h to get the IND_* and IND_*_BIT macros. Some CPU instruction sets have tests for bit position which are convenient in implementing routines that operate on the kimage entry list. The addition of these bit position macros in a common location will avoid duplicate definitions and the chance that changes to the IND_* flags will not be propagated to assembly files. Signed-off-by: Geoff Levand Acked-by: Vivek Goyal Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: H. 
Peter Anvin Cc: Maximilian Attems Cc: Michal Marek Cc: Paul Bolle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kexec.h | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 10da8e246317..1fd980cc481b 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -1,6 +1,18 @@ #ifndef LINUX_KEXEC_H #define LINUX_KEXEC_H +#define IND_DESTINATION_BIT 0 +#define IND_INDIRECTION_BIT 1 +#define IND_DONE_BIT 2 +#define IND_SOURCE_BIT 3 + +#define IND_DESTINATION (1 << IND_DESTINATION_BIT) +#define IND_INDIRECTION (1 << IND_INDIRECTION_BIT) +#define IND_DONE (1 << IND_DONE_BIT) +#define IND_SOURCE (1 << IND_SOURCE_BIT) + +#if !defined(__ASSEMBLY__) + #include #ifdef CONFIG_KEXEC @@ -64,10 +76,6 @@ */ typedef unsigned long kimage_entry_t; -#define IND_DESTINATION 0x1 -#define IND_INDIRECTION 0x2 -#define IND_DONE 0x4 -#define IND_SOURCE 0x8 struct kexec_segment { /* @@ -311,4 +319,7 @@ struct task_struct; static inline void crash_kexec(struct pt_regs *regs) { } static inline int kexec_should_crash(struct task_struct *p) { return 0; } #endif /* CONFIG_KEXEC */ + +#endif /* !defined(__ASSEMBLY__) */ + +#endif /* LINUX_KEXEC_H */ -- cgit v1.2.3 From b28c2ee868dbdc0baa89c60fb520be85d5e90a72 Mon Sep 17 00:00:00 2001 From: Geoff Levand Date: Tue, 17 Feb 2015 13:45:58 -0800 Subject: kexec: add IND_FLAGS macro Add a new kexec preprocessor macro IND_FLAGS, which is the bitwise OR of all the possible kexec IND_ kimage_entry indirection flags. Having this macro allows for simplified code in the processing of the kexec kimage_entry items. Also, remove the local powerpc definition and use the generic one. Signed-off-by: Geoff Levand Acked-by: Benjamin Herrenschmidt Acked-by: Vivek Goyal Cc: Arnd Bergmann Cc: Maximilian Attems Cc: Michal Marek Cc: H.
Peter Anvin Cc: Paul Bolle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/kernel/machine_kexec_64.c | 2 -- include/linux/kexec.h | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c index f96d1ec24189..1a74446fd9e5 100644 --- a/arch/powerpc/kernel/machine_kexec_64.c +++ b/arch/powerpc/kernel/machine_kexec_64.c @@ -96,8 +96,6 @@ int default_machine_kexec_prepare(struct kimage *image) return 0; } -#define IND_FLAGS (IND_DESTINATION | IND_INDIRECTION | IND_DONE | IND_SOURCE) - static void copy_segments(unsigned long ind) { unsigned long entry; diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 1fd980cc481b..e60a745ac198 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -10,6 +10,7 @@ #define IND_INDIRECTION (1 << IND_INDIRECTION_BIT) #define IND_DONE (1 << IND_DONE_BIT) #define IND_SOURCE (1 << IND_SOURCE_BIT) +#define IND_FLAGS (IND_DESTINATION | IND_INDIRECTION | IND_DONE | IND_SOURCE) #if !defined(__ASSEMBLY__) -- cgit v1.2.3 From 7647f14fe4cd98151f8e90656c01fe61044de714 Mon Sep 17 00:00:00 2001 From: John de la Garza Date: Tue, 17 Feb 2015 13:46:04 -0800 Subject: lib/rbtree.c: fix typo in comment Signed-off-by: John de la Garza Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rbtree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 57e75ae9910f..fb31765e935a 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -51,7 +51,7 @@ struct rb_root { #define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) -/* 'empty' nodes are nodes that are known not to be inserted in an rbree */ +/* 'empty' nodes are nodes that are known not to be inserted in an rbtree */ #define RB_EMPTY_NODE(node) \ ((node)->__rb_parent_color == (unsigned long)(node)) #define RB_CLEAR_NODE(node) \ -- cgit v1.2.3 From 9cff8adeaa34b5d2802f03f89803da57856b3b72 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 13 Feb 2015 15:49:17 +1100 Subject: sched: Prevent recursion in io_schedule() io_schedule() calls blk_flush_plug() which, depending on the contents of current->plug, can initiate arbitrary blk-io requests. Note that this contrasts with blk_schedule_flush_plug() which requires all non-trivial work to be handed off to a separate thread. This makes it possible for io_schedule() to recurse, and initiating block requests could possibly call mempool_alloc() which, in times of memory pressure, uses io_schedule(). Apart from any stack usage issues, io_schedule() will not behave correctly when called recursively as delayacct_blkio_start() does not allow for repeated calls. So: - use ->in_iowait to detect recursion. Set it earlier, and restore it to the old value. - move the call to "raw_rq" after the call to blk_flush_plug(). As this is some sort of per-cpu thing, we want some chance that we are on the right CPU - When io_schedule() is called recursively, use blk_schedule_flush_plug() which cannot further recurse. - as this makes io_schedule() a lot more complex and as io_schedule() must match io_schedule_timeout(), put all the changes in io_schedule_timeout() and make io_schedule() a simple wrapper for it. Signed-off-by: NeilBrown Signed-off-by: Peter Zijlstra (Intel) [ Moved the now rudimentary io_schedule() into sched.h.
] Cc: Jens Axboe Cc: Linus Torvalds Cc: Tony Battersby Link: http://lkml.kernel.org/r/20150213162600.059fffb2@notabene.brown Signed-off-by: Ingo Molnar --- include/linux/sched.h | 10 +++++++--- kernel/sched/core.c | 31 ++++++++++++------------------- 2 files changed, 19 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8db31ef98d2f..cb5cdc777c8a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -363,9 +363,6 @@ extern void show_regs(struct pt_regs *); */ extern void show_stack(struct task_struct *task, unsigned long *sp); -void io_schedule(void); -long io_schedule_timeout(long timeout); - extern void cpu_init (void); extern void trap_init(void); extern void update_process_times(int user); @@ -422,6 +419,13 @@ extern signed long schedule_timeout_uninterruptible(signed long timeout); asmlinkage void schedule(void); extern void schedule_preempt_disabled(void); +extern long io_schedule_timeout(long timeout); + +static inline void io_schedule(void) +{ + io_schedule_timeout(MAX_SCHEDULE_TIMEOUT); +} + struct nsproxy; struct user_namespace; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c314000f5e52..daaea922f482 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4358,36 +4358,29 @@ EXPORT_SYMBOL_GPL(yield_to); * This task is about to go to sleep on IO. Increment rq->nr_iowait so * that process accounting knows that this is a task in IO wait state. */ -void __sched io_schedule(void) -{ - struct rq *rq = raw_rq(); - - delayacct_blkio_start(); - atomic_inc(&rq->nr_iowait); - blk_flush_plug(current); - current->in_iowait = 1; - schedule(); - current->in_iowait = 0; - atomic_dec(&rq->nr_iowait); - delayacct_blkio_end(); -} -EXPORT_SYMBOL(io_schedule); - long __sched io_schedule_timeout(long timeout) { - struct rq *rq = raw_rq(); + int old_iowait = current->in_iowait; + struct rq *rq; long ret; + current->in_iowait = 1; + if (old_iowait) + blk_schedule_flush_plug(current); + else + blk_flush_plug(current); + delayacct_blkio_start(); + rq = raw_rq(); atomic_inc(&rq->nr_iowait); - blk_flush_plug(current); - current->in_iowait = 1; ret = schedule_timeout(timeout); - current->in_iowait = 0; + current->in_iowait = old_iowait; atomic_dec(&rq->nr_iowait); delayacct_blkio_end(); + return ret; } +EXPORT_SYMBOL(io_schedule_timeout); /** * sys_sched_get_priority_max - return maximum RT priority. -- cgit v1.2.3 From 2e65d8bfe80be51af2f84c904f85bac1437a5545 Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Thu, 12 Feb 2015 14:58:29 +0100 Subject: clk: Add __clk_hw_set_clk helper function After the clk API change to return a per-user clock instance, both the struct clk_core and struct clk pointers from the hw clock need to be assigned to clocks that share the same state. In the future the struct clk_core will be removed and this is going to change again, so to avoid having to change the assignments twice in all the drivers, add a helper function to provide an indirection level.
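As a usage sketch, a composite-style clock would propagate the per-user state to an internal component before delegating an operation to it (the function below is illustrative, modeled on the pattern in drivers/clk/clk-composite.c; only __clk_hw_set_clk() comes from this patch):

	static u8 composite_get_parent(struct clk_hw *hw)
	{
		struct clk_composite *composite = to_clk_composite(hw);
		struct clk_hw *mux_hw = composite->mux_hw;

		/* make the mux component share this clk and clk_core */
		__clk_hw_set_clk(mux_hw, hw);

		return composite->mux_ops->get_parent(mux_hw);
	}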
Signed-off-by: Javier Martinez Canillas Reviewed-by: Stephen Boyd Signed-off-by: Michael Turquette --- include/linux/clk-provider.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 17dd6e9439d1..5591ea71a8d1 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -590,6 +590,12 @@ long __clk_mux_determine_rate_closest(struct clk_hw *hw, unsigned long rate, unsigned long *best_parent_rate, struct clk_hw **best_parent_p); +static inline void __clk_hw_set_clk(struct clk_hw *dst, struct clk_hw *src) +{ + dst->clk = src->clk; + dst->core = src->core; +} + /* * FIXME clock api without lock protection */ -- cgit v1.2.3 From 79969dd12e8756f64a999992c0536ccd91bf6e54 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 18 Feb 2015 11:30:18 -0800 Subject: NFSv4.1: Clean up create_session Don't decode directly into the shared struct session Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 42 +++++++++++++++++++++++++++++------------- fs/nfs/nfs4session.h | 6 ++++++ fs/nfs/nfs4xdr.c | 16 +++++++--------- include/linux/nfs_xdr.h | 8 +++++++- 4 files changed, 49 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 2e7c9f7a6f7c..006bfa3da55b 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7166,10 +7166,11 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) args->bc_attrs.max_reqs); } -static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session) +static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args, + struct nfs41_create_session_res *res) { struct nfs4_channel_attrs *sent = &args->fc_attrs; - struct nfs4_channel_attrs *rcvd = &session->fc_attrs; + struct nfs4_channel_attrs *rcvd = &res->fc_attrs; if (rcvd->max_resp_sz > sent->max_resp_sz) return -EINVAL; @@ -7188,10 +7189,11 @@ static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args return 0; } -static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session) +static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args, + struct nfs41_create_session_res *res) { struct nfs4_channel_attrs *sent = &args->bc_attrs; - struct nfs4_channel_attrs *rcvd = &session->bc_attrs; + struct nfs4_channel_attrs *rcvd = &res->bc_attrs; if (rcvd->max_rqst_sz > sent->max_rqst_sz) return -EINVAL; @@ -7208,14 +7210,23 @@ static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args } static int nfs4_verify_channel_attrs(struct nfs41_create_session_args *args, - struct nfs4_session *session) + struct nfs41_create_session_res *res) { int ret; - ret = nfs4_verify_fore_channel_attrs(args, session); + ret = nfs4_verify_fore_channel_attrs(args, res); if (ret) return ret; - return nfs4_verify_back_channel_attrs(args, session); + return nfs4_verify_back_channel_attrs(args, res); +} + +static void nfs4_update_session(struct nfs4_session *session, + struct nfs41_create_session_res *res) +{ + nfs4_copy_sessionid(&session->sess_id, &res->sessionid); + session->flags = res->flags; + memcpy(&session->fc_attrs, &res->fc_attrs, sizeof(session->fc_attrs)); + memcpy(&session->bc_attrs, &res->bc_attrs, sizeof(session->bc_attrs)); } static int _nfs4_proc_create_session(struct nfs_client *clp, @@ -7224,11 +7235,12 @@ static int _nfs4_proc_create_session(struct nfs_client *clp, 
struct nfs4_session *session = clp->cl_session; struct nfs41_create_session_args args = { .client = clp, + .clientid = clp->cl_clientid, + .seqid = clp->cl_seqid, .cb_program = NFS4_CALLBACK, }; - struct nfs41_create_session_res res = { - .client = clp, - }; + struct nfs41_create_session_res res; + struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE_SESSION], .rpc_argp = &args, @@ -7245,11 +7257,15 @@ static int _nfs4_proc_create_session(struct nfs_client *clp, if (!status) { /* Verify the session's negotiated channel_attrs values */ - status = nfs4_verify_channel_attrs(&args, session); + status = nfs4_verify_channel_attrs(&args, &res); /* Increment the clientid slot sequence id */ - clp->cl_seqid++; + if (clp->cl_seqid == res.seqid) + clp->cl_seqid++; + if (status) + goto out; + nfs4_update_session(session, &res); } - +out: return status; } diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index b34ada9bc6a2..fc46c7455898 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -118,6 +118,12 @@ static inline int nfs4_has_persistent_session(const struct nfs_client *clp) return 0; } +static inline void nfs4_copy_sessionid(struct nfs4_sessionid *dst, + const struct nfs4_sessionid *src) +{ + memcpy(dst->data, src->data, NFS4_MAX_SESSIONID_LEN); +} + #ifdef CONFIG_CRC32 /* * nfs_session_id_hash - calculate the crc32 hash for the session id diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index e23a0a664e12..248903b138a8 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1806,8 +1806,8 @@ static void encode_create_session(struct xdr_stream *xdr, encode_op_hdr(xdr, OP_CREATE_SESSION, decode_create_session_maxsz, hdr); p = reserve_space(xdr, 16 + 2*28 + 20 + clnt->cl_nodelen + 12); - p = xdr_encode_hyper(p, clp->cl_clientid); - *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */ + p = xdr_encode_hyper(p, args->clientid); + *p++ = cpu_to_be32(args->seqid); /*Sequence id */ *p++ = cpu_to_be32(args->flags); /*flags */ /* Fore Channel */ @@ -5641,12 +5641,10 @@ static int decode_create_session(struct xdr_stream *xdr, { __be32 *p; int status; - struct nfs_client *clp = res->client; - struct nfs4_session *session = clp->cl_session; status = decode_op_hdr(xdr, OP_CREATE_SESSION); if (!status) - status = decode_sessionid(xdr, &session->sess_id); + status = decode_sessionid(xdr, &res->sessionid); if (unlikely(status)) return status; @@ -5654,13 +5652,13 @@ static int decode_create_session(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) goto out_overflow; - clp->cl_seqid = be32_to_cpup(p++); - session->flags = be32_to_cpup(p); + res->seqid = be32_to_cpup(p++); + res->flags = be32_to_cpup(p); /* Channel attributes */ - status = decode_chan_attrs(xdr, &session->fc_attrs); + status = decode_chan_attrs(xdr, &res->fc_attrs); if (!status) - status = decode_chan_attrs(xdr, &session->bc_attrs); + status = decode_chan_attrs(xdr, &res->bc_attrs); return status; out_overflow: print_overflow_msg(__func__, xdr); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 9a39132fda49..1af12fc16e98 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1185,6 +1185,8 @@ struct nfs41_exchange_id_res { struct nfs41_create_session_args { struct nfs_client *client; + u64 clientid; + uint32_t seqid; uint32_t flags; uint32_t cb_program; struct nfs4_channel_attrs fc_attrs; /* Fore Channel */ @@ -1192,7 +1194,11 @@ struct nfs41_create_session_args { }; struct nfs41_create_session_res { - struct nfs_client *client; + struct 
nfs4_sessionid sessionid; + uint32_t seqid; + uint32_t flags; + struct nfs4_channel_attrs fc_attrs; /* Fore Channel */ + struct nfs4_channel_attrs bc_attrs; /* Back Channel */ }; struct nfs41_reclaim_complete_args { -- cgit v1.2.3 From 71a097c6de9a49afd0f96b3ecef70c4eb04efde7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 18 Feb 2015 09:27:18 -0800 Subject: NFSv4.1: Clean up bind_conn_to_session We don't need to fake up an entire session in order to retrieve the arguments. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 28 ++++++++++++++-------------- fs/nfs/nfs4xdr.c | 16 ++++++++-------- include/linux/nfs_xdr.h | 9 ++++++++- 3 files changed, 30 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 59853797825c..88180ac5ea0e 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6648,47 +6648,47 @@ nfs41_same_server_scope(struct nfs41_server_scope *a, int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred) { int status; + struct nfs41_bind_conn_to_session_args args = { + .client = clp, + .dir = NFS4_CDFC4_FORE_OR_BOTH, + }; struct nfs41_bind_conn_to_session_res res; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_BIND_CONN_TO_SESSION], - .rpc_argp = clp, + .rpc_argp = &args, .rpc_resp = &res, .rpc_cred = cred, }; dprintk("--> %s\n", __func__); - res.session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS); - if (unlikely(res.session == NULL)) { - status = -ENOMEM; - goto out; - } + nfs4_copy_sessionid(&args.sessionid, &clp->cl_session->sess_id); + if (!(clp->cl_session->flags & SESSION4_BACK_CHAN)) + args.dir = NFS4_CDFC4_FORE; status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); trace_nfs4_bind_conn_to_session(clp, status); if (status == 0) { - if (memcmp(res.session->sess_id.data, + if (memcmp(res.sessionid.data, clp->cl_session->sess_id.data, NFS4_MAX_SESSIONID_LEN)) { dprintk("NFS: %s: Session ID mismatch\n", __func__); status = -EIO; - goto out_session; + goto out; } - if (res.dir != NFS4_CDFS4_BOTH) { + if ((res.dir & args.dir) != res.dir || res.dir == 0) { dprintk("NFS: %s: Unexpected direction from server\n", __func__); status = -EIO; - goto out_session; + goto out; } - if (res.use_conn_in_rdma_mode) { + if (res.use_conn_in_rdma_mode != args.use_conn_in_rdma_mode) { dprintk("NFS: %s: Server returned RDMA mode = true\n", __func__); status = -EIO; - goto out_session; + goto out; } } -out_session: - kfree(res.session); out: dprintk("<-- %s status= %d\n", __func__, status); return status; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 97d4bdf53541..5c399ec41079 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1715,17 +1715,17 @@ static void encode_secinfo(struct xdr_stream *xdr, const struct qstr *name, stru #if defined(CONFIG_NFS_V4_1) /* NFSv4.1 operations */ static void encode_bind_conn_to_session(struct xdr_stream *xdr, - struct nfs4_session *session, + struct nfs41_bind_conn_to_session_args *args, struct compound_hdr *hdr) { __be32 *p; encode_op_hdr(xdr, OP_BIND_CONN_TO_SESSION, decode_bind_conn_to_session_maxsz, hdr); - encode_opaque_fixed(xdr, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); + encode_opaque_fixed(xdr, args->sessionid.data, NFS4_MAX_SESSIONID_LEN); p = xdr_reserve_space(xdr, 8); - *p++ = cpu_to_be32(NFS4_CDFC4_FORE_OR_BOTH); - *p = 0; /* use_conn_in_rdma_mode = False */ + *p++ = cpu_to_be32(args->dir); + *p = (args->use_conn_in_rdma_mode) ?
cpu_to_be32(1) : cpu_to_be32(0); } static void encode_op_map(struct xdr_stream *xdr, struct nfs4_op_map *op_map) @@ -2734,14 +2734,14 @@ static void nfs4_xdr_enc_fsid_present(struct rpc_rqst *req, */ static void nfs4_xdr_enc_bind_conn_to_session(struct rpc_rqst *req, struct xdr_stream *xdr, - struct nfs_client *clp) + struct nfs41_bind_conn_to_session_args *args) { struct compound_hdr hdr = { - .minorversion = clp->cl_mvops->minor_version, + .minorversion = args->client->cl_mvops->minor_version, }; encode_compound_hdr(xdr, req, &hdr); - encode_bind_conn_to_session(xdr, clp->cl_session, &hdr); + encode_bind_conn_to_session(xdr, args, &hdr); encode_nops(&hdr); } @@ -5613,7 +5613,7 @@ static int decode_bind_conn_to_session(struct xdr_stream *xdr, status = decode_op_hdr(xdr, OP_BIND_CONN_TO_SESSION); if (!status) - status = decode_sessionid(xdr, &res->session->sess_id); + status = decode_sessionid(xdr, &res->sessionid); if (unlikely(status)) return status; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 1af12fc16e98..4cb3eaa89cf7 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1167,8 +1167,15 @@ struct nfs41_impl_id { struct nfstime4 date; }; +struct nfs41_bind_conn_to_session_args { + struct nfs_client *client; + struct nfs4_sessionid sessionid; + u32 dir; + bool use_conn_in_rdma_mode; +}; + struct nfs41_bind_conn_to_session_res { - struct nfs4_session *session; + struct nfs4_sessionid sessionid; u32 dir; bool use_conn_in_rdma_mode; }; -- cgit v1.2.3 From 7a6fdeb2b1e93548854063c46c9724458564c76b Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 22 Dec 2014 19:14:26 +0300 Subject: libceph: nuke pool op infrastructure On Mon, Dec 22, 2014 at 5:35 PM, Sage Weil wrote: > On Mon, 22 Dec 2014, Ilya Dryomov wrote: >> Actually, pool op stuff has been unused for over two years - looks like >> it was added for rbd create_snap and that got ripped out in 2012. It's >> unlikely we'd ever need to manage pools or snaps from the kernel client >> so I think it makes sense to nuke it. Sage? > > Yep! 
Signed-off-by: Ilya Dryomov --- include/linux/ceph/ceph_fs.h | 36 ----------- include/linux/ceph/mon_client.h | 9 +-- net/ceph/ceph_strings.c | 14 ----- net/ceph/debugfs.c | 2 - net/ceph/mon_client.c | 135 +--------------------------------------- 5 files changed, 4 insertions(+), 192 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index c0dadaac26e3..69e2c9e2305b 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -158,17 +158,6 @@ enum { }; -/* pool operations */ -enum { - POOL_OP_CREATE = 0x01, - POOL_OP_DELETE = 0x02, - POOL_OP_AUID_CHANGE = 0x03, - POOL_OP_CREATE_SNAP = 0x11, - POOL_OP_DELETE_SNAP = 0x12, - POOL_OP_CREATE_UNMANAGED_SNAP = 0x21, - POOL_OP_DELETE_UNMANAGED_SNAP = 0x22, -}; - struct ceph_mon_request_header { __le64 have_version; __le16 session_mon; @@ -191,31 +180,6 @@ struct ceph_mon_statfs_reply { struct ceph_statfs st; } __attribute__ ((packed)); -const char *ceph_pool_op_name(int op); - -struct ceph_mon_poolop { - struct ceph_mon_request_header monhdr; - struct ceph_fsid fsid; - __le32 pool; - __le32 op; - __le64 auid; - __le64 snapid; - __le32 name_len; -} __attribute__ ((packed)); - -struct ceph_mon_poolop_reply { - struct ceph_mon_request_header monhdr; - struct ceph_fsid fsid; - __le32 reply_code; - __le32 epoch; - char has_data; - char data[0]; -} __attribute__ ((packed)); - -struct ceph_mon_unmanaged_snap { - __le64 snapid; -} __attribute__ ((packed)); - struct ceph_osd_getmap { struct ceph_mon_request_header monhdr; struct ceph_fsid fsid; diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index deb47e45ac7c..81810dc21f06 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -40,7 +40,7 @@ struct ceph_mon_request { }; /* - * ceph_mon_generic_request is being used for the statfs, poolop and + * ceph_mon_generic_request is being used for the statfs and * mon_get_version requests which are being done a bit differently * because we need to get data back to the caller */ @@ -50,7 +50,6 @@ struct ceph_mon_generic_request { struct rb_node node; int result; void *buf; - int buf_len; struct completion completion; struct ceph_msg *request; /* original request */ struct ceph_msg *reply; /* and reply */ @@ -117,10 +116,4 @@ extern int ceph_monc_open_session(struct ceph_mon_client *monc); extern int ceph_monc_validate_auth(struct ceph_mon_client *monc); -extern int ceph_monc_create_snapid(struct ceph_mon_client *monc, - u32 pool, u64 *snapid); - -extern int ceph_monc_delete_snapid(struct ceph_mon_client *monc, - u32 pool, u64 snapid); - #endif diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c index 30560202f57b..139a9cb19b0c 100644 --- a/net/ceph/ceph_strings.c +++ b/net/ceph/ceph_strings.c @@ -42,17 +42,3 @@ const char *ceph_osd_state_name(int s) return "???"; } } - -const char *ceph_pool_op_name(int op) -{ - switch (op) { - case POOL_OP_CREATE: return "create"; - case POOL_OP_DELETE: return "delete"; - case POOL_OP_AUID_CHANGE: return "auid change"; - case POOL_OP_CREATE_SNAP: return "create snap"; - case POOL_OP_DELETE_SNAP: return "delete snap"; - case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap"; - case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap"; - } - return "???"; -} diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index d2d525529f87..14d9995097cc 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -127,8 +127,6 @@ static int monc_show(struct seq_file *s, 
void *p) op = le16_to_cpu(req->request->hdr.type); if (op == CEPH_MSG_STATFS) seq_printf(s, "%llu statfs\n", req->tid); - else if (op == CEPH_MSG_POOLOP) - seq_printf(s, "%llu poolop\n", req->tid); else if (op == CEPH_MSG_MON_GET_VERSION) seq_printf(s, "%llu mon_get_version", req->tid); else diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index f2148e22b148..02e105c6865f 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -410,7 +410,7 @@ out_unlocked: } /* - * generic requests (e.g., statfs, poolop) + * generic requests (currently statfs, mon_get_version) */ static struct ceph_mon_generic_request *__lookup_generic_req( struct ceph_mon_client *monc, u64 tid) @@ -569,7 +569,7 @@ static void handle_statfs_reply(struct ceph_mon_client *monc, return; bad: - pr_err("corrupt generic reply, tid %llu\n", tid); + pr_err("corrupt statfs reply, tid %llu\n", tid); ceph_msg_dump(msg); } @@ -588,7 +588,6 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) kref_init(&req->kref); req->buf = buf; - req->buf_len = sizeof(*buf); init_completion(&req->completion); err = -ENOMEM; @@ -647,7 +646,7 @@ static void handle_get_version_reply(struct ceph_mon_client *monc, return; bad: - pr_err("corrupt mon_get_version reply\n"); + pr_err("corrupt mon_get_version reply, tid %llu\n", tid); ceph_msg_dump(msg); } @@ -670,7 +669,6 @@ int ceph_monc_do_get_version(struct ceph_mon_client *monc, const char *what, kref_init(&req->kref); req->buf = newest; - req->buf_len = sizeof(*newest); init_completion(&req->completion); req->request = ceph_msg_new(CEPH_MSG_MON_GET_VERSION, @@ -706,128 +704,6 @@ out: } EXPORT_SYMBOL(ceph_monc_do_get_version); -/* - * pool ops - */ -static int get_poolop_reply_buf(const char *src, size_t src_len, - char *dst, size_t dst_len) -{ - u32 buf_len; - - if (src_len != sizeof(u32) + dst_len) - return -EINVAL; - - buf_len = le32_to_cpu(*(__le32 *)src); - if (buf_len != dst_len) - return -EINVAL; - - memcpy(dst, src + sizeof(u32), dst_len); - return 0; -} - -static void handle_poolop_reply(struct ceph_mon_client *monc, - struct ceph_msg *msg) -{ - struct ceph_mon_generic_request *req; - struct ceph_mon_poolop_reply *reply = msg->front.iov_base; - u64 tid = le64_to_cpu(msg->hdr.tid); - - if (msg->front.iov_len < sizeof(*reply)) - goto bad; - dout("handle_poolop_reply %p tid %llu\n", msg, tid); - - mutex_lock(&monc->mutex); - req = __lookup_generic_req(monc, tid); - if (req) { - if (req->buf_len && - get_poolop_reply_buf(msg->front.iov_base + sizeof(*reply), - msg->front.iov_len - sizeof(*reply), - req->buf, req->buf_len) < 0) { - mutex_unlock(&monc->mutex); - goto bad; - } - req->result = le32_to_cpu(reply->reply_code); - get_generic_request(req); - } - mutex_unlock(&monc->mutex); - if (req) { - complete(&req->completion); - put_generic_request(req); - } - return; - -bad: - pr_err("corrupt generic reply, tid %llu\n", tid); - ceph_msg_dump(msg); -} - -/* - * Do a synchronous pool op. 
- */ -static int do_poolop(struct ceph_mon_client *monc, u32 op, - u32 pool, u64 snapid, - char *buf, int len) -{ - struct ceph_mon_generic_request *req; - struct ceph_mon_poolop *h; - int err; - - req = kzalloc(sizeof(*req), GFP_NOFS); - if (!req) - return -ENOMEM; - - kref_init(&req->kref); - req->buf = buf; - req->buf_len = len; - init_completion(&req->completion); - - err = -ENOMEM; - req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS, - true); - if (!req->request) - goto out; - req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS, - true); - if (!req->reply) - goto out; - - /* fill out request */ - req->request->hdr.version = cpu_to_le16(2); - h = req->request->front.iov_base; - h->monhdr.have_version = 0; - h->monhdr.session_mon = cpu_to_le16(-1); - h->monhdr.session_mon_tid = 0; - h->fsid = monc->monmap->fsid; - h->pool = cpu_to_le32(pool); - h->op = cpu_to_le32(op); - h->auid = 0; - h->snapid = cpu_to_le64(snapid); - h->name_len = 0; - - err = do_generic_request(monc, req); - -out: - kref_put(&req->kref, release_generic_request); - return err; -} - -int ceph_monc_create_snapid(struct ceph_mon_client *monc, - u32 pool, u64 *snapid) -{ - return do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, - pool, 0, (char *)snapid, sizeof(*snapid)); - -} -EXPORT_SYMBOL(ceph_monc_create_snapid); - -int ceph_monc_delete_snapid(struct ceph_mon_client *monc, - u32 pool, u64 snapid) -{ - return do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, - pool, snapid, NULL, 0); - -} - /* * Resend pending generic requests. */ @@ -1112,10 +988,6 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) handle_get_version_reply(monc, msg); break; - case CEPH_MSG_POOLOP_REPLY: - handle_poolop_reply(monc, msg); - break; - case CEPH_MSG_MON_MAP: ceph_monc_handle_map(monc, msg); break; @@ -1154,7 +1026,6 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con, case CEPH_MSG_MON_SUBSCRIBE_ACK: m = ceph_msg_get(monc->m_subscribe_ack); break; - case CEPH_MSG_POOLOP_REPLY: case CEPH_MSG_STATFS_REPLY: return get_generic_reply(con, hdr, skip); case CEPH_MSG_AUTH_REPLY: -- cgit v1.2.3 From 03f4fcb02884859b584c709652bb48f8125ceb45 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 5 Jan 2015 11:04:04 +0800 Subject: ceph: handle SESSION_FORCE_RO message mark session as readonly and wake up all cap waiters. 
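To illustrate the intended effect (a sketch only, not part of this patch): a task blocked waiting for CEPH_CAP_FILE_WR caps is woken, re-runs the cap check, and now fails with -EROFS, which userspace observes on a CephFS mount as (assumes <unistd.h>, <errno.h>, <stdio.h>; fd, buf and len are placeholders):

	ssize_t n = write(fd, buf, len);
	if (n < 0 && errno == EROFS)
		fprintf(stderr, "MDS forced the session read-only\n");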
Signed-off-by: Yan, Zheng --- fs/ceph/caps.c | 15 +++++++++++++++ fs/ceph/mds_client.c | 10 ++++++++++ fs/ceph/mds_client.h | 1 + include/linux/ceph/ceph_fs.h | 1 + 4 files changed, 27 insertions(+) (limited to 'include/linux') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index b93c631c6c87..d0618e8412fd 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2143,6 +2143,21 @@ again: ret = 1; } } else { + int session_readonly = false; + if ((need & CEPH_CAP_FILE_WR) && ci->i_auth_cap) { + struct ceph_mds_session *s = ci->i_auth_cap->session; + spin_lock(&s->s_cap_lock); + session_readonly = s->s_readonly; + spin_unlock(&s->s_cap_lock); + } + if (session_readonly) { + dout("get_cap_refs %p needed %s but mds%d readonly\n", + inode, ceph_cap_string(need), ci->i_auth_cap->mds); + *err = -EROFS; + ret = 1; + goto out_unlock; + } + dout("get_cap_refs %p have %s needed %s\n", inode, ceph_cap_string(have), ceph_cap_string(need)); } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index d2171f4a6980..c6c33b411a2f 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2580,6 +2580,14 @@ static void handle_session(struct ceph_mds_session *session, send_flushmsg_ack(mdsc, session, seq); break; + case CEPH_SESSION_FORCE_RO: + dout("force_session_readonly %p\n", session); + spin_lock(&session->s_cap_lock); + session->s_readonly = true; + spin_unlock(&session->s_cap_lock); + wake_up_session_caps(session, 0); + break; + default: pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds); WARN_ON(1); @@ -2791,6 +2799,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, spin_unlock(&session->s_gen_ttl_lock); spin_lock(&session->s_cap_lock); + /* don't know if session is readonly */ + session->s_readonly = 0; /* * notify __ceph_remove_cap() that we are composing cap reconnect. * If a cap get released before being added to the cap reconnect, diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index e2817d00f7d9..a87b92f500bb 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -137,6 +137,7 @@ struct ceph_mds_session { int s_nr_caps, s_trim_caps; int s_num_cap_releases; int s_cap_reconnect; + int s_readonly; struct list_head s_cap_releases; /* waiting cap_release messages */ struct list_head s_cap_releases_done; /* ready to send */ struct ceph_cap *s_cap_iterator; diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 69e2c9e2305b..31eb03d0c766 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -271,6 +271,7 @@ enum { CEPH_SESSION_RECALL_STATE, CEPH_SESSION_FLUSHMSG, CEPH_SESSION_FLUSHMSG_ACK, + CEPH_SESSION_FORCE_RO, }; extern const char *ceph_session_op_name(int op); -- cgit v1.2.3 From ba988f87f532cd2b8c4740aa8ec49056521ae833 Mon Sep 17 00:00:00 2001 From: Chaitanya Huilgol Date: Fri, 23 Jan 2015 16:41:25 +0530 Subject: libceph: tcp_nodelay support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Setting the TCP_NODELAY socket option on connection sockets disables Nagle’s algorithm and improves latency characteristics. The tcp_nodelay (default) and notcp_nodelay option flags are provided to enable or disable setting the socket option.
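For reference, a userspace sketch of the same socket option (the kernel_setsockopt() call in the diff below is the in-kernel counterpart; fd is assumed to be a connected TCP socket):

	#include <netinet/in.h>
	#include <netinet/tcp.h>
	#include <sys/socket.h>

	int one = 1;
	if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one)) < 0)
		perror("setsockopt(TCP_NODELAY)");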
Signed-off-by: Chaitanya Huilgol [idryomov@redhat.com: NO_TCP_NODELAY -> TCP_NODELAY, minor adjustments] Signed-off-by: Ilya Dryomov --- include/linux/ceph/libceph.h | 3 ++- include/linux/ceph/messenger.h | 4 +++- net/ceph/ceph_common.c | 16 +++++++++++++++- net/ceph/messenger.c | 14 +++++++++++++- 4 files changed, 33 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 8b11a79ca1cb..16fff9608848 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -30,8 +30,9 @@ #define CEPH_OPT_MYIP (1<<2) /* specified my ip */ #define CEPH_OPT_NOCRC (1<<3) /* no data crc on writes */ #define CEPH_OPT_NOMSGAUTH (1<<4) /* not require cephx message signature */ +#define CEPH_OPT_TCP_NODELAY (1<<5) /* TCP_NODELAY on TCP sockets */ -#define CEPH_OPT_DEFAULT (0) +#define CEPH_OPT_DEFAULT (CEPH_OPT_TCP_NODELAY) #define ceph_set_opt(client, opt) \ (client)->options->flags |= CEPH_OPT_##opt; diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index d9d396c16503..e15499422fdc 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -57,6 +57,7 @@ struct ceph_messenger { atomic_t stopping; bool nocrc; + bool tcp_nodelay; /* * the global_seq counts connections i (attempt to) initiate @@ -264,7 +265,8 @@ extern void ceph_messenger_init(struct ceph_messenger *msgr, struct ceph_entity_addr *myaddr, u64 supported_features, u64 required_features, - bool nocrc); + bool nocrc, + bool tcp_nodelay); extern void ceph_con_init(struct ceph_connection *con, void *private, const struct ceph_connection_operations *ops, diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 5d5ab67f516d..ec565508e904 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -239,6 +239,8 @@ enum { Opt_nocrc, Opt_cephx_require_signatures, Opt_nocephx_require_signatures, + Opt_tcp_nodelay, + Opt_notcp_nodelay, }; static match_table_t opt_tokens = { @@ -259,6 +261,8 @@ static match_table_t opt_tokens = { {Opt_nocrc, "nocrc"}, {Opt_cephx_require_signatures, "cephx_require_signatures"}, {Opt_nocephx_require_signatures, "nocephx_require_signatures"}, + {Opt_tcp_nodelay, "tcp_nodelay"}, + {Opt_notcp_nodelay, "notcp_nodelay"}, {-1, NULL} }; @@ -457,6 +461,7 @@ ceph_parse_options(char *options, const char *dev_name, case Opt_nocrc: opt->flags |= CEPH_OPT_NOCRC; break; + case Opt_cephx_require_signatures: opt->flags &= ~CEPH_OPT_NOMSGAUTH; break; @@ -464,6 +469,13 @@ ceph_parse_options(char *options, const char *dev_name, opt->flags |= CEPH_OPT_NOMSGAUTH; break; + case Opt_tcp_nodelay: + opt->flags |= CEPH_OPT_TCP_NODELAY; + break; + case Opt_notcp_nodelay: + opt->flags &= ~CEPH_OPT_TCP_NODELAY; + break; + default: BUG_ON(token); } @@ -518,10 +530,12 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, /* msgr */ if (ceph_test_opt(client, MYIP)) myaddr = &client->options->my_addr; + ceph_messenger_init(&client->msgr, myaddr, client->supported_features, client->required_features, - ceph_test_opt(client, NOCRC)); + ceph_test_opt(client, NOCRC), + ceph_test_opt(client, TCP_NODELAY)); /* subsystems */ err = ceph_monc_init(&client->monc, client); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 33a2f201e460..6b3f54ed65ba 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -510,6 +510,16 @@ static int ceph_tcp_connect(struct ceph_connection *con) return ret; } + if (con->msgr->tcp_nodelay) { + int optval = 1; + + ret = 
kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, + (char *)&optval, sizeof(optval)); + if (ret) + pr_err("kernel_setsockopt(TCP_NODELAY) failed: %d", + ret); + } + sk_set_memalloc(sock->sk); con->sock = sock; @@ -2922,7 +2932,8 @@ void ceph_messenger_init(struct ceph_messenger *msgr, struct ceph_entity_addr *myaddr, u64 supported_features, u64 required_features, - bool nocrc) + bool nocrc, + bool tcp_nodelay) { msgr->supported_features = supported_features; msgr->required_features = required_features; @@ -2937,6 +2948,7 @@ void ceph_messenger_init(struct ceph_messenger *msgr, get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce)); encode_my_addr(msgr); msgr->nocrc = nocrc; + msgr->tcp_nodelay = tcp_nodelay; atomic_set(&msgr->stopping, 0); -- cgit v1.2.3 From f7d4ca8bbfda23b4f1eae9b6757ff64166b093d5 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Fri, 7 Nov 2014 18:37:57 +0000 Subject: kdb: Avoid printing KERN_ levels to consoles Currently when kdb traps printk messages then the raw log level prefix (consisting of '\001' followed by a numeral) does not get stripped off before the message is issued to the various I/O handlers supported by kdb. This causes annoying visual noise as well as causing problems grepping for ^. It is also a change of behaviour compared to normal usage of printk(). For example <SysRq>-h ends up with different output to that of kdb's "sr h". This patch addresses the problem by stripping log levels from messages before they are issued to the I/O handlers. printk(), which can also act as an i/o handler in some cases, is special cased; if the caller provided a log level then the prefix will be preserved when sent to printk(). The addition of non-printable characters to the output of kdb commands is a regression, albeit an extremely elderly one, introduced by commit 04d2c8c83d0e ("printk: convert the format for KERN_<LEVEL> to a 2 byte pattern"). Note also that this patch does *not* restore the original behaviour from v3.5. Instead it makes printk() from within a kdb command display the message without any prefix (i.e. like printk() normally does).
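As a rough sketch of the <linux/printk.h> helpers the fix builds on: printk_get_level() reports the level character of a KERN_<LEVEL>-prefixed string and printk_skip_level() returns a pointer just past that prefix, so a handler can emit the bare message body:

#include <linux/printk.h>

/* Illustrative only: log the body of a message without its KERN_ prefix. */
static void example_strip_level(const char *msg)
{
	const char *body = printk_skip_level(msg);

	if (printk_get_level(msg))
		pr_debug("level '%c', body \"%s\"\n",
			 printk_get_level(msg), body);
	else
		pr_debug("no level prefix, body \"%s\"\n", body);
}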
Signed-off-by: Daniel Thompson Cc: Joe Perches Cc: stable@vger.kernel.org Signed-off-by: Jason Wessel --- include/linux/kdb.h | 8 +++++++- kernel/debug/kdb/kdb_io.c | 22 +++++++++++++--------- kernel/printk/printk.c | 2 +- 3 files changed, 21 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kdb.h b/include/linux/kdb.h index 75ae2e2631fc..a19bcf9e762e 100644 --- a/include/linux/kdb.h +++ b/include/linux/kdb.h @@ -156,8 +156,14 @@ typedef enum { KDB_REASON_SYSTEM_NMI, /* In NMI due to SYSTEM cmd; regs valid */ } kdb_reason_t; +enum kdb_msgsrc { + KDB_MSGSRC_INTERNAL, /* direct call to kdb_printf() */ + KDB_MSGSRC_PRINTK, /* trapped from printk() */ +}; + extern int kdb_trap_printk; -extern __printf(1, 0) int vkdb_printf(const char *fmt, va_list args); +extern __printf(2, 0) int vkdb_printf(enum kdb_msgsrc src, const char *fmt, + va_list args); extern __printf(1, 2) int kdb_printf(const char *, ...); typedef __printf(1, 2) int (*kdb_printf_t)(const char *, ...); diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 7c70812caea5..a550afb99ebe 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -548,7 +548,7 @@ static int kdb_search_string(char *searched, char *searchfor) return 0; } -int vkdb_printf(const char *fmt, va_list ap) +int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap) { int diag; int linecount; @@ -691,19 +691,20 @@ kdb_printit: * Write to all consoles. */ retlen = strlen(kdb_buffer); + cp = (char *) printk_skip_level(kdb_buffer); if (!dbg_kdb_mode && kgdb_connected) { - gdbstub_msg_write(kdb_buffer, retlen); + gdbstub_msg_write(cp, retlen - (cp - kdb_buffer)); } else { if (dbg_io_ops && !dbg_io_ops->is_console) { - len = retlen; - cp = kdb_buffer; + len = retlen - (cp - kdb_buffer); + cp2 = cp; while (len--) { - dbg_io_ops->write_char(*cp); - cp++; + dbg_io_ops->write_char(*cp2); + cp2++; } } while (c) { - c->write(c, kdb_buffer, retlen); + c->write(c, cp, retlen - (cp - kdb_buffer)); touch_nmi_watchdog(); c = c->next; } @@ -711,7 +712,10 @@ kdb_printit: if (logging) { saved_loglevel = console_loglevel; console_loglevel = CONSOLE_LOGLEVEL_SILENT; - printk(KERN_INFO "%s", kdb_buffer); + if (printk_get_level(kdb_buffer) || src == KDB_MSGSRC_PRINTK) + printk("%s", kdb_buffer); + else + pr_info("%s", kdb_buffer); } if (KDB_STATE(PAGER)) { @@ -844,7 +848,7 @@ int kdb_printf(const char *fmt, ...) int r; va_start(ap, fmt); - r = vkdb_printf(fmt, ap); + r = vkdb_printf(KDB_MSGSRC_INTERNAL, fmt, ap); va_end(ap); return r; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 02d6b6d28796..fae29e3ffbf0 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1811,7 +1811,7 @@ int vprintk_default(const char *fmt, va_list args) #ifdef CONFIG_KGDB_KDB if (unlikely(kdb_trap_printk)) { - r = vkdb_printf(fmt, args); + r = vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args); return r; } #endif -- cgit v1.2.3 From e1e5e5641e6f271321aec257ed26a72715e4a8c2 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 19 Feb 2015 13:39:03 -0700 Subject: NVMe: Metadata format support Adds support for NVMe metadata formats and exposes block devices for all namespaces regardless of their format. Namespace formats that are unusable will have disk capacity set to 0, but a handle to the block device is created to simplify device management. 
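The "interleaved metadata" case called out below is signalled by bit 4 of the FLBAS field in the Identify Namespace data; a sketch of the check, using the NVME_NS_FLBAS_META_EXT flag this patch introduces (the helper name is illustrative):

#include <linux/types.h>
#include <linux/nvme.h>

/*
 * True when metadata is transferred inline with the data blocks, a layout
 * the driver cannot back with a separate integrity buffer.
 */
static bool example_metadata_interleaved(struct nvme_id_ns *id)
{
	return id->flbas & NVME_NS_FLBAS_META_EXT;
}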
A namespace is not usable when the format requires the host to interleave block and metadata in a single buffer, has no provisioned storage, or has metadata but failed to register with blk integrity. The namespace has to be scanned in two phases to support separate metadata formats. The first establishes the sector size and capacity prior to invoking add_disk. If metadata is required, the capacity will be temporarily set to 0 until it can be revalidated and registered with the integrity extensions after add_disk completes. The driver relies on the integrity extensions to provide the metadata buffer. NVMe requires this be a single physically contiguous region, so only one integrity segment is allowed per command. If the metadata is used for T10 PI, the driver provides mappings to save and restore the reftag physical block translation. The driver provides no-op functions for generate and verify if metadata is not used for protection information. This way the setup is always provided by the block layer. If a request does not supply a required metadata buffer, the command is failed with bad address. This could only happen if a user manually disables verify/generate on such a disk. The only exception to where this is okay is if the controller is capable of stripping/generating the metadata, which is possible on some types of formats. The metadata scatter gather list now occupies the spot in the nvme_iod that used to be used to link retryable IOD's, but we don't do that anymore, so the field was unused. Signed-off-by: Keith Busch --- drivers/block/nvme-core.c | 298 +++++++++++++++++++++++++++++++++++----------- include/linux/nvme.h | 2 + include/uapi/linux/nvme.h | 16 +++ 3 files changed, 249 insertions(+), 67 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index cbdfbbf98392..3ffa57a932ea 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -482,6 +483,62 @@ static int nvme_error_status(u16 status) } } +static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi) +{ + if (be32_to_cpu(pi->ref_tag) == v) + pi->ref_tag = cpu_to_be32(p); +} + +static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi) +{ + if (be32_to_cpu(pi->ref_tag) == p) + pi->ref_tag = cpu_to_be32(v); +} + +/** + * nvme_dif_remap - remaps ref tags to bip seed and physical lba + * + * The virtual start sector is the one that was originally submitted by the + * block layer. Due to partitioning, MD/DM cloning, etc. the actual physical + * start sector may be different. Remap protection information to match the + * physical LBA on writes, and back to the original seed on reads. + * + * Type 0 and 3 do not have a ref tag, so no remapping required.
+ */ +static void nvme_dif_remap(struct request *req, + void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi)) +{ + struct nvme_ns *ns = req->rq_disk->private_data; + struct bio_integrity_payload *bip; + struct t10_pi_tuple *pi; + void *p, *pmap; + u32 i, nlb, ts, phys, virt; + + if (!ns->pi_type || ns->pi_type == NVME_NS_DPS_PI_TYPE3) + return; + + bip = bio_integrity(req->bio); + if (!bip) + return; + + pmap = kmap_atomic(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset; + if (!pmap) + return; + + p = pmap; + virt = bip_get_seed(bip); + phys = nvme_block_nr(ns, blk_rq_pos(req)); + nlb = (blk_rq_bytes(req) >> ns->lba_shift); + ts = ns->disk->integrity->tuple_size; + + for (i = 0; i < nlb; i++, virt++, phys++) { + pi = (struct t10_pi_tuple *)p; + dif_swap(phys, virt, pi); + p += ts; + } + kunmap_atomic(pmap); +} + static void req_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) { @@ -512,9 +569,16 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx, "completing aborted command with status:%04x\n", status); - if (iod->nents) + if (iod->nents) { dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg, iod->nents, rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + if (blk_integrity_rq(req)) { + if (!rq_data_dir(req)) + nvme_dif_remap(req, nvme_dif_complete); + dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->meta_sg, 1, + rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + } + } nvme_free_iod(nvmeq->dev, iod); blk_mq_complete_request(req); @@ -670,6 +734,24 @@ static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod, cmnd->rw.prp2 = cpu_to_le64(iod->first_dma); cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); + + if (blk_integrity_rq(req)) { + cmnd->rw.metadata = cpu_to_le64(sg_dma_address(iod->meta_sg)); + switch (ns->pi_type) { + case NVME_NS_DPS_PI_TYPE3: + control |= NVME_RW_PRINFO_PRCHK_GUARD; + break; + case NVME_NS_DPS_PI_TYPE1: + case NVME_NS_DPS_PI_TYPE2: + control |= NVME_RW_PRINFO_PRCHK_GUARD | + NVME_RW_PRINFO_PRCHK_REF; + cmnd->rw.reftag = cpu_to_le32( + nvme_block_nr(ns, blk_rq_pos(req))); + break; + } + } else if (ns->ms) + control |= NVME_RW_PRINFO_PRACT; + cmnd->rw.control = cpu_to_le16(control); cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); @@ -690,6 +772,19 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_iod *iod; enum dma_data_direction dma_dir; + /* + * If formated with metadata, require the block layer provide a buffer + * unless this namespace is formated such that the metadata can be + * stripped/generated by the controller with PRACT=1. 
+ */ + if (ns->ms && !blk_integrity_rq(req)) { + if (!(ns->pi_type && ns->ms == 8)) { + req->errors = -EFAULT; + blk_mq_complete_request(req); + return BLK_MQ_RQ_QUEUE_OK; + } + } + iod = nvme_alloc_iod(req, ns->dev, GFP_ATOMIC); if (!iod) return BLK_MQ_RQ_QUEUE_BUSY; @@ -725,6 +820,21 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, iod->nents, dma_dir); goto retry_cmd; } + if (blk_integrity_rq(req)) { + if (blk_rq_count_integrity_sg(req->q, req->bio) != 1) + goto error_cmd; + + sg_init_table(iod->meta_sg, 1); + if (blk_rq_map_integrity_sg( + req->q, req->bio, iod->meta_sg) != 1) + goto error_cmd; + + if (rq_data_dir(req)) + nvme_dif_remap(req, nvme_dif_prep); + + if (!dma_map_sg(nvmeq->q_dmadev, iod->meta_sg, 1, dma_dir)) + goto error_cmd; + } } nvme_set_info(cmd, iod, req_completion); @@ -1875,13 +1985,61 @@ static int nvme_getgeo(struct block_device *bd, struct hd_geometry *geo) return 0; } +static void nvme_config_discard(struct nvme_ns *ns) +{ + u32 logical_block_size = queue_logical_block_size(ns->queue); + ns->queue->limits.discard_zeroes_data = 0; + ns->queue->limits.discard_alignment = logical_block_size; + ns->queue->limits.discard_granularity = logical_block_size; + ns->queue->limits.max_discard_sectors = 0xffffffff; + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); +} + +static int nvme_noop_verify(struct blk_integrity_iter *iter) +{ + return 0; +} + +static int nvme_noop_generate(struct blk_integrity_iter *iter) +{ + return 0; +} + +struct blk_integrity nvme_meta_noop = { + .name = "NVME_META_NOOP", + .generate_fn = nvme_noop_generate, + .verify_fn = nvme_noop_verify, +}; + +static void nvme_init_integrity(struct nvme_ns *ns) +{ + struct blk_integrity integrity; + + switch (ns->pi_type) { + case NVME_NS_DPS_PI_TYPE3: + integrity = t10_pi_type3_crc; + break; + case NVME_NS_DPS_PI_TYPE1: + case NVME_NS_DPS_PI_TYPE2: + integrity = t10_pi_type1_crc; + break; + default: + integrity = nvme_meta_noop; + break; + } + integrity.tuple_size = ns->ms; + blk_integrity_register(ns->disk, &integrity); + blk_queue_max_integrity_segments(ns->queue, 1); +} + static int nvme_revalidate_disk(struct gendisk *disk) { struct nvme_ns *ns = disk->private_data; struct nvme_dev *dev = ns->dev; struct nvme_id_ns *id; dma_addr_t dma_addr; - int lbaf; + int lbaf, pi_type, old_ms; + unsigned short bs; id = dma_alloc_coherent(&dev->pci_dev->dev, 4096, &dma_addr, GFP_KERNEL); @@ -1890,16 +2048,50 @@ static int nvme_revalidate_disk(struct gendisk *disk) __func__); return 0; } + if (nvme_identify(dev, ns->ns_id, 0, dma_addr)) { + dev_warn(&dev->pci_dev->dev, + "identify failed ns:%d, setting capacity to 0\n", + ns->ns_id); + memset(id, 0, sizeof(*id)); + } - if (nvme_identify(dev, ns->ns_id, 0, dma_addr)) - goto free; - - lbaf = id->flbas & 0xf; + old_ms = ns->ms; + lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK; ns->lba_shift = id->lbaf[lbaf].ds; + ns->ms = le16_to_cpu(id->lbaf[lbaf].ms); + + /* + * If identify namespace failed, use default 512 byte block size so + * block layer can use before failing read/write for 0 capacity. + */ + if (ns->lba_shift == 0) + ns->lba_shift = 9; + bs = 1 << ns->lba_shift; + + /* XXX: PI implementation requires metadata equal t10 pi tuple size */ + pi_type = ns->ms == sizeof(struct t10_pi_tuple) ? 
+ id->dps & NVME_NS_DPS_PI_MASK : 0; + + if (disk->integrity && (ns->pi_type != pi_type || ns->ms != old_ms || + bs != queue_logical_block_size(disk->queue) || + (ns->ms && id->flbas & NVME_NS_FLBAS_META_EXT))) + blk_integrity_unregister(disk); + + ns->pi_type = pi_type; + blk_queue_logical_block_size(ns->queue, bs); + + if (ns->ms && !disk->integrity && (disk->flags & GENHD_FL_UP) && + !(id->flbas & NVME_NS_FLBAS_META_EXT)) + nvme_init_integrity(ns); + + if (id->ncap == 0 || (ns->ms && !disk->integrity)) + set_capacity(disk, 0); + else + set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); + + if (dev->oncs & NVME_CTRL_ONCS_DSM) + nvme_config_discard(ns); - blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); - set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); - free: dma_free_coherent(&dev->pci_dev->dev, 4096, id, dma_addr); return 0; } @@ -1956,30 +2148,16 @@ static int nvme_kthread(void *data) return 0; } -static void nvme_config_discard(struct nvme_ns *ns) -{ - u32 logical_block_size = queue_logical_block_size(ns->queue); - ns->queue->limits.discard_zeroes_data = 0; - ns->queue->limits.discard_alignment = logical_block_size; - ns->queue->limits.discard_granularity = logical_block_size; - ns->queue->limits.max_discard_sectors = 0xffffffff; - queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); -} - -static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, - struct nvme_id_ns *id, struct nvme_lba_range_type *rt) +static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid) { struct nvme_ns *ns; struct gendisk *disk; int node = dev_to_node(&dev->pci_dev->dev); - int lbaf; - - if (rt->attributes & NVME_LBART_ATTRIB_HIDE) - return NULL; ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); if (!ns) - return NULL; + return; + ns->queue = blk_mq_init_queue(&dev->tagset); if (IS_ERR(ns->queue)) goto out_free_ns; @@ -1995,9 +2173,9 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, ns->ns_id = nsid; ns->disk = disk; - lbaf = id->flbas & 0xf; - ns->lba_shift = id->lbaf[lbaf].ds; - ns->ms = le16_to_cpu(id->lbaf[lbaf].ms); + ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */ + list_add_tail(&ns->list, &dev->namespaces); + blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); if (dev->max_hw_sectors) blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors); @@ -2014,18 +2192,23 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, disk->driverfs_dev = &dev->pci_dev->dev; disk->flags = GENHD_FL_EXT_DEVT; sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid); - set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); - - if (dev->oncs & NVME_CTRL_ONCS_DSM) - nvme_config_discard(ns); - - return ns; + /* + * Initialize capacity to 0 until we establish the namespace format and + * setup integrity extentions if necessary. The revalidate_disk after + * add_disk allows the driver to register with integrity if the format + * requires it. 
+ */ + set_capacity(disk, 0); + nvme_revalidate_disk(ns->disk); + add_disk(ns->disk); + if (ns->ms) + revalidate_disk(ns->disk); + return; out_free_queue: blk_cleanup_queue(ns->queue); out_free_ns: kfree(ns); - return NULL; } static void nvme_create_io_queues(struct nvme_dev *dev) @@ -2150,22 +2333,20 @@ static int nvme_dev_add(struct nvme_dev *dev) struct pci_dev *pdev = dev->pci_dev; int res; unsigned nn, i; - struct nvme_ns *ns; struct nvme_id_ctrl *ctrl; - struct nvme_id_ns *id_ns; void *mem; dma_addr_t dma_addr; int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12; - mem = dma_alloc_coherent(&pdev->dev, 8192, &dma_addr, GFP_KERNEL); + mem = dma_alloc_coherent(&pdev->dev, 4096, &dma_addr, GFP_KERNEL); if (!mem) return -ENOMEM; res = nvme_identify(dev, 0, 1, dma_addr); if (res) { dev_err(&pdev->dev, "Identify Controller failed (%d)\n", res); - res = -EIO; - goto out; + dma_free_coherent(&dev->pci_dev->dev, 4096, mem, dma_addr); + return -EIO; } ctrl = mem; @@ -2191,6 +2372,7 @@ static int nvme_dev_add(struct nvme_dev *dev) } else dev->max_hw_sectors = max_hw_sectors; } + dma_free_coherent(&dev->pci_dev->dev, 4096, mem, dma_addr); dev->tagset.ops = &nvme_mq_ops; dev->tagset.nr_hw_queues = dev->online_queues - 1; @@ -2203,33 +2385,12 @@ static int nvme_dev_add(struct nvme_dev *dev) dev->tagset.driver_data = dev; if (blk_mq_alloc_tag_set(&dev->tagset)) - goto out; - - id_ns = mem; - for (i = 1; i <= nn; i++) { - res = nvme_identify(dev, i, 0, dma_addr); - if (res) - continue; - - if (id_ns->ncap == 0) - continue; - - res = nvme_get_features(dev, NVME_FEAT_LBA_RANGE, i, - dma_addr + 4096, NULL); - if (res) - memset(mem + 4096, 0, 4096); + return 0; - ns = nvme_alloc_ns(dev, i, mem, mem + 4096); - if (ns) - list_add_tail(&ns->list, &dev->namespaces); - } - list_for_each_entry(ns, &dev->namespaces, list) - add_disk(ns->disk); - res = 0; + for (i = 1; i <= nn; i++) + nvme_alloc_ns(dev, i); - out: - dma_free_coherent(&dev->pci_dev->dev, 8192, mem, dma_addr); - return res; + return 0; } static int nvme_dev_map(struct nvme_dev *dev) @@ -2528,8 +2689,11 @@ static void nvme_dev_remove(struct nvme_dev *dev) struct nvme_ns *ns; list_for_each_entry(ns, &dev->namespaces, list) { - if (ns->disk->flags & GENHD_FL_UP) + if (ns->disk->flags & GENHD_FL_UP) { + if (ns->disk->integrity) + blk_integrity_unregister(ns->disk); del_gendisk(ns->disk); + } if (!blk_queue_dying(ns->queue)) { blk_mq_abort_requeue_list(ns->queue); blk_cleanup_queue(ns->queue); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 19a5d4b23209..cca264db2478 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -121,6 +121,7 @@ struct nvme_ns { unsigned ns_id; int lba_shift; int ms; + int pi_type; u64 mode_select_num_blocks; u32 mode_select_block_len; }; @@ -138,6 +139,7 @@ struct nvme_iod { int nents; /* Used in scatterlist */ int length; /* Of data, in bytes */ dma_addr_t first_dma; + struct scatterlist meta_sg[1]; /* metadata requires single contiguous buffer */ struct scatterlist sg[0]; }; diff --git a/include/uapi/linux/nvme.h b/include/uapi/linux/nvme.h index 26386cf3db44..406bfc95652c 100644 --- a/include/uapi/linux/nvme.h +++ b/include/uapi/linux/nvme.h @@ -124,10 +124,22 @@ struct nvme_id_ns { enum { NVME_NS_FEAT_THIN = 1 << 0, + NVME_NS_FLBAS_LBA_MASK = 0xf, + NVME_NS_FLBAS_META_EXT = 0x10, NVME_LBAF_RP_BEST = 0, NVME_LBAF_RP_BETTER = 1, NVME_LBAF_RP_GOOD = 2, NVME_LBAF_RP_DEGRADED = 3, + NVME_NS_DPC_PI_LAST = 1 << 4, + NVME_NS_DPC_PI_FIRST = 1 << 3, + NVME_NS_DPC_PI_TYPE3 = 1 << 2, + NVME_NS_DPC_PI_TYPE2 
= 1 << 1, + NVME_NS_DPC_PI_TYPE1 = 1 << 0, + NVME_NS_DPS_PI_FIRST = 1 << 3, + NVME_NS_DPS_PI_MASK = 0x7, + NVME_NS_DPS_PI_TYPE1 = 1, + NVME_NS_DPS_PI_TYPE2 = 2, + NVME_NS_DPS_PI_TYPE3 = 3, }; struct nvme_smart_log { @@ -261,6 +273,10 @@ enum { NVME_RW_DSM_LATENCY_LOW = 3 << 4, NVME_RW_DSM_SEQ_REQ = 1 << 6, NVME_RW_DSM_COMPRESSED = 1 << 7, + NVME_RW_PRINFO_PRCHK_REF = 1 << 10, + NVME_RW_PRINFO_PRCHK_APP = 1 << 11, + NVME_RW_PRINFO_PRCHK_GUARD = 1 << 12, + NVME_RW_PRINFO_PRACT = 1 << 13, }; struct nvme_dsm_cmd { -- cgit v1.2.3 From 4f1982b4e262c45475a91b4253e9bc7f7c991c13 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 19 Feb 2015 13:42:14 -0700 Subject: NVMe: Update SCSI Inquiry VPD 83h translation The original translation created collisions on Inquiry VPD 83 for many existing devices. Newer specifications provide other ways to translate which, depending on the device's version, can be used to create unique identifiers. Version 1.1 provides an EUI64 field that uniquely identifies each namespace, and 1.2 added the longer NGUID field for the same reason. Both follow the IEEE EUI format and readily translate to the SCSI device identification EUI designator type 2h. For devices implementing either, the translation will use this type, defaulting to the 8-byte EUI64 if implemented, then to NGUID's 16-byte version if not. If neither is provided, the 1.0 translation is used, and is updated to use the SCSI String format to guarantee a unique identifier. Knowing when to use the new fields depends on the nvme controller's revision. The NVME_VS macro was not decoding this correctly, so that is fixed in this patch and moved to a more appropriate place. Since the Identify Namespace structure required an update for the NGUID field, this patch adds the remaining new 1.2 fields to the structure. Signed-off-by: Keith Busch --- drivers/block/nvme-scsi.c | 94 ++++++++++++++++++++++++++--------------------- include/linux/nvme.h | 2 - include/uapi/linux/nvme.h | 10 ++++- 3 files changed, 62 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c index 5e78568026c3..5cc907648742 100644 --- a/drivers/block/nvme-scsi.c +++ b/drivers/block/nvme-scsi.c @@ -779,10 +779,8 @@ static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, struct nvme_dev *dev = ns->dev; dma_addr_t dma_addr; void *mem; - struct nvme_id_ctrl *id_ctrl; int res = SNTI_TRANSLATION_SUCCESS; int nvme_sc; - u8 ieee[4]; int xfer_len; __be32 tmp_id = cpu_to_be32(ns->ns_id); @@ -793,46 +791,60 @@ static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, goto out_dma; } - /* nvme controller identify */ - nvme_sc = nvme_identify(dev, 0, 1, dma_addr); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - goto out_free; - if (nvme_sc) { - res = nvme_sc; - goto out_free; - } - id_ctrl = mem; - - /* Since SCSI tried to save 4 bits...
[SPC-4(r34) Table 591] */ - ieee[0] = id_ctrl->ieee[0] << 4; - ieee[1] = id_ctrl->ieee[0] >> 4 | id_ctrl->ieee[1] << 4; - ieee[2] = id_ctrl->ieee[1] >> 4 | id_ctrl->ieee[2] << 4; - ieee[3] = id_ctrl->ieee[2] >> 4; - - memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); + memset(inq_response, 0, alloc_len); inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE; /* Page Code */ - inq_response[3] = 20; /* Page Length */ - /* Designation Descriptor start */ - inq_response[4] = 0x01; /* Proto ID=0h | Code set=1h */ - inq_response[5] = 0x03; /* PIV=0b | Asso=00b | Designator Type=3h */ - inq_response[6] = 0x00; /* Rsvd */ - inq_response[7] = 16; /* Designator Length */ - /* Designator start */ - inq_response[8] = 0x60 | ieee[3]; /* NAA=6h | IEEE ID MSB, High nibble*/ - inq_response[9] = ieee[2]; /* IEEE ID */ - inq_response[10] = ieee[1]; /* IEEE ID */ - inq_response[11] = ieee[0]; /* IEEE ID| Vendor Specific ID... */ - inq_response[12] = (dev->pci_dev->vendor & 0xFF00) >> 8; - inq_response[13] = (dev->pci_dev->vendor & 0x00FF); - inq_response[14] = dev->serial[0]; - inq_response[15] = dev->serial[1]; - inq_response[16] = dev->model[0]; - inq_response[17] = dev->model[1]; - memcpy(&inq_response[18], &tmp_id, sizeof(u32)); - /* Last 2 bytes are zero */ + if (readl(&dev->bar->vs) >= NVME_VS(1, 1)) { + struct nvme_id_ns *id_ns = mem; + void *eui = id_ns->eui64; + int len = sizeof(id_ns->eui64); - xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); + nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_free; + if (nvme_sc) { + res = nvme_sc; + goto out_free; + } + + if (readl(&dev->bar->vs) >= NVME_VS(1, 2)) { + if (bitmap_empty(eui, len * 8)) { + eui = id_ns->nguid; + len = sizeof(id_ns->nguid); + } + } + if (bitmap_empty(eui, len * 8)) + goto scsi_string; + + inq_response[3] = 4 + len; /* Page Length */ + /* Designation Descriptor start */ + inq_response[4] = 0x01; /* Proto ID=0h | Code set=1h */ + inq_response[5] = 0x02; /* PIV=0b | Asso=00b | Designator Type=2h */ + inq_response[6] = 0x00; /* Rsvd */ + inq_response[7] = len; /* Designator Length */ + memcpy(&inq_response[8], eui, len); + } else { + scsi_string: + if (alloc_len < 72) { + res = nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out_free; + } + inq_response[3] = 0x48; /* Page Length */ + /* Designation Descriptor start */ + inq_response[4] = 0x03; /* Proto ID=0h | Code set=3h */ + inq_response[5] = 0x08; /* PIV=0b | Asso=00b | Designator Type=8h */ + inq_response[6] = 0x00; /* Rsvd */ + inq_response[7] = 0x44; /* Designator Length */ + + sprintf(&inq_response[8], "%04x", dev->pci_dev->vendor); + memcpy(&inq_response[12], dev->model, sizeof(dev->model)); + sprintf(&inq_response[52], "%04x", tmp_id); + memcpy(&inq_response[56], dev->serial, sizeof(dev->serial)); + } + xfer_len = alloc_len; res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); out_free: @@ -2222,7 +2234,7 @@ static int nvme_trans_inquiry(struct nvme_ns *ns, struct sg_io_hdr *hdr, page_code = GET_INQ_PAGE_CODE(cmd); alloc_len = GET_INQ_ALLOC_LENGTH(cmd); - inq_response = kmalloc(STANDARD_INQUIRY_LENGTH, GFP_KERNEL); + inq_response = kmalloc(alloc_len, GFP_KERNEL); if (inq_response == NULL) { res = -ENOMEM; goto out_mem; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index cca264db2478..1f062a9e521d 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -62,8 +62,6 @@ enum { NVME_CSTS_SHST_MASK = 3 
<< 2, }; -#define NVME_VS(major, minor) (major << 16 | minor) - extern unsigned char nvme_io_timeout; #define NVME_IO_TIMEOUT (nvme_io_timeout * HZ) diff --git a/include/uapi/linux/nvme.h b/include/uapi/linux/nvme.h index 406bfc95652c..aef9a81b2d75 100644 --- a/include/uapi/linux/nvme.h +++ b/include/uapi/linux/nvme.h @@ -115,7 +115,13 @@ struct nvme_id_ns { __le16 nawun; __le16 nawupf; __le16 nacwu; - __u8 rsvd40[80]; + __le16 nabsn; + __le16 nabo; + __le16 nabspf; + __u16 rsvd46; + __le64 nvmcap[2]; + __u8 rsvd64[40]; + __u8 nguid[16]; __u8 eui64[8]; struct nvme_lbaf lbaf[16]; __u8 rsvd192[192]; @@ -565,6 +571,8 @@ struct nvme_passthru_cmd { __u32 result; }; +#define NVME_VS(major, minor) (((major) << 16) | ((minor) << 8)) + #define nvme_admin_cmd nvme_passthru_cmd #define NVME_IOCTL_ID _IO('N', 0x40) -- cgit v1.2.3 From b3fffdefabab266ae5176a136d93b6670b07bb30 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 3 Feb 2015 11:21:42 -0700 Subject: NVMe: Register management handle under nvme class This creates a new class type for nvme devices to register their management character devices with. This is so we do not rely on miscdev to provide enough minors for as many nvme devices as some people plan to use. The previous limit was approximately 60 NVMe controllers, depending on the platform and kernel. Now the limit is 1M, which ought to be enough for anybody. Since we have a new device class, it makes sense to attach the block devices under this as well, so part of this patch moves the management handle initialization prior to the namespaces discovery. Signed-off-by: Keith Busch --- drivers/block/nvme-core.c | 79 +++++++++++++++++++++++++++++++++-------------- include/linux/nvme.h | 3 +- 2 files changed, 57 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 3ffa57a932ea..bb2b861cfed9 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -42,6 +42,7 @@ #include #include +#define NVME_MINORS (1U << MINORBITS) #define NVME_Q_DEPTH 1024 #define NVME_AQ_DEPTH 64 #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) @@ -69,6 +70,9 @@ MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown") static int nvme_major; module_param(nvme_major, int, 0); +static int nvme_char_major; +module_param(nvme_char_major, int, 0); + static int use_threaded_interrupts; module_param(use_threaded_interrupts, int, 0); @@ -79,6 +83,8 @@ static struct workqueue_struct *nvme_workq; static wait_queue_head_t nvme_kthread_wait; static struct notifier_block nvme_nb; +static struct class *nvme_class; + static void nvme_reset_failed_dev(struct work_struct *ws); static int nvme_process_cq(struct nvme_queue *nvmeq); @@ -2189,7 +2195,7 @@ static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid) disk->fops = &nvme_fops; disk->private_data = ns; disk->queue = ns->queue; - disk->driverfs_dev = &dev->pci_dev->dev; + disk->driverfs_dev = dev->device; disk->flags = GENHD_FL_EXT_DEVT; sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid); @@ -2775,6 +2781,7 @@ static void nvme_free_dev(struct kref *kref) struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref); pci_dev_put(dev->pci_dev); + put_device(dev->device); nvme_free_namespaces(dev); nvme_release_instance(dev); blk_mq_free_tag_set(&dev->tagset); @@ -2786,11 +2793,23 @@ static int nvme_dev_open(struct inode *inode, struct file *f) { - struct nvme_dev *dev = container_of(f->private_data,
struct nvme_dev, - miscdev); - kref_get(&dev->kref); - f->private_data = dev; - return 0; + struct nvme_dev *dev; + int instance = iminor(inode); + int ret = -ENODEV; + + spin_lock(&dev_list_lock); + list_for_each_entry(dev, &dev_list, node) { + if (dev->instance == instance) { + if (!kref_get_unless_zero(&dev->kref)) + break; + f->private_data = dev; + ret = 0; + break; + } + } + spin_unlock(&dev_list_lock); + + return ret; } static int nvme_dev_release(struct inode *inode, struct file *f) @@ -3002,29 +3021,26 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (result) goto release_pools; - if (dev->online_queues > 1) - result = nvme_dev_add(dev); - if (result) + dev->device = device_create(nvme_class, &pdev->dev, + MKDEV(nvme_char_major, dev->instance), + dev, "nvme%d", dev->instance); + if (IS_ERR(dev->device)) { + result = PTR_ERR(dev->device); goto shutdown; + } + get_device(dev->device); - scnprintf(dev->name, sizeof(dev->name), "nvme%d", dev->instance); - dev->miscdev.minor = MISC_DYNAMIC_MINOR; - dev->miscdev.parent = &pdev->dev; - dev->miscdev.name = dev->name; - dev->miscdev.fops = &nvme_dev_fops; - result = misc_register(&dev->miscdev); + if (dev->online_queues > 1) + result = nvme_dev_add(dev); if (result) - goto remove; + goto device_del; nvme_set_irq_hints(dev); - dev->initialized = 1; return 0; - remove: - nvme_dev_remove(dev); - nvme_dev_remove_admin(dev); - nvme_free_namespaces(dev); + device_del: + device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance)); shutdown: nvme_dev_shutdown(dev); release_pools: @@ -3067,10 +3083,10 @@ static void nvme_remove(struct pci_dev *pdev) pci_set_drvdata(pdev, NULL); flush_work(&dev->reset_work); - misc_deregister(&dev->miscdev); nvme_dev_shutdown(dev); nvme_dev_remove(dev); nvme_dev_remove_admin(dev); + device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance)); nvme_free_queues(dev, 0); nvme_release_prp_pools(dev); kref_put(&dev->kref, nvme_free_dev); @@ -3154,11 +3170,26 @@ static int __init nvme_init(void) else if (result > 0) nvme_major = result; + result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme", + &nvme_dev_fops); + if (result < 0) + goto unregister_blkdev; + else if (result > 0) + nvme_char_major = result; + + nvme_class = class_create(THIS_MODULE, "nvme"); + if (!nvme_class) + goto unregister_chrdev; + result = pci_register_driver(&nvme_driver); if (result) - goto unregister_blkdev; + goto destroy_class; return 0; + destroy_class: + class_destroy(nvme_class); + unregister_chrdev: + __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); unregister_blkdev: unregister_blkdev(nvme_major, "nvme"); kill_workq: @@ -3172,6 +3203,8 @@ static void __exit nvme_exit(void) unregister_hotcpu_notifier(&nvme_nb); unregister_blkdev(nvme_major, "nvme"); destroy_workqueue(nvme_workq); + class_destroy(nvme_class); + __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); BUG_ON(nvme_thread && !IS_ERR(nvme_thread)); _nvme_check_size(); } diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 1f062a9e521d..383d495c5e4c 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -17,7 +17,6 @@ #include #include -#include #include #include @@ -89,7 +88,7 @@ struct nvme_dev { struct nvme_bar __iomem *bar; struct list_head namespaces; struct kref kref; - struct miscdevice miscdev; + struct device *device; work_func_t reset_workfn; struct work_struct reset_work; char name[12]; -- cgit v1.2.3 From 2e1d8448196ba85cd78a18723413a3c92aabe0f3 Mon Sep 17 00:00:00 2001 From: 
Keith Busch Date: Thu, 12 Feb 2015 15:33:00 -0700 Subject: NVMe: Asynchronous controller probe This performs the longest parts of nvme device probe in scheduled work. This speeds up probe significantly when multiple devices are in use. Signed-off-by: Keith Busch --- drivers/block/nvme-core.c | 48 ++++++++++++++++++++++++++++++----------------- include/linux/nvme.h | 1 + 2 files changed, 32 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index bb2b861cfed9..a57685f74e5e 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -2800,6 +2800,10 @@ static int nvme_dev_open(struct inode *inode, struct file *f) spin_lock(&dev_list_lock); list_for_each_entry(dev, &dev_list, node) { if (dev->instance == instance) { + if (!dev->admin_q) { + ret = -EWOULDBLOCK; + break; + } if (!kref_get_unless_zero(&dev->kref)) break; f->private_data = dev; @@ -2982,6 +2986,7 @@ static void nvme_reset_workfn(struct work_struct *work) dev->reset_workfn(work); } +static void nvme_async_probe(struct work_struct *work); static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) { int node, result = -ENOMEM; @@ -3017,34 +3022,20 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto release; kref_init(&dev->kref); - result = nvme_dev_start(dev); - if (result) - goto release_pools; - dev->device = device_create(nvme_class, &pdev->dev, MKDEV(nvme_char_major, dev->instance), dev, "nvme%d", dev->instance); if (IS_ERR(dev->device)) { result = PTR_ERR(dev->device); - goto shutdown; + goto release_pools; } get_device(dev->device); - if (dev->online_queues > 1) - result = nvme_dev_add(dev); - if (result) - goto device_del; - - nvme_set_irq_hints(dev); - dev->initialized = 1; + INIT_WORK(&dev->probe_work, nvme_async_probe); + schedule_work(&dev->probe_work); return 0; - device_del: - device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance)); - shutdown: - nvme_dev_shutdown(dev); release_pools: - nvme_free_queues(dev, 0); nvme_release_prp_pools(dev); release: nvme_release_instance(dev); @@ -3057,6 +3048,28 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) return result; } +static void nvme_async_probe(struct work_struct *work) +{ + struct nvme_dev *dev = container_of(work, struct nvme_dev, probe_work); + int result; + + result = nvme_dev_start(dev); + if (result) + goto reset; + + if (dev->online_queues > 1) + result = nvme_dev_add(dev); + if (result) + goto reset; + + nvme_set_irq_hints(dev); + dev->initialized = 1; + return; + reset: + dev->reset_workfn = nvme_reset_failed_dev; + queue_work(nvme_workq, &dev->reset_work); +} + static void nvme_reset_notify(struct pci_dev *pdev, bool prepare) { struct nvme_dev *dev = pci_get_drvdata(pdev); @@ -3082,6 +3095,7 @@ static void nvme_remove(struct pci_dev *pdev) spin_unlock(&dev_list_lock); pci_set_drvdata(pdev, NULL); + flush_work(&dev->probe_work); flush_work(&dev->reset_work); nvme_dev_shutdown(dev); nvme_dev_remove(dev); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 383d495c5e4c..e2429e8cdab4 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -91,6 +91,7 @@ struct nvme_dev { struct device *device; work_func_t reset_workfn; struct work_struct reset_work; + struct work_struct probe_work; char name[12]; char serial[20]; char model[40]; -- cgit v1.2.3 From 07836e659c81ec6b0d683dfbf7958339a22a7b69 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 19 Feb 2015 10:34:48 -0700 
Subject: NVMe: Fix potential corruption during shutdown The driver has to end unreturned commands at some point even if the controller has not provided a completion. The driver tried to be safe by deleting IO queues prior to ending all unreturned commands. That should cause the controller to internally abort inflight commands, but IO queue deletion request does not have to be successful, so all bets are off. We still have to make progress, so to be extra safe, this patch doesn't clear a queue to release the dma mapping for a command until after the pci device has been disabled. This patch removes the special handling during device initialization so controller recovery can be done all the time. This is possible since initialization is not inlined with pci probe anymore. Reported-by: Nilish Choudhury Signed-off-by: Keith Busch --- drivers/block/nvme-core.c | 49 ++++++++++++++++++----------------------------- include/linux/nvme.h | 1 - 2 files changed, 19 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index a57685f74e5e..0f388206b15b 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1274,29 +1274,18 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); struct nvme_queue *nvmeq = cmd->nvmeq; - /* - * The aborted req will be completed on receiving the abort req. - * We enable the timer again. If hit twice, it'll cause a device reset, - * as the device then is in a faulty state. - */ - int ret = BLK_EH_RESET_TIMER; - dev_warn(nvmeq->q_dmadev, "Timeout I/O %d QID %d\n", req->tag, nvmeq->qid); - spin_lock_irq(&nvmeq->q_lock); - if (!nvmeq->dev->initialized) { - /* - * Force cancelled command frees the request, which requires we - * return BLK_EH_NOT_HANDLED. - */ - nvme_cancel_queue_ios(nvmeq->hctx, req, nvmeq, reserved); - ret = BLK_EH_NOT_HANDLED; - } else - nvme_abort_req(req); + nvme_abort_req(req); spin_unlock_irq(&nvmeq->q_lock); - return ret; + /* + * The aborted req will be completed on receiving the abort req. + * We enable the timer again. If hit twice, it'll cause a device reset, + * as the device then is in a faulty state. 
+ */ + return BLK_EH_RESET_TIMER; } static void nvme_free_queue(struct nvme_queue *nvmeq) @@ -1349,7 +1338,6 @@ static void nvme_clear_queue(struct nvme_queue *nvmeq) struct blk_mq_hw_ctx *hctx = nvmeq->hctx; spin_lock_irq(&nvmeq->q_lock); - nvme_process_cq(nvmeq); if (hctx && hctx->tags) blk_mq_tag_busy_iter(hctx, nvme_cancel_queue_ios, nvmeq); spin_unlock_irq(&nvmeq->q_lock); @@ -1372,7 +1360,10 @@ static void nvme_disable_queue(struct nvme_dev *dev, int qid) } if (!qid && dev->admin_q) blk_mq_freeze_queue_start(dev->admin_q); - nvme_clear_queue(nvmeq); + + spin_lock_irq(&nvmeq->q_lock); + nvme_process_cq(nvmeq); + spin_unlock_irq(&nvmeq->q_lock); } static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, @@ -2121,8 +2112,7 @@ static int nvme_kthread(void *data) spin_lock(&dev_list_lock); list_for_each_entry_safe(dev, next, &dev_list, node) { int i; - if (readl(&dev->bar->csts) & NVME_CSTS_CFS && - dev->initialized) { + if (readl(&dev->bar->csts) & NVME_CSTS_CFS) { if (work_busy(&dev->reset_work)) continue; list_del_init(&dev->node); @@ -2525,8 +2515,6 @@ static struct nvme_delq_ctx *nvme_get_dq(struct nvme_delq_ctx *dq) static void nvme_del_queue_end(struct nvme_queue *nvmeq) { struct nvme_delq_ctx *dq = nvmeq->cmdinfo.ctx; - - nvme_clear_queue(nvmeq); nvme_put_dq(dq); } @@ -2669,7 +2657,6 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) int i; u32 csts = -1; - dev->initialized = 0; nvme_dev_list_remove(dev); if (dev->bar) { @@ -2680,7 +2667,6 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) for (i = dev->queue_count - 1; i >= 0; i--) { struct nvme_queue *nvmeq = dev->queues[i]; nvme_suspend_queue(nvmeq); - nvme_clear_queue(nvmeq); } } else { nvme_disable_io_queues(dev); @@ -2688,6 +2674,9 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) nvme_disable_queue(dev, 0); } nvme_dev_unmap(dev); + + for (i = dev->queue_count - 1; i >= 0; i--) + nvme_clear_queue(dev->queues[i]); } static void nvme_dev_remove(struct nvme_dev *dev) @@ -2955,7 +2944,6 @@ static int nvme_dev_resume(struct nvme_dev *dev) nvme_unfreeze_queues(dev); nvme_set_irq_hints(dev); } - dev->initialized = 1; return 0; } @@ -3063,11 +3051,12 @@ static void nvme_async_probe(struct work_struct *work) goto reset; nvme_set_irq_hints(dev); - dev->initialized = 1; return; reset: - dev->reset_workfn = nvme_reset_failed_dev; - queue_work(nvme_workq, &dev->reset_work); + if (!work_busy(&dev->reset_work)) { + dev->reset_workfn = nvme_reset_failed_dev; + queue_work(nvme_workq, &dev->reset_work); + } } static void nvme_reset_notify(struct pci_dev *pdev, bool prepare) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index e2429e8cdab4..0adad4a5419b 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -103,7 +103,6 @@ struct nvme_dev { u16 abort_limit; u8 event_limit; u8 vwc; - u8 initialized; }; /* -- cgit v1.2.3 From dd36929720f40f17685e841ae0d4c581c165ea60 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 20 Feb 2015 15:46:31 -0800 Subject: kernel: make READ_ONCE() valid on const arguments The use of READ_ONCE() causes lots of warnings with the pending paravirt spinlock fixes, because those end up passing a member of a 'const' structure to READ_ONCE(). There should certainly be nothing wrong with using READ_ONCE() with a const source, but the helper function __read_once_size() would cause warnings because it would drop the 'const' qualifier, but also because the destination would be marked 'const' too due to the use of 'typeof'.
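To make the failure mode concrete, a contrived sketch (not from the commit) of the pattern that warned under the old macro:

#include <linux/compiler.h>

struct stats { int hits; };

static int example_read_hits(const struct stats *c)
{
	/*
	 * Pre-fix, __read_once_size() took a plain 'volatile void *', so the
	 * const qualifier on c was silently dropped, and typeof() made the
	 * local destination const as well -- warnings for a perfectly valid
	 * read.
	 */
	return READ_ONCE(c->hits);
}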
Use a union of types in READ_ONCE() to avoid this issue. Also make sure to use parentheses around the macro arguments to avoid possible operator precedence issues. Tested-by: Ingo Molnar Cc: Christian Borntraeger Signed-off-by: Linus Torvalds --- include/linux/compiler.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index d1ec10a940ff..1b45e4a0519b 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -202,7 +202,7 @@ static __always_inline void data_access_exceeds_word_size(void) { } -static __always_inline void __read_once_size(volatile void *p, void *res, int size) +static __always_inline void __read_once_size(const volatile void *p, void *res, int size) { switch (size) { case 1: *(__u8 *)res = *(volatile __u8 *)p; break; @@ -259,10 +259,10 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s */ #define READ_ONCE(x) \ - ({ typeof(x) __val; __read_once_size(&x, &__val, sizeof(__val)); __val; }) + ({ union { typeof(x) __val; char __c[1]; } __u; __read_once_size(&(x), __u.__c, sizeof(x)); __u.__val; }) #define WRITE_ONCE(x, val) \ - ({ typeof(x) __val; __val = val; __write_once_size(&x, &__val, sizeof(__val)); __val; }) + ({ typeof(x) __val = (val); __write_once_size(&(x), &__val, sizeof(__val)); __val; }) #endif /* __KERNEL__ */ -- cgit v1.2.3 From 155e35d4daa804582f75acaa2c74ec797a89c615 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 29 Jan 2015 12:02:27 +0000 Subject: VFS: Introduce inode-getting helpers for layered/unioned fs environments Introduce some functions for getting the inode (and also the dentry) in an environment where layered/unioned filesystems are in operation. The problem is that we have places where we need *both* the union dentry and the lower source or workspace inode or dentry available, but we can only have a handle on one of them. Therefore we need to derive the handle to the other from that. The idea is to introduce an extra field in struct dentry that allows the union dentry to refer to and pin the lower dentry. Signed-off-by: David Howells Signed-off-by: Al Viro --- include/linux/dcache.h | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 92c08cf7670e..047c0db5763f 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -464,4 +464,61 @@ static inline unsigned long vfs_pressure_ratio(unsigned long val) { return mult_frac(val, sysctl_vfs_cache_pressure, 100); } + +/** + * d_inode - Get the actual inode of this dentry + * @dentry: The dentry to query + * + * This is the helper normal filesystems should use to get at their own inodes + * in their own dentries and ignore the layering superimposed upon them. + */ +static inline struct inode *d_inode(const struct dentry *dentry) +{ + return dentry->d_inode; +} + +/** + * d_inode_rcu - Get the actual inode of this dentry with ACCESS_ONCE() + * @dentry: The dentry to query + * + * This is the helper normal filesystems should use to get at their own inodes + * in their own dentries and ignore the layering superimposed upon them.
+ */ +static inline struct inode *d_inode_rcu(const struct dentry *dentry) +{ + return ACCESS_ONCE(dentry->d_inode); +} + +/** + * d_backing_inode - Get upper or lower inode we should be using + * @upper: The upper layer + * + * This is the helper that should be used to get at the inode that will be used + * if this dentry were to be opened as a file. The inode may be on the upper + * dentry or it may be on a lower dentry pinned by the upper. + * + * Normal filesystems should not use this to access their own inodes. + */ +static inline struct inode *d_backing_inode(const struct dentry *upper) +{ + struct inode *inode = upper->d_inode; + + return inode; +} + +/** + * d_backing_dentry - Get upper or lower dentry we should be using + * @upper: The upper layer + * + * This is the helper that should be used to get the dentry of the inode that + * will be used if this dentry were opened as a file. It may be the upper + * dentry or it may be a lower dentry pinned by the upper. + * + * Normal filesystems should not use this to access their own dentries. + */ +static inline struct dentry *d_backing_dentry(struct dentry *upper) +{ + return upper; +} + #endif /* __LINUX_DCACHE_H */ -- cgit v1.2.3 From e7f7d2253c05143ed76ddc5e11f4cf0a9b814a27 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 29 Jan 2015 12:02:27 +0000 Subject: VFS: Add a whiteout dentry type Add DCACHE_WHITEOUT_TYPE and provide a d_is_whiteout() accessor function. A d_is_miss() accessor is also added for ordinary cache misses and d_is_negative() is modified to indicate either an ordinary miss or an enforced miss (whiteout). Signed-off-by: David Howells Signed-off-by: Al Viro --- include/linux/dcache.h | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 047c0db5763f..98d2a948a08e 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -215,11 +215,12 @@ struct dentry_operations { #define DCACHE_LRU_LIST 0x00080000 #define DCACHE_ENTRY_TYPE 0x00700000 -#define DCACHE_MISS_TYPE 0x00000000 /* Negative dentry */ -#define DCACHE_DIRECTORY_TYPE 0x00100000 /* Normal directory */ -#define DCACHE_AUTODIR_TYPE 0x00200000 /* Lookupless directory (presumed automount) */ -#define DCACHE_SYMLINK_TYPE 0x00300000 /* Symlink */ -#define DCACHE_FILE_TYPE 0x00400000 /* Other file type */ +#define DCACHE_MISS_TYPE 0x00000000 /* Negative dentry (maybe fallthru to nowhere) */ +#define DCACHE_WHITEOUT_TYPE 0x00100000 /* Whiteout dentry (stop pathwalk) */ +#define DCACHE_DIRECTORY_TYPE 0x00200000 /* Normal directory */ +#define DCACHE_AUTODIR_TYPE 0x00300000 /* Lookupless directory (presumed automount) */ +#define DCACHE_SYMLINK_TYPE 0x00400000 /* Symlink (or fallthru to such) */ +#define DCACHE_FILE_TYPE 0x00500000 /* Other file type (or fallthru to such) */ #define DCACHE_MAY_FREE 0x00800000 @@ -423,6 +424,16 @@ static inline unsigned __d_entry_type(const struct dentry *dentry) return dentry->d_flags & DCACHE_ENTRY_TYPE; } +static inline bool d_is_miss(const struct dentry *dentry) +{ + return __d_entry_type(dentry) == DCACHE_MISS_TYPE; +} + +static inline bool d_is_whiteout(const struct dentry *dentry) +{ + return __d_entry_type(dentry) == DCACHE_WHITEOUT_TYPE; +} + static inline bool d_can_lookup(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_DIRECTORY_TYPE; @@ -450,7 +461,8 @@ static inline bool d_is_file(const struct dentry *dentry) static inline bool d_is_negative(const struct dentry 
*dentry) { - return __d_entry_type(dentry) == DCACHE_MISS_TYPE; + // TODO: check d_is_whiteout(dentry) also. + return d_is_miss(dentry); } static inline bool d_is_positive(const struct dentry *dentry) -- cgit v1.2.3 From df1a085af1f652a02238168c4f2b730c8c90dd4a Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 29 Jan 2015 12:02:28 +0000 Subject: VFS: Add a fallthrough flag for marking virtual dentries Add a DCACHE_FALLTHRU flag to indicate that, in a layered filesystem, this is a virtual dentry that covers another one in a lower layer that should be used instead. This may be recorded on medium if directory integration is stored there. The flag can be set with d_set_fallthru() and tested with d_is_fallthru(). Original-author: Valerie Aurora Signed-off-by: David Howells Signed-off-by: Al Viro --- fs/dcache.c | 19 ++++++++++++++++++- include/linux/dcache.h | 9 +++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/dcache.c b/fs/dcache.c index dc400fd29f4d..e33a0934efd7 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1659,6 +1659,22 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) } EXPORT_SYMBOL(d_set_d_op); + +/* + * d_set_fallthru - Mark a dentry as falling through to a lower layer + * @dentry - The dentry to mark + * + * Mark a dentry as falling through to the lower layer (as set with + * d_pin_lower()). This flag may be recorded on the medium. + */ +void d_set_fallthru(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + dentry->d_flags |= DCACHE_FALLTHRU; + spin_unlock(&dentry->d_lock); +} +EXPORT_SYMBOL(d_set_fallthru); + static unsigned d_flags_for_inode(struct inode *inode) { unsigned add_flags = DCACHE_FILE_TYPE; @@ -1691,7 +1707,8 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) unsigned add_flags = d_flags_for_inode(inode); spin_lock(&dentry->d_lock); - __d_set_type(dentry, add_flags); + dentry->d_flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU); + dentry->d_flags |= add_flags; if (inode) hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); dentry->d_inode = inode; diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 98d2a948a08e..728f5d31b5e6 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -223,6 +223,7 @@ struct dentry_operations { #define DCACHE_FILE_TYPE 0x00500000 /* Other file type (or fallthru to such) */ #define DCACHE_MAY_FREE 0x00800000 +#define DCACHE_FALLTHRU 0x01000000 /* Fall through to lower layer */ extern seqlock_t rename_lock; @@ -470,6 +471,14 @@ static inline bool d_is_positive(const struct dentry *dentry) return !d_is_negative(dentry); } +extern void d_set_fallthru(struct dentry *dentry); + +static inline bool d_is_fallthru(const struct dentry *dentry) +{ + return dentry->d_flags & DCACHE_FALLTHRU; +} + + extern int sysctl_vfs_cache_pressure; static inline unsigned long vfs_pressure_ratio(unsigned long val) -- cgit v1.2.3 From 44bdb5e5f6382ba88f7678d6f535f879324522ae Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 29 Jan 2015 12:02:29 +0000 Subject: VFS: Split DCACHE_FILE_TYPE into regular and special types Split DCACHE_FILE_TYPE into DCACHE_REGULAR_TYPE (dentries representing regular files) and DCACHE_SPECIAL_TYPE (representing blockdev, chardev, FIFO and socket files). d_is_reg() and d_is_special() are added to detect these subtypes and d_is_file() is left as the union of the two. This allows a number of places that use S_ISREG(dentry->d_inode->i_mode) to use d_is_reg(dentry) instead. 
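A sketch of the call-site conversion this enables (the caller is hypothetical):

#include <linux/types.h>
#include <linux/dcache.h>

static bool example_wants_regular_file(const struct dentry *dentry)
{
	/*
	 * Before: S_ISREG(dentry->d_inode->i_mode), which dereferences
	 * d_inode directly and assumes the dentry is positive. After: ask
	 * the dcache for the recorded type.
	 */
	return d_is_reg(dentry);
}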
Signed-off-by: David Howells Signed-off-by: Al Viro --- fs/dcache.c | 18 +++++++++++++----- include/linux/dcache.h | 17 ++++++++++++++--- 2 files changed, 27 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/fs/dcache.c b/fs/dcache.c index e33a0934efd7..c71e3732e53b 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1677,7 +1677,7 @@ EXPORT_SYMBOL(d_set_fallthru); static unsigned d_flags_for_inode(struct inode *inode) { - unsigned add_flags = DCACHE_FILE_TYPE; + unsigned add_flags = DCACHE_REGULAR_TYPE; if (!inode) return DCACHE_MISS_TYPE; @@ -1690,13 +1690,21 @@ static unsigned d_flags_for_inode(struct inode *inode) else inode->i_opflags |= IOP_LOOKUP; } - } else if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) { - if (unlikely(inode->i_op->follow_link)) + goto type_determined; + } + + if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) { + if (unlikely(inode->i_op->follow_link)) { add_flags = DCACHE_SYMLINK_TYPE; - else - inode->i_opflags |= IOP_NOFOLLOW; + goto type_determined; + } + inode->i_opflags |= IOP_NOFOLLOW; } + if (unlikely(!S_ISREG(inode->i_mode))) + add_flags = DCACHE_SPECIAL_TYPE; + +type_determined: if (unlikely(IS_AUTOMOUNT(inode))) add_flags |= DCACHE_NEED_AUTOMOUNT; return add_flags; diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 728f5d31b5e6..d8358799c594 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -219,8 +219,9 @@ struct dentry_operations { #define DCACHE_WHITEOUT_TYPE 0x00100000 /* Whiteout dentry (stop pathwalk) */ #define DCACHE_DIRECTORY_TYPE 0x00200000 /* Normal directory */ #define DCACHE_AUTODIR_TYPE 0x00300000 /* Lookupless directory (presumed automount) */ -#define DCACHE_SYMLINK_TYPE 0x00400000 /* Symlink (or fallthru to such) */ -#define DCACHE_FILE_TYPE 0x00500000 /* Other file type (or fallthru to such) */ +#define DCACHE_REGULAR_TYPE 0x00400000 /* Regular file type (or fallthru to such) */ +#define DCACHE_SPECIAL_TYPE 0x00500000 /* Other file type (or fallthru to such) */ +#define DCACHE_SYMLINK_TYPE 0x00600000 /* Symlink (or fallthru to such) */ #define DCACHE_MAY_FREE 0x00800000 #define DCACHE_FALLTHRU 0x01000000 /* Fall through to lower layer */ @@ -455,9 +456,19 @@ static inline bool d_is_symlink(const struct dentry *dentry) return __d_entry_type(dentry) == DCACHE_SYMLINK_TYPE; } +static inline bool d_is_reg(const struct dentry *dentry) +{ + return __d_entry_type(dentry) == DCACHE_REGULAR_TYPE; +} + +static inline bool d_is_special(const struct dentry *dentry) +{ + return __d_entry_type(dentry) == DCACHE_SPECIAL_TYPE; +} + static inline bool d_is_file(const struct dentry *dentry) { - return __d_entry_type(dentry) == DCACHE_FILE_TYPE; + return d_is_reg(dentry) || d_is_special(dentry); } static inline bool d_is_negative(const struct dentry *dentry) -- cgit v1.2.3 From 12ca7188468ee29c4e717f73db4bf43c90954fc7 Mon Sep 17 00:00:00 2001 From: Nishanth Menon Date: Fri, 13 Feb 2015 19:28:02 -0600 Subject: thermal: Introduce dummy functions when thermal is not defined When CONFIG_THERMAL is not enabled, it is better to introduce equivalent dummy functions in the exported header than to introduce #ifdeffery in drivers using the function. This will prevent issues such as that reported in: http://www.spinics.net/lists/linux-next/msg31573.html While at it, switch over to IS_ENABLED for the thermal macros to allow the thermal framework to be built as a module and the relevant APIs to be usable by relevant drivers as a result.
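A sketch of what the stubs buy a driver (the call site and names are hypothetical): the code below compiles unchanged whether CONFIG_THERMAL is y, m or n, and with it disabled the call simply yields an error pointer:

#include <linux/err.h>
#include <linux/thermal.h>

static int example_register_cooling(void *devdata,
				    const struct thermal_cooling_device_ops *ops)
{
	struct thermal_cooling_device *cdev;

	cdev = thermal_cooling_device_register("example", devdata, ops);
	if (IS_ERR(cdev))
		return PTR_ERR(cdev);	/* -ENODEV when thermal is off */

	return 0;
}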
Reported-by: Guenter Roeck Signed-off-by: Nishanth Menon Signed-off-by: Eduardo Valentin --- include/linux/thermal.h | 56 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 54 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index fc52e307efab..5eac316490ea 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -314,6 +314,8 @@ void thermal_zone_of_sensor_unregister(struct device *dev, } #endif + +#if IS_ENABLED(CONFIG_THERMAL) struct thermal_zone_device *thermal_zone_device_register(const char *, int, int, void *, struct thermal_zone_device_ops *, const struct thermal_zone_params *, int, int); @@ -340,8 +342,58 @@ struct thermal_instance *get_thermal_instance(struct thermal_zone_device *, struct thermal_cooling_device *, int); void thermal_cdev_update(struct thermal_cooling_device *); void thermal_notify_framework(struct thermal_zone_device *, int); - -#ifdef CONFIG_NET +#else +static inline struct thermal_zone_device *thermal_zone_device_register( + const char *type, int trips, int mask, void *devdata, + struct thermal_zone_device_ops *ops, + const struct thermal_zone_params *tzp, + int passive_delay, int polling_delay) +{ return ERR_PTR(-ENODEV); } +static inline void thermal_zone_device_unregister( + struct thermal_zone_device *tz) +{ } +static inline int thermal_zone_bind_cooling_device( + struct thermal_zone_device *tz, int trip, + struct thermal_cooling_device *cdev, + unsigned long upper, unsigned long lower) +{ return -ENODEV; } +static inline int thermal_zone_unbind_cooling_device( + struct thermal_zone_device *tz, int trip, + struct thermal_cooling_device *cdev) +{ return -ENODEV; } +static inline void thermal_zone_device_update(struct thermal_zone_device *tz) +{ } +static inline struct thermal_cooling_device * +thermal_cooling_device_register(char *type, void *devdata, + const struct thermal_cooling_device_ops *ops) +{ return ERR_PTR(-ENODEV); } +static inline struct thermal_cooling_device * +thermal_of_cooling_device_register(struct device_node *np, + char *type, void *devdata, const struct thermal_cooling_device_ops *ops) +{ return ERR_PTR(-ENODEV); } +static inline void thermal_cooling_device_unregister( + struct thermal_cooling_device *cdev) +{ } +static inline struct thermal_zone_device *thermal_zone_get_zone_by_name( + const char *name) +{ return ERR_PTR(-ENODEV); } +static inline int thermal_zone_get_temp( + struct thermal_zone_device *tz, unsigned long *temp) +{ return -ENODEV; } +static inline int get_tz_trend(struct thermal_zone_device *tz, int trip) +{ return -ENODEV; } +static inline struct thermal_instance * +get_thermal_instance(struct thermal_zone_device *tz, + struct thermal_cooling_device *cdev, int trip) +{ return ERR_PTR(-ENODEV); } +static inline void thermal_cdev_update(struct thermal_cooling_device *cdev) +{ } +static inline void thermal_notify_framework(struct thermal_zone_device *tz, + int trip) +{ } +#endif /* CONFIG_THERMAL */ + +#if defined(CONFIG_NET) && IS_ENABLED(CONFIG_THERMAL) extern int thermal_generate_netlink_event(struct thermal_zone_device *tz, enum events event); #else -- cgit v1.2.3
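And a matching usage sketch for the zone-side API (hypothetical caller; note the era-correct unsigned long millidegree argument): with CONFIG_THERMAL disabled, the stubbed thermal_zone_get_temp() returns -ENODEV and the branch is never taken:

#include <linux/printk.h>
#include <linux/thermal.h>

static void example_log_temp(struct thermal_zone_device *tz)
{
	unsigned long temp;	/* millidegrees Celsius */

	if (!thermal_zone_get_temp(tz, &temp))
		pr_info("zone temperature: %lu mC\n", temp);
}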