From 2543a87108d2af7d48a43b3d6685c2b1ea279e36 Mon Sep 17 00:00:00 2001
From: Amerigo Wang <amwang@redhat.com>
Date: Wed, 6 Apr 2011 10:43:11 +0200
Subject: workqueue: remove cancel_rearming_delayed_work[queue]()

cancel_rearming_delayed_work() and cancel_rearming_delayed_workqueue()
can be removed now.

Signed-off-by: WANG Cong <amwang@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h | 15 ---------------
 1 file changed, 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index f584aba78ca9..57b31b3d83bd 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -412,21 +412,6 @@ static inline bool __cancel_delayed_work(struct delayed_work *work)
 	return ret;
 }
 
-/* Obsolete. use cancel_delayed_work_sync() */
-static inline __deprecated
-void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq,
-					struct delayed_work *work)
-{
-	cancel_delayed_work_sync(work);
-}
-
-/* Obsolete. use cancel_delayed_work_sync() */
-static inline __deprecated
-void cancel_rearming_delayed_work(struct delayed_work *work)
-{
-	cancel_delayed_work_sync(work);
-}
-
 #ifndef CONFIG_SMP
 static inline long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
 {
-- 
cgit v1.2.3


From 9c5a2ba70251ecaab18c7a83e38b3c620223476c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 5 Apr 2011 18:01:44 +0200
Subject: workqueue: separate out drain_workqueue() from destroy_workqueue()

There are users which want to drain workqueues without destroying it.
Separate out drain functionality from destroy_workqueue() into
drain_workqueue() and make it accessible to workqueue users.

To guarantee forward-progress, only chain queueing is allowed while
drain is in progress.  If a new work item which isn't chained from the
running or pending work items is queued while draining is in progress,
WARN_ON_ONCE() is triggered.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: James Bottomley <James.Bottomley@hansenpartnership.com>
---
 include/linux/workqueue.h |  3 +-
 kernel/workqueue.c        | 81 +++++++++++++++++++++++++++++++----------------
 2 files changed, 55 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 57b31b3d83bd..2be2887c6958 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -255,7 +255,7 @@ enum {
 	WQ_HIGHPRI		= 1 << 4, /* high priority */
 	WQ_CPU_INTENSIVE	= 1 << 5, /* cpu instensive workqueue */
 
-	WQ_DYING		= 1 << 6, /* internal: workqueue is dying */
+	WQ_DRAINING		= 1 << 6, /* internal: workqueue is draining */
 	WQ_RESCUER		= 1 << 7, /* internal: workqueue has rescuer */
 
 	WQ_MAX_ACTIVE		= 512,	  /* I like 512, better ideas? */
@@ -355,6 +355,7 @@ extern int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 			struct delayed_work *work, unsigned long delay);
 
 extern void flush_workqueue(struct workqueue_struct *wq);
+extern void drain_workqueue(struct workqueue_struct *wq);
 extern void flush_scheduled_work(void);
 
 extern int schedule_work(struct work_struct *work);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index e3378e8d3a5c..25c8afeaeae8 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -221,7 +221,7 @@ typedef unsigned long mayday_mask_t;
  * per-CPU workqueues:
  */
 struct workqueue_struct {
-	unsigned int		flags;		/* I: WQ_* flags */
+	unsigned int		flags;		/* W: WQ_* flags */
 	union {
 		struct cpu_workqueue_struct __percpu	*pcpu;
 		struct cpu_workqueue_struct		*single;
@@ -240,6 +240,7 @@ struct workqueue_struct {
 	mayday_mask_t		mayday_mask;	/* cpus requesting rescue */
 	struct worker		*rescuer;	/* I: rescue worker */
 
+	int			nr_drainers;	/* W: drain in progress */
 	int			saved_max_active; /* W: saved cwq max_active */
 	const char		*name;		/* I: workqueue name */
 #ifdef CONFIG_LOCKDEP
@@ -990,7 +991,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
 	debug_work_activate(work);
 
 	/* if dying, only works from the same workqueue are allowed */
-	if (unlikely(wq->flags & WQ_DYING) &&
+	if (unlikely(wq->flags & WQ_DRAINING) &&
 	    WARN_ON_ONCE(!is_chained_work(wq)))
 		return;
 
@@ -2381,6 +2382,54 @@ out_unlock:
 }
 EXPORT_SYMBOL_GPL(flush_workqueue);
 
+/**
+ * drain_workqueue - drain a workqueue
+ * @wq: workqueue to drain
+ *
+ * Wait until the workqueue becomes empty.  While draining is in progress,
+ * only chain queueing is allowed.  IOW, only currently pending or running
+ * work items on @wq can queue further work items on it.  @wq is flushed
+ * repeatedly until it becomes empty.  The number of flushing is detemined
+ * by the depth of chaining and should be relatively short.  Whine if it
+ * takes too long.
+ */
+void drain_workqueue(struct workqueue_struct *wq)
+{
+	unsigned int flush_cnt = 0;
+	unsigned int cpu;
+
+	/*
+	 * __queue_work() needs to test whether there are drainers, is much
+	 * hotter than drain_workqueue() and already looks at @wq->flags.
+	 * Use WQ_DRAINING so that queue doesn't have to check nr_drainers.
+	 */
+	spin_lock(&workqueue_lock);
+	if (!wq->nr_drainers++)
+		wq->flags |= WQ_DRAINING;
+	spin_unlock(&workqueue_lock);
+reflush:
+	flush_workqueue(wq);
+
+	for_each_cwq_cpu(cpu, wq) {
+		struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+
+		if (!cwq->nr_active && list_empty(&cwq->delayed_works))
+			continue;
+
+		if (++flush_cnt == 10 ||
+		    (flush_cnt % 100 == 0 && flush_cnt <= 1000))
+			pr_warning("workqueue %s: flush on destruction isn't complete after %u tries\n",
+				   wq->name, flush_cnt);
+		goto reflush;
+	}
+
+	spin_lock(&workqueue_lock);
+	if (!--wq->nr_drainers)
+		wq->flags &= ~WQ_DRAINING;
+	spin_unlock(&workqueue_lock);
+}
+EXPORT_SYMBOL_GPL(drain_workqueue);
+
 static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
 			     bool wait_executing)
 {
@@ -3011,34 +3060,10 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
  */
 void destroy_workqueue(struct workqueue_struct *wq)
 {
-	unsigned int flush_cnt = 0;
 	unsigned int cpu;
 
-	/*
-	 * Mark @wq dying and drain all pending works.  Once WQ_DYING is
-	 * set, only chain queueing is allowed.  IOW, only currently
-	 * pending or running work items on @wq can queue further work
-	 * items on it.  @wq is flushed repeatedly until it becomes empty.
-	 * The number of flushing is detemined by the depth of chaining and
-	 * should be relatively short.  Whine if it takes too long.
-	 */
-	wq->flags |= WQ_DYING;
-reflush:
-	flush_workqueue(wq);
-
-	for_each_cwq_cpu(cpu, wq) {
-		struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
-
-		if (!cwq->nr_active && list_empty(&cwq->delayed_works))
-			continue;
-
-		if (++flush_cnt == 10 ||
-		    (flush_cnt % 100 == 0 && flush_cnt <= 1000))
-			printk(KERN_WARNING "workqueue %s: flush on "
-			       "destruction isn't complete after %u tries\n",
-			       wq->name, flush_cnt);
-		goto reflush;
-	}
+	/* drain it before proceeding with destruction */
+	drain_workqueue(wq);
 
 	/*
 	 * wq list is used to freeze wq, remove from list after
-- 
cgit v1.2.3


From 1bdcd095e39a789135f8638a2ff76f74e3071d46 Mon Sep 17 00:00:00 2001
From: Rafał Miłecki <zajec5@gmail.com>
Date: Wed, 18 May 2011 11:40:22 +0200
Subject: bcma: add IRQ number and pointer to DMA dev
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Rafał Miłecki <zajec5@gmail.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/bcma/main.c       | 2 ++
 include/linux/bcma/bcma.h | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/bcma/main.c b/drivers/bcma/main.c
index be52344ed19d..a2f6b1879273 100644
--- a/drivers/bcma/main.c
+++ b/drivers/bcma/main.c
@@ -89,6 +89,8 @@ static int bcma_register_cores(struct bcma_bus *bus)
 		switch (bus->hosttype) {
 		case BCMA_HOSTTYPE_PCI:
 			core->dev.parent = &bus->host_pci->dev;
+			core->dma_dev = &bus->host_pci->dev;
+			core->irq = bus->host_pci->irq;
 			break;
 		case BCMA_HOSTTYPE_NONE:
 		case BCMA_HOSTTYPE_SDIO:
diff --git a/include/linux/bcma/bcma.h b/include/linux/bcma/bcma.h
index 08763e4e848f..8b0cef9fd692 100644
--- a/include/linux/bcma/bcma.h
+++ b/include/linux/bcma/bcma.h
@@ -117,6 +117,8 @@ struct bcma_device {
 	struct bcma_device_id id;
 
 	struct device dev;
+	struct device *dma_dev;
+	unsigned int irq;
 	bool dev_registered;
 
 	u8 core_index;
-- 
cgit v1.2.3


From 9d75ef0f8f6d2e31ed940b3057a42a25f07076fb Mon Sep 17 00:00:00 2001
From: Rafał Miłecki <zajec5@gmail.com>
Date: Fri, 20 May 2011 03:27:06 +0200
Subject: bcma: host pci: implement block R/W operations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Rafał Miłecki <zajec5@gmail.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/bcma/Kconfig      |  5 +++++
 drivers/bcma/host_pci.c   | 52 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/bcma/bcma.h | 18 ++++++++++++++++
 3 files changed, 75 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/bcma/Kconfig b/drivers/bcma/Kconfig
index 353781b5b78b..83e9adf46441 100644
--- a/drivers/bcma/Kconfig
+++ b/drivers/bcma/Kconfig
@@ -13,6 +13,11 @@ config BCMA
 	  Bus driver for Broadcom specific Advanced Microcontroller Bus
 	  Architecture.
 
+# Support for Block-I/O. SELECT this from the driver that needs it.
+config BCMA_BLOCKIO
+	bool
+	depends on BCMA
+
 config BCMA_HOST_PCI_POSSIBLE
 	bool
 	depends on BCMA && PCI = y
diff --git a/drivers/bcma/host_pci.c b/drivers/bcma/host_pci.c
index ffd8797faf4f..279bf50f6d8e 100644
--- a/drivers/bcma/host_pci.c
+++ b/drivers/bcma/host_pci.c
@@ -64,6 +64,54 @@ static void bcma_host_pci_write32(struct bcma_device *core, u16 offset,
 	iowrite32(value, core->bus->mmio + offset);
 }
 
+#ifdef CONFIG_BCMA_BLOCKIO
+void bcma_host_pci_block_read(struct bcma_device *core, void *buffer,
+			      size_t count, u16 offset, u8 reg_width)
+{
+	void __iomem *addr = core->bus->mmio + offset;
+	if (core->bus->mapped_core != core)
+		bcma_host_pci_switch_core(core);
+	switch (reg_width) {
+	case sizeof(u8):
+		ioread8_rep(addr, buffer, count);
+		break;
+	case sizeof(u16):
+		WARN_ON(count & 1);
+		ioread16_rep(addr, buffer, count >> 1);
+		break;
+	case sizeof(u32):
+		WARN_ON(count & 3);
+		ioread32_rep(addr, buffer, count >> 2);
+		break;
+	default:
+		WARN_ON(1);
+	}
+}
+
+void bcma_host_pci_block_write(struct bcma_device *core, const void *buffer,
+			       size_t count, u16 offset, u8 reg_width)
+{
+	void __iomem *addr = core->bus->mmio + offset;
+	if (core->bus->mapped_core != core)
+		bcma_host_pci_switch_core(core);
+	switch (reg_width) {
+	case sizeof(u8):
+		iowrite8_rep(addr, buffer, count);
+		break;
+	case sizeof(u16):
+		WARN_ON(count & 1);
+		iowrite16_rep(addr, buffer, count >> 1);
+		break;
+	case sizeof(u32):
+		WARN_ON(count & 3);
+		iowrite32_rep(addr, buffer, count >> 2);
+		break;
+	default:
+		WARN_ON(1);
+	}
+}
+#endif
+
 static u32 bcma_host_pci_aread32(struct bcma_device *core, u16 offset)
 {
 	if (core->bus->mapped_core != core)
@@ -86,6 +134,10 @@ const struct bcma_host_ops bcma_host_pci_ops = {
 	.write8		= bcma_host_pci_write8,
 	.write16	= bcma_host_pci_write16,
 	.write32	= bcma_host_pci_write32,
+#ifdef CONFIG_BCMA_BLOCKIO
+	.block_read	= bcma_host_pci_block_read,
+	.block_write	= bcma_host_pci_block_write,
+#endif
 	.aread32	= bcma_host_pci_aread32,
 	.awrite32	= bcma_host_pci_awrite32,
 };
diff --git a/include/linux/bcma/bcma.h b/include/linux/bcma/bcma.h
index 8b0cef9fd692..27a27a79bea3 100644
--- a/include/linux/bcma/bcma.h
+++ b/include/linux/bcma/bcma.h
@@ -31,6 +31,12 @@ struct bcma_host_ops {
 	void (*write8)(struct bcma_device *core, u16 offset, u8 value);
 	void (*write16)(struct bcma_device *core, u16 offset, u16 value);
 	void (*write32)(struct bcma_device *core, u16 offset, u32 value);
+#ifdef CONFIG_BCMA_BLOCKIO
+	void (*block_read)(struct bcma_device *core, void *buffer,
+			   size_t count, u16 offset, u8 reg_width);
+	void (*block_write)(struct bcma_device *core, const void *buffer,
+			    size_t count, u16 offset, u8 reg_width);
+#endif
 	/* Agent ops */
 	u32 (*aread32)(struct bcma_device *core, u16 offset);
 	void (*awrite32)(struct bcma_device *core, u16 offset, u32 value);
@@ -210,6 +216,18 @@ void bcma_write32(struct bcma_device *core, u16 offset, u32 value)
 {
 	core->bus->ops->write32(core, offset, value);
 }
+#ifdef CONFIG_BCMA_BLOCKIO
+extern inline void bcma_block_read(struct bcma_device *core, void *buffer,
+				   size_t count, u16 offset, u8 reg_width)
+{
+	core->bus->ops->block_read(core, buffer, count, offset, reg_width);
+}
+extern inline void bcma_block_write(struct bcma_device *core, const void *buffer,
+				    size_t count, u16 offset, u8 reg_width)
+{
+	core->bus->ops->block_write(core, buffer, count, offset, reg_width);
+}
+#endif
 extern inline u32 bcma_aread32(struct bcma_device *core, u16 offset)
 {
 	return core->bus->ops->aread32(core, offset);
-- 
cgit v1.2.3


From a3cc68c37897f0656489a0d853d6e342fc6f076b Mon Sep 17 00:00:00 2001
From: H Hartley Sweeten <hartleys@visionengravers.com>
Date: Fri, 27 May 2011 16:35:59 -0700
Subject: gpio/74x164: remove unnecessary defines and prototype

Remove the #define GEN_74X164_GPIO_COUNT since it's only used in
one place and it's meaning is obvious.  Also remove the #define
GEN_74X164_DRIVER_NAME and use spi->modalias to set the gpio chip's
label and the string "74x164" for the driver name.

Reorder the code slightly to remove the need to prototype
gen_74x164_set_value.

Signed-off-by: H Hartley Sweeten <hsweeten@visionengravers.com>
Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
---
 drivers/gpio/74x164.c      | 27 +++++++++++----------------
 include/linux/spi/74x164.h |  2 --
 2 files changed, 11 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/74x164.c b/drivers/gpio/74x164.c
index 84e070219839..7fb60b6bf523 100644
--- a/drivers/gpio/74x164.c
+++ b/drivers/gpio/74x164.c
@@ -16,9 +16,6 @@
 #include <linux/gpio.h>
 #include <linux/slab.h>
 
-#define GEN_74X164_GPIO_COUNT	8
-
-
 struct gen_74x164_chip {
 	struct spi_device	*spi;
 	struct gpio_chip	gpio_chip;
@@ -26,8 +23,6 @@ struct gen_74x164_chip {
 	u8			port_config;
 };
 
-static void gen_74x164_set_value(struct gpio_chip *, unsigned, int);
-
 static struct gen_74x164_chip *gpio_to_chip(struct gpio_chip *gc)
 {
 	return container_of(gc, struct gen_74x164_chip, gpio_chip);
@@ -39,13 +34,6 @@ static int __gen_74x164_write_config(struct gen_74x164_chip *chip)
 			 &chip->port_config, sizeof(chip->port_config));
 }
 
-static int gen_74x164_direction_output(struct gpio_chip *gc,
-		unsigned offset, int val)
-{
-	gen_74x164_set_value(gc, offset, val);
-	return 0;
-}
-
 static int gen_74x164_get_value(struct gpio_chip *gc, unsigned offset)
 {
 	struct gen_74x164_chip *chip = gpio_to_chip(gc);
@@ -73,6 +61,13 @@ static void gen_74x164_set_value(struct gpio_chip *gc,
 	mutex_unlock(&chip->lock);
 }
 
+static int gen_74x164_direction_output(struct gpio_chip *gc,
+		unsigned offset, int val)
+{
+	gen_74x164_set_value(gc, offset, val);
+	return 0;
+}
+
 static int __devinit gen_74x164_probe(struct spi_device *spi)
 {
 	struct gen_74x164_chip *chip;
@@ -104,12 +99,12 @@ static int __devinit gen_74x164_probe(struct spi_device *spi)
 
 	chip->spi = spi;
 
-	chip->gpio_chip.label = GEN_74X164_DRIVER_NAME,
-		chip->gpio_chip.direction_output = gen_74x164_direction_output;
+	chip->gpio_chip.label = spi->modalias;
+	chip->gpio_chip.direction_output = gen_74x164_direction_output;
 	chip->gpio_chip.get = gen_74x164_get_value;
 	chip->gpio_chip.set = gen_74x164_set_value;
 	chip->gpio_chip.base = pdata->base;
-	chip->gpio_chip.ngpio = GEN_74X164_GPIO_COUNT;
+	chip->gpio_chip.ngpio = 8;
 	chip->gpio_chip.can_sleep = 1;
 	chip->gpio_chip.dev = &spi->dev;
 	chip->gpio_chip.owner = THIS_MODULE;
@@ -157,7 +152,7 @@ static int __devexit gen_74x164_remove(struct spi_device *spi)
 
 static struct spi_driver gen_74x164_driver = {
 	.driver = {
-		.name		= GEN_74X164_DRIVER_NAME,
+		.name		= "74x164",
 		.owner		= THIS_MODULE,
 	},
 	.probe		= gen_74x164_probe,
diff --git a/include/linux/spi/74x164.h b/include/linux/spi/74x164.h
index d85c52f294a0..0aa6acc73317 100644
--- a/include/linux/spi/74x164.h
+++ b/include/linux/spi/74x164.h
@@ -1,8 +1,6 @@
 #ifndef LINUX_SPI_74X164_H
 #define LINUX_SPI_74X164_H
 
-#define GEN_74X164_DRIVER_NAME "74x164"
-
 struct gen_74x164_chip_platform_data {
 	/* number assigned to the first GPIO */
 	unsigned	base;
-- 
cgit v1.2.3


From 7150962d637cf38617924f7f72ea00612283eb89 Mon Sep 17 00:00:00 2001
From: Arend van Spriel <arend@broadcom.com>
Date: Tue, 31 May 2011 11:22:15 +0200
Subject: lib: crc8: add new library module providing crc8 algorithm

The brcm80211 driver in staging tree uses a crc8 function. Based on
feedback from John Linville to move this to lib directory, the linux
source has been searched. Although there is currently only one other
kernel driver using this algorithm (ie. drivers/ssb) we are providing
this as a library function for others to use.

Cc: linux-kernel@vger.kernel.org
Cc: linux-wireless@vger.kernel.org
Cc: Dan Carpenter <error27@gmail.com>
Cc: George Spelvin <linux@horizon.com>
Cc: Randy Dunlap <rdunlap@xenotime.net>
Reviewed-by: Henry Ptasinski <henryp@broadcom.com>
Reviewed-by: Roland Vossen <rvossen@broadcom.com>
Reviewed-by: "Franky (Zhenhui) Lin" <frankyl@broadcom.com>
Signed-off-by: Arend van Spriel <arend@broadcom.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 include/linux/crc8.h | 101 +++++++++++++++++++++++++++++++++++++++++++++++++++
 lib/Kconfig          |   7 ++++
 lib/Makefile         |   1 +
 lib/crc8.c           |  86 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 195 insertions(+)
 create mode 100644 include/linux/crc8.h
 create mode 100644 lib/crc8.c

(limited to 'include/linux')

diff --git a/include/linux/crc8.h b/include/linux/crc8.h
new file mode 100644
index 000000000000..13c8dabb0441
--- /dev/null
+++ b/include/linux/crc8.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2011 Broadcom Corporation
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#ifndef __CRC8_H_
+#define __CRC8_H_
+
+#include <linux/types.h>
+
+/* see usage of this value in crc8() description */
+#define CRC8_INIT_VALUE		0xFF
+
+/*
+ * Return value of crc8() indicating valid message+crc. This is true
+ * if a CRC is inverted before transmission. The CRC computed over the
+ * whole received bitstream is _table[x], where x is the bit pattern
+ * of the modification (almost always 0xff).
+ */
+#define CRC8_GOOD_VALUE(_table)	(_table[0xFF])
+
+/* required table size for crc8 algorithm */
+#define CRC8_TABLE_SIZE			256
+
+/* helper macro assuring right table size is used */
+#define DECLARE_CRC8_TABLE(_table) \
+	static u8 _table[CRC8_TABLE_SIZE]
+
+/**
+ * crc8_populate_lsb - fill crc table for given polynomial in regular bit order.
+ *
+ * @table:	table to be filled.
+ * @polynomial:	polynomial for which table is to be filled.
+ *
+ * This function fills the provided table according the polynomial provided for
+ * regular bit order (lsb first). Polynomials in CRC algorithms are typically
+ * represented as shown below.
+ *
+ *	poly = x^8 + x^7 + x^6 + x^4 + x^2 + 1
+ *
+ * For lsb first direction x^7 maps to the lsb. So the polynomial is as below.
+ *
+ * - lsb first: poly = 10101011(1) = 0xAB
+ */
+void crc8_populate_lsb(u8 table[CRC8_TABLE_SIZE], u8 polynomial);
+
+/**
+ * crc8_populate_msb - fill crc table for given polynomial in reverse bit order.
+ *
+ * @table:	table to be filled.
+ * @polynomial:	polynomial for which table is to be filled.
+ *
+ * This function fills the provided table according the polynomial provided for
+ * reverse bit order (msb first). Polynomials in CRC algorithms are typically
+ * represented as shown below.
+ *
+ *	poly = x^8 + x^7 + x^6 + x^4 + x^2 + 1
+ *
+ * For msb first direction x^7 maps to the msb. So the polynomial is as below.
+ *
+ * - msb first: poly = (1)11010101 = 0xD5
+ */
+void crc8_populate_msb(u8 table[CRC8_TABLE_SIZE], u8 polynomial);
+
+/**
+ * crc8() - calculate a crc8 over the given input data.
+ *
+ * @table:	crc table used for calculation.
+ * @pdata:	pointer to data buffer.
+ * @nbytes:	number of bytes in data buffer.
+ * @crc:	previous returned crc8 value.
+ *
+ * The CRC8 is calculated using the polynomial given in crc8_populate_msb()
+ * or crc8_populate_lsb().
+ *
+ * The caller provides the initial value (either %CRC8_INIT_VALUE
+ * or the previous returned value) to allow for processing of
+ * discontiguous blocks of data.  When generating the CRC the
+ * caller is responsible for complementing the final return value
+ * and inserting it into the byte stream.  When validating a byte
+ * stream (including CRC8), a final return value of %CRC8_GOOD_VALUE
+ * indicates the byte stream data can be considered valid.
+ *
+ * Reference:
+ * "A Painless Guide to CRC Error Detection Algorithms", ver 3, Aug 1993
+ * Williams, Ross N., ross<at>ross.net
+ * (see URL http://www.ross.net/crc/download/crc_v3.txt).
+ */
+u8 crc8(const u8 table[CRC8_TABLE_SIZE], u8 *pdata, size_t nbytes, u8 crc);
+
+#endif /* __CRC8_H_ */
diff --git a/lib/Kconfig b/lib/Kconfig
index 9c10e38fc609..ff9e5a34be76 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -89,6 +89,13 @@ config LIBCRC32C
 	  require M here.  See Castagnoli93.
 	  Module will be libcrc32c.
 
+config CRC8
+	tristate "CRC8 function"
+	help
+	  This option provides CRC8 function. Drivers may select this
+	  when they need to do cyclic redundancy check according CRC8
+	  algorithm. Module will be called crc8.
+
 config AUDIT_GENERIC
 	bool
 	depends on AUDIT && !AUDIT_ARCH
diff --git a/lib/Makefile b/lib/Makefile
index 4b49a249064b..704959d2018b 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -65,6 +65,7 @@ obj-$(CONFIG_CRC_ITU_T)	+= crc-itu-t.o
 obj-$(CONFIG_CRC32)	+= crc32.o
 obj-$(CONFIG_CRC7)	+= crc7.o
 obj-$(CONFIG_LIBCRC32C)	+= libcrc32c.o
+obj-$(CONFIG_CRC8)	+= crc8.o
 obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o
 
 obj-$(CONFIG_ZLIB_INFLATE) += zlib_inflate/
diff --git a/lib/crc8.c b/lib/crc8.c
new file mode 100644
index 000000000000..87b59cafdb83
--- /dev/null
+++ b/lib/crc8.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2011 Broadcom Corporation
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#define pr_fmt(fmt)		KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/crc8.h>
+#include <linux/printk.h>
+
+/*
+ * crc8_populate_msb - fill crc table for given polynomial in reverse bit order.
+ *
+ * table:	table to be filled.
+ * polynomial:	polynomial for which table is to be filled.
+ */
+void crc8_populate_msb(u8 table[CRC8_TABLE_SIZE], u8 polynomial)
+{
+	int i, j;
+	const u8 msbit = 0x80;
+	u8 t = msbit;
+
+	table[0] = 0;
+
+	for (i = 1; i < CRC8_TABLE_SIZE; i *= 2) {
+		t = (t << 1) ^ (t & msbit ? polynomial : 0);
+		for (j = 0; j < i; j++)
+			table[i+j] = table[j] ^ t;
+	}
+}
+EXPORT_SYMBOL(crc8_populate_msb);
+
+/*
+ * crc8_populate_lsb - fill crc table for given polynomial in regular bit order.
+ *
+ * table:	table to be filled.
+ * polynomial:	polynomial for which table is to be filled.
+ */
+void crc8_populate_lsb(u8 table[CRC8_TABLE_SIZE], u8 polynomial)
+{
+	int i, j;
+	u8 t = 1;
+
+	table[0] = 0;
+
+	for (i = (CRC8_TABLE_SIZE >> 1); i; i >>= 1) {
+		t = (t >> 1) ^ (t & 1 ? polynomial : 0);
+		for (j = 0; j < CRC8_TABLE_SIZE; j += 2*i)
+			table[i+j] = table[j] ^ t;
+	}
+}
+EXPORT_SYMBOL(crc8_populate_lsb);
+
+/*
+ * crc8 - calculate a crc8 over the given input data.
+ *
+ * table: crc table used for calculation.
+ * pdata: pointer to data buffer.
+ * nbytes: number of bytes in data buffer.
+ * crc:	previous returned crc8 value.
+ */
+u8 crc8(const u8 table[CRC8_TABLE_SIZE], u8 *pdata, size_t nbytes, u8 crc)
+{
+	/* loop over the buffer data */
+	while (nbytes-- > 0)
+		crc = table[(crc ^ *pdata++) & 0xff];
+
+	return crc;
+}
+EXPORT_SYMBOL(crc8);
+
+MODULE_DESCRIPTION("CRC8 (by Williams, Ross N.) function");
+MODULE_AUTHOR("Broadcom Corporation");
+MODULE_LICENSE("Dual BSD/GPL");
-- 
cgit v1.2.3


From 10f8113ecb76eea72f96c7cfb72d7fed7c282565 Mon Sep 17 00:00:00 2001
From: Arend van Spriel <arend@broadcom.com>
Date: Tue, 31 May 2011 11:22:16 +0200
Subject: lib: cordic: add library module providing cordic angle calculation

The brcm80211 driver in the staging tree has a cordic function to
determine cosine and sine for a given angle. Feedback received from
John Linville suggested that these kind of functions should be made
available to others as a library function in the kernel tree. The
b43 driver also has a cordic angle calculation implemented.

Cc: linux-kernel@vger.kernel.org
Cc: linux-wireless@vger.kernel.org
Cc: "John W. Linville" <linville@tuxdriver.com>
Cc: Greg Kroah-Hartman <gregkh@suse.de>
Cc: Dan Carpenter <error27@gmail.com>
Cc: Randy Dunlap <rdunlap@xenotime.net>
Cc: Larry Finger <Larry.Finger@lwfinger.net>
Reviewed-by: Roland Vossen <rvossen@broadcom.com>
Reviewed-by: Henry Ptasinski <henryp@broadcom.com>
Reviewed-by: Franky (Zhenhui) Lin <frankyl@broadcom.com>
Signed-off-by: Arend van Spriel <arend@broadcom.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 include/linux/cordic.h |  48 +++++++++++++++++++++++
 lib/Kconfig            |   7 ++++
 lib/Makefile           |   2 +
 lib/cordic.c           | 101 +++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 158 insertions(+)
 create mode 100644 include/linux/cordic.h
 create mode 100644 lib/cordic.c

(limited to 'include/linux')

diff --git a/include/linux/cordic.h b/include/linux/cordic.h
new file mode 100644
index 000000000000..f932093e20c2
--- /dev/null
+++ b/include/linux/cordic.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2011 Broadcom Corporation
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#ifndef __CORDIC_H_
+#define __CORDIC_H_
+
+#include <linux/types.h>
+
+/**
+ * struct cordic_iq - i/q coordinate.
+ *
+ * @i: real part of coordinate (in phase).
+ * @q: imaginary part of coordinate (quadrature).
+ */
+struct cordic_iq {
+	s32 i;
+	s32 q;
+};
+
+/**
+ * cordic_calc_iq() - calculates the i/q coordinate for given angle.
+ *
+ * @theta: angle in degrees for which i/q coordinate is to be calculated.
+ * @coord: function output parameter holding the i/q coordinate.
+ *
+ * The function calculates the i/q coordinate for a given angle using
+ * cordic algorithm. The coordinate consists of a real (i) and an
+ * imaginary (q) part. The real part is essentially the cosine of the
+ * angle and the imaginary part is the sine of the angle. The returned
+ * values are scaled by 2^16 for precision. The range for theta is
+ * for -180 degrees to +180 degrees. Passed values outside this range are
+ * converted before doing the actual calculation.
+ */
+struct cordic_iq cordic_calc_iq(s32 theta);
+
+#endif /* __CORDIC_H_ */
diff --git a/lib/Kconfig b/lib/Kconfig
index ff9e5a34be76..5c702047c57f 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -279,4 +279,11 @@ config AVERAGE
 
 	  If unsure, say N.
 
+config CORDIC
+	tristate "Cordic function"
+	help
+	  The option provides arithmetic function using cordic algorithm
+	  so its calculations are in fixed point. Modules can select this
+	  when they require this function. Module will be called cordic.
+
 endmenu
diff --git a/lib/Makefile b/lib/Makefile
index 704959d2018b..9e3c1b0652d1 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -117,6 +117,8 @@ obj-$(CONFIG_AVERAGE) += average.o
 
 obj-$(CONFIG_CPU_RMAP) += cpu_rmap.o
 
+obj-$(CONFIG_CORDIC) += cordic.o
+
 hostprogs-y	:= gen_crc32table
 clean-files	:= crc32table.h
 
diff --git a/lib/cordic.c b/lib/cordic.c
new file mode 100644
index 000000000000..aa27a88d7e04
--- /dev/null
+++ b/lib/cordic.c
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2011 Broadcom Corporation
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#include <linux/module.h>
+#include <linux/cordic.h>
+
+#define CORDIC_ANGLE_GEN	39797
+#define CORDIC_PRECISION_SHIFT	16
+#define	CORDIC_NUM_ITER		(CORDIC_PRECISION_SHIFT + 2)
+
+#define	FIXED(X)	((s32)((X) << CORDIC_PRECISION_SHIFT))
+#define	FLOAT(X)	(((X) >= 0) \
+		? ((((X) >> (CORDIC_PRECISION_SHIFT - 1)) + 1) >> 1) \
+		: -((((-(X)) >> (CORDIC_PRECISION_SHIFT - 1)) + 1) >> 1))
+
+static const s32 arctan_table[] = {
+	2949120,
+	1740967,
+	919879,
+	466945,
+	234379,
+	117304,
+	58666,
+	29335,
+	14668,
+	7334,
+	3667,
+	1833,
+	917,
+	458,
+	229,
+	115,
+	57,
+	29
+};
+
+/*
+ * cordic_calc_iq() - calculates the i/q coordinate for given angle
+ *
+ * theta: angle in degrees for which i/q coordinate is to be calculated
+ * coord: function output parameter holding the i/q coordinate
+ */
+struct cordic_iq cordic_calc_iq(s32 theta)
+{
+	struct cordic_iq coord;
+	s32 angle, valtmp;
+	unsigned iter;
+	int signx = 1;
+	int signtheta;
+
+	coord.i = CORDIC_ANGLE_GEN;
+	coord.q = 0;
+	angle = 0;
+
+	theta = FIXED(theta);
+	signtheta = (theta < 0) ? -1 : 1;
+	theta = ((theta + FIXED(180) * signtheta) % FIXED(360)) -
+		FIXED(180) * signtheta;
+
+	if (FLOAT(theta) > 90) {
+		theta -= FIXED(180);
+		signx = -1;
+	} else if (FLOAT(theta) < -90) {
+		theta += FIXED(180);
+		signx = -1;
+	}
+
+	for (iter = 0; iter < CORDIC_NUM_ITER; iter++) {
+		if (theta > angle) {
+			valtmp = coord.i - (coord.q >> iter);
+			coord.q += (coord.i >> iter);
+			angle += arctan_table[iter];
+		} else {
+			valtmp = coord.i + (coord.q >> iter);
+			coord.q -= (coord.i >> iter);
+			angle -= arctan_table[iter];
+		}
+		coord.i = valtmp;
+	}
+
+	coord.i *= signx;
+	coord.q *= signx;
+	return coord;
+}
+EXPORT_SYMBOL(cordic_calc_iq);
+
+MODULE_DESCRIPTION("Cordic functions");
+MODULE_AUTHOR("Broadcom Corporation");
+MODULE_LICENSE("Dual BSD/GPL");
-- 
cgit v1.2.3


From 27f18dc2dafe66a93c7101fc94201b8c83903597 Mon Sep 17 00:00:00 2001
From: Rafał Miłecki <zajec5@gmail.com>
Date: Thu, 2 Jun 2011 02:08:51 +0200
Subject: bcma: read SPROM and extract MAC from it
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In case of BCMA cards SPROM is located in the ChipCommon core, it is
not mapped as separated host window. So far we have met only SPROMs rev
8.
SPROM layout seems to be the same as for SSB buses, so we decided to
share SPROM struct and some defines.
For now we extract MAC address only, this can be improved of course.

Signed-off-by: Rafał Miłecki <zajec5@gmail.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/bcma/Makefile                       |   2 +-
 drivers/bcma/bcma_private.h                 |   3 +
 drivers/bcma/main.c                         |   7 ++
 drivers/bcma/sprom.c                        | 162 ++++++++++++++++++++++++++++
 include/linux/bcma/bcma.h                   |   5 +
 include/linux/bcma/bcma_driver_chipcommon.h |   1 +
 6 files changed, 179 insertions(+), 1 deletion(-)
 create mode 100644 drivers/bcma/sprom.c

(limited to 'include/linux')

diff --git a/drivers/bcma/Makefile b/drivers/bcma/Makefile
index 0d56245bcb79..cde0182bd1dc 100644
--- a/drivers/bcma/Makefile
+++ b/drivers/bcma/Makefile
@@ -1,4 +1,4 @@
-bcma-y					+= main.o scan.o core.o
+bcma-y					+= main.o scan.o core.o sprom.o
 bcma-y					+= driver_chipcommon.o driver_chipcommon_pmu.o
 bcma-y					+= driver_pci.o
 bcma-$(CONFIG_BCMA_HOST_PCI)		+= host_pci.o
diff --git a/drivers/bcma/bcma_private.h b/drivers/bcma/bcma_private.h
index 2f72e9c585fd..12a75ab3dd23 100644
--- a/drivers/bcma/bcma_private.h
+++ b/drivers/bcma/bcma_private.h
@@ -19,6 +19,9 @@ extern void bcma_bus_unregister(struct bcma_bus *bus);
 /* scan.c */
 int bcma_bus_scan(struct bcma_bus *bus);
 
+/* sprom.c */
+int bcma_sprom_get(struct bcma_bus *bus);
+
 #ifdef CONFIG_BCMA_HOST_PCI
 /* host_pci.c */
 extern int __init bcma_host_pci_init(void);
diff --git a/drivers/bcma/main.c b/drivers/bcma/main.c
index a2f6b1879273..11e96dc6011a 100644
--- a/drivers/bcma/main.c
+++ b/drivers/bcma/main.c
@@ -146,6 +146,13 @@ int bcma_bus_register(struct bcma_bus *bus)
 		bcma_core_pci_init(&bus->drv_pci);
 	}
 
+	/* Try to get SPROM */
+	err = bcma_sprom_get(bus);
+	if (err) {
+		pr_err("Failed to get SPROM: %d\n", err);
+		return -ENOENT;
+	}
+
 	/* Register found cores */
 	bcma_register_cores(bus);
 
diff --git a/drivers/bcma/sprom.c b/drivers/bcma/sprom.c
new file mode 100644
index 000000000000..ffbb0e32e921
--- /dev/null
+++ b/drivers/bcma/sprom.c
@@ -0,0 +1,162 @@
+/*
+ * Broadcom specific AMBA
+ * SPROM reading
+ *
+ * Licensed under the GNU/GPL. See COPYING for details.
+ */
+
+#include "bcma_private.h"
+
+#include <linux/bcma/bcma.h>
+#include <linux/bcma/bcma_regs.h>
+#include <linux/pci.h>
+#include <linux/io.h>
+#include <linux/dma-mapping.h>
+#include <linux/slab.h>
+
+#define SPOFF(offset)	((offset) / sizeof(u16))
+
+/**************************************************
+ * R/W ops.
+ **************************************************/
+
+static void bcma_sprom_read(struct bcma_bus *bus, u16 *sprom)
+{
+	int i;
+	for (i = 0; i < SSB_SPROMSIZE_WORDS_R4; i++)
+		sprom[i] = bcma_read16(bus->drv_cc.core,
+				       BCMA_CC_SPROM + (i * 2));
+}
+
+/**************************************************
+ * Validation.
+ **************************************************/
+
+static inline u8 bcma_crc8(u8 crc, u8 data)
+{
+	/* Polynomial:   x^8 + x^7 + x^6 + x^4 + x^2 + 1   */
+	static const u8 t[] = {
+		0x00, 0xF7, 0xB9, 0x4E, 0x25, 0xD2, 0x9C, 0x6B,
+		0x4A, 0xBD, 0xF3, 0x04, 0x6F, 0x98, 0xD6, 0x21,
+		0x94, 0x63, 0x2D, 0xDA, 0xB1, 0x46, 0x08, 0xFF,
+		0xDE, 0x29, 0x67, 0x90, 0xFB, 0x0C, 0x42, 0xB5,
+		0x7F, 0x88, 0xC6, 0x31, 0x5A, 0xAD, 0xE3, 0x14,
+		0x35, 0xC2, 0x8C, 0x7B, 0x10, 0xE7, 0xA9, 0x5E,
+		0xEB, 0x1C, 0x52, 0xA5, 0xCE, 0x39, 0x77, 0x80,
+		0xA1, 0x56, 0x18, 0xEF, 0x84, 0x73, 0x3D, 0xCA,
+		0xFE, 0x09, 0x47, 0xB0, 0xDB, 0x2C, 0x62, 0x95,
+		0xB4, 0x43, 0x0D, 0xFA, 0x91, 0x66, 0x28, 0xDF,
+		0x6A, 0x9D, 0xD3, 0x24, 0x4F, 0xB8, 0xF6, 0x01,
+		0x20, 0xD7, 0x99, 0x6E, 0x05, 0xF2, 0xBC, 0x4B,
+		0x81, 0x76, 0x38, 0xCF, 0xA4, 0x53, 0x1D, 0xEA,
+		0xCB, 0x3C, 0x72, 0x85, 0xEE, 0x19, 0x57, 0xA0,
+		0x15, 0xE2, 0xAC, 0x5B, 0x30, 0xC7, 0x89, 0x7E,
+		0x5F, 0xA8, 0xE6, 0x11, 0x7A, 0x8D, 0xC3, 0x34,
+		0xAB, 0x5C, 0x12, 0xE5, 0x8E, 0x79, 0x37, 0xC0,
+		0xE1, 0x16, 0x58, 0xAF, 0xC4, 0x33, 0x7D, 0x8A,
+		0x3F, 0xC8, 0x86, 0x71, 0x1A, 0xED, 0xA3, 0x54,
+		0x75, 0x82, 0xCC, 0x3B, 0x50, 0xA7, 0xE9, 0x1E,
+		0xD4, 0x23, 0x6D, 0x9A, 0xF1, 0x06, 0x48, 0xBF,
+		0x9E, 0x69, 0x27, 0xD0, 0xBB, 0x4C, 0x02, 0xF5,
+		0x40, 0xB7, 0xF9, 0x0E, 0x65, 0x92, 0xDC, 0x2B,
+		0x0A, 0xFD, 0xB3, 0x44, 0x2F, 0xD8, 0x96, 0x61,
+		0x55, 0xA2, 0xEC, 0x1B, 0x70, 0x87, 0xC9, 0x3E,
+		0x1F, 0xE8, 0xA6, 0x51, 0x3A, 0xCD, 0x83, 0x74,
+		0xC1, 0x36, 0x78, 0x8F, 0xE4, 0x13, 0x5D, 0xAA,
+		0x8B, 0x7C, 0x32, 0xC5, 0xAE, 0x59, 0x17, 0xE0,
+		0x2A, 0xDD, 0x93, 0x64, 0x0F, 0xF8, 0xB6, 0x41,
+		0x60, 0x97, 0xD9, 0x2E, 0x45, 0xB2, 0xFC, 0x0B,
+		0xBE, 0x49, 0x07, 0xF0, 0x9B, 0x6C, 0x22, 0xD5,
+		0xF4, 0x03, 0x4D, 0xBA, 0xD1, 0x26, 0x68, 0x9F,
+	};
+	return t[crc ^ data];
+}
+
+static u8 bcma_sprom_crc(const u16 *sprom)
+{
+	int word;
+	u8 crc = 0xFF;
+
+	for (word = 0; word < SSB_SPROMSIZE_WORDS_R4 - 1; word++) {
+		crc = bcma_crc8(crc, sprom[word] & 0x00FF);
+		crc = bcma_crc8(crc, (sprom[word] & 0xFF00) >> 8);
+	}
+	crc = bcma_crc8(crc, sprom[SSB_SPROMSIZE_WORDS_R4 - 1] & 0x00FF);
+	crc ^= 0xFF;
+
+	return crc;
+}
+
+static int bcma_sprom_check_crc(const u16 *sprom)
+{
+	u8 crc;
+	u8 expected_crc;
+	u16 tmp;
+
+	crc = bcma_sprom_crc(sprom);
+	tmp = sprom[SSB_SPROMSIZE_WORDS_R4 - 1] & SSB_SPROM_REVISION_CRC;
+	expected_crc = tmp >> SSB_SPROM_REVISION_CRC_SHIFT;
+	if (crc != expected_crc)
+		return -EPROTO;
+
+	return 0;
+}
+
+static int bcma_sprom_valid(const u16 *sprom)
+{
+	u16 revision;
+	int err;
+
+	err = bcma_sprom_check_crc(sprom);
+	if (err)
+		return err;
+
+	revision = sprom[SSB_SPROMSIZE_WORDS_R4 - 1] & SSB_SPROM_REVISION_REV;
+	if (revision != 8) {
+		pr_err("Unsupported SPROM revision: %d\n", revision);
+		return -ENOENT;
+	}
+
+	return 0;
+}
+
+/**************************************************
+ * SPROM extraction.
+ **************************************************/
+
+static void bcma_sprom_extract_r8(struct bcma_bus *bus, const u16 *sprom)
+{
+	u16 v;
+	int i;
+
+	for (i = 0; i < 3; i++) {
+		v = sprom[SPOFF(SSB_SPROM8_IL0MAC) + i];
+		*(((__be16 *)bus->sprom.il0mac) + i) = cpu_to_be16(v);
+	}
+}
+
+int bcma_sprom_get(struct bcma_bus *bus)
+{
+	u16 *sprom;
+	int err = 0;
+
+	if (!bus->drv_cc.core)
+		return -EOPNOTSUPP;
+
+	sprom = kcalloc(SSB_SPROMSIZE_WORDS_R4, sizeof(u16),
+			GFP_KERNEL);
+	if (!sprom)
+		return -ENOMEM;
+
+	bcma_sprom_read(bus, sprom);
+
+	err = bcma_sprom_valid(sprom);
+	if (err)
+		goto out;
+
+	bcma_sprom_extract_r8(bus, sprom);
+
+out:
+	kfree(sprom);
+	return err;
+}
diff --git a/include/linux/bcma/bcma.h b/include/linux/bcma/bcma.h
index 27a27a79bea3..6ff080eac0b2 100644
--- a/include/linux/bcma/bcma.h
+++ b/include/linux/bcma/bcma.h
@@ -6,6 +6,7 @@
 
 #include <linux/bcma/bcma_driver_chipcommon.h>
 #include <linux/bcma/bcma_driver_pci.h>
+#include <linux/ssb/ssb.h> /* SPROM sharing */
 
 #include "bcma_regs.h"
 
@@ -187,6 +188,10 @@ struct bcma_bus {
 
 	struct bcma_drv_cc drv_cc;
 	struct bcma_drv_pci drv_pci;
+
+	/* We decided to share SPROM struct with SSB as long as we do not need
+	 * any hacks for BCMA. This simplifies drivers code. */
+	struct ssb_sprom sprom;
 };
 
 extern inline u32 bcma_read8(struct bcma_device *core, u16 offset)
diff --git a/include/linux/bcma/bcma_driver_chipcommon.h b/include/linux/bcma/bcma_driver_chipcommon.h
index 083c3b6cd5ce..9c5b69fc985a 100644
--- a/include/linux/bcma/bcma_driver_chipcommon.h
+++ b/include/linux/bcma/bcma_driver_chipcommon.h
@@ -244,6 +244,7 @@
 #define BCMA_CC_REGCTL_DATA		0x065C
 #define BCMA_CC_PLLCTL_ADDR		0x0660
 #define BCMA_CC_PLLCTL_DATA		0x0664
+#define BCMA_CC_SPROM			0x0830 /* SPROM beginning */
 
 /* Data for the PMU, if available.
  * Check availability with ((struct bcma_chipcommon)->capabilities & BCMA_CC_CAP_PMU)
-- 
cgit v1.2.3


From a8f072c1d624a627b67f2ace2f0c25d856ef4e54 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 2 Jun 2011 11:13:59 +0200
Subject: job control: rename signal->group_stop and flags to jobctl and update
 them

signal->group_stop currently hosts mostly group stop related flags;
however, it's gonna be used for wider purposes and the GROUP_STOP_
flag prefix becomes confusing.  Rename signal->group_stop to
signal->jobctl and rename all GROUP_STOP_* flags to JOBCTL_*.

Bit position macros JOBCTL_*_BIT are defined and JOBCTL_* flags are
defined in terms of them to allow using bitops later.

While at it, reassign JOBCTL_TRAPPING to bit 22 to better accomodate
future additions.

This doesn't cause any functional change.

-v2: JOBCTL_*_BIT macros added as suggested by Linus.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 fs/exec.c             |  2 +-
 include/linux/sched.h | 22 ++++++++-----
 kernel/ptrace.c       | 12 +++----
 kernel/signal.c       | 91 ++++++++++++++++++++++++++-------------------------
 4 files changed, 67 insertions(+), 60 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exec.c b/fs/exec.c
index ea5f748906a8..8986bb0f9dc2 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1772,7 +1772,7 @@ static int zap_process(struct task_struct *start, int exit_code)
 
 	t = start;
 	do {
-		task_clear_group_stop_pending(t);
+		task_clear_jobctl_stop_pending(t);
 		if (t != current && t->mm) {
 			sigaddset(&t->pending.signal, SIGKILL);
 			signal_wake_up(t, 1);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2a8621c4be1e..b0dd064eb4fc 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1282,7 +1282,7 @@ struct task_struct {
 	int exit_state;
 	int exit_code, exit_signal;
 	int pdeath_signal;  /*  The signal sent when the parent dies  */
-	unsigned int group_stop;	/* GROUP_STOP_*, siglock protected */
+	unsigned int jobctl;	/* JOBCTL_*, siglock protected */
 	/* ??? */
 	unsigned int personality;
 	unsigned did_exec:1;
@@ -1803,15 +1803,21 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
 #define used_math() tsk_used_math(current)
 
 /*
- * task->group_stop flags
+ * task->jobctl flags
  */
-#define GROUP_STOP_SIGMASK	0xffff    /* signr of the last group stop */
-#define GROUP_STOP_PENDING	(1 << 16) /* task should stop for group stop */
-#define GROUP_STOP_CONSUME	(1 << 17) /* consume group stop count */
-#define GROUP_STOP_TRAPPING	(1 << 18) /* switching from STOPPED to TRACED */
-#define GROUP_STOP_DEQUEUED	(1 << 19) /* stop signal dequeued */
+#define JOBCTL_STOP_SIGMASK	0xffff	/* signr of the last group stop */
 
-extern void task_clear_group_stop_pending(struct task_struct *task);
+#define JOBCTL_STOP_DEQUEUED_BIT 16	/* stop signal dequeued */
+#define JOBCTL_STOP_PENDING_BIT	17	/* task should stop for group stop */
+#define JOBCTL_STOP_CONSUME_BIT	18	/* consume group stop count */
+#define JOBCTL_TRAPPING_BIT	21	/* switching to TRACED */
+
+#define JOBCTL_STOP_DEQUEUED	(1 << JOBCTL_STOP_DEQUEUED_BIT)
+#define JOBCTL_STOP_PENDING	(1 << JOBCTL_STOP_PENDING_BIT)
+#define JOBCTL_STOP_CONSUME	(1 << JOBCTL_STOP_CONSUME_BIT)
+#define JOBCTL_TRAPPING		(1 << JOBCTL_TRAPPING_BIT)
+
+extern void task_clear_jobctl_stop_pending(struct task_struct *task);
 
 #ifdef CONFIG_PREEMPT_RCU
 
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 4f689cb739a3..134f34cb142b 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -77,13 +77,13 @@ void __ptrace_unlink(struct task_struct *child)
 	spin_lock(&child->sighand->siglock);
 
 	/*
-	 * Reinstate GROUP_STOP_PENDING if group stop is in effect and
+	 * Reinstate JOBCTL_STOP_PENDING if group stop is in effect and
 	 * @child isn't dead.
 	 */
 	if (!(child->flags & PF_EXITING) &&
 	    (child->signal->flags & SIGNAL_STOP_STOPPED ||
 	     child->signal->group_stop_count))
-		child->group_stop |= GROUP_STOP_PENDING;
+		child->jobctl |= JOBCTL_STOP_PENDING;
 
 	/*
 	 * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick
@@ -91,7 +91,7 @@ void __ptrace_unlink(struct task_struct *child)
 	 * is in TASK_TRACED; otherwise, we might unduly disrupt
 	 * TASK_KILLABLE sleeps.
 	 */
-	if (child->group_stop & GROUP_STOP_PENDING || task_is_traced(child))
+	if (child->jobctl & JOBCTL_STOP_PENDING || task_is_traced(child))
 		signal_wake_up(child, task_is_traced(child));
 
 	spin_unlock(&child->sighand->siglock);
@@ -226,7 +226,7 @@ static int ptrace_attach(struct task_struct *task)
 	spin_lock(&task->sighand->siglock);
 
 	/*
-	 * If the task is already STOPPED, set GROUP_STOP_PENDING and
+	 * If the task is already STOPPED, set JOBCTL_STOP_PENDING and
 	 * TRAPPING, and kick it so that it transits to TRACED.  TRAPPING
 	 * will be cleared if the child completes the transition or any
 	 * event which clears the group stop states happens.  We'll wait
@@ -243,7 +243,7 @@ static int ptrace_attach(struct task_struct *task)
 	 * in and out of STOPPED are protected by siglock.
 	 */
 	if (task_is_stopped(task)) {
-		task->group_stop |= GROUP_STOP_PENDING | GROUP_STOP_TRAPPING;
+		task->jobctl |= JOBCTL_STOP_PENDING | JOBCTL_TRAPPING;
 		signal_wake_up(task, 1);
 	}
 
@@ -257,7 +257,7 @@ unlock_creds:
 out:
 	if (!retval)
 		wait_event(current->signal->wait_chldexit,
-			   !(task->group_stop & GROUP_STOP_TRAPPING));
+			   !(task->jobctl & JOBCTL_TRAPPING));
 	return retval;
 }
 
diff --git a/kernel/signal.c b/kernel/signal.c
index 86c32b884f8e..ab6851c06461 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -124,7 +124,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
 
 static int recalc_sigpending_tsk(struct task_struct *t)
 {
-	if ((t->group_stop & GROUP_STOP_PENDING) ||
+	if ((t->jobctl & JOBCTL_STOP_PENDING) ||
 	    PENDING(&t->pending, &t->blocked) ||
 	    PENDING(&t->signal->shared_pending, &t->blocked)) {
 		set_tsk_thread_flag(t, TIF_SIGPENDING);
@@ -224,27 +224,28 @@ static inline void print_dropped_signal(int sig)
 }
 
 /**
- * task_clear_group_stop_trapping - clear group stop trapping bit
+ * task_clear_jobctl_trapping - clear jobctl trapping bit
  * @task: target task
  *
- * If GROUP_STOP_TRAPPING is set, a ptracer is waiting for us.  Clear it
- * and wake up the ptracer.  Note that we don't need any further locking.
- * @task->siglock guarantees that @task->parent points to the ptracer.
+ * If JOBCTL_TRAPPING is set, a ptracer is waiting for us to enter TRACED.
+ * Clear it and wake up the ptracer.  Note that we don't need any further
+ * locking.  @task->siglock guarantees that @task->parent points to the
+ * ptracer.
  *
  * CONTEXT:
  * Must be called with @task->sighand->siglock held.
  */
-static void task_clear_group_stop_trapping(struct task_struct *task)
+static void task_clear_jobctl_trapping(struct task_struct *task)
 {
-	if (unlikely(task->group_stop & GROUP_STOP_TRAPPING)) {
-		task->group_stop &= ~GROUP_STOP_TRAPPING;
+	if (unlikely(task->jobctl & JOBCTL_TRAPPING)) {
+		task->jobctl &= ~JOBCTL_TRAPPING;
 		__wake_up_sync_key(&task->parent->signal->wait_chldexit,
 				   TASK_UNINTERRUPTIBLE, 1, task);
 	}
 }
 
 /**
- * task_clear_group_stop_pending - clear pending group stop
+ * task_clear_jobctl_stop_pending - clear pending group stop
  * @task: target task
  *
  * Clear group stop states for @task.
@@ -252,19 +253,19 @@ static void task_clear_group_stop_trapping(struct task_struct *task)
  * CONTEXT:
  * Must be called with @task->sighand->siglock held.
  */
-void task_clear_group_stop_pending(struct task_struct *task)
+void task_clear_jobctl_stop_pending(struct task_struct *task)
 {
-	task->group_stop &= ~(GROUP_STOP_PENDING | GROUP_STOP_CONSUME |
-			      GROUP_STOP_DEQUEUED);
+	task->jobctl &= ~(JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME |
+			  JOBCTL_STOP_DEQUEUED);
 }
 
 /**
  * task_participate_group_stop - participate in a group stop
  * @task: task participating in a group stop
  *
- * @task has GROUP_STOP_PENDING set and is participating in a group stop.
+ * @task has %JOBCTL_STOP_PENDING set and is participating in a group stop.
  * Group stop states are cleared and the group stop count is consumed if
- * %GROUP_STOP_CONSUME was set.  If the consumption completes the group
+ * %JOBCTL_STOP_CONSUME was set.  If the consumption completes the group
  * stop, the appropriate %SIGNAL_* flags are set.
  *
  * CONTEXT:
@@ -277,11 +278,11 @@ void task_clear_group_stop_pending(struct task_struct *task)
 static bool task_participate_group_stop(struct task_struct *task)
 {
 	struct signal_struct *sig = task->signal;
-	bool consume = task->group_stop & GROUP_STOP_CONSUME;
+	bool consume = task->jobctl & JOBCTL_STOP_CONSUME;
 
-	WARN_ON_ONCE(!(task->group_stop & GROUP_STOP_PENDING));
+	WARN_ON_ONCE(!(task->jobctl & JOBCTL_STOP_PENDING));
 
-	task_clear_group_stop_pending(task);
+	task_clear_jobctl_stop_pending(task);
 
 	if (!consume)
 		return false;
@@ -604,7 +605,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
 		 * is to alert stop-signal processing code when another
 		 * processor has come along and cleared the flag.
 		 */
-		current->group_stop |= GROUP_STOP_DEQUEUED;
+		current->jobctl |= JOBCTL_STOP_DEQUEUED;
 	}
 	if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) {
 		/*
@@ -809,7 +810,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
 		rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending);
 		t = p;
 		do {
-			task_clear_group_stop_pending(t);
+			task_clear_jobctl_stop_pending(t);
 			rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
 			wake_up_state(t, __TASK_STOPPED);
 		} while_each_thread(p, t);
@@ -925,7 +926,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
 			signal->group_stop_count = 0;
 			t = p;
 			do {
-				task_clear_group_stop_pending(t);
+				task_clear_jobctl_stop_pending(t);
 				sigaddset(&t->pending.signal, SIGKILL);
 				signal_wake_up(t, 1);
 			} while_each_thread(p, t);
@@ -1160,7 +1161,7 @@ int zap_other_threads(struct task_struct *p)
 	p->signal->group_stop_count = 0;
 
 	while_each_thread(p, t) {
-		task_clear_group_stop_pending(t);
+		task_clear_jobctl_stop_pending(t);
 		count++;
 
 		/* Don't bother with already dead threads */
@@ -1738,7 +1739,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
 	 * clear now.  We act as if SIGCONT is received after TASK_TRACED
 	 * is entered - ignore it.
 	 */
-	if (why == CLD_STOPPED && (current->group_stop & GROUP_STOP_PENDING))
+	if (why == CLD_STOPPED && (current->jobctl & JOBCTL_STOP_PENDING))
 		gstop_done = task_participate_group_stop(current);
 
 	current->last_siginfo = info;
@@ -1751,12 +1752,12 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
 	set_current_state(TASK_TRACED);
 
 	/*
-	 * We're committing to trapping.  Clearing GROUP_STOP_TRAPPING and
+	 * We're committing to trapping.  Clearing JOBCTL_TRAPPING and
 	 * transition to TASK_TRACED should be atomic with respect to
-	 * siglock.  This hsould be done after the arch hook as siglock is
+	 * siglock.  This should be done after the arch hook as siglock is
 	 * released and regrabbed across it.
 	 */
-	task_clear_group_stop_trapping(current);
+	task_clear_jobctl_trapping(current);
 
 	spin_unlock_irq(&current->sighand->siglock);
 	read_lock(&tasklist_lock);
@@ -1792,9 +1793,9 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
 		 *
 		 * If @gstop_done, the ptracer went away between group stop
 		 * completion and here.  During detach, it would have set
-		 * GROUP_STOP_PENDING on us and we'll re-enter TASK_STOPPED
-		 * in do_signal_stop() on return, so notifying the real
-		 * parent of the group stop completion is enough.
+		 * JOBCTL_STOP_PENDING on us and we'll re-enter
+		 * TASK_STOPPED in do_signal_stop() on return, so notifying
+		 * the real parent of the group stop completion is enough.
 		 */
 		if (gstop_done)
 			do_notify_parent_cldstop(current, false, why);
@@ -1856,14 +1857,14 @@ static int do_signal_stop(int signr)
 {
 	struct signal_struct *sig = current->signal;
 
-	if (!(current->group_stop & GROUP_STOP_PENDING)) {
-		unsigned int gstop = GROUP_STOP_PENDING | GROUP_STOP_CONSUME;
+	if (!(current->jobctl & JOBCTL_STOP_PENDING)) {
+		unsigned int gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
 		struct task_struct *t;
 
-		/* signr will be recorded in task->group_stop for retries */
-		WARN_ON_ONCE(signr & ~GROUP_STOP_SIGMASK);
+		/* signr will be recorded in task->jobctl for retries */
+		WARN_ON_ONCE(signr & ~JOBCTL_STOP_SIGMASK);
 
-		if (!likely(current->group_stop & GROUP_STOP_DEQUEUED) ||
+		if (!likely(current->jobctl & JOBCTL_STOP_DEQUEUED) ||
 		    unlikely(signal_group_exit(sig)))
 			return 0;
 		/*
@@ -1890,19 +1891,19 @@ static int do_signal_stop(int signr)
 		else
 			WARN_ON_ONCE(!task_ptrace(current));
 
-		current->group_stop &= ~GROUP_STOP_SIGMASK;
-		current->group_stop |= signr | gstop;
+		current->jobctl &= ~JOBCTL_STOP_SIGMASK;
+		current->jobctl |= signr | gstop;
 		sig->group_stop_count = 1;
 		for (t = next_thread(current); t != current;
 		     t = next_thread(t)) {
-			t->group_stop &= ~GROUP_STOP_SIGMASK;
+			t->jobctl &= ~JOBCTL_STOP_SIGMASK;
 			/*
 			 * Setting state to TASK_STOPPED for a group
 			 * stop is always done with the siglock held,
 			 * so this check has no races.
 			 */
 			if (!(t->flags & PF_EXITING) && !task_is_stopped(t)) {
-				t->group_stop |= signr | gstop;
+				t->jobctl |= signr | gstop;
 				sig->group_stop_count++;
 				signal_wake_up(t, 0);
 			}
@@ -1943,23 +1944,23 @@ retry:
 
 		spin_lock_irq(&current->sighand->siglock);
 	} else {
-		ptrace_stop(current->group_stop & GROUP_STOP_SIGMASK,
+		ptrace_stop(current->jobctl & JOBCTL_STOP_SIGMASK,
 			    CLD_STOPPED, 0, NULL);
 		current->exit_code = 0;
 	}
 
 	/*
-	 * GROUP_STOP_PENDING could be set if another group stop has
+	 * JOBCTL_STOP_PENDING could be set if another group stop has
 	 * started since being woken up or ptrace wants us to transit
 	 * between TASK_STOPPED and TRACED.  Retry group stop.
 	 */
-	if (current->group_stop & GROUP_STOP_PENDING) {
-		WARN_ON_ONCE(!(current->group_stop & GROUP_STOP_SIGMASK));
+	if (current->jobctl & JOBCTL_STOP_PENDING) {
+		WARN_ON_ONCE(!(current->jobctl & JOBCTL_STOP_SIGMASK));
 		goto retry;
 	}
 
 	/* PTRACE_ATTACH might have raced with task killing, clear trapping */
-	task_clear_group_stop_trapping(current);
+	task_clear_jobctl_trapping(current);
 
 	spin_unlock_irq(&current->sighand->siglock);
 
@@ -2078,8 +2079,8 @@ relock:
 		if (unlikely(signr != 0))
 			ka = return_ka;
 		else {
-			if (unlikely(current->group_stop &
-				     GROUP_STOP_PENDING) && do_signal_stop(0))
+			if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) &&
+			    do_signal_stop(0))
 				goto relock;
 
 			signr = dequeue_signal(current, &current->blocked,
@@ -2253,7 +2254,7 @@ void exit_signals(struct task_struct *tsk)
 	signotset(&unblocked);
 	retarget_shared_pending(tsk, &unblocked);
 
-	if (unlikely(tsk->group_stop & GROUP_STOP_PENDING) &&
+	if (unlikely(tsk->jobctl & JOBCTL_STOP_PENDING) &&
 	    task_participate_group_stop(tsk))
 		group_stop = CLD_STOPPED;
 out:
-- 
cgit v1.2.3


From 755e276b3326f300585435d2f3876e66e248c476 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 2 Jun 2011 11:13:59 +0200
Subject: ptrace: ptrace_check_attach(): rename @kill to @ignore_state and add
 comments

PTRACE_INTERRUPT is going to be added which should also skip
task_is_traced() check in ptrace_check_attach().  Rename @kill to
@ignore_state and make it bool.  Add function comment while at it.

This patch doesn't introduce any behavior difference.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 include/linux/ptrace.h |  2 +-
 kernel/ptrace.c        | 24 +++++++++++++++++++-----
 2 files changed, 20 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 9178d5cc0b01..e93ef1a54fc7 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -105,7 +105,7 @@ extern long arch_ptrace(struct task_struct *child, long request,
 extern int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len);
 extern int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long dst, int len);
 extern void ptrace_disable(struct task_struct *);
-extern int ptrace_check_attach(struct task_struct *task, int kill);
+extern int ptrace_check_attach(struct task_struct *task, bool ignore_state);
 extern int ptrace_request(struct task_struct *child, long request,
 			  unsigned long addr, unsigned long data);
 extern void ptrace_notify(int exit_code);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 134f34cb142b..eb191116edf7 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -97,10 +97,24 @@ void __ptrace_unlink(struct task_struct *child)
 	spin_unlock(&child->sighand->siglock);
 }
 
-/*
- * Check that we have indeed attached to the thing..
+/**
+ * ptrace_check_attach - check whether ptracee is ready for ptrace operation
+ * @child: ptracee to check for
+ * @ignore_state: don't check whether @child is currently %TASK_TRACED
+ *
+ * Check whether @child is being ptraced by %current and ready for further
+ * ptrace operations.  If @ignore_state is %false, @child also should be in
+ * %TASK_TRACED state and on return the child is guaranteed to be traced
+ * and not executing.  If @ignore_state is %true, @child can be in any
+ * state.
+ *
+ * CONTEXT:
+ * Grabs and releases tasklist_lock and @child->sighand->siglock.
+ *
+ * RETURNS:
+ * 0 on success, -ESRCH if %child is not ready.
  */
-int ptrace_check_attach(struct task_struct *child, int kill)
+int ptrace_check_attach(struct task_struct *child, bool ignore_state)
 {
 	int ret = -ESRCH;
 
@@ -119,13 +133,13 @@ int ptrace_check_attach(struct task_struct *child, int kill)
 		 */
 		spin_lock_irq(&child->sighand->siglock);
 		WARN_ON_ONCE(task_is_stopped(child));
-		if (task_is_traced(child) || kill)
+		if (task_is_traced(child) || ignore_state)
 			ret = 0;
 		spin_unlock_irq(&child->sighand->siglock);
 	}
 	read_unlock(&tasklist_lock);
 
-	if (!ret && !kill)
+	if (!ret && !ignore_state)
 		ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH;
 
 	/* All systems go.. */
-- 
cgit v1.2.3


From 3759a0d94c18764247b66511d1038f2b93aa95de Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 2 Jun 2011 11:14:00 +0200
Subject: job control: introduce JOBCTL_PENDING_MASK and
 task_clear_jobctl_pending()

This patch introduces JOBCTL_PENDING_MASK and replaces
task_clear_jobctl_stop_pending() with task_clear_jobctl_pending()
which takes an extra @mask argument.

JOBCTL_PENDING_MASK is currently equal to JOBCTL_STOP_PENDING but
future patches will add more bits.  recalc_sigpending_tsk() is updated
to use JOBCTL_PENDING_MASK instead.

task_clear_jobctl_pending() takes @mask which in subset of
JOBCTL_PENDING_MASK and clears the relevant jobctl bits.  If
JOBCTL_STOP_PENDING is set, other STOP bits are cleared together.  All
task_clear_jobctl_stop_pending() users are updated to call
task_clear_jobctl_pending() with JOBCTL_STOP_PENDING which is
functionally identical to task_clear_jobctl_stop_pending().

This patch doesn't cause any functional change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 fs/exec.c             |  2 +-
 include/linux/sched.h |  5 ++++-
 kernel/signal.c       | 27 +++++++++++++++++----------
 3 files changed, 22 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exec.c b/fs/exec.c
index 8986bb0f9dc2..4402105287cb 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1772,7 +1772,7 @@ static int zap_process(struct task_struct *start, int exit_code)
 
 	t = start;
 	do {
-		task_clear_jobctl_stop_pending(t);
+		task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
 		if (t != current && t->mm) {
 			sigaddset(&t->pending.signal, SIGKILL);
 			signal_wake_up(t, 1);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b0dd064eb4fc..5a958b17f9fe 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1817,7 +1817,10 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
 #define JOBCTL_STOP_CONSUME	(1 << JOBCTL_STOP_CONSUME_BIT)
 #define JOBCTL_TRAPPING		(1 << JOBCTL_TRAPPING_BIT)
 
-extern void task_clear_jobctl_stop_pending(struct task_struct *task);
+#define JOBCTL_PENDING_MASK	JOBCTL_STOP_PENDING
+
+extern void task_clear_jobctl_pending(struct task_struct *task,
+				      unsigned int mask);
 
 #ifdef CONFIG_PREEMPT_RCU
 
diff --git a/kernel/signal.c b/kernel/signal.c
index 62a6c3bb9f0d..288d952fa3b8 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -124,7 +124,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
 
 static int recalc_sigpending_tsk(struct task_struct *t)
 {
-	if ((t->jobctl & JOBCTL_STOP_PENDING) ||
+	if ((t->jobctl & JOBCTL_PENDING_MASK) ||
 	    PENDING(&t->pending, &t->blocked) ||
 	    PENDING(&t->signal->shared_pending, &t->blocked)) {
 		set_tsk_thread_flag(t, TIF_SIGPENDING);
@@ -245,18 +245,25 @@ static void task_clear_jobctl_trapping(struct task_struct *task)
 }
 
 /**
- * task_clear_jobctl_stop_pending - clear pending group stop
+ * task_clear_jobctl_pending - clear jobctl pending bits
  * @task: target task
+ * @mask: pending bits to clear
  *
- * Clear group stop states for @task.
+ * Clear @mask from @task->jobctl.  @mask must be subset of
+ * %JOBCTL_PENDING_MASK.  If %JOBCTL_STOP_PENDING is being cleared, other
+ * STOP bits are cleared together.
  *
  * CONTEXT:
  * Must be called with @task->sighand->siglock held.
  */
-void task_clear_jobctl_stop_pending(struct task_struct *task)
+void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask)
 {
-	task->jobctl &= ~(JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME |
-			  JOBCTL_STOP_DEQUEUED);
+	BUG_ON(mask & ~JOBCTL_PENDING_MASK);
+
+	if (mask & JOBCTL_STOP_PENDING)
+		mask |= JOBCTL_STOP_CONSUME | JOBCTL_STOP_DEQUEUED;
+
+	task->jobctl &= ~mask;
 }
 
 /**
@@ -282,7 +289,7 @@ static bool task_participate_group_stop(struct task_struct *task)
 
 	WARN_ON_ONCE(!(task->jobctl & JOBCTL_STOP_PENDING));
 
-	task_clear_jobctl_stop_pending(task);
+	task_clear_jobctl_pending(task, JOBCTL_STOP_PENDING);
 
 	if (!consume)
 		return false;
@@ -810,7 +817,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
 		rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending);
 		t = p;
 		do {
-			task_clear_jobctl_stop_pending(t);
+			task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
 			rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
 			wake_up_state(t, __TASK_STOPPED);
 		} while_each_thread(p, t);
@@ -926,7 +933,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
 			signal->group_stop_count = 0;
 			t = p;
 			do {
-				task_clear_jobctl_stop_pending(t);
+				task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
 				sigaddset(&t->pending.signal, SIGKILL);
 				signal_wake_up(t, 1);
 			} while_each_thread(p, t);
@@ -1161,7 +1168,7 @@ int zap_other_threads(struct task_struct *p)
 	p->signal->group_stop_count = 0;
 
 	while_each_thread(p, t) {
-		task_clear_jobctl_stop_pending(t);
+		task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
 		count++;
 
 		/* Don't bother with already dead threads */
-- 
cgit v1.2.3


From 7dd3db54e77d21eb95e145f19ba53f68250d0e73 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 2 Jun 2011 11:14:00 +0200
Subject: job control: introduce task_set_jobctl_pending()

task->jobctl currently hosts JOBCTL_STOP_PENDING and will host TRAP
pending bits too.  Setting pending conditions on a dying task may make
the task unkillable.  Currently, each setting site is responsible for
checking for the condition but with to-be-added job control traps this
becomes too fragile.

This patch adds task_set_jobctl_pending() which should be used when
setting task->jobctl bits to schedule a stop or trap.  The function
performs the followings to ease setting pending bits.

* Sanity checks.

* If fatal signal is pending or PF_EXITING is set, no bit is set.

* STOP_SIGMASK is automatically cleared if new value is being set.

do_signal_stop() and ptrace_attach() are updated to use
task_set_jobctl_pending() instead of setting STOP_PENDING explicitly.
The surrounding structures around setting are changed to fit
task_set_jobctl_pending() better but there should be no userland
visible behavior difference.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 include/linux/sched.h |  2 ++
 kernel/ptrace.c       |  6 +++---
 kernel/signal.c       | 46 ++++++++++++++++++++++++++++++++++++++++------
 3 files changed, 45 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5a958b17f9fe..5157bd9eee37 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1819,6 +1819,8 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
 
 #define JOBCTL_PENDING_MASK	JOBCTL_STOP_PENDING
 
+extern bool task_set_jobctl_pending(struct task_struct *task,
+				    unsigned int mask);
 extern void task_clear_jobctl_pending(struct task_struct *task,
 				      unsigned int mask);
 
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index eb191116edf7..0c37d999c8b8 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -256,10 +256,10 @@ static int ptrace_attach(struct task_struct *task)
 	 * The following task_is_stopped() test is safe as both transitions
 	 * in and out of STOPPED are protected by siglock.
 	 */
-	if (task_is_stopped(task)) {
-		task->jobctl |= JOBCTL_STOP_PENDING | JOBCTL_TRAPPING;
+	if (task_is_stopped(task) &&
+	    task_set_jobctl_pending(task,
+				    JOBCTL_STOP_PENDING | JOBCTL_TRAPPING))
 		signal_wake_up(task, 1);
-	}
 
 	spin_unlock(&task->sighand->siglock);
 
diff --git a/kernel/signal.c b/kernel/signal.c
index 637a171b65b6..9ab91c516c3f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -223,6 +223,39 @@ static inline void print_dropped_signal(int sig)
 				current->comm, current->pid, sig);
 }
 
+/**
+ * task_set_jobctl_pending - set jobctl pending bits
+ * @task: target task
+ * @mask: pending bits to set
+ *
+ * Clear @mask from @task->jobctl.  @mask must be subset of
+ * %JOBCTL_PENDING_MASK | %JOBCTL_STOP_CONSUME | %JOBCTL_STOP_SIGMASK |
+ * %JOBCTL_TRAPPING.  If stop signo is being set, the existing signo is
+ * cleared.  If @task is already being killed or exiting, this function
+ * becomes noop.
+ *
+ * CONTEXT:
+ * Must be called with @task->sighand->siglock held.
+ *
+ * RETURNS:
+ * %true if @mask is set, %false if made noop because @task was dying.
+ */
+bool task_set_jobctl_pending(struct task_struct *task, unsigned int mask)
+{
+	BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME |
+			JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING));
+	BUG_ON((mask & JOBCTL_TRAPPING) && !(mask & JOBCTL_PENDING_MASK));
+
+	if (unlikely(fatal_signal_pending(task) || (task->flags & PF_EXITING)))
+		return false;
+
+	if (mask & JOBCTL_STOP_SIGMASK)
+		task->jobctl &= ~JOBCTL_STOP_SIGMASK;
+
+	task->jobctl |= mask;
+	return true;
+}
+
 /**
  * task_clear_jobctl_trapping - clear jobctl trapping bit
  * @task: target task
@@ -1902,19 +1935,20 @@ static int do_signal_stop(int signr)
 		else
 			WARN_ON_ONCE(!task_ptrace(current));
 
-		current->jobctl &= ~JOBCTL_STOP_SIGMASK;
-		current->jobctl |= signr | gstop;
-		sig->group_stop_count = 1;
+		sig->group_stop_count = 0;
+
+		if (task_set_jobctl_pending(current, signr | gstop))
+			sig->group_stop_count++;
+
 		for (t = next_thread(current); t != current;
 		     t = next_thread(t)) {
-			t->jobctl &= ~JOBCTL_STOP_SIGMASK;
 			/*
 			 * Setting state to TASK_STOPPED for a group
 			 * stop is always done with the siglock held,
 			 * so this check has no races.
 			 */
-			if (!(t->flags & PF_EXITING) && !task_is_stopped(t)) {
-				t->jobctl |= signr | gstop;
+			if (!task_is_stopped(t) &&
+			    task_set_jobctl_pending(t, signr | gstop)) {
 				sig->group_stop_count++;
 				signal_wake_up(t, 0);
 			}
-- 
cgit v1.2.3


From dd1d6772692316fe35094085c5e4d9a370ad3462 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 2 Jun 2011 11:14:00 +0200
Subject: signal: remove three noop tracehooks

Remove the following three noop tracehooks in signals.c.

* tracehook_force_sigpending()
* tracehook_get_signal()
* tracehook_finish_jctl()

The code area is about to be updated and these hooks don't do anything
other than obfuscating the logic.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 include/linux/tracehook.h | 52 -----------------------------------------------
 kernel/signal.c           | 44 +++++++++++++--------------------------
 2 files changed, 14 insertions(+), 82 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index e95f5236611f..15745cdd32ce 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -425,58 +425,6 @@ static inline int tracehook_consider_fatal_signal(struct task_struct *task,
 	return (task_ptrace(task) & PT_PTRACED) != 0;
 }
 
-/**
- * tracehook_force_sigpending - let tracing force signal_pending(current) on
- *
- * Called when recomputing our signal_pending() flag.  Return nonzero
- * to force the signal_pending() flag on, so that tracehook_get_signal()
- * will be called before the next return to user mode.
- *
- * Called with @current->sighand->siglock held.
- */
-static inline int tracehook_force_sigpending(void)
-{
-	return 0;
-}
-
-/**
- * tracehook_get_signal - deliver synthetic signal to traced task
- * @task:		@current
- * @regs:		task_pt_regs(@current)
- * @info:		details of synthetic signal
- * @return_ka:		sigaction for synthetic signal
- *
- * Return zero to check for a real pending signal normally.
- * Return -1 after releasing the siglock to repeat the check.
- * Return a signal number to induce an artificial signal delivery,
- * setting *@info and *@return_ka to specify its details and behavior.
- *
- * The @return_ka->sa_handler value controls the disposition of the
- * signal, no matter the signal number.  For %SIG_DFL, the return value
- * is a representative signal to indicate the behavior (e.g. %SIGTERM
- * for death, %SIGQUIT for core dump, %SIGSTOP for job control stop,
- * %SIGTSTP for stop unless in an orphaned pgrp), but the signal number
- * reported will be @info->si_signo instead.
- *
- * Called with @task->sighand->siglock held, before dequeuing pending signals.
- */
-static inline int tracehook_get_signal(struct task_struct *task,
-				       struct pt_regs *regs,
-				       siginfo_t *info,
-				       struct k_sigaction *return_ka)
-{
-	return 0;
-}
-
-/**
- * tracehook_finish_jctl - report about return from job control stop
- *
- * This is called by do_signal_stop() after wakeup.
- */
-static inline void tracehook_finish_jctl(void)
-{
-}
-
 #define DEATH_REAP			-1
 #define DEATH_DELAYED_GROUP_LEADER	-2
 
diff --git a/kernel/signal.c b/kernel/signal.c
index 172a4c79f12c..c99b8b5c0be7 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -150,9 +150,7 @@ void recalc_sigpending_and_wake(struct task_struct *t)
 
 void recalc_sigpending(void)
 {
-	if (unlikely(tracehook_force_sigpending()))
-		set_thread_flag(TIF_SIGPENDING);
-	else if (!recalc_sigpending_tsk(current) && !freezing(current))
+	if (!recalc_sigpending_tsk(current) && !freezing(current))
 		clear_thread_flag(TIF_SIGPENDING);
 
 }
@@ -2005,8 +2003,6 @@ retry:
 
 	spin_unlock_irq(&current->sighand->siglock);
 
-	tracehook_finish_jctl();
-
 	return 1;
 }
 
@@ -2109,37 +2105,25 @@ relock:
 
 	for (;;) {
 		struct k_sigaction *ka;
-		/*
-		 * Tracing can induce an artificial signal and choose sigaction.
-		 * The return value in @signr determines the default action,
-		 * but @info->si_signo is the signal number we will report.
-		 */
-		signr = tracehook_get_signal(current, regs, info, return_ka);
-		if (unlikely(signr < 0))
+
+		if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) &&
+		    do_signal_stop(0))
 			goto relock;
-		if (unlikely(signr != 0))
-			ka = return_ka;
-		else {
-			if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) &&
-			    do_signal_stop(0))
-				goto relock;
 
-			signr = dequeue_signal(current, &current->blocked,
-					       info);
+		signr = dequeue_signal(current, &current->blocked, info);
 
-			if (!signr)
-				break; /* will return 0 */
+		if (!signr)
+			break; /* will return 0 */
 
-			if (signr != SIGKILL) {
-				signr = ptrace_signal(signr, info,
-						      regs, cookie);
-				if (!signr)
-					continue;
-			}
-
-			ka = &sighand->action[signr-1];
+		if (signr != SIGKILL) {
+			signr = ptrace_signal(signr, info,
+					      regs, cookie);
+			if (!signr)
+				continue;
 		}
 
+		ka = &sighand->action[signr-1];
+
 		/* Trace actually delivered signals. */
 		trace_signal_deliver(signr, info, ka);
 
-- 
cgit v1.2.3


From f7a2d73fe75c71941fb0a6b4d8fe7da8144f2c7b Mon Sep 17 00:00:00 2001
From: Matthew Garrett <mjg@redhat.com>
Date: Mon, 6 Jun 2011 15:36:24 -0400
Subject: x86, efi: Fix argument types for SetVariable()

The spec says this takes uint32 for attributes, not uintn.

Signed-off-by: Matthew Garrett <mjg@redhat.com>
Link: http://lkml.kernel.org/r/1307388985-7852-1-git-send-email-mjg@redhat.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/platform/efi/efi.c | 2 +-
 include/linux/efi.h         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 0d3a4fa34560..f4f6de987b7b 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -122,7 +122,7 @@ static efi_status_t virt_efi_get_next_variable(unsigned long *name_size,
 
 static efi_status_t virt_efi_set_variable(efi_char16_t *name,
 					  efi_guid_t *vendor,
-					  unsigned long attr,
+					  u32 attr,
 					  unsigned long data_size,
 					  void *data)
 {
diff --git a/include/linux/efi.h b/include/linux/efi.h
index e376270cd26e..0758753a17a1 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -168,7 +168,7 @@ typedef efi_status_t efi_get_variable_t (efi_char16_t *name, efi_guid_t *vendor,
 typedef efi_status_t efi_get_next_variable_t (unsigned long *name_size, efi_char16_t *name,
 					      efi_guid_t *vendor);
 typedef efi_status_t efi_set_variable_t (efi_char16_t *name, efi_guid_t *vendor, 
-					 unsigned long attr, unsigned long data_size, 
+					 u32 attr, unsigned long data_size,
 					 void *data);
 typedef efi_status_t efi_get_next_high_mono_count_t (u32 *count);
 typedef void efi_reset_system_t (int reset_type, efi_status_t status,
-- 
cgit v1.2.3


From 3b3702377c576f6624348c7c6fd113bfd934fbd7 Mon Sep 17 00:00:00 2001
From: Matthew Garrett <mjg@redhat.com>
Date: Mon, 6 Jun 2011 15:36:25 -0400
Subject: x86, efi: Add infrastructure for UEFI 2.0 runtime services

We're currently missing support for any of the runtime service calls
introduced with the UEFI 2.0 spec in 2006. Add the infrastructure for
supporting them.

Signed-off-by: Matthew Garrett <mjg@redhat.com>
Link: http://lkml.kernel.org/r/1307388985-7852-2-git-send-email-mjg@redhat.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/platform/efi/efi.c | 37 +++++++++++++++++++++++++++++++++++++
 include/linux/efi.h         | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+)

(limited to 'include/linux')

diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index f4f6de987b7b..a0e42447de7c 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -131,6 +131,18 @@ static efi_status_t virt_efi_set_variable(efi_char16_t *name,
 			      data_size, data);
 }
 
+static efi_status_t virt_efi_query_variable_info(u32 attr,
+						 u64 *storage_space,
+						 u64 *remaining_space,
+						 u64 *max_variable_size)
+{
+	if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
+		return EFI_UNSUPPORTED;
+
+	return efi_call_virt4(query_variable_info, attr, storage_space,
+			      remaining_space, max_variable_size);
+}
+
 static efi_status_t virt_efi_get_next_high_mono_count(u32 *count)
 {
 	return efi_call_virt1(get_next_high_mono_count, count);
@@ -145,6 +157,28 @@ static void virt_efi_reset_system(int reset_type,
 		       data_size, data);
 }
 
+static efi_status_t virt_efi_update_capsule(efi_capsule_header_t **capsules,
+					    unsigned long count,
+					    unsigned long sg_list)
+{
+	if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
+		return EFI_UNSUPPORTED;
+
+	return efi_call_virt3(update_capsule, capsules, count, sg_list);
+}
+
+static efi_status_t virt_efi_query_capsule_caps(efi_capsule_header_t **capsules,
+						unsigned long count,
+						u64 *max_size,
+						int *reset_type)
+{
+	if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
+		return EFI_UNSUPPORTED;
+
+	return efi_call_virt4(query_capsule_caps, capsules, count, max_size,
+			      reset_type);
+}
+
 static efi_status_t __init phys_efi_set_virtual_address_map(
 	unsigned long memory_map_size,
 	unsigned long descriptor_size,
@@ -651,6 +685,9 @@ void __init efi_enter_virtual_mode(void)
 	efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count;
 	efi.reset_system = virt_efi_reset_system;
 	efi.set_virtual_address_map = NULL;
+	efi.query_variable_info = virt_efi_query_variable_info;
+	efi.update_capsule = virt_efi_update_capsule;
+	efi.query_capsule_caps = virt_efi_query_capsule_caps;
 	if (__supported_pte_mask & _PAGE_NX)
 		runtime_code_page_mkexec();
 	early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size);
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 0758753a17a1..ec2572693925 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -101,6 +101,13 @@ typedef struct {
 	u64 attribute;
 } efi_memory_desc_t;
 
+typedef struct {
+	efi_guid_t guid;
+	u32 headersize;
+	u32 flags;
+	u32 imagesize;
+} efi_capsule_header_t;
+
 typedef int (*efi_freemem_callback_t) (u64 start, u64 end, void *arg);
 
 /*
@@ -156,6 +163,9 @@ typedef struct {
 	unsigned long set_variable;
 	unsigned long get_next_high_mono_count;
 	unsigned long reset_system;
+	unsigned long update_capsule;
+	unsigned long query_capsule_caps;
+	unsigned long query_variable_info;
 } efi_runtime_services_t;
 
 typedef efi_status_t efi_get_time_t (efi_time_t *tm, efi_time_cap_t *tc);
@@ -177,6 +187,17 @@ typedef efi_status_t efi_set_virtual_address_map_t (unsigned long memory_map_siz
 						unsigned long descriptor_size,
 						u32 descriptor_version,
 						efi_memory_desc_t *virtual_map);
+typedef efi_status_t efi_query_variable_info_t(u32 attr,
+					       u64 *storage_space,
+					       u64 *remaining_space,
+					       u64 *max_variable_size);
+typedef efi_status_t efi_update_capsule_t(efi_capsule_header_t **capsules,
+					  unsigned long count,
+					  unsigned long sg_list);
+typedef efi_status_t efi_query_capsule_caps_t(efi_capsule_header_t **capsules,
+					      unsigned long count,
+					      u64 *max_size,
+					      int *reset_type);
 
 /*
  *  EFI Configuration Table and GUID definitions
@@ -218,6 +239,13 @@ typedef struct {
 
 #define EFI_SYSTEM_TABLE_SIGNATURE ((u64)0x5453595320494249ULL)
 
+#define EFI_2_30_SYSTEM_TABLE_REVISION  ((2 << 16) | (30))
+#define EFI_2_20_SYSTEM_TABLE_REVISION  ((2 << 16) | (20))
+#define EFI_2_10_SYSTEM_TABLE_REVISION  ((2 << 16) | (10))
+#define EFI_2_00_SYSTEM_TABLE_REVISION  ((2 << 16) | (00))
+#define EFI_1_10_SYSTEM_TABLE_REVISION  ((1 << 16) | (10))
+#define EFI_1_02_SYSTEM_TABLE_REVISION  ((1 << 16) | (02))
+
 typedef struct {
 	efi_table_hdr_t hdr;
 	unsigned long fw_vendor;	/* physical addr of CHAR16 vendor string */
@@ -250,6 +278,7 @@ struct efi_memory_map {
  */
 extern struct efi {
 	efi_system_table_t *systab;	/* EFI system table */
+	unsigned int runtime_version;	/* Runtime services version */
 	unsigned long mps;		/* MPS table */
 	unsigned long acpi;		/* ACPI table  (IA64 ext 0.71) */
 	unsigned long acpi20;		/* ACPI table  (ACPI 2.0) */
@@ -266,6 +295,9 @@ extern struct efi {
 	efi_get_variable_t *get_variable;
 	efi_get_next_variable_t *get_next_variable;
 	efi_set_variable_t *set_variable;
+	efi_query_variable_info_t *query_variable_info;
+	efi_update_capsule_t *update_capsule;
+	efi_query_capsule_caps_t *query_capsule_caps;
 	efi_get_next_high_mono_count_t *get_next_high_mono_count;
 	efi_reset_system_t *reset_system;
 	efi_set_virtual_address_map_t *set_virtual_address_map;
-- 
cgit v1.2.3


From a6b7a407865aab9f849dd99a71072b7cd1175116 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 6 Jun 2011 10:43:46 +0000
Subject: net: remove interrupt.h inclusion from netdevice.h

* remove interrupt.g inclusion from netdevice.h -- not needed
* fixup fallout, add interrupt.h and hardirq.h back where needed.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/atm/eni.h                             | 1 +
 drivers/atm/firestream.c                      | 1 +
 drivers/atm/horizon.c                         | 1 +
 drivers/atm/idt77252.c                        | 1 +
 drivers/atm/iphase.c                          | 1 +
 drivers/atm/zatm.c                            | 1 +
 drivers/infiniband/hw/amso1100/c2.c           | 1 +
 drivers/infiniband/ulp/iser/iscsi_iser.h      | 1 +
 drivers/isdn/hardware/mISDN/avmfritz.c        | 1 +
 drivers/isdn/hardware/mISDN/hfcmulti.c        | 1 +
 drivers/isdn/hardware/mISDN/hfcpci.c          | 1 +
 drivers/isdn/hardware/mISDN/mISDNinfineon.c   | 1 +
 drivers/isdn/hardware/mISDN/mISDNipac.c       | 1 +
 drivers/isdn/hardware/mISDN/netjet.c          | 1 +
 drivers/isdn/hardware/mISDN/speedfax.c        | 1 +
 drivers/isdn/hardware/mISDN/w6692.c           | 1 +
 drivers/isdn/hisax/hisax.h                    | 1 +
 drivers/isdn/hisax/hisax_fcpcipnp.c           | 1 +
 drivers/media/dvb/b2c2/flexcop-common.h       | 1 +
 drivers/media/dvb/dm1105/dm1105.c             | 1 +
 drivers/media/dvb/mantis/mantis_common.h      | 1 +
 drivers/media/dvb/pluto2/pluto2.c             | 1 +
 drivers/net/3c503.c                           | 1 +
 drivers/net/8139cp.c                          | 1 +
 drivers/net/8139too.c                         | 1 +
 drivers/net/8390.h                            | 1 +
 drivers/net/ac3200.c                          | 1 +
 drivers/net/acenic.h                          | 1 +
 drivers/net/amd8111e.c                        | 1 +
 drivers/net/arcnet/arc-rimi.c                 | 1 +
 drivers/net/arcnet/com20020-isa.c             | 1 +
 drivers/net/arcnet/com20020-pci.c             | 1 +
 drivers/net/arcnet/com20020.c                 | 1 +
 drivers/net/arcnet/com90io.c                  | 1 +
 drivers/net/arcnet/com90xx.c                  | 1 +
 drivers/net/arm/at91_ether.c                  | 1 +
 drivers/net/arm/ep93xx_eth.c                  | 1 +
 drivers/net/arm/ks8695net.c                   | 1 +
 drivers/net/atl1c/atl1c.h                     | 1 +
 drivers/net/atl1e/atl1e.h                     | 1 +
 drivers/net/b44.c                             | 1 +
 drivers/net/bmac.c                            | 1 +
 drivers/net/bnx2x/bnx2x_cmn.c                 | 1 +
 drivers/net/caif/caif_serial.c                | 1 +
 drivers/net/can/sja1000/sja1000.h             | 1 +
 drivers/net/cassini.c                         | 1 +
 drivers/net/cxgb4vf/adapter.h                 | 1 +
 drivers/net/dm9000.c                          | 1 +
 drivers/net/dnet.c                            | 1 +
 drivers/net/e100.c                            | 2 ++
 drivers/net/e1000e/ethtool.c                  | 1 +
 drivers/net/e1000e/netdev.c                   | 1 +
 drivers/net/e2100.c                           | 1 +
 drivers/net/enic/enic_main.c                  | 1 +
 drivers/net/es3210.c                          | 1 +
 drivers/net/ethoc.c                           | 1 +
 drivers/net/fec_mpc52xx.c                     | 1 +
 drivers/net/greth.c                           | 1 +
 drivers/net/hamradio/baycom_ser_fdx.c         | 1 +
 drivers/net/hamradio/baycom_ser_hdx.c         | 1 +
 drivers/net/hp-plus.c                         | 1 +
 drivers/net/hp.c                              | 1 +
 drivers/net/ibmveth.c                         | 1 +
 drivers/net/ifb.c                             | 1 +
 drivers/net/ipg.c                             | 1 +
 drivers/net/irda/ali-ircc.c                   | 1 +
 drivers/net/irda/donauboe.c                   | 1 +
 drivers/net/irda/nsc-ircc.c                   | 1 +
 drivers/net/irda/pxaficp_ir.c                 | 1 +
 drivers/net/irda/sir_dev.c                    | 1 +
 drivers/net/irda/smsc-ircc2.c                 | 1 +
 drivers/net/irda/via-ircc.c                   | 1 +
 drivers/net/irda/vlsi_ir.c                    | 1 +
 drivers/net/irda/w83977af_ir.c                | 1 +
 drivers/net/ixgbe/ixgbe_ethtool.c             | 1 +
 drivers/net/ixgbe/ixgbe_main.c                | 1 +
 drivers/net/ixp2000/ixpdev.c                  | 1 +
 drivers/net/jme.h                             | 1 +
 drivers/net/ks8842.c                          | 1 +
 drivers/net/ks8851.c                          | 1 +
 drivers/net/ks8851_mll.c                      | 1 +
 drivers/net/ksz884x.c                         | 1 +
 drivers/net/lne390.c                          | 1 +
 drivers/net/macb.c                            | 1 +
 drivers/net/mace.c                            | 1 +
 drivers/net/netx-eth.c                        | 1 +
 drivers/net/niu.c                             | 1 +
 drivers/net/ns83820.c                         | 1 +
 drivers/net/ppp_async.c                       | 1 +
 drivers/net/ppp_synctty.c                     | 1 +
 drivers/net/ps3_gelic_net.c                   | 1 +
 drivers/net/qlge/qlge.h                       | 1 +
 drivers/net/r8169.c                           | 1 +
 drivers/net/sc92031.c                         | 1 +
 drivers/net/sfc/nic.c                         | 1 +
 drivers/net/sis190.c                          | 1 +
 drivers/net/skge.h                            | 1 +
 drivers/net/sky2.c                            | 1 +
 drivers/net/smc-mca.c                         | 1 +
 drivers/net/smc-ultra.c                       | 1 +
 drivers/net/smsc911x.c                        | 1 +
 drivers/net/smsc9420.c                        | 1 +
 drivers/net/spider_net.c                      | 1 +
 drivers/net/starfire.c                        | 1 +
 drivers/net/stmmac/stmmac_ethtool.c           | 1 +
 drivers/net/tg3.c                             | 1 +
 drivers/net/tlan.c                            | 2 ++
 drivers/net/tokenring/ibmtr.c                 | 1 +
 drivers/net/tsi108_eth.c                      | 1 +
 drivers/net/tulip/de2104x.c                   | 1 +
 drivers/net/tulip/pnic.c                      | 1 +
 drivers/net/tulip/tulip_core.c                | 1 +
 drivers/net/vxge/vxge-config.h                | 1 +
 drivers/net/vxge/vxge-main.c                  | 1 +
 drivers/net/wan/cycx_main.c                   | 1 +
 drivers/net/wan/dscc4.c                       | 1 +
 drivers/net/wan/farsync.c                     | 1 +
 drivers/net/wan/wanxl.c                       | 1 +
 drivers/net/wireless/adm8211.c                | 1 +
 drivers/net/wireless/ath/ath9k/ath9k.h        | 1 +
 drivers/net/wireless/atmel.c                  | 1 +
 drivers/net/wireless/hostap/hostap_wlan.h     | 1 +
 drivers/net/wireless/ipw2x00/ipw2200.h        | 1 +
 drivers/net/wireless/ipw2x00/libipw_wx.c      | 1 +
 drivers/net/wireless/iwlegacy/iwl-dev.h       | 1 +
 drivers/net/wireless/iwlwifi/iwl-dev.h        | 1 +
 drivers/net/wireless/libertas/cfg.c           | 1 +
 drivers/net/wireless/libertas/cmd.c           | 1 +
 drivers/net/wireless/libertas/cmdresp.c       | 1 +
 drivers/net/wireless/libertas/debugfs.c       | 1 +
 drivers/net/wireless/libertas/ethtool.c       | 1 +
 drivers/net/wireless/libertas/if_spi.c        | 2 ++
 drivers/net/wireless/libertas/main.c          | 1 +
 drivers/net/wireless/libertas/mesh.c          | 1 +
 drivers/net/wireless/libertas/rx.c            | 1 +
 drivers/net/wireless/libertas/tx.c            | 1 +
 drivers/net/wireless/libertas_tf/cmd.c        | 1 +
 drivers/net/wireless/libertas_tf/main.c       | 1 +
 drivers/net/wireless/mwl8k.c                  | 1 +
 drivers/net/wireless/p54/p54pci.h             | 1 +
 drivers/net/wireless/prism54/islpci_dev.c     | 1 +
 drivers/net/wireless/prism54/islpci_dev.h     | 1 +
 drivers/net/wireless/prism54/islpci_hotplug.c | 1 +
 drivers/net/wireless/rt2x00/rt2x00.h          | 1 +
 drivers/net/wireless/rtl818x/rtl8180/dev.c    | 1 +
 drivers/net/wireless/wl1251/sdio.c            | 1 +
 drivers/net/wireless/wl1251/spi.c             | 1 +
 drivers/net/wireless/wl12xx/io.h              | 1 +
 drivers/net/wireless/wl12xx/spi.c             | 1 +
 drivers/s390/net/ctcm_mpc.h                   | 1 +
 include/linux/arcdevice.h                     | 1 +
 include/linux/dccp.h                          | 1 +
 include/linux/netdevice.h                     | 1 -
 include/net/bluetooth/hci_core.h              | 1 +
 include/net/sock.h                            | 1 +
 net/atm/pppoatm.c                             | 1 +
 net/caif/chnl_net.c                           | 1 +
 net/can/bcm.c                                 | 1 +
 net/rds/ib.h                                  | 1 +
 net/rds/iw.h                                  | 1 +
 net/sched/sch_atm.c                           | 1 +
 net/sunrpc/xprtrdma/svc_rdma_transport.c      | 1 +
 net/sunrpc/xprtrdma/verbs.c                   | 1 +
 163 files changed, 165 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/atm/eni.h b/drivers/atm/eni.h
index e4c9525e60b3..493a6932507e 100644
--- a/drivers/atm/eni.h
+++ b/drivers/atm/eni.h
@@ -8,6 +8,7 @@
 
 #include <linux/atm.h>
 #include <linux/atmdev.h>
+#include <linux/interrupt.h>
 #include <linux/sonet.h>
 #include <linux/skbuff.h>
 #include <linux/time.h>
diff --git a/drivers/atm/firestream.c b/drivers/atm/firestream.c
index ef7a658312a6..7c7b571647f9 100644
--- a/drivers/atm/firestream.c
+++ b/drivers/atm/firestream.c
@@ -44,6 +44,7 @@
 #include <linux/ioport.h> /* for request_region */
 #include <linux/uio.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/capability.h>
 #include <linux/bitops.h>
 #include <linux/slab.h>
diff --git a/drivers/atm/horizon.c b/drivers/atm/horizon.c
index d58e3fcb9db3..287506183893 100644
--- a/drivers/atm/horizon.c
+++ b/drivers/atm/horizon.c
@@ -38,6 +38,7 @@
 #include <linux/delay.h>
 #include <linux/uio.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/ioport.h>
 #include <linux/wait.h>
 #include <linux/slab.h>
diff --git a/drivers/atm/idt77252.c b/drivers/atm/idt77252.c
index 1f8d724a18bf..be0dbfeb541c 100644
--- a/drivers/atm/idt77252.c
+++ b/drivers/atm/idt77252.c
@@ -37,6 +37,7 @@
 #include <linux/atm.h>
 #include <linux/delay.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/bitops.h>
 #include <linux/wait.h>
 #include <linux/jiffies.h>
diff --git a/drivers/atm/iphase.c b/drivers/atm/iphase.c
index dee4f01a64d8..957106f636ea 100644
--- a/drivers/atm/iphase.c
+++ b/drivers/atm/iphase.c
@@ -53,6 +53,7 @@
 #include <linux/delay.h>  
 #include <linux/uio.h>  
 #include <linux/init.h>  
+#include <linux/interrupt.h>
 #include <linux/wait.h>
 #include <linux/slab.h>
 #include <asm/system.h>  
diff --git a/drivers/atm/zatm.c b/drivers/atm/zatm.c
index 624917902b65..7f8c5132ff32 100644
--- a/drivers/atm/zatm.c
+++ b/drivers/atm/zatm.c
@@ -16,6 +16,7 @@
 #include <linux/delay.h>
 #include <linux/uio.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/dma-mapping.h>
 #include <linux/atm_zatm.h>
 #include <linux/capability.h>
diff --git a/drivers/infiniband/hw/amso1100/c2.c b/drivers/infiniband/hw/amso1100/c2.c
index 0cfc455630d0..444470a28de2 100644
--- a/drivers/infiniband/hw/amso1100/c2.c
+++ b/drivers/infiniband/hw/amso1100/c2.c
@@ -36,6 +36,7 @@
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/inetdevice.h>
+#include <linux/interrupt.h>
 #include <linux/delay.h>
 #include <linux/ethtool.h>
 #include <linux/mii.h>
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h
index 2f02ab0ccc1e..342cbc1bdaae 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.h
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.h
@@ -45,6 +45,7 @@
 #include <scsi/libiscsi.h>
 #include <scsi/scsi_transport_iscsi.h>
 
+#include <linux/interrupt.h>
 #include <linux/wait.h>
 #include <linux/sched.h>
 #include <linux/list.h>
diff --git a/drivers/isdn/hardware/mISDN/avmfritz.c b/drivers/isdn/hardware/mISDN/avmfritz.c
index 472a2af79446..861b6511f3ee 100644
--- a/drivers/isdn/hardware/mISDN/avmfritz.c
+++ b/drivers/isdn/hardware/mISDN/avmfritz.c
@@ -20,6 +20,7 @@
  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  *
  */
+#include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/pci.h>
 #include <linux/delay.h>
diff --git a/drivers/isdn/hardware/mISDN/hfcmulti.c b/drivers/isdn/hardware/mISDN/hfcmulti.c
index f6f3c87cc7c2..a440d7fff0ad 100644
--- a/drivers/isdn/hardware/mISDN/hfcmulti.c
+++ b/drivers/isdn/hardware/mISDN/hfcmulti.c
@@ -152,6 +152,7 @@
 
 #define HFC_MULTI_VERSION	"2.03"
 
+#include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/pci.h>
diff --git a/drivers/isdn/hardware/mISDN/hfcpci.c b/drivers/isdn/hardware/mISDN/hfcpci.c
index b01a7be1300f..3261de18a91e 100644
--- a/drivers/isdn/hardware/mISDN/hfcpci.c
+++ b/drivers/isdn/hardware/mISDN/hfcpci.c
@@ -44,6 +44,7 @@
  *
  */
 
+#include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/pci.h>
 #include <linux/delay.h>
diff --git a/drivers/isdn/hardware/mISDN/mISDNinfineon.c b/drivers/isdn/hardware/mISDN/mISDNinfineon.c
index bc0529ac88a1..6218775ce87d 100644
--- a/drivers/isdn/hardware/mISDN/mISDNinfineon.c
+++ b/drivers/isdn/hardware/mISDN/mISDNinfineon.c
@@ -38,6 +38,7 @@
  *
  */
 
+#include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/pci.h>
 #include <linux/delay.h>
diff --git a/drivers/isdn/hardware/mISDN/mISDNipac.c b/drivers/isdn/hardware/mISDN/mISDNipac.c
index 64ecc6f5ffaf..d2ffb1d9b831 100644
--- a/drivers/isdn/hardware/mISDN/mISDNipac.c
+++ b/drivers/isdn/hardware/mISDN/mISDNipac.c
@@ -20,6 +20,7 @@
  *
  */
 
+#include <linux/irqreturn.h>
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/mISDNhw.h>
diff --git a/drivers/isdn/hardware/mISDN/netjet.c b/drivers/isdn/hardware/mISDN/netjet.c
index db25b6b2ae39..5ef9f11ee74b 100644
--- a/drivers/isdn/hardware/mISDN/netjet.c
+++ b/drivers/isdn/hardware/mISDN/netjet.c
@@ -20,6 +20,7 @@
  *
  */
 
+#include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/pci.h>
 #include <linux/delay.h>
diff --git a/drivers/isdn/hardware/mISDN/speedfax.c b/drivers/isdn/hardware/mISDN/speedfax.c
index 9e07246bb9e7..4d0d41ea1228 100644
--- a/drivers/isdn/hardware/mISDN/speedfax.c
+++ b/drivers/isdn/hardware/mISDN/speedfax.c
@@ -22,6 +22,7 @@
  *
  */
 
+#include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/pci.h>
diff --git a/drivers/isdn/hardware/mISDN/w6692.c b/drivers/isdn/hardware/mISDN/w6692.c
index 9e84870b971c..e10e0284533c 100644
--- a/drivers/isdn/hardware/mISDN/w6692.c
+++ b/drivers/isdn/hardware/mISDN/w6692.c
@@ -21,6 +21,7 @@
  *
  */
 
+#include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/pci.h>
 #include <linux/delay.h>
diff --git a/drivers/isdn/hisax/hisax.h b/drivers/isdn/hisax/hisax.h
index de1c669c7b13..0a5c42a3f125 100644
--- a/drivers/isdn/hisax/hisax.h
+++ b/drivers/isdn/hisax/hisax.h
@@ -16,6 +16,7 @@
 #include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
+#include <linux/interrupt.h>
 #include <linux/ioport.h>
 #include <linux/timer.h>
 #include <linux/wait.h>
diff --git a/drivers/isdn/hisax/hisax_fcpcipnp.c b/drivers/isdn/hisax/hisax_fcpcipnp.c
index 8b0a7d86b30f..478ebab54ca4 100644
--- a/drivers/isdn/hisax/hisax_fcpcipnp.c
+++ b/drivers/isdn/hisax/hisax_fcpcipnp.c
@@ -25,6 +25,7 @@
 
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/pci.h>
 #include <linux/isapnp.h>
 #include <linux/kmod.h>
diff --git a/drivers/media/dvb/b2c2/flexcop-common.h b/drivers/media/dvb/b2c2/flexcop-common.h
index 9e2148a19967..437912e49824 100644
--- a/drivers/media/dvb/b2c2/flexcop-common.h
+++ b/drivers/media/dvb/b2c2/flexcop-common.h
@@ -6,6 +6,7 @@
 #ifndef __FLEXCOP_COMMON_H__
 #define __FLEXCOP_COMMON_H__
 
+#include <linux/interrupt.h>
 #include <linux/pci.h>
 #include <linux/mutex.h>
 
diff --git a/drivers/media/dvb/dm1105/dm1105.c b/drivers/media/dvb/dm1105/dm1105.c
index b2b0c45f32a9..55e6533f15e9 100644
--- a/drivers/media/dvb/dm1105/dm1105.c
+++ b/drivers/media/dvb/dm1105/dm1105.c
@@ -22,6 +22,7 @@
 #include <linux/i2c.h>
 #include <linux/i2c-algo-bit.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/proc_fs.h>
diff --git a/drivers/media/dvb/mantis/mantis_common.h b/drivers/media/dvb/mantis/mantis_common.h
index bd400d21b81f..49dbca145bb8 100644
--- a/drivers/media/dvb/mantis/mantis_common.h
+++ b/drivers/media/dvb/mantis/mantis_common.h
@@ -21,6 +21,7 @@
 #ifndef __MANTIS_COMMON_H
 #define __MANTIS_COMMON_H
 
+#include <linux/interrupt.h>
 #include <linux/mutex.h>
 #include <linux/workqueue.h>
 
diff --git a/drivers/media/dvb/pluto2/pluto2.c b/drivers/media/dvb/pluto2/pluto2.c
index 7cb79ec685f0..80fb51004461 100644
--- a/drivers/media/dvb/pluto2/pluto2.c
+++ b/drivers/media/dvb/pluto2/pluto2.c
@@ -26,6 +26,7 @@
 #include <linux/i2c.h>
 #include <linux/i2c-algo-bit.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/pci.h>
diff --git a/drivers/net/3c503.c b/drivers/net/3c503.c
index d84f6e8903a5..554a26c55188 100644
--- a/drivers/net/3c503.c
+++ b/drivers/net/3c503.c
@@ -49,6 +49,7 @@ static const char version[] =
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/ethtool.h>
 
 #include <asm/uaccess.h>
diff --git a/drivers/net/8139cp.c b/drivers/net/8139cp.c
index 10c45051caea..73b10b07f9b5 100644
--- a/drivers/net/8139cp.c
+++ b/drivers/net/8139cp.c
@@ -60,6 +60,7 @@
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/pci.h>
 #include <linux/dma-mapping.h>
 #include <linux/delay.h>
diff --git a/drivers/net/8139too.c b/drivers/net/8139too.c
index 98517a373473..ed6355cc5261 100644
--- a/drivers/net/8139too.c
+++ b/drivers/net/8139too.c
@@ -100,6 +100,7 @@
 #include <linux/compiler.h>
 #include <linux/pci.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/rtnetlink.h>
diff --git a/drivers/net/8390.h b/drivers/net/8390.h
index 3d9e8fb4fbee..58a12e4c78f9 100644
--- a/drivers/net/8390.h
+++ b/drivers/net/8390.h
@@ -9,6 +9,7 @@
 
 #include <linux/if_ether.h>
 #include <linux/ioport.h>
+#include <linux/irqreturn.h>
 #include <linux/skbuff.h>
 
 #define TX_PAGES 12	/* Two Tx slots */
diff --git a/drivers/net/ac3200.c b/drivers/net/ac3200.c
index 5181e9322119..f07b2e980fbc 100644
--- a/drivers/net/ac3200.c
+++ b/drivers/net/ac3200.c
@@ -32,6 +32,7 @@ static const char version[] =
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 
 #include <asm/system.h>
 #include <asm/io.h>
diff --git a/drivers/net/acenic.h b/drivers/net/acenic.h
index 0681da7e8753..fd25a3b47fef 100644
--- a/drivers/net/acenic.h
+++ b/drivers/net/acenic.h
@@ -1,5 +1,6 @@
 #ifndef _ACENIC_H_
 #define _ACENIC_H_
+#include <linux/interrupt.h>
 
 
 /*
diff --git a/drivers/net/amd8111e.c b/drivers/net/amd8111e.c
index 81475ea3a711..db6d2da5a8f3 100644
--- a/drivers/net/amd8111e.c
+++ b/drivers/net/amd8111e.c
@@ -75,6 +75,7 @@ Revision History:
 #include <linux/compiler.h>
 #include <linux/delay.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/ioport.h>
 #include <linux/pci.h>
 #include <linux/netdevice.h>
diff --git a/drivers/net/arcnet/arc-rimi.c b/drivers/net/arcnet/arc-rimi.c
index 9efbbbae47ca..25197b698dd6 100644
--- a/drivers/net/arcnet/arc-rimi.c
+++ b/drivers/net/arcnet/arc-rimi.c
@@ -32,6 +32,7 @@
 #include <linux/netdevice.h>
 #include <linux/bootmem.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <asm/io.h>
 #include <linux/arcdevice.h>
 
diff --git a/drivers/net/arcnet/com20020-isa.c b/drivers/net/arcnet/com20020-isa.c
index 37272827ee55..45c61a2c5fbd 100644
--- a/drivers/net/arcnet/com20020-isa.c
+++ b/drivers/net/arcnet/com20020-isa.c
@@ -34,6 +34,7 @@
 #include <linux/delay.h>
 #include <linux/netdevice.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/bootmem.h>
 #include <linux/arcdevice.h>
 #include <linux/com20020.h>
diff --git a/drivers/net/arcnet/com20020-pci.c b/drivers/net/arcnet/com20020-pci.c
index 48a1dbf01e60..d427493997b6 100644
--- a/drivers/net/arcnet/com20020-pci.c
+++ b/drivers/net/arcnet/com20020-pci.c
@@ -34,6 +34,7 @@
 #include <linux/errno.h>
 #include <linux/netdevice.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/pci.h>
 #include <linux/arcdevice.h>
 #include <linux/com20020.h>
diff --git a/drivers/net/arcnet/com20020.c b/drivers/net/arcnet/com20020.c
index c9e459400ff9..7bfb91f32857 100644
--- a/drivers/net/arcnet/com20020.c
+++ b/drivers/net/arcnet/com20020.c
@@ -33,6 +33,7 @@
 #include <linux/delay.h>
 #include <linux/netdevice.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/arcdevice.h>
 #include <linux/com20020.h>
 
diff --git a/drivers/net/arcnet/com90io.c b/drivers/net/arcnet/com90io.c
index eb27976dab37..487d780ebbdf 100644
--- a/drivers/net/arcnet/com90io.c
+++ b/drivers/net/arcnet/com90io.c
@@ -33,6 +33,7 @@
 #include <linux/netdevice.h>
 #include <linux/bootmem.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <asm/io.h>
 #include <linux/arcdevice.h>
 
diff --git a/drivers/net/arcnet/com90xx.c b/drivers/net/arcnet/com90xx.c
index f3b46f71e293..b80fbe40aa0e 100644
--- a/drivers/net/arcnet/com90xx.c
+++ b/drivers/net/arcnet/com90xx.c
@@ -27,6 +27,7 @@
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/ioport.h>
 #include <linux/delay.h>
 #include <linux/netdevice.h>
diff --git a/drivers/net/arm/at91_ether.c b/drivers/net/arm/at91_ether.c
index e07b314ed8fd..29dc43523cec 100644
--- a/drivers/net/arm/at91_ether.c
+++ b/drivers/net/arm/at91_ether.c
@@ -19,6 +19,7 @@
 
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/mii.h>
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
diff --git a/drivers/net/arm/ep93xx_eth.c b/drivers/net/arm/ep93xx_eth.c
index 5a77001b6d10..a167addd5382 100644
--- a/drivers/net/arm/ep93xx_eth.c
+++ b/drivers/net/arm/ep93xx_eth.c
@@ -19,6 +19,7 @@
 #include <linux/etherdevice.h>
 #include <linux/ethtool.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/moduleparam.h>
 #include <linux/platform_device.h>
 #include <linux/delay.h>
diff --git a/drivers/net/arm/ks8695net.c b/drivers/net/arm/ks8695net.c
index a7b0caa18179..bb62b3f51837 100644
--- a/drivers/net/arm/ks8695net.c
+++ b/drivers/net/arm/ks8695net.c
@@ -21,6 +21,7 @@
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/skbuff.h>
 #include <linux/spinlock.h>
 #include <linux/crc32.h>
diff --git a/drivers/net/atl1c/atl1c.h b/drivers/net/atl1c/atl1c.h
index 925929d764ca..dfe4370ccc90 100644
--- a/drivers/net/atl1c/atl1c.h
+++ b/drivers/net/atl1c/atl1c.h
@@ -24,6 +24,7 @@
 
 #include <linux/version.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/module.h>
diff --git a/drivers/net/atl1e/atl1e.h b/drivers/net/atl1e/atl1e.h
index 490d3b38e0cb..9ac37e3ec8fb 100644
--- a/drivers/net/atl1e/atl1e.h
+++ b/drivers/net/atl1e/atl1e.h
@@ -25,6 +25,7 @@
 
 #include <linux/version.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/module.h>
diff --git a/drivers/net/b44.c b/drivers/net/b44.c
index a69331e06b8d..085560e1d17a 100644
--- a/drivers/net/b44.c
+++ b/drivers/net/b44.c
@@ -25,6 +25,7 @@
 #include <linux/pci.h>
 #include <linux/delay.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/dma-mapping.h>
 #include <linux/ssb/ssb.h>
 #include <linux/slab.h>
diff --git a/drivers/net/bmac.c b/drivers/net/bmac.c
index a1b8c8b8010b..d2e58e2bea78 100644
--- a/drivers/net/bmac.c
+++ b/drivers/net/bmac.c
@@ -7,6 +7,7 @@
  * May 1999, Al Viro: proper release of /proc/net/bmac entry, switched to
  * dynamic procfs inode.
  */
+#include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/netdevice.h>
diff --git a/drivers/net/bnx2x/bnx2x_cmn.c b/drivers/net/bnx2x/bnx2x_cmn.c
index 9b248348c0e6..831c6ec98eed 100644
--- a/drivers/net/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/bnx2x/bnx2x_cmn.c
@@ -17,6 +17,7 @@
 
 #include <linux/etherdevice.h>
 #include <linux/if_vlan.h>
+#include <linux/interrupt.h>
 #include <linux/ip.h>
 #include <net/ipv6.h>
 #include <net/ip6_checksum.h>
diff --git a/drivers/net/caif/caif_serial.c b/drivers/net/caif/caif_serial.c
index 3df0c0f8b8bf..1cd0b59d13cb 100644
--- a/drivers/net/caif/caif_serial.c
+++ b/drivers/net/caif/caif_serial.c
@@ -4,6 +4,7 @@
  * License terms: GNU General Public License (GPL) version 2
  */
 
+#include <linux/hardirq.h>
 #include <linux/init.h>
 #include <linux/version.h>
 #include <linux/module.h>
diff --git a/drivers/net/can/sja1000/sja1000.h b/drivers/net/can/sja1000/sja1000.h
index de8e778f6832..78bd4ecac140 100644
--- a/drivers/net/can/sja1000/sja1000.h
+++ b/drivers/net/can/sja1000/sja1000.h
@@ -47,6 +47,7 @@
 #ifndef SJA1000_DEV_H
 #define SJA1000_DEV_H
 
+#include <linux/irqreturn.h>
 #include <linux/can/dev.h>
 #include <linux/can/platform/sja1000.h>
 
diff --git a/drivers/net/cassini.c b/drivers/net/cassini.c
index 22ce03e55b83..b414f5ae0da5 100644
--- a/drivers/net/cassini.c
+++ b/drivers/net/cassini.c
@@ -75,6 +75,7 @@
 #include <linux/slab.h>
 #include <linux/delay.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/vmalloc.h>
 #include <linux/ioport.h>
 #include <linux/pci.h>
diff --git a/drivers/net/cxgb4vf/adapter.h b/drivers/net/cxgb4vf/adapter.h
index 4fd821aadc8a..6e9a8d9ef592 100644
--- a/drivers/net/cxgb4vf/adapter.h
+++ b/drivers/net/cxgb4vf/adapter.h
@@ -40,6 +40,7 @@
 #ifndef __CXGB4VF_ADAPTER_H__
 #define __CXGB4VF_ADAPTER_H__
 
+#include <linux/interrupt.h>
 #include <linux/pci.h>
 #include <linux/spinlock.h>
 #include <linux/skbuff.h>
diff --git a/drivers/net/dm9000.c b/drivers/net/dm9000.c
index ee597e676ee5..863e9c459e65 100644
--- a/drivers/net/dm9000.c
+++ b/drivers/net/dm9000.c
@@ -24,6 +24,7 @@
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/skbuff.h>
 #include <linux/spinlock.h>
 #include <linux/crc32.h>
diff --git a/drivers/net/dnet.c b/drivers/net/dnet.c
index 8318ea06cb6d..b22691046678 100644
--- a/drivers/net/dnet.c
+++ b/drivers/net/dnet.c
@@ -16,6 +16,7 @@
 #include <linux/slab.h>
 #include <linux/delay.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/dma-mapping.h>
diff --git a/drivers/net/e100.c b/drivers/net/e100.c
index e336c7937f05..c1352c60c299 100644
--- a/drivers/net/e100.c
+++ b/drivers/net/e100.c
@@ -149,6 +149,8 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
+#include <linux/hardirq.h>
+#include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/kernel.h>
diff --git a/drivers/net/e1000e/ethtool.c b/drivers/net/e1000e/ethtool.c
index 859d0d3af6c9..cb1a3623253e 100644
--- a/drivers/net/e1000e/ethtool.c
+++ b/drivers/net/e1000e/ethtool.c
@@ -28,6 +28,7 @@
 
 /* ethtool support for e1000 */
 
+#include <linux/interrupt.h>
 #include <linux/netdevice.h>
 #include <linux/ethtool.h>
 #include <linux/pci.h>
diff --git a/drivers/net/e1000e/netdev.c b/drivers/net/e1000e/netdev.c
index 3310c3d477d7..888bd9cc2710 100644
--- a/drivers/net/e1000e/netdev.c
+++ b/drivers/net/e1000e/netdev.c
@@ -31,6 +31,7 @@
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/pci.h>
 #include <linux/vmalloc.h>
 #include <linux/pagemap.h>
diff --git a/drivers/net/e2100.c b/drivers/net/e2100.c
index 94ec973b2bdc..d50a9998ae77 100644
--- a/drivers/net/e2100.c
+++ b/drivers/net/e2100.c
@@ -44,6 +44,7 @@ static const char version[] =
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/delay.h>
 
 #include <asm/io.h>
diff --git a/drivers/net/enic/enic_main.c b/drivers/net/enic/enic_main.c
index 1daf79b36165..9600f435c0fb 100644
--- a/drivers/net/enic/enic_main.c
+++ b/drivers/net/enic/enic_main.c
@@ -23,6 +23,7 @@
 #include <linux/errno.h>
 #include <linux/types.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/workqueue.h>
 #include <linux/pci.h>
 #include <linux/netdevice.h>
diff --git a/drivers/net/es3210.c b/drivers/net/es3210.c
index 0ba5e7b90584..7a09575ecff0 100644
--- a/drivers/net/es3210.c
+++ b/drivers/net/es3210.c
@@ -54,6 +54,7 @@ static const char version[] =
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 
diff --git a/drivers/net/ethoc.c b/drivers/net/ethoc.c
index a83dd312c3ac..9162c705c4fa 100644
--- a/drivers/net/ethoc.c
+++ b/drivers/net/ethoc.c
@@ -13,6 +13,7 @@
 
 #include <linux/etherdevice.h>
 #include <linux/crc32.h>
+#include <linux/interrupt.h>
 #include <linux/io.h>
 #include <linux/mii.h>
 #include <linux/phy.h>
diff --git a/drivers/net/fec_mpc52xx.c b/drivers/net/fec_mpc52xx.c
index 9f81b1ac130e..eba0f696a51f 100644
--- a/drivers/net/fec_mpc52xx.c
+++ b/drivers/net/fec_mpc52xx.c
@@ -22,6 +22,7 @@
 #include <linux/slab.h>
 #include <linux/errno.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/crc32.h>
 #include <linux/hardirq.h>
 #include <linux/delay.h>
diff --git a/drivers/net/greth.c b/drivers/net/greth.c
index f181304a7ab6..69b86d7fac85 100644
--- a/drivers/net/greth.c
+++ b/drivers/net/greth.c
@@ -25,6 +25,7 @@
 #include <linux/module.h>
 #include <linux/uaccess.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/ethtool.h>
diff --git a/drivers/net/hamradio/baycom_ser_fdx.c b/drivers/net/hamradio/baycom_ser_fdx.c
index 99cdce33df8b..a974727dd9a2 100644
--- a/drivers/net/hamradio/baycom_ser_fdx.c
+++ b/drivers/net/hamradio/baycom_ser_fdx.c
@@ -76,6 +76,7 @@
 #include <linux/ioport.h>
 #include <linux/string.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/hdlcdrv.h>
 #include <linux/baycom.h>
 #include <linux/jiffies.h>
diff --git a/drivers/net/hamradio/baycom_ser_hdx.c b/drivers/net/hamradio/baycom_ser_hdx.c
index d92fe6ca788f..e349d867449b 100644
--- a/drivers/net/hamradio/baycom_ser_hdx.c
+++ b/drivers/net/hamradio/baycom_ser_hdx.c
@@ -66,6 +66,7 @@
 #include <linux/ioport.h>
 #include <linux/string.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
 #include <linux/hdlcdrv.h>
diff --git a/drivers/net/hp-plus.c b/drivers/net/hp-plus.c
index 82bffc3cabdf..29917363ebfb 100644
--- a/drivers/net/hp-plus.c
+++ b/drivers/net/hp-plus.c
@@ -30,6 +30,7 @@ static const char version[] =
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/delay.h>
 
 #include <asm/system.h>
diff --git a/drivers/net/hp.c b/drivers/net/hp.c
index ef2014375e62..18564d4a7c04 100644
--- a/drivers/net/hp.c
+++ b/drivers/net/hp.c
@@ -30,6 +30,7 @@ static const char version[] =
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/delay.h>
 
 #include <asm/system.h>
diff --git a/drivers/net/ibmveth.c b/drivers/net/ibmveth.c
index b388d782c7c4..838c5b673767 100644
--- a/drivers/net/ibmveth.c
+++ b/drivers/net/ibmveth.c
@@ -34,6 +34,7 @@
 #include <linux/etherdevice.h>
 #include <linux/skbuff.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/mm.h>
 #include <linux/pm.h>
 #include <linux/ethtool.h>
diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
index 4fecaed67fc4..ce53f4a23b19 100644
--- a/drivers/net/ifb.c
+++ b/drivers/net/ifb.c
@@ -32,6 +32,7 @@
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/moduleparam.h>
 #include <net/pkt_sched.h>
 #include <net/net_namespace.h>
diff --git a/drivers/net/ipg.c b/drivers/net/ipg.c
index 58cd3202b48c..d4aa40adf1e9 100644
--- a/drivers/net/ipg.c
+++ b/drivers/net/ipg.c
@@ -22,6 +22,7 @@
  */
 #include <linux/crc32.h>
 #include <linux/ethtool.h>
+#include <linux/interrupt.h>
 #include <linux/gfp.h>
 #include <linux/mii.h>
 #include <linux/mutex.h>
diff --git a/drivers/net/irda/ali-ircc.c b/drivers/net/irda/ali-ircc.c
index d532dde5120f..963067d3bda2 100644
--- a/drivers/net/irda/ali-ircc.c
+++ b/drivers/net/irda/ali-ircc.c
@@ -31,6 +31,7 @@
 #include <linux/ioport.h>
 #include <linux/delay.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/rtnetlink.h>
 #include <linux/serial_reg.h>
 #include <linux/dma-mapping.h>
diff --git a/drivers/net/irda/donauboe.c b/drivers/net/irda/donauboe.c
index 174cafad2c1a..b45b2cc42804 100644
--- a/drivers/net/irda/donauboe.c
+++ b/drivers/net/irda/donauboe.c
@@ -152,6 +152,7 @@
 #include <linux/delay.h>
 #include <linux/slab.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/pci.h>
 #include <linux/rtnetlink.h>
 
diff --git a/drivers/net/irda/nsc-ircc.c b/drivers/net/irda/nsc-ircc.c
index 7a963d4e6d06..b56636da6cc3 100644
--- a/drivers/net/irda/nsc-ircc.c
+++ b/drivers/net/irda/nsc-ircc.c
@@ -52,6 +52,7 @@
 #include <linux/ioport.h>
 #include <linux/delay.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/rtnetlink.h>
 #include <linux/dma-mapping.h>
 #include <linux/pnp.h>
diff --git a/drivers/net/irda/pxaficp_ir.c b/drivers/net/irda/pxaficp_ir.c
index 001ed0a255f6..b1d1ce3dd8b5 100644
--- a/drivers/net/irda/pxaficp_ir.c
+++ b/drivers/net/irda/pxaficp_ir.c
@@ -12,6 +12,7 @@
  * Infra-red driver (SIR/FIR) for the PXA2xx embedded microprocessor
  *
  */
+#include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
diff --git a/drivers/net/irda/sir_dev.c b/drivers/net/irda/sir_dev.c
index efe05bb34dd8..5039f08f5a5b 100644
--- a/drivers/net/irda/sir_dev.c
+++ b/drivers/net/irda/sir_dev.c
@@ -11,6 +11,7 @@
  *
  ********************************************************************/    
 
+#include <linux/hardirq.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
diff --git a/drivers/net/irda/smsc-ircc2.c b/drivers/net/irda/smsc-ircc2.c
index 8800e1fe4129..d5072af81b2d 100644
--- a/drivers/net/irda/smsc-ircc2.c
+++ b/drivers/net/irda/smsc-ircc2.c
@@ -49,6 +49,7 @@
 #include <linux/ioport.h>
 #include <linux/delay.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/rtnetlink.h>
 #include <linux/serial_reg.h>
 #include <linux/dma-mapping.h>
diff --git a/drivers/net/irda/via-ircc.c b/drivers/net/irda/via-ircc.c
index f504b262ba36..6d6479049aa1 100644
--- a/drivers/net/irda/via-ircc.c
+++ b/drivers/net/irda/via-ircc.c
@@ -46,6 +46,7 @@ F02 Oct/28/02: Add SB device ID for 3147 and 3177.
 #include <linux/ioport.h>
 #include <linux/delay.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/rtnetlink.h>
 #include <linux/pci.h>
 #include <linux/dma-mapping.h>
diff --git a/drivers/net/irda/vlsi_ir.c b/drivers/net/irda/vlsi_ir.c
index c3d07382b7fa..9021d0131727 100644
--- a/drivers/net/irda/vlsi_ir.c
+++ b/drivers/net/irda/vlsi_ir.c
@@ -36,6 +36,7 @@ MODULE_LICENSE("GPL");
 
 #include <linux/kernel.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/pci.h>
 #include <linux/slab.h>
 #include <linux/netdevice.h>
diff --git a/drivers/net/irda/w83977af_ir.c b/drivers/net/irda/w83977af_ir.c
index 1f9c3f08d1a3..c4366601b067 100644
--- a/drivers/net/irda/w83977af_ir.c
+++ b/drivers/net/irda/w83977af_ir.c
@@ -47,6 +47,7 @@
 #include <linux/ioport.h>
 #include <linux/delay.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/rtnetlink.h>
 #include <linux/dma-mapping.h>
 #include <linux/gfp.h>
diff --git a/drivers/net/ixgbe/ixgbe_ethtool.c b/drivers/net/ixgbe/ixgbe_ethtool.c
index cb1555bc8548..4950d03d3ef8 100644
--- a/drivers/net/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ixgbe/ixgbe_ethtool.c
@@ -27,6 +27,7 @@
 
 /* ethtool support for ixgbe */
 
+#include <linux/interrupt.h>
 #include <linux/types.h>
 #include <linux/module.h>
 #include <linux/slab.h>
diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index 08e8e25c159d..06cfaf31bcf3 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -32,6 +32,7 @@
 #include <linux/vmalloc.h>
 #include <linux/string.h>
 #include <linux/in.h>
+#include <linux/interrupt.h>
 #include <linux/ip.h>
 #include <linux/tcp.h>
 #include <linux/pkt_sched.h>
diff --git a/drivers/net/ixp2000/ixpdev.c b/drivers/net/ixp2000/ixpdev.c
index 78ddd8b79e7e..e122493ab70e 100644
--- a/drivers/net/ixp2000/ixpdev.c
+++ b/drivers/net/ixp2000/ixpdev.c
@@ -14,6 +14,7 @@
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/moduleparam.h>
 #include <linux/gfp.h>
 #include <asm/hardware/uengine.h>
diff --git a/drivers/net/jme.h b/drivers/net/jme.h
index e9aaeca96abc..0d5da06489d8 100644
--- a/drivers/net/jme.h
+++ b/drivers/net/jme.h
@@ -24,6 +24,7 @@
 
 #ifndef __JME_H_INCLUDED__
 #define __JME_H_INCLUDED__
+#include <linux/interrupt.h>
 
 #define DRV_NAME	"jme"
 #define DRV_VERSION	"1.0.8"
diff --git a/drivers/net/ks8842.c b/drivers/net/ks8842.c
index fc12ac0d9f2e..4a6ae057e3b1 100644
--- a/drivers/net/ks8842.c
+++ b/drivers/net/ks8842.c
@@ -23,6 +23,7 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
+#include <linux/interrupt.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/platform_device.h>
diff --git a/drivers/net/ks8851.c b/drivers/net/ks8851.c
index bcd9ba68c9f2..f56743a28fc0 100644
--- a/drivers/net/ks8851.c
+++ b/drivers/net/ks8851.c
@@ -13,6 +13,7 @@
 
 #define DEBUG
 
+#include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/netdevice.h>
diff --git a/drivers/net/ks8851_mll.c b/drivers/net/ks8851_mll.c
index 61631cace913..aefbdd896d6a 100644
--- a/drivers/net/ks8851_mll.c
+++ b/drivers/net/ks8851_mll.c
@@ -23,6 +23,7 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
+#include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/netdevice.h>
diff --git a/drivers/net/ksz884x.c b/drivers/net/ksz884x.c
index 41ea5920c158..2ac6c6c984b3 100644
--- a/drivers/net/ksz884x.c
+++ b/drivers/net/ksz884x.c
@@ -17,6 +17,7 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/ioport.h>
diff --git a/drivers/net/lne390.c b/drivers/net/lne390.c
index 8a1097cf8a83..f9888d20177b 100644
--- a/drivers/net/lne390.c
+++ b/drivers/net/lne390.c
@@ -41,6 +41,7 @@ static const char *version =
 #include <linux/string.h>
 #include <linux/delay.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 
diff --git a/drivers/net/macb.c b/drivers/net/macb.c
index 6c6a02869dfc..818f0b8c2772 100644
--- a/drivers/net/macb.c
+++ b/drivers/net/macb.c
@@ -15,6 +15,7 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/dma-mapping.h>
diff --git a/drivers/net/mace.c b/drivers/net/mace.c
index 1c5221f79d6f..2074e9724ba3 100644
--- a/drivers/net/mace.c
+++ b/drivers/net/mace.c
@@ -13,6 +13,7 @@
 #include <linux/string.h>
 #include <linux/timer.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/crc32.h>
 #include <linux/spinlock.h>
 #include <linux/bitrev.h>
diff --git a/drivers/net/netx-eth.c b/drivers/net/netx-eth.c
index 2e4b42175f3f..2dfee892d200 100644
--- a/drivers/net/netx-eth.c
+++ b/drivers/net/netx-eth.c
@@ -18,6 +18,7 @@
  */
 
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/delay.h>
diff --git a/drivers/net/niu.c b/drivers/net/niu.c
index cc25bff0bd3b..543eb17acdc5 100644
--- a/drivers/net/niu.c
+++ b/drivers/net/niu.c
@@ -7,6 +7,7 @@
 
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/pci.h>
 #include <linux/dma-mapping.h>
 #include <linux/netdevice.h>
diff --git a/drivers/net/ns83820.c b/drivers/net/ns83820.c
index 3e4040f2f3cb..d3afb458987e 100644
--- a/drivers/net/ns83820.c
+++ b/drivers/net/ns83820.c
@@ -106,6 +106,7 @@
 #include <linux/delay.h>
 #include <linux/workqueue.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/ip.h>	/* for iph */
 #include <linux/in.h>	/* for IPPROTO_... */
 #include <linux/compiler.h>
diff --git a/drivers/net/ppp_async.c b/drivers/net/ppp_async.c
index a1b82c9c67d2..6436ba916fe4 100644
--- a/drivers/net/ppp_async.c
+++ b/drivers/net/ppp_async.c
@@ -30,6 +30,7 @@
 #include <linux/ppp_channel.h>
 #include <linux/spinlock.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/jiffies.h>
 #include <linux/slab.h>
 #include <asm/unaligned.h>
diff --git a/drivers/net/ppp_synctty.c b/drivers/net/ppp_synctty.c
index 2573f525f11c..736a39ee05bb 100644
--- a/drivers/net/ppp_synctty.c
+++ b/drivers/net/ppp_synctty.c
@@ -44,6 +44,7 @@
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/slab.h>
 #include <asm/unaligned.h>
 #include <asm/uaccess.h>
diff --git a/drivers/net/ps3_gelic_net.c b/drivers/net/ps3_gelic_net.c
index d47abb4e7eb6..35e47c3cab35 100644
--- a/drivers/net/ps3_gelic_net.c
+++ b/drivers/net/ps3_gelic_net.c
@@ -28,6 +28,7 @@
 
 #undef DEBUG
 
+#include <linux/interrupt.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/slab.h>
diff --git a/drivers/net/qlge/qlge.h b/drivers/net/qlge/qlge.h
index d32850715f5c..7d8483f9012e 100644
--- a/drivers/net/qlge/qlge.h
+++ b/drivers/net/qlge/qlge.h
@@ -7,6 +7,7 @@
 #ifndef _QLGE_H_
 #define _QLGE_H_
 
+#include <linux/interrupt.h>
 #include <linux/pci.h>
 #include <linux/netdevice.h>
 #include <linux/rtnetlink.h>
diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
index ef1ce2ebeb4a..73108249462e 100644
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -22,6 +22,7 @@
 #include <linux/ip.h>
 #include <linux/tcp.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/dma-mapping.h>
 #include <linux/pm_runtime.h>
 #include <linux/firmware.h>
diff --git a/drivers/net/sc92031.c b/drivers/net/sc92031.c
index fa74314ef789..9da47337b7c3 100644
--- a/drivers/net/sc92031.c
+++ b/drivers/net/sc92031.c
@@ -22,6 +22,7 @@
  * matching, so you need to enable IFF_PROMISC when using it.
  */
 
+#include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/delay.h>
diff --git a/drivers/net/sfc/nic.c b/drivers/net/sfc/nic.c
index f2a2b947f860..bafa23a6874c 100644
--- a/drivers/net/sfc/nic.c
+++ b/drivers/net/sfc/nic.c
@@ -10,6 +10,7 @@
 
 #include <linux/bitops.h>
 #include <linux/delay.h>
+#include <linux/interrupt.h>
 #include <linux/pci.h>
 #include <linux/module.h>
 #include <linux/seq_file.h>
diff --git a/drivers/net/sis190.c b/drivers/net/sis190.c
index b436e007eea0..8ad7bfbaa3af 100644
--- a/drivers/net/sis190.c
+++ b/drivers/net/sis190.c
@@ -21,6 +21,7 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
+#include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/netdevice.h>
diff --git a/drivers/net/skge.h b/drivers/net/skge.h
index 598bf7a1a55e..a2eb34115844 100644
--- a/drivers/net/skge.h
+++ b/drivers/net/skge.h
@@ -3,6 +3,7 @@
  */
 #ifndef _SKGE_H
 #define _SKGE_H
+#include <linux/interrupt.h>
 
 /* PCI config registers */
 #define PCI_DEV_REG1	0x40
diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c
index 3ee41da130c2..d252cb123a5f 100644
--- a/drivers/net/sky2.c
+++ b/drivers/net/sky2.c
@@ -32,6 +32,7 @@
 #include <linux/etherdevice.h>
 #include <linux/ethtool.h>
 #include <linux/pci.h>
+#include <linux/interrupt.h>
 #include <linux/ip.h>
 #include <linux/slab.h>
 #include <net/ip.h>
diff --git a/drivers/net/smc-mca.c b/drivers/net/smc-mca.c
index d07c39cb4daf..34934fb23b97 100644
--- a/drivers/net/smc-mca.c
+++ b/drivers/net/smc-mca.c
@@ -42,6 +42,7 @@
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 
diff --git a/drivers/net/smc-ultra.c b/drivers/net/smc-ultra.c
index 235a3c6c9f91..ba44ede29198 100644
--- a/drivers/net/smc-ultra.c
+++ b/drivers/net/smc-ultra.c
@@ -62,6 +62,7 @@ static const char version[] =
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/isapnp.h>
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
diff --git a/drivers/net/smsc911x.c b/drivers/net/smsc911x.c
index c6d47d10590c..054597625d55 100644
--- a/drivers/net/smsc911x.c
+++ b/drivers/net/smsc911x.c
@@ -37,6 +37,7 @@
 #include <linux/etherdevice.h>
 #include <linux/ethtool.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/ioport.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
diff --git a/drivers/net/smsc9420.c b/drivers/net/smsc9420.c
index 4c92ad8be765..5e5e64da5cb9 100644
--- a/drivers/net/smsc9420.c
+++ b/drivers/net/smsc9420.c
@@ -19,6 +19,7 @@
  ***************************************************************************
  */
 
+#include <linux/interrupt.h>
 #include <linux/kernel.h>
 #include <linux/netdevice.h>
 #include <linux/phy.h>
diff --git a/drivers/net/spider_net.c b/drivers/net/spider_net.c
index 949f124e1278..9bc6c20e8b97 100644
--- a/drivers/net/spider_net.c
+++ b/drivers/net/spider_net.c
@@ -31,6 +31,7 @@
 #include <linux/if_vlan.h>
 #include <linux/in.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/gfp.h>
 #include <linux/ioport.h>
 #include <linux/ip.h>
diff --git a/drivers/net/starfire.c b/drivers/net/starfire.c
index 36045f3b0327..860a50815f84 100644
--- a/drivers/net/starfire.c
+++ b/drivers/net/starfire.c
@@ -30,6 +30,7 @@
 #define DRV_VERSION	"2.1"
 #define DRV_RELDATE	"July  6, 2008"
 
+#include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/pci.h>
diff --git a/drivers/net/stmmac/stmmac_ethtool.c b/drivers/net/stmmac/stmmac_ethtool.c
index ae5213a8c4cd..720c5a1531bc 100644
--- a/drivers/net/stmmac/stmmac_ethtool.c
+++ b/drivers/net/stmmac/stmmac_ethtool.c
@@ -24,6 +24,7 @@
 
 #include <linux/etherdevice.h>
 #include <linux/ethtool.h>
+#include <linux/interrupt.h>
 #include <linux/mii.h>
 #include <linux/phy.h>
 
diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index a1f9f9eef37d..c8a145d72f17 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -26,6 +26,7 @@
 #include <linux/delay.h>
 #include <linux/in.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/ioport.h>
 #include <linux/pci.h>
 #include <linux/netdevice.h>
diff --git a/drivers/net/tlan.c b/drivers/net/tlan.c
index ace6404e2fac..145871b3130b 100644
--- a/drivers/net/tlan.c
+++ b/drivers/net/tlan.c
@@ -29,8 +29,10 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
+#include <linux/hardirq.h>
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/ioport.h>
 #include <linux/eisa.h>
 #include <linux/pci.h>
diff --git a/drivers/net/tokenring/ibmtr.c b/drivers/net/tokenring/ibmtr.c
index e896ad0e8d24..e257a00fe14b 100644
--- a/drivers/net/tokenring/ibmtr.c
+++ b/drivers/net/tokenring/ibmtr.c
@@ -123,6 +123,7 @@ in the event that chatty debug messages are desired - jjs 12/30/98 */
 /* some 95 OS send many non UI frame; this allow removing the warning */
 #define TR_FILTERNONUI	1
 
+#include <linux/interrupt.h>
 #include <linux/ioport.h>
 #include <linux/netdevice.h>
 #include <linux/ip.h>
diff --git a/drivers/net/tsi108_eth.c b/drivers/net/tsi108_eth.c
index 5c633a32eaeb..64cb9ac19ed9 100644
--- a/drivers/net/tsi108_eth.c
+++ b/drivers/net/tsi108_eth.c
@@ -33,6 +33,7 @@
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/net.h>
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
diff --git a/drivers/net/tulip/de2104x.c b/drivers/net/tulip/de2104x.c
index e2f692351180..ce90efc6ba3c 100644
--- a/drivers/net/tulip/de2104x.c
+++ b/drivers/net/tulip/de2104x.c
@@ -38,6 +38,7 @@
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/pci.h>
 #include <linux/delay.h>
 #include <linux/ethtool.h>
diff --git a/drivers/net/tulip/pnic.c b/drivers/net/tulip/pnic.c
index aa4d9dad0395..52d898bdbeb4 100644
--- a/drivers/net/tulip/pnic.c
+++ b/drivers/net/tulip/pnic.c
@@ -13,6 +13,7 @@
 	Please submit bugs to http://bugzilla.kernel.org/ .
 */
 
+#include <linux/interrupt.h>
 #include <linux/kernel.h>
 #include <linux/jiffies.h>
 #include "tulip.h"
diff --git a/drivers/net/tulip/tulip_core.c b/drivers/net/tulip/tulip_core.c
index 82f87647207e..1246998a677c 100644
--- a/drivers/net/tulip/tulip_core.c
+++ b/drivers/net/tulip/tulip_core.c
@@ -28,6 +28,7 @@
 #include <linux/slab.h>
 #include "tulip.h"
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/etherdevice.h>
 #include <linux/delay.h>
 #include <linux/mii.h>
diff --git a/drivers/net/vxge/vxge-config.h b/drivers/net/vxge/vxge-config.h
index 359b9b9f8041..6219006d9d2e 100644
--- a/drivers/net/vxge/vxge-config.h
+++ b/drivers/net/vxge/vxge-config.h
@@ -13,6 +13,7 @@
  ******************************************************************************/
 #ifndef VXGE_CONFIG_H
 #define VXGE_CONFIG_H
+#include <linux/hardirq.h>
 #include <linux/list.h>
 #include <linux/slab.h>
 
diff --git a/drivers/net/vxge/vxge-main.c b/drivers/net/vxge/vxge-main.c
index 8ab870a2ad02..e658edd1c959 100644
--- a/drivers/net/vxge/vxge-main.c
+++ b/drivers/net/vxge/vxge-main.c
@@ -44,6 +44,7 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/if_vlan.h>
+#include <linux/interrupt.h>
 #include <linux/pci.h>
 #include <linux/slab.h>
 #include <linux/tcp.h>
diff --git a/drivers/net/wan/cycx_main.c b/drivers/net/wan/cycx_main.c
index 859dba9b972e..a0976d1f97c6 100644
--- a/drivers/net/wan/cycx_main.c
+++ b/drivers/net/wan/cycx_main.c
@@ -50,6 +50,7 @@
 #include <linux/wanrouter.h>	/* WAN router definitions */
 #include <linux/cyclomx.h>	/* cyclomx common user API definitions */
 #include <linux/init.h>         /* __init (when not using as a module) */
+#include <linux/interrupt.h>
 
 unsigned int cycx_debug;
 
diff --git a/drivers/net/wan/dscc4.c b/drivers/net/wan/dscc4.c
index acb9ea830628..3590d588327c 100644
--- a/drivers/net/wan/dscc4.c
+++ b/drivers/net/wan/dscc4.c
@@ -99,6 +99,7 @@
 #include <asm/irq.h>
 
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/string.h>
 
 #include <linux/if_arp.h>
diff --git a/drivers/net/wan/farsync.c b/drivers/net/wan/farsync.c
index e050bd65e037..1eba06f6fa4c 100644
--- a/drivers/net/wan/farsync.c
+++ b/drivers/net/wan/farsync.c
@@ -25,6 +25,7 @@
 #include <linux/slab.h>
 #include <linux/ioport.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/if.h>
 #include <linux/hdlc.h>
 #include <asm/io.h>
diff --git a/drivers/net/wan/wanxl.c b/drivers/net/wan/wanxl.c
index db73a7be199f..4ea89fe0006f 100644
--- a/drivers/net/wan/wanxl.c
+++ b/drivers/net/wan/wanxl.c
@@ -22,6 +22,7 @@
 #include <linux/string.h>
 #include <linux/errno.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/ioport.h>
 #include <linux/netdevice.h>
 #include <linux/hdlc.h>
diff --git a/drivers/net/wireless/adm8211.c b/drivers/net/wireless/adm8211.c
index afe2cbc6cb24..43ebc44fc82c 100644
--- a/drivers/net/wireless/adm8211.c
+++ b/drivers/net/wireless/adm8211.c
@@ -16,6 +16,7 @@
  */
 
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/if.h>
 #include <linux/skbuff.h>
 #include <linux/slab.h>
diff --git a/drivers/net/wireless/ath/ath9k/ath9k.h b/drivers/net/wireless/ath/ath9k/ath9k.h
index f75068b4b310..e99dfe6065bf 100644
--- a/drivers/net/wireless/ath/ath9k/ath9k.h
+++ b/drivers/net/wireless/ath/ath9k/ath9k.h
@@ -19,6 +19,7 @@
 
 #include <linux/etherdevice.h>
 #include <linux/device.h>
+#include <linux/interrupt.h>
 #include <linux/leds.h>
 #include <linux/completion.h>
 
diff --git a/drivers/net/wireless/atmel.c b/drivers/net/wireless/atmel.c
index 39a11e8af4fa..7e45ca2e78ef 100644
--- a/drivers/net/wireless/atmel.c
+++ b/drivers/net/wireless/atmel.c
@@ -40,6 +40,7 @@
 ******************************************************************************/
 
 #include <linux/init.h>
+#include <linux/interrupt.h>
 
 #include <linux/kernel.h>
 #include <linux/ptrace.h>
diff --git a/drivers/net/wireless/hostap/hostap_wlan.h b/drivers/net/wireless/hostap/hostap_wlan.h
index 88dc6a52bdf1..7bb0b4b3f2cb 100644
--- a/drivers/net/wireless/hostap/hostap_wlan.h
+++ b/drivers/net/wireless/hostap/hostap_wlan.h
@@ -1,6 +1,7 @@
 #ifndef HOSTAP_WLAN_H
 #define HOSTAP_WLAN_H
 
+#include <linux/interrupt.h>
 #include <linux/wireless.h>
 #include <linux/netdevice.h>
 #include <linux/mutex.h>
diff --git a/drivers/net/wireless/ipw2x00/ipw2200.h b/drivers/net/wireless/ipw2x00/ipw2200.h
index 91795b5a93c5..ecb561d7a7a0 100644
--- a/drivers/net/wireless/ipw2x00/ipw2200.h
+++ b/drivers/net/wireless/ipw2x00/ipw2200.h
@@ -32,6 +32,7 @@
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/mutex.h>
 
 #include <linux/pci.h>
diff --git a/drivers/net/wireless/ipw2x00/libipw_wx.c b/drivers/net/wireless/ipw2x00/libipw_wx.c
index d7bd6cf00a81..6623e5052254 100644
--- a/drivers/net/wireless/ipw2x00/libipw_wx.c
+++ b/drivers/net/wireless/ipw2x00/libipw_wx.c
@@ -30,6 +30,7 @@
 
 ******************************************************************************/
 
+#include <linux/hardirq.h>
 #include <linux/kmod.h>
 #include <linux/slab.h>
 #include <linux/module.h>
diff --git a/drivers/net/wireless/iwlegacy/iwl-dev.h b/drivers/net/wireless/iwlegacy/iwl-dev.h
index be0106c6a2da..416448acd971 100644
--- a/drivers/net/wireless/iwlegacy/iwl-dev.h
+++ b/drivers/net/wireless/iwlegacy/iwl-dev.h
@@ -32,6 +32,7 @@
 #ifndef __iwl_legacy_dev_h__
 #define __iwl_legacy_dev_h__
 
+#include <linux/interrupt.h>
 #include <linux/pci.h> /* for struct pci_device_id */
 #include <linux/kernel.h>
 #include <linux/leds.h>
diff --git a/drivers/net/wireless/iwlwifi/iwl-dev.h b/drivers/net/wireless/iwlwifi/iwl-dev.h
index 22a6e3ec7094..1f4797d9848d 100644
--- a/drivers/net/wireless/iwlwifi/iwl-dev.h
+++ b/drivers/net/wireless/iwlwifi/iwl-dev.h
@@ -31,6 +31,7 @@
 #ifndef __iwl_dev_h__
 #define __iwl_dev_h__
 
+#include <linux/interrupt.h>
 #include <linux/pci.h> /* for struct pci_device_id */
 #include <linux/kernel.h>
 #include <linux/wait.h>
diff --git a/drivers/net/wireless/libertas/cfg.c b/drivers/net/wireless/libertas/cfg.c
index 5d637af2d7c3..b456a53b64b1 100644
--- a/drivers/net/wireless/libertas/cfg.c
+++ b/drivers/net/wireless/libertas/cfg.c
@@ -8,6 +8,7 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
+#include <linux/hardirq.h>
 #include <linux/sched.h>
 #include <linux/wait.h>
 #include <linux/slab.h>
diff --git a/drivers/net/wireless/libertas/cmd.c b/drivers/net/wireless/libertas/cmd.c
index 71c8f3fccfa1..9dcf967e1ee4 100644
--- a/drivers/net/wireless/libertas/cmd.c
+++ b/drivers/net/wireless/libertas/cmd.c
@@ -3,6 +3,7 @@
  * It prepares command and sends it to firmware when it is ready.
  */
 
+#include <linux/hardirq.h>
 #include <linux/kfifo.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
diff --git a/drivers/net/wireless/libertas/cmdresp.c b/drivers/net/wireless/libertas/cmdresp.c
index 207fc361db84..2ffe5a1ff003 100644
--- a/drivers/net/wireless/libertas/cmdresp.c
+++ b/drivers/net/wireless/libertas/cmdresp.c
@@ -3,6 +3,7 @@
  * responses as well as events generated by firmware.
  */
 
+#include <linux/hardirq.h>
 #include <linux/slab.h>
 #include <linux/delay.h>
 #include <linux/sched.h>
diff --git a/drivers/net/wireless/libertas/debugfs.c b/drivers/net/wireless/libertas/debugfs.c
index 23250f621761..1af182778844 100644
--- a/drivers/net/wireless/libertas/debugfs.c
+++ b/drivers/net/wireless/libertas/debugfs.c
@@ -1,6 +1,7 @@
 #include <linux/dcache.h>
 #include <linux/debugfs.h>
 #include <linux/delay.h>
+#include <linux/hardirq.h>
 #include <linux/mm.h>
 #include <linux/string.h>
 #include <linux/slab.h>
diff --git a/drivers/net/wireless/libertas/ethtool.c b/drivers/net/wireless/libertas/ethtool.c
index 29dbce4a9f86..4dfb3bfd2cf3 100644
--- a/drivers/net/wireless/libertas/ethtool.c
+++ b/drivers/net/wireless/libertas/ethtool.c
@@ -1,3 +1,4 @@
+#include <linux/hardirq.h>
 #include <linux/netdevice.h>
 #include <linux/ethtool.h>
 #include <linux/delay.h>
diff --git a/drivers/net/wireless/libertas/if_spi.c b/drivers/net/wireless/libertas/if_spi.c
index 463352c890d7..4fa0be9943f9 100644
--- a/drivers/net/wireless/libertas/if_spi.c
+++ b/drivers/net/wireless/libertas/if_spi.c
@@ -19,6 +19,8 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
+#include <linux/hardirq.h>
+#include <linux/interrupt.h>
 #include <linux/moduleparam.h>
 #include <linux/firmware.h>
 #include <linux/jiffies.h>
diff --git a/drivers/net/wireless/libertas/main.c b/drivers/net/wireless/libertas/main.c
index 8c40949cb076..cf3d2c8e1969 100644
--- a/drivers/net/wireless/libertas/main.c
+++ b/drivers/net/wireless/libertas/main.c
@@ -9,6 +9,7 @@
 #include <linux/moduleparam.h>
 #include <linux/delay.h>
 #include <linux/etherdevice.h>
+#include <linux/hardirq.h>
 #include <linux/netdevice.h>
 #include <linux/if_arp.h>
 #include <linux/kthread.h>
diff --git a/drivers/net/wireless/libertas/mesh.c b/drivers/net/wireless/libertas/mesh.c
index 24cf06680c6b..7969d104189d 100644
--- a/drivers/net/wireless/libertas/mesh.c
+++ b/drivers/net/wireless/libertas/mesh.c
@@ -2,6 +2,7 @@
 
 #include <linux/delay.h>
 #include <linux/etherdevice.h>
+#include <linux/hardirq.h>
 #include <linux/netdevice.h>
 #include <linux/if_ether.h>
 #include <linux/if_arp.h>
diff --git a/drivers/net/wireless/libertas/rx.c b/drivers/net/wireless/libertas/rx.c
index fdb0448301a0..bfb8898ae518 100644
--- a/drivers/net/wireless/libertas/rx.c
+++ b/drivers/net/wireless/libertas/rx.c
@@ -5,6 +5,7 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/etherdevice.h>
+#include <linux/hardirq.h>
 #include <linux/slab.h>
 #include <linux/types.h>
 #include <net/cfg80211.h>
diff --git a/drivers/net/wireless/libertas/tx.c b/drivers/net/wireless/libertas/tx.c
index bbb95f88dc01..f19495b178f6 100644
--- a/drivers/net/wireless/libertas/tx.c
+++ b/drivers/net/wireless/libertas/tx.c
@@ -1,6 +1,7 @@
 /*
  * This file contains the handling of TX in wlan driver.
  */
+#include <linux/hardirq.h>
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/sched.h>
diff --git a/drivers/net/wireless/libertas_tf/cmd.c b/drivers/net/wireless/libertas_tf/cmd.c
index 8945afd6ce3e..13557fe0bf95 100644
--- a/drivers/net/wireless/libertas_tf/cmd.c
+++ b/drivers/net/wireless/libertas_tf/cmd.c
@@ -9,6 +9,7 @@
  */
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
+#include <linux/hardirq.h>
 #include <linux/slab.h>
 
 #include "libertas_tf.h"
diff --git a/drivers/net/wireless/libertas_tf/main.c b/drivers/net/wireless/libertas_tf/main.c
index d4005081f1df..5beb58142e97 100644
--- a/drivers/net/wireless/libertas_tf/main.c
+++ b/drivers/net/wireless/libertas_tf/main.c
@@ -9,6 +9,7 @@
  */
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
+#include <linux/hardirq.h>
 #include <linux/slab.h>
 
 #include <linux/etherdevice.h>
diff --git a/drivers/net/wireless/mwl8k.c b/drivers/net/wireless/mwl8k.c
index 32261189bcef..8ff43c281411 100644
--- a/drivers/net/wireless/mwl8k.c
+++ b/drivers/net/wireless/mwl8k.c
@@ -10,6 +10,7 @@
  */
 
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
diff --git a/drivers/net/wireless/p54/p54pci.h b/drivers/net/wireless/p54/p54pci.h
index ee9bc62a4fa2..7aa509f7e387 100644
--- a/drivers/net/wireless/p54/p54pci.h
+++ b/drivers/net/wireless/p54/p54pci.h
@@ -1,5 +1,6 @@
 #ifndef P54PCI_H
 #define P54PCI_H
+#include <linux/interrupt.h>
 
 /*
  * Defines for PCI based mac80211 Prism54 driver
diff --git a/drivers/net/wireless/prism54/islpci_dev.c b/drivers/net/wireless/prism54/islpci_dev.c
index ec2c75d77cea..5d0f61508a2e 100644
--- a/drivers/net/wireless/prism54/islpci_dev.c
+++ b/drivers/net/wireless/prism54/islpci_dev.c
@@ -18,6 +18,7 @@
  *
  */
 
+#include <linux/hardirq.h>
 #include <linux/module.h>
 #include <linux/slab.h>
 
diff --git a/drivers/net/wireless/prism54/islpci_dev.h b/drivers/net/wireless/prism54/islpci_dev.h
index c4d0f19b7cbc..c40403877f97 100644
--- a/drivers/net/wireless/prism54/islpci_dev.h
+++ b/drivers/net/wireless/prism54/islpci_dev.h
@@ -22,6 +22,7 @@
 #ifndef _ISLPCI_DEV_H
 #define _ISLPCI_DEV_H
 
+#include <linux/irqreturn.h>
 #include <linux/netdevice.h>
 #include <linux/wireless.h>
 #include <net/iw_handler.h>
diff --git a/drivers/net/wireless/prism54/islpci_hotplug.c b/drivers/net/wireless/prism54/islpci_hotplug.c
index b5e64d71b7a6..9e68e0cb718e 100644
--- a/drivers/net/wireless/prism54/islpci_hotplug.c
+++ b/drivers/net/wireless/prism54/islpci_hotplug.c
@@ -17,6 +17,7 @@
  *
  */
 
+#include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/pci.h>
 #include <linux/delay.h>
diff --git a/drivers/net/wireless/rt2x00/rt2x00.h b/drivers/net/wireless/rt2x00/rt2x00.h
index c446db69bd3c..4efaf886fb89 100644
--- a/drivers/net/wireless/rt2x00/rt2x00.h
+++ b/drivers/net/wireless/rt2x00/rt2x00.h
@@ -29,6 +29,7 @@
 #define RT2X00_H
 
 #include <linux/bitops.h>
+#include <linux/interrupt.h>
 #include <linux/skbuff.h>
 #include <linux/workqueue.h>
 #include <linux/firmware.h>
diff --git a/drivers/net/wireless/rtl818x/rtl8180/dev.c b/drivers/net/wireless/rtl818x/rtl8180/dev.c
index 80db5cabc9b9..66b29dc07cc3 100644
--- a/drivers/net/wireless/rtl818x/rtl8180/dev.c
+++ b/drivers/net/wireless/rtl818x/rtl8180/dev.c
@@ -16,6 +16,7 @@
  */
 
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/pci.h>
 #include <linux/slab.h>
 #include <linux/delay.h>
diff --git a/drivers/net/wireless/wl1251/sdio.c b/drivers/net/wireless/wl1251/sdio.c
index f51a0241a440..f78694295c39 100644
--- a/drivers/net/wireless/wl1251/sdio.c
+++ b/drivers/net/wireless/wl1251/sdio.c
@@ -19,6 +19,7 @@
  * Copyright (C) 2008 Google Inc
  * Copyright (C) 2009 Bob Copeland (me@bobcopeland.com)
  */
+#include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/mod_devicetable.h>
 #include <linux/mmc/sdio_func.h>
diff --git a/drivers/net/wireless/wl1251/spi.c b/drivers/net/wireless/wl1251/spi.c
index af6448c4d3e2..eaa5f9556200 100644
--- a/drivers/net/wireless/wl1251/spi.c
+++ b/drivers/net/wireless/wl1251/spi.c
@@ -19,6 +19,7 @@
  *
  */
 
+#include <linux/interrupt.h>
 #include <linux/irq.h>
 #include <linux/module.h>
 #include <linux/slab.h>
diff --git a/drivers/net/wireless/wl12xx/io.h b/drivers/net/wireless/wl12xx/io.h
index beed621a8ae0..20b00319e444 100644
--- a/drivers/net/wireless/wl12xx/io.h
+++ b/drivers/net/wireless/wl12xx/io.h
@@ -25,6 +25,7 @@
 #ifndef __IO_H__
 #define __IO_H__
 
+#include <linux/irqreturn.h>
 #include "reg.h"
 
 #define HW_ACCESS_MEMORY_MAX_RANGE	0x1FFC0
diff --git a/drivers/net/wireless/wl12xx/spi.c b/drivers/net/wireless/wl12xx/spi.c
index 51662bb68019..beebf64c5359 100644
--- a/drivers/net/wireless/wl12xx/spi.c
+++ b/drivers/net/wireless/wl12xx/spi.c
@@ -21,6 +21,7 @@
  *
  */
 
+#include <linux/interrupt.h>
 #include <linux/irq.h>
 #include <linux/module.h>
 #include <linux/crc7.h>
diff --git a/drivers/s390/net/ctcm_mpc.h b/drivers/s390/net/ctcm_mpc.h
index 5336120cddf1..1fa07b0c11c0 100644
--- a/drivers/s390/net/ctcm_mpc.h
+++ b/drivers/s390/net/ctcm_mpc.h
@@ -12,6 +12,7 @@
 #ifndef _CTC_MPC_H_
 #define _CTC_MPC_H_
 
+#include <linux/interrupt.h>
 #include <linux/skbuff.h>
 #include "fsm.h"
 
diff --git a/include/linux/arcdevice.h b/include/linux/arcdevice.h
index 7d650a0e3d8f..7216b0daf544 100644
--- a/include/linux/arcdevice.h
+++ b/include/linux/arcdevice.h
@@ -20,6 +20,7 @@
 #include <linux/if_arcnet.h>
 
 #ifdef __KERNEL__
+#include  <linux/irqreturn.h>
 
 #ifndef bool
 #define bool int
diff --git a/include/linux/dccp.h b/include/linux/dccp.h
index d638e85dc501..710c04302a15 100644
--- a/include/linux/dccp.h
+++ b/include/linux/dccp.h
@@ -236,6 +236,7 @@ enum dccp_packet_dequeueing_policy {
 #ifdef __KERNEL__
 
 #include <linux/in.h>
+#include <linux/interrupt.h>
 #include <linux/ktime.h>
 #include <linux/list.h>
 #include <linux/uio.h>
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ca333e79e10f..336288567c78 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1563,7 +1563,6 @@ struct packet_type {
 	struct list_head	list;
 };
 
-#include <linux/interrupt.h>
 #include <linux/notifier.h>
 
 extern rwlock_t				dev_base_lock;		/* Device list lock */
diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 6c994c004d15..7851c05833e2 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -25,6 +25,7 @@
 #ifndef __HCI_CORE_H
 #define __HCI_CORE_H
 
+#include <linux/interrupt.h>
 #include <net/bluetooth/hci.h>
 
 /* HCI upper protocols */
diff --git a/include/net/sock.h b/include/net/sock.h
index f2046e404a61..ebbc0bafe661 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -40,6 +40,7 @@
 #ifndef _SOCK_H
 #define _SOCK_H
 
+#include <linux/hardirq.h>
 #include <linux/kernel.h>
 #include <linux/list.h>
 #include <linux/list_nulls.h>
diff --git a/net/atm/pppoatm.c b/net/atm/pppoatm.c
index e9aced0ec56b..db4a11c61d15 100644
--- a/net/atm/pppoatm.c
+++ b/net/atm/pppoatm.c
@@ -37,6 +37,7 @@
 
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/skbuff.h>
 #include <linux/slab.h>
 #include <linux/atm.h>
diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c
index adbb424403d4..c628a57953c9 100644
--- a/net/caif/chnl_net.c
+++ b/net/caif/chnl_net.c
@@ -9,6 +9,7 @@
 
 #include <linux/version.h>
 #include <linux/fs.h>
+#include <linux/hardirq.h>
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/netdevice.h>
diff --git a/net/can/bcm.c b/net/can/bcm.c
index 184a6572b67e..d6c8ae5b2e6a 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -43,6 +43,7 @@
 
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/hrtimer.h>
 #include <linux/list.h>
 #include <linux/proc_fs.h>
diff --git a/net/rds/ib.h b/net/rds/ib.h
index 4297d92788dc..edfaaaf164eb 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -3,6 +3,7 @@
 
 #include <rdma/ib_verbs.h>
 #include <rdma/rdma_cm.h>
+#include <linux/interrupt.h>
 #include <linux/pci.h>
 #include <linux/slab.h>
 #include "rds.h"
diff --git a/net/rds/iw.h b/net/rds/iw.h
index 90151922178c..04ce3b193f79 100644
--- a/net/rds/iw.h
+++ b/net/rds/iw.h
@@ -1,6 +1,7 @@
 #ifndef _RDS_IW_H
 #define _RDS_IW_H
 
+#include <linux/interrupt.h>
 #include <rdma/ib_verbs.h>
 #include <rdma/rdma_cm.h>
 #include "rds.h"
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index 3f08158b8688..e25e49061a0d 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -5,6 +5,7 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/string.h>
 #include <linux/errno.h>
 #include <linux/skbuff.h>
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index c3c232a88d94..a385430c722a 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -42,6 +42,7 @@
 #include <linux/sunrpc/svc_xprt.h>
 #include <linux/sunrpc/debug.h>
 #include <linux/sunrpc/rpc_rdma.h>
+#include <linux/interrupt.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 80f8da344df5..28236bab57f9 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -47,6 +47,7 @@
  *  o buffer memory
  */
 
+#include <linux/interrupt.h>
 #include <linux/pci.h>	/* for Tavor hack below */
 #include <linux/slab.h>
 
-- 
cgit v1.2.3


From 5cec93c216db77c45f7ce970d46283bcb1933884 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@MIT.EDU>
Date: Sun, 5 Jun 2011 13:50:24 -0400
Subject: x86-64: Emulate legacy vsyscalls

There's a fair amount of code in the vsyscall page.  It contains
a syscall instruction (in the gettimeofday fallback) and who
knows what will happen if an exploit jumps into the middle of
some other code.

Reduce the risk by replacing the vsyscalls with short magic
incantations that cause the kernel to emulate the real
vsyscalls. These incantations are useless if entered in the
middle.

This causes vsyscalls to be a little more expensive than real
syscalls.  Fortunately sensible programs don't use them.
The only exception is time() which is still called by glibc
through the vsyscall - but calling time() millions of times
per second is not sensible. glibc has this fixed in the
development tree.

This patch is not perfect: the vread_tsc and vread_hpet
functions are still at a fixed address.  Fixing that might
involve making alternative patching work in the vDSO.

Signed-off-by: Andy Lutomirski <luto@mit.edu>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Jesper Juhl <jj@chaosbits.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Jan Beulich <JBeulich@novell.com>
Cc: richard -rw- weinberger <richard.weinberger@gmail.com>
Cc: Mikael Pettersson <mikpe@it.uu.se>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Louis Rilling <Louis.Rilling@kerlabs.com>
Cc: Valdis.Kletnieks@vt.edu
Cc: pageexec@freemail.hu
Link: http://lkml.kernel.org/r/e64e1b3c64858820d12c48fa739efbd1485e79d5.1307292171.git.luto@mit.edu
[ Removed the CONFIG option - it's simpler to just do it unconditionally. Tidied up the code as well. ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/irq_vectors.h |   6 +-
 arch/x86/include/asm/traps.h       |   4 +
 arch/x86/include/asm/vsyscall.h    |  12 ++
 arch/x86/kernel/Makefile           |   1 +
 arch/x86/kernel/entry_64.S         |   2 +
 arch/x86/kernel/traps.c            |   6 +
 arch/x86/kernel/vsyscall_64.c      | 261 +++++++++++++++++--------------------
 arch/x86/kernel/vsyscall_emu_64.S  |  27 ++++
 include/linux/seccomp.h            |  10 ++
 9 files changed, 189 insertions(+), 140 deletions(-)
 create mode 100644 arch/x86/kernel/vsyscall_emu_64.S

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 6e976ee3b3ef..a563c509edcb 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -17,7 +17,8 @@
  *  Vectors   0 ...  31 : system traps and exceptions - hardcoded events
  *  Vectors  32 ... 127 : device interrupts
  *  Vector  128         : legacy int80 syscall interface
- *  Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 : device interrupts
+ *  Vector  204         : legacy x86_64 vsyscall emulation
+ *  Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 except 204 : device interrupts
  *  Vectors INVALIDATE_TLB_VECTOR_START ... 255 : special interrupts
  *
  * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table.
@@ -50,6 +51,9 @@
 #ifdef CONFIG_X86_32
 # define SYSCALL_VECTOR			0x80
 #endif
+#ifdef CONFIG_X86_64
+# define VSYSCALL_EMU_VECTOR		0xcc
+#endif
 
 /*
  * Vectors 0x30-0x3f are used for ISA interrupts.
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 0310da67307f..2bae0a513b40 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -1,6 +1,8 @@
 #ifndef _ASM_X86_TRAPS_H
 #define _ASM_X86_TRAPS_H
 
+#include <linux/kprobes.h>
+
 #include <asm/debugreg.h>
 #include <asm/siginfo.h>			/* TRAP_TRACE, ... */
 
@@ -38,6 +40,7 @@ asmlinkage void alignment_check(void);
 asmlinkage void machine_check(void);
 #endif /* CONFIG_X86_MCE */
 asmlinkage void simd_coprocessor_error(void);
+asmlinkage void emulate_vsyscall(void);
 
 dotraplinkage void do_divide_error(struct pt_regs *, long);
 dotraplinkage void do_debug(struct pt_regs *, long);
@@ -64,6 +67,7 @@ dotraplinkage void do_alignment_check(struct pt_regs *, long);
 dotraplinkage void do_machine_check(struct pt_regs *, long);
 #endif
 dotraplinkage void do_simd_coprocessor_error(struct pt_regs *, long);
+dotraplinkage void do_emulate_vsyscall(struct pt_regs *, long);
 #ifdef CONFIG_X86_32
 dotraplinkage void do_iret_error(struct pt_regs *, long);
 #endif
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
index d55597351f6a..bb710cb0cdc1 100644
--- a/arch/x86/include/asm/vsyscall.h
+++ b/arch/x86/include/asm/vsyscall.h
@@ -31,6 +31,18 @@ extern struct timezone sys_tz;
 
 extern void map_vsyscall(void);
 
+/* Emulation */
+
+static inline bool is_vsyscall_entry(unsigned long addr)
+{
+	return (addr & ~0xC00UL) == VSYSCALL_START;
+}
+
+static inline int vsyscall_entry_nr(unsigned long addr)
+{
+	return (addr & 0xC00UL) >> 10;
+}
+
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_X86_VSYSCALL_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 90b06d4daee2..cc0469a65120 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -44,6 +44,7 @@ obj-y			+= probe_roms.o
 obj-$(CONFIG_X86_32)	+= sys_i386_32.o i386_ksyms_32.o
 obj-$(CONFIG_X86_64)	+= sys_x86_64.o x8664_ksyms_64.o
 obj-$(CONFIG_X86_64)	+= syscall_64.o vsyscall_64.o vread_tsc_64.o
+obj-$(CONFIG_X86_64)	+= vsyscall_emu_64.o
 obj-y			+= bootflag.o e820.o
 obj-y			+= pci-dma.o quirks.o topology.o kdebugfs.o
 obj-y			+= alternative.o i8253.o pci-nommu.o hw_breakpoint.o
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 72c4a777bb91..e949793d6b93 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1123,6 +1123,8 @@ zeroentry spurious_interrupt_bug do_spurious_interrupt_bug
 zeroentry coprocessor_error do_coprocessor_error
 errorentry alignment_check do_alignment_check
 zeroentry simd_coprocessor_error do_simd_coprocessor_error
+zeroentry emulate_vsyscall do_emulate_vsyscall
+
 
 	/* Reload gs selector with exception handling */
 	/* edi:  new selector */
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index b9b67166f9de..fbc097a085ca 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -872,6 +872,12 @@ void __init trap_init(void)
 	set_bit(SYSCALL_VECTOR, used_vectors);
 #endif
 
+#ifdef CONFIG_X86_64
+	BUG_ON(test_bit(VSYSCALL_EMU_VECTOR, used_vectors));
+	set_system_intr_gate(VSYSCALL_EMU_VECTOR, &emulate_vsyscall);
+	set_bit(VSYSCALL_EMU_VECTOR, used_vectors);
+#endif
+
 	/*
 	 * Should be a barrier for any external CPU state:
 	 */
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 70a5f6eebd6c..10cd8ac3395a 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -2,6 +2,8 @@
  *  Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
  *  Copyright 2003 Andi Kleen, SuSE Labs.
  *
+ *  [ NOTE: this mechanism is now deprecated in favor of the vDSO. ]
+ *
  *  Thanks to hpa@transmeta.com for some useful hint.
  *  Special thanks to Ingo Molnar for his early experience with
  *  a different vsyscall implementation for Linux/IA32 and for the name.
@@ -11,10 +13,9 @@
  *  vsyscalls. One vsyscall can reserve more than 1 slot to avoid
  *  jumping out of line if necessary. We cannot add more with this
  *  mechanism because older kernels won't return -ENOSYS.
- *  If we want more than four we need a vDSO.
  *
- *  Note: the concept clashes with user mode linux. If you use UML and
- *  want per guest time just set the kernel.vsyscall64 sysctl to 0.
+ *  Note: the concept clashes with user mode linux.  UML users should
+ *  use the vDSO.
  */
 
 /* Disable profiling for userspace code: */
@@ -32,6 +33,8 @@
 #include <linux/cpu.h>
 #include <linux/smp.h>
 #include <linux/notifier.h>
+#include <linux/syscalls.h>
+#include <linux/ratelimit.h>
 
 #include <asm/vsyscall.h>
 #include <asm/pgtable.h>
@@ -44,10 +47,7 @@
 #include <asm/desc.h>
 #include <asm/topology.h>
 #include <asm/vgtod.h>
-
-#define __vsyscall(nr) \
-		__attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
-#define __syscall_clobber "r11","cx","memory"
+#include <asm/traps.h>
 
 DEFINE_VVAR(int, vgetcpu_mode);
 DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
@@ -71,146 +71,129 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
 	unsigned long flags;
 
 	write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
+
 	/* copy vsyscall data */
-	vsyscall_gtod_data.clock.vread = clock->vread;
-	vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
-	vsyscall_gtod_data.clock.mask = clock->mask;
-	vsyscall_gtod_data.clock.mult = mult;
-	vsyscall_gtod_data.clock.shift = clock->shift;
-	vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
-	vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
-	vsyscall_gtod_data.wall_to_monotonic = *wtm;
-	vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
+	vsyscall_gtod_data.clock.vread		= clock->vread;
+	vsyscall_gtod_data.clock.cycle_last	= clock->cycle_last;
+	vsyscall_gtod_data.clock.mask		= clock->mask;
+	vsyscall_gtod_data.clock.mult		= mult;
+	vsyscall_gtod_data.clock.shift		= clock->shift;
+	vsyscall_gtod_data.wall_time_sec	= wall_time->tv_sec;
+	vsyscall_gtod_data.wall_time_nsec	= wall_time->tv_nsec;
+	vsyscall_gtod_data.wall_to_monotonic	= *wtm;
+	vsyscall_gtod_data.wall_time_coarse	= __current_kernel_time();
+
 	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
 }
 
-/* RED-PEN may want to readd seq locking, but then the variable should be
- * write-once.
- */
-static __always_inline void do_get_tz(struct timezone * tz)
+static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
+			      const char *message)
 {
-	*tz = VVAR(vsyscall_gtod_data).sys_tz;
-}
+	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
+	struct task_struct *tsk;
 
-static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
-{
-	int ret;
-	asm volatile("syscall"
-		: "=a" (ret)
-		: "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
-		: __syscall_clobber );
-	return ret;
-}
+	if (!show_unhandled_signals || !__ratelimit(&rs))
+		return;
 
-static __always_inline void do_vgettimeofday(struct timeval * tv)
-{
-	cycle_t now, base, mask, cycle_delta;
-	unsigned seq;
-	unsigned long mult, shift, nsec;
-	cycle_t (*vread)(void);
-	do {
-		seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock);
-
-		vread = VVAR(vsyscall_gtod_data).clock.vread;
-		if (unlikely(!vread)) {
-			gettimeofday(tv,NULL);
-			return;
-		}
-
-		now = vread();
-		base = VVAR(vsyscall_gtod_data).clock.cycle_last;
-		mask = VVAR(vsyscall_gtod_data).clock.mask;
-		mult = VVAR(vsyscall_gtod_data).clock.mult;
-		shift = VVAR(vsyscall_gtod_data).clock.shift;
-
-		tv->tv_sec = VVAR(vsyscall_gtod_data).wall_time_sec;
-		nsec = VVAR(vsyscall_gtod_data).wall_time_nsec;
-	} while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq));
-
-	/* calculate interval: */
-	cycle_delta = (now - base) & mask;
-	/* convert to nsecs: */
-	nsec += (cycle_delta * mult) >> shift;
-
-	while (nsec >= NSEC_PER_SEC) {
-		tv->tv_sec += 1;
-		nsec -= NSEC_PER_SEC;
-	}
-	tv->tv_usec = nsec / NSEC_PER_USEC;
-}
+	tsk = current;
 
-int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
-{
-	if (tv)
-		do_vgettimeofday(tv);
-	if (tz)
-		do_get_tz(tz);
-	return 0;
+	printk("%s%s[%d] %s ip:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
+	       level, tsk->comm, task_pid_nr(tsk),
+	       message, regs->ip - 2, regs->sp, regs->ax, regs->si, regs->di);
 }
 
-/* This will break when the xtime seconds get inaccurate, but that is
- * unlikely */
-time_t __vsyscall(1) vtime(time_t *t)
+void dotraplinkage do_emulate_vsyscall(struct pt_regs *regs, long error_code)
 {
-	unsigned seq;
-	time_t result;
+	const char *vsyscall_name;
+	struct task_struct *tsk;
+	unsigned long caller;
+	int vsyscall_nr;
+	long ret;
+
+	/* Kernel code must never get here. */
+	BUG_ON(!user_mode(regs));
+
+	local_irq_enable();
+
+	/*
+	 * x86-ism here: regs->ip points to the instruction after the int 0xcc,
+	 * and int 0xcc is two bytes long.
+	 */
+	if (!is_vsyscall_entry(regs->ip - 2)) {
+		warn_bad_vsyscall(KERN_WARNING, regs, "illegal int 0xcc (exploit attempt?)");
+		goto sigsegv;
+	}
+	vsyscall_nr = vsyscall_entry_nr(regs->ip - 2);
 
-	do {
-		seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock);
+	if (get_user(caller, (unsigned long __user *)regs->sp) != 0) {
+		warn_bad_vsyscall(KERN_WARNING, regs, "int 0xcc with bad stack (exploit attempt?)");
+		goto sigsegv;
+	}
 
-		result = VVAR(vsyscall_gtod_data).wall_time_sec;
+	tsk = current;
+	if (seccomp_mode(&tsk->seccomp))
+		do_exit(SIGKILL);
+
+	switch (vsyscall_nr) {
+	case 0:
+		vsyscall_name = "gettimeofday";
+		ret = sys_gettimeofday(
+			(struct timeval __user *)regs->di,
+			(struct timezone __user *)regs->si);
+		break;
+
+	case 1:
+		vsyscall_name = "time";
+		ret = sys_time((time_t __user *)regs->di);
+		break;
+
+	case 2:
+		vsyscall_name = "getcpu";
+		ret = sys_getcpu((unsigned __user *)regs->di,
+				 (unsigned __user *)regs->si,
+				 0);
+		break;
+
+	default:
+		/*
+		 * If we get here, then vsyscall_nr indicates that int 0xcc
+		 * happened at an address in the vsyscall page that doesn't
+		 * contain int 0xcc.  That can't happen.
+		 */
+		BUG();
+	}
 
-	} while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq));
+	if (ret == -EFAULT) {
+		/*
+		 * Bad news -- userspace fed a bad pointer to a vsyscall.
+		 *
+		 * With a real vsyscall, that would have caused SIGSEGV.
+		 * To make writing reliable exploits using the emulated
+		 * vsyscalls harder, generate SIGSEGV here as well.
+		 */
+		warn_bad_vsyscall(KERN_INFO, regs,
+				  "vsyscall fault (exploit attempt?)");
+		goto sigsegv;
+	}
 
-	if (t)
-		*t = result;
-	return result;
-}
+	regs->ax = ret;
 
-/* Fast way to get current CPU and node.
-   This helps to do per node and per CPU caches in user space.
-   The result is not guaranteed without CPU affinity, but usually
-   works out because the scheduler tries to keep a thread on the same
-   CPU.
+	/* Emulate a ret instruction. */
+	regs->ip = caller;
+	regs->sp += 8;
 
-   tcache must point to a two element sized long array.
-   All arguments can be NULL. */
-long __vsyscall(2)
-vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
-{
-	unsigned int p;
-	unsigned long j = 0;
-
-	/* Fast cache - only recompute value once per jiffies and avoid
-	   relatively costly rdtscp/cpuid otherwise.
-	   This works because the scheduler usually keeps the process
-	   on the same CPU and this syscall doesn't guarantee its
-	   results anyways.
-	   We do this here because otherwise user space would do it on
-	   its own in a likely inferior way (no access to jiffies).
-	   If you don't like it pass NULL. */
-	if (tcache && tcache->blob[0] == (j = VVAR(jiffies))) {
-		p = tcache->blob[1];
-	} else if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) {
-		/* Load per CPU data from RDTSCP */
-		native_read_tscp(&p);
-	} else {
-		/* Load per CPU data from GDT */
-		asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
-	}
-	if (tcache) {
-		tcache->blob[0] = j;
-		tcache->blob[1] = p;
-	}
-	if (cpu)
-		*cpu = p & 0xfff;
-	if (node)
-		*node = p >> 12;
-	return 0;
+	local_irq_disable();
+	return;
+
+sigsegv:
+	regs->ip -= 2;  /* The faulting instruction should be the int 0xcc. */
+	force_sig(SIGSEGV, current);
 }
 
-/* Assume __initcall executes before all user space. Hopefully kmod
-   doesn't violate that. We'll find out if it does. */
+/*
+ * Assume __initcall executes before all user space. Hopefully kmod
+ * doesn't violate that. We'll find out if it does.
+ */
 static void __cpuinit vsyscall_set_cpu(int cpu)
 {
 	unsigned long d;
@@ -221,13 +204,15 @@ static void __cpuinit vsyscall_set_cpu(int cpu)
 	if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
 		write_rdtscp_aux((node << 12) | cpu);
 
-	/* Store cpu number in limit so that it can be loaded quickly
-	   in user space in vgetcpu.
-	   12 bits for the CPU and 8 bits for the node. */
+	/*
+	 * Store cpu number in limit so that it can be loaded quickly
+	 * in user space in vgetcpu. (12 bits for the CPU and 8 bits for the node)
+	 */
 	d = 0x0f40000000000ULL;
 	d |= cpu;
 	d |= (node & 0xf) << 12;
 	d |= (node >> 4) << 48;
+
 	write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
 }
 
@@ -241,8 +226,10 @@ static int __cpuinit
 cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
 {
 	long cpu = (long)arg;
+
 	if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
 		smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);
+
 	return NOTIFY_DONE;
 }
 
@@ -256,21 +243,17 @@ void __init map_vsyscall(void)
 	/* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
 	__set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
 	__set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR);
-	BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) !=
-		     (unsigned long)VVAR_ADDRESS);
+	BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) != (unsigned long)VVAR_ADDRESS);
 }
 
 static int __init vsyscall_init(void)
 {
-	BUG_ON(((unsigned long) &vgettimeofday !=
-			VSYSCALL_ADDR(__NR_vgettimeofday)));
-	BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
-	BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
-	BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
+	BUG_ON(VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE));
+
 	on_each_cpu(cpu_vsyscall_init, NULL, 1);
 	/* notifier priority > KVM */
 	hotcpu_notifier(cpu_vsyscall_notifier, 30);
+
 	return 0;
 }
-
 __initcall(vsyscall_init);
diff --git a/arch/x86/kernel/vsyscall_emu_64.S b/arch/x86/kernel/vsyscall_emu_64.S
new file mode 100644
index 000000000000..ffa845eae5ca
--- /dev/null
+++ b/arch/x86/kernel/vsyscall_emu_64.S
@@ -0,0 +1,27 @@
+/*
+ * vsyscall_emu_64.S: Vsyscall emulation page
+ *
+ * Copyright (c) 2011 Andy Lutomirski
+ *
+ * Subject to the GNU General Public License, version 2
+ */
+
+#include <linux/linkage.h>
+#include <asm/irq_vectors.h>
+
+/* The unused parts of the page are filled with 0xcc by the linker script. */
+
+.section .vsyscall_0, "a"
+ENTRY(vsyscall_0)
+	int $VSYSCALL_EMU_VECTOR
+END(vsyscall_0)
+
+.section .vsyscall_1, "a"
+ENTRY(vsyscall_1)
+	int $VSYSCALL_EMU_VECTOR
+END(vsyscall_1)
+
+.section .vsyscall_2, "a"
+ENTRY(vsyscall_2)
+	int $VSYSCALL_EMU_VECTOR
+END(vsyscall_2)
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 167c33361d9c..cc7a4e9cc7ad 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -19,6 +19,11 @@ static inline void secure_computing(int this_syscall)
 extern long prctl_get_seccomp(void);
 extern long prctl_set_seccomp(unsigned long);
 
+static inline int seccomp_mode(seccomp_t *s)
+{
+	return s->mode;
+}
+
 #else /* CONFIG_SECCOMP */
 
 #include <linux/errno.h>
@@ -37,6 +42,11 @@ static inline long prctl_set_seccomp(unsigned long arg2)
 	return -EINVAL;
 }
 
+static inline int seccomp_mode(seccomp_t *s)
+{
+	return 0;
+}
+
 #endif /* CONFIG_SECCOMP */
 
 #endif /* _LINUX_SECCOMP_H */
-- 
cgit v1.2.3


From 6dc1418e13144162e8bc4858789010d8f0e1e65c Mon Sep 17 00:00:00 2001
From: Tomoki Sekiyama <tomoki.sekiyama@gmail.com>
Date: Mon, 23 May 2011 15:45:44 -0700
Subject: HID: yurex: recognize GeneralKeys wireless presenter as generic HID

Unfortunately, the device seems to have the same Vendor ID and Product ID
as YUREX leg-shakes sensors, and the commit 6bc235a2e2 ("USB: add driver
for Meywa-Denki & Kayac YUREX") added the ID to hid_ignore_list.

I believe that we can distinguish YUREX and the Wireless Presenter by
device type.  The patch below makes the driver ignore only YUREX
(bInterfaceProtocol==0), and recognize Wireless Presenter
(bInterfaceProtocol is keyboard or mouse) as generic HID.  (I don't have
the Wireless Presenter, so not yet ested.)

** YUREX lsusb information:
Bus 002 Device 007: ID 0c45:1010 Microdia
Device Descriptor:
   bLength                18
   bDescriptorType         1
   bcdUSB               1.10
   bDeviceClass            0 (Defined at Interface level)
   bDeviceSubClass         0
   bDeviceProtocol         0
   bMaxPacketSize0         8
   idVendor           0x0c45 Microdia
   idProduct          0x1010
   bcdDevice            0.03
   iManufacturer           1 JESS
   iProduct                2 YUREX
   iSerial                 3 10000269
   bNumConfigurations      1
   Configuration Descriptor:
     bLength                 9
     bDescriptorType         2
     wTotalLength           34
     bNumInterfaces          1
     bConfigurationValue     1
     iConfiguration          0
     bmAttributes         0xa0
       (Bus Powered)
       Remote Wakeup
     MaxPower              100mA
     Interface Descriptor:
       bLength                 9
       bDescriptorType         4
       bInterfaceNumber        0
       bAlternateSetting       0
       bNumEndpoints           1
       bInterfaceClass         3 Human Interface Device
       bInterfaceSubClass      1 Boot Interface Subclass
       bInterfaceProtocol      0 None
       iInterface              0
         HID Device Descriptor:
           bLength                 9
           bDescriptorType        33
           bcdHID               1.10
           bCountryCode            0 Not supported
           bNumDescriptors         1
           bDescriptorType        34 Report
           wDescriptorLength      31
          Report Descriptors:
            ** UNAVAILABLE **
       Endpoint Descriptor:
         bLength                 7
         bDescriptorType         5
         bEndpointAddress     0x81  EP 1 IN
         bmAttributes            3
           Transfer Type            Interrupt
           Synch Type               None
           Usage Type               Data
         wMaxPacketSize     0x0008  1x 8 bytes
         bInterval              10
Device Status:     0x0002
   (Bus Powered)
   Remote Wakeup Enabled

Addresses https://bugzilla.kernel.org/show_bug.cgi?id=26922

Signed-off-by: Tomoki Sekiyama <tomoki.sekiyama@gmail.com>
Cc: Greg KH <gregkh@suse.de>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Maciej Rutecki <maciej.rutecki@gmail.com>
Reported-by: Thomas B?chler <thomas@archlinux.org>
Tested-by: Thomas B?chler <thomas@archlinux.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/hid/hid-core.c        | 6 +++++-
 drivers/hid/usbhid/hid-core.c | 2 ++
 include/linux/hid.h           | 3 ++-
 3 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c
index c957c4b4fe70..9d6495d19e02 100644
--- a/drivers/hid/hid-core.c
+++ b/drivers/hid/hid-core.c
@@ -1769,7 +1769,6 @@ static const struct hid_device_id hid_ignore_list[] = {
 	{ HID_USB_DEVICE(USB_VENDOR_ID_GTCO, USB_DEVICE_ID_GTCO_1006) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_GTCO, USB_DEVICE_ID_GTCO_1007) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_IMATION, USB_DEVICE_ID_DISC_STAKKA) },
-	{ HID_USB_DEVICE(USB_VENDOR_ID_JESS, USB_DEVICE_ID_JESS_YUREX) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_KBGEAR, USB_DEVICE_ID_KBGEAR_JAMSTUDIO) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_KWORLD, USB_DEVICE_ID_KWORLD_RADIO_FM700) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_KYE_GPEN_560) },
@@ -1910,6 +1909,11 @@ static bool hid_ignore(struct hid_device *hdev)
 		    hdev->product <= USB_DEVICE_ID_HANWANG_TABLET_LAST)
 			return true;
 		break;
+	case USB_VENDOR_ID_JESS:
+		if (hdev->product == USB_DEVICE_ID_JESS_YUREX &&
+				hdev->type == HID_TYPE_USBNONE)
+			return true;
+	break;
 	}
 
 	if (hdev->type == HID_TYPE_USBMOUSE &&
diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c
index 38c261a40c74..ad978f5748d3 100644
--- a/drivers/hid/usbhid/hid-core.c
+++ b/drivers/hid/usbhid/hid-core.c
@@ -1191,6 +1191,8 @@ static int usbhid_probe(struct usb_interface *intf, const struct usb_device_id *
 	if (intf->cur_altsetting->desc.bInterfaceProtocol ==
 			USB_INTERFACE_PROTOCOL_MOUSE)
 		hid->type = HID_TYPE_USBMOUSE;
+	else if (intf->cur_altsetting->desc.bInterfaceProtocol == 0)
+		hid->type = HID_TYPE_USBNONE;
 
 	if (dev->manufacturer)
 		strlcpy(hid->name, dev->manufacturer, sizeof(hid->name));
diff --git a/include/linux/hid.h b/include/linux/hid.h
index 42f7e2fb501f..9cf8e7ae7450 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -453,7 +453,8 @@ struct hid_input {
 
 enum hid_type {
 	HID_TYPE_OTHER = 0,
-	HID_TYPE_USBMOUSE
+	HID_TYPE_USBMOUSE,
+	HID_TYPE_USBNONE
 };
 
 struct hid_driver;
-- 
cgit v1.2.3


From ea2c00095c022846dd8dfd211de05154d3e4e1b8 Mon Sep 17 00:00:00 2001
From: "K. Y. Srinivasan" <kys@microsoft.com>
Date: Tue, 17 May 2011 15:25:37 -0700
Subject: Connector: Set the CN_NETLINK_USERS correctly

The CN_NETLINK_USERS must be set to the highest valid index +1.
Thanks to Evgeniy for pointing this out.

Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Acked-by: Evgeniy Polyakov <zbr@ioremap.net>
Cc: stable <stable@kernel.org> [.39]
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/connector.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/connector.h b/include/linux/connector.h
index 7c60d0942adb..f696bccd48cb 100644
--- a/include/linux/connector.h
+++ b/include/linux/connector.h
@@ -44,7 +44,7 @@
 #define CN_VAL_DRBD			0x1
 #define CN_KVP_IDX			0x9	/* HyperV KVP */
 
-#define CN_NETLINK_USERS		9
+#define CN_NETLINK_USERS		10	/* Highest index + 1 */
 
 /*
  * Maximum connector's message size.
-- 
cgit v1.2.3


From 98d9f30c820d509145757e6ecbc36013aa02f7bc Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Mon, 11 Apr 2011 11:37:07 +1000
Subject: pci/of: Match PCI devices to OF nodes dynamically

powerpc has two different ways of matching PCI devices to their
corresponding OF node (if any) for historical reasons. The ppc64 one
does a scan looking for matching bus/dev/fn, while the ppc32 one does a
scan looking only for matching dev/fn on each level in order to be
agnostic to busses being renumbered (which Linux does on some
platforms).

This removes both and instead moves the matching code to the PCI core
itself. It's the most logical place to do it: when a pci_dev is created,
we know the parent and thus can do a single level scan for the matching
device_node (if any).

The benefit is that all archs now get the matching for free. There's one
hook the arch might want to provide to match a PHB bus to its device
node. A default weak implementation is provided that looks for the
parent device device node, but it's not entirely reliable on powerpc for
various reasons so powerpc provides its own.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Michal Simek <monstr@monstr.eu>
Acked-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/microblaze/include/asm/pci-bridge.h |  14 +--
 arch/microblaze/include/asm/prom.h       |  15 ----
 arch/microblaze/pci/pci_32.c             |  40 ++-------
 arch/powerpc/include/asm/pci-bridge.h    |  35 +++-----
 arch/powerpc/include/asm/pci.h           |   3 +-
 arch/powerpc/include/asm/prom.h          |  14 ---
 arch/powerpc/kernel/pci-common.c         |  11 ++-
 arch/powerpc/kernel/pci_32.c             | 150 +++----------------------------
 arch/powerpc/kernel/pci_dn.c             |  47 ----------
 arch/powerpc/kernel/pci_of_scan.c        |   9 +-
 arch/powerpc/platforms/powermac/pci.c    |   3 +-
 arch/sparc/kernel/pci.c                  |   2 +-
 drivers/of/Kconfig                       |   8 +-
 drivers/of/Makefile                      |   1 +
 drivers/of/of_pci.c                      | 112 +++++++----------------
 drivers/of/of_pci_irq.c                  |  92 +++++++++++++++++++
 drivers/pci/Makefile                     |   2 +
 drivers/pci/hotplug/rpadlpar_core.c      |   2 +-
 drivers/pci/of.c                         |  61 +++++++++++++
 drivers/pci/probe.c                      |   7 +-
 include/linux/of_pci.h                   |   5 ++
 include/linux/pci.h                      |  18 ++++
 22 files changed, 275 insertions(+), 376 deletions(-)
 create mode 100644 drivers/of/of_pci_irq.c
 create mode 100644 drivers/pci/of.c

(limited to 'include/linux')

diff --git a/arch/microblaze/include/asm/pci-bridge.h b/arch/microblaze/include/asm/pci-bridge.h
index 746df91e5796..728f8d6a59a4 100644
--- a/arch/microblaze/include/asm/pci-bridge.h
+++ b/arch/microblaze/include/asm/pci-bridge.h
@@ -105,19 +105,19 @@ struct pci_controller {
 };
 
 #ifdef CONFIG_PCI
-static inline struct pci_controller *pci_bus_to_host(const struct pci_bus *bus)
+static inline struct device_node *pci_device_to_OF_node(struct pci_dev *dev)
 {
-	return bus->sysdata;
+	return dev->dev.of_node;
 }
 
 static inline struct device_node *pci_bus_to_OF_node(struct pci_bus *bus)
 {
-	struct pci_controller *host;
+	return bus->dev.of_node;
+}
 
-	if (bus->self)
-		return pci_device_to_OF_node(bus->self);
-	host = pci_bus_to_host(bus);
-	return host ? host->dn : NULL;
+static inline struct pci_controller *pci_bus_to_host(const struct pci_bus *bus)
+{
+	return bus->sysdata;
 }
 
 static inline int isa_vaddr_is_ioport(void __iomem *address)
diff --git a/arch/microblaze/include/asm/prom.h b/arch/microblaze/include/asm/prom.h
index d0890d36ef61..9bd01ecb00d6 100644
--- a/arch/microblaze/include/asm/prom.h
+++ b/arch/microblaze/include/asm/prom.h
@@ -29,21 +29,6 @@
 extern int early_uartlite_console(void);
 extern int early_uart16550_console(void);
 
-#ifdef CONFIG_PCI
-/*
- * PCI <-> OF matching functions
- * (XXX should these be here?)
- */
-struct pci_bus;
-struct pci_dev;
-extern int pci_device_from_OF_node(struct device_node *node,
-					u8 *bus, u8 *devfn);
-extern struct device_node *pci_busdev_to_OF_node(struct pci_bus *bus,
-							int devfn);
-extern struct device_node *pci_device_to_OF_node(struct pci_dev *dev);
-extern void pci_create_OF_bus_map(void);
-#endif
-
 /*
  * OF address retreival & translation
  */
diff --git a/arch/microblaze/pci/pci_32.c b/arch/microblaze/pci/pci_32.c
index 92728a6cfd80..2fa95069e6ba 100644
--- a/arch/microblaze/pci/pci_32.c
+++ b/arch/microblaze/pci/pci_32.c
@@ -210,38 +210,6 @@ static struct device_node *scan_OF_for_pci_bus(struct pci_bus *bus)
 	return np;
 }
 
-/*
- * Scans the OF tree for a device node matching a PCI device
- */
-struct device_node *
-pci_busdev_to_OF_node(struct pci_bus *bus, int devfn)
-{
-	struct device_node *parent, *np;
-
-	pr_debug("pci_busdev_to_OF_node(%d,0x%x)\n", bus->number, devfn);
-	parent = scan_OF_for_pci_bus(bus);
-	if (parent == NULL)
-		return NULL;
-	pr_debug(" parent is %s\n", parent ? parent->full_name : "<NULL>");
-	np = scan_OF_for_pci_dev(parent, devfn);
-	of_node_put(parent);
-	pr_debug(" result is %s\n", np ? np->full_name : "<NULL>");
-
-	/* XXX most callers don't release the returned node
-	 * mostly because ppc64 doesn't increase the refcount,
-	 * we need to fix that.
-	 */
-	return np;
-}
-EXPORT_SYMBOL(pci_busdev_to_OF_node);
-
-struct device_node*
-pci_device_to_OF_node(struct pci_dev *dev)
-{
-	return pci_busdev_to_OF_node(dev->bus, dev->devfn);
-}
-EXPORT_SYMBOL(pci_device_to_OF_node);
-
 static int
 find_OF_pci_device_filter(struct device_node *node, void *data)
 {
@@ -315,6 +283,13 @@ pci_create_OF_bus_map(void)
 	}
 }
 
+struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus)
+{
+	struct pci_controller *hose = bus->sysdata;
+
+	return of_node_get(hose->dn);
+}
+
 static void __devinit pcibios_scan_phb(struct pci_controller *hose)
 {
 	struct pci_bus *bus;
@@ -332,7 +307,6 @@ static void __devinit pcibios_scan_phb(struct pci_controller *hose)
 		       hose->global_number);
 		return;
 	}
-	bus.dev->of_node = of_node_get(node);
 	bus->secondary = hose->first_busno;
 	hose->bus = bus;
 
diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index b90dbf8e5cd9..3e6869476e55 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -169,18 +169,22 @@ static inline struct pci_controller *pci_bus_to_host(const struct pci_bus *bus)
 	return bus->sysdata;
 }
 
-#ifndef CONFIG_PPC64
+static inline struct device_node *pci_device_to_OF_node(struct pci_dev *dev)
+{
+	return dev->dev.of_node;
+}
 
 static inline struct device_node *pci_bus_to_OF_node(struct pci_bus *bus)
 {
-	struct pci_controller *host;
-
-	if (bus->self)
-		return pci_device_to_OF_node(bus->self);
-	host = pci_bus_to_host(bus);
-	return host ? host->dn : NULL;
+	return bus->dev.of_node;
 }
 
+#ifndef CONFIG_PPC64
+
+extern int pci_device_from_OF_node(struct device_node *node,
+				   u8 *bus, u8 *devfn);
+extern void pci_create_OF_bus_map(void);
+
 static inline int isa_vaddr_is_ioport(void __iomem *address)
 {
 	/* No specific ISA handling on ppc32 at this stage, it
@@ -223,17 +227,8 @@ struct pci_dn {
 /* Get the pointer to a device_node's pci_dn */
 #define PCI_DN(dn)	((struct pci_dn *) (dn)->data)
 
-extern struct device_node *fetch_dev_dn(struct pci_dev *dev);
 extern void * update_dn_pci_info(struct device_node *dn, void *data);
 
-/* Get a device_node from a pci_dev.  This code must be fast except
- * in the case where the sysdata is incorrect and needs to be fixed
- * up (this will only happen once). */
-static inline struct device_node *pci_device_to_OF_node(struct pci_dev *dev)
-{
-	return dev->dev.of_node ? dev->dev.of_node : fetch_dev_dn(dev);
-}
-
 static inline int pci_device_from_OF_node(struct device_node *np,
 					  u8 *bus, u8 *devfn)
 {
@@ -244,14 +239,6 @@ static inline int pci_device_from_OF_node(struct device_node *np,
 	return 0;
 }
 
-static inline struct device_node *pci_bus_to_OF_node(struct pci_bus *bus)
-{
-	if (bus->self)
-		return pci_device_to_OF_node(bus->self);
-	else
-		return bus->dev.of_node; /* Must be root bus (PHB) */
-}
-
 /** Find the bus corresponding to the indicated device node */
 extern struct pci_bus *pcibios_find_pci_bus(struct device_node *dn);
 
diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h
index 7d7790954e02..1f522680ea17 100644
--- a/arch/powerpc/include/asm/pci.h
+++ b/arch/powerpc/include/asm/pci.h
@@ -179,8 +179,7 @@ extern int remove_phb_dynamic(struct pci_controller *phb);
 extern struct pci_dev *of_create_pci_dev(struct device_node *node,
 					struct pci_bus *bus, int devfn);
 
-extern void of_scan_pci_bridge(struct device_node *node,
-				struct pci_dev *dev);
+extern void of_scan_pci_bridge(struct pci_dev *dev);
 
 extern void of_scan_bus(struct device_node *node, struct pci_bus *bus);
 extern void of_rescan_bus(struct device_node *node, struct pci_bus *bus);
diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
index c189aa5fe1f4..b823536375dc 100644
--- a/arch/powerpc/include/asm/prom.h
+++ b/arch/powerpc/include/asm/prom.h
@@ -22,20 +22,6 @@
 
 #define HAVE_ARCH_DEVTREE_FIXUPS
 
-#ifdef CONFIG_PPC32
-/*
- * PCI <-> OF matching functions
- * (XXX should these be here?)
- */
-struct pci_bus;
-struct pci_dev;
-extern int pci_device_from_OF_node(struct device_node *node,
-				   u8* bus, u8* devfn);
-extern struct device_node* pci_busdev_to_OF_node(struct pci_bus *, int);
-extern struct device_node* pci_device_to_OF_node(struct pci_dev *);
-extern void pci_create_OF_bus_map(void);
-#endif
-
 /*
  * OF address retreival & translation
  */
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index 893af2a9cd03..a3c92770e422 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -1097,9 +1097,6 @@ void __devinit pcibios_setup_bus_devices(struct pci_bus *bus)
 		if (dev->is_added)
 			continue;
 
-		/* Setup OF node pointer in the device */
-		dev->dev.of_node = pci_device_to_OF_node(dev);
-
 		/* Fixup NUMA node as it may not be setup yet by the generic
 		 * code and is needed by the DMA init
 		 */
@@ -1685,6 +1682,13 @@ int early_find_capability(struct pci_controller *hose, int bus, int devfn,
 	return pci_bus_find_capability(fake_pci_bus(hose, bus), devfn, cap);
 }
 
+struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus)
+{
+	struct pci_controller *hose = bus->sysdata;
+
+	return of_node_get(hose->dn);
+}
+
 /**
  * pci_scan_phb - Given a pci_controller, setup and scan the PCI bus
  * @hose: Pointer to the PCI host controller instance structure
@@ -1705,7 +1709,6 @@ void __devinit pcibios_scan_phb(struct pci_controller *hose)
 			hose->global_number);
 		return;
 	}
-	bus->dev.of_node = of_node_get(node);
 	bus->secondary = hose->first_busno;
 	hose->bus = bus;
 
diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c
index bedb370459f2..86585508e9c1 100644
--- a/arch/powerpc/kernel/pci_32.c
+++ b/arch/powerpc/kernel/pci_32.c
@@ -167,150 +167,26 @@ pcibios_make_OF_bus_map(void)
 #endif
 }
 
-typedef int (*pci_OF_scan_iterator)(struct device_node* node, void* data);
-
-static struct device_node*
-scan_OF_pci_childs(struct device_node *parent, pci_OF_scan_iterator filter, void* data)
-{
-	struct device_node *node;
-	struct device_node* sub_node;
-
-	for_each_child_of_node(parent, node) {
-		const unsigned int *class_code;
-	
-		if (filter(node, data)) {
-			of_node_put(node);
-			return node;
-		}
-
-		/* For PCI<->PCI bridges or CardBus bridges, we go down
-		 * Note: some OFs create a parent node "multifunc-device" as
-		 * a fake root for all functions of a multi-function device,
-		 * we go down them as well.
-		 */
-		class_code = of_get_property(node, "class-code", NULL);
-		if ((!class_code || ((*class_code >> 8) != PCI_CLASS_BRIDGE_PCI &&
-			(*class_code >> 8) != PCI_CLASS_BRIDGE_CARDBUS)) &&
-			strcmp(node->name, "multifunc-device"))
-			continue;
-		sub_node = scan_OF_pci_childs(node, filter, data);
-		if (sub_node) {
-			of_node_put(node);
-			return sub_node;
-		}
-	}
-	return NULL;
-}
-
-static struct device_node *scan_OF_for_pci_dev(struct device_node *parent,
-					       unsigned int devfn)
-{
-	struct device_node *np, *cnp;
-	const u32 *reg;
-	unsigned int psize;
-
-	for_each_child_of_node(parent, np) {
-		reg = of_get_property(np, "reg", &psize);
-                if (reg && psize >= 4 && ((reg[0] >> 8) & 0xff) == devfn)
-			return np;
-
-		/* Note: some OFs create a parent node "multifunc-device" as
-		 * a fake root for all functions of a multi-function device,
-		 * we go down them as well. */
-                if (!strcmp(np->name, "multifunc-device")) {
-                        cnp = scan_OF_for_pci_dev(np, devfn);
-                        if (cnp)
-                                return cnp;
-                }
-	}
-	return NULL;
-}
-
-
-static struct device_node *scan_OF_for_pci_bus(struct pci_bus *bus)
-{
-	struct device_node *parent, *np;
-
-	/* Are we a root bus ? */
-	if (bus->self == NULL || bus->parent == NULL) {
-		struct pci_controller *hose = pci_bus_to_host(bus);
-		if (hose == NULL)
-			return NULL;
-		return of_node_get(hose->dn);
-	}
-
-	/* not a root bus, we need to get our parent */
-	parent = scan_OF_for_pci_bus(bus->parent);
-	if (parent == NULL)
-		return NULL;
-
-	/* now iterate for children for a match */
-	np = scan_OF_for_pci_dev(parent, bus->self->devfn);
-	of_node_put(parent);
-
-	return np;
-}
-
-/*
- * Scans the OF tree for a device node matching a PCI device
- */
-struct device_node *
-pci_busdev_to_OF_node(struct pci_bus *bus, int devfn)
-{
-	struct device_node *parent, *np;
-
-	pr_debug("pci_busdev_to_OF_node(%d,0x%x)\n", bus->number, devfn);
-	parent = scan_OF_for_pci_bus(bus);
-	if (parent == NULL)
-		return NULL;
-	pr_debug(" parent is %s\n", parent ? parent->full_name : "<NULL>");
-	np = scan_OF_for_pci_dev(parent, devfn);
-	of_node_put(parent);
-	pr_debug(" result is %s\n", np ? np->full_name : "<NULL>");
-
-	/* XXX most callers don't release the returned node
-	 * mostly because ppc64 doesn't increase the refcount,
-	 * we need to fix that.
-	 */
-	return np;
-}
-EXPORT_SYMBOL(pci_busdev_to_OF_node);
-
-struct device_node*
-pci_device_to_OF_node(struct pci_dev *dev)
-{
-	return pci_busdev_to_OF_node(dev->bus, dev->devfn);
-}
-EXPORT_SYMBOL(pci_device_to_OF_node);
-
-static int
-find_OF_pci_device_filter(struct device_node* node, void* data)
-{
-	return ((void *)node == data);
-}
 
 /*
  * Returns the PCI device matching a given OF node
  */
-int
-pci_device_from_OF_node(struct device_node* node, u8* bus, u8* devfn)
+int pci_device_from_OF_node(struct device_node *node, u8 *bus, u8 *devfn)
 {
-	const unsigned int *reg;
-	struct pci_controller* hose;
-	struct pci_dev* dev = NULL;
-	
-	/* Make sure it's really a PCI device */
-	hose = pci_find_hose_for_OF_device(node);
-	if (!hose || !hose->dn)
-		return -ENODEV;
-	if (!scan_OF_pci_childs(hose->dn,
-			find_OF_pci_device_filter, (void *)node))
+	struct pci_dev *dev = NULL;
+	const __be32 *reg;
+	int size;
+
+	/* Check if it might have a chance to be a PCI device */
+	if (!pci_find_hose_for_OF_device(node))
 		return -ENODEV;
-	reg = of_get_property(node, "reg", NULL);
-	if (!reg)
+
+	reg = of_get_property(node, "reg", &size);
+	if (!reg || size < 5 * sizeof(u32))
 		return -ENODEV;
-	*bus = (reg[0] >> 16) & 0xff;
-	*devfn = ((reg[0] >> 8) & 0xff);
+
+	*bus = (be32_to_cpup(&reg[0]) >> 16) & 0xff;
+	*devfn = (be32_to_cpup(&reg[0]) >> 8) & 0xff;
 
 	/* Ok, here we need some tweak. If we have already renumbered
 	 * all busses, we can't rely on the OF bus number any more.
diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c
index 6baabc13306a..478f8d78716b 100644
--- a/arch/powerpc/kernel/pci_dn.c
+++ b/arch/powerpc/kernel/pci_dn.c
@@ -142,53 +142,6 @@ void __devinit pci_devs_phb_init_dynamic(struct pci_controller *phb)
 	traverse_pci_devices(dn, update_dn_pci_info, phb);
 }
 
-/*
- * Traversal func that looks for a <busno,devfcn> value.
- * If found, the pci_dn is returned (thus terminating the traversal).
- */
-static void *is_devfn_node(struct device_node *dn, void *data)
-{
-	int busno = ((unsigned long)data >> 8) & 0xff;
-	int devfn = ((unsigned long)data) & 0xff;
-	struct pci_dn *pci = dn->data;
-
-	if (pci && (devfn == pci->devfn) && (busno == pci->busno))
-		return dn;
-	return NULL;
-}
-
-/*
- * This is the "slow" path for looking up a device_node from a
- * pci_dev.  It will hunt for the device under its parent's
- * phb and then update of_node pointer.
- *
- * It may also do fixups on the actual device since this happens
- * on the first read/write.
- *
- * Note that it also must deal with devices that don't exist.
- * In this case it may probe for real hardware ("just in case")
- * and add a device_node to the device tree if necessary.
- *
- * Is this function necessary anymore now that dev->dev.of_node is
- * used to store the node pointer?
- *
- */
-struct device_node *fetch_dev_dn(struct pci_dev *dev)
-{
-	struct pci_controller *phb = dev->sysdata;
-	struct device_node *dn;
-	unsigned long searchval = (dev->bus->number << 8) | dev->devfn;
-
-	if (WARN_ON(!phb))
-		return NULL;
-
-	dn = traverse_pci_devices(phb->dn, is_devfn_node, (void *)searchval);
-	if (dn)
-		dev->dev.of_node = dn;
-	return dn;
-}
-EXPORT_SYMBOL(fetch_dev_dn);
-
 /** 
  * pci_devs_phb_init - Initialize phbs and pci devs under them.
  * 
diff --git a/arch/powerpc/kernel/pci_of_scan.c b/arch/powerpc/kernel/pci_of_scan.c
index 1e89a72fd030..fe0a5ad6f73e 100644
--- a/arch/powerpc/kernel/pci_of_scan.c
+++ b/arch/powerpc/kernel/pci_of_scan.c
@@ -202,9 +202,9 @@ EXPORT_SYMBOL(of_create_pci_dev);
  * this routine in turn call of_scan_bus() recusively to scan for more child
  * devices.
  */
-void __devinit of_scan_pci_bridge(struct device_node *node,
-				  struct pci_dev *dev)
+void __devinit of_scan_pci_bridge(struct pci_dev *dev)
 {
+	struct device_node *node = dev->dev.of_node;
 	struct pci_bus *bus;
 	const u32 *busrange, *ranges;
 	int len, i, mode;
@@ -238,7 +238,6 @@ void __devinit of_scan_pci_bridge(struct device_node *node,
 	bus->primary = dev->bus->number;
 	bus->subordinate = busrange[1];
 	bus->bridge_ctl = 0;
-	bus->dev.of_node = of_node_get(node);
 
 	/* parse ranges property */
 	/* PCI #address-cells == 3 and #size-cells == 2 always */
@@ -335,9 +334,7 @@ static void __devinit __of_scan_bus(struct device_node *node,
 	list_for_each_entry(dev, &bus->devices, bus_list) {
 		if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE ||
 		    dev->hdr_type == PCI_HEADER_TYPE_CARDBUS) {
-			struct device_node *child = pci_device_to_OF_node(dev);
-			if (child)
-				of_scan_pci_bridge(child, dev);
+			of_scan_pci_bridge(dev);
 		}
 	}
 }
diff --git a/arch/powerpc/platforms/powermac/pci.c b/arch/powerpc/platforms/powermac/pci.c
index f33e08d573ce..abe8d7e2ebeb 100644
--- a/arch/powerpc/platforms/powermac/pci.c
+++ b/arch/powerpc/platforms/powermac/pci.c
@@ -17,6 +17,7 @@
 #include <linux/init.h>
 #include <linux/bootmem.h>
 #include <linux/irq.h>
+#include <linux/of_pci.h>
 
 #include <asm/sections.h>
 #include <asm/io.h>
@@ -235,7 +236,7 @@ static int chaos_validate_dev(struct pci_bus *bus, int devfn, int offset)
 
 	if (offset >= 0x100)
 		return  PCIBIOS_BAD_REGISTER_NUMBER;
-	np = pci_busdev_to_OF_node(bus, devfn);
+	np = of_pci_find_child_device(bus->dev.of_node, devfn);
 	if (np == NULL)
 		return PCIBIOS_DEVICE_NOT_FOUND;
 
diff --git a/arch/sparc/kernel/pci.c b/arch/sparc/kernel/pci.c
index 713dc91020a6..e539d23dec9d 100644
--- a/arch/sparc/kernel/pci.c
+++ b/arch/sparc/kernel/pci.c
@@ -284,7 +284,7 @@ static struct pci_dev *of_create_pci_dev(struct pci_pbm_info *pbm,
 	dev->sysdata = node;
 	dev->dev.parent = bus->bridge;
 	dev->dev.bus = &pci_bus_type;
-	dev->dev.of_node = node;
+	dev->dev.of_node = of_node_get(node);
 	dev->devfn = devfn;
 	dev->multifunction = 0;		/* maybe a lie? */
 	set_pcie_port_type(dev);
diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig
index d06a6374ed6c..cac63c9f49ae 100644
--- a/drivers/of/Kconfig
+++ b/drivers/of/Kconfig
@@ -71,8 +71,14 @@ config OF_MDIO
 
 config OF_PCI
 	def_tristate PCI
-	depends on PCI && (PPC || MICROBLAZE || X86)
+	depends on PCI
 	help
 	  OpenFirmware PCI bus accessors
 
+config OF_PCI_IRQ
+	def_tristate PCI
+	depends on OF_PCI && OF_IRQ
+	help
+	  OpenFirmware PCI IRQ routing helpers
+
 endmenu # OF
diff --git a/drivers/of/Makefile b/drivers/of/Makefile
index f7861ed2f287..dccb1176be57 100644
--- a/drivers/of/Makefile
+++ b/drivers/of/Makefile
@@ -10,3 +10,4 @@ obj-$(CONFIG_OF_NET)	+= of_net.o
 obj-$(CONFIG_OF_SPI)	+= of_spi.o
 obj-$(CONFIG_OF_MDIO)	+= of_mdio.o
 obj-$(CONFIG_OF_PCI)	+= of_pci.o
+obj-$(CONFIG_OF_PCI_IRQ)  += of_pci_irq.o
diff --git a/drivers/of/of_pci.c b/drivers/of/of_pci.c
index ac1ec54e4fd5..ec7b060ae952 100644
--- a/drivers/of/of_pci.c
+++ b/drivers/of/of_pci.c
@@ -1,92 +1,40 @@
 #include <linux/kernel.h>
 #include <linux/of_pci.h>
-#include <linux/of_irq.h>
 #include <asm/prom.h>
 
-/**
- * of_irq_map_pci - Resolve the interrupt for a PCI device
- * @pdev:       the device whose interrupt is to be resolved
- * @out_irq:    structure of_irq filled by this function
- *
- * This function resolves the PCI interrupt for a given PCI device. If a
- * device-node exists for a given pci_dev, it will use normal OF tree
- * walking. If not, it will implement standard swizzling and walk up the
- * PCI tree until an device-node is found, at which point it will finish
- * resolving using the OF tree walking.
- */
-int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq)
+static inline int __of_pci_pci_compare(struct device_node *node,
+				       unsigned int devfn)
 {
-	struct device_node *dn, *ppnode;
-	struct pci_dev *ppdev;
-	u32 lspec;
-	__be32 lspec_be;
-	__be32 laddr[3];
-	u8 pin;
-	int rc;
+	unsigned int size;
+	const __be32 *reg = of_get_property(node, "reg", &size);
 
-	/* Check if we have a device node, if yes, fallback to standard
-	 * device tree parsing
-	 */
-	dn = pci_device_to_OF_node(pdev);
-	if (dn) {
-		rc = of_irq_map_one(dn, 0, out_irq);
-		if (!rc)
-			return rc;
-	}
-
-	/* Ok, we don't, time to have fun. Let's start by building up an
-	 * interrupt spec.  we assume #interrupt-cells is 1, which is standard
-	 * for PCI. If you do different, then don't use that routine.
-	 */
-	rc = pci_read_config_byte(pdev, PCI_INTERRUPT_PIN, &pin);
-	if (rc != 0)
-		return rc;
-	/* No pin, exit */
-	if (pin == 0)
-		return -ENODEV;
-
-	/* Now we walk up the PCI tree */
-	lspec = pin;
-	for (;;) {
-		/* Get the pci_dev of our parent */
-		ppdev = pdev->bus->self;
-
-		/* Ouch, it's a host bridge... */
-		if (ppdev == NULL) {
-			ppnode = pci_bus_to_OF_node(pdev->bus);
-
-			/* No node for host bridge ? give up */
-			if (ppnode == NULL)
-				return -EINVAL;
-		} else {
-			/* We found a P2P bridge, check if it has a node */
-			ppnode = pci_device_to_OF_node(ppdev);
-		}
-
-		/* Ok, we have found a parent with a device-node, hand over to
-		 * the OF parsing code.
-		 * We build a unit address from the linux device to be used for
-		 * resolution. Note that we use the linux bus number which may
-		 * not match your firmware bus numbering.
-		 * Fortunately, in most cases, interrupt-map-mask doesn't
-		 * include the bus number as part of the matching.
-		 * You should still be careful about that though if you intend
-		 * to rely on this function (you ship  a firmware that doesn't
-		 * create device nodes for all PCI devices).
-		 */
-		if (ppnode)
-			break;
+	if (!reg || size < 5 * sizeof(__be32))
+		return 0;
+	return ((be32_to_cpup(&reg[0]) >> 8) & 0xff) == devfn;
+}
 
-		/* We can only get here if we hit a P2P bridge with no node,
-		 * let's do standard swizzling and try again
+struct device_node *of_pci_find_child_device(struct device_node *parent,
+					     unsigned int devfn)
+{
+	struct device_node *node, *node2;
+
+	for_each_child_of_node(parent, node) {
+		if (__of_pci_pci_compare(node, devfn))
+			return node;
+		/*
+		 * Some OFs create a parent node "multifunc-device" as
+		 * a fake root for all functions of a multi-function
+		 * device we go down them as well.
 		 */
-		lspec = pci_swizzle_interrupt_pin(pdev, lspec);
-		pdev = ppdev;
+		if (!strcmp(node->name, "multifunc-device")) {
+			for_each_child_of_node(node, node2) {
+				if (__of_pci_pci_compare(node2, devfn)) {
+					of_node_put(node);
+					return node2;
+				}
+			}
+		}
 	}
-
-	lspec_be = cpu_to_be32(lspec);
-	laddr[0] = cpu_to_be32((pdev->bus->number << 16) | (pdev->devfn << 8));
-	laddr[1]  = laddr[2] = cpu_to_be32(0);
-	return of_irq_map_raw(ppnode, &lspec_be, 1, laddr, out_irq);
+	return NULL;
 }
-EXPORT_SYMBOL_GPL(of_irq_map_pci);
+EXPORT_SYMBOL_GPL(of_pci_find_child_device);
diff --git a/drivers/of/of_pci_irq.c b/drivers/of/of_pci_irq.c
new file mode 100644
index 000000000000..ac1ec54e4fd5
--- /dev/null
+++ b/drivers/of/of_pci_irq.c
@@ -0,0 +1,92 @@
+#include <linux/kernel.h>
+#include <linux/of_pci.h>
+#include <linux/of_irq.h>
+#include <asm/prom.h>
+
+/**
+ * of_irq_map_pci - Resolve the interrupt for a PCI device
+ * @pdev:       the device whose interrupt is to be resolved
+ * @out_irq:    structure of_irq filled by this function
+ *
+ * This function resolves the PCI interrupt for a given PCI device. If a
+ * device-node exists for a given pci_dev, it will use normal OF tree
+ * walking. If not, it will implement standard swizzling and walk up the
+ * PCI tree until an device-node is found, at which point it will finish
+ * resolving using the OF tree walking.
+ */
+int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq)
+{
+	struct device_node *dn, *ppnode;
+	struct pci_dev *ppdev;
+	u32 lspec;
+	__be32 lspec_be;
+	__be32 laddr[3];
+	u8 pin;
+	int rc;
+
+	/* Check if we have a device node, if yes, fallback to standard
+	 * device tree parsing
+	 */
+	dn = pci_device_to_OF_node(pdev);
+	if (dn) {
+		rc = of_irq_map_one(dn, 0, out_irq);
+		if (!rc)
+			return rc;
+	}
+
+	/* Ok, we don't, time to have fun. Let's start by building up an
+	 * interrupt spec.  we assume #interrupt-cells is 1, which is standard
+	 * for PCI. If you do different, then don't use that routine.
+	 */
+	rc = pci_read_config_byte(pdev, PCI_INTERRUPT_PIN, &pin);
+	if (rc != 0)
+		return rc;
+	/* No pin, exit */
+	if (pin == 0)
+		return -ENODEV;
+
+	/* Now we walk up the PCI tree */
+	lspec = pin;
+	for (;;) {
+		/* Get the pci_dev of our parent */
+		ppdev = pdev->bus->self;
+
+		/* Ouch, it's a host bridge... */
+		if (ppdev == NULL) {
+			ppnode = pci_bus_to_OF_node(pdev->bus);
+
+			/* No node for host bridge ? give up */
+			if (ppnode == NULL)
+				return -EINVAL;
+		} else {
+			/* We found a P2P bridge, check if it has a node */
+			ppnode = pci_device_to_OF_node(ppdev);
+		}
+
+		/* Ok, we have found a parent with a device-node, hand over to
+		 * the OF parsing code.
+		 * We build a unit address from the linux device to be used for
+		 * resolution. Note that we use the linux bus number which may
+		 * not match your firmware bus numbering.
+		 * Fortunately, in most cases, interrupt-map-mask doesn't
+		 * include the bus number as part of the matching.
+		 * You should still be careful about that though if you intend
+		 * to rely on this function (you ship  a firmware that doesn't
+		 * create device nodes for all PCI devices).
+		 */
+		if (ppnode)
+			break;
+
+		/* We can only get here if we hit a P2P bridge with no node,
+		 * let's do standard swizzling and try again
+		 */
+		lspec = pci_swizzle_interrupt_pin(pdev, lspec);
+		pdev = ppdev;
+	}
+
+	lspec_be = cpu_to_be32(lspec);
+	laddr[0] = cpu_to_be32((pdev->bus->number << 16) | (pdev->devfn << 8));
+	laddr[1]  = laddr[2] = cpu_to_be32(0);
+	return of_irq_map_raw(ppnode, &lspec_be, 1, laddr, out_irq);
+}
+EXPORT_SYMBOL_GPL(of_irq_map_pci);
diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
index c85f744270a5..f27f4a1488a1 100644
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -70,4 +70,6 @@ obj-$(CONFIG_PCI_STUB) += pci-stub.o
 
 obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += xen-pcifront.o
 
+obj-$(CONFIG_OF) += of.o
+
 ccflags-$(CONFIG_PCI_DEBUG) := -DDEBUG
diff --git a/drivers/pci/hotplug/rpadlpar_core.c b/drivers/pci/hotplug/rpadlpar_core.c
index 083034710fa6..1d002b1c2bf4 100644
--- a/drivers/pci/hotplug/rpadlpar_core.c
+++ b/drivers/pci/hotplug/rpadlpar_core.c
@@ -158,7 +158,7 @@ static void dlpar_pci_add_bus(struct device_node *dn)
 	/* Scan below the new bridge */
 	if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE ||
 	    dev->hdr_type == PCI_HEADER_TYPE_CARDBUS)
-		of_scan_pci_bridge(dn, dev);
+		of_scan_pci_bridge(dev);
 
 	/* Map IO space for child bus, which may or may not succeed */
 	pcibios_map_io_space(dev->subordinate);
diff --git a/drivers/pci/of.c b/drivers/pci/of.c
new file mode 100644
index 000000000000..c94d37ec55c8
--- /dev/null
+++ b/drivers/pci/of.c
@@ -0,0 +1,61 @@
+/*
+ * PCI <-> OF mapping helpers
+ *
+ * Copyright 2011 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/of.h>
+#include <linux/of_pci.h>
+#include "pci.h"
+
+void pci_set_of_node(struct pci_dev *dev)
+{
+	if (!dev->bus->dev.of_node)
+		return;
+	dev->dev.of_node = of_pci_find_child_device(dev->bus->dev.of_node,
+						    dev->devfn);
+}
+
+void pci_release_of_node(struct pci_dev *dev)
+{
+	of_node_put(dev->dev.of_node);
+	dev->dev.of_node = NULL;
+}
+
+void pci_set_bus_of_node(struct pci_bus *bus)
+{
+	if (bus->self == NULL)
+		bus->dev.of_node = pcibios_get_phb_of_node(bus);
+	else
+		bus->dev.of_node = of_node_get(bus->self->dev.of_node);
+}
+
+void pci_release_bus_of_node(struct pci_bus *bus)
+{
+	of_node_put(bus->dev.of_node);
+	bus->dev.of_node = NULL;
+}
+
+struct device_node * __weak pcibios_get_phb_of_node(struct pci_bus *bus)
+{
+	/* This should only be called for PHBs */
+	if (WARN_ON(bus->self || bus->parent))
+		return NULL;
+
+	/* Look for a node pointer in either the intermediary device we
+	 * create above the root bus or it's own parent. Normally only
+	 * the later is populated.
+	 */
+	if (bus->bridge->of_node)
+		return of_node_get(bus->bridge->of_node);
+	if (bus->bridge->parent->of_node)
+		return of_node_get(bus->bridge->parent->of_node);
+	return NULL;
+}
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 48849ffdd672..c28c7b91910e 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -52,6 +52,7 @@ static void release_pcibus_dev(struct device *dev)
 	if (pci_bus->bridge)
 		put_device(pci_bus->bridge);
 	pci_bus_remove_resources(pci_bus);
+	pci_release_bus_of_node(pci_bus);
 	kfree(pci_bus);
 }
 
@@ -588,7 +589,7 @@ static struct pci_bus *pci_alloc_child_bus(struct pci_bus *parent,
 
 	child->self = bridge;
 	child->bridge = get_device(&bridge->dev);
-
+	pci_set_bus_of_node(child);
 	pci_set_bus_speed(child);
 
 	/* Set up default resource pointers and names.. */
@@ -1038,6 +1039,7 @@ static void pci_release_dev(struct device *dev)
 
 	pci_dev = to_pci_dev(dev);
 	pci_release_capabilities(pci_dev);
+	pci_release_of_node(pci_dev);
 	kfree(pci_dev);
 }
 
@@ -1157,6 +1159,8 @@ static struct pci_dev *pci_scan_device(struct pci_bus *bus, int devfn)
 	dev->vendor = l & 0xffff;
 	dev->device = (l >> 16) & 0xffff;
 
+	pci_set_of_node(dev);
+
 	if (pci_setup_device(dev)) {
 		kfree(dev);
 		return NULL;
@@ -1409,6 +1413,7 @@ struct pci_bus * pci_create_bus(struct device *parent,
 		goto dev_reg_err;
 	b->bridge = get_device(dev);
 	device_enable_async_suspend(b->bridge);
+	pci_set_bus_of_node(b);
 
 	if (!parent)
 		set_dev_node(b->bridge, pcibus_to_node(b));
diff --git a/include/linux/of_pci.h b/include/linux/of_pci.h
index 85a27b650d76..f93e21700d3e 100644
--- a/include/linux/of_pci.h
+++ b/include/linux/of_pci.h
@@ -6,4 +6,9 @@
 struct pci_dev;
 struct of_irq;
 int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq);
+
+struct device_node;
+struct device_node *of_pci_find_child_device(struct device_node *parent,
+					     unsigned int devfn);
+
 #endif
diff --git a/include/linux/pci.h b/include/linux/pci.h
index c446b5ca2d38..e5086e9a9bf7 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1589,5 +1589,23 @@ int pci_vpd_find_tag(const u8 *buf, unsigned int off, unsigned int len, u8 rdt);
 int pci_vpd_find_info_keyword(const u8 *buf, unsigned int off,
 			      unsigned int len, const char *kw);
 
+/* PCI <-> OF binding helpers */
+#ifdef CONFIG_OF
+struct device_node;
+extern void pci_set_of_node(struct pci_dev *dev);
+extern void pci_release_of_node(struct pci_dev *dev);
+extern void pci_set_bus_of_node(struct pci_bus *bus);
+extern void pci_release_bus_of_node(struct pci_bus *bus);
+
+/* Arch may override this (weak) */
+extern struct device_node * __weak pcibios_get_phb_of_node(struct pci_bus *bus);
+
+#else /* CONFIG_OF */
+static inline void pci_set_of_node(struct pci_dev *dev) { }
+static inline void pci_release_of_node(struct pci_dev *dev) { }
+static inline void pci_set_bus_of_node(struct pci_bus *bus) { }
+static inline void pci_release_bus_of_node(struct pci_bus *bus) { }
+#endif  /* CONFIG_OF */
+
 #endif /* __KERNEL__ */
 #endif /* LINUX_PCI_H */
-- 
cgit v1.2.3


From 64099d981c9916ec4a485b3ffbb89fa877fc595f Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Thu, 7 Apr 2011 13:09:47 +1000
Subject: pci/of: Consolidate pci_device_to_OF_node()

All archs do more or less the same thing now, move it into
a single generic place.

I chose pci.h rather than of_pci.h to avoid having to change
all call-sites to include the later.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Michal Simek <monstr@monstr.eu>
Acked-by: Grant Likely <grant.likely@secretlab.ca>
Acked-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/microblaze/include/asm/pci-bridge.h | 5 -----
 arch/powerpc/include/asm/pci-bridge.h    | 5 -----
 arch/sparc/include/asm/pci_32.h          | 3 ---
 arch/sparc/include/asm/pci_64.h          | 3 ---
 arch/sparc/kernel/pci.c                  | 6 ------
 arch/sparc/kernel/pcic.c                 | 8 --------
 arch/x86/include/asm/prom.h              | 5 -----
 include/linux/pci.h                      | 5 +++++
 8 files changed, 5 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/arch/microblaze/include/asm/pci-bridge.h b/arch/microblaze/include/asm/pci-bridge.h
index 6bddc0701a07..0d74d031e2a3 100644
--- a/arch/microblaze/include/asm/pci-bridge.h
+++ b/arch/microblaze/include/asm/pci-bridge.h
@@ -102,11 +102,6 @@ struct pci_controller {
 };
 
 #ifdef CONFIG_PCI
-static inline struct device_node *pci_device_to_OF_node(struct pci_dev *dev)
-{
-	return dev->dev.of_node;
-}
-
 static inline struct device_node *pci_bus_to_OF_node(struct pci_bus *bus)
 {
 	return bus->dev.of_node;
diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index 3e6869476e55..578060ef99ad 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -169,11 +169,6 @@ static inline struct pci_controller *pci_bus_to_host(const struct pci_bus *bus)
 	return bus->sysdata;
 }
 
-static inline struct device_node *pci_device_to_OF_node(struct pci_dev *dev)
-{
-	return dev->dev.of_node;
-}
-
 static inline struct device_node *pci_bus_to_OF_node(struct pci_bus *bus)
 {
 	return bus->dev.of_node;
diff --git a/arch/sparc/include/asm/pci_32.h b/arch/sparc/include/asm/pci_32.h
index 332ac9ab36bc..4d0c39a931c5 100644
--- a/arch/sparc/include/asm/pci_32.h
+++ b/arch/sparc/include/asm/pci_32.h
@@ -42,9 +42,6 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 }
 #endif
 
-struct device_node;
-extern struct device_node *pci_device_to_OF_node(struct pci_dev *pdev);
-
 #endif /* __KERNEL__ */
 
 /* generic pci stuff */
diff --git a/arch/sparc/include/asm/pci_64.h b/arch/sparc/include/asm/pci_64.h
index 948b686ec089..2614d96141c9 100644
--- a/arch/sparc/include/asm/pci_64.h
+++ b/arch/sparc/include/asm/pci_64.h
@@ -91,9 +91,6 @@ static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel)
 	return PCI_IRQ_NONE;
 }
 
-struct device_node;
-extern struct device_node *pci_device_to_OF_node(struct pci_dev *pdev);
-
 #define HAVE_ARCH_PCI_RESOURCE_TO_USER
 extern void pci_resource_to_user(const struct pci_dev *dev, int bar,
 				 const struct resource *rsrc,
diff --git a/arch/sparc/kernel/pci.c b/arch/sparc/kernel/pci.c
index e539d23dec9d..80a87e2a3e7c 100644
--- a/arch/sparc/kernel/pci.c
+++ b/arch/sparc/kernel/pci.c
@@ -1021,12 +1021,6 @@ void arch_teardown_msi_irq(unsigned int irq)
 }
 #endif /* !(CONFIG_PCI_MSI) */
 
-struct device_node *pci_device_to_OF_node(struct pci_dev *pdev)
-{
-	return pdev->dev.of_node;
-}
-EXPORT_SYMBOL(pci_device_to_OF_node);
-
 static void ali_sound_dma_hack(struct pci_dev *pdev, int set_bit)
 {
 	struct pci_dev *ali_isa_bridge;
diff --git a/arch/sparc/kernel/pcic.c b/arch/sparc/kernel/pcic.c
index 948601a066ff..a19f04195478 100644
--- a/arch/sparc/kernel/pcic.c
+++ b/arch/sparc/kernel/pcic.c
@@ -885,14 +885,6 @@ int pcibios_assign_resource(struct pci_dev *pdev, int resource)
 	return -ENXIO;
 }
 
-struct device_node *pci_device_to_OF_node(struct pci_dev *pdev)
-{
-	struct pcidev_cookie *pc = pdev->sysdata;
-
-	return pc->prom_node;
-}
-EXPORT_SYMBOL(pci_device_to_OF_node);
-
 /*
  * This probably belongs here rather than ioport.c because
  * we do not want this crud linked into SBus kernels.
diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h
index 971e0b46446e..dd6066a56346 100644
--- a/arch/x86/include/asm/prom.h
+++ b/arch/x86/include/asm/prom.h
@@ -31,11 +31,6 @@ extern void x86_add_irq_domains(void);
 void __cpuinit x86_of_pci_init(void);
 void x86_dtb_init(void);
 
-static inline struct device_node *pci_device_to_OF_node(struct pci_dev *pdev)
-{
-	return pdev ? pdev->dev.of_node : NULL;
-}
-
 static inline struct device_node *pci_bus_to_OF_node(struct pci_bus *bus)
 {
 	return pci_device_to_OF_node(bus->self);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index e5086e9a9bf7..795e6a56d8cd 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1600,6 +1600,11 @@ extern void pci_release_bus_of_node(struct pci_bus *bus);
 /* Arch may override this (weak) */
 extern struct device_node * __weak pcibios_get_phb_of_node(struct pci_bus *bus);
 
+static inline struct device_node *pci_device_to_OF_node(struct pci_dev *pdev)
+{
+	return pdev ? pdev->dev.of_node : NULL;
+}
+
 #else /* CONFIG_OF */
 static inline void pci_set_of_node(struct pci_dev *dev) { }
 static inline void pci_release_of_node(struct pci_dev *dev) { }
-- 
cgit v1.2.3


From ef3b4f8cc20e80c767e47b848fb7512770ab80d7 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Mon, 11 Apr 2011 11:34:33 +1000
Subject: pci/of: Consolidate pci_bus_to_OF_node()

The generic code always get the device-node in the right place now
so a single implementation will work for all archs

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Grant Likely <grant.likely@secretlab.ca>
Acked-by: Michal Simek <monstr@monstr.eu>
Acked-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/microblaze/include/asm/pci-bridge.h | 5 -----
 arch/powerpc/include/asm/pci-bridge.h    | 5 -----
 arch/x86/include/asm/prom.h              | 6 ------
 include/linux/pci.h                      | 5 +++++
 4 files changed, 5 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/arch/microblaze/include/asm/pci-bridge.h b/arch/microblaze/include/asm/pci-bridge.h
index 0d74d031e2a3..242be57a319c 100644
--- a/arch/microblaze/include/asm/pci-bridge.h
+++ b/arch/microblaze/include/asm/pci-bridge.h
@@ -102,11 +102,6 @@ struct pci_controller {
 };
 
 #ifdef CONFIG_PCI
-static inline struct device_node *pci_bus_to_OF_node(struct pci_bus *bus)
-{
-	return bus->dev.of_node;
-}
-
 static inline struct pci_controller *pci_bus_to_host(const struct pci_bus *bus)
 {
 	return bus->sysdata;
diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index 578060ef99ad..90bd3ed48165 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -169,11 +169,6 @@ static inline struct pci_controller *pci_bus_to_host(const struct pci_bus *bus)
 	return bus->sysdata;
 }
 
-static inline struct device_node *pci_bus_to_OF_node(struct pci_bus *bus)
-{
-	return bus->dev.of_node;
-}
-
 #ifndef CONFIG_PPC64
 
 extern int pci_device_from_OF_node(struct device_node *node,
diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h
index dd6066a56346..df1287019e6d 100644
--- a/arch/x86/include/asm/prom.h
+++ b/arch/x86/include/asm/prom.h
@@ -30,12 +30,6 @@ extern void add_dtb(u64 data);
 extern void x86_add_irq_domains(void);
 void __cpuinit x86_of_pci_init(void);
 void x86_dtb_init(void);
-
-static inline struct device_node *pci_bus_to_OF_node(struct pci_bus *bus)
-{
-	return pci_device_to_OF_node(bus->self);
-}
-
 #else
 static inline void add_dtb(u64 data) { }
 static inline void x86_add_irq_domains(void) { }
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 795e6a56d8cd..2d292182dde5 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1605,6 +1605,11 @@ static inline struct device_node *pci_device_to_OF_node(struct pci_dev *pdev)
 	return pdev ? pdev->dev.of_node : NULL;
 }
 
+static inline struct device_node *pci_bus_to_OF_node(struct pci_bus *bus)
+{
+	return bus ? bus->dev.of_node : NULL;
+}
+
 #else /* CONFIG_OF */
 static inline void pci_set_of_node(struct pci_dev *dev) { }
 static inline void pci_release_of_node(struct pci_dev *dev) { }
-- 
cgit v1.2.3


From bff55273f98dea0ceb78e28eb69462fe5f72ef3d Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Wed, 8 Jun 2011 12:35:08 +0000
Subject: v2 ethtool: remove support for ETHTOOL_GRXNTUPLE

This change is meant to remove all support for displaying an ntuple as
strings via ETHTOOL_GRXNTUPLE.  The reason for this change is due to the
fact that multiple issues have been found including:
 - Multiple buffer overruns for strings being displayed.
 - Incorrect filters displayed, cleared filters with ring of -2 are displayed
 - Setting get_rx_ntuple displays no rules if defined.
 - Endianess wrong on displayed values.
 - Hard limit of 1024 filters makes display functionality extremely limited

The only driver that had supported this interface was ixgbe.  Since it no
longer uses the interface and due to the issues mentioned above I am
submitting this patch to remove it.

v2:
Updated based on comments from Ben Hutchings
 - Left ETH_SS_NTUPLE_FILTERS in code but commented on it being deprecated
 - Removed ethtool_rx_ntuple_list and ethtool_rx_ntuple_flow_spec_container
 - Left ETHTOOL_GRXNTUPLE but commented it as deprecated

Also cleaned up set_rx_ntuple since there is no flow spec container to
maintain we can drop all the code for the alloc and free of it and just
return ops->set_rx_ntuple().
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Acked-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h   |  19 +--
 include/linux/netdevice.h |   3 -
 net/core/dev.c            |   5 -
 net/core/ethtool.c        | 309 +---------------------------------------------
 4 files changed, 3 insertions(+), 333 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index c6a850ab2ec5..dfd34934213d 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -287,7 +287,7 @@ enum ethtool_stringset {
 	ETH_SS_TEST		= 0,
 	ETH_SS_STATS,
 	ETH_SS_PRIV_FLAGS,
-	ETH_SS_NTUPLE_FILTERS,
+	ETH_SS_NTUPLE_FILTERS,	/* Do not use, GRXNTUPLE is now deprecated */
 	ETH_SS_FEATURES,
 };
 
@@ -714,18 +714,6 @@ enum ethtool_sfeatures_retval_bits {
 /* needed by dev_disable_lro() */
 extern int __ethtool_set_flags(struct net_device *dev, u32 flags);
 
-struct ethtool_rx_ntuple_flow_spec_container {
-	struct ethtool_rx_ntuple_flow_spec fs;
-	struct list_head list;
-};
-
-struct ethtool_rx_ntuple_list {
-#define ETHTOOL_MAX_NTUPLE_LIST_ENTRY 1024
-#define ETHTOOL_MAX_NTUPLE_STRING_PER_ENTRY 14
-	struct list_head	list;
-	unsigned int		count;
-};
-
 /**
  * enum ethtool_phys_id_state - indicator state for physical identification
  * @ETHTOOL_ID_INACTIVE: Physical ID indicator should be deactivated
@@ -758,7 +746,6 @@ u32 ethtool_op_get_ufo(struct net_device *dev);
 int ethtool_op_set_ufo(struct net_device *dev, u32 data);
 u32 ethtool_op_get_flags(struct net_device *dev);
 int ethtool_op_set_flags(struct net_device *dev, u32 data, u32 supported);
-void ethtool_ntuple_flush(struct net_device *dev);
 bool ethtool_invalid_flags(struct net_device *dev, u32 data, u32 supported);
 
 /**
@@ -865,7 +852,6 @@ bool ethtool_invalid_flags(struct net_device *dev, u32 data, u32 supported);
  *	error code or zero.
  * @set_rx_ntuple: Set an RX n-tuple rule.  Returns a negative error code
  *	or zero.
- * @get_rx_ntuple: Deprecated.
  * @get_rxfh_indir: Get the contents of the RX flow hash indirection table.
  *	Returns a negative error code or zero.
  * @set_rxfh_indir: Set the contents of the RX flow hash indirection table.
@@ -944,7 +930,6 @@ struct ethtool_ops {
 	int	(*reset)(struct net_device *, u32 *);
 	int	(*set_rx_ntuple)(struct net_device *,
 				 struct ethtool_rx_ntuple *);
-	int	(*get_rx_ntuple)(struct net_device *, u32 stringset, void *);
 	int	(*get_rxfh_indir)(struct net_device *,
 				  struct ethtool_rxfh_indir *);
 	int	(*set_rxfh_indir)(struct net_device *,
@@ -1017,7 +1002,7 @@ struct ethtool_ops {
 #define ETHTOOL_FLASHDEV	0x00000033 /* Flash firmware to device */
 #define ETHTOOL_RESET		0x00000034 /* Reset hardware */
 #define ETHTOOL_SRXNTUPLE	0x00000035 /* Add an n-tuple filter to device */
-#define ETHTOOL_GRXNTUPLE	0x00000036 /* Get n-tuple filters from device */
+#define ETHTOOL_GRXNTUPLE	0x00000036 /* deprecated */
 #define ETHTOOL_GSSET_INFO	0x00000037 /* Get string set info */
 #define ETHTOOL_GRXFHINDIR	0x00000038 /* Get RX flow hash indir'n table */
 #define ETHTOOL_SRXFHINDIR	0x00000039 /* Set RX flow hash indir'n table */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 336288567c78..6469fa942d1d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1348,9 +1348,6 @@ struct net_device {
 	/* max exchange id for FCoE LRO by ddp */
 	unsigned int		fcoe_ddp_xid;
 #endif
-	/* n-tuple filter list attached to this device */
-	struct ethtool_rx_ntuple_list ethtool_ntuple_list;
-
 	/* phy device may attach itself for hardware timestamping */
 	struct phy_device *phydev;
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 939307891e71..b3f52d2f56d7 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5867,8 +5867,6 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 
 	dev->gso_max_size = GSO_MAX_SIZE;
 
-	INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
-	dev->ethtool_ntuple_list.count = 0;
 	INIT_LIST_HEAD(&dev->napi_list);
 	INIT_LIST_HEAD(&dev->unreg_list);
 	INIT_LIST_HEAD(&dev->link_watch_list);
@@ -5932,9 +5930,6 @@ void free_netdev(struct net_device *dev)
 	/* Flush device addresses */
 	dev_addr_flush(dev);
 
-	/* Clear ethtool n-tuple list */
-	ethtool_ntuple_flush(dev);
-
 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
 		netif_napi_del(p);
 
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index fd14116ad7f0..b7c12a63d0ce 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -169,18 +169,6 @@ int ethtool_op_set_flags(struct net_device *dev, u32 data, u32 supported)
 }
 EXPORT_SYMBOL(ethtool_op_set_flags);
 
-void ethtool_ntuple_flush(struct net_device *dev)
-{
-	struct ethtool_rx_ntuple_flow_spec_container *fsc, *f;
-
-	list_for_each_entry_safe(fsc, f, &dev->ethtool_ntuple_list.list, list) {
-		list_del(&fsc->list);
-		kfree(fsc);
-	}
-	dev->ethtool_ntuple_list.count = 0;
-}
-EXPORT_SYMBOL(ethtool_ntuple_flush);
-
 /* Handlers for each ethtool command */
 
 #define ETHTOOL_DEV_FEATURE_WORDS	1
@@ -865,34 +853,6 @@ out:
 	return ret;
 }
 
-static void __rx_ntuple_filter_add(struct ethtool_rx_ntuple_list *list,
-			struct ethtool_rx_ntuple_flow_spec *spec,
-			struct ethtool_rx_ntuple_flow_spec_container *fsc)
-{
-
-	/* don't add filters forever */
-	if (list->count >= ETHTOOL_MAX_NTUPLE_LIST_ENTRY) {
-		/* free the container */
-		kfree(fsc);
-		return;
-	}
-
-	/* Copy the whole filter over */
-	fsc->fs.flow_type = spec->flow_type;
-	memcpy(&fsc->fs.h_u, &spec->h_u, sizeof(spec->h_u));
-	memcpy(&fsc->fs.m_u, &spec->m_u, sizeof(spec->m_u));
-
-	fsc->fs.vlan_tag = spec->vlan_tag;
-	fsc->fs.vlan_tag_mask = spec->vlan_tag_mask;
-	fsc->fs.data = spec->data;
-	fsc->fs.data_mask = spec->data_mask;
-	fsc->fs.action = spec->action;
-
-	/* add to the list */
-	list_add_tail_rcu(&fsc->list, &list->list);
-	list->count++;
-}
-
 /*
  * ethtool does not (or did not) set masks for flow parameters that are
  * not specified, so if both value and mask are 0 then this must be
@@ -930,8 +890,6 @@ static noinline_for_stack int ethtool_set_rx_ntuple(struct net_device *dev,
 {
 	struct ethtool_rx_ntuple cmd;
 	const struct ethtool_ops *ops = dev->ethtool_ops;
-	struct ethtool_rx_ntuple_flow_spec_container *fsc = NULL;
-	int ret;
 
 	if (!ops->set_rx_ntuple)
 		return -EOPNOTSUPP;
@@ -944,269 +902,7 @@ static noinline_for_stack int ethtool_set_rx_ntuple(struct net_device *dev,
 
 	rx_ntuple_fix_masks(&cmd.fs);
 
-	/*
-	 * Cache filter in dev struct for GET operation only if
-	 * the underlying driver doesn't have its own GET operation, and
-	 * only if the filter was added successfully.  First make sure we
-	 * can allocate the filter, then continue if successful.
-	 */
-	if (!ops->get_rx_ntuple) {
-		fsc = kmalloc(sizeof(*fsc), GFP_ATOMIC);
-		if (!fsc)
-			return -ENOMEM;
-	}
-
-	ret = ops->set_rx_ntuple(dev, &cmd);
-	if (ret) {
-		kfree(fsc);
-		return ret;
-	}
-
-	if (!ops->get_rx_ntuple)
-		__rx_ntuple_filter_add(&dev->ethtool_ntuple_list, &cmd.fs, fsc);
-
-	return ret;
-}
-
-static int ethtool_get_rx_ntuple(struct net_device *dev, void __user *useraddr)
-{
-	struct ethtool_gstrings gstrings;
-	const struct ethtool_ops *ops = dev->ethtool_ops;
-	struct ethtool_rx_ntuple_flow_spec_container *fsc;
-	u8 *data;
-	char *p;
-	int ret, i, num_strings = 0;
-
-	if (!ops->get_sset_count)
-		return -EOPNOTSUPP;
-
-	if (copy_from_user(&gstrings, useraddr, sizeof(gstrings)))
-		return -EFAULT;
-
-	ret = ops->get_sset_count(dev, gstrings.string_set);
-	if (ret < 0)
-		return ret;
-
-	gstrings.len = ret;
-
-	data = kzalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER);
-	if (!data)
-		return -ENOMEM;
-
-	if (ops->get_rx_ntuple) {
-		/* driver-specific filter grab */
-		ret = ops->get_rx_ntuple(dev, gstrings.string_set, data);
-		goto copy;
-	}
-
-	/* default ethtool filter grab */
-	i = 0;
-	p = (char *)data;
-	list_for_each_entry(fsc, &dev->ethtool_ntuple_list.list, list) {
-		sprintf(p, "Filter %d:\n", i);
-		p += ETH_GSTRING_LEN;
-		num_strings++;
-
-		switch (fsc->fs.flow_type) {
-		case TCP_V4_FLOW:
-			sprintf(p, "\tFlow Type: TCP\n");
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			break;
-		case UDP_V4_FLOW:
-			sprintf(p, "\tFlow Type: UDP\n");
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			break;
-		case SCTP_V4_FLOW:
-			sprintf(p, "\tFlow Type: SCTP\n");
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			break;
-		case AH_ESP_V4_FLOW:
-			sprintf(p, "\tFlow Type: AH ESP\n");
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			break;
-		case ESP_V4_FLOW:
-			sprintf(p, "\tFlow Type: ESP\n");
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			break;
-		case IP_USER_FLOW:
-			sprintf(p, "\tFlow Type: Raw IP\n");
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			break;
-		case IPV4_FLOW:
-			sprintf(p, "\tFlow Type: IPv4\n");
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			break;
-		default:
-			sprintf(p, "\tFlow Type: Unknown\n");
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			goto unknown_filter;
-		}
-
-		/* now the rest of the filters */
-		switch (fsc->fs.flow_type) {
-		case TCP_V4_FLOW:
-		case UDP_V4_FLOW:
-		case SCTP_V4_FLOW:
-			sprintf(p, "\tSrc IP addr: 0x%x\n",
-				fsc->fs.h_u.tcp_ip4_spec.ip4src);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			sprintf(p, "\tSrc IP mask: 0x%x\n",
-				fsc->fs.m_u.tcp_ip4_spec.ip4src);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			sprintf(p, "\tDest IP addr: 0x%x\n",
-				fsc->fs.h_u.tcp_ip4_spec.ip4dst);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			sprintf(p, "\tDest IP mask: 0x%x\n",
-				fsc->fs.m_u.tcp_ip4_spec.ip4dst);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			sprintf(p, "\tSrc Port: %d, mask: 0x%x\n",
-				fsc->fs.h_u.tcp_ip4_spec.psrc,
-				fsc->fs.m_u.tcp_ip4_spec.psrc);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			sprintf(p, "\tDest Port: %d, mask: 0x%x\n",
-				fsc->fs.h_u.tcp_ip4_spec.pdst,
-				fsc->fs.m_u.tcp_ip4_spec.pdst);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			sprintf(p, "\tTOS: %d, mask: 0x%x\n",
-				fsc->fs.h_u.tcp_ip4_spec.tos,
-				fsc->fs.m_u.tcp_ip4_spec.tos);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			break;
-		case AH_ESP_V4_FLOW:
-		case ESP_V4_FLOW:
-			sprintf(p, "\tSrc IP addr: 0x%x\n",
-				fsc->fs.h_u.ah_ip4_spec.ip4src);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			sprintf(p, "\tSrc IP mask: 0x%x\n",
-				fsc->fs.m_u.ah_ip4_spec.ip4src);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			sprintf(p, "\tDest IP addr: 0x%x\n",
-				fsc->fs.h_u.ah_ip4_spec.ip4dst);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			sprintf(p, "\tDest IP mask: 0x%x\n",
-				fsc->fs.m_u.ah_ip4_spec.ip4dst);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			sprintf(p, "\tSPI: %d, mask: 0x%x\n",
-				fsc->fs.h_u.ah_ip4_spec.spi,
-				fsc->fs.m_u.ah_ip4_spec.spi);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			sprintf(p, "\tTOS: %d, mask: 0x%x\n",
-				fsc->fs.h_u.ah_ip4_spec.tos,
-				fsc->fs.m_u.ah_ip4_spec.tos);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			break;
-		case IP_USER_FLOW:
-			sprintf(p, "\tSrc IP addr: 0x%x\n",
-				fsc->fs.h_u.usr_ip4_spec.ip4src);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			sprintf(p, "\tSrc IP mask: 0x%x\n",
-				fsc->fs.m_u.usr_ip4_spec.ip4src);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			sprintf(p, "\tDest IP addr: 0x%x\n",
-				fsc->fs.h_u.usr_ip4_spec.ip4dst);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			sprintf(p, "\tDest IP mask: 0x%x\n",
-				fsc->fs.m_u.usr_ip4_spec.ip4dst);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			break;
-		case IPV4_FLOW:
-			sprintf(p, "\tSrc IP addr: 0x%x\n",
-				fsc->fs.h_u.usr_ip4_spec.ip4src);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			sprintf(p, "\tSrc IP mask: 0x%x\n",
-				fsc->fs.m_u.usr_ip4_spec.ip4src);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			sprintf(p, "\tDest IP addr: 0x%x\n",
-				fsc->fs.h_u.usr_ip4_spec.ip4dst);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			sprintf(p, "\tDest IP mask: 0x%x\n",
-				fsc->fs.m_u.usr_ip4_spec.ip4dst);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			sprintf(p, "\tL4 bytes: 0x%x, mask: 0x%x\n",
-				fsc->fs.h_u.usr_ip4_spec.l4_4_bytes,
-				fsc->fs.m_u.usr_ip4_spec.l4_4_bytes);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			sprintf(p, "\tTOS: %d, mask: 0x%x\n",
-				fsc->fs.h_u.usr_ip4_spec.tos,
-				fsc->fs.m_u.usr_ip4_spec.tos);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			sprintf(p, "\tIP Version: %d, mask: 0x%x\n",
-				fsc->fs.h_u.usr_ip4_spec.ip_ver,
-				fsc->fs.m_u.usr_ip4_spec.ip_ver);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			sprintf(p, "\tProtocol: %d, mask: 0x%x\n",
-				fsc->fs.h_u.usr_ip4_spec.proto,
-				fsc->fs.m_u.usr_ip4_spec.proto);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			break;
-		}
-		sprintf(p, "\tVLAN: %d, mask: 0x%x\n",
-			fsc->fs.vlan_tag, fsc->fs.vlan_tag_mask);
-		p += ETH_GSTRING_LEN;
-		num_strings++;
-		sprintf(p, "\tUser-defined: 0x%Lx\n", fsc->fs.data);
-		p += ETH_GSTRING_LEN;
-		num_strings++;
-		sprintf(p, "\tUser-defined mask: 0x%Lx\n", fsc->fs.data_mask);
-		p += ETH_GSTRING_LEN;
-		num_strings++;
-		if (fsc->fs.action == ETHTOOL_RXNTUPLE_ACTION_DROP)
-			sprintf(p, "\tAction: Drop\n");
-		else
-			sprintf(p, "\tAction: Direct to queue %d\n",
-				fsc->fs.action);
-		p += ETH_GSTRING_LEN;
-		num_strings++;
-unknown_filter:
-		i++;
-	}
-copy:
-	/* indicate to userspace how many strings we actually have */
-	gstrings.len = num_strings;
-	ret = -EFAULT;
-	if (copy_to_user(useraddr, &gstrings, sizeof(gstrings)))
-		goto out;
-	useraddr += sizeof(gstrings);
-	if (copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN))
-		goto out;
-	ret = 0;
-
-out:
-	kfree(data);
-	return ret;
+	return ops->set_rx_ntuple(dev, &cmd);
 }
 
 static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
@@ -2101,9 +1797,6 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 	case ETHTOOL_SRXNTUPLE:
 		rc = ethtool_set_rx_ntuple(dev, useraddr);
 		break;
-	case ETHTOOL_GRXNTUPLE:
-		rc = ethtool_get_rx_ntuple(dev, useraddr);
-		break;
 	case ETHTOOL_GSSET_INFO:
 		rc = ethtool_get_sset_info(dev, useraddr);
 		break;
-- 
cgit v1.2.3


From 9ad7c049f0f79c418e293b1b68cf10d68f54fcdb Mon Sep 17 00:00:00 2001
From: Jerry Chu <hkchu@google.com>
Date: Wed, 8 Jun 2011 11:08:38 +0000
Subject: tcp: RFC2988bis + taking RTT sample from 3WHS for the passive open
 side

This patch lowers the default initRTO from 3secs to 1sec per
RFC2988bis. It falls back to 3secs if the SYN or SYN-ACK packet
has been retransmitted, AND the TCP timestamp option is not on.

It also adds support to take RTT sample during 3WHS on the passive
open side, just like its active open counterpart, and uses it, if
valid, to seed the initRTO for the data transmission phase.

The patch also resets ssthresh to its initial default at the
beginning of the data transmission phase, and reduces cwnd to 1 if
there has been MORE THAN ONE retransmission during 3WHS per RFC5681.

Signed-off-by: H.K. Jerry Chu <hkchu@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      |  1 +
 include/net/tcp.h        | 11 +++++++++--
 net/ipv4/syncookies.c    |  1 +
 net/ipv4/tcp_input.c     | 46 +++++++++++++++++++++++++---------------------
 net/ipv4/tcp_ipv4.c      | 11 ++++++++---
 net/ipv4/tcp_minisocks.c |  6 +++++-
 net/ipv6/syncookies.c    |  1 +
 net/ipv6/tcp_ipv6.c      |  5 +++++
 8 files changed, 55 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index e64f4c67d0ef..531ede8006d9 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -282,6 +282,7 @@ struct tcp_request_sock {
 #endif
 	u32				rcv_isn;
 	u32				snt_isn;
+	u32				snt_synack; /* synack sent time */
 };
 
 static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index cda30ea354a2..149a415d1e0a 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -122,7 +122,13 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
 #endif
 #define TCP_RTO_MAX	((unsigned)(120*HZ))
 #define TCP_RTO_MIN	((unsigned)(HZ/5))
-#define TCP_TIMEOUT_INIT ((unsigned)(3*HZ))	/* RFC 1122 initial RTO value	*/
+#define TCP_TIMEOUT_INIT ((unsigned)(1*HZ))	/* RFC2988bis initial RTO value	*/
+#define TCP_TIMEOUT_FALLBACK ((unsigned)(3*HZ))	/* RFC 1122 initial RTO value, now
+						 * used as a fallback RTO for the
+						 * initial data transmission if no
+						 * valid RTT sample has been acquired,
+						 * most likely due to retrans in 3WHS.
+						 */
 
 #define TCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ/2U)) /* Maximal interval between probes
 					                 * for local resources.
@@ -295,7 +301,7 @@ static inline void tcp_synq_overflow(struct sock *sk)
 static inline int tcp_synq_no_recent_overflow(const struct sock *sk)
 {
 	unsigned long last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp;
-	return time_after(jiffies, last_overflow + TCP_TIMEOUT_INIT);
+	return time_after(jiffies, last_overflow + TCP_TIMEOUT_FALLBACK);
 }
 
 extern struct proto tcp_prot;
@@ -508,6 +514,7 @@ extern void tcp_initialize_rcv_mss(struct sock *sk);
 extern int tcp_mtu_to_mss(struct sock *sk, int pmtu);
 extern int tcp_mss_to_mtu(struct sock *sk, int mss);
 extern void tcp_mtup_init(struct sock *sk);
+extern void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt);
 
 static inline void tcp_bound_rto(const struct sock *sk)
 {
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 26461492a847..92bb9434b338 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -316,6 +316,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 	ireq->wscale_ok		= tcp_opt.wscale_ok;
 	ireq->tstamp_ok		= tcp_opt.saw_tstamp;
 	req->ts_recent		= tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
+	treq->snt_synack	= tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0;
 
 	/* We throwed the options of the initial SYN away, so we hope
 	 * the ACK carries the same options again (see RFC1122 4.2.3.8)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index bef9f04c22ba..ea0d2183df4b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -880,6 +880,11 @@ static void tcp_init_metrics(struct sock *sk)
 		tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH);
 		if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
 			tp->snd_ssthresh = tp->snd_cwnd_clamp;
+	} else {
+		/* ssthresh may have been reduced unnecessarily during.
+		 * 3WHS. Restore it back to its initial default.
+		 */
+		tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
 	}
 	if (dst_metric(dst, RTAX_REORDERING) &&
 	    tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
@@ -887,10 +892,7 @@ static void tcp_init_metrics(struct sock *sk)
 		tp->reordering = dst_metric(dst, RTAX_REORDERING);
 	}
 
-	if (dst_metric(dst, RTAX_RTT) == 0)
-		goto reset;
-
-	if (!tp->srtt && dst_metric_rtt(dst, RTAX_RTT) < (TCP_TIMEOUT_INIT << 3))
+	if (dst_metric(dst, RTAX_RTT) == 0 || tp->srtt == 0)
 		goto reset;
 
 	/* Initial rtt is determined from SYN,SYN-ACK.
@@ -916,19 +918,26 @@ static void tcp_init_metrics(struct sock *sk)
 		tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
 	}
 	tcp_set_rto(sk);
-	if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) {
 reset:
-		/* Play conservative. If timestamps are not
-		 * supported, TCP will fail to recalculate correct
-		 * rtt, if initial rto is too small. FORGET ALL AND RESET!
+	if (tp->srtt == 0) {
+		/* RFC2988bis: We've failed to get a valid RTT sample from
+		 * 3WHS. This is most likely due to retransmission,
+		 * including spurious one. Reset the RTO back to 3secs
+		 * from the more aggressive 1sec to avoid more spurious
+		 * retransmission.
 		 */
-		if (!tp->rx_opt.saw_tstamp && tp->srtt) {
-			tp->srtt = 0;
-			tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT;
-			inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
-		}
+		tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK;
+		inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
 	}
-	tp->snd_cwnd = tcp_init_cwnd(tp, dst);
+	/* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
+	 * retransmitted. In light of RFC2988bis' more aggressive 1sec
+	 * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
+	 * retransmission has occurred.
+	 */
+	if (tp->total_retrans > 1)
+		tp->snd_cwnd = 1;
+	else
+		tp->snd_cwnd = tcp_init_cwnd(tp, dst);
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
@@ -3112,12 +3121,13 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
 	tcp_xmit_retransmit_queue(sk);
 }
 
-static void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt)
+void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt)
 {
 	tcp_rtt_estimator(sk, seq_rtt);
 	tcp_set_rto(sk);
 	inet_csk(sk)->icsk_backoff = 0;
 }
+EXPORT_SYMBOL(tcp_valid_rtt_meas);
 
 /* Read draft-ietf-tcplw-high-performance before mucking
  * with this code. (Supersedes RFC1323)
@@ -5806,12 +5816,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 					      tp->rx_opt.snd_wscale;
 				tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
 
-				/* tcp_ack considers this ACK as duplicate
-				 * and does not calculate rtt.
-				 * Force it here.
-				 */
-				tcp_ack_update_rtt(sk, 0, 0);
-
 				if (tp->rx_opt.tstamp_ok)
 					tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a7d6671e33b8..617dee3ccfb1 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -429,8 +429,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 			break;
 
 		icsk->icsk_backoff--;
-		inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
-					 icsk->icsk_backoff;
+		inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
+			TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
 		tcp_bound_rto(sk);
 
 		skb = tcp_write_queue_head(sk);
@@ -1384,6 +1384,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 		isn = tcp_v4_init_sequence(skb);
 	}
 	tcp_rsk(req)->snt_isn = isn;
+	tcp_rsk(req)->snt_synack = tcp_time_stamp;
 
 	if (tcp_v4_send_synack(sk, dst, req,
 			       (struct request_values *)&tmp_ext) ||
@@ -1458,6 +1459,10 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
 
 	tcp_initialize_rcv_mss(newsk);
+	if (tcp_rsk(req)->snt_synack)
+		tcp_valid_rtt_meas(newsk,
+		    tcp_time_stamp - tcp_rsk(req)->snt_synack);
+	newtp->total_retrans = req->retrans;
 
 #ifdef CONFIG_TCP_MD5SIG
 	/* Copy over the MD5 key from the original socket */
@@ -1854,7 +1859,7 @@ static int tcp_v4_init_sock(struct sock *sk)
 	 * algorithms that we must have the following bandaid to talk
 	 * efficiently to them.  -DaveM
 	 */
-	tp->snd_cwnd = 2;
+	tp->snd_cwnd = TCP_INIT_CWND;
 
 	/* See draft-stevens-tcpca-spec-01 for discussion of the
 	 * initialization of these values.
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 80b1f80759ab..d2fe4e06b472 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -486,7 +486,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		 * algorithms that we must have the following bandaid to talk
 		 * efficiently to them.  -DaveM
 		 */
-		newtp->snd_cwnd = 2;
+		newtp->snd_cwnd = TCP_INIT_CWND;
 		newtp->snd_cwnd_cnt = 0;
 		newtp->bytes_acked = 0;
 
@@ -720,6 +720,10 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
 		return NULL;
 	}
+	if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr)
+		tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr;
+	else if (req->retrans) /* don't take RTT sample if retrans && ~TS */
+		tcp_rsk(req)->snt_synack = 0;
 
 	/* OK, ACK is valid, create big socket and
 	 * feed this segment to it. It will repeat all
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 8b9644a8b697..89d5bf806222 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -223,6 +223,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 	ireq->wscale_ok		= tcp_opt.wscale_ok;
 	ireq->tstamp_ok		= tcp_opt.saw_tstamp;
 	req->ts_recent		= tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
+	treq->snt_synack	= tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0;
 	treq->rcv_isn = ntohl(th->seq) - 1;
 	treq->snt_isn = cookie;
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index d1fd28711ba5..a1ef61a889c3 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1341,6 +1341,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	}
 have_isn:
 	tcp_rsk(req)->snt_isn = isn;
+	tcp_rsk(req)->snt_synack = tcp_time_stamp;
 
 	security_inet_conn_request(sk, skb, req);
 
@@ -1509,6 +1510,10 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	tcp_sync_mss(newsk, dst_mtu(dst));
 	newtp->advmss = dst_metric_advmss(dst);
 	tcp_initialize_rcv_mss(newsk);
+	if (tcp_rsk(req)->snt_synack)
+		tcp_valid_rtt_meas(newsk,
+		    tcp_time_stamp - tcp_rsk(req)->snt_synack);
+	newtp->total_retrans = req->retrans;
 
 	newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6;
 	newinet->inet_rcv_saddr = LOOPBACK4_IPV6;
-- 
cgit v1.2.3


From 76369139ceb955deefc509e6e12ce9d6ce50ccab Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 19 May 2011 19:55:04 +0200
Subject: perf: Split up buffer handling from core code

And create the internal perf events header.

v2: Keep an internal inlined perf_output_copy()

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Stephane Eranian <eranian@google.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Link: http://lkml.kernel.org/r/1305827704-5607-1-git-send-email-fweisbec@gmail.com
[ v3: use clearer 'ring_buffer' and 'rb' naming ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_event.h  |  33 +--
 kernel/events/Makefile      |   2 +-
 kernel/events/core.c        | 568 ++++++--------------------------------------
 kernel/events/internal.h    |  97 ++++++++
 kernel/events/ring_buffer.c | 399 +++++++++++++++++++++++++++++++
 5 files changed, 572 insertions(+), 527 deletions(-)
 create mode 100644 kernel/events/internal.h
 create mode 100644 kernel/events/ring_buffer.c

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 3412684ce5d5..779f6ed54d52 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -680,33 +680,6 @@ enum perf_event_active_state {
 };
 
 struct file;
-
-#define PERF_BUFFER_WRITABLE		0x01
-
-struct perf_buffer {
-	atomic_t			refcount;
-	struct rcu_head			rcu_head;
-#ifdef CONFIG_PERF_USE_VMALLOC
-	struct work_struct		work;
-	int				page_order;	/* allocation order  */
-#endif
-	int				nr_pages;	/* nr of data pages  */
-	int				writable;	/* are we writable   */
-
-	atomic_t			poll;		/* POLL_ for wakeups */
-
-	local_t				head;		/* write position    */
-	local_t				nest;		/* nested writers    */
-	local_t				events;		/* event limit       */
-	local_t				wakeup;		/* wakeup stamp      */
-	local_t				lost;		/* nr records lost   */
-
-	long				watermark;	/* wakeup watermark  */
-
-	struct perf_event_mmap_page	*user_page;
-	void				*data_pages[0];
-};
-
 struct perf_sample_data;
 
 typedef void (*perf_overflow_handler_t)(struct perf_event *, int,
@@ -745,6 +718,8 @@ struct perf_cgroup {
 };
 #endif
 
+struct ring_buffer;
+
 /**
  * struct perf_event - performance event kernel representation:
  */
@@ -834,7 +809,7 @@ struct perf_event {
 	atomic_t			mmap_count;
 	int				mmap_locked;
 	struct user_struct		*mmap_user;
-	struct perf_buffer		*buffer;
+	struct ring_buffer		*rb;
 
 	/* poll related */
 	wait_queue_head_t		waitq;
@@ -945,7 +920,7 @@ struct perf_cpu_context {
 
 struct perf_output_handle {
 	struct perf_event		*event;
-	struct perf_buffer		*buffer;
+	struct ring_buffer		*rb;
 	unsigned long			wakeup;
 	unsigned long			size;
 	void				*addr;
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 1ce23d3d8394..89e5e8aa4c36 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -2,5 +2,5 @@ ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_core.o = -pg
 endif
 
-obj-y := core.o
+obj-y := core.o ring_buffer.o
 obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5e8c7b1389bc..5e70f62752a2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -36,6 +36,8 @@
 #include <linux/ftrace_event.h>
 #include <linux/hw_breakpoint.h>
 
+#include "internal.h"
+
 #include <asm/irq_regs.h>
 
 struct remote_function_call {
@@ -2886,7 +2888,7 @@ static void free_event_rcu(struct rcu_head *head)
 	kfree(event);
 }
 
-static void perf_buffer_put(struct perf_buffer *buffer);
+static void ring_buffer_put(struct ring_buffer *rb);
 
 static void free_event(struct perf_event *event)
 {
@@ -2909,9 +2911,9 @@ static void free_event(struct perf_event *event)
 		}
 	}
 
-	if (event->buffer) {
-		perf_buffer_put(event->buffer);
-		event->buffer = NULL;
+	if (event->rb) {
+		ring_buffer_put(event->rb);
+		event->rb = NULL;
 	}
 
 	if (is_cgroup_event(event))
@@ -3139,13 +3141,13 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 static unsigned int perf_poll(struct file *file, poll_table *wait)
 {
 	struct perf_event *event = file->private_data;
-	struct perf_buffer *buffer;
+	struct ring_buffer *rb;
 	unsigned int events = POLL_HUP;
 
 	rcu_read_lock();
-	buffer = rcu_dereference(event->buffer);
-	if (buffer)
-		events = atomic_xchg(&buffer->poll, 0);
+	rb = rcu_dereference(event->rb);
+	if (rb)
+		events = atomic_xchg(&rb->poll, 0);
 	rcu_read_unlock();
 
 	poll_wait(file, &event->waitq, wait);
@@ -3356,14 +3358,14 @@ static int perf_event_index(struct perf_event *event)
 void perf_event_update_userpage(struct perf_event *event)
 {
 	struct perf_event_mmap_page *userpg;
-	struct perf_buffer *buffer;
+	struct ring_buffer *rb;
 
 	rcu_read_lock();
-	buffer = rcu_dereference(event->buffer);
-	if (!buffer)
+	rb = rcu_dereference(event->rb);
+	if (!rb)
 		goto unlock;
 
-	userpg = buffer->user_page;
+	userpg = rb->user_page;
 
 	/*
 	 * Disable preemption so as to not let the corresponding user-space
@@ -3390,220 +3392,10 @@ unlock:
 	rcu_read_unlock();
 }
 
-static unsigned long perf_data_size(struct perf_buffer *buffer);
-
-static void
-perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags)
-{
-	long max_size = perf_data_size(buffer);
-
-	if (watermark)
-		buffer->watermark = min(max_size, watermark);
-
-	if (!buffer->watermark)
-		buffer->watermark = max_size / 2;
-
-	if (flags & PERF_BUFFER_WRITABLE)
-		buffer->writable = 1;
-
-	atomic_set(&buffer->refcount, 1);
-}
-
-#ifndef CONFIG_PERF_USE_VMALLOC
-
-/*
- * Back perf_mmap() with regular GFP_KERNEL-0 pages.
- */
-
-static struct page *
-perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
-{
-	if (pgoff > buffer->nr_pages)
-		return NULL;
-
-	if (pgoff == 0)
-		return virt_to_page(buffer->user_page);
-
-	return virt_to_page(buffer->data_pages[pgoff - 1]);
-}
-
-static void *perf_mmap_alloc_page(int cpu)
-{
-	struct page *page;
-	int node;
-
-	node = (cpu == -1) ? cpu : cpu_to_node(cpu);
-	page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
-	if (!page)
-		return NULL;
-
-	return page_address(page);
-}
-
-static struct perf_buffer *
-perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
-{
-	struct perf_buffer *buffer;
-	unsigned long size;
-	int i;
-
-	size = sizeof(struct perf_buffer);
-	size += nr_pages * sizeof(void *);
-
-	buffer = kzalloc(size, GFP_KERNEL);
-	if (!buffer)
-		goto fail;
-
-	buffer->user_page = perf_mmap_alloc_page(cpu);
-	if (!buffer->user_page)
-		goto fail_user_page;
-
-	for (i = 0; i < nr_pages; i++) {
-		buffer->data_pages[i] = perf_mmap_alloc_page(cpu);
-		if (!buffer->data_pages[i])
-			goto fail_data_pages;
-	}
-
-	buffer->nr_pages = nr_pages;
-
-	perf_buffer_init(buffer, watermark, flags);
-
-	return buffer;
-
-fail_data_pages:
-	for (i--; i >= 0; i--)
-		free_page((unsigned long)buffer->data_pages[i]);
-
-	free_page((unsigned long)buffer->user_page);
-
-fail_user_page:
-	kfree(buffer);
-
-fail:
-	return NULL;
-}
-
-static void perf_mmap_free_page(unsigned long addr)
-{
-	struct page *page = virt_to_page((void *)addr);
-
-	page->mapping = NULL;
-	__free_page(page);
-}
-
-static void perf_buffer_free(struct perf_buffer *buffer)
-{
-	int i;
-
-	perf_mmap_free_page((unsigned long)buffer->user_page);
-	for (i = 0; i < buffer->nr_pages; i++)
-		perf_mmap_free_page((unsigned long)buffer->data_pages[i]);
-	kfree(buffer);
-}
-
-static inline int page_order(struct perf_buffer *buffer)
-{
-	return 0;
-}
-
-#else
-
-/*
- * Back perf_mmap() with vmalloc memory.
- *
- * Required for architectures that have d-cache aliasing issues.
- */
-
-static inline int page_order(struct perf_buffer *buffer)
-{
-	return buffer->page_order;
-}
-
-static struct page *
-perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
-{
-	if (pgoff > (1UL << page_order(buffer)))
-		return NULL;
-
-	return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE);
-}
-
-static void perf_mmap_unmark_page(void *addr)
-{
-	struct page *page = vmalloc_to_page(addr);
-
-	page->mapping = NULL;
-}
-
-static void perf_buffer_free_work(struct work_struct *work)
-{
-	struct perf_buffer *buffer;
-	void *base;
-	int i, nr;
-
-	buffer = container_of(work, struct perf_buffer, work);
-	nr = 1 << page_order(buffer);
-
-	base = buffer->user_page;
-	for (i = 0; i < nr + 1; i++)
-		perf_mmap_unmark_page(base + (i * PAGE_SIZE));
-
-	vfree(base);
-	kfree(buffer);
-}
-
-static void perf_buffer_free(struct perf_buffer *buffer)
-{
-	schedule_work(&buffer->work);
-}
-
-static struct perf_buffer *
-perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
-{
-	struct perf_buffer *buffer;
-	unsigned long size;
-	void *all_buf;
-
-	size = sizeof(struct perf_buffer);
-	size += sizeof(void *);
-
-	buffer = kzalloc(size, GFP_KERNEL);
-	if (!buffer)
-		goto fail;
-
-	INIT_WORK(&buffer->work, perf_buffer_free_work);
-
-	all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
-	if (!all_buf)
-		goto fail_all_buf;
-
-	buffer->user_page = all_buf;
-	buffer->data_pages[0] = all_buf + PAGE_SIZE;
-	buffer->page_order = ilog2(nr_pages);
-	buffer->nr_pages = 1;
-
-	perf_buffer_init(buffer, watermark, flags);
-
-	return buffer;
-
-fail_all_buf:
-	kfree(buffer);
-
-fail:
-	return NULL;
-}
-
-#endif
-
-static unsigned long perf_data_size(struct perf_buffer *buffer)
-{
-	return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer));
-}
-
 static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct perf_event *event = vma->vm_file->private_data;
-	struct perf_buffer *buffer;
+	struct ring_buffer *rb;
 	int ret = VM_FAULT_SIGBUS;
 
 	if (vmf->flags & FAULT_FLAG_MKWRITE) {
@@ -3613,14 +3405,14 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	}
 
 	rcu_read_lock();
-	buffer = rcu_dereference(event->buffer);
-	if (!buffer)
+	rb = rcu_dereference(event->rb);
+	if (!rb)
 		goto unlock;
 
 	if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
 		goto unlock;
 
-	vmf->page = perf_mmap_to_page(buffer, vmf->pgoff);
+	vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
 	if (!vmf->page)
 		goto unlock;
 
@@ -3635,35 +3427,35 @@ unlock:
 	return ret;
 }
 
-static void perf_buffer_free_rcu(struct rcu_head *rcu_head)
+static void rb_free_rcu(struct rcu_head *rcu_head)
 {
-	struct perf_buffer *buffer;
+	struct ring_buffer *rb;
 
-	buffer = container_of(rcu_head, struct perf_buffer, rcu_head);
-	perf_buffer_free(buffer);
+	rb = container_of(rcu_head, struct ring_buffer, rcu_head);
+	rb_free(rb);
 }
 
-static struct perf_buffer *perf_buffer_get(struct perf_event *event)
+static struct ring_buffer *ring_buffer_get(struct perf_event *event)
 {
-	struct perf_buffer *buffer;
+	struct ring_buffer *rb;
 
 	rcu_read_lock();
-	buffer = rcu_dereference(event->buffer);
-	if (buffer) {
-		if (!atomic_inc_not_zero(&buffer->refcount))
-			buffer = NULL;
+	rb = rcu_dereference(event->rb);
+	if (rb) {
+		if (!atomic_inc_not_zero(&rb->refcount))
+			rb = NULL;
 	}
 	rcu_read_unlock();
 
-	return buffer;
+	return rb;
 }
 
-static void perf_buffer_put(struct perf_buffer *buffer)
+static void ring_buffer_put(struct ring_buffer *rb)
 {
-	if (!atomic_dec_and_test(&buffer->refcount))
+	if (!atomic_dec_and_test(&rb->refcount))
 		return;
 
-	call_rcu(&buffer->rcu_head, perf_buffer_free_rcu);
+	call_rcu(&rb->rcu_head, rb_free_rcu);
 }
 
 static void perf_mmap_open(struct vm_area_struct *vma)
@@ -3678,16 +3470,16 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 	struct perf_event *event = vma->vm_file->private_data;
 
 	if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
-		unsigned long size = perf_data_size(event->buffer);
+		unsigned long size = perf_data_size(event->rb);
 		struct user_struct *user = event->mmap_user;
-		struct perf_buffer *buffer = event->buffer;
+		struct ring_buffer *rb = event->rb;
 
 		atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
 		vma->vm_mm->locked_vm -= event->mmap_locked;
-		rcu_assign_pointer(event->buffer, NULL);
+		rcu_assign_pointer(event->rb, NULL);
 		mutex_unlock(&event->mmap_mutex);
 
-		perf_buffer_put(buffer);
+		ring_buffer_put(rb);
 		free_uid(user);
 	}
 }
@@ -3705,7 +3497,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	unsigned long user_locked, user_lock_limit;
 	struct user_struct *user = current_user();
 	unsigned long locked, lock_limit;
-	struct perf_buffer *buffer;
+	struct ring_buffer *rb;
 	unsigned long vma_size;
 	unsigned long nr_pages;
 	long user_extra, extra;
@@ -3714,7 +3506,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	/*
 	 * Don't allow mmap() of inherited per-task counters. This would
 	 * create a performance issue due to all children writing to the
-	 * same buffer.
+	 * same rb.
 	 */
 	if (event->cpu == -1 && event->attr.inherit)
 		return -EINVAL;
@@ -3726,7 +3518,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	nr_pages = (vma_size / PAGE_SIZE) - 1;
 
 	/*
-	 * If we have buffer pages ensure they're a power-of-two number, so we
+	 * If we have rb pages ensure they're a power-of-two number, so we
 	 * can do bitmasks instead of modulo.
 	 */
 	if (nr_pages != 0 && !is_power_of_2(nr_pages))
@@ -3740,9 +3532,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 
 	WARN_ON_ONCE(event->ctx->parent_ctx);
 	mutex_lock(&event->mmap_mutex);
-	if (event->buffer) {
-		if (event->buffer->nr_pages == nr_pages)
-			atomic_inc(&event->buffer->refcount);
+	if (event->rb) {
+		if (event->rb->nr_pages == nr_pages)
+			atomic_inc(&event->rb->refcount);
 		else
 			ret = -EINVAL;
 		goto unlock;
@@ -3772,18 +3564,18 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		goto unlock;
 	}
 
-	WARN_ON(event->buffer);
+	WARN_ON(event->rb);
 
 	if (vma->vm_flags & VM_WRITE)
-		flags |= PERF_BUFFER_WRITABLE;
+		flags |= RING_BUFFER_WRITABLE;
 
-	buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark,
+	rb = rb_alloc(nr_pages, event->attr.wakeup_watermark,
 				   event->cpu, flags);
-	if (!buffer) {
+	if (!rb) {
 		ret = -ENOMEM;
 		goto unlock;
 	}
-	rcu_assign_pointer(event->buffer, buffer);
+	rcu_assign_pointer(event->rb, rb);
 
 	atomic_long_add(user_extra, &user->locked_vm);
 	event->mmap_locked = extra;
@@ -3882,117 +3674,6 @@ int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
 }
 EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
 
-/*
- * Output
- */
-static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail,
-			      unsigned long offset, unsigned long head)
-{
-	unsigned long mask;
-
-	if (!buffer->writable)
-		return true;
-
-	mask = perf_data_size(buffer) - 1;
-
-	offset = (offset - tail) & mask;
-	head   = (head   - tail) & mask;
-
-	if ((int)(head - offset) < 0)
-		return false;
-
-	return true;
-}
-
-static void perf_output_wakeup(struct perf_output_handle *handle)
-{
-	atomic_set(&handle->buffer->poll, POLL_IN);
-
-	if (handle->nmi) {
-		handle->event->pending_wakeup = 1;
-		irq_work_queue(&handle->event->pending);
-	} else
-		perf_event_wakeup(handle->event);
-}
-
-/*
- * We need to ensure a later event_id doesn't publish a head when a former
- * event isn't done writing. However since we need to deal with NMIs we
- * cannot fully serialize things.
- *
- * We only publish the head (and generate a wakeup) when the outer-most
- * event completes.
- */
-static void perf_output_get_handle(struct perf_output_handle *handle)
-{
-	struct perf_buffer *buffer = handle->buffer;
-
-	preempt_disable();
-	local_inc(&buffer->nest);
-	handle->wakeup = local_read(&buffer->wakeup);
-}
-
-static void perf_output_put_handle(struct perf_output_handle *handle)
-{
-	struct perf_buffer *buffer = handle->buffer;
-	unsigned long head;
-
-again:
-	head = local_read(&buffer->head);
-
-	/*
-	 * IRQ/NMI can happen here, which means we can miss a head update.
-	 */
-
-	if (!local_dec_and_test(&buffer->nest))
-		goto out;
-
-	/*
-	 * Publish the known good head. Rely on the full barrier implied
-	 * by atomic_dec_and_test() order the buffer->head read and this
-	 * write.
-	 */
-	buffer->user_page->data_head = head;
-
-	/*
-	 * Now check if we missed an update, rely on the (compiler)
-	 * barrier in atomic_dec_and_test() to re-read buffer->head.
-	 */
-	if (unlikely(head != local_read(&buffer->head))) {
-		local_inc(&buffer->nest);
-		goto again;
-	}
-
-	if (handle->wakeup != local_read(&buffer->wakeup))
-		perf_output_wakeup(handle);
-
-out:
-	preempt_enable();
-}
-
-__always_inline void perf_output_copy(struct perf_output_handle *handle,
-		      const void *buf, unsigned int len)
-{
-	do {
-		unsigned long size = min_t(unsigned long, handle->size, len);
-
-		memcpy(handle->addr, buf, size);
-
-		len -= size;
-		handle->addr += size;
-		buf += size;
-		handle->size -= size;
-		if (!handle->size) {
-			struct perf_buffer *buffer = handle->buffer;
-
-			handle->page++;
-			handle->page &= buffer->nr_pages - 1;
-			handle->addr = buffer->data_pages[handle->page];
-			handle->size = PAGE_SIZE << page_order(buffer);
-		}
-	} while (len);
-}
-
 static void __perf_event_header__init_id(struct perf_event_header *header,
 					 struct perf_sample_data *data,
 					 struct perf_event *event)
@@ -4023,9 +3704,9 @@ static void __perf_event_header__init_id(struct perf_event_header *header,
 	}
 }
 
-static void perf_event_header__init_id(struct perf_event_header *header,
-				       struct perf_sample_data *data,
-				       struct perf_event *event)
+void perf_event_header__init_id(struct perf_event_header *header,
+				struct perf_sample_data *data,
+				struct perf_event *event)
 {
 	if (event->attr.sample_id_all)
 		__perf_event_header__init_id(header, data, event);
@@ -4052,121 +3733,14 @@ static void __perf_event__output_id_sample(struct perf_output_handle *handle,
 		perf_output_put(handle, data->cpu_entry);
 }
 
-static void perf_event__output_id_sample(struct perf_event *event,
-					 struct perf_output_handle *handle,
-					 struct perf_sample_data *sample)
+void perf_event__output_id_sample(struct perf_event *event,
+				  struct perf_output_handle *handle,
+				  struct perf_sample_data *sample)
 {
 	if (event->attr.sample_id_all)
 		__perf_event__output_id_sample(handle, sample);
 }
 
-int perf_output_begin(struct perf_output_handle *handle,
-		      struct perf_event *event, unsigned int size,
-		      int nmi, int sample)
-{
-	struct perf_buffer *buffer;
-	unsigned long tail, offset, head;
-	int have_lost;
-	struct perf_sample_data sample_data;
-	struct {
-		struct perf_event_header header;
-		u64			 id;
-		u64			 lost;
-	} lost_event;
-
-	rcu_read_lock();
-	/*
-	 * For inherited events we send all the output towards the parent.
-	 */
-	if (event->parent)
-		event = event->parent;
-
-	buffer = rcu_dereference(event->buffer);
-	if (!buffer)
-		goto out;
-
-	handle->buffer	= buffer;
-	handle->event	= event;
-	handle->nmi	= nmi;
-	handle->sample	= sample;
-
-	if (!buffer->nr_pages)
-		goto out;
-
-	have_lost = local_read(&buffer->lost);
-	if (have_lost) {
-		lost_event.header.size = sizeof(lost_event);
-		perf_event_header__init_id(&lost_event.header, &sample_data,
-					   event);
-		size += lost_event.header.size;
-	}
-
-	perf_output_get_handle(handle);
-
-	do {
-		/*
-		 * Userspace could choose to issue a mb() before updating the
-		 * tail pointer. So that all reads will be completed before the
-		 * write is issued.
-		 */
-		tail = ACCESS_ONCE(buffer->user_page->data_tail);
-		smp_rmb();
-		offset = head = local_read(&buffer->head);
-		head += size;
-		if (unlikely(!perf_output_space(buffer, tail, offset, head)))
-			goto fail;
-	} while (local_cmpxchg(&buffer->head, offset, head) != offset);
-
-	if (head - local_read(&buffer->wakeup) > buffer->watermark)
-		local_add(buffer->watermark, &buffer->wakeup);
-
-	handle->page = offset >> (PAGE_SHIFT + page_order(buffer));
-	handle->page &= buffer->nr_pages - 1;
-	handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1);
-	handle->addr = buffer->data_pages[handle->page];
-	handle->addr += handle->size;
-	handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size;
-
-	if (have_lost) {
-		lost_event.header.type = PERF_RECORD_LOST;
-		lost_event.header.misc = 0;
-		lost_event.id          = event->id;
-		lost_event.lost        = local_xchg(&buffer->lost, 0);
-
-		perf_output_put(handle, lost_event);
-		perf_event__output_id_sample(event, handle, &sample_data);
-	}
-
-	return 0;
-
-fail:
-	local_inc(&buffer->lost);
-	perf_output_put_handle(handle);
-out:
-	rcu_read_unlock();
-
-	return -ENOSPC;
-}
-
-void perf_output_end(struct perf_output_handle *handle)
-{
-	struct perf_event *event = handle->event;
-	struct perf_buffer *buffer = handle->buffer;
-
-	int wakeup_events = event->attr.wakeup_events;
-
-	if (handle->sample && wakeup_events) {
-		int events = local_inc_return(&buffer->events);
-		if (events >= wakeup_events) {
-			local_sub(wakeup_events, &buffer->events);
-			local_inc(&buffer->wakeup);
-		}
-	}
-
-	perf_output_put_handle(handle);
-	rcu_read_unlock();
-}
-
 static void perf_output_read_one(struct perf_output_handle *handle,
 				 struct perf_event *event,
 				 u64 enabled, u64 running)
@@ -4187,7 +3761,7 @@ static void perf_output_read_one(struct perf_output_handle *handle,
 	if (read_format & PERF_FORMAT_ID)
 		values[n++] = primary_event_id(event);
 
-	perf_output_copy(handle, values, n * sizeof(u64));
+	__output_copy(handle, values, n * sizeof(u64));
 }
 
 /*
@@ -4217,7 +3791,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
 	if (read_format & PERF_FORMAT_ID)
 		values[n++] = primary_event_id(leader);
 
-	perf_output_copy(handle, values, n * sizeof(u64));
+	__output_copy(handle, values, n * sizeof(u64));
 
 	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
 		n = 0;
@@ -4229,7 +3803,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
 		if (read_format & PERF_FORMAT_ID)
 			values[n++] = primary_event_id(sub);
 
-		perf_output_copy(handle, values, n * sizeof(u64));
+		__output_copy(handle, values, n * sizeof(u64));
 	}
 }
 
@@ -4309,7 +3883,7 @@ void perf_output_sample(struct perf_output_handle *handle,
 
 			size *= sizeof(u64);
 
-			perf_output_copy(handle, data->callchain, size);
+			__output_copy(handle, data->callchain, size);
 		} else {
 			u64 nr = 0;
 			perf_output_put(handle, nr);
@@ -4319,8 +3893,8 @@ void perf_output_sample(struct perf_output_handle *handle,
 	if (sample_type & PERF_SAMPLE_RAW) {
 		if (data->raw) {
 			perf_output_put(handle, data->raw->size);
-			perf_output_copy(handle, data->raw->data,
-					 data->raw->size);
+			__output_copy(handle, data->raw->data,
+					   data->raw->size);
 		} else {
 			struct {
 				u32	size;
@@ -4617,7 +4191,7 @@ static void perf_event_comm_output(struct perf_event *event,
 	comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
 
 	perf_output_put(&handle, comm_event->event_id);
-	perf_output_copy(&handle, comm_event->comm,
+	__output_copy(&handle, comm_event->comm,
 				   comm_event->comm_size);
 
 	perf_event__output_id_sample(event, &handle, &sample);
@@ -4763,7 +4337,7 @@ static void perf_event_mmap_output(struct perf_event *event,
 	mmap_event->event_id.tid = perf_event_tid(event, current);
 
 	perf_output_put(&handle, mmap_event->event_id);
-	perf_output_copy(&handle, mmap_event->file_name,
+	__output_copy(&handle, mmap_event->file_name,
 				   mmap_event->file_size);
 
 	perf_event__output_id_sample(event, &handle, &sample);
@@ -4819,7 +4393,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
 
 	if (file) {
 		/*
-		 * d_path works from the end of the buffer backwards, so we
+		 * d_path works from the end of the rb backwards, so we
 		 * need to add enough zero bytes after the string to handle
 		 * the 64bit alignment we do later.
 		 */
@@ -6346,7 +5920,7 @@ err_size:
 static int
 perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
 {
-	struct perf_buffer *buffer = NULL, *old_buffer = NULL;
+	struct ring_buffer *rb = NULL, *old_rb = NULL;
 	int ret = -EINVAL;
 
 	if (!output_event)
@@ -6363,7 +5937,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
 		goto out;
 
 	/*
-	 * If its not a per-cpu buffer, it must be the same task.
+	 * If its not a per-cpu rb, it must be the same task.
 	 */
 	if (output_event->cpu == -1 && output_event->ctx != event->ctx)
 		goto out;
@@ -6375,20 +5949,20 @@ set:
 		goto unlock;
 
 	if (output_event) {
-		/* get the buffer we want to redirect to */
-		buffer = perf_buffer_get(output_event);
-		if (!buffer)
+		/* get the rb we want to redirect to */
+		rb = ring_buffer_get(output_event);
+		if (!rb)
 			goto unlock;
 	}
 
-	old_buffer = event->buffer;
-	rcu_assign_pointer(event->buffer, buffer);
+	old_rb = event->rb;
+	rcu_assign_pointer(event->rb, rb);
 	ret = 0;
 unlock:
 	mutex_unlock(&event->mmap_mutex);
 
-	if (old_buffer)
-		perf_buffer_put(old_buffer);
+	if (old_rb)
+		ring_buffer_put(old_rb);
 out:
 	return ret;
 }
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
new file mode 100644
index 000000000000..114f27f3a624
--- /dev/null
+++ b/kernel/events/internal.h
@@ -0,0 +1,97 @@
+#ifndef _KERNEL_EVENTS_INTERNAL_H
+#define _KERNEL_EVENTS_INTERNAL_H
+
+#define RING_BUFFER_WRITABLE		0x01
+
+struct ring_buffer {
+	atomic_t			refcount;
+	struct rcu_head			rcu_head;
+#ifdef CONFIG_PERF_USE_VMALLOC
+	struct work_struct		work;
+	int				page_order;	/* allocation order  */
+#endif
+	int				nr_pages;	/* nr of data pages  */
+	int				writable;	/* are we writable   */
+
+	atomic_t			poll;		/* POLL_ for wakeups */
+
+	local_t				head;		/* write position    */
+	local_t				nest;		/* nested writers    */
+	local_t				events;		/* event limit       */
+	local_t				wakeup;		/* wakeup stamp      */
+	local_t				lost;		/* nr records lost   */
+
+	long				watermark;	/* wakeup watermark  */
+
+	struct perf_event_mmap_page	*user_page;
+	void				*data_pages[0];
+};
+
+
+extern void rb_free(struct ring_buffer *rb);
+extern struct ring_buffer *
+rb_alloc(int nr_pages, long watermark, int cpu, int flags);
+extern void perf_event_wakeup(struct perf_event *event);
+
+extern void
+perf_event_header__init_id(struct perf_event_header *header,
+			   struct perf_sample_data *data,
+			   struct perf_event *event);
+extern void
+perf_event__output_id_sample(struct perf_event *event,
+			     struct perf_output_handle *handle,
+			     struct perf_sample_data *sample);
+
+extern struct page *
+perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff);
+
+#ifdef CONFIG_PERF_USE_VMALLOC
+/*
+ * Back perf_mmap() with vmalloc memory.
+ *
+ * Required for architectures that have d-cache aliasing issues.
+ */
+
+static inline int page_order(struct ring_buffer *rb)
+{
+	return rb->page_order;
+}
+
+#else
+
+static inline int page_order(struct ring_buffer *rb)
+{
+	return 0;
+}
+#endif
+
+static unsigned long perf_data_size(struct ring_buffer *rb)
+{
+	return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
+}
+
+static inline void
+__output_copy(struct perf_output_handle *handle,
+		   const void *buf, unsigned int len)
+{
+	do {
+		unsigned long size = min_t(unsigned long, handle->size, len);
+
+		memcpy(handle->addr, buf, size);
+
+		len -= size;
+		handle->addr += size;
+		buf += size;
+		handle->size -= size;
+		if (!handle->size) {
+			struct ring_buffer *rb = handle->rb;
+
+			handle->page++;
+			handle->page &= rb->nr_pages - 1;
+			handle->addr = rb->data_pages[handle->page];
+			handle->size = PAGE_SIZE << page_order(rb);
+		}
+	} while (len);
+}
+
+#endif /* _KERNEL_EVENTS_INTERNAL_H */
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
new file mode 100644
index 000000000000..fde52595d8f7
--- /dev/null
+++ b/kernel/events/ring_buffer.c
@@ -0,0 +1,399 @@
+/*
+ * Performance events ring-buffer code:
+ *
+ *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
+ *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *  Copyright  �  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ * For licensing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_event.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+
+#include "internal.h"
+
+static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
+			      unsigned long offset, unsigned long head)
+{
+	unsigned long mask;
+
+	if (!rb->writable)
+		return true;
+
+	mask = perf_data_size(rb) - 1;
+
+	offset = (offset - tail) & mask;
+	head   = (head   - tail) & mask;
+
+	if ((int)(head - offset) < 0)
+		return false;
+
+	return true;
+}
+
+static void perf_output_wakeup(struct perf_output_handle *handle)
+{
+	atomic_set(&handle->rb->poll, POLL_IN);
+
+	if (handle->nmi) {
+		handle->event->pending_wakeup = 1;
+		irq_work_queue(&handle->event->pending);
+	} else
+		perf_event_wakeup(handle->event);
+}
+
+/*
+ * We need to ensure a later event_id doesn't publish a head when a former
+ * event isn't done writing. However since we need to deal with NMIs we
+ * cannot fully serialize things.
+ *
+ * We only publish the head (and generate a wakeup) when the outer-most
+ * event completes.
+ */
+static void perf_output_get_handle(struct perf_output_handle *handle)
+{
+	struct ring_buffer *rb = handle->rb;
+
+	preempt_disable();
+	local_inc(&rb->nest);
+	handle->wakeup = local_read(&rb->wakeup);
+}
+
+static void perf_output_put_handle(struct perf_output_handle *handle)
+{
+	struct ring_buffer *rb = handle->rb;
+	unsigned long head;
+
+again:
+	head = local_read(&rb->head);
+
+	/*
+	 * IRQ/NMI can happen here, which means we can miss a head update.
+	 */
+
+	if (!local_dec_and_test(&rb->nest))
+		goto out;
+
+	/*
+	 * Publish the known good head. Rely on the full barrier implied
+	 * by atomic_dec_and_test() order the rb->head read and this
+	 * write.
+	 */
+	rb->user_page->data_head = head;
+
+	/*
+	 * Now check if we missed an update, rely on the (compiler)
+	 * barrier in atomic_dec_and_test() to re-read rb->head.
+	 */
+	if (unlikely(head != local_read(&rb->head))) {
+		local_inc(&rb->nest);
+		goto again;
+	}
+
+	if (handle->wakeup != local_read(&rb->wakeup))
+		perf_output_wakeup(handle);
+
+out:
+	preempt_enable();
+}
+
+int perf_output_begin(struct perf_output_handle *handle,
+		      struct perf_event *event, unsigned int size,
+		      int nmi, int sample)
+{
+	struct ring_buffer *rb;
+	unsigned long tail, offset, head;
+	int have_lost;
+	struct perf_sample_data sample_data;
+	struct {
+		struct perf_event_header header;
+		u64			 id;
+		u64			 lost;
+	} lost_event;
+
+	rcu_read_lock();
+	/*
+	 * For inherited events we send all the output towards the parent.
+	 */
+	if (event->parent)
+		event = event->parent;
+
+	rb = rcu_dereference(event->rb);
+	if (!rb)
+		goto out;
+
+	handle->rb	= rb;
+	handle->event	= event;
+	handle->nmi	= nmi;
+	handle->sample	= sample;
+
+	if (!rb->nr_pages)
+		goto out;
+
+	have_lost = local_read(&rb->lost);
+	if (have_lost) {
+		lost_event.header.size = sizeof(lost_event);
+		perf_event_header__init_id(&lost_event.header, &sample_data,
+					   event);
+		size += lost_event.header.size;
+	}
+
+	perf_output_get_handle(handle);
+
+	do {
+		/*
+		 * Userspace could choose to issue a mb() before updating the
+		 * tail pointer. So that all reads will be completed before the
+		 * write is issued.
+		 */
+		tail = ACCESS_ONCE(rb->user_page->data_tail);
+		smp_rmb();
+		offset = head = local_read(&rb->head);
+		head += size;
+		if (unlikely(!perf_output_space(rb, tail, offset, head)))
+			goto fail;
+	} while (local_cmpxchg(&rb->head, offset, head) != offset);
+
+	if (head - local_read(&rb->wakeup) > rb->watermark)
+		local_add(rb->watermark, &rb->wakeup);
+
+	handle->page = offset >> (PAGE_SHIFT + page_order(rb));
+	handle->page &= rb->nr_pages - 1;
+	handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1);
+	handle->addr = rb->data_pages[handle->page];
+	handle->addr += handle->size;
+	handle->size = (PAGE_SIZE << page_order(rb)) - handle->size;
+
+	if (have_lost) {
+		lost_event.header.type = PERF_RECORD_LOST;
+		lost_event.header.misc = 0;
+		lost_event.id          = event->id;
+		lost_event.lost        = local_xchg(&rb->lost, 0);
+
+		perf_output_put(handle, lost_event);
+		perf_event__output_id_sample(event, handle, &sample_data);
+	}
+
+	return 0;
+
+fail:
+	local_inc(&rb->lost);
+	perf_output_put_handle(handle);
+out:
+	rcu_read_unlock();
+
+	return -ENOSPC;
+}
+
+void perf_output_copy(struct perf_output_handle *handle,
+		      const void *buf, unsigned int len)
+{
+	__output_copy(handle, buf, len);
+}
+
+void perf_output_end(struct perf_output_handle *handle)
+{
+	struct perf_event *event = handle->event;
+	struct ring_buffer *rb = handle->rb;
+
+	int wakeup_events = event->attr.wakeup_events;
+
+	if (handle->sample && wakeup_events) {
+		int events = local_inc_return(&rb->events);
+		if (events >= wakeup_events) {
+			local_sub(wakeup_events, &rb->events);
+			local_inc(&rb->wakeup);
+		}
+	}
+
+	perf_output_put_handle(handle);
+	rcu_read_unlock();
+}
+
+static void
+ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
+{
+	long max_size = perf_data_size(rb);
+
+	if (watermark)
+		rb->watermark = min(max_size, watermark);
+
+	if (!rb->watermark)
+		rb->watermark = max_size / 2;
+
+	if (flags & RING_BUFFER_WRITABLE)
+		rb->writable = 1;
+
+	atomic_set(&rb->refcount, 1);
+}
+
+#ifndef CONFIG_PERF_USE_VMALLOC
+
+/*
+ * Back perf_mmap() with regular GFP_KERNEL-0 pages.
+ */
+
+struct page *
+perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
+{
+	if (pgoff > rb->nr_pages)
+		return NULL;
+
+	if (pgoff == 0)
+		return virt_to_page(rb->user_page);
+
+	return virt_to_page(rb->data_pages[pgoff - 1]);
+}
+
+static void *perf_mmap_alloc_page(int cpu)
+{
+	struct page *page;
+	int node;
+
+	node = (cpu == -1) ? cpu : cpu_to_node(cpu);
+	page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+	if (!page)
+		return NULL;
+
+	return page_address(page);
+}
+
+struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
+{
+	struct ring_buffer *rb;
+	unsigned long size;
+	int i;
+
+	size = sizeof(struct ring_buffer);
+	size += nr_pages * sizeof(void *);
+
+	rb = kzalloc(size, GFP_KERNEL);
+	if (!rb)
+		goto fail;
+
+	rb->user_page = perf_mmap_alloc_page(cpu);
+	if (!rb->user_page)
+		goto fail_user_page;
+
+	for (i = 0; i < nr_pages; i++) {
+		rb->data_pages[i] = perf_mmap_alloc_page(cpu);
+		if (!rb->data_pages[i])
+			goto fail_data_pages;
+	}
+
+	rb->nr_pages = nr_pages;
+
+	ring_buffer_init(rb, watermark, flags);
+
+	return rb;
+
+fail_data_pages:
+	for (i--; i >= 0; i--)
+		free_page((unsigned long)rb->data_pages[i]);
+
+	free_page((unsigned long)rb->user_page);
+
+fail_user_page:
+	kfree(rb);
+
+fail:
+	return NULL;
+}
+
+static void perf_mmap_free_page(unsigned long addr)
+{
+	struct page *page = virt_to_page((void *)addr);
+
+	page->mapping = NULL;
+	__free_page(page);
+}
+
+void rb_free(struct ring_buffer *rb)
+{
+	int i;
+
+	perf_mmap_free_page((unsigned long)rb->user_page);
+	for (i = 0; i < rb->nr_pages; i++)
+		perf_mmap_free_page((unsigned long)rb->data_pages[i]);
+	kfree(rb);
+}
+
+#else
+
+struct page *
+perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
+{
+	if (pgoff > (1UL << page_order(rb)))
+		return NULL;
+
+	return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
+}
+
+static void perf_mmap_unmark_page(void *addr)
+{
+	struct page *page = vmalloc_to_page(addr);
+
+	page->mapping = NULL;
+}
+
+static void rb_free_work(struct work_struct *work)
+{
+	struct ring_buffer *rb;
+	void *base;
+	int i, nr;
+
+	rb = container_of(work, struct ring_buffer, work);
+	nr = 1 << page_order(rb);
+
+	base = rb->user_page;
+	for (i = 0; i < nr + 1; i++)
+		perf_mmap_unmark_page(base + (i * PAGE_SIZE));
+
+	vfree(base);
+	kfree(rb);
+}
+
+void rb_free(struct ring_buffer *rb)
+{
+	schedule_work(&rb->work);
+}
+
+struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
+{
+	struct ring_buffer *rb;
+	unsigned long size;
+	void *all_buf;
+
+	size = sizeof(struct ring_buffer);
+	size += sizeof(void *);
+
+	rb = kzalloc(size, GFP_KERNEL);
+	if (!rb)
+		goto fail;
+
+	INIT_WORK(&rb->work, rb_free_work);
+
+	all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
+	if (!all_buf)
+		goto fail_all_buf;
+
+	rb->user_page = all_buf;
+	rb->data_pages[0] = all_buf + PAGE_SIZE;
+	rb->page_order = ilog2(nr_pages);
+	rb->nr_pages = 1;
+
+	ring_buffer_init(rb, watermark, flags);
+
+	return rb;
+
+fail_all_buf:
+	kfree(rb);
+
+fail:
+	return NULL;
+}
+
+#endif
-- 
cgit v1.2.3


From 334955ef964bee9d3b1e20966847eee28cfd05f6 Mon Sep 17 00:00:00 2001
From: Ralf Baechle <ralf@linux-mips.org>
Date: Wed, 1 Jun 2011 19:04:57 +0100
Subject: i8253: Create linux/i8253.h and use it in all 8253 related files

Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Link: http://lkml.kernel.org/r/20110601180610.054254048@duck.linux-mips.net
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

 arch/arm/mach-footbridge/isa-timer.c |    2 +-
 arch/mips/cobalt/time.c              |    2 +-
 arch/mips/jazz/irq.c                 |    2 +-
 arch/mips/kernel/i8253.c             |    2 +-
 arch/mips/mti-malta/malta-time.c     |    2 +-
 arch/mips/sgi-ip22/ip22-time.c       |    2 +-
 arch/mips/sni/time.c                 |    2 +-
 arch/x86/kernel/apic/apic.c          |    2 +-
 arch/x86/kernel/apm_32.c             |    2 +-
 arch/x86/kernel/hpet.c               |    2 +-
 arch/x86/kernel/i8253.c              |    2 +-
 arch/x86/kernel/time.c               |    2 +-
 drivers/block/hd.c                   |    2 +-
 drivers/clocksource/i8253.c          |    2 +-
 drivers/input/gameport/gameport.c    |    2 +-
 drivers/input/joystick/analog.c      |    2 +-
 drivers/input/misc/pcspkr.c          |    2 +-
 include/linux/i8253.h                |   11 +++++++++++
 sound/drivers/pcsp/pcsp.h            |    2 +-
 19 files changed, 29 insertions(+), 18 deletions(-)
---
 arch/arm/mach-footbridge/isa-timer.c |  2 +-
 arch/mips/cobalt/time.c              |  2 +-
 arch/mips/jazz/irq.c                 |  2 +-
 arch/mips/kernel/i8253.c             |  2 +-
 arch/mips/mti-malta/malta-time.c     |  2 +-
 arch/mips/sgi-ip22/ip22-time.c       |  2 +-
 arch/mips/sni/time.c                 |  2 +-
 arch/x86/kernel/apic/apic.c          |  2 +-
 arch/x86/kernel/apm_32.c             |  2 +-
 arch/x86/kernel/hpet.c               |  2 +-
 arch/x86/kernel/i8253.c              |  2 +-
 arch/x86/kernel/time.c               |  2 +-
 drivers/block/hd.c                   |  2 +-
 drivers/clocksource/i8253.c          |  2 +-
 drivers/input/gameport/gameport.c    |  2 +-
 drivers/input/joystick/analog.c      |  2 +-
 drivers/input/misc/pcspkr.c          |  2 +-
 include/linux/i8253.h                | 11 +++++++++++
 sound/drivers/pcsp/pcsp.h            |  2 +-
 19 files changed, 29 insertions(+), 18 deletions(-)
 create mode 100644 include/linux/i8253.h

(limited to 'include/linux')

diff --git a/arch/arm/mach-footbridge/isa-timer.c b/arch/arm/mach-footbridge/isa-timer.c
index 7020f1a3feca..3c7367be5459 100644
--- a/arch/arm/mach-footbridge/isa-timer.c
+++ b/arch/arm/mach-footbridge/isa-timer.c
@@ -6,6 +6,7 @@
  */
 #include <linux/clockchips.h>
 #include <linux/clocksource.h>
+#include <linux/i8253.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/irq.h>
@@ -14,7 +15,6 @@
 #include <linux/timex.h>
 
 #include <asm/irq.h>
-#include <asm/i8253.h>
 #include <asm/mach/time.h>
 
 #include "common.h"
diff --git a/arch/mips/cobalt/time.c b/arch/mips/cobalt/time.c
index 0162f9edc693..3bff3b820baf 100644
--- a/arch/mips/cobalt/time.c
+++ b/arch/mips/cobalt/time.c
@@ -17,10 +17,10 @@
  *  along with this program; if not, write to the Free Software
  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
+#include <linux/i8253.h>
 #include <linux/init.h>
 
 #include <asm/gt64120.h>
-#include <asm/i8253.h>
 #include <asm/time.h>
 
 #define GT641XX_BASE_CLOCK	50000000	/* 50MHz */
diff --git a/arch/mips/jazz/irq.c b/arch/mips/jazz/irq.c
index 260df4750949..ca9bd2069142 100644
--- a/arch/mips/jazz/irq.c
+++ b/arch/mips/jazz/irq.c
@@ -7,6 +7,7 @@
  * Copyright (C) 1994 - 2001, 2003, 07 Ralf Baechle
  */
 #include <linux/clockchips.h>
+#include <linux/i8253.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/kernel.h>
@@ -15,7 +16,6 @@
 #include <linux/irq.h>
 
 #include <asm/irq_cpu.h>
-#include <asm/i8253.h>
 #include <asm/i8259.h>
 #include <asm/io.h>
 #include <asm/jazz.h>
diff --git a/arch/mips/kernel/i8253.c b/arch/mips/kernel/i8253.c
index 391221b6a6aa..82b5a9f037e7 100644
--- a/arch/mips/kernel/i8253.c
+++ b/arch/mips/kernel/i8253.c
@@ -3,6 +3,7 @@
  *
  */
 #include <linux/clockchips.h>
+#include <linux/i8253.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/jiffies.h>
@@ -12,7 +13,6 @@
 #include <linux/irq.h>
 
 #include <asm/delay.h>
-#include <asm/i8253.h>
 #include <asm/io.h>
 #include <asm/time.h>
 
diff --git a/arch/mips/mti-malta/malta-time.c b/arch/mips/mti-malta/malta-time.c
index 1620b83cd13e..f8ee945ee411 100644
--- a/arch/mips/mti-malta/malta-time.c
+++ b/arch/mips/mti-malta/malta-time.c
@@ -19,6 +19,7 @@
  */
 
 #include <linux/types.h>
+#include <linux/i8253.h>
 #include <linux/init.h>
 #include <linux/kernel_stat.h>
 #include <linux/sched.h>
@@ -31,7 +32,6 @@
 #include <asm/mipsregs.h>
 #include <asm/mipsmtregs.h>
 #include <asm/hardirq.h>
-#include <asm/i8253.h>
 #include <asm/irq.h>
 #include <asm/div64.h>
 #include <asm/cpu.h>
diff --git a/arch/mips/sgi-ip22/ip22-time.c b/arch/mips/sgi-ip22/ip22-time.c
index 1a94c9894188..607192449335 100644
--- a/arch/mips/sgi-ip22/ip22-time.c
+++ b/arch/mips/sgi-ip22/ip22-time.c
@@ -10,6 +10,7 @@
  * Copyright (C) 2003, 06 Ralf Baechle (ralf@linux-mips.org)
  */
 #include <linux/bcd.h>
+#include <linux/i8253.h>
 #include <linux/init.h>
 #include <linux/irq.h>
 #include <linux/kernel.h>
@@ -20,7 +21,6 @@
 
 #include <asm/cpu.h>
 #include <asm/mipsregs.h>
-#include <asm/i8253.h>
 #include <asm/io.h>
 #include <asm/irq.h>
 #include <asm/time.h>
diff --git a/arch/mips/sni/time.c b/arch/mips/sni/time.c
index 0904d4d30cb3..ec0be14996a4 100644
--- a/arch/mips/sni/time.c
+++ b/arch/mips/sni/time.c
@@ -1,11 +1,11 @@
 #include <linux/types.h>
+#include <linux/i8253.h>
 #include <linux/interrupt.h>
 #include <linux/irq.h>
 #include <linux/smp.h>
 #include <linux/time.h>
 #include <linux/clockchips.h>
 
-#include <asm/i8253.h>
 #include <asm/sni.h>
 #include <asm/time.h>
 #include <asm-generic/rtc.h>
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index b961af86bfea..f3c37841bb33 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -27,6 +27,7 @@
 #include <linux/syscore_ops.h>
 #include <linux/delay.h>
 #include <linux/timex.h>
+#include <linux/i8253.h>
 #include <linux/dmar.h>
 #include <linux/init.h>
 #include <linux/cpu.h>
@@ -39,7 +40,6 @@
 #include <asm/pgalloc.h>
 #include <asm/atomic.h>
 #include <asm/mpspec.h>
-#include <asm/i8253.h>
 #include <asm/i8259.h>
 #include <asm/proto.h>
 #include <asm/apic.h>
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 965a7666c283..a30740e6890e 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -229,11 +229,11 @@
 #include <linux/jiffies.h>
 #include <linux/acpi.h>
 #include <linux/syscore_ops.h>
+#include <linux/i8253.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
 #include <asm/desc.h>
-#include <asm/i8253.h>
 #include <asm/olpc.h>
 #include <asm/paravirt.h>
 #include <asm/reboot.h>
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 6781765b3a0d..85b8a8a76c21 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -4,6 +4,7 @@
 #include <linux/sysdev.h>
 #include <linux/delay.h>
 #include <linux/errno.h>
+#include <linux/i8253.h>
 #include <linux/slab.h>
 #include <linux/hpet.h>
 #include <linux/init.h>
@@ -12,7 +13,6 @@
 #include <linux/io.h>
 
 #include <asm/fixmap.h>
-#include <asm/i8253.h>
 #include <asm/hpet.h>
 
 #define HPET_MASK			CLOCKSOURCE_MASK(32)
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index fb66dc9e36cb..323555050f89 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -9,10 +9,10 @@
 #include <linux/module.h>
 #include <linux/timex.h>
 #include <linux/delay.h>
+#include <linux/i8253.h>
 #include <linux/init.h>
 #include <linux/io.h>
 
-#include <asm/i8253.h>
 #include <asm/hpet.h>
 #include <asm/smp.h>
 
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index 00cbb272627f..5a64d057be57 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -11,13 +11,13 @@
 
 #include <linux/clockchips.h>
 #include <linux/interrupt.h>
+#include <linux/i8253.h>
 #include <linux/time.h>
 #include <linux/mca.h>
 
 #include <asm/vsyscall.h>
 #include <asm/x86_init.h>
 #include <asm/i8259.h>
-#include <asm/i8253.h>
 #include <asm/timer.h>
 #include <asm/hpet.h>
 #include <asm/time.h>
diff --git a/drivers/block/hd.c b/drivers/block/hd.c
index 007c630904c1..b52c9ca146fc 100644
--- a/drivers/block/hd.c
+++ b/drivers/block/hd.c
@@ -155,7 +155,7 @@ else \
 
 #if (HD_DELAY > 0)
 
-#include <asm/i8253.h>
+#include <linux/i8253.h>
 
 unsigned long last_req;
 
diff --git a/drivers/clocksource/i8253.c b/drivers/clocksource/i8253.c
index 225c1761b372..33735ebd2dcc 100644
--- a/drivers/clocksource/i8253.c
+++ b/drivers/clocksource/i8253.c
@@ -7,7 +7,7 @@
 #include <linux/spinlock.h>
 #include <linux/timex.h>
 
-#include <asm/i8253.h>
+#include <linux/i8253.h>
 
 /*
  * Since the PIT overflows every tick, its not very useful
diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c
index 5b8f59d6c3e8..c351aa421f8f 100644
--- a/drivers/input/gameport/gameport.c
+++ b/drivers/input/gameport/gameport.c
@@ -47,7 +47,7 @@ static void gameport_disconnect_port(struct gameport *gameport);
 
 #if defined(__i386__)
 
-#include <asm/i8253.h>
+#include <linux/i8253.h>
 
 #define DELTA(x,y)      ((y)-(x)+((y)<(x)?1193182/HZ:0))
 #define GET_TIME(x)     do { x = get_time_pit(); } while (0)
diff --git a/drivers/input/joystick/analog.c b/drivers/input/joystick/analog.c
index 4afe0a3b4884..9882971827e6 100644
--- a/drivers/input/joystick/analog.c
+++ b/drivers/input/joystick/analog.c
@@ -136,7 +136,7 @@ struct analog_port {
 
 #ifdef __i386__
 
-#include <asm/i8253.h>
+#include <linux/i8253.h>
 
 #define GET_TIME(x)	do { if (cpu_has_tsc) rdtscl(x); else x = get_time_pit(); } while (0)
 #define DELTA(x,y)	(cpu_has_tsc ? ((y) - (x)) : ((x) - (y) + ((x) < (y) ? CLOCK_TICK_RATE / HZ : 0)))
diff --git a/drivers/input/misc/pcspkr.c b/drivers/input/misc/pcspkr.c
index f080dd31499b..2aa9d9020179 100644
--- a/drivers/input/misc/pcspkr.c
+++ b/drivers/input/misc/pcspkr.c
@@ -27,7 +27,7 @@ MODULE_ALIAS("platform:pcspkr");
 
 #if defined(CONFIG_MIPS) || defined(CONFIG_X86)
 /* Use the global PIT lock ! */
-#include <asm/i8253.h>
+#include <linux/i8253.h>
 #else
 #include <asm/8253pit.h>
 static DEFINE_RAW_SPINLOCK(i8253_lock);
diff --git a/include/linux/i8253.h b/include/linux/i8253.h
new file mode 100644
index 000000000000..d2cba88f75d6
--- /dev/null
+++ b/include/linux/i8253.h
@@ -0,0 +1,11 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+#ifndef __LINUX_I8253_H
+#define __LINUX_I8253_H
+
+#include <asm/i8253.h>
+
+#endif /* __LINUX_I8253_H */
diff --git a/sound/drivers/pcsp/pcsp.h b/sound/drivers/pcsp/pcsp.h
index 4ff6c8cc5077..6757c304ee46 100644
--- a/sound/drivers/pcsp/pcsp.h
+++ b/sound/drivers/pcsp/pcsp.h
@@ -13,7 +13,7 @@
 #include <linux/timex.h>
 #if defined(CONFIG_MIPS) || defined(CONFIG_X86)
 /* Use the global PIT lock ! */
-#include <asm/i8253.h>
+#include <linux/i8253.h>
 #else
 #include <asm/8253pit.h>
 static DEFINE_RAW_SPINLOCK(i8253_lock);
-- 
cgit v1.2.3


From cb2455aa274b780802c593fecf115240a655d809 Mon Sep 17 00:00:00 2001
From: Ralf Baechle <ralf@linux-mips.org>
Date: Wed, 1 Jun 2011 19:04:58 +0100
Subject: i8253: Unify all kernel declarations of i8253_lock

Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-mips@linux-mips.org
Link: http://lkml.kernel.org/r/20110601180610.134151920@duck.linux-mips.net
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/arm/include/asm/i8253.h  |  9 ---------
 arch/mips/include/asm/i8253.h | 12 ------------
 arch/x86/include/asm/i8253.h  | 10 ----------
 include/linux/i8253.h         | 14 ++++++++++++++
 4 files changed, 14 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/include/asm/i8253.h b/arch/arm/include/asm/i8253.h
index 70656b69d5ce..5b41b92ecf6a 100644
--- a/arch/arm/include/asm/i8253.h
+++ b/arch/arm/include/asm/i8253.h
@@ -1,15 +1,6 @@
 #ifndef __ASMARM_I8253_H
 #define __ASMARM_I8253_H
 
-/* i8253A PIT registers */
-#define PIT_MODE	0x43
-#define PIT_CH0		0x40
-
 #define PIT_LATCH	((PIT_TICK_RATE + HZ / 2) / HZ)
 
-extern raw_spinlock_t i8253_lock;
-
-#define outb_pit	outb_p
-#define inb_pit		inb_p
-
 #endif
diff --git a/arch/mips/include/asm/i8253.h b/arch/mips/include/asm/i8253.h
index 9ad011366f73..32ec1f00c6b3 100644
--- a/arch/mips/include/asm/i8253.h
+++ b/arch/mips/include/asm/i8253.h
@@ -5,20 +5,8 @@
 #ifndef __ASM_I8253_H
 #define __ASM_I8253_H
 
-#include <linux/spinlock.h>
-
-/* i8253A PIT registers */
-#define PIT_MODE		0x43
-#define PIT_CH0			0x40
-#define PIT_CH2			0x42
-
 #define PIT_LATCH		LATCH
 
-extern raw_spinlock_t i8253_lock;
-
 extern void setup_pit_timer(void);
 
-#define inb_pit inb_p
-#define outb_pit outb_p
-
 #endif /* __ASM_I8253_H */
diff --git a/arch/x86/include/asm/i8253.h b/arch/x86/include/asm/i8253.h
index 65aaa91d5850..20480cef7547 100644
--- a/arch/x86/include/asm/i8253.h
+++ b/arch/x86/include/asm/i8253.h
@@ -1,20 +1,10 @@
 #ifndef _ASM_X86_I8253_H
 #define _ASM_X86_I8253_H
 
-/* i8253A PIT registers */
-#define PIT_MODE		0x43
-#define PIT_CH0			0x40
-#define PIT_CH2			0x42
-
 #define PIT_LATCH	LATCH
 
-extern raw_spinlock_t i8253_lock;
-
 extern struct clock_event_device *global_clock_event;
 
 extern void setup_pit_timer(void);
 
-#define inb_pit		inb_p
-#define outb_pit	outb_p
-
 #endif /* _ASM_X86_I8253_H */
diff --git a/include/linux/i8253.h b/include/linux/i8253.h
index d2cba88f75d6..13048d335368 100644
--- a/include/linux/i8253.h
+++ b/include/linux/i8253.h
@@ -2,10 +2,24 @@
  * This file is subject to the terms and conditions of the GNU General Public
  * License.  See the file "COPYING" in the main directory of this archive
  * for more details.
+ *
+ *  Machine specific IO port address definition for generic.
+ *  Written by Osamu Tomita <tomita@cinet.co.jp>
  */
 #ifndef __LINUX_I8253_H
 #define __LINUX_I8253_H
 
+#include <linux/spinlock.h>
 #include <asm/i8253.h>
 
+/* i8253A PIT registers */
+#define PIT_MODE	0x43
+#define PIT_CH0		0x40
+#define PIT_CH2		0x42
+
+#define inb_pit         inb_p
+#define outb_pit        outb_p
+
+extern raw_spinlock_t i8253_lock;
+
 #endif /* __LINUX_I8253_H */
-- 
cgit v1.2.3


From 49cf3f29a1ef2e59f1be823e376c1fbac586eb8d Mon Sep 17 00:00:00 2001
From: Ralf Baechle <ralf@linux-mips.org>
Date: Wed, 1 Jun 2011 19:05:07 +0100
Subject: i8253: Consolidate definitions of PIT_LATCH

x86 defines PIT_LATCH as LATCH which in <linux/timex.h> is defined as
((CLOCK_TICK_RATE + HZ/2) / HZ) and <asm/timex.h> again defines
CLOCK_TICK_RATE as PIT_TICK_RATE.

MIPS defines PIT_LATCH as LATCH which in <linux/timex.h> is defined as
((CLOCK_TICK_RATE + HZ/2) / HZ) and <asm/timex.h> again defines
CLOCK_TICK_RATE as 1193182.

ARM defines PITCH_LATCH as ((PIT_TICK_RATE + HZ / 2) / HZ) - and that's
the sanest thing and equivalent to above definitions so use that as the
new definition in <linux/i8253.h>.

Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-mips@linux-mips.org
Link: http://lkml.kernel.org/r/20110601180610.832810002@duck.linux-mips.net
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/arm/include/asm/i8253.h  | 2 --
 arch/mips/include/asm/i8253.h | 2 --
 arch/x86/include/asm/i8253.h  | 2 --
 include/linux/i8253.h         | 4 ++++
 4 files changed, 4 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/include/asm/i8253.h b/arch/arm/include/asm/i8253.h
index 5b41b92ecf6a..fee59eb4581d 100644
--- a/arch/arm/include/asm/i8253.h
+++ b/arch/arm/include/asm/i8253.h
@@ -1,6 +1,4 @@
 #ifndef __ASMARM_I8253_H
 #define __ASMARM_I8253_H
 
-#define PIT_LATCH	((PIT_TICK_RATE + HZ / 2) / HZ)
-
 #endif
diff --git a/arch/mips/include/asm/i8253.h b/arch/mips/include/asm/i8253.h
index 32ec1f00c6b3..26f4eb197c3a 100644
--- a/arch/mips/include/asm/i8253.h
+++ b/arch/mips/include/asm/i8253.h
@@ -5,8 +5,6 @@
 #ifndef __ASM_I8253_H
 #define __ASM_I8253_H
 
-#define PIT_LATCH		LATCH
-
 extern void setup_pit_timer(void);
 
 #endif /* __ASM_I8253_H */
diff --git a/arch/x86/include/asm/i8253.h b/arch/x86/include/asm/i8253.h
index 3d5f5eea5da4..0049dc9fbfee 100644
--- a/arch/x86/include/asm/i8253.h
+++ b/arch/x86/include/asm/i8253.h
@@ -1,8 +1,6 @@
 #ifndef _ASM_X86_I8253_H
 #define _ASM_X86_I8253_H
 
-#define PIT_LATCH	LATCH
-
 extern void setup_pit_timer(void);
 
 #endif /* _ASM_X86_I8253_H */
diff --git a/include/linux/i8253.h b/include/linux/i8253.h
index 13048d335368..1ec6323a1507 100644
--- a/include/linux/i8253.h
+++ b/include/linux/i8253.h
@@ -9,7 +9,9 @@
 #ifndef __LINUX_I8253_H
 #define __LINUX_I8253_H
 
+#include <linux/param.h>
 #include <linux/spinlock.h>
+#include <linux/timex.h>
 #include <asm/i8253.h>
 
 /* i8253A PIT registers */
@@ -17,6 +19,8 @@
 #define PIT_CH0		0x40
 #define PIT_CH2		0x42
 
+#define PIT_LATCH	((PIT_TICK_RATE + HZ/2) / HZ)
+
 #define inb_pit         inb_p
 #define outb_pit        outb_p
 
-- 
cgit v1.2.3


From 850492760cfd38d14d64a5fc6d636091464f755e Mon Sep 17 00:00:00 2001
From: Ralf Baechle <ralf@linux-mips.org>
Date: Wed, 1 Jun 2011 19:05:08 +0100
Subject: i8253: Move remaining content and delete asm/i8253.h

Move setup_pit_timer() declaration to the common header file and
remove the arch specific ones.

[ tglx: Move it to linux/i8253.h instead of asm/mips and asm/x86 ]

Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-arm-kernel@lists.infradead.org
Cc: Russell King <linux@arm.linux.org.uk>
Cc: linux-mips@linux-mips.org
Cc: Sergei Shtylyov <sshtylyov@mvista.com
Link: http://lkml.kernel.org/r/20110601180610.913463093@duck.linux-mips.net
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/arm/include/asm/i8253.h  |  4 ----
 arch/mips/include/asm/i8253.h | 10 ----------
 arch/x86/include/asm/i8253.h  |  6 ------
 include/linux/i8253.h         |  3 ++-
 4 files changed, 2 insertions(+), 21 deletions(-)
 delete mode 100644 arch/arm/include/asm/i8253.h
 delete mode 100644 arch/mips/include/asm/i8253.h
 delete mode 100644 arch/x86/include/asm/i8253.h

(limited to 'include/linux')

diff --git a/arch/arm/include/asm/i8253.h b/arch/arm/include/asm/i8253.h
deleted file mode 100644
index fee59eb4581d..000000000000
--- a/arch/arm/include/asm/i8253.h
+++ /dev/null
@@ -1,4 +0,0 @@
-#ifndef __ASMARM_I8253_H
-#define __ASMARM_I8253_H
-
-#endif
diff --git a/arch/mips/include/asm/i8253.h b/arch/mips/include/asm/i8253.h
deleted file mode 100644
index 26f4eb197c3a..000000000000
--- a/arch/mips/include/asm/i8253.h
+++ /dev/null
@@ -1,10 +0,0 @@
-/*
- *  Machine specific IO port address definition for generic.
- *  Written by Osamu Tomita <tomita@cinet.co.jp>
- */
-#ifndef __ASM_I8253_H
-#define __ASM_I8253_H
-
-extern void setup_pit_timer(void);
-
-#endif /* __ASM_I8253_H */
diff --git a/arch/x86/include/asm/i8253.h b/arch/x86/include/asm/i8253.h
deleted file mode 100644
index 0049dc9fbfee..000000000000
--- a/arch/x86/include/asm/i8253.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _ASM_X86_I8253_H
-#define _ASM_X86_I8253_H
-
-extern void setup_pit_timer(void);
-
-#endif /* _ASM_X86_I8253_H */
diff --git a/include/linux/i8253.h b/include/linux/i8253.h
index 1ec6323a1507..76039c86ab58 100644
--- a/include/linux/i8253.h
+++ b/include/linux/i8253.h
@@ -12,7 +12,6 @@
 #include <linux/param.h>
 #include <linux/spinlock.h>
 #include <linux/timex.h>
-#include <asm/i8253.h>
 
 /* i8253A PIT registers */
 #define PIT_MODE	0x43
@@ -26,4 +25,6 @@
 
 extern raw_spinlock_t i8253_lock;
 
+extern void setup_pit_timer(void);
+
 #endif /* __LINUX_I8253_H */
-- 
cgit v1.2.3


From c7ac8679bec9397afe8918f788cbcef88c38da54 Mon Sep 17 00:00:00 2001
From: Greg Rose <gregory.v.rose@intel.com>
Date: Fri, 10 Jun 2011 01:27:09 +0000
Subject: rtnetlink: Compute and store minimum ifinfo dump size

The message size allocated for rtnl ifinfo dumps was limited to
a single page.  This is not enough for additional interface info
available with devices that support SR-IOV and caused a bug in
which VF info would not be displayed if more than approximately
40 VFs were created per interface.

Implement a new function pointer for the rtnl_register service that will
calculate the amount of data required for the ifinfo dump and allocate
enough data to satisfy the request.

Signed-off-by: Greg Rose <gregory.v.rose@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/infiniband/core/netlink.c    |  2 +-
 include/linux/netlink.h              |  6 ++--
 include/net/rtnetlink.h              |  7 +++--
 net/bridge/br_netlink.c              | 15 ++++++---
 net/core/fib_rules.c                 |  6 ++--
 net/core/neighbour.c                 | 11 ++++---
 net/core/rtnetlink.c                 | 60 +++++++++++++++++++++++++++++-------
 net/dcb/dcbnl.c                      |  4 +--
 net/decnet/dn_dev.c                  |  6 ++--
 net/decnet/dn_fib.c                  |  4 +--
 net/decnet/dn_route.c                |  5 +--
 net/ipv4/devinet.c                   |  6 ++--
 net/ipv4/fib_frontend.c              |  6 ++--
 net/ipv4/inet_diag.c                 |  2 +-
 net/ipv4/ipmr.c                      |  3 +-
 net/ipv4/route.c                     |  2 +-
 net/ipv6/addrconf.c                  | 16 ++++++----
 net/ipv6/addrlabel.c                 |  9 ++++--
 net/ipv6/ip6_fib.c                   |  3 +-
 net/ipv6/ip6mr.c                     |  3 +-
 net/ipv6/route.c                     |  6 ++--
 net/netfilter/ipset/ip_set_core.c    |  2 +-
 net/netfilter/nf_conntrack_netlink.c |  4 +--
 net/netlink/af_netlink.c             | 17 ++++++----
 net/netlink/genetlink.c              |  2 +-
 net/phonet/pn_netlink.c              | 13 ++++----
 net/sched/act_api.c                  |  7 +++--
 net/sched/cls_api.c                  |  6 ++--
 net/sched/sch_api.c                  | 12 ++++----
 net/xfrm/xfrm_user.c                 |  3 +-
 30 files changed, 158 insertions(+), 90 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c
index 4a5abaf0a25c..9227f4acd79c 100644
--- a/drivers/infiniband/core/netlink.c
+++ b/drivers/infiniband/core/netlink.c
@@ -148,7 +148,7 @@ static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 				return -EINVAL;
 			return netlink_dump_start(nls, skb, nlh,
 						  client->cb_table[op].dump,
-						  NULL);
+						  NULL, 0);
 		}
 	}
 
diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index a9dd89552f9c..fdd0188a167e 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -221,7 +221,8 @@ struct netlink_callback {
 	int			(*dump)(struct sk_buff * skb,
 					struct netlink_callback *cb);
 	int			(*done)(struct netlink_callback *cb);
-	int			family;
+	u16			family;
+	u16			min_dump_alloc;
 	long			args[6];
 };
 
@@ -259,7 +260,8 @@ __nlmsg_put(struct sk_buff *skb, u32 pid, u32 seq, int type, int len, int flags)
 extern int netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
 			      const struct nlmsghdr *nlh,
 			      int (*dump)(struct sk_buff *skb, struct netlink_callback*),
-			      int (*done)(struct netlink_callback*));
+			      int (*done)(struct netlink_callback*),
+			      u16 min_dump_alloc);
 
 
 #define NL_NONROOT_RECV 0x1
diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h
index 4093ca78cf60..678f1ffaf843 100644
--- a/include/net/rtnetlink.h
+++ b/include/net/rtnetlink.h
@@ -6,11 +6,14 @@
 
 typedef int (*rtnl_doit_func)(struct sk_buff *, struct nlmsghdr *, void *);
 typedef int (*rtnl_dumpit_func)(struct sk_buff *, struct netlink_callback *);
+typedef u16 (*rtnl_calcit_func)(struct sk_buff *);
 
 extern int	__rtnl_register(int protocol, int msgtype,
-				rtnl_doit_func, rtnl_dumpit_func);
+				rtnl_doit_func, rtnl_dumpit_func,
+				rtnl_calcit_func);
 extern void	rtnl_register(int protocol, int msgtype,
-			      rtnl_doit_func, rtnl_dumpit_func);
+			      rtnl_doit_func, rtnl_dumpit_func,
+			      rtnl_calcit_func);
 extern int	rtnl_unregister(int protocol, int msgtype);
 extern void	rtnl_unregister_all(int protocol);
 
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index ffb0dc4cc0e8..6814083a92f4 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -218,19 +218,24 @@ int __init br_netlink_init(void)
 	if (err < 0)
 		goto err1;
 
-	err = __rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, br_dump_ifinfo);
+	err = __rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL,
+			      br_dump_ifinfo, NULL);
 	if (err)
 		goto err2;
-	err = __rtnl_register(PF_BRIDGE, RTM_SETLINK, br_rtm_setlink, NULL);
+	err = __rtnl_register(PF_BRIDGE, RTM_SETLINK,
+			      br_rtm_setlink, NULL, NULL);
 	if (err)
 		goto err3;
-	err = __rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, br_fdb_add, NULL);
+	err = __rtnl_register(PF_BRIDGE, RTM_NEWNEIGH,
+			      br_fdb_add, NULL, NULL);
 	if (err)
 		goto err3;
-	err = __rtnl_register(PF_BRIDGE, RTM_DELNEIGH, br_fdb_delete, NULL);
+	err = __rtnl_register(PF_BRIDGE, RTM_DELNEIGH,
+			      br_fdb_delete, NULL, NULL);
 	if (err)
 		goto err3;
-	err = __rtnl_register(PF_BRIDGE, RTM_GETNEIGH, NULL, br_fdb_dump);
+	err = __rtnl_register(PF_BRIDGE, RTM_GETNEIGH,
+			      NULL, br_fdb_dump, NULL);
 	if (err)
 		goto err3;
 
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 008dc70b064b..e7ab0c0285b5 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -740,9 +740,9 @@ static struct pernet_operations fib_rules_net_ops = {
 static int __init fib_rules_init(void)
 {
 	int err;
-	rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL);
-	rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL);
-	rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule);
+	rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL, NULL);
+	rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL, NULL);
+	rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule, NULL);
 
 	err = register_pernet_subsys(&fib_rules_net_ops);
 	if (err < 0)
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 799f06e03a22..ceb505b1507c 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -2909,12 +2909,13 @@ EXPORT_SYMBOL(neigh_sysctl_unregister);
 
 static int __init neigh_init(void)
 {
-	rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL);
-	rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL);
-	rtnl_register(PF_UNSPEC, RTM_GETNEIGH, NULL, neigh_dump_info);
+	rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL, NULL);
+	rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL, NULL);
+	rtnl_register(PF_UNSPEC, RTM_GETNEIGH, NULL, neigh_dump_info, NULL);
 
-	rtnl_register(PF_UNSPEC, RTM_GETNEIGHTBL, NULL, neightbl_dump_info);
-	rtnl_register(PF_UNSPEC, RTM_SETNEIGHTBL, neightbl_set, NULL);
+	rtnl_register(PF_UNSPEC, RTM_GETNEIGHTBL, NULL, neightbl_dump_info,
+		      NULL);
+	rtnl_register(PF_UNSPEC, RTM_SETNEIGHTBL, neightbl_set, NULL, NULL);
 
 	return 0;
 }
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index abd936d8a716..a798fc6f2aa1 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -56,9 +56,11 @@
 struct rtnl_link {
 	rtnl_doit_func		doit;
 	rtnl_dumpit_func	dumpit;
+	rtnl_calcit_func 	calcit;
 };
 
 static DEFINE_MUTEX(rtnl_mutex);
+static u16 min_ifinfo_dump_size;
 
 void rtnl_lock(void)
 {
@@ -144,12 +146,28 @@ static rtnl_dumpit_func rtnl_get_dumpit(int protocol, int msgindex)
 	return tab ? tab[msgindex].dumpit : NULL;
 }
 
+static rtnl_calcit_func rtnl_get_calcit(int protocol, int msgindex)
+{
+	struct rtnl_link *tab;
+
+	if (protocol <= RTNL_FAMILY_MAX)
+		tab = rtnl_msg_handlers[protocol];
+	else
+		tab = NULL;
+
+	if (tab == NULL || tab[msgindex].calcit == NULL)
+		tab = rtnl_msg_handlers[PF_UNSPEC];
+
+	return tab ? tab[msgindex].calcit : NULL;
+}
+
 /**
  * __rtnl_register - Register a rtnetlink message type
  * @protocol: Protocol family or PF_UNSPEC
  * @msgtype: rtnetlink message type
  * @doit: Function pointer called for each request message
  * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message
+ * @calcit: Function pointer to calc size of dump message
  *
  * Registers the specified function pointers (at least one of them has
  * to be non-NULL) to be called whenever a request message for the
@@ -162,7 +180,8 @@ static rtnl_dumpit_func rtnl_get_dumpit(int protocol, int msgindex)
  * Returns 0 on success or a negative error code.
  */
 int __rtnl_register(int protocol, int msgtype,
-		    rtnl_doit_func doit, rtnl_dumpit_func dumpit)
+		    rtnl_doit_func doit, rtnl_dumpit_func dumpit,
+		    rtnl_calcit_func calcit)
 {
 	struct rtnl_link *tab;
 	int msgindex;
@@ -185,6 +204,9 @@ int __rtnl_register(int protocol, int msgtype,
 	if (dumpit)
 		tab[msgindex].dumpit = dumpit;
 
+	if (calcit)
+		tab[msgindex].calcit = calcit;
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(__rtnl_register);
@@ -199,9 +221,10 @@ EXPORT_SYMBOL_GPL(__rtnl_register);
  * of memory implies no sense in continuing.
  */
 void rtnl_register(int protocol, int msgtype,
-		   rtnl_doit_func doit, rtnl_dumpit_func dumpit)
+		   rtnl_doit_func doit, rtnl_dumpit_func dumpit,
+		   rtnl_calcit_func calcit)
 {
-	if (__rtnl_register(protocol, msgtype, doit, dumpit) < 0)
+	if (__rtnl_register(protocol, msgtype, doit, dumpit, calcit) < 0)
 		panic("Unable to register rtnetlink message handler, "
 		      "protocol = %d, message type = %d\n",
 		      protocol, msgtype);
@@ -1818,6 +1841,11 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 	return err;
 }
 
+static u16 rtnl_calcit(struct sk_buff *skb)
+{
+	return min_ifinfo_dump_size;
+}
+
 static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	int idx;
@@ -1847,11 +1875,14 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change)
 	struct net *net = dev_net(dev);
 	struct sk_buff *skb;
 	int err = -ENOBUFS;
+	size_t if_info_size;
 
-	skb = nlmsg_new(if_nlmsg_size(dev), GFP_KERNEL);
+	skb = nlmsg_new((if_info_size = if_nlmsg_size(dev)), GFP_KERNEL);
 	if (skb == NULL)
 		goto errout;
 
+	min_ifinfo_dump_size = max_t(u16, if_info_size, min_ifinfo_dump_size);
+
 	err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in if_nlmsg_size() */
@@ -1902,14 +1933,20 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) {
 		struct sock *rtnl;
 		rtnl_dumpit_func dumpit;
+		rtnl_calcit_func calcit;
+		u16 min_dump_alloc = 0;
 
 		dumpit = rtnl_get_dumpit(family, type);
 		if (dumpit == NULL)
 			return -EOPNOTSUPP;
+		calcit = rtnl_get_calcit(family, type);
+		if (calcit)
+			min_dump_alloc = calcit(skb);
 
 		__rtnl_unlock();
 		rtnl = net->rtnl;
-		err = netlink_dump_start(rtnl, skb, nlh, dumpit, NULL);
+		err = netlink_dump_start(rtnl, skb, nlh, dumpit,
+					 NULL, min_dump_alloc);
 		rtnl_lock();
 		return err;
 	}
@@ -2019,12 +2056,13 @@ void __init rtnetlink_init(void)
 	netlink_set_nonroot(NETLINK_ROUTE, NL_NONROOT_RECV);
 	register_netdevice_notifier(&rtnetlink_dev_notifier);
 
-	rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink, rtnl_dump_ifinfo);
-	rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL);
-	rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL);
-	rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL);
+	rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink,
+		      rtnl_dump_ifinfo, rtnl_calcit);
+	rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL, NULL);
+	rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL, NULL);
+	rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL, NULL);
 
-	rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all);
-	rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all);
+	rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all, NULL);
+	rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all, NULL);
 }
 
diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c
index 3609eacaf4ce..ed1bb8c65a9e 100644
--- a/net/dcb/dcbnl.c
+++ b/net/dcb/dcbnl.c
@@ -1819,8 +1819,8 @@ static int __init dcbnl_init(void)
 {
 	INIT_LIST_HEAD(&dcb_app_list);
 
-	rtnl_register(PF_UNSPEC, RTM_GETDCB, dcb_doit, NULL);
-	rtnl_register(PF_UNSPEC, RTM_SETDCB, dcb_doit, NULL);
+	rtnl_register(PF_UNSPEC, RTM_GETDCB, dcb_doit, NULL, NULL);
+	rtnl_register(PF_UNSPEC, RTM_SETDCB, dcb_doit, NULL, NULL);
 
 	return 0;
 }
diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
index cf26ac74a188..3780fd6e7bfe 100644
--- a/net/decnet/dn_dev.c
+++ b/net/decnet/dn_dev.c
@@ -1414,9 +1414,9 @@ void __init dn_dev_init(void)
 
 	dn_dev_devices_on();
 
-	rtnl_register(PF_DECnet, RTM_NEWADDR, dn_nl_newaddr, NULL);
-	rtnl_register(PF_DECnet, RTM_DELADDR, dn_nl_deladdr, NULL);
-	rtnl_register(PF_DECnet, RTM_GETADDR, NULL, dn_nl_dump_ifaddr);
+	rtnl_register(PF_DECnet, RTM_NEWADDR, dn_nl_newaddr, NULL, NULL);
+	rtnl_register(PF_DECnet, RTM_DELADDR, dn_nl_deladdr, NULL, NULL);
+	rtnl_register(PF_DECnet, RTM_GETADDR, NULL, dn_nl_dump_ifaddr, NULL);
 
 	proc_net_fops_create(&init_net, "decnet_dev", S_IRUGO, &dn_dev_seq_fops);
 
diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c
index 1c74ed36ce8f..104324d6d535 100644
--- a/net/decnet/dn_fib.c
+++ b/net/decnet/dn_fib.c
@@ -763,8 +763,8 @@ void __init dn_fib_init(void)
 
 	register_dnaddr_notifier(&dn_fib_dnaddr_notifier);
 
-	rtnl_register(PF_DECnet, RTM_NEWROUTE, dn_fib_rtm_newroute, NULL);
-	rtnl_register(PF_DECnet, RTM_DELROUTE, dn_fib_rtm_delroute, NULL);
+	rtnl_register(PF_DECnet, RTM_NEWROUTE, dn_fib_rtm_newroute, NULL, NULL);
+	rtnl_register(PF_DECnet, RTM_DELROUTE, dn_fib_rtm_delroute, NULL, NULL);
 }
 
 
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 74544bc6fdec..2949ca474ede 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -1841,10 +1841,11 @@ void __init dn_route_init(void)
 	proc_net_fops_create(&init_net, "decnet_cache", S_IRUGO, &dn_rt_cache_seq_fops);
 
 #ifdef CONFIG_DECNET_ROUTER
-	rtnl_register(PF_DECnet, RTM_GETROUTE, dn_cache_getroute, dn_fib_dump);
+	rtnl_register(PF_DECnet, RTM_GETROUTE, dn_cache_getroute,
+		      dn_fib_dump, NULL);
 #else
 	rtnl_register(PF_DECnet, RTM_GETROUTE, dn_cache_getroute,
-		      dn_cache_dump);
+		      dn_cache_dump, NULL);
 #endif
 }
 
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 0d4a184af16f..37b3c188d8b3 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1833,8 +1833,8 @@ void __init devinet_init(void)
 
 	rtnl_af_register(&inet_af_ops);
 
-	rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL);
-	rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL);
-	rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr);
+	rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, NULL);
+	rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, NULL);
+	rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, NULL);
 }
 
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 22524716fe70..92fc5f69f5da 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1124,9 +1124,9 @@ static struct pernet_operations fib_net_ops = {
 
 void __init ip_fib_init(void)
 {
-	rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL);
-	rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL);
-	rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib);
+	rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL);
+	rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL);
+	rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL);
 
 	register_pernet_subsys(&fib_net_ops);
 	register_netdevice_notifier(&fib_netdev_notifier);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 6ffe94ca5bc9..5ff47656fced 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -871,7 +871,7 @@ static int inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		}
 
 		return netlink_dump_start(idiagnl, skb, nlh,
-					  inet_diag_dump, NULL);
+					  inet_diag_dump, NULL, 0);
 	}
 
 	return inet_diag_get_exact(skb, nlh);
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 30a7763c400e..aae2bd8cd924 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -2544,7 +2544,8 @@ int __init ip_mr_init(void)
 		goto add_proto_fail;
 	}
 #endif
-	rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE, NULL, ipmr_rtm_dumproute);
+	rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE,
+		      NULL, ipmr_rtm_dumproute, NULL);
 	return 0;
 
 #ifdef CONFIG_IP_PIMSM_V2
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 52b0b956508b..aa29c6291353 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -3295,7 +3295,7 @@ int __init ip_rt_init(void)
 	xfrm_init();
 	xfrm4_init(ip_rt_max_size);
 #endif
-	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
+	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
 
 #ifdef CONFIG_SYSCTL
 	register_pernet_subsys(&sysctl_route_ops);
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 3e369425df58..05838c7fcf64 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4727,16 +4727,20 @@ int __init addrconf_init(void)
 	if (err < 0)
 		goto errout_af;
 
-	err = __rtnl_register(PF_INET6, RTM_GETLINK, NULL, inet6_dump_ifinfo);
+	err = __rtnl_register(PF_INET6, RTM_GETLINK, NULL, inet6_dump_ifinfo,
+			      NULL);
 	if (err < 0)
 		goto errout;
 
 	/* Only the first call to __rtnl_register can fail */
-	__rtnl_register(PF_INET6, RTM_NEWADDR, inet6_rtm_newaddr, NULL);
-	__rtnl_register(PF_INET6, RTM_DELADDR, inet6_rtm_deladdr, NULL);
-	__rtnl_register(PF_INET6, RTM_GETADDR, inet6_rtm_getaddr, inet6_dump_ifaddr);
-	__rtnl_register(PF_INET6, RTM_GETMULTICAST, NULL, inet6_dump_ifmcaddr);
-	__rtnl_register(PF_INET6, RTM_GETANYCAST, NULL, inet6_dump_ifacaddr);
+	__rtnl_register(PF_INET6, RTM_NEWADDR, inet6_rtm_newaddr, NULL, NULL);
+	__rtnl_register(PF_INET6, RTM_DELADDR, inet6_rtm_deladdr, NULL, NULL);
+	__rtnl_register(PF_INET6, RTM_GETADDR, inet6_rtm_getaddr,
+			inet6_dump_ifaddr, NULL);
+	__rtnl_register(PF_INET6, RTM_GETMULTICAST, NULL,
+			inet6_dump_ifmcaddr, NULL);
+	__rtnl_register(PF_INET6, RTM_GETANYCAST, NULL,
+			inet6_dump_ifacaddr, NULL);
 
 	ipv6_addr_label_rtnl_register();
 
diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c
index c8993e5a337c..2d8ddba9ee58 100644
--- a/net/ipv6/addrlabel.c
+++ b/net/ipv6/addrlabel.c
@@ -592,8 +592,11 @@ out:
 
 void __init ipv6_addr_label_rtnl_register(void)
 {
-	__rtnl_register(PF_INET6, RTM_NEWADDRLABEL, ip6addrlbl_newdel, NULL);
-	__rtnl_register(PF_INET6, RTM_DELADDRLABEL, ip6addrlbl_newdel, NULL);
-	__rtnl_register(PF_INET6, RTM_GETADDRLABEL, ip6addrlbl_get, ip6addrlbl_dump);
+	__rtnl_register(PF_INET6, RTM_NEWADDRLABEL, ip6addrlbl_newdel,
+			NULL, NULL);
+	__rtnl_register(PF_INET6, RTM_DELADDRLABEL, ip6addrlbl_newdel,
+			NULL, NULL);
+	__rtnl_register(PF_INET6, RTM_GETADDRLABEL, ip6addrlbl_get,
+			ip6addrlbl_dump, NULL);
 }
 
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 4076a0b14b20..3030bdfd3ca4 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1586,7 +1586,8 @@ int __init fib6_init(void)
 	if (ret)
 		goto out_kmem_cache_create;
 
-	ret = __rtnl_register(PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib);
+	ret = __rtnl_register(PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib,
+			      NULL);
 	if (ret)
 		goto out_unregister_subsys;
 out:
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 82a809901f8e..705c82886281 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -1354,7 +1354,8 @@ int __init ip6_mr_init(void)
 		goto add_proto_fail;
 	}
 #endif
-	rtnl_register(RTNL_FAMILY_IP6MR, RTM_GETROUTE, NULL, ip6mr_rtm_dumproute);
+	rtnl_register(RTNL_FAMILY_IP6MR, RTM_GETROUTE, NULL,
+		      ip6mr_rtm_dumproute, NULL);
 	return 0;
 #ifdef CONFIG_IPV6_PIMSM_V2
 add_proto_fail:
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index de2b1decd786..216ff31a0cc9 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2925,9 +2925,9 @@ int __init ip6_route_init(void)
 		goto xfrm6_init;
 
 	ret = -ENOBUFS;
-	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
-	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
-	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
+	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
+	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
+	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
 		goto fib6_rules_init;
 
 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 8041befc6555..333b0bedf298 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -1120,7 +1120,7 @@ ip_set_dump(struct sock *ctnl, struct sk_buff *skb,
 
 	return netlink_dump_start(ctnl, skb, nlh,
 				  ip_set_dump_start,
-				  ip_set_dump_done);
+				  ip_set_dump_done, 0);
 }
 
 /* Add, del and test */
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 482e90c61850..7dec88a1755b 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -970,7 +970,7 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
 
 	if (nlh->nlmsg_flags & NLM_F_DUMP)
 		return netlink_dump_start(ctnl, skb, nlh, ctnetlink_dump_table,
-					  ctnetlink_done);
+					  ctnetlink_done, 0);
 
 	err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone);
 	if (err < 0)
@@ -1840,7 +1840,7 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
 	if (nlh->nlmsg_flags & NLM_F_DUMP) {
 		return netlink_dump_start(ctnl, skb, nlh,
 					  ctnetlink_exp_dump_table,
-					  ctnetlink_exp_done);
+					  ctnetlink_exp_done, 0);
 	}
 
 	err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone);
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 6ef64adf7362..0b92f75491b1 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1659,13 +1659,10 @@ static int netlink_dump(struct sock *sk)
 {
 	struct netlink_sock *nlk = nlk_sk(sk);
 	struct netlink_callback *cb;
-	struct sk_buff *skb;
+	struct sk_buff *skb = NULL;
 	struct nlmsghdr *nlh;
 	int len, err = -ENOBUFS;
-
-	skb = sock_rmalloc(sk, NLMSG_GOODSIZE, 0, GFP_KERNEL);
-	if (!skb)
-		goto errout;
+	int alloc_size;
 
 	mutex_lock(nlk->cb_mutex);
 
@@ -1675,6 +1672,12 @@ static int netlink_dump(struct sock *sk)
 		goto errout_skb;
 	}
 
+	alloc_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE);
+
+	skb = sock_rmalloc(sk, alloc_size, 0, GFP_KERNEL);
+	if (!skb)
+		goto errout;
+
 	len = cb->dump(skb, cb);
 
 	if (len > 0) {
@@ -1721,7 +1724,8 @@ int netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
 		       const struct nlmsghdr *nlh,
 		       int (*dump)(struct sk_buff *skb,
 				   struct netlink_callback *),
-		       int (*done)(struct netlink_callback *))
+		       int (*done)(struct netlink_callback *),
+		       u16 min_dump_alloc)
 {
 	struct netlink_callback *cb;
 	struct sock *sk;
@@ -1735,6 +1739,7 @@ int netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
 	cb->dump = dump;
 	cb->done = done;
 	cb->nlh = nlh;
+	cb->min_dump_alloc = min_dump_alloc;
 	atomic_inc(&skb->users);
 	cb->skb = skb;
 
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 1781d99145e2..482fa571b4ee 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -525,7 +525,7 @@ static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 
 		genl_unlock();
 		err = netlink_dump_start(net->genl_sock, skb, nlh,
-					 ops->dumpit, ops->done);
+					 ops->dumpit, ops->done, 0);
 		genl_lock();
 		return err;
 	}
diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c
index 438accb7a5a8..d61f6761777d 100644
--- a/net/phonet/pn_netlink.c
+++ b/net/phonet/pn_netlink.c
@@ -289,15 +289,16 @@ out:
 
 int __init phonet_netlink_register(void)
 {
-	int err = __rtnl_register(PF_PHONET, RTM_NEWADDR, addr_doit, NULL);
+	int err = __rtnl_register(PF_PHONET, RTM_NEWADDR, addr_doit,
+				  NULL, NULL);
 	if (err)
 		return err;
 
 	/* Further __rtnl_register() cannot fail */
-	__rtnl_register(PF_PHONET, RTM_DELADDR, addr_doit, NULL);
-	__rtnl_register(PF_PHONET, RTM_GETADDR, NULL, getaddr_dumpit);
-	__rtnl_register(PF_PHONET, RTM_NEWROUTE, route_doit, NULL);
-	__rtnl_register(PF_PHONET, RTM_DELROUTE, route_doit, NULL);
-	__rtnl_register(PF_PHONET, RTM_GETROUTE, NULL, route_dumpit);
+	__rtnl_register(PF_PHONET, RTM_DELADDR, addr_doit, NULL, NULL);
+	__rtnl_register(PF_PHONET, RTM_GETADDR, NULL, getaddr_dumpit, NULL);
+	__rtnl_register(PF_PHONET, RTM_NEWROUTE, route_doit, NULL, NULL);
+	__rtnl_register(PF_PHONET, RTM_DELROUTE, route_doit, NULL, NULL);
+	__rtnl_register(PF_PHONET, RTM_GETROUTE, NULL, route_dumpit, NULL);
 	return 0;
 }
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index a606025814a1..2f64262ab5d2 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -1115,9 +1115,10 @@ nlmsg_failure:
 
 static int __init tc_action_init(void)
 {
-	rtnl_register(PF_UNSPEC, RTM_NEWACTION, tc_ctl_action, NULL);
-	rtnl_register(PF_UNSPEC, RTM_DELACTION, tc_ctl_action, NULL);
-	rtnl_register(PF_UNSPEC, RTM_GETACTION, tc_ctl_action, tc_dump_action);
+	rtnl_register(PF_UNSPEC, RTM_NEWACTION, tc_ctl_action, NULL, NULL);
+	rtnl_register(PF_UNSPEC, RTM_DELACTION, tc_ctl_action, NULL, NULL);
+	rtnl_register(PF_UNSPEC, RTM_GETACTION, tc_ctl_action, tc_dump_action,
+		      NULL);
 
 	return 0;
 }
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index bb2c523f8158..9563887f219f 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -610,10 +610,10 @@ EXPORT_SYMBOL(tcf_exts_dump_stats);
 
 static int __init tc_filter_init(void)
 {
-	rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL);
-	rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_ctl_tfilter, NULL);
+	rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, NULL);
+	rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_ctl_tfilter, NULL, NULL);
 	rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_ctl_tfilter,
-						 tc_dump_tfilter);
+		      tc_dump_tfilter, NULL);
 
 	return 0;
 }
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 6b8627661c98..8182aefafb02 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1792,12 +1792,12 @@ static int __init pktsched_init(void)
 	register_qdisc(&pfifo_head_drop_qdisc_ops);
 	register_qdisc(&mq_qdisc_ops);
 
-	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
-	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
-	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
-	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
-	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
-	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);
+	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
+	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
+	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL);
+	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
+	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
+	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL);
 
 	return 0;
 }
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index c658cb3bc7c3..0256b8a0a7cf 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -2299,7 +2299,8 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		if (link->dump == NULL)
 			return -EINVAL;
 
-		return netlink_dump_start(net->xfrm.nlsk, skb, nlh, link->dump, link->done);
+		return netlink_dump_start(net->xfrm.nlsk, skb, nlh,
+					  link->dump, link->done, 0);
 	}
 
 	err = nlmsg_parse(nlh, xfrm_msg_min[type], attrs, XFRMA_MAX,
-- 
cgit v1.2.3


From bdd4e85dc36cdbcfc1608a5b2a17c80a9db8986a Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 8 Jun 2011 01:13:27 +0200
Subject: sched: Isolate preempt counting in its own config option

Create a new CONFIG_PREEMPT_COUNT that handles the inc/dec
of preempt count offset independently. So that the offset
can be updated by preempt_disable() and preempt_enable()
even without the need for CONFIG_PREEMPT beeing set.

This prepares to make CONFIG_DEBUG_SPINLOCK_SLEEP working
with !CONFIG_PREEMPT where it currently doesn't detect
code that sleeps inside explicit preemption disabled
sections.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/bit_spinlock.h |  2 +-
 include/linux/hardirq.h      |  4 ++--
 include/linux/pagemap.h      |  4 ++--
 include/linux/preempt.h      | 26 +++++++++++++++++---------
 include/linux/rcupdate.h     | 12 ++++++------
 include/linux/sched.h        |  2 +-
 kernel/Kconfig.preempt       |  3 +++
 kernel/sched.c               |  2 +-
 8 files changed, 33 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bit_spinlock.h b/include/linux/bit_spinlock.h
index b4326bfa684f..564d997e2168 100644
--- a/include/linux/bit_spinlock.h
+++ b/include/linux/bit_spinlock.h
@@ -88,7 +88,7 @@ static inline int bit_spin_is_locked(int bitnum, unsigned long *addr)
 {
 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
 	return test_bit(bitnum, addr);
-#elif defined CONFIG_PREEMPT
+#elif defined CONFIG_PREEMPT_COUNT
 	return preempt_count();
 #else
 	return 1;
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index ba362171e8ae..f743883f769e 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -93,7 +93,7 @@
  */
 #define in_nmi()	(preempt_count() & NMI_MASK)
 
-#if defined(CONFIG_PREEMPT)
+#if defined(CONFIG_PREEMPT_COUNT)
 # define PREEMPT_CHECK_OFFSET 1
 #else
 # define PREEMPT_CHECK_OFFSET 0
@@ -115,7 +115,7 @@
 #define in_atomic_preempt_off() \
 		((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET)
 
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPT_COUNT
 # define preemptible()	(preempt_count() == 0 && !irqs_disabled())
 # define IRQ_EXIT_OFFSET (HARDIRQ_OFFSET-1)
 #else
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 716875e53520..8e38d4c140ff 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -134,7 +134,7 @@ static inline int page_cache_get_speculative(struct page *page)
 	VM_BUG_ON(in_interrupt());
 
 #if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU)
-# ifdef CONFIG_PREEMPT
+# ifdef CONFIG_PREEMPT_COUNT
 	VM_BUG_ON(!in_atomic());
 # endif
 	/*
@@ -172,7 +172,7 @@ static inline int page_cache_add_speculative(struct page *page, int count)
 	VM_BUG_ON(in_interrupt());
 
 #if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU)
-# ifdef CONFIG_PREEMPT
+# ifdef CONFIG_PREEMPT_COUNT
 	VM_BUG_ON(!in_atomic());
 # endif
 	VM_BUG_ON(page_count(page) == 0);
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 2e681d9555bd..58969b2a8a82 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -27,6 +27,21 @@
 
 asmlinkage void preempt_schedule(void);
 
+#define preempt_check_resched() \
+do { \
+	if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \
+		preempt_schedule(); \
+} while (0)
+
+#else /* !CONFIG_PREEMPT */
+
+#define preempt_check_resched()		do { } while (0)
+
+#endif /* CONFIG_PREEMPT */
+
+
+#ifdef CONFIG_PREEMPT_COUNT
+
 #define preempt_disable() \
 do { \
 	inc_preempt_count(); \
@@ -39,12 +54,6 @@ do { \
 	dec_preempt_count(); \
 } while (0)
 
-#define preempt_check_resched() \
-do { \
-	if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \
-		preempt_schedule(); \
-} while (0)
-
 #define preempt_enable() \
 do { \
 	preempt_enable_no_resched(); \
@@ -80,18 +89,17 @@ do { \
 	preempt_check_resched(); \
 } while (0)
 
-#else
+#else /* !CONFIG_PREEMPT_COUNT */
 
 #define preempt_disable()		do { } while (0)
 #define preempt_enable_no_resched()	do { } while (0)
 #define preempt_enable()		do { } while (0)
-#define preempt_check_resched()		do { } while (0)
 
 #define preempt_disable_notrace()		do { } while (0)
 #define preempt_enable_no_resched_notrace()	do { } while (0)
 #define preempt_enable_notrace()		do { } while (0)
 
-#endif
+#endif /* CONFIG_PREEMPT_COUNT */
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 99f9aa7c2804..8f4f881a0ad8 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -239,7 +239,7 @@ extern int rcu_read_lock_bh_held(void);
  * Check debug_lockdep_rcu_enabled() to prevent false positives during boot
  * and while lockdep is disabled.
  */
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPT_COUNT
 static inline int rcu_read_lock_sched_held(void)
 {
 	int lockdep_opinion = 0;
@@ -250,12 +250,12 @@ static inline int rcu_read_lock_sched_held(void)
 		lockdep_opinion = lock_is_held(&rcu_sched_lock_map);
 	return lockdep_opinion || preempt_count() != 0 || irqs_disabled();
 }
-#else /* #ifdef CONFIG_PREEMPT */
+#else /* #ifdef CONFIG_PREEMPT_COUNT */
 static inline int rcu_read_lock_sched_held(void)
 {
 	return 1;
 }
-#endif /* #else #ifdef CONFIG_PREEMPT */
+#endif /* #else #ifdef CONFIG_PREEMPT_COUNT */
 
 #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
@@ -276,17 +276,17 @@ static inline int rcu_read_lock_bh_held(void)
 	return 1;
 }
 
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPT_COUNT
 static inline int rcu_read_lock_sched_held(void)
 {
 	return preempt_count() != 0 || irqs_disabled();
 }
-#else /* #ifdef CONFIG_PREEMPT */
+#else /* #ifdef CONFIG_PREEMPT_COUNT */
 static inline int rcu_read_lock_sched_held(void)
 {
 	return 1;
 }
-#endif /* #else #ifdef CONFIG_PREEMPT */
+#endif /* #else #ifdef CONFIG_PREEMPT_COUNT */
 
 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 483c1ed5bc4d..4ecd5cbe7e24 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2502,7 +2502,7 @@ extern int _cond_resched(void);
 
 extern int __cond_resched_lock(spinlock_t *lock);
 
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPT_COUNT
 #define PREEMPT_LOCK_OFFSET	PREEMPT_OFFSET
 #else
 #define PREEMPT_LOCK_OFFSET	0
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index bf987b95b356..24e7cb0ba26a 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -35,6 +35,7 @@ config PREEMPT_VOLUNTARY
 
 config PREEMPT
 	bool "Preemptible Kernel (Low-Latency Desktop)"
+	select PREEMPT_COUNT
 	help
 	  This option reduces the latency of the kernel by making
 	  all kernel code (that is not executing in a critical section)
@@ -52,3 +53,5 @@ config PREEMPT
 
 endchoice
 
+config PREEMPT_COUNT
+       bool
\ No newline at end of file
diff --git a/kernel/sched.c b/kernel/sched.c
index 01d9536aaa8e..90ad7cf2b290 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2843,7 +2843,7 @@ void sched_fork(struct task_struct *p)
 #if defined(CONFIG_SMP)
 	p->on_cpu = 0;
 #endif
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPT_COUNT
 	/* Want to start with kernel preemption disabled. */
 	task_thread_info(p)->preempt_count = 1;
 #endif
-- 
cgit v1.2.3


From e3ae0cac00042d7fb76914c30c5f991f918e65b4 Mon Sep 17 00:00:00 2001
From: Arend van Spriel <arend@broadcom.com>
Date: Thu, 9 Jun 2011 20:07:20 +0200
Subject: drivers: bcma: export bcma_core_disable() function

In the brcm80211 driver we disable the 80211 core when the driver is
'down'. The bcma_core_disable() function exactly does the same as
our implementation so exporting this function makes sense.

Cc: linux-wireless@vger.kernel.org
Cc: Rafal Milecki <zajec5@gmail.com>
Signed-off-by: Arend van Spriel <arend@broadcom.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/bcma/core.c       | 3 ++-
 include/linux/bcma/bcma.h | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/bcma/core.c b/drivers/bcma/core.c
index ced379f7b371..1ec7d4528dd0 100644
--- a/drivers/bcma/core.c
+++ b/drivers/bcma/core.c
@@ -19,7 +19,7 @@ bool bcma_core_is_enabled(struct bcma_device *core)
 }
 EXPORT_SYMBOL_GPL(bcma_core_is_enabled);
 
-static void bcma_core_disable(struct bcma_device *core, u32 flags)
+void bcma_core_disable(struct bcma_device *core, u32 flags)
 {
 	if (bcma_aread32(core, BCMA_RESET_CTL) & BCMA_RESET_CTL_RESET)
 		return;
@@ -31,6 +31,7 @@ static void bcma_core_disable(struct bcma_device *core, u32 flags)
 	bcma_awrite32(core, BCMA_RESET_CTL, BCMA_RESET_CTL_RESET);
 	udelay(1);
 }
+EXPORT_SYMBOL_GPL(bcma_core_disable);
 
 int bcma_core_enable(struct bcma_device *core, u32 flags)
 {
diff --git a/include/linux/bcma/bcma.h b/include/linux/bcma/bcma.h
index 6ff080eac0b2..3895aeb494a3 100644
--- a/include/linux/bcma/bcma.h
+++ b/include/linux/bcma/bcma.h
@@ -244,6 +244,7 @@ void bcma_awrite32(struct bcma_device *core, u16 offset, u32 value)
 }
 
 extern bool bcma_core_is_enabled(struct bcma_device *core);
+extern void bcma_core_disable(struct bcma_device *core, u32 flags);
 extern int bcma_core_enable(struct bcma_device *core, u32 flags);
 
 #endif /* LINUX_BCMA_H_ */
-- 
cgit v1.2.3


From 10a8d94a95742bb15b4e617ee9884bb4381362be Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Fri, 10 Jun 2011 00:56:17 +0000
Subject: virtio_net: introduce VIRTIO_NET_HDR_F_DATA_VALID

There's no need for the guest to validate the checksum if it have been
validated by host nics. So this patch introduces a new flag -
VIRTIO_NET_HDR_F_DATA_VALID which is used to bypass the checksum
examing in guest. The backend (tap/macvtap) may set this flag when
met skbs with CHECKSUM_UNNECESSARY to save cpu utilization.

No feature negotiation is needed as old driver just ignore this flag.

Iperf shows 12%-30% performance improvement for UDP traffic. For TCP,
when gro is on no difference as it produces skb with partial
checksum. But when gro is disabled, 20% or even higher improvement
could be measured by netperf.

Signed-off-by: Jason Wang <jasowang@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/macvtap.c      | 2 ++
 drivers/net/tun.c          | 2 ++
 drivers/net/virtio_net.c   | 2 ++
 include/linux/virtio_net.h | 1 +
 net/packet/af_packet.c     | 2 ++
 5 files changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 6696e56e6320..ecee0fe65a97 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -508,6 +508,8 @@ static int macvtap_skb_to_vnet_hdr(const struct sk_buff *skb,
 		vnet_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
 		vnet_hdr->csum_start = skb_checksum_start_offset(skb);
 		vnet_hdr->csum_offset = skb->csum_offset;
+	} else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
+		vnet_hdr->flags = VIRTIO_NET_HDR_F_DATA_VALID;
 	} /* else everything is zero */
 
 	return 0;
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index ef68e13c042d..4dab85eecb60 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -788,6 +788,8 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 			gso.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
 			gso.csum_start = skb_checksum_start_offset(skb);
 			gso.csum_offset = skb->csum_offset;
+		} else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
+			gso.flags = VIRTIO_NET_HDR_F_DATA_VALID;
 		} /* else everything is zero */
 
 		if (unlikely(memcpy_toiovecend(iv, (void *)&gso, total,
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index f6853247a620..be3686a298da 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -274,6 +274,8 @@ static void receive_buf(struct net_device *dev, void *buf, unsigned int len)
 					  hdr->hdr.csum_start,
 					  hdr->hdr.csum_offset))
 			goto frame_err;
+	} else if (hdr->hdr.flags & VIRTIO_NET_HDR_F_DATA_VALID) {
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
 	}
 
 	skb->protocol = eth_type_trans(skb, dev);
diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index 136040bba3e3..970d5a2a9047 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -63,6 +63,7 @@ struct virtio_net_config {
  * specify GSO or CSUM features, you can simply ignore the header. */
 struct virtio_net_hdr {
 #define VIRTIO_NET_HDR_F_NEEDS_CSUM	1	// Use csum_start, csum_offset
+#define VIRTIO_NET_HDR_F_DATA_VALID	2	// Csum is valid
 	__u8 flags;
 #define VIRTIO_NET_HDR_GSO_NONE		0	// Not a GSO frame
 #define VIRTIO_NET_HDR_GSO_TCPV4	1	// GSO frame, IPv4 TCP (TSO)
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 67f6749a0a45..b54ec41adea9 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1685,6 +1685,8 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
 			vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
 			vnet_hdr.csum_start = skb_checksum_start_offset(skb);
 			vnet_hdr.csum_offset = skb->csum_offset;
+		} else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
+			vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
 		} /* else everything is zero */
 
 		err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
-- 
cgit v1.2.3


From 55309216baeb9d7f951520cf8e8bf2337cd17bad Mon Sep 17 00:00:00 2001
From: Harry Butterworth <heb1001@gmail.com>
Date: Sat, 11 Jun 2011 16:02:06 +0800
Subject: ALSA: ctxfi: Add support for Creative Titanium HD

Initialise model-specific DAC and ADC parts.
Add controls for output and mic source selection.
Rename some mixer controls according to ControlNames.txt.
Remove Playback switches for Line-in and IEC958-in - these
were controlling the input mute/unmute which affected
capture too.  Use the capture switches to control the
input mute/unmute instead - it's less confusing.
Initialise the WM8775 to invert the left-right clock
to swap the left and right channels of the mic and aux
input.

Signed-off-by: Harry Butterworth <heb1001@gmail.com>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/linux/pci_ids.h      |   1 +
 sound/pci/ctxfi/ct20k2reg.h  |   1 +
 sound/pci/ctxfi/ctatc.c      | 124 ++++++++++++-----
 sound/pci/ctxfi/ctatc.h      |   8 ++
 sound/pci/ctxfi/ctdaio.c     |  23 +--
 sound/pci/ctxfi/ctdaio.h     |   1 +
 sound/pci/ctxfi/cthardware.h |   8 ++
 sound/pci/ctxfi/cthw20k1.c   |  18 +++
 sound/pci/ctxfi/cthw20k2.c   | 323 ++++++++++++++++++++++++++++++++-----------
 sound/pci/ctxfi/ctmixer.c    | 141 ++++++++++++++-----
 sound/pci/ctxfi/xfi.c        |   4 +-
 11 files changed, 488 insertions(+), 164 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index a311008af5e1..f23f8bf02b04 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1308,6 +1308,7 @@
 #define PCI_SUBDEVICE_ID_CREATIVE_SB08801	0x0041
 #define PCI_SUBDEVICE_ID_CREATIVE_SB08802	0x0042
 #define PCI_SUBDEVICE_ID_CREATIVE_SB08803	0x0043
+#define PCI_SUBDEVICE_ID_CREATIVE_SB1270	0x0062
 #define PCI_SUBDEVICE_ID_CREATIVE_HENDRIX	0x6000
 
 #define PCI_VENDOR_ID_ECTIVA		0x1102 /* duplicate: CREATIVE */
diff --git a/sound/pci/ctxfi/ct20k2reg.h b/sound/pci/ctxfi/ct20k2reg.h
index e0394e3996e8..ca501ba03d64 100644
--- a/sound/pci/ctxfi/ct20k2reg.h
+++ b/sound/pci/ctxfi/ct20k2reg.h
@@ -55,6 +55,7 @@
 /* GPIO Registers */
 #define GPIO_DATA           0x1B7020
 #define GPIO_CTRL           0x1B7024
+#define GPIO_EXT_DATA       0x1B70A0
 
 /* Virtual memory registers */
 #define VMEM_PTPAL          0x1C6300 /* 0x1C6300 + (16 * Chn) */
diff --git a/sound/pci/ctxfi/ctatc.c b/sound/pci/ctxfi/ctatc.c
index 13f33c0719d3..952fd94c2666 100644
--- a/sound/pci/ctxfi/ctatc.c
+++ b/sound/pci/ctxfi/ctatc.c
@@ -30,7 +30,6 @@
 #include <sound/asoundef.h>
 
 #define MONO_SUM_SCALE	0x19a8	/* 2^(-0.5) in 14-bit floating format */
-#define DAIONUM		7
 #define MAX_MULTI_CHN	8
 
 #define IEC958_DEFAULT_CON ((IEC958_AES0_NONAUDIO \
@@ -53,6 +52,8 @@ static struct snd_pci_quirk __devinitdata subsys_20k1_list[] = {
 static struct snd_pci_quirk __devinitdata subsys_20k2_list[] = {
 	SND_PCI_QUIRK(PCI_VENDOR_ID_CREATIVE, PCI_SUBDEVICE_ID_CREATIVE_SB0760,
 		      "SB0760", CTSB0760),
+	SND_PCI_QUIRK(PCI_VENDOR_ID_CREATIVE, PCI_SUBDEVICE_ID_CREATIVE_SB1270,
+		      "SB1270", CTSB1270),
 	SND_PCI_QUIRK(PCI_VENDOR_ID_CREATIVE, PCI_SUBDEVICE_ID_CREATIVE_SB08801,
 		      "SB0880", CTSB0880),
 	SND_PCI_QUIRK(PCI_VENDOR_ID_CREATIVE, PCI_SUBDEVICE_ID_CREATIVE_SB08802,
@@ -75,6 +76,7 @@ static const char *ct_subsys_name[NUM_CTCARDS] = {
 	[CTSB0760]	= "SB076x",
 	[CTHENDRIX]	= "Hendrix",
 	[CTSB0880]	= "SB0880",
+	[CTSB1270]      = "SB1270",
 	[CT20K2_UNKNOWN] = "Unknown",
 };
 
@@ -459,12 +461,12 @@ static void setup_src_node_conf(struct ct_atc *atc, struct ct_atc_pcm *apcm,
 				apcm->substream->runtime->rate);
 	*n_srcc = 0;
 
-	if (1 == atc->msr) {
+	if (1 == atc->msr) { /* FIXME: do we really need SRC here if pitch==1 */
 		*n_srcc = apcm->substream->runtime->channels;
 		conf[0].pitch = pitch;
 		conf[0].mix_msr = conf[0].imp_msr = conf[0].msr = 1;
 		conf[0].vo = 1;
-	} else if (2 == atc->msr) {
+	} else if (2 <= atc->msr) {
 		if (0x8000000 < pitch) {
 			/* Need two-stage SRCs, SRCIMPs and
 			 * AMIXERs for converting format */
@@ -977,6 +979,55 @@ static int atc_have_digit_io_switch(struct ct_atc *atc)
 	return hw->have_digit_io_switch(hw);
 }
 
+static int atc_have_dedicated_mic(struct ct_atc *atc)
+{
+	struct hw *hw = atc->hw;
+
+	return hw->have_dedicated_mic(hw);
+}
+
+static int atc_have_output_switch(struct ct_atc *atc)
+{
+	struct hw *hw = atc->hw;
+
+	return hw->have_output_switch(hw);
+}
+
+static int atc_output_switch_get(struct ct_atc *atc)
+{
+	struct hw *hw = atc->hw;
+
+	return hw->output_switch_get(hw);
+}
+
+static int atc_output_switch_put(struct ct_atc *atc, int position)
+{
+	struct hw *hw = atc->hw;
+
+	return hw->output_switch_put(hw, position);
+}
+
+static int atc_have_mic_source_switch(struct ct_atc *atc)
+{
+	struct hw *hw = atc->hw;
+
+	return hw->have_mic_source_switch(hw);
+}
+
+static int atc_mic_source_switch_get(struct ct_atc *atc)
+{
+	struct hw *hw = atc->hw;
+
+	return hw->mic_source_switch_get(hw);
+}
+
+static int atc_mic_source_switch_put(struct ct_atc *atc, int position)
+{
+	struct hw *hw = atc->hw;
+
+	return hw->mic_source_switch_put(hw, position);
+}
+
 static int atc_select_digit_io(struct ct_atc *atc)
 {
 	struct hw *hw = atc->hw;
@@ -1045,6 +1096,11 @@ static int atc_line_in_unmute(struct ct_atc *atc, unsigned char state)
 	return atc_daio_unmute(atc, state, LINEIM);
 }
 
+static int atc_mic_unmute(struct ct_atc *atc, unsigned char state)
+{
+	return atc_daio_unmute(atc, state, MIC);
+}
+
 static int atc_spdif_out_unmute(struct ct_atc *atc, unsigned char state)
 {
 	return atc_daio_unmute(atc, state, SPDIFOO);
@@ -1331,17 +1387,20 @@ static int atc_get_resources(struct ct_atc *atc)
 	struct srcimp_mgr *srcimp_mgr;
 	struct sum_desc sum_dsc = {0};
 	struct sum_mgr *sum_mgr;
-	int err, i;
+	int err, i, num_srcs, num_daios;
+
+	num_daios = ((atc->model == CTSB1270) ? 8 : 7);
+	num_srcs = ((atc->model == CTSB1270) ? 6 : 4);
 
-	atc->daios = kzalloc(sizeof(void *)*(DAIONUM), GFP_KERNEL);
+	atc->daios = kzalloc(sizeof(void *)*num_daios, GFP_KERNEL);
 	if (!atc->daios)
 		return -ENOMEM;
 
-	atc->srcs = kzalloc(sizeof(void *)*(2*2), GFP_KERNEL);
+	atc->srcs = kzalloc(sizeof(void *)*num_srcs, GFP_KERNEL);
 	if (!atc->srcs)
 		return -ENOMEM;
 
-	atc->srcimps = kzalloc(sizeof(void *)*(2*2), GFP_KERNEL);
+	atc->srcimps = kzalloc(sizeof(void *)*num_srcs, GFP_KERNEL);
 	if (!atc->srcimps)
 		return -ENOMEM;
 
@@ -1351,8 +1410,9 @@ static int atc_get_resources(struct ct_atc *atc)
 
 	daio_mgr = (struct daio_mgr *)atc->rsc_mgrs[DAIO];
 	da_desc.msr = atc->msr;
-	for (i = 0, atc->n_daio = 0; i < DAIONUM-1; i++) {
-		da_desc.type = i;
+	for (i = 0, atc->n_daio = 0; i < num_daios; i++) {
+		da_desc.type = (atc->model != CTSB073X) ? i :
+			     ((i == SPDIFIO) ? SPDIFI1 : i);
 		err = daio_mgr->get_daio(daio_mgr, &da_desc,
 					(struct daio **)&atc->daios[i]);
 		if (err) {
@@ -1362,23 +1422,12 @@ static int atc_get_resources(struct ct_atc *atc)
 		}
 		atc->n_daio++;
 	}
-	if (atc->model == CTSB073X)
-		da_desc.type = SPDIFI1;
-	else
-		da_desc.type = SPDIFIO;
-	err = daio_mgr->get_daio(daio_mgr, &da_desc,
-				(struct daio **)&atc->daios[i]);
-	if (err) {
-		printk(KERN_ERR "ctxfi: Failed to get S/PDIF-in resource!!!\n");
-		return err;
-	}
-	atc->n_daio++;
 
 	src_mgr = atc->rsc_mgrs[SRC];
 	src_dsc.multi = 1;
 	src_dsc.msr = atc->msr;
 	src_dsc.mode = ARCRW;
-	for (i = 0, atc->n_src = 0; i < (2*2); i++) {
+	for (i = 0, atc->n_src = 0; i < num_srcs; i++) {
 		err = src_mgr->get_src(src_mgr, &src_dsc,
 					(struct src **)&atc->srcs[i]);
 		if (err)
@@ -1388,8 +1437,8 @@ static int atc_get_resources(struct ct_atc *atc)
 	}
 
 	srcimp_mgr = atc->rsc_mgrs[SRCIMP];
-	srcimp_dsc.msr = 8; /* SRCIMPs for S/PDIFIn SRT */
-	for (i = 0, atc->n_srcimp = 0; i < (2*1); i++) {
+	srcimp_dsc.msr = 8;
+	for (i = 0, atc->n_srcimp = 0; i < num_srcs; i++) {
 		err = srcimp_mgr->get_srcimp(srcimp_mgr, &srcimp_dsc,
 					(struct srcimp **)&atc->srcimps[i]);
 		if (err)
@@ -1397,15 +1446,6 @@ static int atc_get_resources(struct ct_atc *atc)
 
 		atc->n_srcimp++;
 	}
-	srcimp_dsc.msr = 8; /* SRCIMPs for LINE/MICIn SRT */
-	for (i = 0; i < (2*1); i++) {
-		err = srcimp_mgr->get_srcimp(srcimp_mgr, &srcimp_dsc,
-				(struct srcimp **)&atc->srcimps[2*1+i]);
-		if (err)
-			return err;
-
-		atc->n_srcimp++;
-	}
 
 	sum_mgr = atc->rsc_mgrs[SUM];
 	sum_dsc.msr = atc->msr;
@@ -1488,6 +1528,18 @@ static void atc_connect_resources(struct ct_atc *atc)
 	src = atc->srcs[3];
 	mixer->set_input_right(mixer, MIX_LINE_IN, &src->rsc);
 
+	if (atc->model == CTSB1270) {
+		/* Titanium HD has a dedicated ADC for the Mic. */
+		dai = container_of(atc->daios[MIC], struct dai, daio);
+		atc_connect_dai(atc->rsc_mgrs[SRC], dai,
+			(struct src **)&atc->srcs[4],
+			(struct srcimp **)&atc->srcimps[4]);
+		src = atc->srcs[4];
+		mixer->set_input_left(mixer, MIX_MIC_IN, &src->rsc);
+		src = atc->srcs[5];
+		mixer->set_input_right(mixer, MIX_MIC_IN, &src->rsc);
+	}
+
 	dai = container_of(atc->daios[SPDIFIO], struct dai, daio);
 	atc_connect_dai(atc->rsc_mgrs[SRC], dai,
 			(struct src **)&atc->srcs[0],
@@ -1606,12 +1658,20 @@ static struct ct_atc atc_preset __devinitdata = {
 	.line_clfe_unmute = atc_line_clfe_unmute,
 	.line_rear_unmute = atc_line_rear_unmute,
 	.line_in_unmute = atc_line_in_unmute,
+	.mic_unmute = atc_mic_unmute,
 	.spdif_out_unmute = atc_spdif_out_unmute,
 	.spdif_in_unmute = atc_spdif_in_unmute,
 	.spdif_out_get_status = atc_spdif_out_get_status,
 	.spdif_out_set_status = atc_spdif_out_set_status,
 	.spdif_out_passthru = atc_spdif_out_passthru,
 	.have_digit_io_switch = atc_have_digit_io_switch,
+	.have_dedicated_mic = atc_have_dedicated_mic,
+	.have_output_switch = atc_have_output_switch,
+	.output_switch_get = atc_output_switch_get,
+	.output_switch_put = atc_output_switch_put,
+	.have_mic_source_switch = atc_have_mic_source_switch,
+	.mic_source_switch_get = atc_mic_source_switch_get,
+	.mic_source_switch_put = atc_mic_source_switch_put,
 #ifdef CONFIG_PM
 	.suspend = atc_suspend,
 	.resume = atc_resume,
diff --git a/sound/pci/ctxfi/ctatc.h b/sound/pci/ctxfi/ctatc.h
index 7167c0185d52..6bad27e06f4d 100644
--- a/sound/pci/ctxfi/ctatc.h
+++ b/sound/pci/ctxfi/ctatc.h
@@ -115,12 +115,20 @@ struct ct_atc {
 	int (*line_clfe_unmute)(struct ct_atc *atc, unsigned char state);
 	int (*line_rear_unmute)(struct ct_atc *atc, unsigned char state);
 	int (*line_in_unmute)(struct ct_atc *atc, unsigned char state);
+	int (*mic_unmute)(struct ct_atc *atc, unsigned char state);
 	int (*spdif_out_unmute)(struct ct_atc *atc, unsigned char state);
 	int (*spdif_in_unmute)(struct ct_atc *atc, unsigned char state);
 	int (*spdif_out_get_status)(struct ct_atc *atc, unsigned int *status);
 	int (*spdif_out_set_status)(struct ct_atc *atc, unsigned int status);
 	int (*spdif_out_passthru)(struct ct_atc *atc, unsigned char state);
 	int (*have_digit_io_switch)(struct ct_atc *atc);
+	int (*have_dedicated_mic)(struct ct_atc *atc);
+	int (*have_output_switch)(struct ct_atc *atc);
+	int (*output_switch_get)(struct ct_atc *atc);
+	int (*output_switch_put)(struct ct_atc *atc, int position);
+	int (*have_mic_source_switch)(struct ct_atc *atc);
+	int (*mic_source_switch_get)(struct ct_atc *atc);
+	int (*mic_source_switch_put)(struct ct_atc *atc, int position);
 
 	/* Don't touch! Used for internal object. */
 	void *rsc_mgrs[NUM_RSCTYP]; /* chip resource managers */
diff --git a/sound/pci/ctxfi/ctdaio.c b/sound/pci/ctxfi/ctdaio.c
index 47d9ea97de02..0c00eb4088ef 100644
--- a/sound/pci/ctxfi/ctdaio.c
+++ b/sound/pci/ctxfi/ctdaio.c
@@ -22,20 +22,9 @@
 #include <linux/slab.h>
 #include <linux/kernel.h>
 
-#define DAIO_RESOURCE_NUM	NUM_DAIOTYP
 #define DAIO_OUT_MAX		SPDIFOO
 
-union daio_usage {
-	struct {
-		unsigned short lineo1:1;
-		unsigned short lineo2:1;
-		unsigned short lineo3:1;
-		unsigned short lineo4:1;
-		unsigned short spdifoo:1;
-		unsigned short lineim:1;
-		unsigned short spdifio:1;
-		unsigned short spdifi1:1;
-	} bf;
+struct daio_usage {
 	unsigned short data;
 };
 
@@ -61,6 +50,7 @@ struct daio_rsc_idx idx_20k2[NUM_DAIOTYP] = {
 	[LINEO3] = {.left = 0x50, .right = 0x51},
 	[LINEO4] = {.left = 0x70, .right = 0x71},
 	[LINEIM] = {.left = 0x45, .right = 0xc5},
+	[MIC]	 = {.left = 0x55, .right = 0xd5},
 	[SPDIFOO] = {.left = 0x00, .right = 0x01},
 	[SPDIFIO] = {.left = 0x05, .right = 0x85},
 };
@@ -138,6 +128,7 @@ static unsigned int daio_device_index(enum DAIOTYP type, struct hw *hw)
 		case LINEO3:	return 5;
 		case LINEO4:	return 6;
 		case LINEIM:	return 4;
+		case MIC:	return 5;
 		default:	return -EINVAL;
 		}
 	default:
@@ -519,17 +510,17 @@ static int dai_rsc_uninit(struct dai *dai)
 
 static int daio_mgr_get_rsc(struct rsc_mgr *mgr, enum DAIOTYP type)
 {
-	if (((union daio_usage *)mgr->rscs)->data & (0x1 << type))
+	if (((struct daio_usage *)mgr->rscs)->data & (0x1 << type))
 		return -ENOENT;
 
-	((union daio_usage *)mgr->rscs)->data |= (0x1 << type);
+	((struct daio_usage *)mgr->rscs)->data |= (0x1 << type);
 
 	return 0;
 }
 
 static int daio_mgr_put_rsc(struct rsc_mgr *mgr, enum DAIOTYP type)
 {
-	((union daio_usage *)mgr->rscs)->data &= ~(0x1 << type);
+	((struct daio_usage *)mgr->rscs)->data &= ~(0x1 << type);
 
 	return 0;
 }
@@ -712,7 +703,7 @@ int daio_mgr_create(void *hw, struct daio_mgr **rdaio_mgr)
 	if (!daio_mgr)
 		return -ENOMEM;
 
-	err = rsc_mgr_init(&daio_mgr->mgr, DAIO, DAIO_RESOURCE_NUM, hw);
+	err = rsc_mgr_init(&daio_mgr->mgr, DAIO, NUM_DAIOTYP, hw);
 	if (err)
 		goto error1;
 
diff --git a/sound/pci/ctxfi/ctdaio.h b/sound/pci/ctxfi/ctdaio.h
index 0f52ce571ee8..85ccb6ee1ab4 100644
--- a/sound/pci/ctxfi/ctdaio.h
+++ b/sound/pci/ctxfi/ctdaio.h
@@ -33,6 +33,7 @@ enum DAIOTYP {
 	SPDIFOO,	/* S/PDIF Out (Flexijack/Optical) */
 	LINEIM,
 	SPDIFIO,	/* S/PDIF In (Flexijack/Optical) on the card */
+	MIC,		/* Dedicated mic on Titanium HD */
 	SPDIFI1,	/* S/PDIF In on internal Drive Bay */
 	NUM_DAIOTYP
 };
diff --git a/sound/pci/ctxfi/cthardware.h b/sound/pci/ctxfi/cthardware.h
index af55405f5dec..de59bbf2702f 100644
--- a/sound/pci/ctxfi/cthardware.h
+++ b/sound/pci/ctxfi/cthardware.h
@@ -39,6 +39,7 @@ enum CTCARDS {
 	CT20K2_MODEL_FIRST = CTSB0760,
 	CTHENDRIX,
 	CTSB0880,
+	CTSB1270,
 	CT20K2_UNKNOWN,
 	NUM_CTCARDS		/* This should always be the last */
 };
@@ -71,6 +72,13 @@ struct hw {
 	int (*is_adc_source_selected)(struct hw *hw, enum ADCSRC source);
 	int (*select_adc_source)(struct hw *hw, enum ADCSRC source);
 	int (*have_digit_io_switch)(struct hw *hw);
+	int (*have_dedicated_mic)(struct hw *hw);
+	int (*have_output_switch)(struct hw *hw);
+	int (*output_switch_get)(struct hw *hw);
+	int (*output_switch_put)(struct hw *hw, int position);
+	int (*have_mic_source_switch)(struct hw *hw);
+	int (*mic_source_switch_get)(struct hw *hw);
+	int (*mic_source_switch_put)(struct hw *hw, int position);
 
 	/* SRC operations */
 	int (*src_rsc_get_ctrl_blk)(void **rblk);
diff --git a/sound/pci/ctxfi/cthw20k1.c b/sound/pci/ctxfi/cthw20k1.c
index a5c957db5cea..9a85a84b23ab 100644
--- a/sound/pci/ctxfi/cthw20k1.c
+++ b/sound/pci/ctxfi/cthw20k1.c
@@ -1783,6 +1783,21 @@ static int hw_have_digit_io_switch(struct hw *hw)
 	return !(hw->model == CTSB073X || hw->model == CTUAA);
 }
 
+static int hw_have_dedicated_mic(struct hw *hw)
+{
+	return 0;
+}
+
+static int hw_have_output_switch(struct hw *hw)
+{
+	return 0;
+}
+
+static int hw_have_mic_source_switch(struct hw *hw)
+{
+	return 0;
+}
+
 #define CTLBITS(a, b, c, d)	(((a) << 24) | ((b) << 16) | ((c) << 8) | (d))
 
 #define UAA_CFG_PWRSTATUS	0x44
@@ -2173,6 +2188,9 @@ static struct hw ct20k1_preset __devinitdata = {
 	.is_adc_source_selected = hw_is_adc_input_selected,
 	.select_adc_source = hw_adc_input_select,
 	.have_digit_io_switch = hw_have_digit_io_switch,
+	.have_dedicated_mic = hw_have_dedicated_mic,
+	.have_output_switch = hw_have_output_switch,
+	.have_mic_source_switch = hw_have_mic_source_switch,
 #ifdef CONFIG_PM
 	.suspend = hw_suspend,
 	.resume = hw_resume,
diff --git a/sound/pci/ctxfi/cthw20k2.c b/sound/pci/ctxfi/cthw20k2.c
index 5364164674e4..8bc6e41ce64b 100644
--- a/sound/pci/ctxfi/cthw20k2.c
+++ b/sound/pci/ctxfi/cthw20k2.c
@@ -8,7 +8,7 @@
  * @File	cthw20k2.c
  *
  * @Brief
- * This file contains the implementation of hardware access methord for 20k2.
+ * This file contains the implementation of hardware access method for 20k2.
  *
  * @Author	Liu Chun
  * @Date 	May 14 2008
@@ -38,6 +38,8 @@ struct hw20k2 {
 	unsigned char dev_id;
 	unsigned char addr_size;
 	unsigned char data_size;
+
+	int mic_source;
 };
 
 static u32 hw_read_20kx(struct hw *hw, u32 reg);
@@ -1163,7 +1165,12 @@ static int hw_daio_init(struct hw *hw, const struct daio_conf *info)
 		hw_write_20kx(hw, AUDIO_IO_TX_BLRCLK, 0x01010101);
 		hw_write_20kx(hw, AUDIO_IO_RX_BLRCLK, 0);
 	} else if (2 == info->msr) {
-		hw_write_20kx(hw, AUDIO_IO_MCLK, 0x11111111);
+		if (hw->model != CTSB1270) {
+			hw_write_20kx(hw, AUDIO_IO_MCLK, 0x11111111);
+		} else {
+			/* PCM4220 on Titanium HD is different. */
+			hw_write_20kx(hw, AUDIO_IO_MCLK, 0x11011111);
+		}
 		/* Specify all playing 96khz
 		 * EA [0]	- Enabled
 		 * RTA [4:5]	- 96kHz
@@ -1175,6 +1182,10 @@ static int hw_daio_init(struct hw *hw, const struct daio_conf *info)
 		 * RTD [28:29]	- 96kHz */
 		hw_write_20kx(hw, AUDIO_IO_TX_BLRCLK, 0x11111111);
 		hw_write_20kx(hw, AUDIO_IO_RX_BLRCLK, 0);
+	} else if ((4 == info->msr) && (hw->model == CTSB1270)) {
+		hw_write_20kx(hw, AUDIO_IO_MCLK, 0x21011111);
+		hw_write_20kx(hw, AUDIO_IO_TX_BLRCLK, 0x21212121);
+		hw_write_20kx(hw, AUDIO_IO_RX_BLRCLK, 0);
 	} else {
 		printk(KERN_ALERT "ctxfi: ERROR!!! Invalid sampling rate!!!\n");
 		return -EINVAL;
@@ -1182,6 +1193,8 @@ static int hw_daio_init(struct hw *hw, const struct daio_conf *info)
 
 	for (i = 0; i < 8; i++) {
 		if (i <= 3) {
+			/* This comment looks wrong since loop is over 4  */
+			/* channels and emu20k2 supports 4 spdif IOs.     */
 			/* 1st 3 channels are SPDIFs (SB0960) */
 			if (i == 3)
 				data = 0x1001001;
@@ -1206,12 +1219,16 @@ static int hw_daio_init(struct hw *hw, const struct daio_conf *info)
 
 			hw_write_20kx(hw, AUDIO_IO_TX_CSTAT_H+(0x40*i), 0x0B);
 		} else {
+			/* Again, loop is over 4 channels not 5. */
 			/* Next 5 channels are I2S (SB0960) */
 			data = 0x11;
 			hw_write_20kx(hw, AUDIO_IO_RX_CTL+(0x40*i), data);
 			if (2 == info->msr) {
 				/* Four channels per sample period */
 				data |= 0x1000;
+			} else if (4 == info->msr) {
+				/* FIXME: check this against the chip spec */
+				data |= 0x2000;
 			}
 			hw_write_20kx(hw, AUDIO_IO_TX_CTL+(0x40*i), data);
 		}
@@ -1557,7 +1574,7 @@ static int hw20k2_i2c_write(struct hw *hw, u16 addr, u32 data)
 
 	hw_write_20kx(hw, I2C_IF_STATUS, i2c_status);
 	hw20k2_i2c_wait_data_ready(hw);
-	/* Dummy write to trigger the write oprtation */
+	/* Dummy write to trigger the write operation */
 	hw_write_20kx(hw, I2C_IF_WDATA, 0);
 	hw20k2_i2c_wait_data_ready(hw);
 
@@ -1568,6 +1585,30 @@ static int hw20k2_i2c_write(struct hw *hw, u16 addr, u32 data)
 	return 0;
 }
 
+static void hw_dac_stop(struct hw *hw)
+{
+	u32 data;
+	data = hw_read_20kx(hw, GPIO_DATA);
+	data &= 0xFFFFFFFD;
+	hw_write_20kx(hw, GPIO_DATA, data);
+	mdelay(10);
+}
+
+static void hw_dac_start(struct hw *hw)
+{
+	u32 data;
+	data = hw_read_20kx(hw, GPIO_DATA);
+	data |= 0x2;
+	hw_write_20kx(hw, GPIO_DATA, data);
+	mdelay(50);
+}
+
+static void hw_dac_reset(struct hw *hw)
+{
+	hw_dac_stop(hw);
+	hw_dac_start(hw);
+}
+
 static int hw_dac_init(struct hw *hw, const struct dac_conf *info)
 {
 	int err;
@@ -1594,6 +1635,21 @@ static int hw_dac_init(struct hw *hw, const struct dac_conf *info)
 				   0x00000000   /* Vol Control B4 */
 				 };
 
+	if (hw->model == CTSB1270) {
+		hw_dac_stop(hw);
+		data = hw_read_20kx(hw, GPIO_DATA);
+		data &= ~0x0600;
+		if (1 == info->msr)
+			data |= 0x0000; /* Single Speed Mode 0-50kHz */
+		else if (2 == info->msr)
+			data |= 0x0200; /* Double Speed Mode 50-100kHz */
+		else
+			data |= 0x0600; /* Quad Speed Mode 100-200kHz */
+		hw_write_20kx(hw, GPIO_DATA, data);
+		hw_dac_start(hw);
+		return 0;
+	}
+
 	/* Set DAC reset bit as output */
 	data = hw_read_20kx(hw, GPIO_CTRL);
 	data |= 0x02;
@@ -1606,22 +1662,8 @@ static int hw_dac_init(struct hw *hw, const struct dac_conf *info)
 	for (i = 0; i < 2; i++) {
 		/* Reset DAC twice just in-case the chip
 		 * didn't initialized properly */
-		data = hw_read_20kx(hw, GPIO_DATA);
-		/* GPIO data bit 1 */
-		data &= 0xFFFFFFFD;
-		hw_write_20kx(hw, GPIO_DATA, data);
-		mdelay(10);
-		data |= 0x2;
-		hw_write_20kx(hw, GPIO_DATA, data);
-		mdelay(50);
-
-		/* Reset the 2nd time */
-		data &= 0xFFFFFFFD;
-		hw_write_20kx(hw, GPIO_DATA, data);
-		mdelay(10);
-		data |= 0x2;
-		hw_write_20kx(hw, GPIO_DATA, data);
-		mdelay(50);
+		hw_dac_reset(hw);
+		hw_dac_reset(hw);
 
 		if (hw20k2_i2c_read(hw, CS4382_MC1,  &cs_read.mode_control_1))
 			continue;
@@ -1725,7 +1767,11 @@ End:
 static int hw_is_adc_input_selected(struct hw *hw, enum ADCSRC type)
 {
 	u32 data;
-
+	if (hw->model == CTSB1270) {
+		/* Titanium HD has two ADC chips, one for line in and one */
+		/* for MIC. We don't need to switch the ADC input. */
+		return 1;
+	}
 	data = hw_read_20kx(hw, GPIO_DATA);
 	switch (type) {
 	case ADC_MICIN:
@@ -1742,35 +1788,47 @@ static int hw_is_adc_input_selected(struct hw *hw, enum ADCSRC type)
 
 #define MIC_BOOST_0DB 0xCF
 #define MIC_BOOST_STEPS_PER_DB 2
-#define MIC_BOOST_20DB (MIC_BOOST_0DB + 20 * MIC_BOOST_STEPS_PER_DB)
+
+static void hw_wm8775_input_select(struct hw *hw, u8 input, s8 gain_in_db)
+{
+	u32 adcmc, gain;
+
+	if (input > 3)
+		input = 3;
+
+	adcmc = ((u32)1 << input) | 0x100; /* Link L+R gain... */
+
+	hw20k2_i2c_write(hw, MAKE_WM8775_ADDR(WM8775_ADCMC, adcmc),
+				MAKE_WM8775_DATA(adcmc));
+
+	if (gain_in_db < -103)
+		gain_in_db = -103;
+	if (gain_in_db > 24)
+		gain_in_db = 24;
+
+	gain = gain_in_db * MIC_BOOST_STEPS_PER_DB + MIC_BOOST_0DB;
+
+	hw20k2_i2c_write(hw, MAKE_WM8775_ADDR(WM8775_AADCL, gain),
+				MAKE_WM8775_DATA(gain));
+	/* ...so there should be no need for the following. */
+	hw20k2_i2c_write(hw, MAKE_WM8775_ADDR(WM8775_AADCR, gain),
+				MAKE_WM8775_DATA(gain));
+}
 
 static int hw_adc_input_select(struct hw *hw, enum ADCSRC type)
 {
 	u32 data;
-
 	data = hw_read_20kx(hw, GPIO_DATA);
 	switch (type) {
 	case ADC_MICIN:
 		data |= (0x1 << 14);
 		hw_write_20kx(hw, GPIO_DATA, data);
-		hw20k2_i2c_write(hw, MAKE_WM8775_ADDR(WM8775_ADCMC, 0x101),
-				MAKE_WM8775_DATA(0x101)); /* Mic-in */
-		hw20k2_i2c_write(hw,
-				MAKE_WM8775_ADDR(WM8775_AADCL, MIC_BOOST_20DB),
-				MAKE_WM8775_DATA(MIC_BOOST_20DB)); /* +20dB */
-		hw20k2_i2c_write(hw,
-				MAKE_WM8775_ADDR(WM8775_AADCR, MIC_BOOST_20DB),
-				MAKE_WM8775_DATA(MIC_BOOST_20DB)); /* +20dB */
+		hw_wm8775_input_select(hw, 0, 20); /* Mic, 20dB */
 		break;
 	case ADC_LINEIN:
 		data &= ~(0x1 << 14);
 		hw_write_20kx(hw, GPIO_DATA, data);
-		hw20k2_i2c_write(hw, MAKE_WM8775_ADDR(WM8775_ADCMC, 0x102),
-				MAKE_WM8775_DATA(0x102)); /* Line-in */
-		hw20k2_i2c_write(hw, MAKE_WM8775_ADDR(WM8775_AADCL, 0xCF),
-				MAKE_WM8775_DATA(0xCF)); /* No boost */
-		hw20k2_i2c_write(hw, MAKE_WM8775_ADDR(WM8775_AADCR, 0xCF),
-				MAKE_WM8775_DATA(0xCF)); /* No boost */
+		hw_wm8775_input_select(hw, 1, 0); /* Line-in, 0dB */
 		break;
 	default:
 		break;
@@ -1782,7 +1840,7 @@ static int hw_adc_input_select(struct hw *hw, enum ADCSRC type)
 static int hw_adc_init(struct hw *hw, const struct adc_conf *info)
 {
 	int err;
-	u32 mux = 2, data, ctl;
+	u32 data, ctl;
 
 	/*  Set ADC reset bit as output */
 	data = hw_read_20kx(hw, GPIO_CTRL);
@@ -1796,19 +1854,42 @@ static int hw_adc_init(struct hw *hw, const struct adc_conf *info)
 		goto error;
 	}
 
-	/* Make ADC in normal operation */
+	/* Reset the ADC (reset is active low). */
 	data = hw_read_20kx(hw, GPIO_DATA);
 	data &= ~(0x1 << 15);
+	hw_write_20kx(hw, GPIO_DATA, data);
+
+	if (hw->model == CTSB1270) {
+		/* Set up the PCM4220 ADC on Titanium HD */
+		data &= ~0x0C;
+		if (1 == info->msr)
+			data |= 0x00; /* Single Speed Mode 32-50kHz */
+		else if (2 == info->msr)
+			data |= 0x08; /* Double Speed Mode 50-108kHz */
+		else
+			data |= 0x04; /* Quad Speed Mode 108kHz-216kHz */
+		hw_write_20kx(hw, GPIO_DATA, data);
+	}
+
 	mdelay(10);
+	/* Return the ADC to normal operation. */
 	data |= (0x1 << 15);
 	hw_write_20kx(hw, GPIO_DATA, data);
 	mdelay(50);
 
+	/* I2C write to register offset 0x0B to set ADC LRCLK polarity */
+	/* invert bit, interface format to I2S, word length to 24-bit, */
+	/* enable ADC high pass filter. Fixes bug 5323?		*/
+	hw20k2_i2c_write(hw, MAKE_WM8775_ADDR(WM8775_IC, 0x26),
+			 MAKE_WM8775_DATA(0x26));
+
 	/* Set the master mode (256fs) */
 	if (1 == info->msr) {
+		/* slave mode, 128x oversampling 256fs */
 		hw20k2_i2c_write(hw, MAKE_WM8775_ADDR(WM8775_MMC, 0x02),
 						MAKE_WM8775_DATA(0x02));
-	} else if (2 == info->msr) {
+	} else if ((2 == info->msr) || (4 == info->msr)) {
+		/* slave mode, 64x oversampling, 256fs */
 		hw20k2_i2c_write(hw, MAKE_WM8775_ADDR(WM8775_MMC, 0x0A),
 						MAKE_WM8775_DATA(0x0A));
 	} else {
@@ -1818,47 +1899,17 @@ static int hw_adc_init(struct hw *hw, const struct adc_conf *info)
 		goto error;
 	}
 
-	/* Configure GPIO bit 14 change to line-in/mic-in */
-	ctl = hw_read_20kx(hw, GPIO_CTRL);
-	ctl |= 0x1 << 14;
-	hw_write_20kx(hw, GPIO_CTRL, ctl);
-
-	/* Check using Mic-in or Line-in */
-	data = hw_read_20kx(hw, GPIO_DATA);
-
-	if (mux == 1) {
-		/* Configures GPIO data to select Mic-in */
-		data |= 0x1 << 14;
-		hw_write_20kx(hw, GPIO_DATA, data);
-
-		hw20k2_i2c_write(hw, MAKE_WM8775_ADDR(WM8775_ADCMC, 0x101),
-				MAKE_WM8775_DATA(0x101)); /* Mic-in */
-		hw20k2_i2c_write(hw,
-				MAKE_WM8775_ADDR(WM8775_AADCL, MIC_BOOST_20DB),
-				MAKE_WM8775_DATA(MIC_BOOST_20DB)); /* +20dB */
-		hw20k2_i2c_write(hw,
-				MAKE_WM8775_ADDR(WM8775_AADCR, MIC_BOOST_20DB),
-				MAKE_WM8775_DATA(MIC_BOOST_20DB)); /* +20dB */
-	} else if (mux == 2) {
-		/* Configures GPIO data to select Line-in */
-		data &= ~(0x1 << 14);
-		hw_write_20kx(hw, GPIO_DATA, data);
-
-		/* Setup ADC */
-		hw20k2_i2c_write(hw, MAKE_WM8775_ADDR(WM8775_ADCMC, 0x102),
-				MAKE_WM8775_DATA(0x102)); /* Line-in */
-		hw20k2_i2c_write(hw, MAKE_WM8775_ADDR(WM8775_AADCL, 0xCF),
-				MAKE_WM8775_DATA(0xCF)); /* No boost */
-		hw20k2_i2c_write(hw, MAKE_WM8775_ADDR(WM8775_AADCR, 0xCF),
-				MAKE_WM8775_DATA(0xCF)); /* No boost */
+	if (hw->model != CTSB1270) {
+		/* Configure GPIO bit 14 change to line-in/mic-in */
+		ctl = hw_read_20kx(hw, GPIO_CTRL);
+		ctl |= 0x1 << 14;
+		hw_write_20kx(hw, GPIO_CTRL, ctl);
+		hw_adc_input_select(hw, ADC_LINEIN);
 	} else {
-		printk(KERN_ALERT "ctxfi: ERROR!!! Invalid input mux!!!\n");
-		err = -EINVAL;
-		goto error;
+		hw_wm8775_input_select(hw, 0, 0);
 	}
 
 	return 0;
-
 error:
 	hw20k2_i2c_uninit(hw);
 	return err;
@@ -1869,6 +1920,102 @@ static int hw_have_digit_io_switch(struct hw *hw)
 	return 0;
 }
 
+static int hw_have_dedicated_mic(struct hw *hw)
+{
+	return hw->model == CTSB1270;
+}
+
+static int hw_have_output_switch(struct hw *hw)
+{
+	return hw->model == CTSB1270;
+}
+
+static int hw_output_switch_get(struct hw *hw)
+{
+	u32 data = hw_read_20kx(hw, GPIO_EXT_DATA);
+
+	switch (data & 0x30) {
+	case 0x00:
+	     return 0;
+	case 0x10:
+	     return 1;
+	case 0x20:
+	     return 2;
+	default:
+	     return 3;
+	}
+}
+
+static int hw_output_switch_put(struct hw *hw, int position)
+{
+	u32 data;
+
+	if (position == hw_output_switch_get(hw))
+		return 0;
+
+	/* Mute line and headphones (intended for anti-pop). */
+	data = hw_read_20kx(hw, GPIO_DATA);
+	data |= (0x03 << 11);
+	hw_write_20kx(hw, GPIO_DATA, data);
+
+	data = hw_read_20kx(hw, GPIO_EXT_DATA) & ~0x30;
+	switch (position) {
+	case 0:
+		break;
+	case 1:
+		data |= 0x10;
+		break;
+	default:
+		data |= 0x20;
+	}
+	hw_write_20kx(hw, GPIO_EXT_DATA, data);
+
+	/* Unmute line and headphones. */
+	data = hw_read_20kx(hw, GPIO_DATA);
+	data &= ~(0x03 << 11);
+	hw_write_20kx(hw, GPIO_DATA, data);
+
+	return 1;
+}
+
+static int hw_have_mic_source_switch(struct hw *hw)
+{
+	return hw->model == CTSB1270;
+}
+
+static int hw_mic_source_switch_get(struct hw *hw)
+{
+	struct hw20k2 *hw20k2 = (struct hw20k2 *)hw;
+
+	return hw20k2->mic_source;
+}
+
+static int hw_mic_source_switch_put(struct hw *hw, int position)
+{
+	struct hw20k2 *hw20k2 = (struct hw20k2 *)hw;
+
+	if (position == hw20k2->mic_source)
+		return 0;
+
+	switch (position) {
+	case 0:
+		hw_wm8775_input_select(hw, 0, 0); /* Mic, 0dB */
+		break;
+	case 1:
+		hw_wm8775_input_select(hw, 1, 0); /* FP Mic, 0dB */
+		break;
+	case 2:
+		hw_wm8775_input_select(hw, 3, 0); /* Aux Ext, 0dB */
+		break;
+	default:
+		return 0;
+	}
+
+	hw20k2->mic_source = position;
+
+	return 1;
+}
+
 static irqreturn_t ct_20k2_interrupt(int irq, void *dev_id)
 {
 	struct hw *hw = dev_id;
@@ -2023,13 +2170,16 @@ static int hw_card_init(struct hw *hw, struct card_conf *info)
 	/* Reset all SRC pending interrupts */
 	hw_write_20kx(hw, SRC_IP, 0);
 
-	/* TODO: detect the card ID and configure GPIO accordingly. */
-	/* Configures GPIO (0xD802 0x98028) */
-	/*hw_write_20kx(hw, GPIO_CTRL, 0x7F07);*/
-	/* Configures GPIO (SB0880) */
-	/*hw_write_20kx(hw, GPIO_CTRL, 0xFF07);*/
-	hw_write_20kx(hw, GPIO_CTRL, 0xD802);
-
+	if (hw->model != CTSB1270) {
+		/* TODO: detect the card ID and configure GPIO accordingly. */
+		/* Configures GPIO (0xD802 0x98028) */
+		/*hw_write_20kx(hw, GPIO_CTRL, 0x7F07);*/
+		/* Configures GPIO (SB0880) */
+		/*hw_write_20kx(hw, GPIO_CTRL, 0xFF07);*/
+		hw_write_20kx(hw, GPIO_CTRL, 0xD802);
+	} else {
+		hw_write_20kx(hw, GPIO_CTRL, 0x9E5F);
+	}
 	/* Enable audio ring */
 	hw_write_20kx(hw, MIXER_AR_ENABLE, 0x01);
 
@@ -2107,6 +2257,13 @@ static struct hw ct20k2_preset __devinitdata = {
 	.is_adc_source_selected = hw_is_adc_input_selected,
 	.select_adc_source = hw_adc_input_select,
 	.have_digit_io_switch = hw_have_digit_io_switch,
+	.have_dedicated_mic = hw_have_dedicated_mic,
+	.have_output_switch = hw_have_output_switch,
+	.output_switch_get = hw_output_switch_get,
+	.output_switch_put = hw_output_switch_put,
+	.have_mic_source_switch = hw_have_mic_source_switch,
+	.mic_source_switch_get = hw_mic_source_switch_get,
+	.mic_source_switch_put = hw_mic_source_switch_put,
 #ifdef CONFIG_PM
 	.suspend = hw_suspend,
 	.resume = hw_resume,
diff --git a/sound/pci/ctxfi/ctmixer.c b/sound/pci/ctxfi/ctmixer.c
index c3519ff42fbb..388235c43789 100644
--- a/sound/pci/ctxfi/ctmixer.c
+++ b/sound/pci/ctxfi/ctmixer.c
@@ -86,9 +86,7 @@ enum CTALSA_MIXER_CTL {
 	MIXER_LINEIN_C_S,
 	MIXER_MIC_C_S,
 	MIXER_SPDIFI_C_S,
-	MIXER_LINEIN_P_S,
 	MIXER_SPDIFO_P_S,
-	MIXER_SPDIFI_P_S,
 	MIXER_WAVEF_P_S,
 	MIXER_WAVER_P_S,
 	MIXER_WAVEC_P_S,
@@ -137,11 +135,11 @@ ct_kcontrol_init_table[NUM_CTALSA_MIXERS] = {
 	},
 	[MIXER_LINEIN_P] = {
 		.ctl = 1,
-		.name = "Line-in Playback Volume",
+		.name = "Line Playback Volume",
 	},
 	[MIXER_LINEIN_C] = {
 		.ctl = 1,
-		.name = "Line-in Capture Volume",
+		.name = "Line Capture Volume",
 	},
 	[MIXER_MIC_P] = {
 		.ctl = 1,
@@ -153,15 +151,15 @@ ct_kcontrol_init_table[NUM_CTALSA_MIXERS] = {
 	},
 	[MIXER_SPDIFI_P] = {
 		.ctl = 1,
-		.name = "S/PDIF-in Playback Volume",
+		.name = "IEC958 Playback Volume",
 	},
 	[MIXER_SPDIFI_C] = {
 		.ctl = 1,
-		.name = "S/PDIF-in Capture Volume",
+		.name = "IEC958 Capture Volume",
 	},
 	[MIXER_SPDIFO_P] = {
 		.ctl = 1,
-		.name = "S/PDIF-out Playback Volume",
+		.name = "Digital Playback Volume",
 	},
 	[MIXER_WAVEF_P] = {
 		.ctl = 1,
@@ -179,14 +177,13 @@ ct_kcontrol_init_table[NUM_CTALSA_MIXERS] = {
 		.ctl = 1,
 		.name = "Surround Playback Volume",
 	},
-
 	[MIXER_PCM_C_S] = {
 		.ctl = 1,
 		.name = "PCM Capture Switch",
 	},
 	[MIXER_LINEIN_C_S] = {
 		.ctl = 1,
-		.name = "Line-in Capture Switch",
+		.name = "Line Capture Switch",
 	},
 	[MIXER_MIC_C_S] = {
 		.ctl = 1,
@@ -194,19 +191,11 @@ ct_kcontrol_init_table[NUM_CTALSA_MIXERS] = {
 	},
 	[MIXER_SPDIFI_C_S] = {
 		.ctl = 1,
-		.name = "S/PDIF-in Capture Switch",
-	},
-	[MIXER_LINEIN_P_S] = {
-		.ctl = 1,
-		.name = "Line-in Playback Switch",
+		.name = "IEC958 Capture Switch",
 	},
 	[MIXER_SPDIFO_P_S] = {
 		.ctl = 1,
-		.name = "S/PDIF-out Playback Switch",
-	},
-	[MIXER_SPDIFI_P_S] = {
-		.ctl = 1,
-		.name = "S/PDIF-in Playback Switch",
+		.name = "Digital Playback Switch",
 	},
 	[MIXER_WAVEF_P_S] = {
 		.ctl = 1,
@@ -236,6 +225,8 @@ ct_mixer_recording_select(struct ct_mixer *mixer, enum CT_AMIXER_CTL type);
 static void
 ct_mixer_recording_unselect(struct ct_mixer *mixer, enum CT_AMIXER_CTL type);
 
+/* FIXME: this static looks like it would fail if more than one card was */
+/* installed. */
 static struct snd_kcontrol *kctls[2] = {NULL};
 
 static enum CT_AMIXER_CTL get_amixer_index(enum CTALSA_MIXER_CTL alsa_index)
@@ -420,6 +411,77 @@ static struct snd_kcontrol_new vol_ctl = {
 	.tlv		= { .p =  ct_vol_db_scale },
 };
 
+static int output_switch_info(struct snd_kcontrol *kcontrol,
+			      struct snd_ctl_elem_info *info)
+{
+	static const char *const names[3] = {
+	  "FP Headphones", "Headphones", "Speakers"
+	};
+
+	return snd_ctl_enum_info(info, 1, 3, names);
+}
+
+static int output_switch_get(struct snd_kcontrol *kcontrol,
+			     struct snd_ctl_elem_value *ucontrol)
+{
+	struct ct_atc *atc = snd_kcontrol_chip(kcontrol);
+	ucontrol->value.enumerated.item[0] = atc->output_switch_get(atc);
+	return 0;
+}
+
+static int output_switch_put(struct snd_kcontrol *kcontrol,
+			     struct snd_ctl_elem_value *ucontrol)
+{
+	struct ct_atc *atc = snd_kcontrol_chip(kcontrol);
+	if (ucontrol->value.enumerated.item[0] > 2)
+		return -EINVAL;
+	return atc->output_switch_put(atc, ucontrol->value.enumerated.item[0]);
+}
+
+static struct snd_kcontrol_new output_ctl = {
+	.iface = SNDRV_CTL_ELEM_IFACE_MIXER,
+	.name = "Analog Output Playback Enum",
+	.info = output_switch_info,
+	.get = output_switch_get,
+	.put = output_switch_put,
+};
+
+static int mic_source_switch_info(struct snd_kcontrol *kcontrol,
+			      struct snd_ctl_elem_info *info)
+{
+	static const char *const names[3] = {
+	  "Mic", "FP Mic", "Aux"
+	};
+
+	return snd_ctl_enum_info(info, 1, 3, names);
+}
+
+static int mic_source_switch_get(struct snd_kcontrol *kcontrol,
+			     struct snd_ctl_elem_value *ucontrol)
+{
+	struct ct_atc *atc = snd_kcontrol_chip(kcontrol);
+	ucontrol->value.enumerated.item[0] = atc->mic_source_switch_get(atc);
+	return 0;
+}
+
+static int mic_source_switch_put(struct snd_kcontrol *kcontrol,
+			     struct snd_ctl_elem_value *ucontrol)
+{
+	struct ct_atc *atc = snd_kcontrol_chip(kcontrol);
+	if (ucontrol->value.enumerated.item[0] > 2)
+		return -EINVAL;
+	return atc->mic_source_switch_put(atc,
+					ucontrol->value.enumerated.item[0]);
+}
+
+static struct snd_kcontrol_new mic_source_ctl = {
+	.iface = SNDRV_CTL_ELEM_IFACE_MIXER,
+	.name = "Mic Source Capture Enum",
+	.info = mic_source_switch_info,
+	.get = mic_source_switch_get,
+	.put = mic_source_switch_put,
+};
+
 static void
 do_line_mic_switch(struct ct_atc *atc, enum CTALSA_MIXER_CTL type)
 {
@@ -465,6 +527,7 @@ do_digit_io_switch(struct ct_atc *atc, int state)
 static void do_switch(struct ct_atc *atc, enum CTALSA_MIXER_CTL type, int state)
 {
 	struct ct_mixer *mixer = atc->mixer;
+	int have_dedicated_mic = atc->have_dedicated_mic(atc);
 
 	/* Do changes in mixer. */
 	if ((SWH_CAPTURE_START <= type) && (SWH_CAPTURE_END >= type)) {
@@ -477,8 +540,17 @@ static void do_switch(struct ct_atc *atc, enum CTALSA_MIXER_CTL type, int state)
 		}
 	}
 	/* Do changes out of mixer. */
-	if (state && (MIXER_LINEIN_C_S == type || MIXER_MIC_C_S == type))
-		do_line_mic_switch(atc, type);
+	if (!have_dedicated_mic &&
+	    (MIXER_LINEIN_C_S == type || MIXER_MIC_C_S == type)) {
+		if (state)
+			do_line_mic_switch(atc, type);
+		atc->line_in_unmute(atc, state);
+	} else if (have_dedicated_mic && (MIXER_LINEIN_C_S == type))
+		atc->line_in_unmute(atc, state);
+	else if (have_dedicated_mic && (MIXER_MIC_C_S == type))
+		atc->mic_unmute(atc, state);
+	else if (MIXER_SPDIFI_C_S == type)
+		atc->spdif_in_unmute(atc, state);
 	else if (MIXER_WAVEF_P_S == type)
 		atc->line_front_unmute(atc, state);
 	else if (MIXER_WAVES_P_S == type)
@@ -487,12 +559,8 @@ static void do_switch(struct ct_atc *atc, enum CTALSA_MIXER_CTL type, int state)
 		atc->line_clfe_unmute(atc, state);
 	else if (MIXER_WAVER_P_S == type)
 		atc->line_rear_unmute(atc, state);
-	else if (MIXER_LINEIN_P_S == type)
-		atc->line_in_unmute(atc, state);
 	else if (MIXER_SPDIFO_P_S == type)
 		atc->spdif_out_unmute(atc, state);
-	else if (MIXER_SPDIFI_P_S == type)
-		atc->spdif_in_unmute(atc, state);
 	else if (MIXER_DIGITAL_IO_S == type)
 		do_digit_io_switch(atc, state);
 
@@ -686,6 +754,7 @@ static int ct_mixer_kcontrols_create(struct ct_mixer *mixer)
 
 	ct_kcontrol_init_table[MIXER_DIGITAL_IO_S].ctl =
 					atc->have_digit_io_switch(atc);
+
 	for (type = SWH_MIXER_START; type <= SWH_MIXER_END; type++) {
 		if (ct_kcontrol_init_table[type].ctl) {
 			swh_ctl.name = ct_kcontrol_init_table[type].name;
@@ -708,6 +777,17 @@ static int ct_mixer_kcontrols_create(struct ct_mixer *mixer)
 	if (err)
 		return err;
 
+	if (atc->have_output_switch(atc)) {
+		err = ct_mixer_kcontrol_new(mixer, &output_ctl);
+		if (err)
+			return err;
+	}
+
+	if (atc->have_mic_source_switch(atc)) {
+		err = ct_mixer_kcontrol_new(mixer, &mic_source_ctl);
+		if (err)
+			return err;
+	}
 	atc->line_front_unmute(atc, 1);
 	set_switch_state(mixer, MIXER_WAVEF_P_S, 1);
 	atc->line_surround_unmute(atc, 0);
@@ -719,13 +799,12 @@ static int ct_mixer_kcontrols_create(struct ct_mixer *mixer)
 	atc->spdif_out_unmute(atc, 0);
 	set_switch_state(mixer, MIXER_SPDIFO_P_S, 0);
 	atc->line_in_unmute(atc, 0);
-	set_switch_state(mixer, MIXER_LINEIN_P_S, 0);
+	if (atc->have_dedicated_mic(atc))
+		atc->mic_unmute(atc, 0);
 	atc->spdif_in_unmute(atc, 0);
-	set_switch_state(mixer, MIXER_SPDIFI_P_S, 0);
-
-	set_switch_state(mixer, MIXER_PCM_C_S, 1);
-	set_switch_state(mixer, MIXER_LINEIN_C_S, 1);
-	set_switch_state(mixer, MIXER_SPDIFI_C_S, 1);
+	set_switch_state(mixer, MIXER_PCM_C_S, 0);
+	set_switch_state(mixer, MIXER_LINEIN_C_S, 0);
+	set_switch_state(mixer, MIXER_SPDIFI_C_S, 0);
 
 	return 0;
 }
diff --git a/sound/pci/ctxfi/xfi.c b/sound/pci/ctxfi/xfi.c
index f42e7e1a1074..7afa765afa3c 100644
--- a/sound/pci/ctxfi/xfi.c
+++ b/sound/pci/ctxfi/xfi.c
@@ -80,11 +80,11 @@ ct_card_probe(struct pci_dev *pci, const struct pci_device_id *pci_id)
 		       "are 48000 and 44100, Value 48000 is assumed.\n");
 		reference_rate = 48000;
 	}
-	if ((multiple != 1) && (multiple != 2)) {
+	if ((multiple != 1) && (multiple != 2) && (multiple != 4)) {
 		printk(KERN_ERR "ctxfi: Invalid multiple value %u!!!\n",
 		       multiple);
 		printk(KERN_ERR "ctxfi: The valid values for multiple are "
-		       "1 and 2, Value 2 is assumed.\n");
+		       "1, 2 and 4, Value 2 is assumed.\n");
 		multiple = 2;
 	}
 	err = ct_atc_create(card, pci, reference_rate, multiple,
-- 
cgit v1.2.3


From 74315cccd2104a953f493acca2c6b0519d6f5c6f Mon Sep 17 00:00:00 2001
From: Laura Abbott <lauraa@codeaurora.org>
Date: Wed, 8 Jun 2011 17:29:11 -0400
Subject: iommu-api: Add missing header file

If CONFIG_IOMMU_API is not defined some functions will just
return -ENODEV. Add errno.h for the definition of ENODEV.

Signed-off-by: Laura Abbott <lauraa@codeaurora.org>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 include/linux/iommu.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 0a2ba4098996..9940319d6f9d 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -19,6 +19,8 @@
 #ifndef __LINUX_IOMMU_H
 #define __LINUX_IOMMU_H
 
+#include <linux/errno.h>
+
 #define IOMMU_READ	(1)
 #define IOMMU_WRITE	(2)
 #define IOMMU_CACHE	(4) /* DMA cache coherency */
-- 
cgit v1.2.3


From 7ea5906405a1f3fc1c0033dfd7e02f2cfd1de5e5 Mon Sep 17 00:00:00 2001
From: Vaibhav Nagarnaik <vnagarnaik@google.com>
Date: Tue, 3 May 2011 17:56:42 -0700
Subject: tracing: Use NUMA allocation for per-cpu ring buffer pages

The tracing ring buffer is a group of per-cpu ring buffers where
allocation and logging is done on a per-cpu basis. The events that are
generated on a particular CPU are logged in the corresponding buffer.
This is to provide wait-free writes between CPUs and good NUMA node
locality while accessing the ring buffer.

However, the allocation routines consider NUMA locality only for buffer
page metadata and not for the actual buffer page. This causes the pages
to be allocated on the NUMA node local to the CPU where the allocation
routine is running at the time.

This patch fixes the problem by using a NUMA node specific allocation
routine so that the pages are allocated from a NUMA node local to the
logging CPU.

I tested with the getuid_microbench from autotest. It is a simple binary
that calls getuid() in a loop and measures the average time for the
syscall to complete. The following command was used to test:
$ getuid_microbench 1000000

Compared the numbers found on kernel with and without this patch and
found that logging latency decreases by 30-50 ns/call.
tracing with non-NUMA allocation - 569 ns/call
tracing with NUMA allocation     - 512 ns/call

Signed-off-by: Vaibhav Nagarnaik <vnagarnaik@google.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Michael Rubin <mrubin@google.com>
Cc: David Sharp <dhsharp@google.com>
Link: http://lkml.kernel.org/r/1304470602-20366-1-git-send-email-vnagarnaik@google.com
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ring_buffer.h          |  2 +-
 kernel/trace/ring_buffer.c           | 36 +++++++++++++++++++-----------------
 kernel/trace/ring_buffer_benchmark.c |  2 +-
 kernel/trace/trace.c                 |  7 +++----
 4 files changed, 24 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index ab38ac80b0f9..b891de96000f 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -169,7 +169,7 @@ void ring_buffer_set_clock(struct ring_buffer *buffer,
 size_t ring_buffer_page_len(void *page);
 
 
-void *ring_buffer_alloc_read_page(struct ring_buffer *buffer);
+void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu);
 void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data);
 int ring_buffer_read_page(struct ring_buffer *buffer, void **data_page,
 			  size_t len, int cpu, int full);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index b0c7aa407943..725153d6cf73 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -997,13 +997,14 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
 			     unsigned nr_pages)
 {
 	struct buffer_page *bpage, *tmp;
-	unsigned long addr;
 	LIST_HEAD(pages);
 	unsigned i;
 
 	WARN_ON(!nr_pages);
 
 	for (i = 0; i < nr_pages; i++) {
+		struct page *page;
+
 		bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
 				    GFP_KERNEL, cpu_to_node(cpu_buffer->cpu));
 		if (!bpage)
@@ -1013,10 +1014,11 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
 
 		list_add(&bpage->list, &pages);
 
-		addr = __get_free_page(GFP_KERNEL);
-		if (!addr)
+		page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu),
+					GFP_KERNEL, 0);
+		if (!page)
 			goto free_pages;
-		bpage->page = (void *)addr;
+		bpage->page = page_address(page);
 		rb_init_page(bpage->page);
 	}
 
@@ -1045,7 +1047,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 	struct buffer_page *bpage;
-	unsigned long addr;
+	struct page *page;
 	int ret;
 
 	cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()),
@@ -1067,10 +1069,10 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
 	rb_check_bpage(cpu_buffer, bpage);
 
 	cpu_buffer->reader_page = bpage;
-	addr = __get_free_page(GFP_KERNEL);
-	if (!addr)
+	page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0);
+	if (!page)
 		goto fail_free_reader;
-	bpage->page = (void *)addr;
+	bpage->page = page_address(page);
 	rb_init_page(bpage->page);
 
 	INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
@@ -1314,7 +1316,6 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
 	unsigned nr_pages, rm_pages, new_pages;
 	struct buffer_page *bpage, *tmp;
 	unsigned long buffer_size;
-	unsigned long addr;
 	LIST_HEAD(pages);
 	int i, cpu;
 
@@ -1375,16 +1376,17 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
 
 	for_each_buffer_cpu(buffer, cpu) {
 		for (i = 0; i < new_pages; i++) {
+			struct page *page;
 			bpage = kzalloc_node(ALIGN(sizeof(*bpage),
 						  cache_line_size()),
 					    GFP_KERNEL, cpu_to_node(cpu));
 			if (!bpage)
 				goto free_pages;
 			list_add(&bpage->list, &pages);
-			addr = __get_free_page(GFP_KERNEL);
-			if (!addr)
+			page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0);
+			if (!page)
 				goto free_pages;
-			bpage->page = (void *)addr;
+			bpage->page = page_address(page);
 			rb_init_page(bpage->page);
 		}
 	}
@@ -3730,16 +3732,16 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
  * Returns:
  *  The page allocated, or NULL on error.
  */
-void *ring_buffer_alloc_read_page(struct ring_buffer *buffer)
+void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu)
 {
 	struct buffer_data_page *bpage;
-	unsigned long addr;
+	struct page *page;
 
-	addr = __get_free_page(GFP_KERNEL);
-	if (!addr)
+	page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0);
+	if (!page)
 		return NULL;
 
-	bpage = (void *)addr;
+	bpage = page_address(page);
 
 	rb_init_page(bpage);
 
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 302f8a614635..a5457d577b98 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -106,7 +106,7 @@ static enum event_status read_page(int cpu)
 	int inc;
 	int i;
 
-	bpage = ring_buffer_alloc_read_page(buffer);
+	bpage = ring_buffer_alloc_read_page(buffer, cpu);
 	if (!bpage)
 		return EVENT_DROPPED;
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 71777c8fe36b..61fda6b6f1ab 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3697,7 +3697,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
 		return 0;
 
 	if (!info->spare)
-		info->spare = ring_buffer_alloc_read_page(info->tr->buffer);
+		info->spare = ring_buffer_alloc_read_page(info->tr->buffer, info->cpu);
 	if (!info->spare)
 		return -ENOMEM;
 
@@ -3854,7 +3854,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 
 		ref->ref = 1;
 		ref->buffer = info->tr->buffer;
-		ref->page = ring_buffer_alloc_read_page(ref->buffer);
+		ref->page = ring_buffer_alloc_read_page(ref->buffer, info->cpu);
 		if (!ref->page) {
 			kfree(ref);
 			break;
@@ -3863,8 +3863,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		r = ring_buffer_read_page(ref->buffer, &ref->page,
 					  len, info->cpu, 1);
 		if (r < 0) {
-			ring_buffer_free_read_page(ref->buffer,
-						   ref->page);
+			ring_buffer_free_read_page(ref->buffer, ref->page);
 			kfree(ref);
 			break;
 		}
-- 
cgit v1.2.3


From 395810627b6a43c8d0ec948884043946fa162308 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Wed, 8 Jun 2011 16:09:21 +0900
Subject: x86: Swap save_stack_trace_regs parameters

Swap the 1st and 2nd parameters of save_stack_trace_regs()
as same as the parameters of save_stack_trace_tsk().

Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: yrl.pp-manager.tt@hitachi.com
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Namhyung Kim <namhyung@gmail.com>
Link: http://lkml.kernel.org/r/20110608070921.17777.31103.stgit@fedora15
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/kernel/stacktrace.c  | 2 +-
 arch/x86/mm/kmemcheck/error.c | 2 +-
 include/linux/stacktrace.h    | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 55d9bc03f696..fdd0c6430e5a 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -66,7 +66,7 @@ void save_stack_trace(struct stack_trace *trace)
 }
 EXPORT_SYMBOL_GPL(save_stack_trace);
 
-void save_stack_trace_regs(struct stack_trace *trace, struct pt_regs *regs)
+void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
 {
 	dump_trace(current, regs, NULL, 0, &save_stack_ops, trace);
 	if (trace->nr_entries < trace->max_entries)
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
index 704a37cedddb..dab41876cdd5 100644
--- a/arch/x86/mm/kmemcheck/error.c
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -185,7 +185,7 @@ void kmemcheck_error_save(enum kmemcheck_shadow state,
 	e->trace.entries = e->trace_entries;
 	e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
 	e->trace.skip = 0;
-	save_stack_trace_regs(&e->trace, regs);
+	save_stack_trace_regs(regs, &e->trace);
 
 	/* Round address down to nearest 16 bytes */
 	shadow_copy = kmemcheck_shadow_lookup(address
diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h
index 25310f1d7f37..115b570e3bff 100644
--- a/include/linux/stacktrace.h
+++ b/include/linux/stacktrace.h
@@ -14,8 +14,8 @@ struct stack_trace {
 };
 
 extern void save_stack_trace(struct stack_trace *trace);
-extern void save_stack_trace_regs(struct stack_trace *trace,
-				  struct pt_regs *regs);
+extern void save_stack_trace_regs(struct pt_regs *regs,
+				  struct stack_trace *trace);
 extern void save_stack_trace_tsk(struct task_struct *tsk,
 				struct stack_trace *trace);
 
-- 
cgit v1.2.3


From 1fd8df2c3970c9e7e4e262354154ee39e58bdd7c Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Wed, 8 Jun 2011 16:09:34 +0900
Subject: tracing/kprobes: Fix kprobe-tracer to support stack trace

Fix to support kernel stack trace correctly on kprobe-tracer.
Since the execution path of kprobe-based dynamic events is different
from other tracepoint-based events, normal ftrace_trace_stack() doesn't
work correctly. To fix that, this introduces ftrace_trace_stack_regs()
which traces stack via pt_regs instead of current stack register.

e.g.

 # echo p schedule+4 > /sys/kernel/debug/tracing/kprobe_events
 # echo 1 > /sys/kernel/debug/tracing/options/stacktrace
 # echo 1 > /sys/kernel/debug/tracing/events/kprobes/enable
 # head -n 20 /sys/kernel/debug/tracing/trace
            bash-2968  [000] 10297.050245: p_schedule_4: (schedule+0x4/0x4ca)
            bash-2968  [000] 10297.050247: <stack trace>
 => schedule_timeout
 => n_tty_read
 => tty_read
 => vfs_read
 => sys_read
 => system_call_fastpath
     kworker/0:1-2940  [000] 10297.050265: p_schedule_4: (schedule+0x4/0x4ca)
     kworker/0:1-2940  [000] 10297.050266: <stack trace>
 => worker_thread
 => kthread
 => kernel_thread_helper
            sshd-1132  [000] 10297.050365: p_schedule_4: (schedule+0x4/0x4ca)
            sshd-1132  [000] 10297.050365: <stack trace>
 => sysret_careful

Note: Even with this fix, the first entry will be skipped
if the probe is put on the function entry area before
the frame pointer is set up (usually, that is 4 bytes
 (push %bp; mov %sp %bp) on x86), because stack unwinder
depends on the frame pointer.

Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: yrl.pp-manager.tt@hitachi.com
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Namhyung Kim <namhyung@gmail.com>
Link: http://lkml.kernel.org/r/20110608070934.17777.17116.stgit@fedora15
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace_event.h |  4 ++++
 kernel/trace/trace.c         | 34 +++++++++++++++++++++++++++++-----
 kernel/trace/trace.h         |  9 +++++++++
 kernel/trace/trace_kprobe.c  |  6 ++++--
 4 files changed, 46 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 59d3ef100eb9..b1e69eefc203 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -129,6 +129,10 @@ void trace_current_buffer_unlock_commit(struct ring_buffer *buffer,
 void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer,
 				       struct ring_buffer_event *event,
 					unsigned long flags, int pc);
+void trace_nowake_buffer_unlock_commit_regs(struct ring_buffer *buffer,
+					    struct ring_buffer_event *event,
+					    unsigned long flags, int pc,
+					    struct pt_regs *regs);
 void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
 					 struct ring_buffer_event *event);
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c977018e87c2..d9c16123f6e2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1193,6 +1193,18 @@ void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer,
 }
 EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit);
 
+void trace_nowake_buffer_unlock_commit_regs(struct ring_buffer *buffer,
+					    struct ring_buffer_event *event,
+					    unsigned long flags, int pc,
+					    struct pt_regs *regs)
+{
+	ring_buffer_unlock_commit(buffer, event);
+
+	ftrace_trace_stack_regs(buffer, flags, 0, pc, regs);
+	ftrace_trace_userstack(buffer, flags, pc);
+}
+EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit_regs);
+
 void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
 					 struct ring_buffer_event *event)
 {
@@ -1238,7 +1250,7 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data,
 #ifdef CONFIG_STACKTRACE
 static void __ftrace_trace_stack(struct ring_buffer *buffer,
 				 unsigned long flags,
-				 int skip, int pc)
+				 int skip, int pc, struct pt_regs *regs)
 {
 	struct ftrace_event_call *call = &event_kernel_stack;
 	struct ring_buffer_event *event;
@@ -1257,24 +1269,36 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
 	trace.skip		= skip;
 	trace.entries		= entry->caller;
 
-	save_stack_trace(&trace);
+	if (regs)
+		save_stack_trace_regs(regs, &trace);
+	else
+		save_stack_trace(&trace);
 	if (!filter_check_discard(call, entry, buffer, event))
 		ring_buffer_unlock_commit(buffer, event);
 }
 
+void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags,
+			     int skip, int pc, struct pt_regs *regs)
+{
+	if (!(trace_flags & TRACE_ITER_STACKTRACE))
+		return;
+
+	__ftrace_trace_stack(buffer, flags, skip, pc, regs);
+}
+
 void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
 			int skip, int pc)
 {
 	if (!(trace_flags & TRACE_ITER_STACKTRACE))
 		return;
 
-	__ftrace_trace_stack(buffer, flags, skip, pc);
+	__ftrace_trace_stack(buffer, flags, skip, pc, NULL);
 }
 
 void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
 		   int pc)
 {
-	__ftrace_trace_stack(tr->buffer, flags, skip, pc);
+	__ftrace_trace_stack(tr->buffer, flags, skip, pc, NULL);
 }
 
 /**
@@ -1290,7 +1314,7 @@ void trace_dump_stack(void)
 	local_save_flags(flags);
 
 	/* skipping 3 traces, seems to get us at the caller of this function */
-	__ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count());
+	__ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count(), NULL);
 }
 
 static DEFINE_PER_CPU(int, user_stack_count);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 742f545ae185..a3e2db708072 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -389,6 +389,9 @@ void update_max_tr_single(struct trace_array *tr,
 void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
 			int skip, int pc);
 
+void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags,
+			     int skip, int pc, struct pt_regs *regs);
+
 void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags,
 			    int pc);
 
@@ -400,6 +403,12 @@ static inline void ftrace_trace_stack(struct ring_buffer *buffer,
 {
 }
 
+static inline void ftrace_trace_stack_regs(struct ring_buffer *buffer,
+					   unsigned long flags, int skip,
+					   int pc, struct pt_regs *regs)
+{
+}
+
 static inline void ftrace_trace_userstack(struct ring_buffer *buffer,
 					  unsigned long flags, int pc)
 {
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index f925c45f0afa..7053ef3d73d2 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1397,7 +1397,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
 	store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
 
 	if (!filter_current_check_discard(buffer, call, entry, event))
-		trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
+		trace_nowake_buffer_unlock_commit_regs(buffer, event,
+						       irq_flags, pc, regs);
 }
 
 /* Kretprobe handler */
@@ -1429,7 +1430,8 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
 	store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
 
 	if (!filter_current_check_discard(buffer, call, entry, event))
-		trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
+		trace_nowake_buffer_unlock_commit_regs(buffer, event,
+						       irq_flags, pc, regs);
 }
 
 /* Event entry printers */
-- 
cgit v1.2.3


From 221c56373ee7088dd3015b928782d5e70dc5074e Mon Sep 17 00:00:00 2001
From: Matt Carlson <mcarlson@broadcom.com>
Date: Mon, 13 Jun 2011 13:39:01 +0000
Subject: tg3: Migrate phy preprocessor defs to system defs

This patch changes to code to use some of the preprocessor
definitions from mii.h over its homegrown equivalents.

Signed-off-by: Matt Carlson <mcarlson@broadcom.com>
Reviewed-by: Michael Chan <mchan@broadcom.com>
Reviewed-by: Benjamin Li <benli@broadcom.com>
Signed-off-by: David S. Miller <davem@conan.davemloft.net>
---
 drivers/net/tg3.c   | 26 ++++++++++++--------------
 drivers/net/tg3.h   |  8 --------
 include/linux/mii.h |  2 ++
 3 files changed, 14 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index 7b1e2abf0cea..da5e9105b9f4 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -861,7 +861,7 @@ static int tg3_writephy(struct tg3 *tp, int reg, u32 val)
 	int ret;
 
 	if ((tp->phy_flags & TG3_PHYFLG_IS_FET) &&
-	    (reg == MII_TG3_CTRL || reg == MII_TG3_AUX_CTRL))
+	    (reg == MII_CTRL1000 || reg == MII_TG3_AUX_CTRL))
 		return 0;
 
 	if ((tp->mi_mode & MAC_MI_MODE_AUTO_POLL) != 0) {
@@ -1981,15 +1981,14 @@ static int tg3_phy_reset_5703_4_5(struct tg3 *tp)
 
 		/* Set full-duplex, 1000 mbps.  */
 		tg3_writephy(tp, MII_BMCR,
-			     BMCR_FULLDPLX | TG3_BMCR_SPEED1000);
+			     BMCR_FULLDPLX | BMCR_SPEED1000);
 
 		/* Set to master mode.  */
-		if (tg3_readphy(tp, MII_TG3_CTRL, &phy9_orig))
+		if (tg3_readphy(tp, MII_CTRL1000, &phy9_orig))
 			continue;
 
-		tg3_writephy(tp, MII_TG3_CTRL,
-			     (MII_TG3_CTRL_AS_MASTER |
-			      MII_TG3_CTRL_ENABLE_AS_MASTER));
+		tg3_writephy(tp, MII_CTRL1000,
+			     CTL1000_AS_MASTER | CTL1000_ENABLE_MASTER);
 
 		err = TG3_PHY_AUXCTL_SMDSP_ENABLE(tp);
 		if (err)
@@ -2014,7 +2013,7 @@ static int tg3_phy_reset_5703_4_5(struct tg3 *tp)
 
 	TG3_PHY_AUXCTL_SMDSP_DISABLE(tp);
 
-	tg3_writephy(tp, MII_TG3_CTRL, phy9_orig);
+	tg3_writephy(tp, MII_CTRL1000, phy9_orig);
 
 	if (!tg3_readphy(tp, MII_TG3_EXT_CTRL, &reg32)) {
 		reg32 &= ~0x3000;
@@ -2958,16 +2957,15 @@ static int tg3_phy_autoneg_cfg(struct tg3 *tp, u32 advertise, u32 flowctrl)
 
 	new_adv = 0;
 	if (advertise & ADVERTISED_1000baseT_Half)
-		new_adv |= MII_TG3_CTRL_ADV_1000_HALF;
+		new_adv |= ADVERTISE_1000HALF;
 	if (advertise & ADVERTISED_1000baseT_Full)
-		new_adv |= MII_TG3_CTRL_ADV_1000_FULL;
+		new_adv |= ADVERTISE_1000FULL;
 
 	if (tp->pci_chip_rev_id == CHIPREV_ID_5701_A0 ||
 	    tp->pci_chip_rev_id == CHIPREV_ID_5701_B0)
-		new_adv |= (MII_TG3_CTRL_AS_MASTER |
-			    MII_TG3_CTRL_ENABLE_AS_MASTER);
+		new_adv |= CTL1000_AS_MASTER | CTL1000_ENABLE_MASTER;
 
-	err = tg3_writephy(tp, MII_TG3_CTRL, new_adv);
+	err = tg3_writephy(tp, MII_CTRL1000, new_adv);
 	if (err)
 		goto done;
 
@@ -3076,7 +3074,7 @@ static void tg3_phy_copper_begin(struct tg3 *tp)
 			break;
 
 		case SPEED_1000:
-			bmcr |= TG3_BMCR_SPEED1000;
+			bmcr |= BMCR_SPEED1000;
 			break;
 		}
 
@@ -3153,7 +3151,7 @@ static int tg3_copper_is_advertising_all(struct tg3 *tp, u32 mask)
 		if (mask & ADVERTISED_1000baseT_Full)
 			all_mask |= ADVERTISE_1000FULL;
 
-		if (tg3_readphy(tp, MII_TG3_CTRL, &tg3_ctrl))
+		if (tg3_readphy(tp, MII_CTRL1000, &tg3_ctrl))
 			return 0;
 
 		if ((tg3_ctrl & all_mask) != all_mask)
diff --git a/drivers/net/tg3.h b/drivers/net/tg3.h
index 54441a346cb3..bedc3b4557b5 100644
--- a/drivers/net/tg3.h
+++ b/drivers/net/tg3.h
@@ -2152,14 +2152,6 @@
 
 
 /*** Tigon3 specific PHY MII registers. ***/
-#define  TG3_BMCR_SPEED1000		0x0040
-
-#define MII_TG3_CTRL			0x09 /* 1000-baseT control register */
-#define  MII_TG3_CTRL_ADV_1000_HALF	0x0100
-#define  MII_TG3_CTRL_ADV_1000_FULL	0x0200
-#define  MII_TG3_CTRL_AS_MASTER		0x0800
-#define  MII_TG3_CTRL_ENABLE_AS_MASTER	0x1000
-
 #define MII_TG3_MMD_CTRL		0x0d /* MMD Access Control register */
 #define MII_TG3_MMD_CTRL_DATA_NOINC	0x4000
 #define MII_TG3_MMD_ADDRESS		0x0e /* MMD Address Data register */
diff --git a/include/linux/mii.h b/include/linux/mii.h
index 359fba880274..103113a2fd18 100644
--- a/include/linux/mii.h
+++ b/include/linux/mii.h
@@ -128,6 +128,8 @@
 /* 1000BASE-T Control register */
 #define ADVERTISE_1000FULL      0x0200  /* Advertise 1000BASE-T full duplex */
 #define ADVERTISE_1000HALF      0x0100  /* Advertise 1000BASE-T half duplex */
+#define CTL1000_AS_MASTER	0x0800
+#define CTL1000_ENABLE_MASTER	0x1000
 
 /* 1000BASE-T Status register */
 #define LPA_1000LOCALRXOK       0x2000  /* Link partner local receiver status */
-- 
cgit v1.2.3


From 3192b920bf7d0c528ab54e7d3689f44055316a37 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux.com>
Date: Tue, 14 Jun 2011 16:16:36 -0500
Subject: slab, slub, slob: Unify alignment definition

Every slab has its on alignment definition in include/linux/sl?b_def.h. Extract those
and define a common set in include/linux/slab.h.

SLOB: As notes sometimes we need double word alignment on 32 bit. This gives all
structures allocated by SLOB a unsigned long long alignment like the others do.

SLAB: If ARCH_SLAB_MINALIGN is not set SLAB would set ARCH_SLAB_MINALIGN to
zero meaning no alignment at all. Give it the default unsigned long long alignment.

Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 include/linux/slab.h     | 10 ++++++++++
 include/linux/slab_def.h | 26 --------------------------
 include/linux/slob_def.h | 10 ----------
 include/linux/slub_def.h | 10 ----------
 4 files changed, 10 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index ad4dd1c8d30a..646a639a4aae 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -133,6 +133,16 @@ unsigned int kmem_cache_size(struct kmem_cache *);
 #define KMALLOC_MAX_SIZE	(1UL << KMALLOC_SHIFT_HIGH)
 #define KMALLOC_MAX_ORDER	(KMALLOC_SHIFT_HIGH - PAGE_SHIFT)
 
+#ifdef ARCH_DMA_MINALIGN
+#define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN
+#else
+#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
+#endif
+
+#ifndef ARCH_SLAB_MINALIGN
+#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
+#endif
+
 /*
  * Common kmalloc functions provided by all allocators
  */
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index 83203ae9390b..d7f63112f63c 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -17,32 +17,6 @@
 
 #include <trace/events/kmem.h>
 
-/*
- * Enforce a minimum alignment for the kmalloc caches.
- * Usually, the kmalloc caches are cache_line_size() aligned, except when
- * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned.
- * Some archs want to perform DMA into kmalloc caches and need a guaranteed
- * alignment larger than the alignment of a 64-bit integer.
- * ARCH_KMALLOC_MINALIGN allows that.
- * Note that increasing this value may disable some debug features.
- */
-#ifdef ARCH_DMA_MINALIGN
-#define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN
-#else
-#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
-#endif
-
-#ifndef ARCH_SLAB_MINALIGN
-/*
- * Enforce a minimum alignment for all caches.
- * Intended for archs that get misalignment faults even for BYTES_PER_WORD
- * aligned buffers. Includes ARCH_KMALLOC_MINALIGN.
- * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables
- * some debug features.
- */
-#define ARCH_SLAB_MINALIGN 0
-#endif
-
 /*
  * struct kmem_cache
  *
diff --git a/include/linux/slob_def.h b/include/linux/slob_def.h
index 4382db09df4f..0ec00b39d006 100644
--- a/include/linux/slob_def.h
+++ b/include/linux/slob_def.h
@@ -1,16 +1,6 @@
 #ifndef __LINUX_SLOB_DEF_H
 #define __LINUX_SLOB_DEF_H
 
-#ifdef ARCH_DMA_MINALIGN
-#define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN
-#else
-#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long)
-#endif
-
-#ifndef ARCH_SLAB_MINALIGN
-#define ARCH_SLAB_MINALIGN __alignof__(unsigned long)
-#endif
-
 void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node);
 
 static __always_inline void *kmem_cache_alloc(struct kmem_cache *cachep,
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index c8668d161dd8..fd4fdc72bc8c 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -113,16 +113,6 @@ struct kmem_cache {
 
 #define KMALLOC_SHIFT_LOW ilog2(KMALLOC_MIN_SIZE)
 
-#ifdef ARCH_DMA_MINALIGN
-#define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN
-#else
-#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
-#endif
-
-#ifndef ARCH_SLAB_MINALIGN
-#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
-#endif
-
 /*
  * Maximum kmalloc object size handled by SLUB. Larger object allocations
  * are passed through to the page allocator. The page allocator "fastpath"
-- 
cgit v1.2.3


From 5416219e5ca4504ea80d662fdda7337e52e86ee5 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Thu, 16 Jun 2011 18:40:55 +0200
Subject: netfilter: ipset: timeout can be modified for already added elements

When an element to a set with timeout added, one can change the timeout
by "readding" the element with the "-exist" flag. That means the timeout
value is reset to the specified one (or to the default from the set
specification if the "timeout n" option is not used). Example

ipset add foo 1.2.3.4 timeout 10
ipset add foo 1.2.3.4 timeout 600 -exist

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/ipset/ip_set.h       |  3 +-
 include/linux/netfilter/ipset/ip_set_ahash.h | 15 ++---
 net/netfilter/ipset/ip_set_bitmap_ip.c       | 20 +++---
 net/netfilter/ipset/ip_set_bitmap_ipmac.c    | 21 ++++---
 net/netfilter/ipset/ip_set_bitmap_port.c     | 20 +++---
 net/netfilter/ipset/ip_set_hash_ip.c         | 10 +--
 net/netfilter/ipset/ip_set_hash_ipport.c     | 12 ++--
 net/netfilter/ipset/ip_set_hash_ipportip.c   | 12 ++--
 net/netfilter/ipset/ip_set_hash_ipportnet.c  | 12 ++--
 net/netfilter/ipset/ip_set_hash_net.c        |  8 +--
 net/netfilter/ipset/ip_set_hash_netport.c    | 12 ++--
 net/netfilter/ipset/ip_set_list_set.c        | 92 +++++++++++++++++++---------
 12 files changed, 136 insertions(+), 101 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 5a262e3ae715..277b7fbc7fb2 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -214,7 +214,8 @@ enum ip_set_feature {
 
 struct ip_set;
 
-typedef int (*ipset_adtfn)(struct ip_set *set, void *value, u32 timeout);
+typedef int (*ipset_adtfn)(struct ip_set *set, void *value,
+			   u32 timeout, u32 flags);
 
 /* Set type, variant-specific part */
 struct ip_set_type_variant {
diff --git a/include/linux/netfilter/ipset/ip_set_ahash.h b/include/linux/netfilter/ipset/ip_set_ahash.h
index ac3c822eb39a..36cf4dc703bb 100644
--- a/include/linux/netfilter/ipset/ip_set_ahash.h
+++ b/include/linux/netfilter/ipset/ip_set_ahash.h
@@ -349,7 +349,7 @@ retry:
 /* Add an element to a hash and update the internal counters when succeeded,
  * otherwise report the proper error code. */
 static int
-type_pf_add(struct ip_set *set, void *value, u32 timeout)
+type_pf_add(struct ip_set *set, void *value, u32 timeout, u32 flags)
 {
 	struct ip_set_hash *h = set->data;
 	struct htable *t;
@@ -388,7 +388,7 @@ out:
  * and free up space if possible.
  */
 static int
-type_pf_del(struct ip_set *set, void *value, u32 timeout)
+type_pf_del(struct ip_set *set, void *value, u32 timeout, u32 flags)
 {
 	struct ip_set_hash *h = set->data;
 	struct htable *t = h->table;
@@ -463,7 +463,7 @@ type_pf_test_cidrs(struct ip_set *set, struct type_pf_elem *d, u32 timeout)
 
 /* Test whether the element is added to the set */
 static int
-type_pf_test(struct ip_set *set, void *value, u32 timeout)
+type_pf_test(struct ip_set *set, void *value, u32 timeout, u32 flags)
 {
 	struct ip_set_hash *h = set->data;
 	struct htable *t = h->table;
@@ -776,7 +776,7 @@ retry:
 }
 
 static int
-type_pf_tadd(struct ip_set *set, void *value, u32 timeout)
+type_pf_tadd(struct ip_set *set, void *value, u32 timeout, u32 flags)
 {
 	struct ip_set_hash *h = set->data;
 	struct htable *t = h->table;
@@ -784,6 +784,7 @@ type_pf_tadd(struct ip_set *set, void *value, u32 timeout)
 	struct hbucket *n;
 	struct type_pf_elem *data;
 	int ret = 0, i, j = AHASH_MAX_SIZE + 1;
+	bool flag_exist = flags & IPSET_FLAG_EXIST;
 	u32 key;
 
 	if (h->elements >= h->maxelem)
@@ -799,7 +800,7 @@ type_pf_tadd(struct ip_set *set, void *value, u32 timeout)
 	for (i = 0; i < n->pos; i++) {
 		data = ahash_tdata(n, i);
 		if (type_pf_data_equal(data, d)) {
-			if (type_pf_data_expired(data))
+			if (type_pf_data_expired(data) || flag_exist)
 				j = i;
 			else {
 				ret = -IPSET_ERR_EXIST;
@@ -833,7 +834,7 @@ out:
 }
 
 static int
-type_pf_tdel(struct ip_set *set, void *value, u32 timeout)
+type_pf_tdel(struct ip_set *set, void *value, u32 timeout, u32 flags)
 {
 	struct ip_set_hash *h = set->data;
 	struct htable *t = h->table;
@@ -905,7 +906,7 @@ type_pf_ttest_cidrs(struct ip_set *set, struct type_pf_elem *d, u32 timeout)
 #endif
 
 static int
-type_pf_ttest(struct ip_set *set, void *value, u32 timeout)
+type_pf_ttest(struct ip_set *set, void *value, u32 timeout, u32 flags)
 {
 	struct ip_set_hash *h = set->data;
 	struct htable *t = h->table;
diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c
index ba2d16607f48..85b1cdf0a4b8 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ip.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ip.c
@@ -54,7 +54,7 @@ ip_to_id(const struct bitmap_ip *m, u32 ip)
 }
 
 static int
-bitmap_ip_test(struct ip_set *set, void *value, u32 timeout)
+bitmap_ip_test(struct ip_set *set, void *value, u32 timeout, u32 flags)
 {
 	const struct bitmap_ip *map = set->data;
 	u16 id = *(u16 *)value;
@@ -63,7 +63,7 @@ bitmap_ip_test(struct ip_set *set, void *value, u32 timeout)
 }
 
 static int
-bitmap_ip_add(struct ip_set *set, void *value, u32 timeout)
+bitmap_ip_add(struct ip_set *set, void *value, u32 timeout, u32 flags)
 {
 	struct bitmap_ip *map = set->data;
 	u16 id = *(u16 *)value;
@@ -75,7 +75,7 @@ bitmap_ip_add(struct ip_set *set, void *value, u32 timeout)
 }
 
 static int
-bitmap_ip_del(struct ip_set *set, void *value, u32 timeout)
+bitmap_ip_del(struct ip_set *set, void *value, u32 timeout, u32 flags)
 {
 	struct bitmap_ip *map = set->data;
 	u16 id = *(u16 *)value;
@@ -131,7 +131,7 @@ nla_put_failure:
 /* Timeout variant */
 
 static int
-bitmap_ip_ttest(struct ip_set *set, void *value, u32 timeout)
+bitmap_ip_ttest(struct ip_set *set, void *value, u32 timeout, u32 flags)
 {
 	const struct bitmap_ip *map = set->data;
 	const unsigned long *members = map->members;
@@ -141,13 +141,13 @@ bitmap_ip_ttest(struct ip_set *set, void *value, u32 timeout)
 }
 
 static int
-bitmap_ip_tadd(struct ip_set *set, void *value, u32 timeout)
+bitmap_ip_tadd(struct ip_set *set, void *value, u32 timeout, u32 flags)
 {
 	struct bitmap_ip *map = set->data;
 	unsigned long *members = map->members;
 	u16 id = *(u16 *)value;
 
-	if (ip_set_timeout_test(members[id]))
+	if (ip_set_timeout_test(members[id]) && !(flags & IPSET_FLAG_EXIST))
 		return -IPSET_ERR_EXIST;
 
 	members[id] = ip_set_timeout_set(timeout);
@@ -156,7 +156,7 @@ bitmap_ip_tadd(struct ip_set *set, void *value, u32 timeout)
 }
 
 static int
-bitmap_ip_tdel(struct ip_set *set, void *value, u32 timeout)
+bitmap_ip_tdel(struct ip_set *set, void *value, u32 timeout, u32 flags)
 {
 	struct bitmap_ip *map = set->data;
 	unsigned long *members = map->members;
@@ -231,7 +231,7 @@ bitmap_ip_kadt(struct ip_set *set, const struct sk_buff *skb,
 
 	ip = ip_to_id(map, ip);
 
-	return adtfn(set, &ip, map->timeout);
+	return adtfn(set, &ip, map->timeout, flags);
 }
 
 static int
@@ -266,7 +266,7 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[],
 
 	if (adt == IPSET_TEST) {
 		id = ip_to_id(map, ip);
-		return adtfn(set, &id, timeout);
+		return adtfn(set, &id, timeout, flags);
 	}
 
 	if (tb[IPSET_ATTR_IP_TO]) {
@@ -293,7 +293,7 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[],
 
 	for (; !before(ip_to, ip); ip += map->hosts) {
 		id = ip_to_id(map, ip);
-		ret = adtfn(set, &id, timeout);
+		ret = adtfn(set, &id, timeout, flags);
 
 		if (ret && !ip_set_eexist(ret, flags))
 			return ret;
diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
index a274300b6a56..913a461382e4 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
@@ -99,7 +99,7 @@ bitmap_ipmac_exist(const struct ipmac_telem *elem)
 /* Base variant */
 
 static int
-bitmap_ipmac_test(struct ip_set *set, void *value, u32 timeout)
+bitmap_ipmac_test(struct ip_set *set, void *value, u32 timeout, u32 flags)
 {
 	const struct bitmap_ipmac *map = set->data;
 	const struct ipmac *data = value;
@@ -117,7 +117,7 @@ bitmap_ipmac_test(struct ip_set *set, void *value, u32 timeout)
 }
 
 static int
-bitmap_ipmac_add(struct ip_set *set, void *value, u32 timeout)
+bitmap_ipmac_add(struct ip_set *set, void *value, u32 timeout, u32 flags)
 {
 	struct bitmap_ipmac *map = set->data;
 	const struct ipmac *data = value;
@@ -146,7 +146,7 @@ bitmap_ipmac_add(struct ip_set *set, void *value, u32 timeout)
 }
 
 static int
-bitmap_ipmac_del(struct ip_set *set, void *value, u32 timeout)
+bitmap_ipmac_del(struct ip_set *set, void *value, u32 timeout, u32 flags)
 {
 	struct bitmap_ipmac *map = set->data;
 	const struct ipmac *data = value;
@@ -212,7 +212,7 @@ nla_put_failure:
 /* Timeout variant */
 
 static int
-bitmap_ipmac_ttest(struct ip_set *set, void *value, u32 timeout)
+bitmap_ipmac_ttest(struct ip_set *set, void *value, u32 timeout, u32 flags)
 {
 	const struct bitmap_ipmac *map = set->data;
 	const struct ipmac *data = value;
@@ -231,15 +231,16 @@ bitmap_ipmac_ttest(struct ip_set *set, void *value, u32 timeout)
 }
 
 static int
-bitmap_ipmac_tadd(struct ip_set *set, void *value, u32 timeout)
+bitmap_ipmac_tadd(struct ip_set *set, void *value, u32 timeout, u32 flags)
 {
 	struct bitmap_ipmac *map = set->data;
 	const struct ipmac *data = value;
 	struct ipmac_telem *elem = bitmap_ipmac_elem(map, data->id);
+	bool flag_exist = flags & IPSET_FLAG_EXIST;
 
 	switch (elem->match) {
 	case MAC_UNSET:
-		if (!data->ether)
+		if (!(data->ether || flag_exist))
 			/* Already added without ethernet address */
 			return -IPSET_ERR_EXIST;
 		/* Fill the MAC address and activate the timer */
@@ -251,7 +252,7 @@ bitmap_ipmac_tadd(struct ip_set *set, void *value, u32 timeout)
 		elem->timeout = ip_set_timeout_set(timeout);
 		break;
 	case MAC_FILLED:
-		if (!bitmap_expired(map, data->id))
+		if (!(bitmap_expired(map, data->id) || flag_exist))
 			return -IPSET_ERR_EXIST;
 		/* Fall through */
 	case MAC_EMPTY:
@@ -273,7 +274,7 @@ bitmap_ipmac_tadd(struct ip_set *set, void *value, u32 timeout)
 }
 
 static int
-bitmap_ipmac_tdel(struct ip_set *set, void *value, u32 timeout)
+bitmap_ipmac_tdel(struct ip_set *set, void *value, u32 timeout, u32 flags)
 {
 	struct bitmap_ipmac *map = set->data;
 	const struct ipmac *data = value;
@@ -359,7 +360,7 @@ bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb,
 	data.id -= map->first_ip;
 	data.ether = eth_hdr(skb)->h_source;
 
-	return adtfn(set, &data, map->timeout);
+	return adtfn(set, &data, map->timeout, flags);
 }
 
 static int
@@ -399,7 +400,7 @@ bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[],
 
 	data.id -= map->first_ip;
 
-	ret = adtfn(set, &data, timeout);
+	ret = adtfn(set, &data, timeout, flags);
 
 	return ip_set_eexist(ret, flags) ? 0 : ret;
 }
diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c
index 6b38eb8f6ed8..a3935eef76fc 100644
--- a/net/netfilter/ipset/ip_set_bitmap_port.c
+++ b/net/netfilter/ipset/ip_set_bitmap_port.c
@@ -40,7 +40,7 @@ struct bitmap_port {
 /* Base variant */
 
 static int
-bitmap_port_test(struct ip_set *set, void *value, u32 timeout)
+bitmap_port_test(struct ip_set *set, void *value, u32 timeout, u32 flags)
 {
 	const struct bitmap_port *map = set->data;
 	u16 id = *(u16 *)value;
@@ -49,7 +49,7 @@ bitmap_port_test(struct ip_set *set, void *value, u32 timeout)
 }
 
 static int
-bitmap_port_add(struct ip_set *set, void *value, u32 timeout)
+bitmap_port_add(struct ip_set *set, void *value, u32 timeout, u32 flags)
 {
 	struct bitmap_port *map = set->data;
 	u16 id = *(u16 *)value;
@@ -61,7 +61,7 @@ bitmap_port_add(struct ip_set *set, void *value, u32 timeout)
 }
 
 static int
-bitmap_port_del(struct ip_set *set, void *value, u32 timeout)
+bitmap_port_del(struct ip_set *set, void *value, u32 timeout, u32 flags)
 {
 	struct bitmap_port *map = set->data;
 	u16 id = *(u16 *)value;
@@ -119,7 +119,7 @@ nla_put_failure:
 /* Timeout variant */
 
 static int
-bitmap_port_ttest(struct ip_set *set, void *value, u32 timeout)
+bitmap_port_ttest(struct ip_set *set, void *value, u32 timeout, u32 flags)
 {
 	const struct bitmap_port *map = set->data;
 	const unsigned long *members = map->members;
@@ -129,13 +129,13 @@ bitmap_port_ttest(struct ip_set *set, void *value, u32 timeout)
 }
 
 static int
-bitmap_port_tadd(struct ip_set *set, void *value, u32 timeout)
+bitmap_port_tadd(struct ip_set *set, void *value, u32 timeout, u32 flags)
 {
 	struct bitmap_port *map = set->data;
 	unsigned long *members = map->members;
 	u16 id = *(u16 *)value;
 
-	if (ip_set_timeout_test(members[id]))
+	if (ip_set_timeout_test(members[id]) && !(flags & IPSET_FLAG_EXIST))
 		return -IPSET_ERR_EXIST;
 
 	members[id] = ip_set_timeout_set(timeout);
@@ -144,7 +144,7 @@ bitmap_port_tadd(struct ip_set *set, void *value, u32 timeout)
 }
 
 static int
-bitmap_port_tdel(struct ip_set *set, void *value, u32 timeout)
+bitmap_port_tdel(struct ip_set *set, void *value, u32 timeout, u32 flags)
 {
 	struct bitmap_port *map = set->data;
 	unsigned long *members = map->members;
@@ -225,7 +225,7 @@ bitmap_port_kadt(struct ip_set *set, const struct sk_buff *skb,
 
 	port -= map->first_port;
 
-	return adtfn(set, &port, map->timeout);
+	return adtfn(set, &port, map->timeout, flags);
 }
 
 static int
@@ -259,7 +259,7 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[],
 
 	if (adt == IPSET_TEST) {
 		id = port - map->first_port;
-		return adtfn(set, &id, timeout);
+		return adtfn(set, &id, timeout, flags);
 	}
 
 	if (tb[IPSET_ATTR_PORT_TO]) {
@@ -277,7 +277,7 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[],
 
 	for (; port <= port_to; port++) {
 		id = port - map->first_port;
-		ret = adtfn(set, &id, timeout);
+		ret = adtfn(set, &id, timeout, flags);
 
 		if (ret && !ip_set_eexist(ret, flags))
 			return ret;
diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c
index 43bcce200129..368302001120 100644
--- a/net/netfilter/ipset/ip_set_hash_ip.c
+++ b/net/netfilter/ipset/ip_set_hash_ip.c
@@ -121,7 +121,7 @@ hash_ip4_kadt(struct ip_set *set, const struct sk_buff *skb,
 	if (ip == 0)
 		return -EINVAL;
 
-	return adtfn(set, &ip, h->timeout);
+	return adtfn(set, &ip, h->timeout, flags);
 }
 
 static int
@@ -157,7 +157,7 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
 		nip = htonl(ip);
 		if (nip == 0)
 			return -IPSET_ERR_HASH_ELEM;
-		return adtfn(set, &nip, timeout);
+		return adtfn(set, &nip, timeout, flags);
 	}
 
 	if (tb[IPSET_ATTR_IP_TO]) {
@@ -182,7 +182,7 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
 		nip = htonl(ip);
 		if (nip == 0)
 			return -IPSET_ERR_HASH_ELEM;
-		ret = adtfn(set, &nip, timeout);
+		ret = adtfn(set, &nip, timeout, flags);
 
 		if (ret && !ip_set_eexist(ret, flags))
 			return ret;
@@ -294,7 +294,7 @@ hash_ip6_kadt(struct ip_set *set, const struct sk_buff *skb,
 	if (ipv6_addr_any(&ip.in6))
 		return -EINVAL;
 
-	return adtfn(set, &ip, h->timeout);
+	return adtfn(set, &ip, h->timeout, flags);
 }
 
 static const struct nla_policy hash_ip6_adt_policy[IPSET_ATTR_ADT_MAX + 1] = {
@@ -336,7 +336,7 @@ hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[],
 		timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
 	}
 
-	ret = adtfn(set, &ip, timeout);
+	ret = adtfn(set, &ip, timeout, flags);
 
 	return ip_set_eexist(ret, flags) ? 0 : ret;
 }
diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c
index 14281b6b8074..65c2ff4b27aa 100644
--- a/net/netfilter/ipset/ip_set_hash_ipport.c
+++ b/net/netfilter/ipset/ip_set_hash_ipport.c
@@ -138,7 +138,7 @@ hash_ipport4_kadt(struct ip_set *set, const struct sk_buff *skb,
 
 	ip4addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip);
 
-	return adtfn(set, &data, h->timeout);
+	return adtfn(set, &data, h->timeout, flags);
 }
 
 static int
@@ -192,7 +192,7 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
 	if (adt == IPSET_TEST ||
 	    !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] ||
 	      tb[IPSET_ATTR_PORT_TO])) {
-		ret = adtfn(set, &data, timeout);
+		ret = adtfn(set, &data, timeout, flags);
 		return ip_set_eexist(ret, flags) ? 0 : ret;
 	}
 
@@ -224,7 +224,7 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
 		for (p = port; p <= port_to; p++) {
 			data.ip = htonl(ip);
 			data.port = htons(p);
-			ret = adtfn(set, &data, timeout);
+			ret = adtfn(set, &data, timeout, flags);
 
 			if (ret && !ip_set_eexist(ret, flags))
 				return ret;
@@ -342,7 +342,7 @@ hash_ipport6_kadt(struct ip_set *set, const struct sk_buff *skb,
 
 	ip6addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip.in6);
 
-	return adtfn(set, &data, h->timeout);
+	return adtfn(set, &data, h->timeout, flags);
 }
 
 static int
@@ -396,7 +396,7 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[],
 	}
 
 	if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) {
-		ret = adtfn(set, &data, timeout);
+		ret = adtfn(set, &data, timeout, flags);
 		return ip_set_eexist(ret, flags) ? 0 : ret;
 	}
 
@@ -407,7 +407,7 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[],
 
 	for (; port <= port_to; port++) {
 		data.port = htons(port);
-		ret = adtfn(set, &data, timeout);
+		ret = adtfn(set, &data, timeout, flags);
 
 		if (ret && !ip_set_eexist(ret, flags))
 			return ret;
diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c
index 401c8a2531db..670e5e4a1232 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportip.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportip.c
@@ -142,7 +142,7 @@ hash_ipportip4_kadt(struct ip_set *set, const struct sk_buff *skb,
 	ip4addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip);
 	ip4addrptr(skb, flags & IPSET_DIM_THREE_SRC, &data.ip2);
 
-	return adtfn(set, &data, h->timeout);
+	return adtfn(set, &data, h->timeout, flags);
 }
 
 static int
@@ -200,7 +200,7 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
 	if (adt == IPSET_TEST ||
 	    !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] ||
 	      tb[IPSET_ATTR_PORT_TO])) {
-		ret = adtfn(set, &data, timeout);
+		ret = adtfn(set, &data, timeout, flags);
 		return ip_set_eexist(ret, flags) ? 0 : ret;
 	}
 
@@ -232,7 +232,7 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
 		for (p = port; p <= port_to; p++) {
 			data.ip = htonl(ip);
 			data.port = htons(p);
-			ret = adtfn(set, &data, timeout);
+			ret = adtfn(set, &data, timeout, flags);
 
 			if (ret && !ip_set_eexist(ret, flags))
 				return ret;
@@ -356,7 +356,7 @@ hash_ipportip6_kadt(struct ip_set *set, const struct sk_buff *skb,
 	ip6addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip.in6);
 	ip6addrptr(skb, flags & IPSET_DIM_THREE_SRC, &data.ip2.in6);
 
-	return adtfn(set, &data, h->timeout);
+	return adtfn(set, &data, h->timeout, flags);
 }
 
 static int
@@ -414,7 +414,7 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[],
 	}
 
 	if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) {
-		ret = adtfn(set, &data, timeout);
+		ret = adtfn(set, &data, timeout, flags);
 		return ip_set_eexist(ret, flags) ? 0 : ret;
 	}
 
@@ -425,7 +425,7 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[],
 
 	for (; port <= port_to; port++) {
 		data.port = htons(port);
-		ret = adtfn(set, &data, timeout);
+		ret = adtfn(set, &data, timeout, flags);
 
 		if (ret && !ip_set_eexist(ret, flags))
 			return ret;
diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c
index 4743e5402522..4bb365c9f3db 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c
@@ -162,7 +162,7 @@ hash_ipportnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
 	ip4addrptr(skb, flags & IPSET_DIM_THREE_SRC, &data.ip2);
 	data.ip2 &= ip_set_netmask(data.cidr);
 
-	return adtfn(set, &data, h->timeout);
+	return adtfn(set, &data, h->timeout, flags);
 }
 
 static int
@@ -228,7 +228,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
 	if (adt == IPSET_TEST ||
 	    !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] ||
 	      tb[IPSET_ATTR_PORT_TO])) {
-		ret = adtfn(set, &data, timeout);
+		ret = adtfn(set, &data, timeout, flags);
 		return ip_set_eexist(ret, flags) ? 0 : ret;
 	}
 
@@ -260,7 +260,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
 		for (p = port; p <= port_to; p++) {
 			data.ip = htonl(ip);
 			data.port = htons(p);
-			ret = adtfn(set, &data, timeout);
+			ret = adtfn(set, &data, timeout, flags);
 
 			if (ret && !ip_set_eexist(ret, flags))
 				return ret;
@@ -410,7 +410,7 @@ hash_ipportnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
 	ip6addrptr(skb, flags & IPSET_DIM_THREE_SRC, &data.ip2.in6);
 	ip6_netmask(&data.ip2, data.cidr);
 
-	return adtfn(set, &data, h->timeout);
+	return adtfn(set, &data, h->timeout, flags);
 }
 
 static int
@@ -476,7 +476,7 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
 	}
 
 	if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) {
-		ret = adtfn(set, &data, timeout);
+		ret = adtfn(set, &data, timeout, flags);
 		return ip_set_eexist(ret, flags) ? 0 : ret;
 	}
 
@@ -487,7 +487,7 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
 
 	for (; port <= port_to; port++) {
 		data.port = htons(port);
-		ret = adtfn(set, &data, timeout);
+		ret = adtfn(set, &data, timeout, flags);
 
 		if (ret && !ip_set_eexist(ret, flags))
 			return ret;
diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c
index c4db202b7da4..440b38f9fe3b 100644
--- a/net/netfilter/ipset/ip_set_hash_net.c
+++ b/net/netfilter/ipset/ip_set_hash_net.c
@@ -141,7 +141,7 @@ hash_net4_kadt(struct ip_set *set, const struct sk_buff *skb,
 	ip4addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip);
 	data.ip &= ip_set_netmask(data.cidr);
 
-	return adtfn(set, &data, h->timeout);
+	return adtfn(set, &data, h->timeout, flags);
 }
 
 static int
@@ -179,7 +179,7 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],
 		timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
 	}
 
-	ret = adtfn(set, &data, timeout);
+	ret = adtfn(set, &data, timeout, flags);
 
 	return ip_set_eexist(ret, flags) ? 0 : ret;
 }
@@ -306,7 +306,7 @@ hash_net6_kadt(struct ip_set *set, const struct sk_buff *skb,
 	ip6addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip.in6);
 	ip6_netmask(&data.ip, data.cidr);
 
-	return adtfn(set, &data, h->timeout);
+	return adtfn(set, &data, h->timeout, flags);
 }
 
 static int
@@ -344,7 +344,7 @@ hash_net6_uadt(struct ip_set *set, struct nlattr *tb[],
 		timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
 	}
 
-	ret = adtfn(set, &data, timeout);
+	ret = adtfn(set, &data, timeout, flags);
 
 	return ip_set_eexist(ret, flags) ? 0 : ret;
 }
diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c
index d2a40362dd3a..2d31291ba83e 100644
--- a/net/netfilter/ipset/ip_set_hash_netport.c
+++ b/net/netfilter/ipset/ip_set_hash_netport.c
@@ -158,7 +158,7 @@ hash_netport4_kadt(struct ip_set *set, const struct sk_buff *skb,
 	ip4addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip);
 	data.ip &= ip_set_netmask(data.cidr);
 
-	return adtfn(set, &data, h->timeout);
+	return adtfn(set, &data, h->timeout, flags);
 }
 
 static int
@@ -216,7 +216,7 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
 	}
 
 	if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) {
-		ret = adtfn(set, &data, timeout);
+		ret = adtfn(set, &data, timeout, flags);
 		return ip_set_eexist(ret, flags) ? 0 : ret;
 	}
 
@@ -227,7 +227,7 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
 
 	for (; port <= port_to; port++) {
 		data.port = htons(port);
-		ret = adtfn(set, &data, timeout);
+		ret = adtfn(set, &data, timeout, flags);
 
 		if (ret && !ip_set_eexist(ret, flags))
 			return ret;
@@ -371,7 +371,7 @@ hash_netport6_kadt(struct ip_set *set, const struct sk_buff *skb,
 	ip6addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip.in6);
 	ip6_netmask(&data.ip, data.cidr);
 
-	return adtfn(set, &data, h->timeout);
+	return adtfn(set, &data, h->timeout, flags);
 }
 
 static int
@@ -429,7 +429,7 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[],
 	}
 
 	if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) {
-		ret = adtfn(set, &data, timeout);
+		ret = adtfn(set, &data, timeout, flags);
 		return ip_set_eexist(ret, flags) ? 0 : ret;
 	}
 
@@ -440,7 +440,7 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[],
 
 	for (; port <= port_to; port++) {
 		data.port = htons(port);
-		ret = adtfn(set, &data, timeout);
+		ret = adtfn(set, &data, timeout, flags);
 
 		if (ret && !ip_set_eexist(ret, flags))
 			return ret;
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
index e9159e99fc4b..a0290ffad355 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -109,15 +109,28 @@ list_set_kadt(struct ip_set *set, const struct sk_buff *skb,
 }
 
 static bool
-next_id_eq(const struct list_set *map, u32 i, ip_set_id_t id)
+id_eq(const struct list_set *map, u32 i, ip_set_id_t id)
 {
 	const struct set_elem *elem;
 
-	if (i + 1 < map->size) {
-		elem = list_set_elem(map, i + 1);
+	if (i < map->size) {
+		elem = list_set_elem(map, i);
+		return elem->id == id;
+	}
+
+	return 0;
+}
+
+static bool
+id_eq_timeout(const struct list_set *map, u32 i, ip_set_id_t id)
+{
+	const struct set_elem *elem;
+
+	if (i < map->size) {
+		elem = list_set_elem(map, i);
 		return !!(elem->id == id &&
 			  !(with_timeout(map->timeout) &&
-			    list_set_expired(map, i + 1)));
+			    list_set_expired(map, i)));
 	}
 
 	return 0;
@@ -190,12 +203,26 @@ list_set_del(struct list_set *map, u32 i)
 	return 0;
 }
 
+static void
+cleanup_entries(struct list_set *map)
+{
+	struct set_telem *e;
+	u32 i;
+
+	for (i = 0; i < map->size; i++) {
+		e = list_set_telem(map, i);
+		if (e->id != IPSET_INVALID_ID && list_set_expired(map, i))
+			list_set_del(map, i);
+	}
+}
+
 static int
 list_set_uadt(struct ip_set *set, struct nlattr *tb[],
 	      enum ipset_adt adt, u32 *lineno, u32 flags)
 {
 	struct list_set *map = set->data;
 	bool with_timeout = with_timeout(map->timeout);
+	bool flag_exist = flags & IPSET_FLAG_EXIST;
 	int before = 0;
 	u32 timeout = map->timeout;
 	ip_set_id_t id, refid = IPSET_INVALID_ID;
@@ -248,6 +275,8 @@ list_set_uadt(struct ip_set *set, struct nlattr *tb[],
 		}
 		timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
 	}
+	if (with_timeout && adt != IPSET_TEST)
+		cleanup_entries(map);
 
 	switch (adt) {
 	case IPSET_TEST:
@@ -259,22 +288,37 @@ list_set_uadt(struct ip_set *set, struct nlattr *tb[],
 			else if (with_timeout && list_set_expired(map, i))
 				continue;
 			else if (before > 0 && elem->id == id)
-				ret = next_id_eq(map, i, refid);
+				ret = id_eq_timeout(map, i + 1, refid);
 			else if (before < 0 && elem->id == refid)
-				ret = next_id_eq(map, i, id);
+				ret = id_eq_timeout(map, i + 1, id);
 			else if (before == 0 && elem->id == id)
 				ret = 1;
 		}
 		break;
 	case IPSET_ADD:
-		for (i = 0; i < map->size && !ret; i++) {
+		for (i = 0; i < map->size; i++) {
 			elem = list_set_elem(map, i);
-			if (elem->id == id &&
-			    !(with_timeout && list_set_expired(map, i)))
+			if (elem->id != id)
+				continue;
+			if (!(with_timeout && flag_exist)) {
 				ret = -IPSET_ERR_EXIST;
+				goto finish;
+			} else {
+				struct set_telem *e = list_set_telem(map, i);
+
+				if ((before > 1 &&
+				     !id_eq(map, i + 1, refid)) ||
+				    (before < 0 &&
+				     (i == 0 || !id_eq(map, i - 1, refid)))) {
+				     	ret = -IPSET_ERR_EXIST;
+				     	goto finish;
+				}
+				e->timeout = ip_set_timeout_set(timeout);
+				ip_set_put_byindex(id);
+				ret = 0;
+				goto finish;
+			}
 		}
-		if (ret == -IPSET_ERR_EXIST)
-			break;
 		ret = -IPSET_ERR_LIST_FULL;
 		for (i = 0; i < map->size && ret == -IPSET_ERR_LIST_FULL; i++) {
 			elem = list_set_elem(map, i);
@@ -283,9 +327,7 @@ list_set_uadt(struct ip_set *set, struct nlattr *tb[],
 					: list_set_add(map, i, id, timeout);
 			else if (elem->id != refid)
 				continue;
-			else if (with_timeout && list_set_expired(map, i))
-				ret = -IPSET_ERR_REF_EXIST;
-			else if (before)
+			else if (before > 0)
 				ret = list_set_add(map, i, id, timeout);
 			else if (i + 1 < map->size)
 				ret = list_set_add(map, i + 1, id, timeout);
@@ -299,16 +341,12 @@ list_set_uadt(struct ip_set *set, struct nlattr *tb[],
 				ret = before != 0 ? -IPSET_ERR_REF_EXIST
 						  : -IPSET_ERR_EXIST;
 				break;
-			} else if (with_timeout && list_set_expired(map, i))
-				continue;
-			else if (elem->id == id &&
-				 (before == 0 ||
-				  (before > 0 &&
-				   next_id_eq(map, i, refid))))
+			} else if (elem->id == id &&
+				   (before == 0 ||
+				    (before > 0 && id_eq(map, i + 1, refid))))
 				ret = list_set_del(map, i);
-			else if (before < 0 &&
-				 elem->id == refid &&
-				 next_id_eq(map, i, id))
+			else if (elem->id == refid &&
+				 before < 0 && id_eq(map, i + 1, id))
 				ret = list_set_del(map, i + 1);
 		}
 		break;
@@ -454,15 +492,9 @@ list_set_gc(unsigned long ul_set)
 {
 	struct ip_set *set = (struct ip_set *) ul_set;
 	struct list_set *map = set->data;
-	struct set_telem *e;
-	u32 i;
 
 	write_lock_bh(&set->lock);
-	for (i = 0; i < map->size; i++) {
-		e = list_set_telem(map, i);
-		if (e->id != IPSET_INVALID_ID && list_set_expired(map, i))
-			list_set_del(map, i);
-	}
+	cleanup_entries(map);
 	write_unlock_bh(&set->lock);
 
 	map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ;
-- 
cgit v1.2.3


From ac8cc925d35fc5a05da2bd097e602f20de2478a4 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Thu, 16 Jun 2011 18:42:40 +0200
Subject: netfilter: ipset: options and flags support added to the kernel API

The support makes possible to specify the timeout value for
the SET target and a flag to reset the timeout for already existing
entries.

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/ipset/ip_set.h         |  18 ++-
 include/linux/netfilter/ipset/ip_set_ahash.h   |   2 +-
 include/linux/netfilter/ipset/ip_set_timeout.h |   3 +
 include/linux/netfilter/xt_set.h               |  15 ++-
 net/netfilter/ipset/ip_set_bitmap_ip.c         |   6 +-
 net/netfilter/ipset/ip_set_bitmap_ipmac.c      |   8 +-
 net/netfilter/ipset/ip_set_bitmap_port.c       |   7 +-
 net/netfilter/ipset/ip_set_core.c              |  26 ++---
 net/netfilter/ipset/ip_set_hash_ip.c           |  12 +-
 net/netfilter/ipset/ip_set_hash_ipport.c       |  16 +--
 net/netfilter/ipset/ip_set_hash_ipportip.c     |  20 ++--
 net/netfilter/ipset/ip_set_hash_ipportnet.c    |  20 ++--
 net/netfilter/ipset/ip_set_hash_net.c          |  12 +-
 net/netfilter/ipset/ip_set_hash_netport.c      |  16 +--
 net/netfilter/ipset/ip_set_list_set.c          |   8 +-
 net/netfilter/xt_set.c                         | 151 ++++++++++++++++---------
 16 files changed, 206 insertions(+), 134 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 277b7fbc7fb2..68b21f5d6a84 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -217,6 +217,15 @@ struct ip_set;
 typedef int (*ipset_adtfn)(struct ip_set *set, void *value,
 			   u32 timeout, u32 flags);
 
+/* Kernel API function options */
+struct ip_set_adt_opt {
+	u8 family;		/* Actual protocol family */
+	u8 dim;			/* Dimension of match/target */
+	u8 flags;		/* Direction and negation flags */
+	u32 cmdflags;		/* Command-like flags */
+	u32 timeout;		/* Timeout value */
+};
+
 /* Set type, variant-specific part */
 struct ip_set_type_variant {
 	/* Kernelspace: test/add/del entries
@@ -224,7 +233,7 @@ struct ip_set_type_variant {
 	 *			zero for no match/success to add/delete
 	 *			positive for matching element */
 	int (*kadt)(struct ip_set *set, const struct sk_buff * skb,
-		    enum ipset_adt adt, u8 pf, u8 dim, u8 flags);
+		    enum ipset_adt adt, const struct ip_set_adt_opt *opt);
 
 	/* Userspace: test/add/del entries
 	 *		returns negative error code,
@@ -314,12 +323,13 @@ extern ip_set_id_t ip_set_nfnl_get_byindex(ip_set_id_t index);
 extern void ip_set_nfnl_put(ip_set_id_t index);
 
 /* API for iptables set match, and SET target */
+
 extern int ip_set_add(ip_set_id_t id, const struct sk_buff *skb,
-		      u8 family, u8 dim, u8 flags);
+		      const struct ip_set_adt_opt *opt);
 extern int ip_set_del(ip_set_id_t id, const struct sk_buff *skb,
-		      u8 family, u8 dim, u8 flags);
+		      const struct ip_set_adt_opt *opt);
 extern int ip_set_test(ip_set_id_t id, const struct sk_buff *skb,
-		       u8 family, u8 dim, u8 flags);
+		       const struct ip_set_adt_opt *opt);
 
 /* Utility functions */
 extern void * ip_set_alloc(size_t size);
diff --git a/include/linux/netfilter/ipset/ip_set_ahash.h b/include/linux/netfilter/ipset/ip_set_ahash.h
index 36cf4dc703bb..6c0219348b43 100644
--- a/include/linux/netfilter/ipset/ip_set_ahash.h
+++ b/include/linux/netfilter/ipset/ip_set_ahash.h
@@ -586,7 +586,7 @@ nla_put_failure:
 
 static int
 type_pf_kadt(struct ip_set *set, const struct sk_buff * skb,
-	     enum ipset_adt adt, u8 pf, u8 dim, u8 flags);
+	     enum ipset_adt adt, const struct ip_set_adt_opt *opt);
 static int
 type_pf_uadt(struct ip_set *set, struct nlattr *tb[],
 	     enum ipset_adt adt, u32 *lineno, u32 flags);
diff --git a/include/linux/netfilter/ipset/ip_set_timeout.h b/include/linux/netfilter/ipset/ip_set_timeout.h
index bcdd40ad39ed..bae086af0e44 100644
--- a/include/linux/netfilter/ipset/ip_set_timeout.h
+++ b/include/linux/netfilter/ipset/ip_set_timeout.h
@@ -22,6 +22,9 @@
 
 #define with_timeout(timeout)	((timeout) != IPSET_NO_TIMEOUT)
 
+#define opt_timeout(opt, map)	\
+	(with_timeout((opt)->timeout) ? (opt)->timeout : (map)->timeout)
+
 static inline unsigned int
 ip_set_timeout_uget(struct nlattr *tb)
 {
diff --git a/include/linux/netfilter/xt_set.h b/include/linux/netfilter/xt_set.h
index 081f1ded2842..c0405ac92870 100644
--- a/include/linux/netfilter/xt_set.h
+++ b/include/linux/netfilter/xt_set.h
@@ -35,7 +35,7 @@ struct xt_set_info_target_v0 {
 	struct xt_set_info_v0 del_set;
 };
 
-/* Revision 1: current interface to netfilter/iptables */
+/* Revision 1  match and target */
 
 struct xt_set_info {
 	ip_set_id_t index;
@@ -44,13 +44,22 @@ struct xt_set_info {
 };
 
 /* match and target infos */
-struct xt_set_info_match {
+struct xt_set_info_match_v1 {
 	struct xt_set_info match_set;
 };
 
-struct xt_set_info_target {
+struct xt_set_info_target_v1 {
 	struct xt_set_info add_set;
 	struct xt_set_info del_set;
 };
 
+/* Revision 2 target */
+
+struct xt_set_info_target_v2 {
+	struct xt_set_info add_set;
+	struct xt_set_info del_set;
+	u32 flags;
+	u32 timeout;
+};
+
 #endif /*_XT_SET_H*/
diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c
index 85b1cdf0a4b8..75990b37ca37 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ip.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ip.c
@@ -219,19 +219,19 @@ nla_put_failure:
 
 static int
 bitmap_ip_kadt(struct ip_set *set, const struct sk_buff *skb,
-	       enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+	       enum ipset_adt adt, const struct ip_set_adt_opt *opt)
 {
 	struct bitmap_ip *map = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
 	u32 ip;
 
-	ip = ntohl(ip4addr(skb, flags & IPSET_DIM_ONE_SRC));
+	ip = ntohl(ip4addr(skb, opt->flags & IPSET_DIM_ONE_SRC));
 	if (ip < map->first_ip || ip > map->last_ip)
 		return -IPSET_ERR_BITMAP_RANGE;
 
 	ip = ip_to_id(map, ip);
 
-	return adtfn(set, &ip, map->timeout, flags);
+	return adtfn(set, &ip, opt_timeout(opt, map), opt->cmdflags);
 }
 
 static int
diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
index 913a461382e4..cbe77f36650b 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
@@ -338,17 +338,17 @@ nla_put_failure:
 
 static int
 bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb,
-		  enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+		  enum ipset_adt adt, const struct ip_set_adt_opt *opt)
 {
 	struct bitmap_ipmac *map = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
 	struct ipmac data;
 
 	/* MAC can be src only */
-	if (!(flags & IPSET_DIM_TWO_SRC))
+	if (!(opt->flags & IPSET_DIM_TWO_SRC))
 		return 0;
 
-	data.id = ntohl(ip4addr(skb, flags & IPSET_DIM_ONE_SRC));
+	data.id = ntohl(ip4addr(skb, opt->flags & IPSET_DIM_ONE_SRC));
 	if (data.id < map->first_ip || data.id > map->last_ip)
 		return -IPSET_ERR_BITMAP_RANGE;
 
@@ -360,7 +360,7 @@ bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb,
 	data.id -= map->first_ip;
 	data.ether = eth_hdr(skb)->h_source;
 
-	return adtfn(set, &data, map->timeout, flags);
+	return adtfn(set, &data, opt_timeout(opt, map), opt->cmdflags);
 }
 
 static int
diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c
index a3935eef76fc..0b0ae19d0290 100644
--- a/net/netfilter/ipset/ip_set_bitmap_port.c
+++ b/net/netfilter/ipset/ip_set_bitmap_port.c
@@ -208,14 +208,15 @@ nla_put_failure:
 
 static int
 bitmap_port_kadt(struct ip_set *set, const struct sk_buff *skb,
-		 enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+		 enum ipset_adt adt, const struct ip_set_adt_opt *opt)
 {
 	struct bitmap_port *map = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
 	__be16 __port;
 	u16 port = 0;
 
-	if (!ip_set_get_ip_port(skb, pf, flags & IPSET_DIM_ONE_SRC, &__port))
+	if (!ip_set_get_ip_port(skb, opt->family,
+				opt->flags & IPSET_DIM_ONE_SRC, &__port))
 		return -EINVAL;
 
 	port = ntohs(__port);
@@ -225,7 +226,7 @@ bitmap_port_kadt(struct ip_set *set, const struct sk_buff *skb,
 
 	port -= map->first_port;
 
-	return adtfn(set, &port, map->timeout, flags);
+	return adtfn(set, &port, opt_timeout(opt, map), opt->cmdflags);
 }
 
 static int
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 333b0bedf298..c15c0624d37f 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -325,7 +325,7 @@ __ip_set_put(ip_set_id_t index)
 
 int
 ip_set_test(ip_set_id_t index, const struct sk_buff *skb,
-	    u8 family, u8 dim, u8 flags)
+	    const struct ip_set_adt_opt *opt)
 {
 	struct ip_set *set = ip_set_list[index];
 	int ret = 0;
@@ -333,19 +333,19 @@ ip_set_test(ip_set_id_t index, const struct sk_buff *skb,
 	BUG_ON(set == NULL);
 	pr_debug("set %s, index %u\n", set->name, index);
 
-	if (dim < set->type->dimension ||
-	    !(family == set->family || set->family == AF_UNSPEC))
+	if (opt->dim < set->type->dimension ||
+	    !(opt->family == set->family || set->family == AF_UNSPEC))
 		return 0;
 
 	read_lock_bh(&set->lock);
-	ret = set->variant->kadt(set, skb, IPSET_TEST, family, dim, flags);
+	ret = set->variant->kadt(set, skb, IPSET_TEST, opt);
 	read_unlock_bh(&set->lock);
 
 	if (ret == -EAGAIN) {
 		/* Type requests element to be completed */
 		pr_debug("element must be competed, ADD is triggered\n");
 		write_lock_bh(&set->lock);
-		set->variant->kadt(set, skb, IPSET_ADD, family, dim, flags);
+		set->variant->kadt(set, skb, IPSET_ADD, opt);
 		write_unlock_bh(&set->lock);
 		ret = 1;
 	}
@@ -357,7 +357,7 @@ EXPORT_SYMBOL_GPL(ip_set_test);
 
 int
 ip_set_add(ip_set_id_t index, const struct sk_buff *skb,
-	   u8 family, u8 dim, u8 flags)
+	   const struct ip_set_adt_opt *opt)
 {
 	struct ip_set *set = ip_set_list[index];
 	int ret;
@@ -365,12 +365,12 @@ ip_set_add(ip_set_id_t index, const struct sk_buff *skb,
 	BUG_ON(set == NULL);
 	pr_debug("set %s, index %u\n", set->name, index);
 
-	if (dim < set->type->dimension ||
-	    !(family == set->family || set->family == AF_UNSPEC))
+	if (opt->dim < set->type->dimension ||
+	    !(opt->family == set->family || set->family == AF_UNSPEC))
 		return 0;
 
 	write_lock_bh(&set->lock);
-	ret = set->variant->kadt(set, skb, IPSET_ADD, family, dim, flags);
+	ret = set->variant->kadt(set, skb, IPSET_ADD, opt);
 	write_unlock_bh(&set->lock);
 
 	return ret;
@@ -379,7 +379,7 @@ EXPORT_SYMBOL_GPL(ip_set_add);
 
 int
 ip_set_del(ip_set_id_t index, const struct sk_buff *skb,
-	   u8 family, u8 dim, u8 flags)
+	   const struct ip_set_adt_opt *opt)
 {
 	struct ip_set *set = ip_set_list[index];
 	int ret = 0;
@@ -387,12 +387,12 @@ ip_set_del(ip_set_id_t index, const struct sk_buff *skb,
 	BUG_ON(set == NULL);
 	pr_debug("set %s, index %u\n", set->name, index);
 
-	if (dim < set->type->dimension ||
-	    !(family == set->family || set->family == AF_UNSPEC))
+	if (opt->dim < set->type->dimension ||
+	    !(opt->family == set->family || set->family == AF_UNSPEC))
 		return 0;
 
 	write_lock_bh(&set->lock);
-	ret = set->variant->kadt(set, skb, IPSET_DEL, family, dim, flags);
+	ret = set->variant->kadt(set, skb, IPSET_DEL, opt);
 	write_unlock_bh(&set->lock);
 
 	return ret;
diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c
index 368302001120..65a445477f64 100644
--- a/net/netfilter/ipset/ip_set_hash_ip.c
+++ b/net/netfilter/ipset/ip_set_hash_ip.c
@@ -110,18 +110,18 @@ nla_put_failure:
 
 static int
 hash_ip4_kadt(struct ip_set *set, const struct sk_buff *skb,
-	      enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+	      enum ipset_adt adt, const struct ip_set_adt_opt *opt)
 {
 	const struct ip_set_hash *h = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
 	__be32 ip;
 
-	ip4addrptr(skb, flags & IPSET_DIM_ONE_SRC, &ip);
+	ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &ip);
 	ip &= ip_set_netmask(h->netmask);
 	if (ip == 0)
 		return -EINVAL;
 
-	return adtfn(set, &ip, h->timeout, flags);
+	return adtfn(set, &ip, opt_timeout(opt, h), opt->cmdflags);
 }
 
 static int
@@ -283,18 +283,18 @@ nla_put_failure:
 
 static int
 hash_ip6_kadt(struct ip_set *set, const struct sk_buff *skb,
-	      enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+	      enum ipset_adt adt, const struct ip_set_adt_opt *opt)
 {
 	const struct ip_set_hash *h = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
 	union nf_inet_addr ip;
 
-	ip6addrptr(skb, flags & IPSET_DIM_ONE_SRC, &ip.in6);
+	ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &ip.in6);
 	ip6_netmask(&ip, h->netmask);
 	if (ipv6_addr_any(&ip.in6))
 		return -EINVAL;
 
-	return adtfn(set, &ip, h->timeout, flags);
+	return adtfn(set, &ip, opt_timeout(opt, h), opt->cmdflags);
 }
 
 static const struct nla_policy hash_ip6_adt_policy[IPSET_ATTR_ADT_MAX + 1] = {
diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c
index 65c2ff4b27aa..9f179bb4f13e 100644
--- a/net/netfilter/ipset/ip_set_hash_ipport.c
+++ b/net/netfilter/ipset/ip_set_hash_ipport.c
@@ -126,19 +126,19 @@ nla_put_failure:
 
 static int
 hash_ipport4_kadt(struct ip_set *set, const struct sk_buff *skb,
-		  enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+		  enum ipset_adt adt, const struct ip_set_adt_opt *opt)
 {
 	const struct ip_set_hash *h = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
 	struct hash_ipport4_elem data = { };
 
-	if (!ip_set_get_ip4_port(skb, flags & IPSET_DIM_TWO_SRC,
+	if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
 				 &data.port, &data.proto))
 		return -EINVAL;
 
-	ip4addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip);
+	ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip);
 
-	return adtfn(set, &data, h->timeout, flags);
+	return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags);
 }
 
 static int
@@ -330,19 +330,19 @@ nla_put_failure:
 
 static int
 hash_ipport6_kadt(struct ip_set *set, const struct sk_buff *skb,
-		  enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+		  enum ipset_adt adt, const struct ip_set_adt_opt *opt)
 {
 	const struct ip_set_hash *h = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
 	struct hash_ipport6_elem data = { };
 
-	if (!ip_set_get_ip6_port(skb, flags & IPSET_DIM_TWO_SRC,
+	if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
 				 &data.port, &data.proto))
 		return -EINVAL;
 
-	ip6addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip.in6);
+	ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip.in6);
 
-	return adtfn(set, &data, h->timeout, flags);
+	return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags);
 }
 
 static int
diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c
index 670e5e4a1232..7cfa52b34981 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportip.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportip.c
@@ -129,20 +129,20 @@ nla_put_failure:
 
 static int
 hash_ipportip4_kadt(struct ip_set *set, const struct sk_buff *skb,
-		    enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+		    enum ipset_adt adt, const struct ip_set_adt_opt *opt)
 {
 	const struct ip_set_hash *h = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
 	struct hash_ipportip4_elem data = { };
 
-	if (!ip_set_get_ip4_port(skb, flags & IPSET_DIM_TWO_SRC,
+	if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
 				 &data.port, &data.proto))
 		return -EINVAL;
 
-	ip4addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip);
-	ip4addrptr(skb, flags & IPSET_DIM_THREE_SRC, &data.ip2);
+	ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip);
+	ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &data.ip2);
 
-	return adtfn(set, &data, h->timeout, flags);
+	return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags);
 }
 
 static int
@@ -343,20 +343,20 @@ nla_put_failure:
 
 static int
 hash_ipportip6_kadt(struct ip_set *set, const struct sk_buff *skb,
-		    enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+		    enum ipset_adt adt, const struct ip_set_adt_opt *opt)
 {
 	const struct ip_set_hash *h = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
 	struct hash_ipportip6_elem data = { };
 
-	if (!ip_set_get_ip6_port(skb, flags & IPSET_DIM_TWO_SRC,
+	if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
 				 &data.port, &data.proto))
 		return -EINVAL;
 
-	ip6addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip.in6);
-	ip6addrptr(skb, flags & IPSET_DIM_THREE_SRC, &data.ip2.in6);
+	ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip.in6);
+	ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &data.ip2.in6);
 
-	return adtfn(set, &data, h->timeout, flags);
+	return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags);
 }
 
 static int
diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c
index 4bb365c9f3db..104042aba92b 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c
@@ -142,7 +142,7 @@ nla_put_failure:
 
 static int
 hash_ipportnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
-		     enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+		     enum ipset_adt adt, const struct ip_set_adt_opt *opt)
 {
 	const struct ip_set_hash *h = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
@@ -154,15 +154,15 @@ hash_ipportnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
 	if (adt == IPSET_TEST)
 		data.cidr = HOST_MASK;
 
-	if (!ip_set_get_ip4_port(skb, flags & IPSET_DIM_TWO_SRC,
+	if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
 				 &data.port, &data.proto))
 		return -EINVAL;
 
-	ip4addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip);
-	ip4addrptr(skb, flags & IPSET_DIM_THREE_SRC, &data.ip2);
+	ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip);
+	ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &data.ip2);
 	data.ip2 &= ip_set_netmask(data.cidr);
 
-	return adtfn(set, &data, h->timeout, flags);
+	return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags);
 }
 
 static int
@@ -390,7 +390,7 @@ nla_put_failure:
 
 static int
 hash_ipportnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
-		     enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+		     enum ipset_adt adt, const struct ip_set_adt_opt *opt)
 {
 	const struct ip_set_hash *h = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
@@ -402,15 +402,15 @@ hash_ipportnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
 	if (adt == IPSET_TEST)
 		data.cidr = HOST_MASK;
 
-	if (!ip_set_get_ip6_port(skb, flags & IPSET_DIM_TWO_SRC,
+	if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
 				 &data.port, &data.proto))
 		return -EINVAL;
 
-	ip6addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip.in6);
-	ip6addrptr(skb, flags & IPSET_DIM_THREE_SRC, &data.ip2.in6);
+	ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip.in6);
+	ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &data.ip2.in6);
 	ip6_netmask(&data.ip2, data.cidr);
 
-	return adtfn(set, &data, h->timeout, flags);
+	return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags);
 }
 
 static int
diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c
index 440b38f9fe3b..0024053e409a 100644
--- a/net/netfilter/ipset/ip_set_hash_net.c
+++ b/net/netfilter/ipset/ip_set_hash_net.c
@@ -127,7 +127,7 @@ nla_put_failure:
 
 static int
 hash_net4_kadt(struct ip_set *set, const struct sk_buff *skb,
-	       enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+	       enum ipset_adt adt, const struct ip_set_adt_opt *opt)
 {
 	const struct ip_set_hash *h = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
@@ -138,10 +138,10 @@ hash_net4_kadt(struct ip_set *set, const struct sk_buff *skb,
 	if (adt == IPSET_TEST)
 		data.cidr = HOST_MASK;
 
-	ip4addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip);
+	ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip);
 	data.ip &= ip_set_netmask(data.cidr);
 
-	return adtfn(set, &data, h->timeout, flags);
+	return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags);
 }
 
 static int
@@ -292,7 +292,7 @@ nla_put_failure:
 
 static int
 hash_net6_kadt(struct ip_set *set, const struct sk_buff *skb,
-	       enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+	       enum ipset_adt adt, const struct ip_set_adt_opt *opt)
 {
 	const struct ip_set_hash *h = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
@@ -303,10 +303,10 @@ hash_net6_kadt(struct ip_set *set, const struct sk_buff *skb,
 	if (adt == IPSET_TEST)
 		data.cidr = HOST_MASK;
 
-	ip6addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip.in6);
+	ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip.in6);
 	ip6_netmask(&data.ip, data.cidr);
 
-	return adtfn(set, &data, h->timeout, flags);
+	return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags);
 }
 
 static int
diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c
index 2d31291ba83e..7a2327b3407d 100644
--- a/net/netfilter/ipset/ip_set_hash_netport.c
+++ b/net/netfilter/ipset/ip_set_hash_netport.c
@@ -139,7 +139,7 @@ nla_put_failure:
 
 static int
 hash_netport4_kadt(struct ip_set *set, const struct sk_buff *skb,
-		   enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+		   enum ipset_adt adt, const struct ip_set_adt_opt *opt)
 {
 	const struct ip_set_hash *h = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
@@ -151,14 +151,14 @@ hash_netport4_kadt(struct ip_set *set, const struct sk_buff *skb,
 	if (adt == IPSET_TEST)
 		data.cidr = HOST_MASK;
 
-	if (!ip_set_get_ip4_port(skb, flags & IPSET_DIM_TWO_SRC,
+	if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
 				 &data.port, &data.proto))
 		return -EINVAL;
 
-	ip4addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip);
+	ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip);
 	data.ip &= ip_set_netmask(data.cidr);
 
-	return adtfn(set, &data, h->timeout, flags);
+	return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags);
 }
 
 static int
@@ -352,7 +352,7 @@ nla_put_failure:
 
 static int
 hash_netport6_kadt(struct ip_set *set, const struct sk_buff *skb,
-		   enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+		   enum ipset_adt adt, const struct ip_set_adt_opt *opt)
 {
 	const struct ip_set_hash *h = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
@@ -364,14 +364,14 @@ hash_netport6_kadt(struct ip_set *set, const struct sk_buff *skb,
 	if (adt == IPSET_TEST)
 		data.cidr = HOST_MASK;
 
-	if (!ip_set_get_ip6_port(skb, flags & IPSET_DIM_TWO_SRC,
+	if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
 				 &data.port, &data.proto))
 		return -EINVAL;
 
-	ip6addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip.in6);
+	ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip.in6);
 	ip6_netmask(&data.ip, data.cidr);
 
-	return adtfn(set, &data, h->timeout, flags);
+	return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags);
 }
 
 static int
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
index c2c29daab0ff..f05e9eb863dc 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -72,7 +72,7 @@ list_set_expired(const struct list_set *map, u32 id)
 
 static int
 list_set_kadt(struct ip_set *set, const struct sk_buff *skb,
-	      enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+	      enum ipset_adt adt, const struct ip_set_adt_opt *opt)
 {
 	struct list_set *map = set->data;
 	struct set_elem *elem;
@@ -87,17 +87,17 @@ list_set_kadt(struct ip_set *set, const struct sk_buff *skb,
 			continue;
 		switch (adt) {
 		case IPSET_TEST:
-			ret = ip_set_test(elem->id, skb, pf, dim, flags);
+			ret = ip_set_test(elem->id, skb, opt);
 			if (ret > 0)
 				return ret;
 			break;
 		case IPSET_ADD:
-			ret = ip_set_add(elem->id, skb, pf, dim, flags);
+			ret = ip_set_add(elem->id, skb, opt);
 			if (ret == 0)
 				return ret;
 			break;
 		case IPSET_DEL:
-			ret = ip_set_del(elem->id, skb, pf, dim, flags);
+			ret = ip_set_del(elem->id, skb, opt);
 			if (ret == 0)
 				return ret;
 			break;
diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c
index b3babaed7719..eb265bdfc1d0 100644
--- a/net/netfilter/xt_set.c
+++ b/net/netfilter/xt_set.c
@@ -29,23 +29,32 @@ MODULE_ALIAS("ip6t_SET");
 
 static inline int
 match_set(ip_set_id_t index, const struct sk_buff *skb,
-	  u8 pf, u8 dim, u8 flags, int inv)
+	  const struct ip_set_adt_opt *opt, int inv)
 {
-	if (ip_set_test(index, skb, pf, dim, flags))
+	if (ip_set_test(index, skb, opt))
 		inv = !inv;
 	return inv;
 }
 
+#define ADT_OPT(n, f, d, fs, cfs, t) 	\
+const struct ip_set_adt_opt n = {	\
+	.family	= f,			\
+	.dim = d,			\
+	.flags = fs,			\
+	.cmdflags = cfs,		\
+	.timeout = t,			\
+}
+
 /* Revision 0 interface: backward compatible with netfilter/iptables */
 
 static bool
 set_match_v0(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	const struct xt_set_info_match_v0 *info = par->matchinfo;
+	ADT_OPT(opt, par->family, info->match_set.u.compat.dim,
+		info->match_set.u.compat.flags, 0, UINT_MAX);
 
-	return match_set(info->match_set.index, skb, par->family,
-			 info->match_set.u.compat.dim,
-			 info->match_set.u.compat.flags,
+	return match_set(info->match_set.index, skb, &opt,
 			 info->match_set.u.compat.flags & IPSET_INV_MATCH);
 }
 
@@ -103,15 +112,15 @@ static unsigned int
 set_target_v0(struct sk_buff *skb, const struct xt_action_param *par)
 {
 	const struct xt_set_info_target_v0 *info = par->targinfo;
+	ADT_OPT(add_opt, par->family, info->add_set.u.compat.dim,
+		info->add_set.u.compat.flags, 0, UINT_MAX);
+	ADT_OPT(del_opt, par->family, info->del_set.u.compat.dim,
+		info->del_set.u.compat.flags, 0, UINT_MAX);
 
 	if (info->add_set.index != IPSET_INVALID_ID)
-		ip_set_add(info->add_set.index, skb, par->family,
-			   info->add_set.u.compat.dim,
-			   info->add_set.u.compat.flags);
+		ip_set_add(info->add_set.index, skb, &add_opt);
 	if (info->del_set.index != IPSET_INVALID_ID)
-		ip_set_del(info->del_set.index, skb, par->family,
-			   info->del_set.u.compat.dim,
-			   info->del_set.u.compat.flags);
+		ip_set_del(info->del_set.index, skb, &del_opt);
 
 	return XT_CONTINUE;
 }
@@ -170,23 +179,23 @@ set_target_v0_destroy(const struct xt_tgdtor_param *par)
 		ip_set_nfnl_put(info->del_set.index);
 }
 
-/* Revision 1: current interface to netfilter/iptables */
+/* Revision 1 match and target */
 
 static bool
-set_match(const struct sk_buff *skb, struct xt_action_param *par)
+set_match_v1(const struct sk_buff *skb, struct xt_action_param *par)
 {
-	const struct xt_set_info_match *info = par->matchinfo;
+	const struct xt_set_info_match_v1 *info = par->matchinfo;
+	ADT_OPT(opt, par->family, info->match_set.dim,
+		info->match_set.flags, 0, UINT_MAX);
 
-	return match_set(info->match_set.index, skb, par->family,
-			 info->match_set.dim,
-			 info->match_set.flags,
+	return match_set(info->match_set.index, skb, &opt,
 			 info->match_set.flags & IPSET_INV_MATCH);
 }
 
 static int
-set_match_checkentry(const struct xt_mtchk_param *par)
+set_match_v1_checkentry(const struct xt_mtchk_param *par)
 {
-	struct xt_set_info_match *info = par->matchinfo;
+	struct xt_set_info_match_v1 *info = par->matchinfo;
 	ip_set_id_t index;
 
 	index = ip_set_nfnl_get_byindex(info->match_set.index);
@@ -207,36 +216,34 @@ set_match_checkentry(const struct xt_mtchk_param *par)
 }
 
 static void
-set_match_destroy(const struct xt_mtdtor_param *par)
+set_match_v1_destroy(const struct xt_mtdtor_param *par)
 {
-	struct xt_set_info_match *info = par->matchinfo;
+	struct xt_set_info_match_v1 *info = par->matchinfo;
 
 	ip_set_nfnl_put(info->match_set.index);
 }
 
 static unsigned int
-set_target(struct sk_buff *skb, const struct xt_action_param *par)
+set_target_v1(struct sk_buff *skb, const struct xt_action_param *par)
 {
-	const struct xt_set_info_target *info = par->targinfo;
+	const struct xt_set_info_target_v1 *info = par->targinfo;
+	ADT_OPT(add_opt, par->family, info->add_set.dim,
+		info->add_set.flags, 0, UINT_MAX);
+	ADT_OPT(del_opt, par->family, info->del_set.dim,
+		info->del_set.flags, 0, UINT_MAX);
 
 	if (info->add_set.index != IPSET_INVALID_ID)
-		ip_set_add(info->add_set.index,
-			   skb, par->family,
-			   info->add_set.dim,
-			   info->add_set.flags);
+		ip_set_add(info->add_set.index, skb, &add_opt);
 	if (info->del_set.index != IPSET_INVALID_ID)
-		ip_set_del(info->del_set.index,
-			   skb, par->family,
-			   info->del_set.dim,
-			   info->del_set.flags);
+		ip_set_del(info->del_set.index, skb, &del_opt);
 
 	return XT_CONTINUE;
 }
 
 static int
-set_target_checkentry(const struct xt_tgchk_param *par)
+set_target_v1_checkentry(const struct xt_tgchk_param *par)
 {
-	const struct xt_set_info_target *info = par->targinfo;
+	const struct xt_set_info_target_v1 *info = par->targinfo;
 	ip_set_id_t index;
 
 	if (info->add_set.index != IPSET_INVALID_ID) {
@@ -273,9 +280,9 @@ set_target_checkentry(const struct xt_tgchk_param *par)
 }
 
 static void
-set_target_destroy(const struct xt_tgdtor_param *par)
+set_target_v1_destroy(const struct xt_tgdtor_param *par)
 {
-	const struct xt_set_info_target *info = par->targinfo;
+	const struct xt_set_info_target_v1 *info = par->targinfo;
 
 	if (info->add_set.index != IPSET_INVALID_ID)
 		ip_set_nfnl_put(info->add_set.index);
@@ -283,6 +290,28 @@ set_target_destroy(const struct xt_tgdtor_param *par)
 		ip_set_nfnl_put(info->del_set.index);
 }
 
+/* Revision 2 target */
+
+static unsigned int
+set_target_v2(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_set_info_target_v2 *info = par->targinfo;
+	ADT_OPT(add_opt, par->family, info->add_set.dim,
+		info->add_set.flags, info->flags, info->timeout);
+	ADT_OPT(del_opt, par->family, info->del_set.dim,
+		info->del_set.flags, 0, UINT_MAX);
+
+	if (info->add_set.index != IPSET_INVALID_ID)
+		ip_set_add(info->add_set.index, skb, &add_opt);
+	if (info->del_set.index != IPSET_INVALID_ID)
+		ip_set_del(info->del_set.index, skb, &del_opt);
+
+	return XT_CONTINUE;
+}
+
+#define set_target_v2_checkentry	set_target_v1_checkentry
+#define set_target_v2_destroy		set_target_v1_destroy
+
 static struct xt_match set_matches[] __read_mostly = {
 	{
 		.name		= "set",
@@ -298,20 +327,20 @@ static struct xt_match set_matches[] __read_mostly = {
 		.name		= "set",
 		.family		= NFPROTO_IPV4,
 		.revision	= 1,
-		.match		= set_match,
-		.matchsize	= sizeof(struct xt_set_info_match),
-		.checkentry	= set_match_checkentry,
-		.destroy	= set_match_destroy,
+		.match		= set_match_v1,
+		.matchsize	= sizeof(struct xt_set_info_match_v1),
+		.checkentry	= set_match_v1_checkentry,
+		.destroy	= set_match_v1_destroy,
 		.me		= THIS_MODULE
 	},
 	{
 		.name		= "set",
 		.family		= NFPROTO_IPV6,
 		.revision	= 1,
-		.match		= set_match,
-		.matchsize	= sizeof(struct xt_set_info_match),
-		.checkentry	= set_match_checkentry,
-		.destroy	= set_match_destroy,
+		.match		= set_match_v1,
+		.matchsize	= sizeof(struct xt_set_info_match_v1),
+		.checkentry	= set_match_v1_checkentry,
+		.destroy	= set_match_v1_destroy,
 		.me		= THIS_MODULE
 	},
 };
@@ -331,20 +360,40 @@ static struct xt_target set_targets[] __read_mostly = {
 		.name		= "SET",
 		.revision	= 1,
 		.family		= NFPROTO_IPV4,
-		.target		= set_target,
-		.targetsize	= sizeof(struct xt_set_info_target),
-		.checkentry	= set_target_checkentry,
-		.destroy	= set_target_destroy,
+		.target		= set_target_v1,
+		.targetsize	= sizeof(struct xt_set_info_target_v1),
+		.checkentry	= set_target_v1_checkentry,
+		.destroy	= set_target_v1_destroy,
 		.me		= THIS_MODULE
 	},
 	{
 		.name		= "SET",
 		.revision	= 1,
 		.family		= NFPROTO_IPV6,
-		.target		= set_target,
-		.targetsize	= sizeof(struct xt_set_info_target),
-		.checkentry	= set_target_checkentry,
-		.destroy	= set_target_destroy,
+		.target		= set_target_v1,
+		.targetsize	= sizeof(struct xt_set_info_target_v1),
+		.checkentry	= set_target_v1_checkentry,
+		.destroy	= set_target_v1_destroy,
+		.me		= THIS_MODULE
+	},
+	{
+		.name		= "SET",
+		.revision	= 2,
+		.family		= NFPROTO_IPV4,
+		.target		= set_target_v2,
+		.targetsize	= sizeof(struct xt_set_info_target_v2),
+		.checkentry	= set_target_v2_checkentry,
+		.destroy	= set_target_v2_destroy,
+		.me		= THIS_MODULE
+	},
+	{
+		.name		= "SET",
+		.revision	= 2,
+		.family		= NFPROTO_IPV6,
+		.target		= set_target_v2,
+		.targetsize	= sizeof(struct xt_set_info_target_v2),
+		.checkentry	= set_target_v2_checkentry,
+		.destroy	= set_target_v2_destroy,
 		.me		= THIS_MODULE
 	},
 };
-- 
cgit v1.2.3


From c1e2e04388b2539960453689b8e721709f71dc9c Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Thu, 16 Jun 2011 18:47:07 +0200
Subject: netfilter: ipset: support listing setnames and headers too

Current listing makes possible to list sets with full content only.
The patch adds support partial listings, i.e. listing just
the existing setnames or listing set headers, without set members.

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/ipset/ip_set.h |  4 ++
 net/netfilter/ipset/ip_set_core.c      | 73 +++++++++++++++++++++-------------
 2 files changed, 50 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 68b21f5d6a84..e677c4d8f00e 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -142,6 +142,10 @@ enum ipset_errno {
 enum ipset_cmd_flags {
 	IPSET_FLAG_BIT_EXIST	= 0,
 	IPSET_FLAG_EXIST	= (1 << IPSET_FLAG_BIT_EXIST),
+	IPSET_FLAG_BIT_LIST_SETNAME = 1,
+	IPSET_FLAG_LIST_SETNAME	= (1 << IPSET_FLAG_BIT_LIST_SETNAME),
+	IPSET_FLAG_BIT_LIST_HEADER = 2,
+	IPSET_FLAG_LIST_HEADER	= (1 << IPSET_FLAG_BIT_LIST_HEADER),
 };
 
 /* Flags at CADT attribute level */
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index c15c0624d37f..8446c7d81898 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -939,10 +939,13 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb,
 
 /* List/save set data */
 
-#define DUMP_INIT	0L
-#define DUMP_ALL	1L
-#define DUMP_ONE	2L
-#define DUMP_LAST	3L
+#define DUMP_INIT	0
+#define DUMP_ALL	1
+#define DUMP_ONE	2
+#define DUMP_LAST	3
+
+#define DUMP_TYPE(arg)		(((u32)(arg)) & 0x0000FFFF)
+#define DUMP_FLAGS(arg)		(((u32)(arg)) >> 16)
 
 static int
 ip_set_dump_done(struct netlink_callback *cb)
@@ -973,6 +976,7 @@ dump_init(struct netlink_callback *cb)
 	int min_len = NLMSG_SPACE(sizeof(struct nfgenmsg));
 	struct nlattr *cda[IPSET_ATTR_CMD_MAX+1];
 	struct nlattr *attr = (void *)nlh + min_len;
+	u32 dump_type;
 	ip_set_id_t index;
 
 	/* Second pass, so parser can't fail */
@@ -984,17 +988,22 @@ dump_init(struct netlink_callback *cb)
 	 *         [..]: type specific
 	 */
 
-	if (!cda[IPSET_ATTR_SETNAME]) {
-		cb->args[0] = DUMP_ALL;
-		return 0;
-	}
+	if (cda[IPSET_ATTR_SETNAME]) {
+		index = find_set_id(nla_data(cda[IPSET_ATTR_SETNAME]));
+		if (index == IPSET_INVALID_ID)
+			return -ENOENT;
 
-	index = find_set_id(nla_data(cda[IPSET_ATTR_SETNAME]));
-	if (index == IPSET_INVALID_ID)
-		return -ENOENT;
+		dump_type = DUMP_ONE;
+		cb->args[1] = index;
+	} else
+		dump_type = DUMP_ALL;
+
+	if (cda[IPSET_ATTR_FLAGS]) {
+		u32 f = ip_set_get_h32(cda[IPSET_ATTR_FLAGS]);
+		dump_type |= (f << 16);
+	}
+	cb->args[0] = dump_type;
 
-	cb->args[0] = DUMP_ONE;
-	cb->args[1] = index;
 	return 0;
 }
 
@@ -1005,9 +1014,10 @@ ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb)
 	struct ip_set *set = NULL;
 	struct nlmsghdr *nlh = NULL;
 	unsigned int flags = NETLINK_CB(cb->skb).pid ? NLM_F_MULTI : 0;
+	u32 dump_type, dump_flags;
 	int ret = 0;
 
-	if (cb->args[0] == DUMP_INIT) {
+	if (!cb->args[0]) {
 		ret = dump_init(cb);
 		if (ret < 0) {
 			nlh = nlmsg_hdr(cb->skb);
@@ -1022,14 +1032,17 @@ ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb)
 	if (cb->args[1] >= ip_set_max)
 		goto out;
 
-	max = cb->args[0] == DUMP_ONE ? cb->args[1] + 1 : ip_set_max;
+	dump_type = DUMP_TYPE(cb->args[0]);
+	dump_flags = DUMP_FLAGS(cb->args[0]);
+	max = dump_type == DUMP_ONE ? cb->args[1] + 1 : ip_set_max;
 dump_last:
-	pr_debug("args[0]: %ld args[1]: %ld\n", cb->args[0], cb->args[1]);
+	pr_debug("args[0]: %u %u args[1]: %ld\n",
+		 dump_type, dump_flags, cb->args[1]);
 	for (; cb->args[1] < max; cb->args[1]++) {
 		index = (ip_set_id_t) cb->args[1];
 		set = ip_set_list[index];
 		if (set == NULL) {
-			if (cb->args[0] == DUMP_ONE) {
+			if (dump_type == DUMP_ONE) {
 				ret = -ENOENT;
 				goto out;
 			}
@@ -1038,8 +1051,8 @@ dump_last:
 		/* When dumping all sets, we must dump "sorted"
 		 * so that lists (unions of sets) are dumped last.
 		 */
-		if (cb->args[0] != DUMP_ONE &&
-		    ((cb->args[0] == DUMP_ALL) ==
+		if (dump_type != DUMP_ONE &&
+		    ((dump_type == DUMP_ALL) ==
 		     !!(set->type->features & IPSET_DUMP_LAST)))
 			continue;
 		pr_debug("List set: %s\n", set->name);
@@ -1057,6 +1070,8 @@ dump_last:
 		}
 		NLA_PUT_U8(skb, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL);
 		NLA_PUT_STRING(skb, IPSET_ATTR_SETNAME, set->name);
+		if (dump_flags & IPSET_FLAG_LIST_SETNAME)
+			goto next_set;
 		switch (cb->args[2]) {
 		case 0:
 			/* Core header data */
@@ -1069,24 +1084,23 @@ dump_last:
 			ret = set->variant->head(set, skb);
 			if (ret < 0)
 				goto release_refcount;
+			if (dump_flags & IPSET_FLAG_LIST_HEADER)
+				goto next_set;
 			/* Fall through and add elements */
 		default:
 			read_lock_bh(&set->lock);
 			ret = set->variant->list(set, skb, cb);
 			read_unlock_bh(&set->lock);
-			if (!cb->args[2]) {
+			if (!cb->args[2])
 				/* Set is done, proceed with next one */
-				if (cb->args[0] == DUMP_ONE)
-					cb->args[1] = IPSET_INVALID_ID;
-				else
-					cb->args[1]++;
-			}
+				goto next_set;
 			goto release_refcount;
 		}
 	}
 	/* If we dump all sets, continue with dumping last ones */
-	if (cb->args[0] == DUMP_ALL) {
-		cb->args[0] = DUMP_LAST;
+	if (dump_type == DUMP_ALL) {
+		dump_type = DUMP_LAST;
+		cb->args[0] = dump_type | (dump_flags << 16);
 		cb->args[1] = 0;
 		goto dump_last;
 	}
@@ -1094,6 +1108,11 @@ dump_last:
 
 nla_put_failure:
 	ret = -EFAULT;
+next_set:
+	if (dump_type == DUMP_ONE)
+		cb->args[1] = IPSET_INVALID_ID;
+	else
+		cb->args[1]++;
 release_refcount:
 	/* If there was an error or set is done, release set */
 	if (ret || !cb->args[2]) {
-- 
cgit v1.2.3


From 3d14b171f004f75c2d1e82e10545966f94132705 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Thu, 16 Jun 2011 18:49:17 +0200
Subject: netfilter: ipset: fix adding ranges to hash types

When ranges are added to hash types, the elements may trigger rehashing
the set. However, the last successfully added element was not kept track
so the adding started again with the first element after the rehashing.

Bug reported by Mr Dash Four.

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/ipset/ip_set.h       |  2 +-
 include/linux/netfilter/ipset/ip_set_ahash.h | 22 +++++++++++++++++---
 net/netfilter/ipset/ip_set_bitmap_ip.c       |  2 +-
 net/netfilter/ipset/ip_set_bitmap_ipmac.c    |  2 +-
 net/netfilter/ipset/ip_set_bitmap_port.c     |  2 +-
 net/netfilter/ipset/ip_set_core.c            | 11 +++++-----
 net/netfilter/ipset/ip_set_hash_ip.c         | 17 +++++++++++++--
 net/netfilter/ipset/ip_set_hash_ipport.c     | 31 +++++++++++++++++++++++-----
 net/netfilter/ipset/ip_set_hash_ipportip.c   | 31 +++++++++++++++++++++++-----
 net/netfilter/ipset/ip_set_hash_ipportnet.c  | 31 +++++++++++++++++++++++-----
 net/netfilter/ipset/ip_set_hash_net.c        | 16 ++++++++++++--
 net/netfilter/ipset/ip_set_hash_netport.c    | 22 ++++++++++++++++++--
 net/netfilter/ipset/ip_set_list_set.c        |  2 +-
 13 files changed, 157 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index e677c4d8f00e..710ba0070298 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -244,7 +244,7 @@ struct ip_set_type_variant {
 	 *			zero for no match/success to add/delete
 	 *			positive for matching element */
 	int (*uadt)(struct ip_set *set, struct nlattr *tb[],
-		    enum ipset_adt adt, u32 *lineno, u32 flags);
+		    enum ipset_adt adt, u32 *lineno, u32 flags, bool retried);
 
 	/* Low level add/del/test functions */
 	ipset_adtfn adt[IPSET_ADT_MAX];
diff --git a/include/linux/netfilter/ipset/ip_set_ahash.h b/include/linux/netfilter/ipset/ip_set_ahash.h
index 6c0219348b43..8709bd950c8b 100644
--- a/include/linux/netfilter/ipset/ip_set_ahash.h
+++ b/include/linux/netfilter/ipset/ip_set_ahash.h
@@ -5,6 +5,11 @@
 #include <linux/jhash.h>
 #include <linux/netfilter/ipset/ip_set_timeout.h>
 
+#define CONCAT(a, b, c)		a##b##c
+#define TOKEN(a, b, c)		CONCAT(a, b, c)
+
+#define type_pf_next		TOKEN(TYPE, PF, _elem)
+
 /* Hashing which uses arrays to resolve clashing. The hash table is resized
  * (doubled) when searching becomes too long.
  * Internally jhash is used with the assumption that the size of the
@@ -54,6 +59,7 @@ struct ip_set_hash {
 	u32 initval;		/* random jhash init value */
 	u32 timeout;		/* timeout value, if enabled */
 	struct timer_list gc;	/* garbage collection when timeout enabled */
+	struct type_pf_next next; /* temporary storage for uadd */
 #ifdef IP_SET_HASH_WITH_NETMASK
 	u8 netmask;		/* netmask value for subnets to store */
 #endif
@@ -217,6 +223,7 @@ ip_set_hash_destroy(struct ip_set *set)
 #define type_pf_data_netmask	TOKEN(TYPE, PF, _data_netmask)
 #define type_pf_data_list	TOKEN(TYPE, PF, _data_list)
 #define type_pf_data_tlist	TOKEN(TYPE, PF, _data_tlist)
+#define type_pf_data_next	TOKEN(TYPE, PF, _data_next)
 
 #define type_pf_elem		TOKEN(TYPE, PF, _elem)
 #define type_pf_telem		TOKEN(TYPE, PF, _telem)
@@ -346,6 +353,9 @@ retry:
 	return 0;
 }
 
+static inline void
+type_pf_data_next(struct ip_set_hash *h, const struct type_pf_elem *d);
+
 /* Add an element to a hash and update the internal counters when succeeded,
  * otherwise report the proper error code. */
 static int
@@ -372,8 +382,11 @@ type_pf_add(struct ip_set *set, void *value, u32 timeout, u32 flags)
 		}
 
 	ret = type_pf_elem_add(n, value);
-	if (ret != 0)
+	if (ret != 0) {
+		if (ret == -EAGAIN)
+			type_pf_data_next(h, d);
 		goto out;
+	}
 
 #ifdef IP_SET_HASH_WITH_NETS
 	add_cidr(h, d->cidr, HOST_MASK);
@@ -589,7 +602,7 @@ type_pf_kadt(struct ip_set *set, const struct sk_buff * skb,
 	     enum ipset_adt adt, const struct ip_set_adt_opt *opt);
 static int
 type_pf_uadt(struct ip_set *set, struct nlattr *tb[],
-	     enum ipset_adt adt, u32 *lineno, u32 flags);
+	     enum ipset_adt adt, u32 *lineno, u32 flags, bool retried);
 
 static const struct ip_set_type_variant type_pf_variant = {
 	.kadt	= type_pf_kadt,
@@ -821,8 +834,11 @@ type_pf_tadd(struct ip_set *set, void *value, u32 timeout, u32 flags)
 		goto out;
 	}
 	ret = type_pf_elem_tadd(n, d, timeout);
-	if (ret != 0)
+	if (ret != 0) {
+		if (ret == -EEXIST)
+			type_pf_data_next(h, d);
 		goto out;
+	}
 
 #ifdef IP_SET_HASH_WITH_NETS
 	add_cidr(h, d->cidr, HOST_MASK);
diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c
index 75990b37ca37..3a71c8e41557 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ip.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ip.c
@@ -236,7 +236,7 @@ bitmap_ip_kadt(struct ip_set *set, const struct sk_buff *skb,
 
 static int
 bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[],
-	       enum ipset_adt adt, u32 *lineno, u32 flags)
+	       enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
 {
 	struct bitmap_ip *map = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
index cbe77f36650b..fdd5f79d93f3 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
@@ -365,7 +365,7 @@ bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb,
 
 static int
 bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[],
-		  enum ipset_adt adt, u32 *lineno, u32 flags)
+		  enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
 {
 	const struct bitmap_ipmac *map = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c
index 0b0ae19d0290..a6a5b3558ddc 100644
--- a/net/netfilter/ipset/ip_set_bitmap_port.c
+++ b/net/netfilter/ipset/ip_set_bitmap_port.c
@@ -231,7 +231,7 @@ bitmap_port_kadt(struct ip_set *set, const struct sk_buff *skb,
 
 static int
 bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[],
-		 enum ipset_adt adt, u32 *lineno, u32 flags)
+		 enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
 {
 	struct bitmap_port *map = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 8446c7d81898..528a9b3933ab 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -1158,17 +1158,18 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
 	struct nlattr *tb[], enum ipset_adt adt,
 	u32 flags, bool use_lineno)
 {
-	int ret, retried = 0;
+	int ret;
 	u32 lineno = 0;
-	bool eexist = flags & IPSET_FLAG_EXIST;
+	bool eexist = flags & IPSET_FLAG_EXIST, retried = false;
 
 	do {
 		write_lock_bh(&set->lock);
-		ret = set->variant->uadt(set, tb, adt, &lineno, flags);
+		ret = set->variant->uadt(set, tb, adt, &lineno, flags, retried);
 		write_unlock_bh(&set->lock);
+		retried = true;
 	} while (ret == -EAGAIN &&
 		 set->variant->resize &&
-		 (ret = set->variant->resize(set, retried++)) == 0);
+		 (ret = set->variant->resize(set, retried)) == 0);
 
 	if (!ret || (ret == -IPSET_ERR_EXIST && eexist))
 		return 0;
@@ -1341,7 +1342,7 @@ ip_set_utest(struct sock *ctnl, struct sk_buff *skb,
 		return -IPSET_ERR_PROTOCOL;
 
 	read_lock_bh(&set->lock);
-	ret = set->variant->uadt(set, tb, IPSET_TEST, NULL, 0);
+	ret = set->variant->uadt(set, tb, IPSET_TEST, NULL, 0, 0);
 	read_unlock_bh(&set->lock);
 	/* Userspace can't trigger element to be re-added */
 	if (ret == -EAGAIN)
diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c
index 65a445477f64..c99e861ce031 100644
--- a/net/netfilter/ipset/ip_set_hash_ip.c
+++ b/net/netfilter/ipset/ip_set_hash_ip.c
@@ -108,6 +108,12 @@ nla_put_failure:
 #define HOST_MASK	32
 #include <linux/netfilter/ipset/ip_set_ahash.h>
 
+static inline void
+hash_ip4_data_next(struct ip_set_hash *h, const struct hash_ip4_elem *d)
+{
+	h->next.ip = ntohl(d->ip);
+}
+
 static int
 hash_ip4_kadt(struct ip_set *set, const struct sk_buff *skb,
 	      enum ipset_adt adt, const struct ip_set_adt_opt *opt)
@@ -126,7 +132,7 @@ hash_ip4_kadt(struct ip_set *set, const struct sk_buff *skb,
 
 static int
 hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
-	      enum ipset_adt adt, u32 *lineno, u32 flags)
+	      enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
 {
 	const struct ip_set_hash *h = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
@@ -178,6 +184,8 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
 
 	hosts = h->netmask == 32 ? 1 : 2 << (32 - h->netmask - 1);
 
+	if (retried)
+		ip = h->next.ip;
 	for (; !before(ip_to, ip); ip += hosts) {
 		nip = htonl(ip);
 		if (nip == 0)
@@ -281,6 +289,11 @@ nla_put_failure:
 #define HOST_MASK	128
 #include <linux/netfilter/ipset/ip_set_ahash.h>
 
+static inline void
+hash_ip6_data_next(struct ip_set_hash *h, const struct hash_ip6_elem *d)
+{
+}
+
 static int
 hash_ip6_kadt(struct ip_set *set, const struct sk_buff *skb,
 	      enum ipset_adt adt, const struct ip_set_adt_opt *opt)
@@ -305,7 +318,7 @@ static const struct nla_policy hash_ip6_adt_policy[IPSET_ATTR_ADT_MAX + 1] = {
 
 static int
 hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[],
-	      enum ipset_adt adt, u32 *lineno, u32 flags)
+	      enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
 {
 	const struct ip_set_hash *h = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c
index 9f179bb4f13e..aa91b2c73be3 100644
--- a/net/netfilter/ipset/ip_set_hash_ipport.c
+++ b/net/netfilter/ipset/ip_set_hash_ipport.c
@@ -124,6 +124,14 @@ nla_put_failure:
 #define HOST_MASK	32
 #include <linux/netfilter/ipset/ip_set_ahash.h>
 
+static inline void
+hash_ipport4_data_next(struct ip_set_hash *h,
+		       const struct hash_ipport4_elem *d)
+{
+	h->next.ip = ntohl(d->ip);
+	h->next.port = ntohs(d->port);
+}
+
 static int
 hash_ipport4_kadt(struct ip_set *set, const struct sk_buff *skb,
 		  enum ipset_adt adt, const struct ip_set_adt_opt *opt)
@@ -143,12 +151,12 @@ hash_ipport4_kadt(struct ip_set *set, const struct sk_buff *skb,
 
 static int
 hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
-		  enum ipset_adt adt, u32 *lineno, u32 flags)
+		  enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
 {
 	const struct ip_set_hash *h = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
 	struct hash_ipport4_elem data = { };
-	u32 ip, ip_to, p, port, port_to;
+	u32 ip, ip_to, p = 0, port, port_to;
 	u32 timeout = h->timeout;
 	bool with_ports = false;
 	int ret;
@@ -220,8 +228,11 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
 			swap(port, port_to);
 	}
 
-	for (; !before(ip_to, ip); ip++)
-		for (p = port; p <= port_to; p++) {
+	if (retried)
+		ip = h->next.ip;
+	for (; !before(ip_to, ip); ip++) {
+		p = retried && ip == h->next.ip ? h->next.port : port;
+		for (; p <= port_to; p++) {
 			data.ip = htonl(ip);
 			data.port = htons(p);
 			ret = adtfn(set, &data, timeout, flags);
@@ -231,6 +242,7 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
 			else
 				ret = 0;
 		}
+	}
 	return ret;
 }
 
@@ -328,6 +340,13 @@ nla_put_failure:
 #define HOST_MASK	128
 #include <linux/netfilter/ipset/ip_set_ahash.h>
 
+static inline void
+hash_ipport6_data_next(struct ip_set_hash *h,
+		       const struct hash_ipport6_elem *d)
+{
+	h->next.port = ntohs(d->port);
+}
+
 static int
 hash_ipport6_kadt(struct ip_set *set, const struct sk_buff *skb,
 		  enum ipset_adt adt, const struct ip_set_adt_opt *opt)
@@ -347,7 +366,7 @@ hash_ipport6_kadt(struct ip_set *set, const struct sk_buff *skb,
 
 static int
 hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[],
-		  enum ipset_adt adt, u32 *lineno, u32 flags)
+		  enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
 {
 	const struct ip_set_hash *h = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
@@ -405,6 +424,8 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[],
 	if (port > port_to)
 		swap(port, port_to);
 
+	if (retried)
+		port = h->next.port;
 	for (; port <= port_to; port++) {
 		data.port = htons(port);
 		ret = adtfn(set, &data, timeout, flags);
diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c
index 7cfa52b34981..b88e74e0bf06 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportip.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportip.c
@@ -127,6 +127,14 @@ nla_put_failure:
 #define HOST_MASK	32
 #include <linux/netfilter/ipset/ip_set_ahash.h>
 
+static inline void
+hash_ipportip4_data_next(struct ip_set_hash *h,
+			 const struct hash_ipportip4_elem *d)
+{
+	h->next.ip = ntohl(d->ip);
+	h->next.port = ntohs(d->port);
+}
+
 static int
 hash_ipportip4_kadt(struct ip_set *set, const struct sk_buff *skb,
 		    enum ipset_adt adt, const struct ip_set_adt_opt *opt)
@@ -147,12 +155,12 @@ hash_ipportip4_kadt(struct ip_set *set, const struct sk_buff *skb,
 
 static int
 hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
-		    enum ipset_adt adt, u32 *lineno, u32 flags)
+		    enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
 {
 	const struct ip_set_hash *h = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
 	struct hash_ipportip4_elem data = { };
-	u32 ip, ip_to, p, port, port_to;
+	u32 ip, ip_to, p = 0, port, port_to;
 	u32 timeout = h->timeout;
 	bool with_ports = false;
 	int ret;
@@ -228,8 +236,11 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
 			swap(port, port_to);
 	}
 
-	for (; !before(ip_to, ip); ip++)
-		for (p = port; p <= port_to; p++) {
+	if (retried)
+		ip = h->next.ip;
+	for (; !before(ip_to, ip); ip++) {
+		p = retried && ip == h->next.ip ? h->next.port : port;
+		for (; p <= port_to; p++) {
 			data.ip = htonl(ip);
 			data.port = htons(p);
 			ret = adtfn(set, &data, timeout, flags);
@@ -239,6 +250,7 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
 			else
 				ret = 0;
 		}
+	}
 	return ret;
 }
 
@@ -341,6 +353,13 @@ nla_put_failure:
 #define HOST_MASK	128
 #include <linux/netfilter/ipset/ip_set_ahash.h>
 
+static inline void
+hash_ipportip6_data_next(struct ip_set_hash *h,
+			 const struct hash_ipportip6_elem *d)
+{
+	h->next.port = ntohs(d->port);
+}
+
 static int
 hash_ipportip6_kadt(struct ip_set *set, const struct sk_buff *skb,
 		    enum ipset_adt adt, const struct ip_set_adt_opt *opt)
@@ -361,7 +380,7 @@ hash_ipportip6_kadt(struct ip_set *set, const struct sk_buff *skb,
 
 static int
 hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[],
-		    enum ipset_adt adt, u32 *lineno, u32 flags)
+		    enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
 {
 	const struct ip_set_hash *h = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
@@ -423,6 +442,8 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[],
 	if (port > port_to)
 		swap(port, port_to);
 
+	if (retried)
+		port = h->next.port;
 	for (; port <= port_to; port++) {
 		data.port = htons(port);
 		ret = adtfn(set, &data, timeout, flags);
diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c
index 104042aba92b..605ef3bf94ef 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c
@@ -140,6 +140,14 @@ nla_put_failure:
 #define HOST_MASK	32
 #include <linux/netfilter/ipset/ip_set_ahash.h>
 
+static inline void
+hash_ipportnet4_data_next(struct ip_set_hash *h,
+			  const struct hash_ipportnet4_elem *d)
+{
+	h->next.ip = ntohl(d->ip);
+	h->next.port = ntohs(d->port);
+}
+
 static int
 hash_ipportnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
 		     enum ipset_adt adt, const struct ip_set_adt_opt *opt)
@@ -167,12 +175,12 @@ hash_ipportnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
 
 static int
 hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
-		     enum ipset_adt adt, u32 *lineno, u32 flags)
+		     enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
 {
 	const struct ip_set_hash *h = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
 	struct hash_ipportnet4_elem data = { .cidr = HOST_MASK };
-	u32 ip, ip_to, p, port, port_to;
+	u32 ip, ip_to, p = 0, port, port_to;
 	u32 timeout = h->timeout;
 	bool with_ports = false;
 	int ret;
@@ -256,8 +264,11 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
 			swap(port, port_to);
 	}
 
-	for (; !before(ip_to, ip); ip++)
-		for (p = port; p <= port_to; p++) {
+	if (retried)
+		ip = h->next.ip;
+	for (; !before(ip_to, ip); ip++) {
+		p = retried && ip == h->next.ip ? h->next.port : port;
+		for (; p <= port_to; p++) {
 			data.ip = htonl(ip);
 			data.port = htons(p);
 			ret = adtfn(set, &data, timeout, flags);
@@ -267,6 +278,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
 			else
 				ret = 0;
 		}
+	}
 	return ret;
 }
 
@@ -388,6 +400,13 @@ nla_put_failure:
 #define HOST_MASK	128
 #include <linux/netfilter/ipset/ip_set_ahash.h>
 
+static inline void
+hash_ipportnet6_data_next(struct ip_set_hash *h,
+			  const struct hash_ipportnet6_elem *d)
+{
+	h->next.port = ntohs(d->port);
+}
+
 static int
 hash_ipportnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
 		     enum ipset_adt adt, const struct ip_set_adt_opt *opt)
@@ -415,7 +434,7 @@ hash_ipportnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
 
 static int
 hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
-		     enum ipset_adt adt, u32 *lineno, u32 flags)
+		     enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
 {
 	const struct ip_set_hash *h = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
@@ -485,6 +504,8 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
 	if (port > port_to)
 		swap(port, port_to);
 
+	if (retried)
+		port = h->next.port;
 	for (; port <= port_to; port++) {
 		data.port = htons(port);
 		ret = adtfn(set, &data, timeout, flags);
diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c
index 0024053e409a..e6f8bc5771ca 100644
--- a/net/netfilter/ipset/ip_set_hash_net.c
+++ b/net/netfilter/ipset/ip_set_hash_net.c
@@ -125,6 +125,12 @@ nla_put_failure:
 #define HOST_MASK	32
 #include <linux/netfilter/ipset/ip_set_ahash.h>
 
+static inline void
+hash_net4_data_next(struct ip_set_hash *h,
+		    const struct hash_net4_elem *d)
+{
+}
+
 static int
 hash_net4_kadt(struct ip_set *set, const struct sk_buff *skb,
 	       enum ipset_adt adt, const struct ip_set_adt_opt *opt)
@@ -146,7 +152,7 @@ hash_net4_kadt(struct ip_set *set, const struct sk_buff *skb,
 
 static int
 hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],
-	       enum ipset_adt adt, u32 *lineno, u32 flags)
+	       enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
 {
 	const struct ip_set_hash *h = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
@@ -290,6 +296,12 @@ nla_put_failure:
 #define HOST_MASK	128
 #include <linux/netfilter/ipset/ip_set_ahash.h>
 
+static inline void
+hash_net6_data_next(struct ip_set_hash *h,
+		    const struct hash_net6_elem *d)
+{
+}
+
 static int
 hash_net6_kadt(struct ip_set *set, const struct sk_buff *skb,
 	       enum ipset_adt adt, const struct ip_set_adt_opt *opt)
@@ -311,7 +323,7 @@ hash_net6_kadt(struct ip_set *set, const struct sk_buff *skb,
 
 static int
 hash_net6_uadt(struct ip_set *set, struct nlattr *tb[],
-	       enum ipset_adt adt, u32 *lineno, u32 flags)
+	       enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
 {
 	const struct ip_set_hash *h = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c
index 7a2327b3407d..037b829178dc 100644
--- a/net/netfilter/ipset/ip_set_hash_netport.c
+++ b/net/netfilter/ipset/ip_set_hash_netport.c
@@ -137,6 +137,13 @@ nla_put_failure:
 #define HOST_MASK	32
 #include <linux/netfilter/ipset/ip_set_ahash.h>
 
+static inline void
+hash_netport4_data_next(struct ip_set_hash *h,
+			const struct hash_netport4_elem *d)
+{
+	h->next.port = ntohs(d->port);
+}
+
 static int
 hash_netport4_kadt(struct ip_set *set, const struct sk_buff *skb,
 		   enum ipset_adt adt, const struct ip_set_adt_opt *opt)
@@ -163,7 +170,7 @@ hash_netport4_kadt(struct ip_set *set, const struct sk_buff *skb,
 
 static int
 hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
-		   enum ipset_adt adt, u32 *lineno, u32 flags)
+		   enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
 {
 	const struct ip_set_hash *h = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
@@ -225,6 +232,8 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
 	if (port > port_to)
 		swap(port, port_to);
 
+	if (retried)
+		port = h->next.port;
 	for (; port <= port_to; port++) {
 		data.port = htons(port);
 		ret = adtfn(set, &data, timeout, flags);
@@ -350,6 +359,13 @@ nla_put_failure:
 #define HOST_MASK	128
 #include <linux/netfilter/ipset/ip_set_ahash.h>
 
+static inline void
+hash_netport6_data_next(struct ip_set_hash *h,
+			const struct hash_netport6_elem *d)
+{
+	h->next.port = ntohs(d->port);
+}
+
 static int
 hash_netport6_kadt(struct ip_set *set, const struct sk_buff *skb,
 		   enum ipset_adt adt, const struct ip_set_adt_opt *opt)
@@ -376,7 +392,7 @@ hash_netport6_kadt(struct ip_set *set, const struct sk_buff *skb,
 
 static int
 hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[],
-		   enum ipset_adt adt, u32 *lineno, u32 flags)
+		   enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
 {
 	const struct ip_set_hash *h = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
@@ -438,6 +454,8 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[],
 	if (port > port_to)
 		swap(port, port_to);
 
+	if (retried)
+		port = h->next.port;
 	for (; port <= port_to; port++) {
 		data.port = htons(port);
 		ret = adtfn(set, &data, timeout, flags);
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
index f05e9eb863dc..74f0dcc30d98 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -218,7 +218,7 @@ cleanup_entries(struct list_set *map)
 
 static int
 list_set_uadt(struct ip_set *set, struct nlattr *tb[],
-	      enum ipset_adt adt, u32 *lineno, u32 flags)
+	      enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
 {
 	struct list_set *map = set->data;
 	bool with_timeout = with_timeout(map->timeout);
-- 
cgit v1.2.3


From f1e00b39797944bf25addaf543839feeb25fbdc5 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Thu, 16 Jun 2011 18:51:41 +0200
Subject: netfilter: ipset: set type support with multiple revisions added

A set type may have multiple revisions, for example when syntax is
extended. Support continuous revision ranges in set types.

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/ipset/ip_set.h      |  6 ++--
 net/netfilter/ipset/ip_set_bitmap_ip.c      |  3 +-
 net/netfilter/ipset/ip_set_bitmap_ipmac.c   |  3 +-
 net/netfilter/ipset/ip_set_bitmap_port.c    |  3 +-
 net/netfilter/ipset/ip_set_core.c           | 45 ++++++++++++++++-------------
 net/netfilter/ipset/ip_set_hash_ip.c        |  3 +-
 net/netfilter/ipset/ip_set_hash_ipport.c    |  3 +-
 net/netfilter/ipset/ip_set_hash_ipportip.c  |  3 +-
 net/netfilter/ipset/ip_set_hash_ipportnet.c |  3 +-
 net/netfilter/ipset/ip_set_hash_net.c       |  3 +-
 net/netfilter/ipset/ip_set_hash_netport.c   |  3 +-
 net/netfilter/ipset/ip_set_list_set.c       |  3 +-
 12 files changed, 49 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 710ba0070298..ac31e382472b 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -282,8 +282,8 @@ struct ip_set_type {
 	u8 dimension;
 	/* Supported family: may be AF_UNSPEC for both AF_INET/AF_INET6 */
 	u8 family;
-	/* Type revision */
-	u8 revision;
+	/* Type revisions */
+	u8 revision_min, revision_max;
 
 	/* Create set */
 	int (*create)(struct ip_set *set, struct nlattr *tb[], u32 flags);
@@ -314,6 +314,8 @@ struct ip_set {
 	const struct ip_set_type_variant *variant;
 	/* The actual INET family of the set */
 	u8 family;
+	/* The type revision */
+	u8 revision;
 	/* The type specific data */
 	void *data;
 };
diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c
index 3a71c8e41557..3b5920bfc784 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ip.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ip.c
@@ -551,7 +551,8 @@ static struct ip_set_type bitmap_ip_type __read_mostly = {
 	.features	= IPSET_TYPE_IP,
 	.dimension	= IPSET_DIM_ONE,
 	.family		= AF_INET,
-	.revision	= 0,
+	.revision_min	= 0,
+	.revision_max	= 0,
 	.create		= bitmap_ip_create,
 	.create_policy	= {
 		[IPSET_ATTR_IP]		= { .type = NLA_NESTED },
diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
index fdd5f79d93f3..5deb7bb37468 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
@@ -623,7 +623,8 @@ static struct ip_set_type bitmap_ipmac_type = {
 	.features	= IPSET_TYPE_IP | IPSET_TYPE_MAC,
 	.dimension	= IPSET_DIM_TWO,
 	.family		= AF_INET,
-	.revision	= 0,
+	.revision_min	= 0,
+	.revision_max	= 0,
 	.create		= bitmap_ipmac_create,
 	.create_policy	= {
 		[IPSET_ATTR_IP]		= { .type = NLA_NESTED },
diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c
index a6a5b3558ddc..c3e906fcc22c 100644
--- a/net/netfilter/ipset/ip_set_bitmap_port.c
+++ b/net/netfilter/ipset/ip_set_bitmap_port.c
@@ -483,7 +483,8 @@ static struct ip_set_type bitmap_port_type = {
 	.features	= IPSET_TYPE_PORT,
 	.dimension	= IPSET_DIM_ONE,
 	.family		= AF_UNSPEC,
-	.revision	= 0,
+	.revision_min	= 0,
+	.revision_max	= 0,
 	.create		= bitmap_port_create,
 	.create_policy	= {
 		[IPSET_ATTR_PORT]	= { .type = NLA_U16 },
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 528a9b3933ab..6a82cc0c9e00 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -70,7 +70,8 @@ find_set_type(const char *name, u8 family, u8 revision)
 	list_for_each_entry_rcu(type, &ip_set_type_list, list)
 		if (STREQ(type->name, name) &&
 		    (type->family == family || type->family == AF_UNSPEC) &&
-		    type->revision == revision)
+		    revision >= type->revision_min &&
+		    revision <= type->revision_max)
 			return type;
 	return NULL;
 }
@@ -135,10 +136,10 @@ find_set_type_minmax(const char *name, u8 family, u8 *min, u8 *max)
 		if (STREQ(type->name, name) &&
 		    (type->family == family || type->family == AF_UNSPEC)) {
 			found = true;
-			if (type->revision < *min)
-				*min = type->revision;
-			if (type->revision > *max)
-				*max = type->revision;
+			if (type->revision_min < *min)
+				*min = type->revision_min;
+			if (type->revision_max > *max)
+				*max = type->revision_max;
 		}
 	rcu_read_unlock();
 	if (found)
@@ -159,25 +160,27 @@ ip_set_type_register(struct ip_set_type *type)
 	int ret = 0;
 
 	if (type->protocol != IPSET_PROTOCOL) {
-		pr_warning("ip_set type %s, family %s, revision %u uses "
+		pr_warning("ip_set type %s, family %s, revision %u:%u uses "
 			   "wrong protocol version %u (want %u)\n",
 			   type->name, family_name(type->family),
-			   type->revision, type->protocol, IPSET_PROTOCOL);
+			   type->revision_min, type->revision_max,
+			   type->protocol, IPSET_PROTOCOL);
 		return -EINVAL;
 	}
 
 	ip_set_type_lock();
-	if (find_set_type(type->name, type->family, type->revision)) {
+	if (find_set_type(type->name, type->family, type->revision_min)) {
 		/* Duplicate! */
-		pr_warning("ip_set type %s, family %s, revision %u "
+		pr_warning("ip_set type %s, family %s with revision min %u "
 			   "already registered!\n", type->name,
-			   family_name(type->family), type->revision);
+			   family_name(type->family), type->revision_min);
 		ret = -EINVAL;
 		goto unlock;
 	}
 	list_add_rcu(&type->list, &ip_set_type_list);
-	pr_debug("type %s, family %s, revision %u registered.\n",
-		 type->name, family_name(type->family), type->revision);
+	pr_debug("type %s, family %s, revision %u:%u registered.\n",
+		 type->name, family_name(type->family),
+		 type->revision_min, type->revision_max);
 unlock:
 	ip_set_type_unlock();
 	return ret;
@@ -189,15 +192,15 @@ void
 ip_set_type_unregister(struct ip_set_type *type)
 {
 	ip_set_type_lock();
-	if (!find_set_type(type->name, type->family, type->revision)) {
-		pr_warning("ip_set type %s, family %s, revision %u "
+	if (!find_set_type(type->name, type->family, type->revision_min)) {
+		pr_warning("ip_set type %s, family %s with revision min %u "
 			   "not registered\n", type->name,
-			   family_name(type->family), type->revision);
+			   family_name(type->family), type->revision_min);
 		goto unlock;
 	}
 	list_del_rcu(&type->list);
-	pr_debug("type %s, family %s, revision %u unregistered.\n",
-		 type->name, family_name(type->family), type->revision);
+	pr_debug("type %s, family %s with revision min %u unregistered.\n",
+		 type->name, family_name(type->family), type->revision_min);
 unlock:
 	ip_set_type_unlock();
 
@@ -656,6 +659,7 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,
 	rwlock_init(&set->lock);
 	strlcpy(set->name, name, IPSET_MAXNAMELEN);
 	set->family = family;
+	set->revision = revision;
 
 	/*
 	 * Next, check that we know the type, and take
@@ -696,7 +700,8 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,
 		    (flags & IPSET_FLAG_EXIST) &&
 		    STREQ(set->type->name, clash->type->name) &&
 		    set->type->family == clash->type->family &&
-		    set->type->revision == clash->type->revision &&
+		    set->type->revision_min == clash->type->revision_min &&
+		    set->type->revision_max == clash->type->revision_max &&
 		    set->variant->same_set(set, clash))
 			ret = 0;
 		goto cleanup;
@@ -1080,7 +1085,7 @@ dump_last:
 			NLA_PUT_U8(skb, IPSET_ATTR_FAMILY,
 				   set->family);
 			NLA_PUT_U8(skb, IPSET_ATTR_REVISION,
-				   set->type->revision);
+				   set->revision);
 			ret = set->variant->head(set, skb);
 			if (ret < 0)
 				goto release_refcount;
@@ -1385,7 +1390,7 @@ ip_set_header(struct sock *ctnl, struct sk_buff *skb,
 	NLA_PUT_STRING(skb2, IPSET_ATTR_SETNAME, set->name);
 	NLA_PUT_STRING(skb2, IPSET_ATTR_TYPENAME, set->type->name);
 	NLA_PUT_U8(skb2, IPSET_ATTR_FAMILY, set->family);
-	NLA_PUT_U8(skb2, IPSET_ATTR_REVISION, set->type->revision);
+	NLA_PUT_U8(skb2, IPSET_ATTR_REVISION, set->revision);
 	nlmsg_end(skb2, nlh2);
 
 	ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c
index c99e861ce031..c3bc06d353d3 100644
--- a/net/netfilter/ipset/ip_set_hash_ip.c
+++ b/net/netfilter/ipset/ip_set_hash_ip.c
@@ -441,7 +441,8 @@ static struct ip_set_type hash_ip_type __read_mostly = {
 	.features	= IPSET_TYPE_IP,
 	.dimension	= IPSET_DIM_ONE,
 	.family		= AF_UNSPEC,
-	.revision	= 0,
+	.revision_min	= 0,
+	.revision_max	= 0,
 	.create		= hash_ip_create,
 	.create_policy	= {
 		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c
index aa91b2c73be3..de2e351034a1 100644
--- a/net/netfilter/ipset/ip_set_hash_ipport.c
+++ b/net/netfilter/ipset/ip_set_hash_ipport.c
@@ -512,7 +512,8 @@ static struct ip_set_type hash_ipport_type __read_mostly = {
 	.features	= IPSET_TYPE_IP | IPSET_TYPE_PORT,
 	.dimension	= IPSET_DIM_TWO,
 	.family		= AF_UNSPEC,
-	.revision	= 1,
+	.revision_min	= 0,
+	.revision_max	= 1,	/* SCTP and UDPLITE support added */
 	.create		= hash_ipport_create,
 	.create_policy	= {
 		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c
index b88e74e0bf06..031ed057c811 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportip.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportip.c
@@ -530,7 +530,8 @@ static struct ip_set_type hash_ipportip_type __read_mostly = {
 	.features	= IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2,
 	.dimension	= IPSET_DIM_THREE,
 	.family		= AF_UNSPEC,
-	.revision	= 1,
+	.revision_min	= 0,
+	.revision_max	= 1,	/* SCTP and UDPLITE support added */
 	.create		= hash_ipportip_create,
 	.create_policy	= {
 		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c
index 605ef3bf94ef..0b54fdea9794 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c
@@ -595,7 +595,8 @@ static struct ip_set_type hash_ipportnet_type __read_mostly = {
 	.features	= IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2,
 	.dimension	= IPSET_DIM_THREE,
 	.family		= AF_UNSPEC,
-	.revision	= 1,
+	.revision_min	= 0,
+	.revision_max	= 1,	/* SCTP and UDPLITE support added */
 	.create		= hash_ipportnet_create,
 	.create_policy	= {
 		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c
index e6f8bc5771ca..360cf5b3ddf6 100644
--- a/net/netfilter/ipset/ip_set_hash_net.c
+++ b/net/netfilter/ipset/ip_set_hash_net.c
@@ -437,7 +437,8 @@ static struct ip_set_type hash_net_type __read_mostly = {
 	.features	= IPSET_TYPE_IP,
 	.dimension	= IPSET_DIM_ONE,
 	.family		= AF_UNSPEC,
-	.revision	= 0,
+	.revision_min	= 0,
+	.revision_max	= 0,
 	.create		= hash_net_create,
 	.create_policy	= {
 		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c
index 037b829178dc..09f807fa24ac 100644
--- a/net/netfilter/ipset/ip_set_hash_netport.c
+++ b/net/netfilter/ipset/ip_set_hash_netport.c
@@ -544,7 +544,8 @@ static struct ip_set_type hash_netport_type __read_mostly = {
 	.features	= IPSET_TYPE_IP | IPSET_TYPE_PORT,
 	.dimension	= IPSET_DIM_TWO,
 	.family		= AF_UNSPEC,
-	.revision	= 1,
+	.revision_min	= 0,
+	.revision_max	= 1,	/* SCTP and UDPLITE support added */
 	.create		= hash_netport_create,
 	.create_policy	= {
 		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
index 74f0dcc30d98..898fe68ec4a4 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -575,7 +575,8 @@ static struct ip_set_type list_set_type __read_mostly = {
 	.features	= IPSET_TYPE_NAME | IPSET_DUMP_LAST,
 	.dimension	= IPSET_DIM_ONE,
 	.family		= AF_UNSPEC,
-	.revision	= 0,
+	.revision_min	= 0,
+	.revision_max	= 0,
 	.create		= list_set_create,
 	.create_policy	= {
 		[IPSET_ATTR_SIZE]	= { .type = NLA_U32 },
-- 
cgit v1.2.3


From d0d9e0a5a8db05b2179c2ffb25d1c2850cce3c8e Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Thu, 16 Jun 2011 18:52:41 +0200
Subject: netfilter: ipset: support range for IPv4 at adding/deleting elements
 for hash:*net* types

The range internally is converted to the network(s) equal to the range.
Example:

	# ipset new test hash:net
	# ipset add test 10.2.0.0-10.2.1.12
	# ipset list test
	Name: test
	Type: hash:net
	Header: family inet hashsize 1024 maxelem 65536
	Size in memory: 16888
	References: 0
	Members:
	10.2.1.12
	10.2.1.0/29
	10.2.0.0/24
	10.2.1.8/30

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/ipset/ip_set.h       |  1 +
 include/linux/netfilter/ipset/ip_set_ahash.h |  2 +-
 include/linux/netfilter/ipset/ip_set_hash.h  |  4 ++
 include/linux/netfilter/ipset/pfxlen.h       |  3 ++
 net/netfilter/ipset/ip_set_hash_ipportnet.c  | 69 +++++++++++++++++++---------
 net/netfilter/ipset/ip_set_hash_net.c        | 51 +++++++++++++++-----
 net/netfilter/ipset/ip_set_hash_netport.c    | 69 +++++++++++++++++++---------
 net/netfilter/ipset/pfxlen.c                 | 21 +++++++++
 8 files changed, 165 insertions(+), 55 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index ac31e382472b..fd83f4f436c3 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -104,6 +104,7 @@ enum {
 	IPSET_ATTR_NAMEREF,
 	IPSET_ATTR_IP2,
 	IPSET_ATTR_CIDR2,
+	IPSET_ATTR_IP2_TO,
 	__IPSET_ATTR_ADT_MAX,
 };
 #define IPSET_ATTR_ADT_MAX	(__IPSET_ATTR_ADT_MAX - 1)
diff --git a/include/linux/netfilter/ipset/ip_set_ahash.h b/include/linux/netfilter/ipset/ip_set_ahash.h
index 8709bd950c8b..905e2ac32daf 100644
--- a/include/linux/netfilter/ipset/ip_set_ahash.h
+++ b/include/linux/netfilter/ipset/ip_set_ahash.h
@@ -353,7 +353,7 @@ retry:
 	return 0;
 }
 
-static inline void
+static void
 type_pf_data_next(struct ip_set_hash *h, const struct type_pf_elem *d);
 
 /* Add an element to a hash and update the internal counters when succeeded,
diff --git a/include/linux/netfilter/ipset/ip_set_hash.h b/include/linux/netfilter/ipset/ip_set_hash.h
index b86f15c04524..e2a9fae767f6 100644
--- a/include/linux/netfilter/ipset/ip_set_hash.h
+++ b/include/linux/netfilter/ipset/ip_set_hash.h
@@ -11,6 +11,10 @@ enum {
 	IPSET_ERR_INVALID_PROTO,
 	/* Protocol missing but must be specified */
 	IPSET_ERR_MISSING_PROTO,
+	/* Range not supported */
+	IPSET_ERR_HASH_RANGE_UNSUPPORTED,
+	/* Invalid range */
+	IPSET_ERR_HASH_RANGE,
 };
 
 #ifdef __KERNEL__
diff --git a/include/linux/netfilter/ipset/pfxlen.h b/include/linux/netfilter/ipset/pfxlen.h
index 0e1fb50da562..84efa3351e0e 100644
--- a/include/linux/netfilter/ipset/pfxlen.h
+++ b/include/linux/netfilter/ipset/pfxlen.h
@@ -3,6 +3,7 @@
 
 #include <asm/byteorder.h>
 #include <linux/netfilter.h> 
+#include <net/tcp.h>
 
 /* Prefixlen maps, by Jan Engelhardt  */
 extern const union nf_inet_addr ip_set_netmask_map[];
@@ -32,4 +33,6 @@ ip_set_hostmask6(u8 pfxlen)
 	return &ip_set_hostmask_map[pfxlen].ip6[0];
 }
 
+extern u32 ip_set_range_to_cidr(u32 from, u32 to, u8 *cidr);
+
 #endif /*_PFXLEN_H */
diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c
index 0b54fdea9794..ef068b03ec1a 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c
@@ -146,6 +146,7 @@ hash_ipportnet4_data_next(struct ip_set_hash *h,
 {
 	h->next.ip = ntohl(d->ip);
 	h->next.port = ntohs(d->port);
+	h->next.ip2 = ntohl(d->ip2);
 }
 
 static int
@@ -181,6 +182,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
 	ipset_adtfn adtfn = set->variant->adt[adt];
 	struct hash_ipportnet4_elem data = { .cidr = HOST_MASK };
 	u32 ip, ip_to, p = 0, port, port_to;
+	u32 ip2_from = 0, ip2_to, ip2_last, ip2;
 	u32 timeout = h->timeout;
 	bool with_ports = false;
 	int ret;
@@ -194,21 +196,19 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
 	if (tb[IPSET_ATTR_LINENO])
 		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
 
-	ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &data.ip);
+	ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
 	if (ret)
 		return ret;
 
-	ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP2], &data.ip2);
+	ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from);
 	if (ret)
 		return ret;
 
-	if (tb[IPSET_ATTR_CIDR2])
+	if (tb[IPSET_ATTR_CIDR2]) {
 		data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
-
-	if (!data.cidr)
-		return -IPSET_ERR_INVALID_CIDR;
-
-	data.ip2 &= ip_set_netmask(data.cidr);
+		if (!data.cidr)
+			return -IPSET_ERR_INVALID_CIDR;
+	}
 
 	if (tb[IPSET_ATTR_PORT])
 		data.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
@@ -233,14 +233,16 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
 		timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
 	}
 
+	with_ports = with_ports && tb[IPSET_ATTR_PORT_TO];
 	if (adt == IPSET_TEST ||
-	    !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] ||
-	      tb[IPSET_ATTR_PORT_TO])) {
+	    !(tb[IPSET_ATTR_CIDR] || tb[IPSET_ATTR_IP_TO] || with_ports ||
+	      tb[IPSET_ATTR_IP2_TO])) {
+		data.ip = htonl(ip);
+		data.ip2 = htonl(ip2_from & ip_set_hostmask(data.cidr));
 		ret = adtfn(set, &data, timeout, flags);
 		return ip_set_eexist(ret, flags) ? 0 : ret;
 	}
 
-	ip = ntohl(data.ip);
 	if (tb[IPSET_ATTR_IP_TO]) {
 		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
 		if (ret)
@@ -254,29 +256,48 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
 			return -IPSET_ERR_INVALID_CIDR;
 		ip &= ip_set_hostmask(cidr);
 		ip_to = ip | ~ip_set_hostmask(cidr);
-	} else
-		ip_to = ip;
+	}
 
 	port_to = port = ntohs(data.port);
-	if (with_ports && tb[IPSET_ATTR_PORT_TO]) {
+	if (tb[IPSET_ATTR_PORT_TO]) {
 		port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
 		if (port > port_to)
 			swap(port, port_to);
 	}
+	if (tb[IPSET_ATTR_IP2_TO]) {
+		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2_TO], &ip2_to);
+		if (ret)
+			return ret;
+		if (ip2_from > ip2_to)
+			swap(ip2_from, ip2_to);
+		if (ip2_from + UINT_MAX == ip2_to)
+			return -IPSET_ERR_HASH_RANGE;
+	} else {
+		ip2_from &= ip_set_hostmask(data.cidr);
+		ip2_to = ip2_from | ~ip_set_hostmask(data.cidr);
+	}
 
 	if (retried)
 		ip = h->next.ip;
 	for (; !before(ip_to, ip); ip++) {
+		data.ip = htonl(ip);
 		p = retried && ip == h->next.ip ? h->next.port : port;
 		for (; p <= port_to; p++) {
-			data.ip = htonl(ip);
 			data.port = htons(p);
-			ret = adtfn(set, &data, timeout, flags);
-
-			if (ret && !ip_set_eexist(ret, flags))
-				return ret;
-			else
-				ret = 0;
+			ip2 = retried && ip == h->next.ip && p == h->next.port
+				? h->next.ip2 : ip2_from;
+			while (!after(ip2, ip2_to)) {
+				data.ip2 = htonl(ip2);
+				ip2_last = ip_set_range_to_cidr(ip2, ip2_to,
+								&data.cidr);
+				ret = adtfn(set, &data, timeout, flags);
+
+				if (ret && !ip_set_eexist(ret, flags))
+					return ret;
+				else
+					ret = 0;
+				ip2 = ip2_last + 1;
+			}
 		}
 	}
 	return ret;
@@ -451,6 +472,8 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
 		     tb[IPSET_ATTR_IP_TO] ||
 		     tb[IPSET_ATTR_CIDR]))
 		return -IPSET_ERR_PROTOCOL;
+	if (unlikely(tb[IPSET_ATTR_IP_TO]))
+		return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
 
 	if (tb[IPSET_ATTR_LINENO])
 		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
@@ -596,7 +619,8 @@ static struct ip_set_type hash_ipportnet_type __read_mostly = {
 	.dimension	= IPSET_DIM_THREE,
 	.family		= AF_UNSPEC,
 	.revision_min	= 0,
-	.revision_max	= 1,	/* SCTP and UDPLITE support added */
+	/*		  1	   SCTP and UDPLITE support added */
+	.revision_max	= 2,	/* Range as input support for IPv4 added */
 	.create		= hash_ipportnet_create,
 	.create_policy	= {
 		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 },
@@ -609,6 +633,7 @@ static struct ip_set_type hash_ipportnet_type __read_mostly = {
 		[IPSET_ATTR_IP]		= { .type = NLA_NESTED },
 		[IPSET_ATTR_IP_TO]	= { .type = NLA_NESTED },
 		[IPSET_ATTR_IP2]	= { .type = NLA_NESTED },
+		[IPSET_ATTR_IP2_TO]	= { .type = NLA_NESTED },
 		[IPSET_ATTR_PORT]	= { .type = NLA_U16 },
 		[IPSET_ATTR_PORT_TO]	= { .type = NLA_U16 },
 		[IPSET_ATTR_CIDR]	= { .type = NLA_U8 },
diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c
index 360cf5b3ddf6..8d3c3efbbf17 100644
--- a/net/netfilter/ipset/ip_set_hash_net.c
+++ b/net/netfilter/ipset/ip_set_hash_net.c
@@ -129,6 +129,7 @@ static inline void
 hash_net4_data_next(struct ip_set_hash *h,
 		    const struct hash_net4_elem *d)
 {
+	h->next.ip = ntohl(d->ip);
 }
 
 static int
@@ -158,6 +159,7 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],
 	ipset_adtfn adtfn = set->variant->adt[adt];
 	struct hash_net4_elem data = { .cidr = HOST_MASK };
 	u32 timeout = h->timeout;
+	u32 ip = 0, ip_to, last;
 	int ret;
 
 	if (unlikely(!tb[IPSET_ATTR_IP] ||
@@ -167,27 +169,51 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],
 	if (tb[IPSET_ATTR_LINENO])
 		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
 
-	ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &data.ip);
+	ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
 	if (ret)
 		return ret;
 
-	if (tb[IPSET_ATTR_CIDR])
+	if (tb[IPSET_ATTR_CIDR]) {
 		data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
-
-	if (!data.cidr)
-		return -IPSET_ERR_INVALID_CIDR;
-
-	data.ip &= ip_set_netmask(data.cidr);
+		if (!data.cidr)
+			return -IPSET_ERR_INVALID_CIDR;
+	}
 
 	if (tb[IPSET_ATTR_TIMEOUT]) {
 		if (!with_timeout(h->timeout))
 			return -IPSET_ERR_TIMEOUT;
 		timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
 	}
+	
+	if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) {
+		data.ip = htonl(ip & ip_set_hostmask(data.cidr));
+		ret = adtfn(set, &data, timeout, flags);
+		return ip_set_eexist(ret, flags) ? 0 : ret;
+	}
 
-	ret = adtfn(set, &data, timeout, flags);
-
-	return ip_set_eexist(ret, flags) ? 0 : ret;
+	ip_to = ip;
+	if (tb[IPSET_ATTR_IP_TO]) {
+		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+		if (ret)
+			return ret;
+		if (ip_to < ip)
+			swap(ip, ip_to);
+		if (ip + UINT_MAX == ip_to)
+			return -IPSET_ERR_HASH_RANGE;
+	}
+	if (retried)
+		ip = h->next.ip;		
+	while (!after(ip, ip_to)) {
+		data.ip = htonl(ip);
+		last = ip_set_range_to_cidr(ip, ip_to, &data.cidr);
+		ret = adtfn(set, &data, timeout, flags);
+		if (ret && !ip_set_eexist(ret, flags))
+			return ret;
+		else
+			ret = 0;
+		ip = last + 1;
+	}
+	return ret;
 }
 
 static bool
@@ -334,6 +360,8 @@ hash_net6_uadt(struct ip_set *set, struct nlattr *tb[],
 	if (unlikely(!tb[IPSET_ATTR_IP] ||
 		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
 		return -IPSET_ERR_PROTOCOL;
+	if (unlikely(tb[IPSET_ATTR_IP_TO]))
+		return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
 
 	if (tb[IPSET_ATTR_LINENO])
 		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
@@ -438,7 +466,7 @@ static struct ip_set_type hash_net_type __read_mostly = {
 	.dimension	= IPSET_DIM_ONE,
 	.family		= AF_UNSPEC,
 	.revision_min	= 0,
-	.revision_max	= 0,
+	.revision_max	= 1,	/* Range as input support for IPv4 added */
 	.create		= hash_net_create,
 	.create_policy	= {
 		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 },
@@ -449,6 +477,7 @@ static struct ip_set_type hash_net_type __read_mostly = {
 	},
 	.adt_policy	= {
 		[IPSET_ATTR_IP]		= { .type = NLA_NESTED },
+		[IPSET_ATTR_IP_TO]	= { .type = NLA_NESTED },
 		[IPSET_ATTR_CIDR]	= { .type = NLA_U8 },
 		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 },
 	},
diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c
index 09f807fa24ac..300103096879 100644
--- a/net/netfilter/ipset/ip_set_hash_netport.c
+++ b/net/netfilter/ipset/ip_set_hash_netport.c
@@ -141,6 +141,7 @@ static inline void
 hash_netport4_data_next(struct ip_set_hash *h,
 			const struct hash_netport4_elem *d)
 {
+	h->next.ip = ntohl(d->ip);
 	h->next.port = ntohs(d->port);
 }
 
@@ -175,7 +176,7 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
 	const struct ip_set_hash *h = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
 	struct hash_netport4_elem data = { .cidr = HOST_MASK };
-	u32 port, port_to;
+	u32 port, port_to, p = 0, ip = 0, ip_to, last;
 	u32 timeout = h->timeout;
 	bool with_ports = false;
 	int ret;
@@ -189,15 +190,15 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
 	if (tb[IPSET_ATTR_LINENO])
 		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
 
-	ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &data.ip);
+	ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
 	if (ret)
 		return ret;
 
-	if (tb[IPSET_ATTR_CIDR])
+	if (tb[IPSET_ATTR_CIDR]) {
 		data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
-	if (!data.cidr)
-		return -IPSET_ERR_INVALID_CIDR;
-	data.ip &= ip_set_netmask(data.cidr);
+		if (!data.cidr)
+			return -IPSET_ERR_INVALID_CIDR; 
+	}
 
 	if (tb[IPSET_ATTR_PORT])
 		data.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
@@ -222,26 +223,48 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
 		timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
 	}
 
-	if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) {
+	with_ports = with_ports && tb[IPSET_ATTR_PORT_TO];
+	if (adt == IPSET_TEST || !(with_ports || tb[IPSET_ATTR_IP_TO])) {
+		data.ip = htonl(ip & ip_set_hostmask(data.cidr));
 		ret = adtfn(set, &data, timeout, flags);
 		return ip_set_eexist(ret, flags) ? 0 : ret;
 	}
 
-	port = ntohs(data.port);
-	port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
-	if (port > port_to)
-		swap(port, port_to);
+	port = port_to = ntohs(data.port);
+	if (tb[IPSET_ATTR_PORT_TO]) {
+		port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+		if (port_to < port)
+			swap(port, port_to);
+	}
+	if (tb[IPSET_ATTR_IP_TO]) {
+		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+		if (ret)
+			return ret;
+		if (ip_to < ip)
+			swap(ip, ip_to);
+		if (ip + UINT_MAX == ip_to)
+			return -IPSET_ERR_HASH_RANGE;
+	} else {
+		ip &= ip_set_hostmask(data.cidr);
+		ip_to = ip | ~ip_set_hostmask(data.cidr);
+	}
 
 	if (retried)
-		port = h->next.port;
-	for (; port <= port_to; port++) {
-		data.port = htons(port);
-		ret = adtfn(set, &data, timeout, flags);
-
-		if (ret && !ip_set_eexist(ret, flags))
-			return ret;
-		else
-			ret = 0;
+		ip = h->next.ip;
+	while (!after(ip, ip_to)) {
+		data.ip = htonl(ip);
+		last = ip_set_range_to_cidr(ip, ip_to, &data.cidr);
+		p = retried && ip == h->next.ip ? h->next.port : port;
+		for (; p <= port_to; p++) {
+			data.port = htons(p);
+			ret = adtfn(set, &data, timeout, flags);
+
+			if (ret && !ip_set_eexist(ret, flags))
+				return ret;
+			else
+				ret = 0;
+		}
+		ip = last + 1;
 	}
 	return ret;
 }
@@ -407,6 +430,8 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[],
 		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
 		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
 		return -IPSET_ERR_PROTOCOL;
+	if (unlikely(tb[IPSET_ATTR_IP_TO]))
+		return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
 
 	if (tb[IPSET_ATTR_LINENO])
 		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
@@ -545,7 +570,8 @@ static struct ip_set_type hash_netport_type __read_mostly = {
 	.dimension	= IPSET_DIM_TWO,
 	.family		= AF_UNSPEC,
 	.revision_min	= 0,
-	.revision_max	= 1,	/* SCTP and UDPLITE support added */
+	/*		  1	   SCTP and UDPLITE support added */
+	.revision_max	= 2,	/* Range as input support for IPv4 added */
 	.create		= hash_netport_create,
 	.create_policy	= {
 		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 },
@@ -557,6 +583,7 @@ static struct ip_set_type hash_netport_type __read_mostly = {
 	},
 	.adt_policy	= {
 		[IPSET_ATTR_IP]		= { .type = NLA_NESTED },
+		[IPSET_ATTR_IP_TO]	= { .type = NLA_NESTED },
 		[IPSET_ATTR_PORT]	= { .type = NLA_U16 },
 		[IPSET_ATTR_PORT_TO]	= { .type = NLA_U16 },
 		[IPSET_ATTR_PROTO]	= { .type = NLA_U8 },
diff --git a/net/netfilter/ipset/pfxlen.c b/net/netfilter/ipset/pfxlen.c
index 23f8c8162214..b57a85673de7 100644
--- a/net/netfilter/ipset/pfxlen.c
+++ b/net/netfilter/ipset/pfxlen.c
@@ -289,3 +289,24 @@ const union nf_inet_addr ip_set_hostmask_map[] = {
 	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF),
 };
 EXPORT_SYMBOL_GPL(ip_set_hostmask_map);
+
+/* Find the largest network which matches the range from left, in host order. */
+u32
+ip_set_range_to_cidr(u32 from, u32 to, u8 *cidr)
+{
+	u32 last;
+	u8 i;
+
+	for (i = 1; i < 32; i++) {
+		if ((from & ip_set_hostmask(i)) != from)
+			continue;
+		last = from | ~ip_set_hostmask(i);
+		if (!after(last, to)) {
+			*cidr = i;
+			return last;
+		}
+	}
+	*cidr = 32;
+	return from;
+}
+EXPORT_SYMBOL_GPL(ip_set_range_to_cidr);
-- 
cgit v1.2.3


From c64562eaf2ad61d7492788ef95678f52d0d28f2a Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Thu, 16 Jun 2011 18:53:51 +0200
Subject: netfilter: ipset: adding ranges to hash types with timeout could
 still fail, fixed

The patch "Fix adding ranges to hash types" had got a mistypeing
in the timeout variant of the hash types, which actually made
the patch ineffective. Fixed!

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/ipset/ip_set_ahash.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/ipset/ip_set_ahash.h b/include/linux/netfilter/ipset/ip_set_ahash.h
index 905e2ac32daf..a099d40d120a 100644
--- a/include/linux/netfilter/ipset/ip_set_ahash.h
+++ b/include/linux/netfilter/ipset/ip_set_ahash.h
@@ -835,7 +835,7 @@ type_pf_tadd(struct ip_set *set, void *value, u32 timeout, u32 flags)
 	}
 	ret = type_pf_elem_tadd(n, d, timeout);
 	if (ret != 0) {
-		if (ret == -EEXIST)
+		if (ret == -EAGAIN)
 			type_pf_data_next(h, d);
 		goto out;
 	}
-- 
cgit v1.2.3


From e6146e8684ed6dd4c0ff85ca21bf4324114fbbfa Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Thu, 16 Jun 2011 18:55:58 +0200
Subject: netfilter: ipset: use unified from/to address masking and check the
 usage

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/ipset/pfxlen.h      | 6 ++++++
 net/netfilter/ipset/ip_set_bitmap_ip.c      | 6 ++----
 net/netfilter/ipset/ip_set_bitmap_ipmac.c   | 2 +-
 net/netfilter/ipset/ip_set_hash_ip.c        | 3 +--
 net/netfilter/ipset/ip_set_hash_ipport.c    | 3 +--
 net/netfilter/ipset/ip_set_hash_ipportip.c  | 3 +--
 net/netfilter/ipset/ip_set_hash_ipportnet.c | 6 ++----
 net/netfilter/ipset/ip_set_hash_netport.c   | 3 +--
 8 files changed, 15 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/ipset/pfxlen.h b/include/linux/netfilter/ipset/pfxlen.h
index 84efa3351e0e..d55a6ccf662e 100644
--- a/include/linux/netfilter/ipset/pfxlen.h
+++ b/include/linux/netfilter/ipset/pfxlen.h
@@ -35,4 +35,10 @@ ip_set_hostmask6(u8 pfxlen)
 
 extern u32 ip_set_range_to_cidr(u32 from, u32 to, u8 *cidr);
 
+#define ip_set_mask_from_to(from, to, cidr)	\
+do {						\
+	from &= ip_set_hostmask(cidr);		\
+	to = from | ~ip_set_hostmask(cidr);	\
+} while (0)
+
 #endif /*_PFXLEN_H */
diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c
index 49323110560c..c46e34401597 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ip.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ip.c
@@ -283,8 +283,7 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[],
 
 		if (cidr > 32)
 			return -IPSET_ERR_INVALID_CIDR;
-		ip &= ip_set_hostmask(cidr);
-		ip_to = ip | ~ip_set_hostmask(cidr);
+		ip_set_mask_from_to(ip, ip_to, cidr);
 	} else
 		ip_to = ip;
 
@@ -478,8 +477,7 @@ bitmap_ip_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
 
 		if (cidr >= 32)
 			return -IPSET_ERR_INVALID_CIDR;
-		first_ip &= ip_set_hostmask(cidr);
-		last_ip = first_ip | ~ip_set_hostmask(cidr);
+		ip_set_mask_from_to(first_ip, last_ip, cidr);
 	} else
 		return -IPSET_ERR_PROTOCOL;
 
diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
index 5deb7bb37468..aa2cfa1ed474 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
@@ -578,7 +578,7 @@ bitmap_ipmac_create(struct ip_set *set, struct nlattr *tb[],
 
 		if (cidr >= 32)
 			return -IPSET_ERR_INVALID_CIDR;
-		last_ip = first_ip | ~ip_set_hostmask(cidr);
+		ip_set_mask_from_to(first_ip, last_ip, cidr);
 	} else
 		return -IPSET_ERR_PROTOCOL;
 
diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c
index c3bc06d353d3..bdb432e22a8a 100644
--- a/net/netfilter/ipset/ip_set_hash_ip.c
+++ b/net/netfilter/ipset/ip_set_hash_ip.c
@@ -177,8 +177,7 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
 
 		if (cidr > 32)
 			return -IPSET_ERR_INVALID_CIDR;
-		ip &= ip_set_hostmask(cidr);
-		ip_to = ip | ~ip_set_hostmask(cidr);
+		ip_set_mask_from_to(ip, ip_to, cidr);
 	} else
 		ip_to = ip;
 
diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c
index de2e351034a1..bdeb71605080 100644
--- a/net/netfilter/ipset/ip_set_hash_ipport.c
+++ b/net/netfilter/ipset/ip_set_hash_ipport.c
@@ -216,8 +216,7 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
 
 		if (cidr > 32)
 			return -IPSET_ERR_INVALID_CIDR;
-		ip &= ip_set_hostmask(cidr);
-		ip_to = ip | ~ip_set_hostmask(cidr);
+		ip_set_mask_from_to(ip, ip_to, cidr);
 	} else
 		ip_to = ip;
 
diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c
index 031ed057c811..fb986fc6a6f2 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportip.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportip.c
@@ -224,8 +224,7 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
 
 		if (cidr > 32)
 			return -IPSET_ERR_INVALID_CIDR;
-		ip &= ip_set_hostmask(cidr);
-		ip_to = ip | ~ip_set_hostmask(cidr);
+		ip_set_mask_from_to(ip, ip_to, cidr);
 	} else
 		ip_to = ip;
 
diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c
index ef068b03ec1a..2ed5e7581055 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c
@@ -254,8 +254,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
 
 		if (cidr > 32)
 			return -IPSET_ERR_INVALID_CIDR;
-		ip &= ip_set_hostmask(cidr);
-		ip_to = ip | ~ip_set_hostmask(cidr);
+		ip_set_mask_from_to(ip, ip_to, cidr);
 	}
 
 	port_to = port = ntohs(data.port);
@@ -273,8 +272,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
 		if (ip2_from + UINT_MAX == ip2_to)
 			return -IPSET_ERR_HASH_RANGE;
 	} else {
-		ip2_from &= ip_set_hostmask(data.cidr);
-		ip2_to = ip2_from | ~ip_set_hostmask(data.cidr);
+		ip_set_mask_from_to(ip2_from, ip2_to, data.cidr);
 	}
 
 	if (retried)
diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c
index 300103096879..90adc2c30665 100644
--- a/net/netfilter/ipset/ip_set_hash_netport.c
+++ b/net/netfilter/ipset/ip_set_hash_netport.c
@@ -245,8 +245,7 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
 		if (ip + UINT_MAX == ip_to)
 			return -IPSET_ERR_HASH_RANGE;
 	} else {
-		ip &= ip_set_hostmask(data.cidr);
-		ip_to = ip | ~ip_set_hostmask(data.cidr);
+		ip_set_mask_from_to(ip, ip_to, data.cidr);
 	}
 
 	if (retried)
-- 
cgit v1.2.3


From b66554cf03fe866b3fb7b9f40f430b8ba09f41c8 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Thu, 16 Jun 2011 18:56:47 +0200
Subject: netfilter: ipset: add xt_action_param to the variant level kadt
 functions, ipset API change

With the change the sets can use any parameter available for the match
and target extensions, like input/output interface. It's required for
the hash:net,iface set type.

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/ipset/ip_set.h       |  5 +++++
 include/linux/netfilter/ipset/ip_set_ahash.h |  1 +
 net/netfilter/ipset/ip_set_bitmap_ip.c       |  1 +
 net/netfilter/ipset/ip_set_bitmap_ipmac.c    |  1 +
 net/netfilter/ipset/ip_set_bitmap_port.c     |  1 +
 net/netfilter/ipset/ip_set_core.c            | 12 ++++++++----
 net/netfilter/ipset/ip_set_hash_ip.c         |  2 ++
 net/netfilter/ipset/ip_set_hash_ipport.c     |  2 ++
 net/netfilter/ipset/ip_set_hash_ipportip.c   |  2 ++
 net/netfilter/ipset/ip_set_hash_ipportnet.c  |  2 ++
 net/netfilter/ipset/ip_set_hash_net.c        |  2 ++
 net/netfilter/ipset/ip_set_hash_netport.c    |  2 ++
 net/netfilter/ipset/ip_set_list_set.c        |  7 ++++---
 net/netfilter/xt_set.c                       | 19 ++++++++++---------
 14 files changed, 43 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index fd83f4f436c3..8955165f7dbb 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -170,6 +170,7 @@ enum ipset_adt {
 #include <linux/ipv6.h>
 #include <linux/netlink.h>
 #include <linux/netfilter.h>
+#include <linux/netfilter/x_tables.h>
 #include <linux/vmalloc.h>
 #include <net/netlink.h>
 
@@ -238,6 +239,7 @@ struct ip_set_type_variant {
 	 *			zero for no match/success to add/delete
 	 *			positive for matching element */
 	int (*kadt)(struct ip_set *set, const struct sk_buff * skb,
+		    const struct xt_action_param *par,
 		    enum ipset_adt adt, const struct ip_set_adt_opt *opt);
 
 	/* Userspace: test/add/del entries
@@ -332,10 +334,13 @@ extern void ip_set_nfnl_put(ip_set_id_t index);
 /* API for iptables set match, and SET target */
 
 extern int ip_set_add(ip_set_id_t id, const struct sk_buff *skb,
+		      const struct xt_action_param *par,
 		      const struct ip_set_adt_opt *opt);
 extern int ip_set_del(ip_set_id_t id, const struct sk_buff *skb,
+		      const struct xt_action_param *par,
 		      const struct ip_set_adt_opt *opt);
 extern int ip_set_test(ip_set_id_t id, const struct sk_buff *skb,
+		       const struct xt_action_param *par,
 		       const struct ip_set_adt_opt *opt);
 
 /* Utility functions */
diff --git a/include/linux/netfilter/ipset/ip_set_ahash.h b/include/linux/netfilter/ipset/ip_set_ahash.h
index a099d40d120a..1c977e143885 100644
--- a/include/linux/netfilter/ipset/ip_set_ahash.h
+++ b/include/linux/netfilter/ipset/ip_set_ahash.h
@@ -599,6 +599,7 @@ nla_put_failure:
 
 static int
 type_pf_kadt(struct ip_set *set, const struct sk_buff * skb,
+	     const struct xt_action_param *par,
 	     enum ipset_adt adt, const struct ip_set_adt_opt *opt);
 static int
 type_pf_uadt(struct ip_set *set, struct nlattr *tb[],
diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c
index c46e34401597..e3e73997c3be 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ip.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ip.c
@@ -219,6 +219,7 @@ nla_put_failure:
 
 static int
 bitmap_ip_kadt(struct ip_set *set, const struct sk_buff *skb,
+	       const struct xt_action_param *par,
 	       enum ipset_adt adt, const struct ip_set_adt_opt *opt)
 {
 	struct bitmap_ip *map = set->data;
diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
index aa2cfa1ed474..51ab66435a0a 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
@@ -338,6 +338,7 @@ nla_put_failure:
 
 static int
 bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb,
+		  const struct xt_action_param *par,
 		  enum ipset_adt adt, const struct ip_set_adt_opt *opt)
 {
 	struct bitmap_ipmac *map = set->data;
diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c
index c3e906fcc22c..29ba93bb94be 100644
--- a/net/netfilter/ipset/ip_set_bitmap_port.c
+++ b/net/netfilter/ipset/ip_set_bitmap_port.c
@@ -208,6 +208,7 @@ nla_put_failure:
 
 static int
 bitmap_port_kadt(struct ip_set *set, const struct sk_buff *skb,
+		 const struct xt_action_param *par,
 		 enum ipset_adt adt, const struct ip_set_adt_opt *opt)
 {
 	struct bitmap_port *map = set->data;
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 6a82cc0c9e00..64e7b04759a7 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -21,6 +21,7 @@
 #include <net/netlink.h>
 
 #include <linux/netfilter.h>
+#include <linux/netfilter/x_tables.h>
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/ipset/ip_set.h>
 
@@ -328,6 +329,7 @@ __ip_set_put(ip_set_id_t index)
 
 int
 ip_set_test(ip_set_id_t index, const struct sk_buff *skb,
+	    const struct xt_action_param *par,
 	    const struct ip_set_adt_opt *opt)
 {
 	struct ip_set *set = ip_set_list[index];
@@ -341,14 +343,14 @@ ip_set_test(ip_set_id_t index, const struct sk_buff *skb,
 		return 0;
 
 	read_lock_bh(&set->lock);
-	ret = set->variant->kadt(set, skb, IPSET_TEST, opt);
+	ret = set->variant->kadt(set, skb, par, IPSET_TEST, opt);
 	read_unlock_bh(&set->lock);
 
 	if (ret == -EAGAIN) {
 		/* Type requests element to be completed */
 		pr_debug("element must be competed, ADD is triggered\n");
 		write_lock_bh(&set->lock);
-		set->variant->kadt(set, skb, IPSET_ADD, opt);
+		set->variant->kadt(set, skb, par, IPSET_ADD, opt);
 		write_unlock_bh(&set->lock);
 		ret = 1;
 	}
@@ -360,6 +362,7 @@ EXPORT_SYMBOL_GPL(ip_set_test);
 
 int
 ip_set_add(ip_set_id_t index, const struct sk_buff *skb,
+	   const struct xt_action_param *par,
 	   const struct ip_set_adt_opt *opt)
 {
 	struct ip_set *set = ip_set_list[index];
@@ -373,7 +376,7 @@ ip_set_add(ip_set_id_t index, const struct sk_buff *skb,
 		return 0;
 
 	write_lock_bh(&set->lock);
-	ret = set->variant->kadt(set, skb, IPSET_ADD, opt);
+	ret = set->variant->kadt(set, skb, par, IPSET_ADD, opt);
 	write_unlock_bh(&set->lock);
 
 	return ret;
@@ -382,6 +385,7 @@ EXPORT_SYMBOL_GPL(ip_set_add);
 
 int
 ip_set_del(ip_set_id_t index, const struct sk_buff *skb,
+	   const struct xt_action_param *par,
 	   const struct ip_set_adt_opt *opt)
 {
 	struct ip_set *set = ip_set_list[index];
@@ -395,7 +399,7 @@ ip_set_del(ip_set_id_t index, const struct sk_buff *skb,
 		return 0;
 
 	write_lock_bh(&set->lock);
-	ret = set->variant->kadt(set, skb, IPSET_DEL, opt);
+	ret = set->variant->kadt(set, skb, par, IPSET_DEL, opt);
 	write_unlock_bh(&set->lock);
 
 	return ret;
diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c
index bdb432e22a8a..fa80bb9b9c81 100644
--- a/net/netfilter/ipset/ip_set_hash_ip.c
+++ b/net/netfilter/ipset/ip_set_hash_ip.c
@@ -116,6 +116,7 @@ hash_ip4_data_next(struct ip_set_hash *h, const struct hash_ip4_elem *d)
 
 static int
 hash_ip4_kadt(struct ip_set *set, const struct sk_buff *skb,
+	      const struct xt_action_param *par,
 	      enum ipset_adt adt, const struct ip_set_adt_opt *opt)
 {
 	const struct ip_set_hash *h = set->data;
@@ -295,6 +296,7 @@ hash_ip6_data_next(struct ip_set_hash *h, const struct hash_ip6_elem *d)
 
 static int
 hash_ip6_kadt(struct ip_set *set, const struct sk_buff *skb,
+	      const struct xt_action_param *par,
 	      enum ipset_adt adt, const struct ip_set_adt_opt *opt)
 {
 	const struct ip_set_hash *h = set->data;
diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c
index bdeb71605080..bbf51b67b170 100644
--- a/net/netfilter/ipset/ip_set_hash_ipport.c
+++ b/net/netfilter/ipset/ip_set_hash_ipport.c
@@ -134,6 +134,7 @@ hash_ipport4_data_next(struct ip_set_hash *h,
 
 static int
 hash_ipport4_kadt(struct ip_set *set, const struct sk_buff *skb,
+		  const struct xt_action_param *par,
 		  enum ipset_adt adt, const struct ip_set_adt_opt *opt)
 {
 	const struct ip_set_hash *h = set->data;
@@ -348,6 +349,7 @@ hash_ipport6_data_next(struct ip_set_hash *h,
 
 static int
 hash_ipport6_kadt(struct ip_set *set, const struct sk_buff *skb,
+		  const struct xt_action_param *par,
 		  enum ipset_adt adt, const struct ip_set_adt_opt *opt)
 {
 	const struct ip_set_hash *h = set->data;
diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c
index fb986fc6a6f2..96525f529a54 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportip.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportip.c
@@ -137,6 +137,7 @@ hash_ipportip4_data_next(struct ip_set_hash *h,
 
 static int
 hash_ipportip4_kadt(struct ip_set *set, const struct sk_buff *skb,
+		    const struct xt_action_param *par,
 		    enum ipset_adt adt, const struct ip_set_adt_opt *opt)
 {
 	const struct ip_set_hash *h = set->data;
@@ -361,6 +362,7 @@ hash_ipportip6_data_next(struct ip_set_hash *h,
 
 static int
 hash_ipportip6_kadt(struct ip_set *set, const struct sk_buff *skb,
+		    const struct xt_action_param *par,
 		    enum ipset_adt adt, const struct ip_set_adt_opt *opt)
 {
 	const struct ip_set_hash *h = set->data;
diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c
index 2ed5e7581055..dcd351b587d5 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c
@@ -151,6 +151,7 @@ hash_ipportnet4_data_next(struct ip_set_hash *h,
 
 static int
 hash_ipportnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
+		     const struct xt_action_param *par,
 		     enum ipset_adt adt, const struct ip_set_adt_opt *opt)
 {
 	const struct ip_set_hash *h = set->data;
@@ -428,6 +429,7 @@ hash_ipportnet6_data_next(struct ip_set_hash *h,
 
 static int
 hash_ipportnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
+		     const struct xt_action_param *par,
 		     enum ipset_adt adt, const struct ip_set_adt_opt *opt)
 {
 	const struct ip_set_hash *h = set->data;
diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c
index 8d3c3efbbf17..dcbb5d4c636c 100644
--- a/net/netfilter/ipset/ip_set_hash_net.c
+++ b/net/netfilter/ipset/ip_set_hash_net.c
@@ -134,6 +134,7 @@ hash_net4_data_next(struct ip_set_hash *h,
 
 static int
 hash_net4_kadt(struct ip_set *set, const struct sk_buff *skb,
+	       const struct xt_action_param *par,
 	       enum ipset_adt adt, const struct ip_set_adt_opt *opt)
 {
 	const struct ip_set_hash *h = set->data;
@@ -330,6 +331,7 @@ hash_net6_data_next(struct ip_set_hash *h,
 
 static int
 hash_net6_kadt(struct ip_set *set, const struct sk_buff *skb,
+	       const struct xt_action_param *par,
 	       enum ipset_adt adt, const struct ip_set_adt_opt *opt)
 {
 	const struct ip_set_hash *h = set->data;
diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c
index 90adc2c30665..72961ba72a86 100644
--- a/net/netfilter/ipset/ip_set_hash_netport.c
+++ b/net/netfilter/ipset/ip_set_hash_netport.c
@@ -147,6 +147,7 @@ hash_netport4_data_next(struct ip_set_hash *h,
 
 static int
 hash_netport4_kadt(struct ip_set *set, const struct sk_buff *skb,
+		   const struct xt_action_param *par,
 		   enum ipset_adt adt, const struct ip_set_adt_opt *opt)
 {
 	const struct ip_set_hash *h = set->data;
@@ -390,6 +391,7 @@ hash_netport6_data_next(struct ip_set_hash *h,
 
 static int
 hash_netport6_kadt(struct ip_set *set, const struct sk_buff *skb,
+		   const struct xt_action_param *par,
 		   enum ipset_adt adt, const struct ip_set_adt_opt *opt)
 {
 	const struct ip_set_hash *h = set->data;
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
index 898fe68ec4a4..4d10819d462e 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -72,6 +72,7 @@ list_set_expired(const struct list_set *map, u32 id)
 
 static int
 list_set_kadt(struct ip_set *set, const struct sk_buff *skb,
+	      const struct xt_action_param *par,
 	      enum ipset_adt adt, const struct ip_set_adt_opt *opt)
 {
 	struct list_set *map = set->data;
@@ -87,17 +88,17 @@ list_set_kadt(struct ip_set *set, const struct sk_buff *skb,
 			continue;
 		switch (adt) {
 		case IPSET_TEST:
-			ret = ip_set_test(elem->id, skb, opt);
+			ret = ip_set_test(elem->id, skb, par, opt);
 			if (ret > 0)
 				return ret;
 			break;
 		case IPSET_ADD:
-			ret = ip_set_add(elem->id, skb, opt);
+			ret = ip_set_add(elem->id, skb, par, opt);
 			if (ret == 0)
 				return ret;
 			break;
 		case IPSET_DEL:
-			ret = ip_set_del(elem->id, skb, opt);
+			ret = ip_set_del(elem->id, skb, par, opt);
 			if (ret == 0)
 				return ret;
 			break;
diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c
index eb265bdfc1d0..453847f293d3 100644
--- a/net/netfilter/xt_set.c
+++ b/net/netfilter/xt_set.c
@@ -29,9 +29,10 @@ MODULE_ALIAS("ip6t_SET");
 
 static inline int
 match_set(ip_set_id_t index, const struct sk_buff *skb,
+	  const struct xt_action_param *par,
 	  const struct ip_set_adt_opt *opt, int inv)
 {
-	if (ip_set_test(index, skb, opt))
+	if (ip_set_test(index, skb, par, opt))
 		inv = !inv;
 	return inv;
 }
@@ -54,7 +55,7 @@ set_match_v0(const struct sk_buff *skb, struct xt_action_param *par)
 	ADT_OPT(opt, par->family, info->match_set.u.compat.dim,
 		info->match_set.u.compat.flags, 0, UINT_MAX);
 
-	return match_set(info->match_set.index, skb, &opt,
+	return match_set(info->match_set.index, skb, par, &opt,
 			 info->match_set.u.compat.flags & IPSET_INV_MATCH);
 }
 
@@ -118,9 +119,9 @@ set_target_v0(struct sk_buff *skb, const struct xt_action_param *par)
 		info->del_set.u.compat.flags, 0, UINT_MAX);
 
 	if (info->add_set.index != IPSET_INVALID_ID)
-		ip_set_add(info->add_set.index, skb, &add_opt);
+		ip_set_add(info->add_set.index, skb, par, &add_opt);
 	if (info->del_set.index != IPSET_INVALID_ID)
-		ip_set_del(info->del_set.index, skb, &del_opt);
+		ip_set_del(info->del_set.index, skb, par, &del_opt);
 
 	return XT_CONTINUE;
 }
@@ -188,7 +189,7 @@ set_match_v1(const struct sk_buff *skb, struct xt_action_param *par)
 	ADT_OPT(opt, par->family, info->match_set.dim,
 		info->match_set.flags, 0, UINT_MAX);
 
-	return match_set(info->match_set.index, skb, &opt,
+	return match_set(info->match_set.index, skb, par, &opt,
 			 info->match_set.flags & IPSET_INV_MATCH);
 }
 
@@ -233,9 +234,9 @@ set_target_v1(struct sk_buff *skb, const struct xt_action_param *par)
 		info->del_set.flags, 0, UINT_MAX);
 
 	if (info->add_set.index != IPSET_INVALID_ID)
-		ip_set_add(info->add_set.index, skb, &add_opt);
+		ip_set_add(info->add_set.index, skb, par, &add_opt);
 	if (info->del_set.index != IPSET_INVALID_ID)
-		ip_set_del(info->del_set.index, skb, &del_opt);
+		ip_set_del(info->del_set.index, skb, par, &del_opt);
 
 	return XT_CONTINUE;
 }
@@ -302,9 +303,9 @@ set_target_v2(struct sk_buff *skb, const struct xt_action_param *par)
 		info->del_set.flags, 0, UINT_MAX);
 
 	if (info->add_set.index != IPSET_INVALID_ID)
-		ip_set_add(info->add_set.index, skb, &add_opt);
+		ip_set_add(info->add_set.index, skb, par, &add_opt);
 	if (info->del_set.index != IPSET_INVALID_ID)
-		ip_set_del(info->del_set.index, skb, &del_opt);
+		ip_set_del(info->del_set.index, skb, par, &del_opt);
 
 	return XT_CONTINUE;
 }
-- 
cgit v1.2.3


From e385357a2f214e4d4e79c6118f1bede2572e0701 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Thu, 16 Jun 2011 19:00:48 +0200
Subject: netfilter: ipset: hash:net,iface type introduced

The hash:net,iface type makes possible to store network address and
interface name pairs in a set. It's mostly suitable for egress
and ingress filtering. Examples:

        # ipset create test hash:net,iface
        # ipset add test 192.168.0.0/16,eth0
        # ipset add test 192.168.0.0/24,eth1

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/ipset/ip_set.h       |   5 +
 include/linux/netfilter/ipset/ip_set_ahash.h |   6 +
 net/netfilter/ipset/Kconfig                  |  10 +
 net/netfilter/ipset/Makefile                 |   1 +
 net/netfilter/ipset/ip_set_hash_netiface.c   | 762 +++++++++++++++++++++++++++
 5 files changed, 784 insertions(+)
 create mode 100644 net/netfilter/ipset/ip_set_hash_netiface.c

(limited to 'include/linux')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 8955165f7dbb..e40917334d04 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -105,6 +105,7 @@ enum {
 	IPSET_ATTR_IP2,
 	IPSET_ATTR_CIDR2,
 	IPSET_ATTR_IP2_TO,
+	IPSET_ATTR_IFACE,
 	__IPSET_ATTR_ADT_MAX,
 };
 #define IPSET_ATTR_ADT_MAX	(__IPSET_ATTR_ADT_MAX - 1)
@@ -153,6 +154,8 @@ enum ipset_cmd_flags {
 enum ipset_cadt_flags {
 	IPSET_FLAG_BIT_BEFORE	= 0,
 	IPSET_FLAG_BEFORE	= (1 << IPSET_FLAG_BIT_BEFORE),
+	IPSET_FLAG_BIT_PHYSDEV	= 1,
+	IPSET_FLAG_PHYSDEV	= (1 << IPSET_FLAG_BIT_PHYSDEV),
 };
 
 /* Commands with settype-specific attributes */
@@ -212,6 +215,8 @@ enum ip_set_feature {
 	IPSET_TYPE_IP2 = (1 << IPSET_TYPE_IP2_FLAG),
 	IPSET_TYPE_NAME_FLAG = 4,
 	IPSET_TYPE_NAME = (1 << IPSET_TYPE_NAME_FLAG),
+	IPSET_TYPE_IFACE_FLAG = 5,
+	IPSET_TYPE_IFACE = (1 << IPSET_TYPE_IFACE_FLAG),
 	/* Strictly speaking not a feature, but a flag for dumping:
 	 * this settype must be dumped last */
 	IPSET_DUMP_LAST_FLAG = 7,
diff --git a/include/linux/netfilter/ipset/ip_set_ahash.h b/include/linux/netfilter/ipset/ip_set_ahash.h
index 1c977e143885..8a0999a8baa1 100644
--- a/include/linux/netfilter/ipset/ip_set_ahash.h
+++ b/include/linux/netfilter/ipset/ip_set_ahash.h
@@ -63,6 +63,9 @@ struct ip_set_hash {
 #ifdef IP_SET_HASH_WITH_NETMASK
 	u8 netmask;		/* netmask value for subnets to store */
 #endif
+#ifdef IP_SET_HASH_WITH_RBTREE
+	struct rb_root rbtree;
+#endif
 #ifdef IP_SET_HASH_WITH_NETS
 	struct ip_set_hash_nets nets[0]; /* book-keeping of prefixes */
 #endif
@@ -200,6 +203,9 @@ ip_set_hash_destroy(struct ip_set *set)
 		del_timer_sync(&h->gc);
 
 	ahash_destroy(h->table);
+#ifdef IP_SET_HASH_WITH_RBTREE
+	rbtree_destroy(&h->rbtree);
+#endif
 	kfree(h);
 
 	set->data = NULL;
diff --git a/net/netfilter/ipset/Kconfig b/net/netfilter/ipset/Kconfig
index 2c5b348eb3a8..ba36c283d837 100644
--- a/net/netfilter/ipset/Kconfig
+++ b/net/netfilter/ipset/Kconfig
@@ -109,6 +109,16 @@ config IP_SET_HASH_NETPORT
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config IP_SET_HASH_NETIFACE
+	tristate "hash:net,iface set support"
+	depends on IP_SET
+	help
+	  This option adds the hash:net,iface set type support, by which
+	  one can store IPv4/IPv6 network address/prefix and
+	  interface name pairs as elements in a set.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 config IP_SET_LIST_SET
 	tristate "list:set set support"
 	depends on IP_SET
diff --git a/net/netfilter/ipset/Makefile b/net/netfilter/ipset/Makefile
index 5adbdab67bd2..6e965ecd5444 100644
--- a/net/netfilter/ipset/Makefile
+++ b/net/netfilter/ipset/Makefile
@@ -19,6 +19,7 @@ obj-$(CONFIG_IP_SET_HASH_IPPORTIP) += ip_set_hash_ipportip.o
 obj-$(CONFIG_IP_SET_HASH_IPPORTNET) += ip_set_hash_ipportnet.o
 obj-$(CONFIG_IP_SET_HASH_NET) += ip_set_hash_net.o
 obj-$(CONFIG_IP_SET_HASH_NETPORT) += ip_set_hash_netport.o
+obj-$(CONFIG_IP_SET_HASH_NETIFACE) += ip_set_hash_netiface.o
 
 # list types
 obj-$(CONFIG_IP_SET_LIST_SET) += ip_set_list_set.o
diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c
new file mode 100644
index 000000000000..51e5df12bd00
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_netiface.c
@@ -0,0 +1,762 @@
+/* Copyright (C) 2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:net,iface type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <linux/rbtree.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_timeout.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("hash:net,iface type of IP sets");
+MODULE_ALIAS("ip_set_hash:net,iface");
+
+/* Interface name rbtree */
+
+struct iface_node {
+	struct rb_node node;
+	char iface[IFNAMSIZ];
+};
+
+#define iface_data(n)	(rb_entry(n, struct iface_node, node)->iface)
+
+static inline long
+ifname_compare(const char *_a, const char *_b)
+{
+	const long *a = (const long *)_a;
+	const long *b = (const long *)_b;
+
+	BUILD_BUG_ON(IFNAMSIZ > 4 * sizeof(unsigned long));
+	if (a[0] != b[0])
+		return a[0] - b[0];
+	if (IFNAMSIZ > sizeof(long)) {
+		if (a[1] != b[1])
+			return a[1] - b[1];
+	}
+	if (IFNAMSIZ > 2 * sizeof(long)) {
+		if (a[2] != b[2])
+			return a[2] - b[2];
+	}
+	if (IFNAMSIZ > 3 * sizeof(long)) {
+		if (a[3] != b[3])
+			return a[3] - b[3];
+	}
+	return 0;
+}
+
+static void
+rbtree_destroy(struct rb_root *root)
+{
+	struct rb_node *p, *n = root->rb_node;
+	struct iface_node *node;
+
+	/* Non-recursive destroy, like in ext3 */
+	while (n) {
+		if (n->rb_left) {
+			n = n->rb_left;
+			continue;
+		}
+		if (n->rb_right) {
+			n = n->rb_right;
+			continue;
+		}
+		p = rb_parent(n);
+		node = rb_entry(n, struct iface_node, node);
+		if (!p)
+			*root = RB_ROOT;
+		else if (p->rb_left == n)
+			p->rb_left = NULL;
+		else if (p->rb_right == n)
+			p->rb_right = NULL;
+
+		kfree(node);
+		n = p;
+	}
+}
+
+static int
+iface_test(struct rb_root *root, const char **iface)
+{
+	struct rb_node *n = root->rb_node;
+
+	while (n) {
+		const char *d = iface_data(n);
+		int res = ifname_compare(*iface, d);
+		
+		if (res < 0)
+			n = n->rb_left;
+		else if (res > 0)
+			n = n->rb_right;
+		else {
+			*iface = d;
+			return 1;
+		}
+	}
+	return 0;
+}
+
+static int
+iface_add(struct rb_root *root, const char **iface)
+{
+	struct rb_node **n = &(root->rb_node), *p = NULL;
+	struct iface_node *d;
+	
+	while (*n) {
+		char *ifname = iface_data(*n);
+		int res = ifname_compare(*iface, ifname);
+
+		p = *n;
+		if (res < 0)
+			n = &((*n)->rb_left);
+		else if (res > 0)
+			n = &((*n)->rb_right);
+		else {
+			*iface = ifname;
+			return 0;
+		}
+	}
+
+	d = kzalloc(sizeof(*d), GFP_ATOMIC);
+	if (!d)
+		return -ENOMEM;
+	strcpy(d->iface, *iface);
+
+	rb_link_node(&d->node, p, n);
+	rb_insert_color(&d->node, root);
+
+	*iface = d->iface;
+	return 0;
+}
+
+/* Type specific function prefix */
+#define TYPE		hash_netiface
+
+static bool
+hash_netiface_same_set(const struct ip_set *a, const struct ip_set *b);
+
+#define hash_netiface4_same_set	hash_netiface_same_set
+#define hash_netiface6_same_set	hash_netiface_same_set
+
+#define STREQ(a, b)	(strcmp(a, b) == 0)
+
+/* The type variant functions: IPv4 */
+
+/* Member elements without timeout */
+struct hash_netiface4_elem {
+	__be32 ip;
+	const char *iface;
+	u8 physdev;
+	u8 cidr;
+	u16 padding;
+};
+
+/* Member elements with timeout support */
+struct hash_netiface4_telem {
+	__be32 ip;
+	const char *iface;
+	u8 physdev;
+	u8 cidr;
+	u16 padding;
+	unsigned long timeout;
+};
+
+static inline bool
+hash_netiface4_data_equal(const struct hash_netiface4_elem *ip1,
+			  const struct hash_netiface4_elem *ip2)
+{
+	return ip1->ip == ip2->ip &&
+	       ip1->cidr == ip2->cidr &&
+	       ip1->physdev == ip2->physdev &&
+	       ip1->iface == ip2->iface;
+}
+
+static inline bool
+hash_netiface4_data_isnull(const struct hash_netiface4_elem *elem)
+{
+	return elem->cidr == 0;
+}
+
+static inline void
+hash_netiface4_data_copy(struct hash_netiface4_elem *dst,
+			 const struct hash_netiface4_elem *src) {
+	dst->ip = src->ip;
+	dst->cidr = src->cidr;
+	dst->physdev = src->physdev;
+	dst->iface = src->iface;
+}
+
+static inline void
+hash_netiface4_data_netmask(struct hash_netiface4_elem *elem, u8 cidr)
+{
+	elem->ip &= ip_set_netmask(cidr);
+	elem->cidr = cidr;
+}
+
+static inline void
+hash_netiface4_data_zero_out(struct hash_netiface4_elem *elem)
+{
+	elem->cidr = 0;
+}
+
+static bool
+hash_netiface4_data_list(struct sk_buff *skb,
+			 const struct hash_netiface4_elem *data)
+{
+	u32 flags = data->physdev ? IPSET_FLAG_PHYSDEV : 0;
+
+	NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, data->ip);
+	NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr);
+	NLA_PUT_STRING(skb, IPSET_ATTR_IFACE, data->iface);
+	if (flags)
+		NLA_PUT_NET32(skb, IPSET_ATTR_CADT_FLAGS, flags);
+	return 0;
+
+nla_put_failure:
+	return 1;
+}
+
+static bool
+hash_netiface4_data_tlist(struct sk_buff *skb,
+			  const struct hash_netiface4_elem *data)
+{
+	const struct hash_netiface4_telem *tdata =
+		(const struct hash_netiface4_telem *)data;
+	u32 flags = data->physdev ? IPSET_FLAG_PHYSDEV : 0;
+
+	NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, data->ip);
+	NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr);
+	NLA_PUT_STRING(skb, IPSET_ATTR_IFACE, data->iface);
+	if (flags)
+		NLA_PUT_NET32(skb, IPSET_ATTR_CADT_FLAGS, flags);
+	NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+		      htonl(ip_set_timeout_get(tdata->timeout)));
+
+	return 0;
+
+nla_put_failure:
+	return 1;
+}
+
+#define IP_SET_HASH_WITH_NETS
+#define IP_SET_HASH_WITH_RBTREE
+
+#define PF		4
+#define HOST_MASK	32
+#include <linux/netfilter/ipset/ip_set_ahash.h>
+
+static inline void
+hash_netiface4_data_next(struct ip_set_hash *h,
+			 const struct hash_netiface4_elem *d)
+{
+	h->next.ip = ntohl(d->ip);
+}
+
+static int
+hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb,
+		    const struct xt_action_param *par,
+		    enum ipset_adt adt, const struct ip_set_adt_opt *opt)
+{
+	struct ip_set_hash *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_netiface4_elem data = {
+		.cidr = h->nets[0].cidr ? h->nets[0].cidr : HOST_MASK
+	};
+	int ret;
+
+	if (data.cidr == 0)
+		return -EINVAL;
+	if (adt == IPSET_TEST)
+		data.cidr = HOST_MASK;
+
+	ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip);
+	data.ip &= ip_set_netmask(data.cidr);
+
+#define IFACE(dir)	(par->dir ? par->dir->name : NULL)
+#define PHYSDEV(dir)	(nf_bridge->dir ? nf_bridge->dir->name : NULL)
+#define SRCDIR		(opt->flags & IPSET_DIM_TWO_SRC)
+
+	if (opt->cmdflags & IPSET_FLAG_PHYSDEV) {
+#ifdef CONFIG_BRIDGE_NETFILTER
+		const struct nf_bridge_info *nf_bridge = skb->nf_bridge;
+		
+		if (!nf_bridge)
+			return -EINVAL;
+		data.iface = SRCDIR ? PHYSDEV(physindev): PHYSDEV(physoutdev);
+		data.physdev = 1;
+#else
+		data.iface = NULL;
+#endif
+	} else
+		data.iface = SRCDIR ? IFACE(in) : IFACE(out);
+
+	if (!data.iface)
+		return -EINVAL;
+	ret = iface_test(&h->rbtree, &data.iface);
+	if (adt == IPSET_ADD) {
+		if (!ret) {
+			ret = iface_add(&h->rbtree, &data.iface);
+			if (ret)
+				return ret;
+		}
+	} else if (!ret)
+		return ret;
+
+	return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags);
+}
+
+static int
+hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
+		    enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+	struct ip_set_hash *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_netiface4_elem data = { .cidr = HOST_MASK };
+	u32 ip = 0, ip_to, last;
+	u32 timeout = h->timeout;
+	char iface[IFNAMSIZ] = {};
+	int ret;
+
+	if (unlikely(!tb[IPSET_ATTR_IP] ||
+		     !tb[IPSET_ATTR_IFACE] ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
+	if (ret)
+		return ret;
+
+	if (tb[IPSET_ATTR_CIDR]) {
+		data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+		if (!data.cidr)
+			return -IPSET_ERR_INVALID_CIDR; 
+	}
+
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		if (!with_timeout(h->timeout))
+			return -IPSET_ERR_TIMEOUT;
+		timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+	}
+
+	strcpy(iface, nla_data(tb[IPSET_ATTR_IFACE])); 
+	data.iface = iface;
+	ret = iface_test(&h->rbtree, &data.iface);
+	if (adt == IPSET_ADD) {
+		if (!ret) {
+			ret = iface_add(&h->rbtree, &data.iface);
+			if (ret)
+				return ret;
+		}
+	} else if (!ret)
+		return ret;
+
+	if (tb[IPSET_ATTR_CADT_FLAGS]) {
+		u32 flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+		if (flags & IPSET_FLAG_PHYSDEV)
+			data.physdev = 1;
+	}
+
+	if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) {
+		data.ip = htonl(ip & ip_set_hostmask(data.cidr));
+		ret = adtfn(set, &data, timeout, flags);
+		return ip_set_eexist(ret, flags) ? 0 : ret;
+	}
+
+	if (tb[IPSET_ATTR_IP_TO]) {
+		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+		if (ret)
+			return ret;
+		if (ip_to < ip)
+			swap(ip, ip_to);
+		if (ip + UINT_MAX == ip_to)
+			return -IPSET_ERR_HASH_RANGE;
+	} else {
+		ip_set_mask_from_to(ip, ip_to, data.cidr);
+	}
+
+	if (retried)
+		ip = h->next.ip;
+	while (!after(ip, ip_to)) {
+		data.ip = htonl(ip);
+		last = ip_set_range_to_cidr(ip, ip_to, &data.cidr);
+		ret = adtfn(set, &data, timeout, flags);
+
+		if (ret && !ip_set_eexist(ret, flags))
+			return ret;
+		else
+			ret = 0;
+		ip = last + 1;
+	}
+	return ret;
+}
+
+static bool
+hash_netiface_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+	const struct ip_set_hash *x = a->data;
+	const struct ip_set_hash *y = b->data;
+
+	/* Resizing changes htable_bits, so we ignore it */
+	return x->maxelem == y->maxelem &&
+	       x->timeout == y->timeout;
+}
+
+/* The type variant functions: IPv6 */
+
+struct hash_netiface6_elem {
+	union nf_inet_addr ip;
+	const char *iface;
+	u8 physdev;
+	u8 cidr;
+	u16 padding;
+};
+
+struct hash_netiface6_telem {
+	union nf_inet_addr ip;
+	const char *iface;
+	u8 physdev;
+	u8 cidr;
+	u16 padding;
+	unsigned long timeout;
+};
+
+static inline bool
+hash_netiface6_data_equal(const struct hash_netiface6_elem *ip1,
+			  const struct hash_netiface6_elem *ip2)
+{
+	return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0 &&
+	       ip1->cidr == ip2->cidr &&
+	       ip1->physdev == ip2->physdev &&
+	       ip1->iface == ip2->iface;
+}
+
+static inline bool
+hash_netiface6_data_isnull(const struct hash_netiface6_elem *elem)
+{
+	return elem->cidr == 0;
+}
+
+static inline void
+hash_netiface6_data_copy(struct hash_netiface6_elem *dst,
+			 const struct hash_netiface6_elem *src)
+{
+	memcpy(dst, src, sizeof(*dst));
+}
+
+static inline void
+hash_netiface6_data_zero_out(struct hash_netiface6_elem *elem)
+{
+}
+
+static inline void
+ip6_netmask(union nf_inet_addr *ip, u8 prefix)
+{
+	ip->ip6[0] &= ip_set_netmask6(prefix)[0];
+	ip->ip6[1] &= ip_set_netmask6(prefix)[1];
+	ip->ip6[2] &= ip_set_netmask6(prefix)[2];
+	ip->ip6[3] &= ip_set_netmask6(prefix)[3];
+}
+
+static inline void
+hash_netiface6_data_netmask(struct hash_netiface6_elem *elem, u8 cidr)
+{
+	ip6_netmask(&elem->ip, cidr);
+	elem->cidr = cidr;
+}
+
+static bool
+hash_netiface6_data_list(struct sk_buff *skb,
+			 const struct hash_netiface6_elem *data)
+{
+	u32 flags = data->physdev ? IPSET_FLAG_PHYSDEV : 0;
+
+	NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &data->ip);
+	NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr);
+	NLA_PUT_STRING(skb, IPSET_ATTR_IFACE, data->iface);
+	if (flags)
+		NLA_PUT_NET32(skb, IPSET_ATTR_CADT_FLAGS, flags);
+	return 0;
+
+nla_put_failure:
+	return 1;
+}
+
+static bool
+hash_netiface6_data_tlist(struct sk_buff *skb,
+			  const struct hash_netiface6_elem *data)
+{
+	const struct hash_netiface6_telem *e =
+		(const struct hash_netiface6_telem *)data;
+	u32 flags = data->physdev ? IPSET_FLAG_PHYSDEV : 0;
+
+	NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &e->ip);
+	NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr);
+	NLA_PUT_STRING(skb, IPSET_ATTR_IFACE, data->iface);
+	if (flags)
+		NLA_PUT_NET32(skb, IPSET_ATTR_CADT_FLAGS, flags);
+	NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+		      htonl(ip_set_timeout_get(e->timeout)));
+	return 0;
+
+nla_put_failure:
+	return 1;
+}
+
+#undef PF
+#undef HOST_MASK
+
+#define PF		6
+#define HOST_MASK	128
+#include <linux/netfilter/ipset/ip_set_ahash.h>
+
+static inline void
+hash_netiface6_data_next(struct ip_set_hash *h,
+			 const struct hash_netiface6_elem *d)
+{
+}
+
+static int
+hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb,
+		    const struct xt_action_param *par,
+		    enum ipset_adt adt, const struct ip_set_adt_opt *opt)
+{
+	struct ip_set_hash *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_netiface6_elem data = {
+		.cidr = h->nets[0].cidr ? h->nets[0].cidr : HOST_MASK
+	};
+	int ret;
+
+	if (data.cidr == 0)
+		return -EINVAL;
+	if (adt == IPSET_TEST)
+		data.cidr = HOST_MASK;
+
+	ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip.in6);
+	ip6_netmask(&data.ip, data.cidr);
+
+	if (opt->cmdflags & IPSET_FLAG_PHYSDEV) {
+#ifdef CONFIG_BRIDGE_NETFILTER
+		const struct nf_bridge_info *nf_bridge = skb->nf_bridge;
+		
+		if (!nf_bridge)
+			return -EINVAL;
+		data.iface = SRCDIR ? PHYSDEV(physindev): PHYSDEV(physoutdev);
+		data.physdev = 1;
+#else
+		data.iface = NULL;
+#endif
+	} else
+		data.iface = SRCDIR ? IFACE(in) : IFACE(out);
+
+	if (!data.iface)
+		return -EINVAL;
+	ret = iface_test(&h->rbtree, &data.iface);
+	if (adt == IPSET_ADD) {
+		if (!ret) {
+			ret = iface_add(&h->rbtree, &data.iface);
+			if (ret)
+				return ret;
+		}
+	} else if (!ret)
+		return ret;
+
+	return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags);
+}
+
+static int
+hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[],
+		   enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+	struct ip_set_hash *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_netiface6_elem data = { .cidr = HOST_MASK };
+	u32 timeout = h->timeout;
+	char iface[IFNAMSIZ] = {};
+	int ret;
+
+	if (unlikely(!tb[IPSET_ATTR_IP] ||
+		     !tb[IPSET_ATTR_IFACE] ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
+		return -IPSET_ERR_PROTOCOL;
+	if (unlikely(tb[IPSET_ATTR_IP_TO]))
+		return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip);
+	if (ret)
+		return ret;
+
+	if (tb[IPSET_ATTR_CIDR])
+		data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+	if (!data.cidr)
+		return -IPSET_ERR_INVALID_CIDR;
+	ip6_netmask(&data.ip, data.cidr);
+
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		if (!with_timeout(h->timeout))
+			return -IPSET_ERR_TIMEOUT;
+		timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+	}
+
+	strcpy(iface, nla_data(tb[IPSET_ATTR_IFACE])); 
+	data.iface = iface;
+	ret = iface_test(&h->rbtree, &data.iface);
+	if (adt == IPSET_ADD) {
+		if (!ret) {
+			ret = iface_add(&h->rbtree, &data.iface);
+			if (ret)
+				return ret;
+		}
+	} else if (!ret)
+		return ret;
+
+	if (tb[IPSET_ATTR_CADT_FLAGS]) {
+		u32 flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+		if (flags & IPSET_FLAG_PHYSDEV)
+			data.physdev = 1;
+	}
+
+	ret = adtfn(set, &data, timeout, flags);
+
+	return ip_set_eexist(ret, flags) ? 0 : ret;
+}
+
+/* Create hash:ip type of sets */
+
+static int
+hash_netiface_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
+{
+	struct ip_set_hash *h;
+	u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM;
+	u8 hbits;
+
+	if (!(set->family == AF_INET || set->family == AF_INET6))
+		return -IPSET_ERR_INVALID_FAMILY;
+
+	if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_HASHSIZE]) {
+		hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]);
+		if (hashsize < IPSET_MIMINAL_HASHSIZE)
+			hashsize = IPSET_MIMINAL_HASHSIZE;
+	}
+
+	if (tb[IPSET_ATTR_MAXELEM])
+		maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]);
+
+	h = kzalloc(sizeof(*h)
+		    + sizeof(struct ip_set_hash_nets)
+		      * (set->family == AF_INET ? 32 : 128), GFP_KERNEL);
+	if (!h)
+		return -ENOMEM;
+
+	h->maxelem = maxelem;
+	get_random_bytes(&h->initval, sizeof(h->initval));
+	h->timeout = IPSET_NO_TIMEOUT;
+
+	hbits = htable_bits(hashsize);
+	h->table = ip_set_alloc(
+			sizeof(struct htable)
+			+ jhash_size(hbits) * sizeof(struct hbucket));
+	if (!h->table) {
+		kfree(h);
+		return -ENOMEM;
+	}
+	h->table->htable_bits = hbits;
+	h->rbtree = RB_ROOT;
+
+	set->data = h;
+
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+
+		set->variant = set->family == AF_INET
+			? &hash_netiface4_tvariant : &hash_netiface6_tvariant;
+
+		if (set->family == AF_INET)
+			hash_netiface4_gc_init(set);
+		else
+			hash_netiface6_gc_init(set);
+	} else {
+		set->variant = set->family == AF_INET
+			? &hash_netiface4_variant : &hash_netiface6_variant;
+	}
+
+	pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n",
+		 set->name, jhash_size(h->table->htable_bits),
+		 h->table->htable_bits, h->maxelem, set->data, h->table);
+
+	return 0;
+}
+
+static struct ip_set_type hash_netiface_type __read_mostly = {
+	.name		= "hash:net,iface",
+	.protocol	= IPSET_PROTOCOL,
+	.features	= IPSET_TYPE_IP | IPSET_TYPE_IFACE,
+	.dimension	= IPSET_DIM_TWO,
+	.family		= AF_UNSPEC,
+	.revision_min	= 0,
+	.create		= hash_netiface_create,
+	.create_policy	= {
+		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 },
+		[IPSET_ATTR_MAXELEM]	= { .type = NLA_U32 },
+		[IPSET_ATTR_PROBES]	= { .type = NLA_U8 },
+		[IPSET_ATTR_RESIZE]	= { .type = NLA_U8  },
+		[IPSET_ATTR_PROTO]	= { .type = NLA_U8 },
+		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 },
+	},
+	.adt_policy	= {
+		[IPSET_ATTR_IP]		= { .type = NLA_NESTED },
+		[IPSET_ATTR_IP_TO]	= { .type = NLA_NESTED },
+		[IPSET_ATTR_IFACE]	= { .type = NLA_NUL_STRING,
+					    .len = IPSET_MAXNAMELEN - 1 },
+		[IPSET_ATTR_CADT_FLAGS]	= { .type = NLA_U32 },
+		[IPSET_ATTR_CIDR]	= { .type = NLA_U8 },
+		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 },
+		[IPSET_ATTR_LINENO]	= { .type = NLA_U32 },
+	},
+	.me		= THIS_MODULE,
+};
+
+static int __init
+hash_netiface_init(void)
+{
+	return ip_set_type_register(&hash_netiface_type);
+}
+
+static void __exit
+hash_netiface_fini(void)
+{
+	ip_set_type_unregister(&hash_netiface_type);
+}
+
+module_init(hash_netiface_init);
+module_exit(hash_netiface_fini);
-- 
cgit v1.2.3


From 15b4d93f0316caec44e07255c1d73bde4fac12e4 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Thu, 16 Jun 2011 19:01:26 +0200
Subject: netfilter: ipset: whitespace and coding fixes detected by
 checkpatch.pl

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/ipset/ip_set.h         |  6 +++---
 include/linux/netfilter/ipset/ip_set_ahash.h   |  2 +-
 include/linux/netfilter/ipset/ip_set_timeout.h |  2 +-
 include/linux/netfilter/ipset/pfxlen.h         |  2 +-
 net/netfilter/ipset/ip_set_bitmap_ipmac.c      |  3 ++-
 net/netfilter/ipset/ip_set_core.c              |  4 ++--
 net/netfilter/ipset/ip_set_hash_net.c          |  4 ++--
 net/netfilter/ipset/ip_set_hash_netiface.c     | 26 +++++++++++++-------------
 net/netfilter/ipset/ip_set_hash_netport.c      |  2 +-
 net/netfilter/ipset/pfxlen.c                   |  2 +-
 net/netfilter/xt_set.c                         |  2 +-
 11 files changed, 28 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index e40917334d04..3540c6e262f7 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -331,7 +331,7 @@ struct ip_set {
 /* register and unregister set references */
 extern ip_set_id_t ip_set_get_byname(const char *name, struct ip_set **set);
 extern void ip_set_put_byindex(ip_set_id_t index);
-extern const char * ip_set_name_byindex(ip_set_id_t index);
+extern const char *ip_set_name_byindex(ip_set_id_t index);
 extern ip_set_id_t ip_set_nfnl_get(const char *name);
 extern ip_set_id_t ip_set_nfnl_get_byindex(ip_set_id_t index);
 extern void ip_set_nfnl_put(ip_set_id_t index);
@@ -349,7 +349,7 @@ extern int ip_set_test(ip_set_id_t id, const struct sk_buff *skb,
 		       const struct ip_set_adt_opt *opt);
 
 /* Utility functions */
-extern void * ip_set_alloc(size_t size);
+extern void *ip_set_alloc(size_t size);
 extern void ip_set_free(void *members);
 extern int ip_set_get_ipaddr4(struct nlattr *nla,  __be32 *ipaddr);
 extern int ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr);
@@ -359,7 +359,7 @@ ip_set_get_hostipaddr4(struct nlattr *nla, u32 *ipaddr)
 {
 	__be32 ip;
 	int ret = ip_set_get_ipaddr4(nla, &ip);
-	
+
 	if (ret)
 		return ret;
 	*ipaddr = ntohl(ip);
diff --git a/include/linux/netfilter/ipset/ip_set_ahash.h b/include/linux/netfilter/ipset/ip_set_ahash.h
index 8a0999a8baa1..c5b06aaa205c 100644
--- a/include/linux/netfilter/ipset/ip_set_ahash.h
+++ b/include/linux/netfilter/ipset/ip_set_ahash.h
@@ -43,7 +43,7 @@ struct htable {
 	struct hbucket bucket[0]; /* hashtable buckets */
 };
 
-#define hbucket(h, i)		&((h)->bucket[i])
+#define hbucket(h, i)		(&((h)->bucket[i]))
 
 /* Book-keeping of the prefixes added to the set */
 struct ip_set_hash_nets {
diff --git a/include/linux/netfilter/ipset/ip_set_timeout.h b/include/linux/netfilter/ipset/ip_set_timeout.h
index bae086af0e44..47923205a4ad 100644
--- a/include/linux/netfilter/ipset/ip_set_timeout.h
+++ b/include/linux/netfilter/ipset/ip_set_timeout.h
@@ -78,7 +78,7 @@ ip_set_timeout_set(u32 timeout)
 static inline u32
 ip_set_timeout_get(unsigned long timeout)
 {
-	return timeout == IPSET_ELEM_PERMANENT ? 0 : 
+	return timeout == IPSET_ELEM_PERMANENT ? 0 :
 		jiffies_to_msecs(timeout - jiffies)/1000;
 }
 
diff --git a/include/linux/netfilter/ipset/pfxlen.h b/include/linux/netfilter/ipset/pfxlen.h
index d55a6ccf662e..199fd11fedc0 100644
--- a/include/linux/netfilter/ipset/pfxlen.h
+++ b/include/linux/netfilter/ipset/pfxlen.h
@@ -2,7 +2,7 @@
 #define _PFXLEN_H
 
 #include <asm/byteorder.h>
-#include <linux/netfilter.h> 
+#include <linux/netfilter.h>
 #include <net/tcp.h>
 
 /* Prefixlen maps, by Jan Engelhardt  */
diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
index 51ab66435a0a..56096f544978 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
@@ -635,7 +635,8 @@ static struct ip_set_type bitmap_ipmac_type = {
 	},
 	.adt_policy	= {
 		[IPSET_ATTR_IP]		= { .type = NLA_NESTED },
-		[IPSET_ATTR_ETHER]	= { .type = NLA_BINARY, .len  = ETH_ALEN },
+		[IPSET_ATTR_ETHER]	= { .type = NLA_BINARY,
+					    .len  = ETH_ALEN },
 		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 },
 		[IPSET_ATTR_LINENO]	= { .type = NLA_U32 },
 	},
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 80a1262104bf..c012985a5a26 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -683,8 +683,8 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,
 	if (attr[IPSET_ATTR_DATA] &&
 	    nla_parse_nested(tb, IPSET_ATTR_CREATE_MAX, attr[IPSET_ATTR_DATA],
 			     set->type->create_policy)) {
-	    	ret = -IPSET_ERR_PROTOCOL;
-	    	goto put_out;
+		ret = -IPSET_ERR_PROTOCOL;
+		goto put_out;
 	}
 
 	ret = set->type->create(set, tb, flags);
diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c
index 050163fb6094..2d4b1f48e8c9 100644
--- a/net/netfilter/ipset/ip_set_hash_net.c
+++ b/net/netfilter/ipset/ip_set_hash_net.c
@@ -187,7 +187,7 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],
 			return -IPSET_ERR_TIMEOUT;
 		timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
 	}
-	
+
 	if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) {
 		data.ip = htonl(ip & ip_set_hostmask(data.cidr));
 		ret = adtfn(set, &data, timeout, flags);
@@ -205,7 +205,7 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],
 			return -IPSET_ERR_HASH_RANGE;
 	}
 	if (retried)
-		ip = h->next.ip;		
+		ip = h->next.ip;
 	while (!after(ip, ip_to)) {
 		data.ip = htonl(ip);
 		last = ip_set_range_to_cidr(ip, ip_to, &data.cidr);
diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c
index 51e5df12bd00..3d6c53b6211a 100644
--- a/net/netfilter/ipset/ip_set_hash_netiface.c
+++ b/net/netfilter/ipset/ip_set_hash_netiface.c
@@ -100,7 +100,7 @@ iface_test(struct rb_root *root, const char **iface)
 	while (n) {
 		const char *d = iface_data(n);
 		int res = ifname_compare(*iface, d);
-		
+
 		if (res < 0)
 			n = n->rb_left;
 		else if (res > 0)
@@ -118,7 +118,7 @@ iface_add(struct rb_root *root, const char **iface)
 {
 	struct rb_node **n = &(root->rb_node), *p = NULL;
 	struct iface_node *d;
-	
+
 	while (*n) {
 		char *ifname = iface_data(*n);
 		int res = ifname_compare(*iface, ifname);
@@ -296,10 +296,10 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb,
 	if (opt->cmdflags & IPSET_FLAG_PHYSDEV) {
 #ifdef CONFIG_BRIDGE_NETFILTER
 		const struct nf_bridge_info *nf_bridge = skb->nf_bridge;
-		
+
 		if (!nf_bridge)
 			return -EINVAL;
-		data.iface = SRCDIR ? PHYSDEV(physindev): PHYSDEV(physoutdev);
+		data.iface = SRCDIR ? PHYSDEV(physindev) : PHYSDEV(physoutdev);
 		data.physdev = 1;
 #else
 		data.iface = NULL;
@@ -350,7 +350,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
 	if (tb[IPSET_ATTR_CIDR]) {
 		data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
 		if (!data.cidr)
-			return -IPSET_ERR_INVALID_CIDR; 
+			return -IPSET_ERR_INVALID_CIDR;
 	}
 
 	if (tb[IPSET_ATTR_TIMEOUT]) {
@@ -359,7 +359,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
 		timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
 	}
 
-	strcpy(iface, nla_data(tb[IPSET_ATTR_IFACE])); 
+	strcpy(iface, nla_data(tb[IPSET_ATTR_IFACE]));
 	data.iface = iface;
 	ret = iface_test(&h->rbtree, &data.iface);
 	if (adt == IPSET_ADD) {
@@ -372,8 +372,8 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
 		return ret;
 
 	if (tb[IPSET_ATTR_CADT_FLAGS]) {
-		u32 flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
-		if (flags & IPSET_FLAG_PHYSDEV)
+		u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+		if (cadt_flags & IPSET_FLAG_PHYSDEV)
 			data.physdev = 1;
 	}
 
@@ -559,10 +559,10 @@ hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb,
 	if (opt->cmdflags & IPSET_FLAG_PHYSDEV) {
 #ifdef CONFIG_BRIDGE_NETFILTER
 		const struct nf_bridge_info *nf_bridge = skb->nf_bridge;
-		
+
 		if (!nf_bridge)
 			return -EINVAL;
-		data.iface = SRCDIR ? PHYSDEV(physindev): PHYSDEV(physoutdev);
+		data.iface = SRCDIR ? PHYSDEV(physindev) : PHYSDEV(physoutdev);
 		data.physdev = 1;
 #else
 		data.iface = NULL;
@@ -623,7 +623,7 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[],
 		timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
 	}
 
-	strcpy(iface, nla_data(tb[IPSET_ATTR_IFACE])); 
+	strcpy(iface, nla_data(tb[IPSET_ATTR_IFACE]));
 	data.iface = iface;
 	ret = iface_test(&h->rbtree, &data.iface);
 	if (adt == IPSET_ADD) {
@@ -636,8 +636,8 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[],
 		return ret;
 
 	if (tb[IPSET_ATTR_CADT_FLAGS]) {
-		u32 flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
-		if (flags & IPSET_FLAG_PHYSDEV)
+		u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+		if (cadt_flags & IPSET_FLAG_PHYSDEV)
 			data.physdev = 1;
 	}
 
diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c
index d7710a9fb7c7..fe203d12f56b 100644
--- a/net/netfilter/ipset/ip_set_hash_netport.c
+++ b/net/netfilter/ipset/ip_set_hash_netport.c
@@ -199,7 +199,7 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
 	if (tb[IPSET_ATTR_CIDR]) {
 		data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
 		if (!data.cidr)
-			return -IPSET_ERR_INVALID_CIDR; 
+			return -IPSET_ERR_INVALID_CIDR;
 	}
 
 	if (tb[IPSET_ATTR_PORT])
diff --git a/net/netfilter/ipset/pfxlen.c b/net/netfilter/ipset/pfxlen.c
index b57a85673de7..bd13d66220f1 100644
--- a/net/netfilter/ipset/pfxlen.c
+++ b/net/netfilter/ipset/pfxlen.c
@@ -148,7 +148,7 @@ const union nf_inet_addr ip_set_netmask_map[] = {
 EXPORT_SYMBOL_GPL(ip_set_netmask_map);
 
 #undef  E
-#define E(a, b, c, d) 						\
+#define E(a, b, c, d)						\
 	{.ip6 = { (__force __be32) a, (__force __be32) b,	\
 		  (__force __be32) c, (__force __be32) d,	\
 	} }
diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c
index 453847f293d3..19461c462dbd 100644
--- a/net/netfilter/xt_set.c
+++ b/net/netfilter/xt_set.c
@@ -37,7 +37,7 @@ match_set(ip_set_id_t index, const struct sk_buff *skb,
 	return inv;
 }
 
-#define ADT_OPT(n, f, d, fs, cfs, t) 	\
+#define ADT_OPT(n, f, d, fs, cfs, t)	\
 const struct ip_set_adt_opt n = {	\
 	.family	= f,			\
 	.dim = d,			\
-- 
cgit v1.2.3


From c16d51a32bbb61ac8fd96f78b5ce2fccfe0fb4c3 Mon Sep 17 00:00:00 2001
From: Shreshtha Kumar Sahu <shreshthakumar.sahu@stericsson.com>
Date: Mon, 13 Jun 2011 10:11:33 +0200
Subject: amba pl011: workaround for uart registers lockup

This workaround aims to break the deadlock situation
which raises during continuous transfer of data for long
duration over uart with hardware flow control. It is
observed that CTS interrupt cannot be cleared in uart
interrupt register (ICR). Hence further transfer over
uart gets blocked.

It is seen that during such deadlock condition ICR
don't get cleared even on multiple write. This leads
pass_counter to decrease and finally reach zero. This
can be taken as trigger point to run this UART_BT_WA.

Workaround backups the register configuration, does soft
reset of UART using BIT-0 of PRCC_K_SOFTRST_SET/CLEAR
registers and restores the registers.

This patch also provides support for uart init and exit
function calls if present.

Signed-off-by: Shreshtha Kumar Sahu <shreshthakumar.sahu@stericsson.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/tty/serial/amba-pl011.c | 123 +++++++++++++++++++++++++++++++++++++++-
 include/linux/amba/serial.h     |   3 +
 2 files changed, 125 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c
index 8dc0541feecc..f5f6831b0a64 100644
--- a/drivers/tty/serial/amba-pl011.c
+++ b/drivers/tty/serial/amba-pl011.c
@@ -50,6 +50,7 @@
 #include <linux/dmaengine.h>
 #include <linux/dma-mapping.h>
 #include <linux/scatterlist.h>
+#include <linux/delay.h>
 
 #include <asm/io.h>
 #include <asm/sizes.h>
@@ -65,6 +66,30 @@
 #define UART_DR_ERROR		(UART011_DR_OE|UART011_DR_BE|UART011_DR_PE|UART011_DR_FE)
 #define UART_DUMMY_DR_RX	(1 << 16)
 
+
+#define UART_WA_SAVE_NR 14
+
+static void pl011_lockup_wa(unsigned long data);
+static const u32 uart_wa_reg[UART_WA_SAVE_NR] = {
+	ST_UART011_DMAWM,
+	ST_UART011_TIMEOUT,
+	ST_UART011_LCRH_RX,
+	UART011_IBRD,
+	UART011_FBRD,
+	ST_UART011_LCRH_TX,
+	UART011_IFLS,
+	ST_UART011_XFCR,
+	ST_UART011_XON1,
+	ST_UART011_XON2,
+	ST_UART011_XOFF1,
+	ST_UART011_XOFF2,
+	UART011_CR,
+	UART011_IMSC
+};
+
+static u32 uart_wa_regdata[UART_WA_SAVE_NR];
+static DECLARE_TASKLET(pl011_lockup_tlet, pl011_lockup_wa, 0);
+
 /* There is by now at least one vendor with differing details, so handle it */
 struct vendor_data {
 	unsigned int		ifls;
@@ -72,6 +97,7 @@ struct vendor_data {
 	unsigned int		lcrh_tx;
 	unsigned int		lcrh_rx;
 	bool			oversampling;
+	bool			interrupt_may_hang;   /* vendor-specific */
 	bool			dma_threshold;
 };
 
@@ -90,9 +116,12 @@ static struct vendor_data vendor_st = {
 	.lcrh_tx		= ST_UART011_LCRH_TX,
 	.lcrh_rx		= ST_UART011_LCRH_RX,
 	.oversampling		= true,
+	.interrupt_may_hang	= true,
 	.dma_threshold		= true,
 };
 
+static struct uart_amba_port *amba_ports[UART_NR];
+
 /* Deals with DMA transactions */
 
 struct pl011_sgbuf {
@@ -132,6 +161,7 @@ struct uart_amba_port {
 	unsigned int		lcrh_rx;	/* vendor-specific */
 	bool			autorts;
 	char			type[12];
+	bool			interrupt_may_hang; /* vendor-specific */
 #ifdef CONFIG_DMA_ENGINE
 	/* DMA stuff */
 	bool			using_tx_dma;
@@ -1008,6 +1038,68 @@ static inline bool pl011_dma_rx_running(struct uart_amba_port *uap)
 #endif
 
 
+/*
+ * pl011_lockup_wa
+ * This workaround aims to break the deadlock situation
+ * when after long transfer over uart in hardware flow
+ * control, uart interrupt registers cannot be cleared.
+ * Hence uart transfer gets blocked.
+ *
+ * It is seen that during such deadlock condition ICR
+ * don't get cleared even on multiple write. This leads
+ * pass_counter to decrease and finally reach zero. This
+ * can be taken as trigger point to run this UART_BT_WA.
+ *
+ */
+static void pl011_lockup_wa(unsigned long data)
+{
+	struct uart_amba_port *uap = amba_ports[0];
+	void __iomem *base = uap->port.membase;
+	struct circ_buf *xmit = &uap->port.state->xmit;
+	struct tty_struct *tty = uap->port.state->port.tty;
+	int buf_empty_retries = 200;
+	int loop;
+
+	/* Stop HCI layer from submitting data for tx */
+	tty->hw_stopped = 1;
+	while (!uart_circ_empty(xmit)) {
+		if (buf_empty_retries-- == 0)
+			break;
+		udelay(100);
+	}
+
+	/* Backup registers */
+	for (loop = 0; loop < UART_WA_SAVE_NR; loop++)
+		uart_wa_regdata[loop] = readl(base + uart_wa_reg[loop]);
+
+	/* Disable UART so that FIFO data is flushed out */
+	writew(0x00, uap->port.membase + UART011_CR);
+
+	/* Soft reset UART module */
+	if (uap->port.dev->platform_data) {
+		struct amba_pl011_data *plat;
+
+		plat = uap->port.dev->platform_data;
+		if (plat->reset)
+			plat->reset();
+	}
+
+	/* Restore registers */
+	for (loop = 0; loop < UART_WA_SAVE_NR; loop++)
+		writew(uart_wa_regdata[loop] ,
+				uap->port.membase + uart_wa_reg[loop]);
+
+	/* Initialise the old status of the modem signals */
+	uap->old_status = readw(uap->port.membase + UART01x_FR) &
+		UART01x_FR_MODEM_ANY;
+
+	if (readl(base + UART011_MIS) & 0x2)
+		printk(KERN_EMERG "UART_BT_WA: ***FAILED***\n");
+
+	/* Start Tx/Rx */
+	tty->hw_stopped = 0;
+}
+
 static void pl011_stop_tx(struct uart_port *port)
 {
 	struct uart_amba_port *uap = (struct uart_amba_port *)port;
@@ -1158,8 +1250,11 @@ static irqreturn_t pl011_int(int irq, void *dev_id)
 			if (status & UART011_TXIS)
 				pl011_tx_chars(uap);
 
-			if (pass_counter-- == 0)
+			if (pass_counter-- == 0) {
+				if (uap->interrupt_may_hang)
+					tasklet_schedule(&pl011_lockup_tlet);
 				break;
+			}
 
 			status = readw(uap->port.membase + UART011_MIS);
 		} while (status != 0);
@@ -1339,6 +1434,14 @@ static int pl011_startup(struct uart_port *port)
 	writew(uap->im, uap->port.membase + UART011_IMSC);
 	spin_unlock_irq(&uap->port.lock);
 
+	if (uap->port.dev->platform_data) {
+		struct amba_pl011_data *plat;
+
+		plat = uap->port.dev->platform_data;
+		if (plat->init)
+			plat->init();
+	}
+
 	return 0;
 
  clk_dis:
@@ -1394,6 +1497,15 @@ static void pl011_shutdown(struct uart_port *port)
 	 * Shut down the clock producer
 	 */
 	clk_disable(uap->clk);
+
+	if (uap->port.dev->platform_data) {
+		struct amba_pl011_data *plat;
+
+		plat = uap->port.dev->platform_data;
+		if (plat->exit)
+			plat->exit();
+	}
+
 }
 
 static void
@@ -1700,6 +1812,14 @@ static int __init pl011_console_setup(struct console *co, char *options)
 	if (!uap)
 		return -ENODEV;
 
+	if (uap->port.dev->platform_data) {
+		struct amba_pl011_data *plat;
+
+		plat = uap->port.dev->platform_data;
+		if (plat->init)
+			plat->init();
+	}
+
 	uap->port.uartclk = clk_get_rate(uap->clk);
 
 	if (options)
@@ -1774,6 +1894,7 @@ static int pl011_probe(struct amba_device *dev, const struct amba_id *id)
 	uap->lcrh_rx = vendor->lcrh_rx;
 	uap->lcrh_tx = vendor->lcrh_tx;
 	uap->fifosize = vendor->fifosize;
+	uap->interrupt_may_hang = vendor->interrupt_may_hang;
 	uap->port.dev = &dev->dev;
 	uap->port.mapbase = dev->res.start;
 	uap->port.membase = base;
diff --git a/include/linux/amba/serial.h b/include/linux/amba/serial.h
index 5479fdc849e9..514ed45c462e 100644
--- a/include/linux/amba/serial.h
+++ b/include/linux/amba/serial.h
@@ -201,6 +201,9 @@ struct amba_pl011_data {
 	bool (*dma_filter)(struct dma_chan *chan, void *filter_param);
 	void *dma_rx_param;
 	void *dma_tx_param;
+        void (*init) (void);
+	void (*exit) (void);
+	void (*reset) (void);
 };
 #endif
 
-- 
cgit v1.2.3


From 73ddff2bee159ffb580bd24faf625cd5e628f5ec Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 Jun 2011 11:20:14 +0200
Subject: job control: introduce JOBCTL_TRAP_STOP and use it for group stop
 trap

do_signal_stop() implemented both normal group stop and trap for group
stop while ptraced.  This approach has been enough but scheduled
changes require trap mechanism which can be used in more generic
manner and using group stop trap for generic trap site simplifies both
userland visible interface and implementation.

This patch adds a new jobctl flag - JOBCTL_TRAP_STOP.  When set, it
triggers a trap site, which behaves like group stop trap, in
get_signal_to_deliver() after checking for pending signals.  While
ptraced, do_signal_stop() doesn't stop itself.  It initiates group
stop if requested and schedules JOBCTL_TRAP_STOP and returns.  The
caller - get_signal_to_deliver() - is responsible for checking whether
TRAP_STOP is pending afterwards and handling it.

ptrace_attach() is updated to use JOBCTL_TRAP_STOP instead of
JOBCTL_STOP_PENDING and __ptrace_unlink() to clear all pending trap
bits and TRAPPING so that TRAP_STOP and future trap bits don't linger
after detach.

While at it, add proper function comment to do_signal_stop() and make
it return bool.

-v2: __ptrace_unlink() updated to clear JOBCTL_TRAP_MASK and TRAPPING
     instead of JOBCTL_PENDING_MASK.  This avoids accidentally
     clearing JOBCTL_STOP_CONSUME.  Spotted by Oleg.

-v3: do_signal_stop() updated to return %false without dropping
     siglock while ptraced and TRAP_STOP check moved inside for(;;)
     loop after group stop participation.  This avoids unnecessary
     relocking and also will help avoiding unnecessary traps by
     consuming group stop before handling pending traps.

-v4: Jobctl trap handling moved into a separate function -
     do_jobctl_trap().

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
---
 include/linux/sched.h |  6 +++-
 kernel/ptrace.c       | 12 +++++--
 kernel/signal.c       | 90 ++++++++++++++++++++++++++++++++++-----------------
 3 files changed, 75 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5157bd9eee37..8bd84b83a35b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1810,17 +1810,21 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
 #define JOBCTL_STOP_DEQUEUED_BIT 16	/* stop signal dequeued */
 #define JOBCTL_STOP_PENDING_BIT	17	/* task should stop for group stop */
 #define JOBCTL_STOP_CONSUME_BIT	18	/* consume group stop count */
+#define JOBCTL_TRAP_STOP_BIT	19	/* trap for STOP */
 #define JOBCTL_TRAPPING_BIT	21	/* switching to TRACED */
 
 #define JOBCTL_STOP_DEQUEUED	(1 << JOBCTL_STOP_DEQUEUED_BIT)
 #define JOBCTL_STOP_PENDING	(1 << JOBCTL_STOP_PENDING_BIT)
 #define JOBCTL_STOP_CONSUME	(1 << JOBCTL_STOP_CONSUME_BIT)
+#define JOBCTL_TRAP_STOP	(1 << JOBCTL_TRAP_STOP_BIT)
 #define JOBCTL_TRAPPING		(1 << JOBCTL_TRAPPING_BIT)
 
-#define JOBCTL_PENDING_MASK	JOBCTL_STOP_PENDING
+#define JOBCTL_TRAP_MASK	JOBCTL_TRAP_STOP
+#define JOBCTL_PENDING_MASK	(JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK)
 
 extern bool task_set_jobctl_pending(struct task_struct *task,
 				    unsigned int mask);
+extern void task_clear_jobctl_trapping(struct task_struct *task);
 extern void task_clear_jobctl_pending(struct task_struct *task,
 				      unsigned int mask);
 
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 7f05f3a1267b..45a8a4c5d8b2 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -82,6 +82,13 @@ void __ptrace_unlink(struct task_struct *child)
 
 	spin_lock(&child->sighand->siglock);
 
+	/*
+	 * Clear all pending traps and TRAPPING.  TRAPPING should be
+	 * cleared regardless of JOBCTL_STOP_PENDING.  Do it explicitly.
+	 */
+	task_clear_jobctl_pending(child, JOBCTL_TRAP_MASK);
+	task_clear_jobctl_trapping(child);
+
 	/*
 	 * Reinstate JOBCTL_STOP_PENDING if group stop is in effect and
 	 * @child isn't dead.
@@ -246,7 +253,7 @@ static int ptrace_attach(struct task_struct *task)
 	spin_lock(&task->sighand->siglock);
 
 	/*
-	 * If the task is already STOPPED, set JOBCTL_STOP_PENDING and
+	 * If the task is already STOPPED, set JOBCTL_TRAP_STOP and
 	 * TRAPPING, and kick it so that it transits to TRACED.  TRAPPING
 	 * will be cleared if the child completes the transition or any
 	 * event which clears the group stop states happens.  We'll wait
@@ -263,8 +270,7 @@ static int ptrace_attach(struct task_struct *task)
 	 * in and out of STOPPED are protected by siglock.
 	 */
 	if (task_is_stopped(task) &&
-	    task_set_jobctl_pending(task,
-				    JOBCTL_STOP_PENDING | JOBCTL_TRAPPING))
+	    task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING))
 		signal_wake_up(task, 1);
 
 	spin_unlock(&task->sighand->siglock);
diff --git a/kernel/signal.c b/kernel/signal.c
index c99b8b5c0be7..b5f55ca1f43f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -266,7 +266,7 @@ bool task_set_jobctl_pending(struct task_struct *task, unsigned int mask)
  * CONTEXT:
  * Must be called with @task->sighand->siglock held.
  */
-static void task_clear_jobctl_trapping(struct task_struct *task)
+void task_clear_jobctl_trapping(struct task_struct *task)
 {
 	if (unlikely(task->jobctl & JOBCTL_TRAPPING)) {
 		task->jobctl &= ~JOBCTL_TRAPPING;
@@ -1790,13 +1790,16 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
 	/*
 	 * If @why is CLD_STOPPED, we're trapping to participate in a group
 	 * stop.  Do the bookkeeping.  Note that if SIGCONT was delievered
-	 * while siglock was released for the arch hook, PENDING could be
-	 * clear now.  We act as if SIGCONT is received after TASK_TRACED
-	 * is entered - ignore it.
+	 * across siglock relocks since INTERRUPT was scheduled, PENDING
+	 * could be clear now.  We act as if SIGCONT is received after
+	 * TASK_TRACED is entered - ignore it.
 	 */
 	if (why == CLD_STOPPED && (current->jobctl & JOBCTL_STOP_PENDING))
 		gstop_done = task_participate_group_stop(current);
 
+	/* any trap clears pending STOP trap */
+	task_clear_jobctl_pending(current, JOBCTL_TRAP_STOP);
+
 	/* entering a trap, clear TRAPPING */
 	task_clear_jobctl_trapping(current);
 
@@ -1888,13 +1891,30 @@ void ptrace_notify(int exit_code)
 	spin_unlock_irq(&current->sighand->siglock);
 }
 
-/*
- * This performs the stopping for SIGSTOP and other stop signals.
- * We have to stop all threads in the thread group.
- * Returns non-zero if we've actually stopped and released the siglock.
- * Returns zero if we didn't stop and still hold the siglock.
+/**
+ * do_signal_stop - handle group stop for SIGSTOP and other stop signals
+ * @signr: signr causing group stop if initiating
+ *
+ * If %JOBCTL_STOP_PENDING is not set yet, initiate group stop with @signr
+ * and participate in it.  If already set, participate in the existing
+ * group stop.  If participated in a group stop (and thus slept), %true is
+ * returned with siglock released.
+ *
+ * If ptraced, this function doesn't handle stop itself.  Instead,
+ * %JOBCTL_TRAP_STOP is scheduled and %false is returned with siglock
+ * untouched.  The caller must ensure that INTERRUPT trap handling takes
+ * places afterwards.
+ *
+ * CONTEXT:
+ * Must be called with @current->sighand->siglock held, which is released
+ * on %true return.
+ *
+ * RETURNS:
+ * %false if group stop is already cancelled or ptrace trap is scheduled.
+ * %true if participated in group stop.
  */
-static int do_signal_stop(int signr)
+static bool do_signal_stop(int signr)
+	__releases(&current->sighand->siglock)
 {
 	struct signal_struct *sig = current->signal;
 
@@ -1907,7 +1927,7 @@ static int do_signal_stop(int signr)
 
 		if (!likely(current->jobctl & JOBCTL_STOP_DEQUEUED) ||
 		    unlikely(signal_group_exit(sig)))
-			return 0;
+			return false;
 		/*
 		 * There is no group stop already in progress.  We must
 		 * initiate one now.
@@ -1951,7 +1971,7 @@ static int do_signal_stop(int signr)
 			}
 		}
 	}
-retry:
+
 	if (likely(!task_ptrace(current))) {
 		int notify = 0;
 
@@ -1983,27 +2003,33 @@ retry:
 
 		/* Now we don't run again until woken by SIGCONT or SIGKILL */
 		schedule();
-
-		spin_lock_irq(&current->sighand->siglock);
+		return true;
 	} else {
-		ptrace_stop(current->jobctl & JOBCTL_STOP_SIGMASK,
-			    CLD_STOPPED, 0, NULL);
-		current->exit_code = 0;
-	}
-
-	/*
-	 * JOBCTL_STOP_PENDING could be set if another group stop has
-	 * started since being woken up or ptrace wants us to transit
-	 * between TASK_STOPPED and TRACED.  Retry group stop.
-	 */
-	if (current->jobctl & JOBCTL_STOP_PENDING) {
-		WARN_ON_ONCE(!(current->jobctl & JOBCTL_STOP_SIGMASK));
-		goto retry;
+		/*
+		 * While ptraced, group stop is handled by STOP trap.
+		 * Schedule it and let the caller deal with it.
+		 */
+		task_set_jobctl_pending(current, JOBCTL_TRAP_STOP);
+		return false;
 	}
+}
 
-	spin_unlock_irq(&current->sighand->siglock);
+/**
+ * do_jobctl_trap - take care of ptrace jobctl traps
+ *
+ * It is currently used only to trap for group stop while ptraced.
+ *
+ * CONTEXT:
+ * Must be called with @current->sighand->siglock held, which may be
+ * released and re-acquired before returning with intervening sleep.
+ */
+static void do_jobctl_trap(void)
+{
+	int signr = current->jobctl & JOBCTL_STOP_SIGMASK;
 
-	return 1;
+	WARN_ON_ONCE(!signr);
+	ptrace_stop(signr, CLD_STOPPED, 0, NULL);
+	current->exit_code = 0;
 }
 
 static int ptrace_signal(int signr, siginfo_t *info,
@@ -2110,6 +2136,12 @@ relock:
 		    do_signal_stop(0))
 			goto relock;
 
+		if (unlikely(current->jobctl & JOBCTL_TRAP_MASK)) {
+			do_jobctl_trap();
+			spin_unlock_irq(&sighand->siglock);
+			goto relock;
+		}
+
 		signr = dequeue_signal(current, &current->blocked, info);
 
 		if (!signr)
-- 
cgit v1.2.3


From 3544d72a0e10d0aa1c1bd59ed77a53a59cdc12f7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 Jun 2011 11:20:15 +0200
Subject: ptrace: implement PTRACE_SEIZE

PTRACE_ATTACH implicitly issues SIGSTOP on attach which has side
effects on tracee signal and job control states.  This patch
implements a new ptrace request PTRACE_SEIZE which attaches a tracee
without trapping it or affecting its signal and job control states.

The usage is the same with PTRACE_ATTACH but it takes PTRACE_SEIZE_*
flags in @data.  Currently, the only defined flag is
PTRACE_SEIZE_DEVEL which is a temporary flag to enable PTRACE_SEIZE.
PTRACE_SEIZE will change ptrace behaviors outside of attach itself.
The changes will be implemented gradually and the DEVEL flag is to
prevent programs which expect full SEIZE behavior from using it before
all the behavior modifications are complete while allowing unit
testing.  The flag will be removed once SEIZE behaviors are completely
implemented.

* PTRACE_SEIZE, unlike ATTACH, doesn't force tracee to trap.  After
  attaching tracee continues to run unless a trap condition occurs.

* PTRACE_SEIZE doesn't affect signal or group stop state.

* If PTRACE_SEIZE'd, group stop uses PTRACE_EVENT_STOP trap which uses
  exit_code of (signr | PTRACE_EVENT_STOP << 8) where signr is one of
  the stopping signals if group stop is in effect or SIGTRAP
  otherwise, and returns usual trap siginfo on PTRACE_GETSIGINFO
  instead of NULL.

Seizing sets PT_SEIZED in ->ptrace of the tracee.  This flag will be
used to determine whether new SEIZE behaviors should be enabled.

Test program follows.

  #define PTRACE_SEIZE		0x4206
  #define PTRACE_SEIZE_DEVEL	0x80000000

  static const struct timespec ts100ms = { .tv_nsec = 100000000 };
  static const struct timespec ts1s = { .tv_sec = 1 };
  static const struct timespec ts3s = { .tv_sec = 3 };

  int main(int argc, char **argv)
  {
	  pid_t tracee;

	  tracee = fork();
	  if (tracee == 0) {
		  nanosleep(&ts100ms, NULL);
		  while (1) {
			  printf("tracee: alive\n");
			  nanosleep(&ts1s, NULL);
		  }
	  }

	  if (argc > 1)
		  kill(tracee, SIGSTOP);

	  nanosleep(&ts100ms, NULL);

	  ptrace(PTRACE_SEIZE, tracee, NULL,
		 (void *)(unsigned long)PTRACE_SEIZE_DEVEL);
	  if (argc > 1) {
		  waitid(P_PID, tracee, NULL, WSTOPPED);
		  ptrace(PTRACE_CONT, tracee, NULL, NULL);
	  }
	  nanosleep(&ts3s, NULL);
	  printf("tracer: exiting\n");
	  return 0;
  }

When the above program is called w/o argument, tracee is seized while
running and remains running.  When tracer exits, tracee continues to
run and print out messages.

  # ./test-seize-simple
  tracee: alive
  tracee: alive
  tracee: alive
  tracer: exiting
  tracee: alive
  tracee: alive

When called with an argument, tracee is seized from stopped state and
continued, and returns to stopped state when tracer exits.

  # ./test-seize
  tracee: alive
  tracee: alive
  tracee: alive
  tracer: exiting
  # ps -el|grep test-seize
  1 T     0  4720     1  0  80   0 -   941 signal ttyS0    00:00:00 test-seize

-v2: SEIZE doesn't schedule TRAP_STOP and leaves tracee running as Jan
     suggested.

-v3: PTRACE_EVENT_STOP traps now report group stop state by signr.  If
     group stop is in effect the stop signal number is returned as
     part of exit_code; otherwise, SIGTRAP.  This was suggested by
     Denys and Oleg.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
Cc: Denys Vlasenko <vda.linux@googlemail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
---
 include/linux/ptrace.h |  7 +++++++
 kernel/ptrace.c        | 35 +++++++++++++++++++++++++++++------
 kernel/signal.c        | 39 ++++++++++++++++++++++++++++++---------
 3 files changed, 66 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index e93ef1a54fc7..67ad3f152329 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -47,6 +47,11 @@
 #define PTRACE_GETREGSET	0x4204
 #define PTRACE_SETREGSET	0x4205
 
+#define PTRACE_SEIZE		0x4206
+
+/* flags in @data for PTRACE_SEIZE */
+#define PTRACE_SEIZE_DEVEL	0x80000000 /* temp flag for development */
+
 /* options set using PTRACE_SETOPTIONS */
 #define PTRACE_O_TRACESYSGOOD	0x00000001
 #define PTRACE_O_TRACEFORK	0x00000002
@@ -65,6 +70,7 @@
 #define PTRACE_EVENT_EXEC	4
 #define PTRACE_EVENT_VFORK_DONE	5
 #define PTRACE_EVENT_EXIT	6
+#define PTRACE_EVENT_STOP	7
 
 #include <asm/ptrace.h>
 
@@ -77,6 +83,7 @@
  * flags.  When the a task is stopped the ptracer owns task->ptrace.
  */
 
+#define PT_SEIZED	0x00010000	/* SEIZE used, enable new behavior */
 #define PT_PTRACED	0x00000001
 #define PT_DTRACE	0x00000002	/* delayed trace (used on m68k, i386) */
 #define PT_TRACESYSGOOD	0x00000004
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 45a8a4c5d8b2..dcf9f974198c 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -209,10 +209,28 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
 	return !err;
 }
 
-static int ptrace_attach(struct task_struct *task)
+static int ptrace_attach(struct task_struct *task, long request,
+			 unsigned long flags)
 {
+	bool seize = (request == PTRACE_SEIZE);
 	int retval;
 
+	/*
+	 * SEIZE will enable new ptrace behaviors which will be implemented
+	 * gradually.  SEIZE_DEVEL is used to prevent applications
+	 * expecting full SEIZE behaviors trapping on kernel commits which
+	 * are still in the process of implementing them.
+	 *
+	 * Only test programs for new ptrace behaviors being implemented
+	 * should set SEIZE_DEVEL.  If unset, SEIZE will fail with -EIO.
+	 *
+	 * Once SEIZE behaviors are completely implemented, this flag and
+	 * the following test will be removed.
+	 */
+	retval = -EIO;
+	if (seize && !(flags & PTRACE_SEIZE_DEVEL))
+		goto out;
+
 	audit_ptrace(task);
 
 	retval = -EPERM;
@@ -244,11 +262,16 @@ static int ptrace_attach(struct task_struct *task)
 		goto unlock_tasklist;
 
 	task->ptrace = PT_PTRACED;
+	if (seize)
+		task->ptrace |= PT_SEIZED;
 	if (task_ns_capable(task, CAP_SYS_PTRACE))
 		task->ptrace |= PT_PTRACE_CAP;
 
 	__ptrace_link(task, current);
-	send_sig_info(SIGSTOP, SEND_SIG_FORCED, task);
+
+	/* SEIZE doesn't trap tracee on attach */
+	if (!seize)
+		send_sig_info(SIGSTOP, SEND_SIG_FORCED, task);
 
 	spin_lock(&task->sighand->siglock);
 
@@ -785,8 +808,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
 		goto out;
 	}
 
-	if (request == PTRACE_ATTACH) {
-		ret = ptrace_attach(child);
+	if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
+		ret = ptrace_attach(child, request, data);
 		/*
 		 * Some architectures need to do book-keeping after
 		 * a ptrace attach.
@@ -927,8 +950,8 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
 		goto out;
 	}
 
-	if (request == PTRACE_ATTACH) {
-		ret = ptrace_attach(child);
+	if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
+		ret = ptrace_attach(child, request, data);
 		/*
 		 * Some architectures need to do book-keeping after
 		 * a ptrace attach.
diff --git a/kernel/signal.c b/kernel/signal.c
index b5f55ca1f43f..589292f38530 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1873,21 +1873,26 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
 	recalc_sigpending_tsk(current);
 }
 
-void ptrace_notify(int exit_code)
+static void ptrace_do_notify(int signr, int exit_code, int why)
 {
 	siginfo_t info;
 
-	BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
-
 	memset(&info, 0, sizeof info);
-	info.si_signo = SIGTRAP;
+	info.si_signo = signr;
 	info.si_code = exit_code;
 	info.si_pid = task_pid_vnr(current);
 	info.si_uid = current_uid();
 
 	/* Let the debugger run.  */
+	ptrace_stop(exit_code, why, 1, &info);
+}
+
+void ptrace_notify(int exit_code)
+{
+	BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
+
 	spin_lock_irq(&current->sighand->siglock);
-	ptrace_stop(exit_code, CLD_TRAPPED, 1, &info);
+	ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED);
 	spin_unlock_irq(&current->sighand->siglock);
 }
 
@@ -2017,7 +2022,13 @@ static bool do_signal_stop(int signr)
 /**
  * do_jobctl_trap - take care of ptrace jobctl traps
  *
- * It is currently used only to trap for group stop while ptraced.
+ * When PT_SEIZED, it's used for both group stop and explicit
+ * SEIZE/INTERRUPT traps.  Both generate PTRACE_EVENT_STOP trap with
+ * accompanying siginfo.  If stopped, lower eight bits of exit_code contain
+ * the stop signal; otherwise, %SIGTRAP.
+ *
+ * When !PT_SEIZED, it's used only for group stop trap with stop signal
+ * number as exit_code and no siginfo.
  *
  * CONTEXT:
  * Must be called with @current->sighand->siglock held, which may be
@@ -2025,11 +2036,21 @@ static bool do_signal_stop(int signr)
  */
 static void do_jobctl_trap(void)
 {
+	struct signal_struct *signal = current->signal;
 	int signr = current->jobctl & JOBCTL_STOP_SIGMASK;
 
-	WARN_ON_ONCE(!signr);
-	ptrace_stop(signr, CLD_STOPPED, 0, NULL);
-	current->exit_code = 0;
+	if (current->ptrace & PT_SEIZED) {
+		if (!signal->group_stop_count &&
+		    !(signal->flags & SIGNAL_STOP_STOPPED))
+			signr = SIGTRAP;
+		WARN_ON_ONCE(!signr);
+		ptrace_do_notify(signr, signr | (PTRACE_EVENT_STOP << 8),
+				 CLD_STOPPED);
+	} else {
+		WARN_ON_ONCE(!signr);
+		ptrace_stop(signr, CLD_STOPPED, 0, NULL);
+		current->exit_code = 0;
+	}
 }
 
 static int ptrace_signal(int signr, siginfo_t *info,
-- 
cgit v1.2.3


From fca26f260c528ee51a2e451b5b200aeb528f3e09 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 Jun 2011 11:20:16 +0200
Subject: ptrace: implement PTRACE_INTERRUPT

Currently, there's no way to trap a running ptracee short of sending a
signal which has various side effects.  This patch implements
PTRACE_INTERRUPT which traps ptracee without any signal or job control
related side effect.

The implementation is almost trivial.  It uses the group stop trap -
SIGTRAP | PTRACE_EVENT_STOP << 8.  A new trap flag
JOBCTL_TRAP_INTERRUPT is added, which is set on PTRACE_INTERRUPT and
cleared when any trap happens.  As INTERRUPT should be useable
regardless of the current state of tracee, task_is_traced() test in
ptrace_check_attach() is skipped for INTERRUPT.

PTRACE_INTERRUPT is available iff tracee is attached with
PTRACE_SEIZE.

Test program follows.

  #define PTRACE_SEIZE		0x4206
  #define PTRACE_INTERRUPT	0x4207

  #define PTRACE_SEIZE_DEVEL	0x80000000

  static const struct timespec ts100ms = { .tv_nsec = 100000000 };
  static const struct timespec ts1s = { .tv_sec = 1 };
  static const struct timespec ts3s = { .tv_sec = 3 };

  int main(int argc, char **argv)
  {
	  pid_t tracee;

	  tracee = fork();
	  if (tracee == 0) {
		  nanosleep(&ts100ms, NULL);
		  while (1) {
			  printf("tracee: alive pid=%d\n", getpid());
			  nanosleep(&ts1s, NULL);
		  }
	  }

	  if (argc > 1)
		  kill(tracee, SIGSTOP);

	  nanosleep(&ts100ms, NULL);

	  ptrace(PTRACE_SEIZE, tracee, NULL,
		 (void *)(unsigned long)PTRACE_SEIZE_DEVEL);
	  if (argc > 1) {
		  waitid(P_PID, tracee, NULL, WSTOPPED);
		  ptrace(PTRACE_CONT, tracee, NULL, NULL);
	  }
	  nanosleep(&ts3s, NULL);

	  printf("tracer: INTERRUPT and DETACH\n");
	  ptrace(PTRACE_INTERRUPT, tracee, NULL, NULL);
	  waitid(P_PID, tracee, NULL, WSTOPPED);
	  ptrace(PTRACE_DETACH, tracee, NULL, NULL);
	  nanosleep(&ts3s, NULL);

	  printf("tracer: exiting\n");
	  kill(tracee, SIGKILL);
	  return 0;
  }

When called without argument, tracee is seized from running state,
interrupted and then detached back to running state.

  # ./test-interrupt
  tracee: alive pid=4546
  tracee: alive pid=4546
  tracee: alive pid=4546
  tracer: INTERRUPT and DETACH
  tracee: alive pid=4546
  tracee: alive pid=4546
  tracee: alive pid=4546
  tracer: exiting

When called with argument, tracee is seized from stopped state,
continued, interrupted and then detached back to stopped state.

  # ./test-interrupt  1
  tracee: alive pid=4548
  tracee: alive pid=4548
  tracee: alive pid=4548
  tracer: INTERRUPT and DETACH
  tracer: exiting

Before PTRACE_INTERRUPT, once the tracee was running, there was no way
to trap tracee and do PTRACE_DETACH without causing side effect.

-v2: Updated to use task_set_jobctl_pending() so that it doesn't end
     up scheduling TRAP_STOP if child is dying which may make the
     child unkillable.  Spotted by Oleg.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
---
 include/linux/ptrace.h |  1 +
 kernel/ptrace.c        | 29 +++++++++++++++++++++++++++--
 2 files changed, 28 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 67ad3f152329..ad754d1e0b13 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -48,6 +48,7 @@
 #define PTRACE_SETREGSET	0x4205
 
 #define PTRACE_SEIZE		0x4206
+#define PTRACE_INTERRUPT	0x4207
 
 /* flags in @data for PTRACE_SEIZE */
 #define PTRACE_SEIZE_DEVEL	0x80000000 /* temp flag for development */
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index dcf9f974198c..6852c0f4a916 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -658,10 +658,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
 int ptrace_request(struct task_struct *child, long request,
 		   unsigned long addr, unsigned long data)
 {
+	bool seized = child->ptrace & PT_SEIZED;
 	int ret = -EIO;
 	siginfo_t siginfo;
 	void __user *datavp = (void __user *) data;
 	unsigned long __user *datalp = datavp;
+	unsigned long flags;
 
 	switch (request) {
 	case PTRACE_PEEKTEXT:
@@ -694,6 +696,27 @@ int ptrace_request(struct task_struct *child, long request,
 			ret = ptrace_setsiginfo(child, &siginfo);
 		break;
 
+	case PTRACE_INTERRUPT:
+		/*
+		 * Stop tracee without any side-effect on signal or job
+		 * control.  At least one trap is guaranteed to happen
+		 * after this request.  If @child is already trapped, the
+		 * current trap is not disturbed and another trap will
+		 * happen after the current trap is ended with PTRACE_CONT.
+		 *
+		 * The actual trap might not be PTRACE_EVENT_STOP trap but
+		 * the pending condition is cleared regardless.
+		 */
+		if (unlikely(!seized || !lock_task_sighand(child, &flags)))
+			break;
+
+		if (likely(task_set_jobctl_pending(child, JOBCTL_TRAP_STOP)))
+			signal_wake_up(child, 0);
+
+		unlock_task_sighand(child, &flags);
+		ret = 0;
+		break;
+
 	case PTRACE_DETACH:	 /* detach a process that was attached. */
 		ret = ptrace_detach(child, data);
 		break;
@@ -819,7 +842,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
 		goto out_put_task_struct;
 	}
 
-	ret = ptrace_check_attach(child, request == PTRACE_KILL);
+	ret = ptrace_check_attach(child, request == PTRACE_KILL ||
+				  request == PTRACE_INTERRUPT);
 	if (ret < 0)
 		goto out_put_task_struct;
 
@@ -961,7 +985,8 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
 		goto out_put_task_struct;
 	}
 
-	ret = ptrace_check_attach(child, request == PTRACE_KILL);
+	ret = ptrace_check_attach(child, request == PTRACE_KILL ||
+				  request == PTRACE_INTERRUPT);
 	if (!ret)
 		ret = compat_arch_ptrace(child, request, addr, data);
 
-- 
cgit v1.2.3


From fb1d910c178ba0c5bc32d3e5a9e82e05b7aad3cd Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 Jun 2011 11:20:17 +0200
Subject: ptrace: implement TRAP_NOTIFY and use it for group stop events

Currently there's no way for ptracer to find out whether group stop
finished other than polling with INTERRUPT - GETSIGINFO - CONT
sequence.  This patch implements group stop notification for ptracer
using STOP traps.

When group stop state of a seized tracee changes, JOBCTL_TRAP_NOTIFY
is set, which schedules a STOP trap which is sticky - it isn't cleared
by other traps and at least one STOP trap will happen eventually.
STOP trap is synchronization point for event notification and the
tracer can determine the current group stop state by looking at the
signal number portion of exit code (si_status from waitid(2) or
si_code from PTRACE_GETSIGINFO).

Notifications are generated both on start and end of group stops but,
because group stop participation always happens before STOP trap, this
doesn't cause an extra trap while tracee is participating in group
stop.  The symmetry will be useful later.

Note that this notification works iff tracee is not trapped.
Currently there is no way to be notified of group stop state changes
while tracee is trapped.  This will be addressed by a later patch.

An example program follows.

  #define PTRACE_SEIZE		0x4206
  #define PTRACE_INTERRUPT	0x4207

  #define PTRACE_SEIZE_DEVEL	0x80000000

  static const struct timespec ts1s = { .tv_sec = 1 };

  int main(int argc, char **argv)
  {
	  pid_t tracee, tracer;
	  int i;

	  tracee = fork();
	  if (!tracee)
		  while (1)
			  pause();

	  tracer = fork();
	  if (!tracer) {
		  siginfo_t si;

		  ptrace(PTRACE_SEIZE, tracee, NULL,
			 (void *)(unsigned long)PTRACE_SEIZE_DEVEL);
		  ptrace(PTRACE_INTERRUPT, tracee, NULL, NULL);
	  repeat:
		  waitid(P_PID, tracee, NULL, WSTOPPED);

		  ptrace(PTRACE_GETSIGINFO, tracee, NULL, &si);
		  if (!si.si_code) {
			  printf("tracer: SIG %d\n", si.si_signo);
			  ptrace(PTRACE_CONT, tracee, NULL,
				 (void *)(unsigned long)si.si_signo);
			  goto repeat;
		  }
		  printf("tracer: stopped=%d signo=%d\n",
			 si.si_signo != SIGTRAP, si.si_signo);
		  ptrace(PTRACE_CONT, tracee, NULL, NULL);
		  goto repeat;
	  }

	  for (i = 0; i < 3; i++) {
		  nanosleep(&ts1s, NULL);
		  printf("mother: SIGSTOP\n");
		  kill(tracee, SIGSTOP);
		  nanosleep(&ts1s, NULL);
		  printf("mother: SIGCONT\n");
		  kill(tracee, SIGCONT);
	  }
	  nanosleep(&ts1s, NULL);

	  kill(tracer, SIGKILL);
	  kill(tracee, SIGKILL);
	  return 0;
  }

In the above program, tracer keeps tracee running and gets
notification of each group stop state changes.

  # ./test-notify
  tracer: stopped=0 signo=5
  mother: SIGSTOP
  tracer: SIG 19
  tracer: stopped=1 signo=19
  mother: SIGCONT
  tracer: stopped=0 signo=5
  tracer: SIG 18
  mother: SIGSTOP
  tracer: SIG 19
  tracer: stopped=1 signo=19
  mother: SIGCONT
  tracer: stopped=0 signo=5
  tracer: SIG 18
  mother: SIGSTOP
  tracer: SIG 19
  tracer: stopped=1 signo=19
  mother: SIGCONT
  tracer: stopped=0 signo=5
  tracer: SIG 18

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
---
 include/linux/sched.h |  4 +++-
 kernel/signal.c       | 38 +++++++++++++++++++++++++++++++++++---
 2 files changed, 38 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8bd84b83a35b..1854def284f5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1811,15 +1811,17 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
 #define JOBCTL_STOP_PENDING_BIT	17	/* task should stop for group stop */
 #define JOBCTL_STOP_CONSUME_BIT	18	/* consume group stop count */
 #define JOBCTL_TRAP_STOP_BIT	19	/* trap for STOP */
+#define JOBCTL_TRAP_NOTIFY_BIT	20	/* trap for NOTIFY */
 #define JOBCTL_TRAPPING_BIT	21	/* switching to TRACED */
 
 #define JOBCTL_STOP_DEQUEUED	(1 << JOBCTL_STOP_DEQUEUED_BIT)
 #define JOBCTL_STOP_PENDING	(1 << JOBCTL_STOP_PENDING_BIT)
 #define JOBCTL_STOP_CONSUME	(1 << JOBCTL_STOP_CONSUME_BIT)
 #define JOBCTL_TRAP_STOP	(1 << JOBCTL_TRAP_STOP_BIT)
+#define JOBCTL_TRAP_NOTIFY	(1 << JOBCTL_TRAP_NOTIFY_BIT)
 #define JOBCTL_TRAPPING		(1 << JOBCTL_TRAPPING_BIT)
 
-#define JOBCTL_TRAP_MASK	JOBCTL_TRAP_STOP
+#define JOBCTL_TRAP_MASK	(JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY)
 #define JOBCTL_PENDING_MASK	(JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK)
 
 extern bool task_set_jobctl_pending(struct task_struct *task,
diff --git a/kernel/signal.c b/kernel/signal.c
index 589292f38530..06177e2b3917 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -817,6 +817,30 @@ static int check_kill_permission(int sig, struct siginfo *info,
 	return security_task_kill(t, info, sig, 0);
 }
 
+/**
+ * ptrace_trap_notify - schedule trap to notify ptracer
+ * @t: tracee wanting to notify tracer
+ *
+ * This function schedules sticky ptrace trap which is cleared on the next
+ * TRAP_STOP to notify ptracer of an event.  @t must have been seized by
+ * ptracer.
+ *
+ * If @t is running, STOP trap will be taken.  If already trapped, STOP
+ * trap will be eventually taken without returning to userland after the
+ * existing traps are finished by PTRACE_CONT.
+ *
+ * CONTEXT:
+ * Must be called with @task->sighand->siglock held.
+ */
+static void ptrace_trap_notify(struct task_struct *t)
+{
+	WARN_ON_ONCE(!(t->ptrace & PT_SEIZED));
+	assert_spin_locked(&t->sighand->siglock);
+
+	task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY);
+	signal_wake_up(t, 0);
+}
+
 /*
  * Handle magic process-wide effects of stop/continue signals. Unlike
  * the signal actions, these happen immediately at signal-generation
@@ -855,7 +879,10 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
 		do {
 			task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
 			rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
-			wake_up_state(t, __TASK_STOPPED);
+			if (likely(!(t->ptrace & PT_SEIZED)))
+				wake_up_state(t, __TASK_STOPPED);
+			else
+				ptrace_trap_notify(t);
 		} while_each_thread(p, t);
 
 		/*
@@ -1797,8 +1824,10 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
 	if (why == CLD_STOPPED && (current->jobctl & JOBCTL_STOP_PENDING))
 		gstop_done = task_participate_group_stop(current);
 
-	/* any trap clears pending STOP trap */
+	/* any trap clears pending STOP trap, STOP trap clears NOTIFY */
 	task_clear_jobctl_pending(current, JOBCTL_TRAP_STOP);
+	if (info && info->si_code >> 8 == PTRACE_EVENT_STOP)
+		task_clear_jobctl_pending(current, JOBCTL_TRAP_NOTIFY);
 
 	/* entering a trap, clear TRAPPING */
 	task_clear_jobctl_trapping(current);
@@ -1972,7 +2001,10 @@ static bool do_signal_stop(int signr)
 			if (!task_is_stopped(t) &&
 			    task_set_jobctl_pending(t, signr | gstop)) {
 				sig->group_stop_count++;
-				signal_wake_up(t, 0);
+				if (likely(!(t->ptrace & PT_SEIZED)))
+					signal_wake_up(t, 0);
+				else
+					ptrace_trap_notify(t);
 			}
 		}
 	}
-- 
cgit v1.2.3


From 544b2c91a9f14f9565af1972203438b7f49afd48 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 Jun 2011 11:20:18 +0200
Subject: ptrace: implement PTRACE_LISTEN

The previous patch implemented async notification for ptrace but it
only worked while trace is running.  This patch introduces
PTRACE_LISTEN which is suggested by Oleg Nestrov.

It's allowed iff tracee is in STOP trap and puts tracee into
quasi-running state - tracee never really runs but wait(2) and
ptrace(2) consider it to be running.  While ptracer is listening,
tracee is allowed to re-enter STOP to notify an async event.
Listening state is cleared on the first notification.  Ptracer can
also clear it by issuing INTERRUPT - tracee will re-trap into STOP
with listening state cleared.

This allows ptracer to monitor group stop state without running tracee
- use INTERRUPT to put tracee into STOP trap, issue LISTEN and then
wait(2) to wait for the next group stop event.  When it happens,
PTRACE_GETSIGINFO provides information to determine the current state.

Test program follows.

  #define PTRACE_SEIZE		0x4206
  #define PTRACE_INTERRUPT	0x4207
  #define PTRACE_LISTEN		0x4208

  #define PTRACE_SEIZE_DEVEL	0x80000000

  static const struct timespec ts1s = { .tv_sec = 1 };

  int main(int argc, char **argv)
  {
	  pid_t tracee, tracer;
	  int i;

	  tracee = fork();
	  if (!tracee)
		  while (1)
			  pause();

	  tracer = fork();
	  if (!tracer) {
		  siginfo_t si;

		  ptrace(PTRACE_SEIZE, tracee, NULL,
			 (void *)(unsigned long)PTRACE_SEIZE_DEVEL);
		  ptrace(PTRACE_INTERRUPT, tracee, NULL, NULL);
	  repeat:
		  waitid(P_PID, tracee, NULL, WSTOPPED);

		  ptrace(PTRACE_GETSIGINFO, tracee, NULL, &si);
		  if (!si.si_code) {
			  printf("tracer: SIG %d\n", si.si_signo);
			  ptrace(PTRACE_CONT, tracee, NULL,
				 (void *)(unsigned long)si.si_signo);
			  goto repeat;
		  }
		  printf("tracer: stopped=%d signo=%d\n",
			 si.si_signo != SIGTRAP, si.si_signo);
		  if (si.si_signo != SIGTRAP)
			  ptrace(PTRACE_LISTEN, tracee, NULL, NULL);
		  else
			  ptrace(PTRACE_CONT, tracee, NULL, NULL);
		  goto repeat;
	  }

	  for (i = 0; i < 3; i++) {
		  nanosleep(&ts1s, NULL);
		  printf("mother: SIGSTOP\n");
		  kill(tracee, SIGSTOP);
		  nanosleep(&ts1s, NULL);
		  printf("mother: SIGCONT\n");
		  kill(tracee, SIGCONT);
	  }
	  nanosleep(&ts1s, NULL);

	  kill(tracer, SIGKILL);
	  kill(tracee, SIGKILL);
	  return 0;
  }

This is identical to the program to test TRAP_NOTIFY except that
tracee is PTRACE_LISTEN'd instead of PTRACE_CONT'd when group stopped.
This allows ptracer to monitor when group stop ends without running
tracee.

  # ./test-listen
  tracer: stopped=0 signo=5
  mother: SIGSTOP
  tracer: SIG 19
  tracer: stopped=1 signo=19
  mother: SIGCONT
  tracer: stopped=0 signo=5
  tracer: SIG 18
  mother: SIGSTOP
  tracer: SIG 19
  tracer: stopped=1 signo=19
  mother: SIGCONT
  tracer: stopped=0 signo=5
  tracer: SIG 18
  mother: SIGSTOP
  tracer: SIG 19
  tracer: stopped=1 signo=19
  mother: SIGCONT
  tracer: stopped=0 signo=5
  tracer: SIG 18

-v2: Moved JOBCTL_LISTENING check in wait_task_stopped() into
     task_stopped_code() as suggested by Oleg.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
---
 include/linux/ptrace.h |  1 +
 include/linux/sched.h  |  2 ++
 kernel/exit.c          |  3 ++-
 kernel/ptrace.c        | 42 +++++++++++++++++++++++++++++++++++++++---
 kernel/signal.c        | 13 +++++++++----
 5 files changed, 53 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index ad754d1e0b13..4f224f169524 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -49,6 +49,7 @@
 
 #define PTRACE_SEIZE		0x4206
 #define PTRACE_INTERRUPT	0x4207
+#define PTRACE_LISTEN		0x4208
 
 /* flags in @data for PTRACE_SEIZE */
 #define PTRACE_SEIZE_DEVEL	0x80000000 /* temp flag for development */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1854def284f5..87f7ca7ed6f6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1813,6 +1813,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
 #define JOBCTL_TRAP_STOP_BIT	19	/* trap for STOP */
 #define JOBCTL_TRAP_NOTIFY_BIT	20	/* trap for NOTIFY */
 #define JOBCTL_TRAPPING_BIT	21	/* switching to TRACED */
+#define JOBCTL_LISTENING_BIT	22	/* ptracer is listening for events */
 
 #define JOBCTL_STOP_DEQUEUED	(1 << JOBCTL_STOP_DEQUEUED_BIT)
 #define JOBCTL_STOP_PENDING	(1 << JOBCTL_STOP_PENDING_BIT)
@@ -1820,6 +1821,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
 #define JOBCTL_TRAP_STOP	(1 << JOBCTL_TRAP_STOP_BIT)
 #define JOBCTL_TRAP_NOTIFY	(1 << JOBCTL_TRAP_NOTIFY_BIT)
 #define JOBCTL_TRAPPING		(1 << JOBCTL_TRAPPING_BIT)
+#define JOBCTL_LISTENING	(1 << JOBCTL_LISTENING_BIT)
 
 #define JOBCTL_TRAP_MASK	(JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY)
 #define JOBCTL_PENDING_MASK	(JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK)
diff --git a/kernel/exit.c b/kernel/exit.c
index 20a406471525..289f59d686bf 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1368,7 +1368,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 static int *task_stopped_code(struct task_struct *p, bool ptrace)
 {
 	if (ptrace) {
-		if (task_is_stopped_or_traced(p))
+		if (task_is_stopped_or_traced(p) &&
+		    !(p->jobctl & JOBCTL_LISTENING))
 			return &p->exit_code;
 	} else {
 		if (p->signal->flags & SIGNAL_STOP_STOPPED)
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 6852c0f4a916..e18966c1c0da 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -146,7 +146,8 @@ int ptrace_check_attach(struct task_struct *child, bool ignore_state)
 		 */
 		spin_lock_irq(&child->sighand->siglock);
 		WARN_ON_ONCE(task_is_stopped(child));
-		if (task_is_traced(child) || ignore_state)
+		if (ignore_state || (task_is_traced(child) &&
+				     !(child->jobctl & JOBCTL_LISTENING)))
 			ret = 0;
 		spin_unlock_irq(&child->sighand->siglock);
 	}
@@ -660,7 +661,7 @@ int ptrace_request(struct task_struct *child, long request,
 {
 	bool seized = child->ptrace & PT_SEIZED;
 	int ret = -EIO;
-	siginfo_t siginfo;
+	siginfo_t siginfo, *si;
 	void __user *datavp = (void __user *) data;
 	unsigned long __user *datalp = datavp;
 	unsigned long flags;
@@ -710,8 +711,43 @@ int ptrace_request(struct task_struct *child, long request,
 		if (unlikely(!seized || !lock_task_sighand(child, &flags)))
 			break;
 
+		/*
+		 * INTERRUPT doesn't disturb existing trap sans one
+		 * exception.  If ptracer issued LISTEN for the current
+		 * STOP, this INTERRUPT should clear LISTEN and re-trap
+		 * tracee into STOP.
+		 */
 		if (likely(task_set_jobctl_pending(child, JOBCTL_TRAP_STOP)))
-			signal_wake_up(child, 0);
+			signal_wake_up(child, child->jobctl & JOBCTL_LISTENING);
+
+		unlock_task_sighand(child, &flags);
+		ret = 0;
+		break;
+
+	case PTRACE_LISTEN:
+		/*
+		 * Listen for events.  Tracee must be in STOP.  It's not
+		 * resumed per-se but is not considered to be in TRACED by
+		 * wait(2) or ptrace(2).  If an async event (e.g. group
+		 * stop state change) happens, tracee will enter STOP trap
+		 * again.  Alternatively, ptracer can issue INTERRUPT to
+		 * finish listening and re-trap tracee into STOP.
+		 */
+		if (unlikely(!seized || !lock_task_sighand(child, &flags)))
+			break;
+
+		si = child->last_siginfo;
+		if (unlikely(!si || si->si_code >> 8 != PTRACE_EVENT_STOP))
+			break;
+
+		child->jobctl |= JOBCTL_LISTENING;
+
+		/*
+		 * If NOTIFY is set, it means event happened between start
+		 * of this trap and now.  Trigger re-trap immediately.
+		 */
+		if (child->jobctl & JOBCTL_TRAP_NOTIFY)
+			signal_wake_up(child, true);
 
 		unlock_task_sighand(child, &flags);
 		ret = 0;
diff --git a/kernel/signal.c b/kernel/signal.c
index 06177e2b3917..97e575a3387e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -825,9 +825,11 @@ static int check_kill_permission(int sig, struct siginfo *info,
  * TRAP_STOP to notify ptracer of an event.  @t must have been seized by
  * ptracer.
  *
- * If @t is running, STOP trap will be taken.  If already trapped, STOP
- * trap will be eventually taken without returning to userland after the
- * existing traps are finished by PTRACE_CONT.
+ * If @t is running, STOP trap will be taken.  If trapped for STOP and
+ * ptracer is listening for events, tracee is woken up so that it can
+ * re-trap for the new event.  If trapped otherwise, STOP trap will be
+ * eventually taken without returning to userland after the existing traps
+ * are finished by PTRACE_CONT.
  *
  * CONTEXT:
  * Must be called with @task->sighand->siglock held.
@@ -838,7 +840,7 @@ static void ptrace_trap_notify(struct task_struct *t)
 	assert_spin_locked(&t->sighand->siglock);
 
 	task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY);
-	signal_wake_up(t, 0);
+	signal_wake_up(t, t->jobctl & JOBCTL_LISTENING);
 }
 
 /*
@@ -1894,6 +1896,9 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
 	spin_lock_irq(&current->sighand->siglock);
 	current->last_siginfo = NULL;
 
+	/* LISTENING can be set only during STOP traps, clear it */
+	current->jobctl &= ~JOBCTL_LISTENING;
+
 	/*
 	 * Queued signals ignored us while we were stopped for tracing.
 	 * So check for any that we should take before resuming user mode.
-- 
cgit v1.2.3


From 4ff75b7cf3b06efb72a50e26008d09b259b1231b Mon Sep 17 00:00:00 2001
From: Richard Cochran <richardcochran@gmail.com>
Date: Sun, 19 Jun 2011 03:31:39 +0000
Subject: net: correct comment on where to place transmit time stamp hook.

The comment for the skb_tx_timestamp() function suggests calling it just
after a buffer is released to the hardware for transmission. However,
for drivers that free the buffer in an ISR, this produces a race between
the time stamp code and the ISR. This commit changes the comment to advise
placing the call just before handing the buffer over to the hardware.

Signed-off-by: Richard Cochran <richard.cochran@omicron.at>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index e8b78ce14474..f3af147c211d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2023,8 +2023,7 @@ static inline void sw_tx_timestamp(struct sk_buff *skb)
  * skb_tx_timestamp() - Driver hook for transmit timestamping
  *
  * Ethernet MAC Drivers should call this function in their hard_xmit()
- * function as soon as possible after giving the sk_buff to the MAC
- * hardware, but before freeing the sk_buff.
+ * function immediately before giving the sk_buff to the MAC hardware.
  *
  * @skb: A socket buffer.
  */
-- 
cgit v1.2.3


From 440ca98fe8407808a9ad8e934f6e28408f546313 Mon Sep 17 00:00:00 2001
From: Rafał Miłecki <zajec5@gmail.com>
Date: Sat, 18 Jun 2011 01:01:59 +0200
Subject: bcma: clean exports of functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Function managing IRQs is needed for external drivers like b43.
On the other side we do not expect writing any hosts drivers outside of
bcma, so this is safe to do not export functions related to this.

Signed-off-by: Rafał Miłecki <zajec5@gmail.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/bcma/bcma_private.h          | 4 ++--
 drivers/bcma/driver_pci.c            | 1 +
 drivers/bcma/main.c                  | 2 --
 include/linux/bcma/bcma_driver_pci.h | 2 ++
 4 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/bcma/bcma_private.h b/drivers/bcma/bcma_private.h
index 12a75ab3dd23..4228736de0e8 100644
--- a/drivers/bcma/bcma_private.h
+++ b/drivers/bcma/bcma_private.h
@@ -13,8 +13,8 @@
 struct bcma_bus;
 
 /* main.c */
-extern int bcma_bus_register(struct bcma_bus *bus);
-extern void bcma_bus_unregister(struct bcma_bus *bus);
+int bcma_bus_register(struct bcma_bus *bus);
+void bcma_bus_unregister(struct bcma_bus *bus);
 
 /* scan.c */
 int bcma_bus_scan(struct bcma_bus *bus);
diff --git a/drivers/bcma/driver_pci.c b/drivers/bcma/driver_pci.c
index 789d68b4858b..b0c19ede0d2e 100644
--- a/drivers/bcma/driver_pci.c
+++ b/drivers/bcma/driver_pci.c
@@ -184,3 +184,4 @@ int bcma_core_pci_irq_ctl(struct bcma_drv_pci *pc, struct bcma_device *core,
 out:
 	return err;
 }
+EXPORT_SYMBOL_GPL(bcma_core_pci_irq_ctl);
diff --git a/drivers/bcma/main.c b/drivers/bcma/main.c
index 11e96dc6011a..ba15105cc23a 100644
--- a/drivers/bcma/main.c
+++ b/drivers/bcma/main.c
@@ -160,13 +160,11 @@ int bcma_bus_register(struct bcma_bus *bus)
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(bcma_bus_register);
 
 void bcma_bus_unregister(struct bcma_bus *bus)
 {
 	bcma_unregister_cores(bus);
 }
-EXPORT_SYMBOL_GPL(bcma_bus_unregister);
 
 int __bcma_driver_register(struct bcma_driver *drv, struct module *owner)
 {
diff --git a/include/linux/bcma/bcma_driver_pci.h b/include/linux/bcma/bcma_driver_pci.h
index b7e191cf00ec..3871b668caf9 100644
--- a/include/linux/bcma/bcma_driver_pci.h
+++ b/include/linux/bcma/bcma_driver_pci.h
@@ -85,5 +85,7 @@ struct bcma_drv_pci {
 #define pcicore_write32(pc, offset, val)	bcma_write32((pc)->core, offset, val)
 
 extern void bcma_core_pci_init(struct bcma_drv_pci *pc);
+extern int bcma_core_pci_irq_ctl(struct bcma_drv_pci *pc,
+				 struct bcma_device *core, bool enable);
 
 #endif /* LINUX_BCMA_DRIVER_PCI_H_ */
-- 
cgit v1.2.3


From 166e9278a3f98bab29ebb3d685a81cfb11b98be0 Mon Sep 17 00:00:00 2001
From: Ohad Ben-Cohen <ohad@wizery.com>
Date: Fri, 10 Jun 2011 21:42:27 +0300
Subject: x86/ia64: intel-iommu: move to drivers/iommu/

This should ease finding similarities with different platforms,
with the intention of solving problems once in a generic framework
which everyone can use.

Note: to move intel-iommu.c, the declaration of pci_find_upstream_pcie_bridge()
has to move from drivers/pci/pci.h to include/linux/pci.h. This is handled
in this patch, too.

As suggested, also drop DMAR's EXPERIMENTAL tag while we're at it.

Compile-tested on x86_64.

Signed-off-by: Ohad Ben-Cohen <ohad@wizery.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/ia64/Kconfig              |   21 -
 arch/x86/Kconfig               |   50 -
 drivers/iommu/Kconfig          |   49 +
 drivers/iommu/Makefile         |    2 +
 drivers/iommu/dmar.c           | 1461 +++++++++++++++
 drivers/iommu/intel-iommu.c    | 4016 +++++++++++++++++++++++++++++++++++++++
 drivers/iommu/intr_remapping.c |  797 ++++++++
 drivers/iommu/intr_remapping.h |   17 +
 drivers/iommu/iova.c           |  435 +++++
 drivers/pci/Makefile           |    5 -
 drivers/pci/dmar.c             | 1461 ---------------
 drivers/pci/intel-iommu.c      | 4017 ----------------------------------------
 drivers/pci/intr_remapping.c   |  798 --------
 drivers/pci/intr_remapping.h   |   17 -
 drivers/pci/iova.c             |  435 -----
 drivers/pci/pci.h              |    2 -
 include/linux/pci.h            |   11 +
 17 files changed, 6788 insertions(+), 6806 deletions(-)
 create mode 100644 drivers/iommu/dmar.c
 create mode 100644 drivers/iommu/intel-iommu.c
 create mode 100644 drivers/iommu/intr_remapping.c
 create mode 100644 drivers/iommu/intr_remapping.h
 create mode 100644 drivers/iommu/iova.c
 delete mode 100644 drivers/pci/dmar.c
 delete mode 100644 drivers/pci/intel-iommu.c
 delete mode 100644 drivers/pci/intr_remapping.c
 delete mode 100644 drivers/pci/intr_remapping.h
 delete mode 100644 drivers/pci/iova.c

(limited to 'include/linux')

diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 9929e4e11ea0..7336ba653b8f 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -627,27 +627,6 @@ source "drivers/pci/hotplug/Kconfig"
 
 source "drivers/pcmcia/Kconfig"
 
-config DMAR
-        bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
-        depends on IA64_GENERIC && ACPI && EXPERIMENTAL
-	help
-	  DMA remapping (DMAR) devices support enables independent address
-	  translations for Direct Memory Access (DMA) from devices.
-	  These DMA remapping devices are reported via ACPI tables
-	  and include PCI device scope covered by these DMA
-	  remapping devices.
-
-config DMAR_DEFAULT_ON
-	def_bool y
-	prompt "Enable DMA Remapping Devices by default"
-	depends on DMAR
-	help
-	  Selecting this option will enable a DMAR device at boot time if
-	  one is found. If this option is not selected, DMAR support can
-	  be enabled by passing intel_iommu=on to the kernel. It is
-	  recommended you say N here while the DMAR code remains
-	  experimental.
-
 endmenu
 
 endif
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 1b6a2e212db4..a169573c64fc 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1912,56 +1912,6 @@ config PCI_CNB20LE_QUIRK
 
 	  You should say N unless you know you need this.
 
-config DMAR
-	bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
-	depends on PCI_MSI && ACPI && EXPERIMENTAL
-	select IOMMU_API
-	help
-	  DMA remapping (DMAR) devices support enables independent address
-	  translations for Direct Memory Access (DMA) from devices.
-	  These DMA remapping devices are reported via ACPI tables
-	  and include PCI device scope covered by these DMA
-	  remapping devices.
-
-config DMAR_DEFAULT_ON
-	def_bool y
-	prompt "Enable DMA Remapping Devices by default"
-	depends on DMAR
-	help
-	  Selecting this option will enable a DMAR device at boot time if
-	  one is found. If this option is not selected, DMAR support can
-	  be enabled by passing intel_iommu=on to the kernel. It is
-	  recommended you say N here while the DMAR code remains
-	  experimental.
-
-config DMAR_BROKEN_GFX_WA
-	bool "Workaround broken graphics drivers (going away soon)"
-	depends on DMAR && BROKEN
-	---help---
-	  Current Graphics drivers tend to use physical address
-	  for DMA and avoid using DMA APIs. Setting this config
-	  option permits the IOMMU driver to set a unity map for
-	  all the OS-visible memory. Hence the driver can continue
-	  to use physical addresses for DMA, at least until this
-	  option is removed in the 2.6.32 kernel.
-
-config DMAR_FLOPPY_WA
-	def_bool y
-	depends on DMAR
-	---help---
-	  Floppy disk drivers are known to bypass DMA API calls
-	  thereby failing to work when IOMMU is enabled. This
-	  workaround will setup a 1:1 mapping for the first
-	  16MiB to make floppy (an ISA device) work.
-
-config INTR_REMAP
-	bool "Support for Interrupt Remapping (EXPERIMENTAL)"
-	depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI && EXPERIMENTAL
-	---help---
-	  Supports Interrupt remapping for IO-APIC and MSI devices.
-	  To use x2apic mode in the CPU's which support x2APIC enhancements or
-	  to support platforms with CPU's having > 8 bit APIC ID, say Y.
-
 source "drivers/pci/pcie/Kconfig"
 
 source "drivers/pci/Kconfig"
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 9246c5bf25af..e2a5f141ae2d 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -46,3 +46,52 @@ config AMD_IOMMU_STATS
 	  statistics about whats happening in the driver and exports that
 	  information to userspace via debugfs.
 	  If unsure, say N.
+
+# Intel IOMMU support
+config DMAR
+	bool "Support for DMA Remapping Devices"
+	depends on PCI_MSI && ACPI && (X86 || IA64_GENERIC)
+	select IOMMU_API
+	help
+	  DMA remapping (DMAR) devices support enables independent address
+	  translations for Direct Memory Access (DMA) from devices.
+	  These DMA remapping devices are reported via ACPI tables
+	  and include PCI device scope covered by these DMA
+	  remapping devices.
+
+config DMAR_DEFAULT_ON
+	def_bool y
+	prompt "Enable DMA Remapping Devices by default"
+	depends on DMAR
+	help
+	  Selecting this option will enable a DMAR device at boot time if
+	  one is found. If this option is not selected, DMAR support can
+	  be enabled by passing intel_iommu=on to the kernel.
+
+config DMAR_BROKEN_GFX_WA
+	bool "Workaround broken graphics drivers (going away soon)"
+	depends on DMAR && BROKEN && X86
+	---help---
+	  Current Graphics drivers tend to use physical address
+	  for DMA and avoid using DMA APIs. Setting this config
+	  option permits the IOMMU driver to set a unity map for
+	  all the OS-visible memory. Hence the driver can continue
+	  to use physical addresses for DMA, at least until this
+	  option is removed in the 2.6.32 kernel.
+
+config DMAR_FLOPPY_WA
+	def_bool y
+	depends on DMAR && X86
+	---help---
+	  Floppy disk drivers are known to bypass DMA API calls
+	  thereby failing to work when IOMMU is enabled. This
+	  workaround will setup a 1:1 mapping for the first
+	  16MiB to make floppy (an ISA device) work.
+
+config INTR_REMAP
+	bool "Support for Interrupt Remapping (EXPERIMENTAL)"
+	depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI && EXPERIMENTAL
+	---help---
+	  Supports Interrupt remapping for IO-APIC and MSI devices.
+	  To use x2apic mode in the CPU's which support x2APIC enhancements or
+	  to support platforms with CPU's having > 8 bit APIC ID, say Y.
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 4237eaf84609..49e9c0f46bd5 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -1,3 +1,5 @@
 obj-$(CONFIG_IOMMU_API) += iommu.o
 obj-$(CONFIG_MSM_IOMMU) += msm_iommu.o msm_iommu_dev.o
 obj-$(CONFIG_AMD_IOMMU) += amd_iommu.o
+obj-$(CONFIG_DMAR) += dmar.o iova.o intel-iommu.o
+obj-$(CONFIG_INTR_REMAP) += dmar.o intr_remapping.o
diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c
new file mode 100644
index 000000000000..3dc9befa5aec
--- /dev/null
+++ b/drivers/iommu/dmar.c
@@ -0,0 +1,1461 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) 2006-2008 Intel Corporation
+ * Author: Ashok Raj <ashok.raj@intel.com>
+ * Author: Shaohua Li <shaohua.li@intel.com>
+ * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
+ *
+ * This file implements early detection/parsing of Remapping Devices
+ * reported to OS through BIOS via DMA remapping reporting (DMAR) ACPI
+ * tables.
+ *
+ * These routines are used by both DMA-remapping and Interrupt-remapping
+ */
+
+#include <linux/pci.h>
+#include <linux/dmar.h>
+#include <linux/iova.h>
+#include <linux/intel-iommu.h>
+#include <linux/timer.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/tboot.h>
+#include <linux/dmi.h>
+#include <linux/slab.h>
+#include <asm/iommu_table.h>
+
+#define PREFIX "DMAR: "
+
+/* No locks are needed as DMA remapping hardware unit
+ * list is constructed at boot time and hotplug of
+ * these units are not supported by the architecture.
+ */
+LIST_HEAD(dmar_drhd_units);
+
+static struct acpi_table_header * __initdata dmar_tbl;
+static acpi_size dmar_tbl_size;
+
+static void __init dmar_register_drhd_unit(struct dmar_drhd_unit *drhd)
+{
+	/*
+	 * add INCLUDE_ALL at the tail, so scan the list will find it at
+	 * the very end.
+	 */
+	if (drhd->include_all)
+		list_add_tail(&drhd->list, &dmar_drhd_units);
+	else
+		list_add(&drhd->list, &dmar_drhd_units);
+}
+
+static int __init dmar_parse_one_dev_scope(struct acpi_dmar_device_scope *scope,
+					   struct pci_dev **dev, u16 segment)
+{
+	struct pci_bus *bus;
+	struct pci_dev *pdev = NULL;
+	struct acpi_dmar_pci_path *path;
+	int count;
+
+	bus = pci_find_bus(segment, scope->bus);
+	path = (struct acpi_dmar_pci_path *)(scope + 1);
+	count = (scope->length - sizeof(struct acpi_dmar_device_scope))
+		/ sizeof(struct acpi_dmar_pci_path);
+
+	while (count) {
+		if (pdev)
+			pci_dev_put(pdev);
+		/*
+		 * Some BIOSes list non-exist devices in DMAR table, just
+		 * ignore it
+		 */
+		if (!bus) {
+			printk(KERN_WARNING
+			PREFIX "Device scope bus [%d] not found\n",
+			scope->bus);
+			break;
+		}
+		pdev = pci_get_slot(bus, PCI_DEVFN(path->dev, path->fn));
+		if (!pdev) {
+			printk(KERN_WARNING PREFIX
+			"Device scope device [%04x:%02x:%02x.%02x] not found\n",
+				segment, bus->number, path->dev, path->fn);
+			break;
+		}
+		path ++;
+		count --;
+		bus = pdev->subordinate;
+	}
+	if (!pdev) {
+		printk(KERN_WARNING PREFIX
+		"Device scope device [%04x:%02x:%02x.%02x] not found\n",
+		segment, scope->bus, path->dev, path->fn);
+		*dev = NULL;
+		return 0;
+	}
+	if ((scope->entry_type == ACPI_DMAR_SCOPE_TYPE_ENDPOINT && \
+			pdev->subordinate) || (scope->entry_type == \
+			ACPI_DMAR_SCOPE_TYPE_BRIDGE && !pdev->subordinate)) {
+		pci_dev_put(pdev);
+		printk(KERN_WARNING PREFIX
+			"Device scope type does not match for %s\n",
+			 pci_name(pdev));
+		return -EINVAL;
+	}
+	*dev = pdev;
+	return 0;
+}
+
+static int __init dmar_parse_dev_scope(void *start, void *end, int *cnt,
+				       struct pci_dev ***devices, u16 segment)
+{
+	struct acpi_dmar_device_scope *scope;
+	void * tmp = start;
+	int index;
+	int ret;
+
+	*cnt = 0;
+	while (start < end) {
+		scope = start;
+		if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_ENDPOINT ||
+		    scope->entry_type == ACPI_DMAR_SCOPE_TYPE_BRIDGE)
+			(*cnt)++;
+		else if (scope->entry_type != ACPI_DMAR_SCOPE_TYPE_IOAPIC) {
+			printk(KERN_WARNING PREFIX
+			       "Unsupported device scope\n");
+		}
+		start += scope->length;
+	}
+	if (*cnt == 0)
+		return 0;
+
+	*devices = kcalloc(*cnt, sizeof(struct pci_dev *), GFP_KERNEL);
+	if (!*devices)
+		return -ENOMEM;
+
+	start = tmp;
+	index = 0;
+	while (start < end) {
+		scope = start;
+		if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_ENDPOINT ||
+		    scope->entry_type == ACPI_DMAR_SCOPE_TYPE_BRIDGE) {
+			ret = dmar_parse_one_dev_scope(scope,
+				&(*devices)[index], segment);
+			if (ret) {
+				kfree(*devices);
+				return ret;
+			}
+			index ++;
+		}
+		start += scope->length;
+	}
+
+	return 0;
+}
+
+/**
+ * dmar_parse_one_drhd - parses exactly one DMA remapping hardware definition
+ * structure which uniquely represent one DMA remapping hardware unit
+ * present in the platform
+ */
+static int __init
+dmar_parse_one_drhd(struct acpi_dmar_header *header)
+{
+	struct acpi_dmar_hardware_unit *drhd;
+	struct dmar_drhd_unit *dmaru;
+	int ret = 0;
+
+	drhd = (struct acpi_dmar_hardware_unit *)header;
+	dmaru = kzalloc(sizeof(*dmaru), GFP_KERNEL);
+	if (!dmaru)
+		return -ENOMEM;
+
+	dmaru->hdr = header;
+	dmaru->reg_base_addr = drhd->address;
+	dmaru->segment = drhd->segment;
+	dmaru->include_all = drhd->flags & 0x1; /* BIT0: INCLUDE_ALL */
+
+	ret = alloc_iommu(dmaru);
+	if (ret) {
+		kfree(dmaru);
+		return ret;
+	}
+	dmar_register_drhd_unit(dmaru);
+	return 0;
+}
+
+static int __init dmar_parse_dev(struct dmar_drhd_unit *dmaru)
+{
+	struct acpi_dmar_hardware_unit *drhd;
+	int ret = 0;
+
+	drhd = (struct acpi_dmar_hardware_unit *) dmaru->hdr;
+
+	if (dmaru->include_all)
+		return 0;
+
+	ret = dmar_parse_dev_scope((void *)(drhd + 1),
+				((void *)drhd) + drhd->header.length,
+				&dmaru->devices_cnt, &dmaru->devices,
+				drhd->segment);
+	if (ret) {
+		list_del(&dmaru->list);
+		kfree(dmaru);
+	}
+	return ret;
+}
+
+#ifdef CONFIG_DMAR
+LIST_HEAD(dmar_rmrr_units);
+
+static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
+{
+	list_add(&rmrr->list, &dmar_rmrr_units);
+}
+
+
+static int __init
+dmar_parse_one_rmrr(struct acpi_dmar_header *header)
+{
+	struct acpi_dmar_reserved_memory *rmrr;
+	struct dmar_rmrr_unit *rmrru;
+
+	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
+	if (!rmrru)
+		return -ENOMEM;
+
+	rmrru->hdr = header;
+	rmrr = (struct acpi_dmar_reserved_memory *)header;
+	rmrru->base_address = rmrr->base_address;
+	rmrru->end_address = rmrr->end_address;
+
+	dmar_register_rmrr_unit(rmrru);
+	return 0;
+}
+
+static int __init
+rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
+{
+	struct acpi_dmar_reserved_memory *rmrr;
+	int ret;
+
+	rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
+	ret = dmar_parse_dev_scope((void *)(rmrr + 1),
+		((void *)rmrr) + rmrr->header.length,
+		&rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
+
+	if (ret || (rmrru->devices_cnt == 0)) {
+		list_del(&rmrru->list);
+		kfree(rmrru);
+	}
+	return ret;
+}
+
+static LIST_HEAD(dmar_atsr_units);
+
+static int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
+{
+	struct acpi_dmar_atsr *atsr;
+	struct dmar_atsr_unit *atsru;
+
+	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
+	atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
+	if (!atsru)
+		return -ENOMEM;
+
+	atsru->hdr = hdr;
+	atsru->include_all = atsr->flags & 0x1;
+
+	list_add(&atsru->list, &dmar_atsr_units);
+
+	return 0;
+}
+
+static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
+{
+	int rc;
+	struct acpi_dmar_atsr *atsr;
+
+	if (atsru->include_all)
+		return 0;
+
+	atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
+	rc = dmar_parse_dev_scope((void *)(atsr + 1),
+				(void *)atsr + atsr->header.length,
+				&atsru->devices_cnt, &atsru->devices,
+				atsr->segment);
+	if (rc || !atsru->devices_cnt) {
+		list_del(&atsru->list);
+		kfree(atsru);
+	}
+
+	return rc;
+}
+
+int dmar_find_matched_atsr_unit(struct pci_dev *dev)
+{
+	int i;
+	struct pci_bus *bus;
+	struct acpi_dmar_atsr *atsr;
+	struct dmar_atsr_unit *atsru;
+
+	dev = pci_physfn(dev);
+
+	list_for_each_entry(atsru, &dmar_atsr_units, list) {
+		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
+		if (atsr->segment == pci_domain_nr(dev->bus))
+			goto found;
+	}
+
+	return 0;
+
+found:
+	for (bus = dev->bus; bus; bus = bus->parent) {
+		struct pci_dev *bridge = bus->self;
+
+		if (!bridge || !pci_is_pcie(bridge) ||
+		    bridge->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
+			return 0;
+
+		if (bridge->pcie_type == PCI_EXP_TYPE_ROOT_PORT) {
+			for (i = 0; i < atsru->devices_cnt; i++)
+				if (atsru->devices[i] == bridge)
+					return 1;
+			break;
+		}
+	}
+
+	if (atsru->include_all)
+		return 1;
+
+	return 0;
+}
+#endif
+
+#ifdef CONFIG_ACPI_NUMA
+static int __init
+dmar_parse_one_rhsa(struct acpi_dmar_header *header)
+{
+	struct acpi_dmar_rhsa *rhsa;
+	struct dmar_drhd_unit *drhd;
+
+	rhsa = (struct acpi_dmar_rhsa *)header;
+	for_each_drhd_unit(drhd) {
+		if (drhd->reg_base_addr == rhsa->base_address) {
+			int node = acpi_map_pxm_to_node(rhsa->proximity_domain);
+
+			if (!node_online(node))
+				node = -1;
+			drhd->iommu->node = node;
+			return 0;
+		}
+	}
+	WARN_TAINT(
+		1, TAINT_FIRMWARE_WORKAROUND,
+		"Your BIOS is broken; RHSA refers to non-existent DMAR unit at %llx\n"
+		"BIOS vendor: %s; Ver: %s; Product Version: %s\n",
+		drhd->reg_base_addr,
+		dmi_get_system_info(DMI_BIOS_VENDOR),
+		dmi_get_system_info(DMI_BIOS_VERSION),
+		dmi_get_system_info(DMI_PRODUCT_VERSION));
+
+	return 0;
+}
+#endif
+
+static void __init
+dmar_table_print_dmar_entry(struct acpi_dmar_header *header)
+{
+	struct acpi_dmar_hardware_unit *drhd;
+	struct acpi_dmar_reserved_memory *rmrr;
+	struct acpi_dmar_atsr *atsr;
+	struct acpi_dmar_rhsa *rhsa;
+
+	switch (header->type) {
+	case ACPI_DMAR_TYPE_HARDWARE_UNIT:
+		drhd = container_of(header, struct acpi_dmar_hardware_unit,
+				    header);
+		printk (KERN_INFO PREFIX
+			"DRHD base: %#016Lx flags: %#x\n",
+			(unsigned long long)drhd->address, drhd->flags);
+		break;
+	case ACPI_DMAR_TYPE_RESERVED_MEMORY:
+		rmrr = container_of(header, struct acpi_dmar_reserved_memory,
+				    header);
+		printk (KERN_INFO PREFIX
+			"RMRR base: %#016Lx end: %#016Lx\n",
+			(unsigned long long)rmrr->base_address,
+			(unsigned long long)rmrr->end_address);
+		break;
+	case ACPI_DMAR_TYPE_ATSR:
+		atsr = container_of(header, struct acpi_dmar_atsr, header);
+		printk(KERN_INFO PREFIX "ATSR flags: %#x\n", atsr->flags);
+		break;
+	case ACPI_DMAR_HARDWARE_AFFINITY:
+		rhsa = container_of(header, struct acpi_dmar_rhsa, header);
+		printk(KERN_INFO PREFIX "RHSA base: %#016Lx proximity domain: %#x\n",
+		       (unsigned long long)rhsa->base_address,
+		       rhsa->proximity_domain);
+		break;
+	}
+}
+
+/**
+ * dmar_table_detect - checks to see if the platform supports DMAR devices
+ */
+static int __init dmar_table_detect(void)
+{
+	acpi_status status = AE_OK;
+
+	/* if we could find DMAR table, then there are DMAR devices */
+	status = acpi_get_table_with_size(ACPI_SIG_DMAR, 0,
+				(struct acpi_table_header **)&dmar_tbl,
+				&dmar_tbl_size);
+
+	if (ACPI_SUCCESS(status) && !dmar_tbl) {
+		printk (KERN_WARNING PREFIX "Unable to map DMAR\n");
+		status = AE_NOT_FOUND;
+	}
+
+	return (ACPI_SUCCESS(status) ? 1 : 0);
+}
+
+/**
+ * parse_dmar_table - parses the DMA reporting table
+ */
+static int __init
+parse_dmar_table(void)
+{
+	struct acpi_table_dmar *dmar;
+	struct acpi_dmar_header *entry_header;
+	int ret = 0;
+
+	/*
+	 * Do it again, earlier dmar_tbl mapping could be mapped with
+	 * fixed map.
+	 */
+	dmar_table_detect();
+
+	/*
+	 * ACPI tables may not be DMA protected by tboot, so use DMAR copy
+	 * SINIT saved in SinitMleData in TXT heap (which is DMA protected)
+	 */
+	dmar_tbl = tboot_get_dmar_table(dmar_tbl);
+
+	dmar = (struct acpi_table_dmar *)dmar_tbl;
+	if (!dmar)
+		return -ENODEV;
+
+	if (dmar->width < PAGE_SHIFT - 1) {
+		printk(KERN_WARNING PREFIX "Invalid DMAR haw\n");
+		return -EINVAL;
+	}
+
+	printk (KERN_INFO PREFIX "Host address width %d\n",
+		dmar->width + 1);
+
+	entry_header = (struct acpi_dmar_header *)(dmar + 1);
+	while (((unsigned long)entry_header) <
+			(((unsigned long)dmar) + dmar_tbl->length)) {
+		/* Avoid looping forever on bad ACPI tables */
+		if (entry_header->length == 0) {
+			printk(KERN_WARNING PREFIX
+				"Invalid 0-length structure\n");
+			ret = -EINVAL;
+			break;
+		}
+
+		dmar_table_print_dmar_entry(entry_header);
+
+		switch (entry_header->type) {
+		case ACPI_DMAR_TYPE_HARDWARE_UNIT:
+			ret = dmar_parse_one_drhd(entry_header);
+			break;
+		case ACPI_DMAR_TYPE_RESERVED_MEMORY:
+#ifdef CONFIG_DMAR
+			ret = dmar_parse_one_rmrr(entry_header);
+#endif
+			break;
+		case ACPI_DMAR_TYPE_ATSR:
+#ifdef CONFIG_DMAR
+			ret = dmar_parse_one_atsr(entry_header);
+#endif
+			break;
+		case ACPI_DMAR_HARDWARE_AFFINITY:
+#ifdef CONFIG_ACPI_NUMA
+			ret = dmar_parse_one_rhsa(entry_header);
+#endif
+			break;
+		default:
+			printk(KERN_WARNING PREFIX
+				"Unknown DMAR structure type %d\n",
+				entry_header->type);
+			ret = 0; /* for forward compatibility */
+			break;
+		}
+		if (ret)
+			break;
+
+		entry_header = ((void *)entry_header + entry_header->length);
+	}
+	return ret;
+}
+
+static int dmar_pci_device_match(struct pci_dev *devices[], int cnt,
+			  struct pci_dev *dev)
+{
+	int index;
+
+	while (dev) {
+		for (index = 0; index < cnt; index++)
+			if (dev == devices[index])
+				return 1;
+
+		/* Check our parent */
+		dev = dev->bus->self;
+	}
+
+	return 0;
+}
+
+struct dmar_drhd_unit *
+dmar_find_matched_drhd_unit(struct pci_dev *dev)
+{
+	struct dmar_drhd_unit *dmaru = NULL;
+	struct acpi_dmar_hardware_unit *drhd;
+
+	dev = pci_physfn(dev);
+
+	list_for_each_entry(dmaru, &dmar_drhd_units, list) {
+		drhd = container_of(dmaru->hdr,
+				    struct acpi_dmar_hardware_unit,
+				    header);
+
+		if (dmaru->include_all &&
+		    drhd->segment == pci_domain_nr(dev->bus))
+			return dmaru;
+
+		if (dmar_pci_device_match(dmaru->devices,
+					  dmaru->devices_cnt, dev))
+			return dmaru;
+	}
+
+	return NULL;
+}
+
+int __init dmar_dev_scope_init(void)
+{
+	struct dmar_drhd_unit *drhd, *drhd_n;
+	int ret = -ENODEV;
+
+	list_for_each_entry_safe(drhd, drhd_n, &dmar_drhd_units, list) {
+		ret = dmar_parse_dev(drhd);
+		if (ret)
+			return ret;
+	}
+
+#ifdef CONFIG_DMAR
+	{
+		struct dmar_rmrr_unit *rmrr, *rmrr_n;
+		struct dmar_atsr_unit *atsr, *atsr_n;
+
+		list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
+			ret = rmrr_parse_dev(rmrr);
+			if (ret)
+				return ret;
+		}
+
+		list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
+			ret = atsr_parse_dev(atsr);
+			if (ret)
+				return ret;
+		}
+	}
+#endif
+
+	return ret;
+}
+
+
+int __init dmar_table_init(void)
+{
+	static int dmar_table_initialized;
+	int ret;
+
+	if (dmar_table_initialized)
+		return 0;
+
+	dmar_table_initialized = 1;
+
+	ret = parse_dmar_table();
+	if (ret) {
+		if (ret != -ENODEV)
+			printk(KERN_INFO PREFIX "parse DMAR table failure.\n");
+		return ret;
+	}
+
+	if (list_empty(&dmar_drhd_units)) {
+		printk(KERN_INFO PREFIX "No DMAR devices found\n");
+		return -ENODEV;
+	}
+
+#ifdef CONFIG_DMAR
+	if (list_empty(&dmar_rmrr_units))
+		printk(KERN_INFO PREFIX "No RMRR found\n");
+
+	if (list_empty(&dmar_atsr_units))
+		printk(KERN_INFO PREFIX "No ATSR found\n");
+#endif
+
+	return 0;
+}
+
+static void warn_invalid_dmar(u64 addr, const char *message)
+{
+	WARN_TAINT_ONCE(
+		1, TAINT_FIRMWARE_WORKAROUND,
+		"Your BIOS is broken; DMAR reported at address %llx%s!\n"
+		"BIOS vendor: %s; Ver: %s; Product Version: %s\n",
+		addr, message,
+		dmi_get_system_info(DMI_BIOS_VENDOR),
+		dmi_get_system_info(DMI_BIOS_VERSION),
+		dmi_get_system_info(DMI_PRODUCT_VERSION));
+}
+
+int __init check_zero_address(void)
+{
+	struct acpi_table_dmar *dmar;
+	struct acpi_dmar_header *entry_header;
+	struct acpi_dmar_hardware_unit *drhd;
+
+	dmar = (struct acpi_table_dmar *)dmar_tbl;
+	entry_header = (struct acpi_dmar_header *)(dmar + 1);
+
+	while (((unsigned long)entry_header) <
+			(((unsigned long)dmar) + dmar_tbl->length)) {
+		/* Avoid looping forever on bad ACPI tables */
+		if (entry_header->length == 0) {
+			printk(KERN_WARNING PREFIX
+				"Invalid 0-length structure\n");
+			return 0;
+		}
+
+		if (entry_header->type == ACPI_DMAR_TYPE_HARDWARE_UNIT) {
+			void __iomem *addr;
+			u64 cap, ecap;
+
+			drhd = (void *)entry_header;
+			if (!drhd->address) {
+				warn_invalid_dmar(0, "");
+				goto failed;
+			}
+
+			addr = early_ioremap(drhd->address, VTD_PAGE_SIZE);
+			if (!addr ) {
+				printk("IOMMU: can't validate: %llx\n", drhd->address);
+				goto failed;
+			}
+			cap = dmar_readq(addr + DMAR_CAP_REG);
+			ecap = dmar_readq(addr + DMAR_ECAP_REG);
+			early_iounmap(addr, VTD_PAGE_SIZE);
+			if (cap == (uint64_t)-1 && ecap == (uint64_t)-1) {
+				warn_invalid_dmar(drhd->address,
+						  " returns all ones");
+				goto failed;
+			}
+		}
+
+		entry_header = ((void *)entry_header + entry_header->length);
+	}
+	return 1;
+
+failed:
+#ifdef CONFIG_DMAR
+	dmar_disabled = 1;
+#endif
+	return 0;
+}
+
+int __init detect_intel_iommu(void)
+{
+	int ret;
+
+	ret = dmar_table_detect();
+	if (ret)
+		ret = check_zero_address();
+	{
+#ifdef CONFIG_INTR_REMAP
+		struct acpi_table_dmar *dmar;
+
+		dmar = (struct acpi_table_dmar *) dmar_tbl;
+		if (ret && cpu_has_x2apic && dmar->flags & 0x1)
+			printk(KERN_INFO
+			       "Queued invalidation will be enabled to support "
+			       "x2apic and Intr-remapping.\n");
+#endif
+#ifdef CONFIG_DMAR
+		if (ret && !no_iommu && !iommu_detected && !dmar_disabled) {
+			iommu_detected = 1;
+			/* Make sure ACS will be enabled */
+			pci_request_acs();
+		}
+#endif
+#ifdef CONFIG_X86
+		if (ret)
+			x86_init.iommu.iommu_init = intel_iommu_init;
+#endif
+	}
+	early_acpi_os_unmap_memory(dmar_tbl, dmar_tbl_size);
+	dmar_tbl = NULL;
+
+	return ret ? 1 : -ENODEV;
+}
+
+
+int alloc_iommu(struct dmar_drhd_unit *drhd)
+{
+	struct intel_iommu *iommu;
+	int map_size;
+	u32 ver;
+	static int iommu_allocated = 0;
+	int agaw = 0;
+	int msagaw = 0;
+
+	if (!drhd->reg_base_addr) {
+		warn_invalid_dmar(0, "");
+		return -EINVAL;
+	}
+
+	iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
+	if (!iommu)
+		return -ENOMEM;
+
+	iommu->seq_id = iommu_allocated++;
+	sprintf (iommu->name, "dmar%d", iommu->seq_id);
+
+	iommu->reg = ioremap(drhd->reg_base_addr, VTD_PAGE_SIZE);
+	if (!iommu->reg) {
+		printk(KERN_ERR "IOMMU: can't map the region\n");
+		goto error;
+	}
+	iommu->cap = dmar_readq(iommu->reg + DMAR_CAP_REG);
+	iommu->ecap = dmar_readq(iommu->reg + DMAR_ECAP_REG);
+
+	if (iommu->cap == (uint64_t)-1 && iommu->ecap == (uint64_t)-1) {
+		warn_invalid_dmar(drhd->reg_base_addr, " returns all ones");
+		goto err_unmap;
+	}
+
+#ifdef CONFIG_DMAR
+	agaw = iommu_calculate_agaw(iommu);
+	if (agaw < 0) {
+		printk(KERN_ERR
+		       "Cannot get a valid agaw for iommu (seq_id = %d)\n",
+		       iommu->seq_id);
+		goto err_unmap;
+	}
+	msagaw = iommu_calculate_max_sagaw(iommu);
+	if (msagaw < 0) {
+		printk(KERN_ERR
+			"Cannot get a valid max agaw for iommu (seq_id = %d)\n",
+			iommu->seq_id);
+		goto err_unmap;
+	}
+#endif
+	iommu->agaw = agaw;
+	iommu->msagaw = msagaw;
+
+	iommu->node = -1;
+
+	/* the registers might be more than one page */
+	map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap),
+		cap_max_fault_reg_offset(iommu->cap));
+	map_size = VTD_PAGE_ALIGN(map_size);
+	if (map_size > VTD_PAGE_SIZE) {
+		iounmap(iommu->reg);
+		iommu->reg = ioremap(drhd->reg_base_addr, map_size);
+		if (!iommu->reg) {
+			printk(KERN_ERR "IOMMU: can't map the region\n");
+			goto error;
+		}
+	}
+
+	ver = readl(iommu->reg + DMAR_VER_REG);
+	pr_info("IOMMU %d: reg_base_addr %llx ver %d:%d cap %llx ecap %llx\n",
+		iommu->seq_id,
+		(unsigned long long)drhd->reg_base_addr,
+		DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver),
+		(unsigned long long)iommu->cap,
+		(unsigned long long)iommu->ecap);
+
+	spin_lock_init(&iommu->register_lock);
+
+	drhd->iommu = iommu;
+	return 0;
+
+ err_unmap:
+	iounmap(iommu->reg);
+ error:
+	kfree(iommu);
+	return -1;
+}
+
+void free_iommu(struct intel_iommu *iommu)
+{
+	if (!iommu)
+		return;
+
+#ifdef CONFIG_DMAR
+	free_dmar_iommu(iommu);
+#endif
+
+	if (iommu->reg)
+		iounmap(iommu->reg);
+	kfree(iommu);
+}
+
+/*
+ * Reclaim all the submitted descriptors which have completed its work.
+ */
+static inline void reclaim_free_desc(struct q_inval *qi)
+{
+	while (qi->desc_status[qi->free_tail] == QI_DONE ||
+	       qi->desc_status[qi->free_tail] == QI_ABORT) {
+		qi->desc_status[qi->free_tail] = QI_FREE;
+		qi->free_tail = (qi->free_tail + 1) % QI_LENGTH;
+		qi->free_cnt++;
+	}
+}
+
+static int qi_check_fault(struct intel_iommu *iommu, int index)
+{
+	u32 fault;
+	int head, tail;
+	struct q_inval *qi = iommu->qi;
+	int wait_index = (index + 1) % QI_LENGTH;
+
+	if (qi->desc_status[wait_index] == QI_ABORT)
+		return -EAGAIN;
+
+	fault = readl(iommu->reg + DMAR_FSTS_REG);
+
+	/*
+	 * If IQE happens, the head points to the descriptor associated
+	 * with the error. No new descriptors are fetched until the IQE
+	 * is cleared.
+	 */
+	if (fault & DMA_FSTS_IQE) {
+		head = readl(iommu->reg + DMAR_IQH_REG);
+		if ((head >> DMAR_IQ_SHIFT) == index) {
+			printk(KERN_ERR "VT-d detected invalid descriptor: "
+				"low=%llx, high=%llx\n",
+				(unsigned long long)qi->desc[index].low,
+				(unsigned long long)qi->desc[index].high);
+			memcpy(&qi->desc[index], &qi->desc[wait_index],
+					sizeof(struct qi_desc));
+			__iommu_flush_cache(iommu, &qi->desc[index],
+					sizeof(struct qi_desc));
+			writel(DMA_FSTS_IQE, iommu->reg + DMAR_FSTS_REG);
+			return -EINVAL;
+		}
+	}
+
+	/*
+	 * If ITE happens, all pending wait_desc commands are aborted.
+	 * No new descriptors are fetched until the ITE is cleared.
+	 */
+	if (fault & DMA_FSTS_ITE) {
+		head = readl(iommu->reg + DMAR_IQH_REG);
+		head = ((head >> DMAR_IQ_SHIFT) - 1 + QI_LENGTH) % QI_LENGTH;
+		head |= 1;
+		tail = readl(iommu->reg + DMAR_IQT_REG);
+		tail = ((tail >> DMAR_IQ_SHIFT) - 1 + QI_LENGTH) % QI_LENGTH;
+
+		writel(DMA_FSTS_ITE, iommu->reg + DMAR_FSTS_REG);
+
+		do {
+			if (qi->desc_status[head] == QI_IN_USE)
+				qi->desc_status[head] = QI_ABORT;
+			head = (head - 2 + QI_LENGTH) % QI_LENGTH;
+		} while (head != tail);
+
+		if (qi->desc_status[wait_index] == QI_ABORT)
+			return -EAGAIN;
+	}
+
+	if (fault & DMA_FSTS_ICE)
+		writel(DMA_FSTS_ICE, iommu->reg + DMAR_FSTS_REG);
+
+	return 0;
+}
+
+/*
+ * Submit the queued invalidation descriptor to the remapping
+ * hardware unit and wait for its completion.
+ */
+int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu)
+{
+	int rc;
+	struct q_inval *qi = iommu->qi;
+	struct qi_desc *hw, wait_desc;
+	int wait_index, index;
+	unsigned long flags;
+
+	if (!qi)
+		return 0;
+
+	hw = qi->desc;
+
+restart:
+	rc = 0;
+
+	spin_lock_irqsave(&qi->q_lock, flags);
+	while (qi->free_cnt < 3) {
+		spin_unlock_irqrestore(&qi->q_lock, flags);
+		cpu_relax();
+		spin_lock_irqsave(&qi->q_lock, flags);
+	}
+
+	index = qi->free_head;
+	wait_index = (index + 1) % QI_LENGTH;
+
+	qi->desc_status[index] = qi->desc_status[wait_index] = QI_IN_USE;
+
+	hw[index] = *desc;
+
+	wait_desc.low = QI_IWD_STATUS_DATA(QI_DONE) |
+			QI_IWD_STATUS_WRITE | QI_IWD_TYPE;
+	wait_desc.high = virt_to_phys(&qi->desc_status[wait_index]);
+
+	hw[wait_index] = wait_desc;
+
+	__iommu_flush_cache(iommu, &hw[index], sizeof(struct qi_desc));
+	__iommu_flush_cache(iommu, &hw[wait_index], sizeof(struct qi_desc));
+
+	qi->free_head = (qi->free_head + 2) % QI_LENGTH;
+	qi->free_cnt -= 2;
+
+	/*
+	 * update the HW tail register indicating the presence of
+	 * new descriptors.
+	 */
+	writel(qi->free_head << DMAR_IQ_SHIFT, iommu->reg + DMAR_IQT_REG);
+
+	while (qi->desc_status[wait_index] != QI_DONE) {
+		/*
+		 * We will leave the interrupts disabled, to prevent interrupt
+		 * context to queue another cmd while a cmd is already submitted
+		 * and waiting for completion on this cpu. This is to avoid
+		 * a deadlock where the interrupt context can wait indefinitely
+		 * for free slots in the queue.
+		 */
+		rc = qi_check_fault(iommu, index);
+		if (rc)
+			break;
+
+		spin_unlock(&qi->q_lock);
+		cpu_relax();
+		spin_lock(&qi->q_lock);
+	}
+
+	qi->desc_status[index] = QI_DONE;
+
+	reclaim_free_desc(qi);
+	spin_unlock_irqrestore(&qi->q_lock, flags);
+
+	if (rc == -EAGAIN)
+		goto restart;
+
+	return rc;
+}
+
+/*
+ * Flush the global interrupt entry cache.
+ */
+void qi_global_iec(struct intel_iommu *iommu)
+{
+	struct qi_desc desc;
+
+	desc.low = QI_IEC_TYPE;
+	desc.high = 0;
+
+	/* should never fail */
+	qi_submit_sync(&desc, iommu);
+}
+
+void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm,
+		      u64 type)
+{
+	struct qi_desc desc;
+
+	desc.low = QI_CC_FM(fm) | QI_CC_SID(sid) | QI_CC_DID(did)
+			| QI_CC_GRAN(type) | QI_CC_TYPE;
+	desc.high = 0;
+
+	qi_submit_sync(&desc, iommu);
+}
+
+void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
+		    unsigned int size_order, u64 type)
+{
+	u8 dw = 0, dr = 0;
+
+	struct qi_desc desc;
+	int ih = 0;
+
+	if (cap_write_drain(iommu->cap))
+		dw = 1;
+
+	if (cap_read_drain(iommu->cap))
+		dr = 1;
+
+	desc.low = QI_IOTLB_DID(did) | QI_IOTLB_DR(dr) | QI_IOTLB_DW(dw)
+		| QI_IOTLB_GRAN(type) | QI_IOTLB_TYPE;
+	desc.high = QI_IOTLB_ADDR(addr) | QI_IOTLB_IH(ih)
+		| QI_IOTLB_AM(size_order);
+
+	qi_submit_sync(&desc, iommu);
+}
+
+void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qdep,
+			u64 addr, unsigned mask)
+{
+	struct qi_desc desc;
+
+	if (mask) {
+		BUG_ON(addr & ((1 << (VTD_PAGE_SHIFT + mask)) - 1));
+		addr |= (1 << (VTD_PAGE_SHIFT + mask - 1)) - 1;
+		desc.high = QI_DEV_IOTLB_ADDR(addr) | QI_DEV_IOTLB_SIZE;
+	} else
+		desc.high = QI_DEV_IOTLB_ADDR(addr);
+
+	if (qdep >= QI_DEV_IOTLB_MAX_INVS)
+		qdep = 0;
+
+	desc.low = QI_DEV_IOTLB_SID(sid) | QI_DEV_IOTLB_QDEP(qdep) |
+		   QI_DIOTLB_TYPE;
+
+	qi_submit_sync(&desc, iommu);
+}
+
+/*
+ * Disable Queued Invalidation interface.
+ */
+void dmar_disable_qi(struct intel_iommu *iommu)
+{
+	unsigned long flags;
+	u32 sts;
+	cycles_t start_time = get_cycles();
+
+	if (!ecap_qis(iommu->ecap))
+		return;
+
+	spin_lock_irqsave(&iommu->register_lock, flags);
+
+	sts =  dmar_readq(iommu->reg + DMAR_GSTS_REG);
+	if (!(sts & DMA_GSTS_QIES))
+		goto end;
+
+	/*
+	 * Give a chance to HW to complete the pending invalidation requests.
+	 */
+	while ((readl(iommu->reg + DMAR_IQT_REG) !=
+		readl(iommu->reg + DMAR_IQH_REG)) &&
+		(DMAR_OPERATION_TIMEOUT > (get_cycles() - start_time)))
+		cpu_relax();
+
+	iommu->gcmd &= ~DMA_GCMD_QIE;
+	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
+
+	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl,
+		      !(sts & DMA_GSTS_QIES), sts);
+end:
+	spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+/*
+ * Enable queued invalidation.
+ */
+static void __dmar_enable_qi(struct intel_iommu *iommu)
+{
+	u32 sts;
+	unsigned long flags;
+	struct q_inval *qi = iommu->qi;
+
+	qi->free_head = qi->free_tail = 0;
+	qi->free_cnt = QI_LENGTH;
+
+	spin_lock_irqsave(&iommu->register_lock, flags);
+
+	/* write zero to the tail reg */
+	writel(0, iommu->reg + DMAR_IQT_REG);
+
+	dmar_writeq(iommu->reg + DMAR_IQA_REG, virt_to_phys(qi->desc));
+
+	iommu->gcmd |= DMA_GCMD_QIE;
+	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
+
+	/* Make sure hardware complete it */
+	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl, (sts & DMA_GSTS_QIES), sts);
+
+	spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+/*
+ * Enable Queued Invalidation interface. This is a must to support
+ * interrupt-remapping. Also used by DMA-remapping, which replaces
+ * register based IOTLB invalidation.
+ */
+int dmar_enable_qi(struct intel_iommu *iommu)
+{
+	struct q_inval *qi;
+	struct page *desc_page;
+
+	if (!ecap_qis(iommu->ecap))
+		return -ENOENT;
+
+	/*
+	 * queued invalidation is already setup and enabled.
+	 */
+	if (iommu->qi)
+		return 0;
+
+	iommu->qi = kmalloc(sizeof(*qi), GFP_ATOMIC);
+	if (!iommu->qi)
+		return -ENOMEM;
+
+	qi = iommu->qi;
+
+
+	desc_page = alloc_pages_node(iommu->node, GFP_ATOMIC | __GFP_ZERO, 0);
+	if (!desc_page) {
+		kfree(qi);
+		iommu->qi = 0;
+		return -ENOMEM;
+	}
+
+	qi->desc = page_address(desc_page);
+
+	qi->desc_status = kmalloc(QI_LENGTH * sizeof(int), GFP_ATOMIC);
+	if (!qi->desc_status) {
+		free_page((unsigned long) qi->desc);
+		kfree(qi);
+		iommu->qi = 0;
+		return -ENOMEM;
+	}
+
+	qi->free_head = qi->free_tail = 0;
+	qi->free_cnt = QI_LENGTH;
+
+	spin_lock_init(&qi->q_lock);
+
+	__dmar_enable_qi(iommu);
+
+	return 0;
+}
+
+/* iommu interrupt handling. Most stuff are MSI-like. */
+
+enum faulttype {
+	DMA_REMAP,
+	INTR_REMAP,
+	UNKNOWN,
+};
+
+static const char *dma_remap_fault_reasons[] =
+{
+	"Software",
+	"Present bit in root entry is clear",
+	"Present bit in context entry is clear",
+	"Invalid context entry",
+	"Access beyond MGAW",
+	"PTE Write access is not set",
+	"PTE Read access is not set",
+	"Next page table ptr is invalid",
+	"Root table address invalid",
+	"Context table ptr is invalid",
+	"non-zero reserved fields in RTP",
+	"non-zero reserved fields in CTP",
+	"non-zero reserved fields in PTE",
+};
+
+static const char *intr_remap_fault_reasons[] =
+{
+	"Detected reserved fields in the decoded interrupt-remapped request",
+	"Interrupt index exceeded the interrupt-remapping table size",
+	"Present field in the IRTE entry is clear",
+	"Error accessing interrupt-remapping table pointed by IRTA_REG",
+	"Detected reserved fields in the IRTE entry",
+	"Blocked a compatibility format interrupt request",
+	"Blocked an interrupt request due to source-id verification failure",
+};
+
+#define MAX_FAULT_REASON_IDX 	(ARRAY_SIZE(fault_reason_strings) - 1)
+
+const char *dmar_get_fault_reason(u8 fault_reason, int *fault_type)
+{
+	if (fault_reason >= 0x20 && (fault_reason <= 0x20 +
+				     ARRAY_SIZE(intr_remap_fault_reasons))) {
+		*fault_type = INTR_REMAP;
+		return intr_remap_fault_reasons[fault_reason - 0x20];
+	} else if (fault_reason < ARRAY_SIZE(dma_remap_fault_reasons)) {
+		*fault_type = DMA_REMAP;
+		return dma_remap_fault_reasons[fault_reason];
+	} else {
+		*fault_type = UNKNOWN;
+		return "Unknown";
+	}
+}
+
+void dmar_msi_unmask(struct irq_data *data)
+{
+	struct intel_iommu *iommu = irq_data_get_irq_handler_data(data);
+	unsigned long flag;
+
+	/* unmask it */
+	spin_lock_irqsave(&iommu->register_lock, flag);
+	writel(0, iommu->reg + DMAR_FECTL_REG);
+	/* Read a reg to force flush the post write */
+	readl(iommu->reg + DMAR_FECTL_REG);
+	spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+void dmar_msi_mask(struct irq_data *data)
+{
+	unsigned long flag;
+	struct intel_iommu *iommu = irq_data_get_irq_handler_data(data);
+
+	/* mask it */
+	spin_lock_irqsave(&iommu->register_lock, flag);
+	writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
+	/* Read a reg to force flush the post write */
+	readl(iommu->reg + DMAR_FECTL_REG);
+	spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+void dmar_msi_write(int irq, struct msi_msg *msg)
+{
+	struct intel_iommu *iommu = irq_get_handler_data(irq);
+	unsigned long flag;
+
+	spin_lock_irqsave(&iommu->register_lock, flag);
+	writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
+	writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
+	writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
+	spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+void dmar_msi_read(int irq, struct msi_msg *msg)
+{
+	struct intel_iommu *iommu = irq_get_handler_data(irq);
+	unsigned long flag;
+
+	spin_lock_irqsave(&iommu->register_lock, flag);
+	msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
+	msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
+	msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
+	spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+static int dmar_fault_do_one(struct intel_iommu *iommu, int type,
+		u8 fault_reason, u16 source_id, unsigned long long addr)
+{
+	const char *reason;
+	int fault_type;
+
+	reason = dmar_get_fault_reason(fault_reason, &fault_type);
+
+	if (fault_type == INTR_REMAP)
+		printk(KERN_ERR "INTR-REMAP: Request device [[%02x:%02x.%d] "
+		       "fault index %llx\n"
+			"INTR-REMAP:[fault reason %02d] %s\n",
+			(source_id >> 8), PCI_SLOT(source_id & 0xFF),
+			PCI_FUNC(source_id & 0xFF), addr >> 48,
+			fault_reason, reason);
+	else
+		printk(KERN_ERR
+		       "DMAR:[%s] Request device [%02x:%02x.%d] "
+		       "fault addr %llx \n"
+		       "DMAR:[fault reason %02d] %s\n",
+		       (type ? "DMA Read" : "DMA Write"),
+		       (source_id >> 8), PCI_SLOT(source_id & 0xFF),
+		       PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
+	return 0;
+}
+
+#define PRIMARY_FAULT_REG_LEN (16)
+irqreturn_t dmar_fault(int irq, void *dev_id)
+{
+	struct intel_iommu *iommu = dev_id;
+	int reg, fault_index;
+	u32 fault_status;
+	unsigned long flag;
+
+	spin_lock_irqsave(&iommu->register_lock, flag);
+	fault_status = readl(iommu->reg + DMAR_FSTS_REG);
+	if (fault_status)
+		printk(KERN_ERR "DRHD: handling fault status reg %x\n",
+		       fault_status);
+
+	/* TBD: ignore advanced fault log currently */
+	if (!(fault_status & DMA_FSTS_PPF))
+		goto clear_rest;
+
+	fault_index = dma_fsts_fault_record_index(fault_status);
+	reg = cap_fault_reg_offset(iommu->cap);
+	while (1) {
+		u8 fault_reason;
+		u16 source_id;
+		u64 guest_addr;
+		int type;
+		u32 data;
+
+		/* highest 32 bits */
+		data = readl(iommu->reg + reg +
+				fault_index * PRIMARY_FAULT_REG_LEN + 12);
+		if (!(data & DMA_FRCD_F))
+			break;
+
+		fault_reason = dma_frcd_fault_reason(data);
+		type = dma_frcd_type(data);
+
+		data = readl(iommu->reg + reg +
+				fault_index * PRIMARY_FAULT_REG_LEN + 8);
+		source_id = dma_frcd_source_id(data);
+
+		guest_addr = dmar_readq(iommu->reg + reg +
+				fault_index * PRIMARY_FAULT_REG_LEN);
+		guest_addr = dma_frcd_page_addr(guest_addr);
+		/* clear the fault */
+		writel(DMA_FRCD_F, iommu->reg + reg +
+			fault_index * PRIMARY_FAULT_REG_LEN + 12);
+
+		spin_unlock_irqrestore(&iommu->register_lock, flag);
+
+		dmar_fault_do_one(iommu, type, fault_reason,
+				source_id, guest_addr);
+
+		fault_index++;
+		if (fault_index >= cap_num_fault_regs(iommu->cap))
+			fault_index = 0;
+		spin_lock_irqsave(&iommu->register_lock, flag);
+	}
+clear_rest:
+	/* clear all the other faults */
+	fault_status = readl(iommu->reg + DMAR_FSTS_REG);
+	writel(fault_status, iommu->reg + DMAR_FSTS_REG);
+
+	spin_unlock_irqrestore(&iommu->register_lock, flag);
+	return IRQ_HANDLED;
+}
+
+int dmar_set_interrupt(struct intel_iommu *iommu)
+{
+	int irq, ret;
+
+	/*
+	 * Check if the fault interrupt is already initialized.
+	 */
+	if (iommu->irq)
+		return 0;
+
+	irq = create_irq();
+	if (!irq) {
+		printk(KERN_ERR "IOMMU: no free vectors\n");
+		return -EINVAL;
+	}
+
+	irq_set_handler_data(irq, iommu);
+	iommu->irq = irq;
+
+	ret = arch_setup_dmar_msi(irq);
+	if (ret) {
+		irq_set_handler_data(irq, NULL);
+		iommu->irq = 0;
+		destroy_irq(irq);
+		return ret;
+	}
+
+	ret = request_irq(irq, dmar_fault, 0, iommu->name, iommu);
+	if (ret)
+		printk(KERN_ERR "IOMMU: can't request irq\n");
+	return ret;
+}
+
+int __init enable_drhd_fault_handling(void)
+{
+	struct dmar_drhd_unit *drhd;
+
+	/*
+	 * Enable fault control interrupt.
+	 */
+	for_each_drhd_unit(drhd) {
+		int ret;
+		struct intel_iommu *iommu = drhd->iommu;
+		ret = dmar_set_interrupt(iommu);
+
+		if (ret) {
+			printk(KERN_ERR "DRHD %Lx: failed to enable fault, "
+			       " interrupt, ret %d\n",
+			       (unsigned long long)drhd->reg_base_addr, ret);
+			return -1;
+		}
+
+		/*
+		 * Clear any previous faults.
+		 */
+		dmar_fault(iommu->irq, iommu);
+	}
+
+	return 0;
+}
+
+/*
+ * Re-enable Queued Invalidation interface.
+ */
+int dmar_reenable_qi(struct intel_iommu *iommu)
+{
+	if (!ecap_qis(iommu->ecap))
+		return -ENOENT;
+
+	if (!iommu->qi)
+		return -ENOENT;
+
+	/*
+	 * First disable queued invalidation.
+	 */
+	dmar_disable_qi(iommu);
+	/*
+	 * Then enable queued invalidation again. Since there is no pending
+	 * invalidation requests now, it's safe to re-enable queued
+	 * invalidation.
+	 */
+	__dmar_enable_qi(iommu);
+
+	return 0;
+}
+
+/*
+ * Check interrupt remapping support in DMAR table description.
+ */
+int __init dmar_ir_support(void)
+{
+	struct acpi_table_dmar *dmar;
+	dmar = (struct acpi_table_dmar *)dmar_tbl;
+	if (!dmar)
+		return 0;
+	return dmar->flags & 0x1;
+}
+IOMMU_INIT_POST(detect_intel_iommu);
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
new file mode 100644
index 000000000000..c621c98c99da
--- /dev/null
+++ b/drivers/iommu/intel-iommu.c
@@ -0,0 +1,4016 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) 2006-2008 Intel Corporation
+ * Author: Ashok Raj <ashok.raj@intel.com>
+ * Author: Shaohua Li <shaohua.li@intel.com>
+ * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
+ * Author: Fenghua Yu <fenghua.yu@intel.com>
+ */
+
+#include <linux/init.h>
+#include <linux/bitmap.h>
+#include <linux/debugfs.h>
+#include <linux/slab.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/dmar.h>
+#include <linux/dma-mapping.h>
+#include <linux/mempool.h>
+#include <linux/timer.h>
+#include <linux/iova.h>
+#include <linux/iommu.h>
+#include <linux/intel-iommu.h>
+#include <linux/syscore_ops.h>
+#include <linux/tboot.h>
+#include <linux/dmi.h>
+#include <linux/pci-ats.h>
+#include <asm/cacheflush.h>
+#include <asm/iommu.h>
+
+#define ROOT_SIZE		VTD_PAGE_SIZE
+#define CONTEXT_SIZE		VTD_PAGE_SIZE
+
+#define IS_BRIDGE_HOST_DEVICE(pdev) \
+			    ((pdev->class >> 8) == PCI_CLASS_BRIDGE_HOST)
+#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
+#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
+#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
+
+#define IOAPIC_RANGE_START	(0xfee00000)
+#define IOAPIC_RANGE_END	(0xfeefffff)
+#define IOVA_START_ADDR		(0x1000)
+
+#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
+
+#define MAX_AGAW_WIDTH 64
+
+#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
+#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
+
+/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
+   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
+#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
+				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
+#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
+
+#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
+#define DMA_32BIT_PFN		IOVA_PFN(DMA_BIT_MASK(32))
+#define DMA_64BIT_PFN		IOVA_PFN(DMA_BIT_MASK(64))
+
+/* page table handling */
+#define LEVEL_STRIDE		(9)
+#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
+
+static inline int agaw_to_level(int agaw)
+{
+	return agaw + 2;
+}
+
+static inline int agaw_to_width(int agaw)
+{
+	return 30 + agaw * LEVEL_STRIDE;
+}
+
+static inline int width_to_agaw(int width)
+{
+	return (width - 30) / LEVEL_STRIDE;
+}
+
+static inline unsigned int level_to_offset_bits(int level)
+{
+	return (level - 1) * LEVEL_STRIDE;
+}
+
+static inline int pfn_level_offset(unsigned long pfn, int level)
+{
+	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
+}
+
+static inline unsigned long level_mask(int level)
+{
+	return -1UL << level_to_offset_bits(level);
+}
+
+static inline unsigned long level_size(int level)
+{
+	return 1UL << level_to_offset_bits(level);
+}
+
+static inline unsigned long align_to_level(unsigned long pfn, int level)
+{
+	return (pfn + level_size(level) - 1) & level_mask(level);
+}
+
+static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
+{
+	return  1 << ((lvl - 1) * LEVEL_STRIDE);
+}
+
+/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
+   are never going to work. */
+static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
+{
+	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
+}
+
+static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
+{
+	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
+}
+static inline unsigned long page_to_dma_pfn(struct page *pg)
+{
+	return mm_to_dma_pfn(page_to_pfn(pg));
+}
+static inline unsigned long virt_to_dma_pfn(void *p)
+{
+	return page_to_dma_pfn(virt_to_page(p));
+}
+
+/* global iommu list, set NULL for ignored DMAR units */
+static struct intel_iommu **g_iommus;
+
+static void __init check_tylersburg_isoch(void);
+static int rwbf_quirk;
+
+/*
+ * set to 1 to panic kernel if can't successfully enable VT-d
+ * (used when kernel is launched w/ TXT)
+ */
+static int force_on = 0;
+
+/*
+ * 0: Present
+ * 1-11: Reserved
+ * 12-63: Context Ptr (12 - (haw-1))
+ * 64-127: Reserved
+ */
+struct root_entry {
+	u64	val;
+	u64	rsvd1;
+};
+#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
+static inline bool root_present(struct root_entry *root)
+{
+	return (root->val & 1);
+}
+static inline void set_root_present(struct root_entry *root)
+{
+	root->val |= 1;
+}
+static inline void set_root_value(struct root_entry *root, unsigned long value)
+{
+	root->val |= value & VTD_PAGE_MASK;
+}
+
+static inline struct context_entry *
+get_context_addr_from_root(struct root_entry *root)
+{
+	return (struct context_entry *)
+		(root_present(root)?phys_to_virt(
+		root->val & VTD_PAGE_MASK) :
+		NULL);
+}
+
+/*
+ * low 64 bits:
+ * 0: present
+ * 1: fault processing disable
+ * 2-3: translation type
+ * 12-63: address space root
+ * high 64 bits:
+ * 0-2: address width
+ * 3-6: aval
+ * 8-23: domain id
+ */
+struct context_entry {
+	u64 lo;
+	u64 hi;
+};
+
+static inline bool context_present(struct context_entry *context)
+{
+	return (context->lo & 1);
+}
+static inline void context_set_present(struct context_entry *context)
+{
+	context->lo |= 1;
+}
+
+static inline void context_set_fault_enable(struct context_entry *context)
+{
+	context->lo &= (((u64)-1) << 2) | 1;
+}
+
+static inline void context_set_translation_type(struct context_entry *context,
+						unsigned long value)
+{
+	context->lo &= (((u64)-1) << 4) | 3;
+	context->lo |= (value & 3) << 2;
+}
+
+static inline void context_set_address_root(struct context_entry *context,
+					    unsigned long value)
+{
+	context->lo |= value & VTD_PAGE_MASK;
+}
+
+static inline void context_set_address_width(struct context_entry *context,
+					     unsigned long value)
+{
+	context->hi |= value & 7;
+}
+
+static inline void context_set_domain_id(struct context_entry *context,
+					 unsigned long value)
+{
+	context->hi |= (value & ((1 << 16) - 1)) << 8;
+}
+
+static inline void context_clear_entry(struct context_entry *context)
+{
+	context->lo = 0;
+	context->hi = 0;
+}
+
+/*
+ * 0: readable
+ * 1: writable
+ * 2-6: reserved
+ * 7: super page
+ * 8-10: available
+ * 11: snoop behavior
+ * 12-63: Host physcial address
+ */
+struct dma_pte {
+	u64 val;
+};
+
+static inline void dma_clear_pte(struct dma_pte *pte)
+{
+	pte->val = 0;
+}
+
+static inline void dma_set_pte_readable(struct dma_pte *pte)
+{
+	pte->val |= DMA_PTE_READ;
+}
+
+static inline void dma_set_pte_writable(struct dma_pte *pte)
+{
+	pte->val |= DMA_PTE_WRITE;
+}
+
+static inline void dma_set_pte_snp(struct dma_pte *pte)
+{
+	pte->val |= DMA_PTE_SNP;
+}
+
+static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
+{
+	pte->val = (pte->val & ~3) | (prot & 3);
+}
+
+static inline u64 dma_pte_addr(struct dma_pte *pte)
+{
+#ifdef CONFIG_64BIT
+	return pte->val & VTD_PAGE_MASK;
+#else
+	/* Must have a full atomic 64-bit read */
+	return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
+#endif
+}
+
+static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
+{
+	pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
+}
+
+static inline bool dma_pte_present(struct dma_pte *pte)
+{
+	return (pte->val & 3) != 0;
+}
+
+static inline int first_pte_in_page(struct dma_pte *pte)
+{
+	return !((unsigned long)pte & ~VTD_PAGE_MASK);
+}
+
+/*
+ * This domain is a statically identity mapping domain.
+ *	1. This domain creats a static 1:1 mapping to all usable memory.
+ * 	2. It maps to each iommu if successful.
+ *	3. Each iommu mapps to this domain if successful.
+ */
+static struct dmar_domain *si_domain;
+static int hw_pass_through = 1;
+
+/* devices under the same p2p bridge are owned in one domain */
+#define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
+
+/* domain represents a virtual machine, more than one devices
+ * across iommus may be owned in one domain, e.g. kvm guest.
+ */
+#define DOMAIN_FLAG_VIRTUAL_MACHINE	(1 << 1)
+
+/* si_domain contains mulitple devices */
+#define DOMAIN_FLAG_STATIC_IDENTITY	(1 << 2)
+
+struct dmar_domain {
+	int	id;			/* domain id */
+	int	nid;			/* node id */
+	unsigned long iommu_bmp;	/* bitmap of iommus this domain uses*/
+
+	struct list_head devices; 	/* all devices' list */
+	struct iova_domain iovad;	/* iova's that belong to this domain */
+
+	struct dma_pte	*pgd;		/* virtual address */
+	int		gaw;		/* max guest address width */
+
+	/* adjusted guest address width, 0 is level 2 30-bit */
+	int		agaw;
+
+	int		flags;		/* flags to find out type of domain */
+
+	int		iommu_coherency;/* indicate coherency of iommu access */
+	int		iommu_snooping; /* indicate snooping control feature*/
+	int		iommu_count;	/* reference count of iommu */
+	int		iommu_superpage;/* Level of superpages supported:
+					   0 == 4KiB (no superpages), 1 == 2MiB,
+					   2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
+	spinlock_t	iommu_lock;	/* protect iommu set in domain */
+	u64		max_addr;	/* maximum mapped address */
+};
+
+/* PCI domain-device relationship */
+struct device_domain_info {
+	struct list_head link;	/* link to domain siblings */
+	struct list_head global; /* link to global list */
+	int segment;		/* PCI domain */
+	u8 bus;			/* PCI bus number */
+	u8 devfn;		/* PCI devfn number */
+	struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
+	struct intel_iommu *iommu; /* IOMMU used by this device */
+	struct dmar_domain *domain; /* pointer to domain */
+};
+
+static void flush_unmaps_timeout(unsigned long data);
+
+DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
+
+#define HIGH_WATER_MARK 250
+struct deferred_flush_tables {
+	int next;
+	struct iova *iova[HIGH_WATER_MARK];
+	struct dmar_domain *domain[HIGH_WATER_MARK];
+};
+
+static struct deferred_flush_tables *deferred_flush;
+
+/* bitmap for indexing intel_iommus */
+static int g_num_of_iommus;
+
+static DEFINE_SPINLOCK(async_umap_flush_lock);
+static LIST_HEAD(unmaps_to_do);
+
+static int timer_on;
+static long list_size;
+
+static void domain_remove_dev_info(struct dmar_domain *domain);
+
+#ifdef CONFIG_DMAR_DEFAULT_ON
+int dmar_disabled = 0;
+#else
+int dmar_disabled = 1;
+#endif /*CONFIG_DMAR_DEFAULT_ON*/
+
+static int dmar_map_gfx = 1;
+static int dmar_forcedac;
+static int intel_iommu_strict;
+static int intel_iommu_superpage = 1;
+
+#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
+static DEFINE_SPINLOCK(device_domain_lock);
+static LIST_HEAD(device_domain_list);
+
+static struct iommu_ops intel_iommu_ops;
+
+static int __init intel_iommu_setup(char *str)
+{
+	if (!str)
+		return -EINVAL;
+	while (*str) {
+		if (!strncmp(str, "on", 2)) {
+			dmar_disabled = 0;
+			printk(KERN_INFO "Intel-IOMMU: enabled\n");
+		} else if (!strncmp(str, "off", 3)) {
+			dmar_disabled = 1;
+			printk(KERN_INFO "Intel-IOMMU: disabled\n");
+		} else if (!strncmp(str, "igfx_off", 8)) {
+			dmar_map_gfx = 0;
+			printk(KERN_INFO
+				"Intel-IOMMU: disable GFX device mapping\n");
+		} else if (!strncmp(str, "forcedac", 8)) {
+			printk(KERN_INFO
+				"Intel-IOMMU: Forcing DAC for PCI devices\n");
+			dmar_forcedac = 1;
+		} else if (!strncmp(str, "strict", 6)) {
+			printk(KERN_INFO
+				"Intel-IOMMU: disable batched IOTLB flush\n");
+			intel_iommu_strict = 1;
+		} else if (!strncmp(str, "sp_off", 6)) {
+			printk(KERN_INFO
+				"Intel-IOMMU: disable supported super page\n");
+			intel_iommu_superpage = 0;
+		}
+
+		str += strcspn(str, ",");
+		while (*str == ',')
+			str++;
+	}
+	return 0;
+}
+__setup("intel_iommu=", intel_iommu_setup);
+
+static struct kmem_cache *iommu_domain_cache;
+static struct kmem_cache *iommu_devinfo_cache;
+static struct kmem_cache *iommu_iova_cache;
+
+static inline void *alloc_pgtable_page(int node)
+{
+	struct page *page;
+	void *vaddr = NULL;
+
+	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
+	if (page)
+		vaddr = page_address(page);
+	return vaddr;
+}
+
+static inline void free_pgtable_page(void *vaddr)
+{
+	free_page((unsigned long)vaddr);
+}
+
+static inline void *alloc_domain_mem(void)
+{
+	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
+}
+
+static void free_domain_mem(void *vaddr)
+{
+	kmem_cache_free(iommu_domain_cache, vaddr);
+}
+
+static inline void * alloc_devinfo_mem(void)
+{
+	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
+}
+
+static inline void free_devinfo_mem(void *vaddr)
+{
+	kmem_cache_free(iommu_devinfo_cache, vaddr);
+}
+
+struct iova *alloc_iova_mem(void)
+{
+	return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
+}
+
+void free_iova_mem(struct iova *iova)
+{
+	kmem_cache_free(iommu_iova_cache, iova);
+}
+
+
+static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
+{
+	unsigned long sagaw;
+	int agaw = -1;
+
+	sagaw = cap_sagaw(iommu->cap);
+	for (agaw = width_to_agaw(max_gaw);
+	     agaw >= 0; agaw--) {
+		if (test_bit(agaw, &sagaw))
+			break;
+	}
+
+	return agaw;
+}
+
+/*
+ * Calculate max SAGAW for each iommu.
+ */
+int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
+{
+	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
+}
+
+/*
+ * calculate agaw for each iommu.
+ * "SAGAW" may be different across iommus, use a default agaw, and
+ * get a supported less agaw for iommus that don't support the default agaw.
+ */
+int iommu_calculate_agaw(struct intel_iommu *iommu)
+{
+	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
+}
+
+/* This functionin only returns single iommu in a domain */
+static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
+{
+	int iommu_id;
+
+	/* si_domain and vm domain should not get here. */
+	BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
+	BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
+
+	iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
+	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
+		return NULL;
+
+	return g_iommus[iommu_id];
+}
+
+static void domain_update_iommu_coherency(struct dmar_domain *domain)
+{
+	int i;
+
+	domain->iommu_coherency = 1;
+
+	for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
+		if (!ecap_coherent(g_iommus[i]->ecap)) {
+			domain->iommu_coherency = 0;
+			break;
+		}
+	}
+}
+
+static void domain_update_iommu_snooping(struct dmar_domain *domain)
+{
+	int i;
+
+	domain->iommu_snooping = 1;
+
+	for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
+		if (!ecap_sc_support(g_iommus[i]->ecap)) {
+			domain->iommu_snooping = 0;
+			break;
+		}
+	}
+}
+
+static void domain_update_iommu_superpage(struct dmar_domain *domain)
+{
+	int i, mask = 0xf;
+
+	if (!intel_iommu_superpage) {
+		domain->iommu_superpage = 0;
+		return;
+	}
+
+	domain->iommu_superpage = 4; /* 1TiB */
+
+	for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
+		mask |= cap_super_page_val(g_iommus[i]->cap);
+		if (!mask) {
+			break;
+		}
+	}
+	domain->iommu_superpage = fls(mask);
+}
+
+/* Some capabilities may be different across iommus */
+static void domain_update_iommu_cap(struct dmar_domain *domain)
+{
+	domain_update_iommu_coherency(domain);
+	domain_update_iommu_snooping(domain);
+	domain_update_iommu_superpage(domain);
+}
+
+static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
+{
+	struct dmar_drhd_unit *drhd = NULL;
+	int i;
+
+	for_each_drhd_unit(drhd) {
+		if (drhd->ignored)
+			continue;
+		if (segment != drhd->segment)
+			continue;
+
+		for (i = 0; i < drhd->devices_cnt; i++) {
+			if (drhd->devices[i] &&
+			    drhd->devices[i]->bus->number == bus &&
+			    drhd->devices[i]->devfn == devfn)
+				return drhd->iommu;
+			if (drhd->devices[i] &&
+			    drhd->devices[i]->subordinate &&
+			    drhd->devices[i]->subordinate->number <= bus &&
+			    drhd->devices[i]->subordinate->subordinate >= bus)
+				return drhd->iommu;
+		}
+
+		if (drhd->include_all)
+			return drhd->iommu;
+	}
+
+	return NULL;
+}
+
+static void domain_flush_cache(struct dmar_domain *domain,
+			       void *addr, int size)
+{
+	if (!domain->iommu_coherency)
+		clflush_cache_range(addr, size);
+}
+
+/* Gets context entry for a given bus and devfn */
+static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
+		u8 bus, u8 devfn)
+{
+	struct root_entry *root;
+	struct context_entry *context;
+	unsigned long phy_addr;
+	unsigned long flags;
+
+	spin_lock_irqsave(&iommu->lock, flags);
+	root = &iommu->root_entry[bus];
+	context = get_context_addr_from_root(root);
+	if (!context) {
+		context = (struct context_entry *)
+				alloc_pgtable_page(iommu->node);
+		if (!context) {
+			spin_unlock_irqrestore(&iommu->lock, flags);
+			return NULL;
+		}
+		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
+		phy_addr = virt_to_phys((void *)context);
+		set_root_value(root, phy_addr);
+		set_root_present(root);
+		__iommu_flush_cache(iommu, root, sizeof(*root));
+	}
+	spin_unlock_irqrestore(&iommu->lock, flags);
+	return &context[devfn];
+}
+
+static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
+{
+	struct root_entry *root;
+	struct context_entry *context;
+	int ret;
+	unsigned long flags;
+
+	spin_lock_irqsave(&iommu->lock, flags);
+	root = &iommu->root_entry[bus];
+	context = get_context_addr_from_root(root);
+	if (!context) {
+		ret = 0;
+		goto out;
+	}
+	ret = context_present(&context[devfn]);
+out:
+	spin_unlock_irqrestore(&iommu->lock, flags);
+	return ret;
+}
+
+static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
+{
+	struct root_entry *root;
+	struct context_entry *context;
+	unsigned long flags;
+
+	spin_lock_irqsave(&iommu->lock, flags);
+	root = &iommu->root_entry[bus];
+	context = get_context_addr_from_root(root);
+	if (context) {
+		context_clear_entry(&context[devfn]);
+		__iommu_flush_cache(iommu, &context[devfn], \
+			sizeof(*context));
+	}
+	spin_unlock_irqrestore(&iommu->lock, flags);
+}
+
+static void free_context_table(struct intel_iommu *iommu)
+{
+	struct root_entry *root;
+	int i;
+	unsigned long flags;
+	struct context_entry *context;
+
+	spin_lock_irqsave(&iommu->lock, flags);
+	if (!iommu->root_entry) {
+		goto out;
+	}
+	for (i = 0; i < ROOT_ENTRY_NR; i++) {
+		root = &iommu->root_entry[i];
+		context = get_context_addr_from_root(root);
+		if (context)
+			free_pgtable_page(context);
+	}
+	free_pgtable_page(iommu->root_entry);
+	iommu->root_entry = NULL;
+out:
+	spin_unlock_irqrestore(&iommu->lock, flags);
+}
+
+static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
+				      unsigned long pfn, int large_level)
+{
+	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
+	struct dma_pte *parent, *pte = NULL;
+	int level = agaw_to_level(domain->agaw);
+	int offset, target_level;
+
+	BUG_ON(!domain->pgd);
+	BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
+	parent = domain->pgd;
+
+	/* Search pte */
+	if (!large_level)
+		target_level = 1;
+	else
+		target_level = large_level;
+
+	while (level > 0) {
+		void *tmp_page;
+
+		offset = pfn_level_offset(pfn, level);
+		pte = &parent[offset];
+		if (!large_level && (pte->val & DMA_PTE_LARGE_PAGE))
+			break;
+		if (level == target_level)
+			break;
+
+		if (!dma_pte_present(pte)) {
+			uint64_t pteval;
+
+			tmp_page = alloc_pgtable_page(domain->nid);
+
+			if (!tmp_page)
+				return NULL;
+
+			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
+			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
+			if (cmpxchg64(&pte->val, 0ULL, pteval)) {
+				/* Someone else set it while we were thinking; use theirs. */
+				free_pgtable_page(tmp_page);
+			} else {
+				dma_pte_addr(pte);
+				domain_flush_cache(domain, pte, sizeof(*pte));
+			}
+		}
+		parent = phys_to_virt(dma_pte_addr(pte));
+		level--;
+	}
+
+	return pte;
+}
+
+
+/* return address's pte at specific level */
+static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
+					 unsigned long pfn,
+					 int level, int *large_page)
+{
+	struct dma_pte *parent, *pte = NULL;
+	int total = agaw_to_level(domain->agaw);
+	int offset;
+
+	parent = domain->pgd;
+	while (level <= total) {
+		offset = pfn_level_offset(pfn, total);
+		pte = &parent[offset];
+		if (level == total)
+			return pte;
+
+		if (!dma_pte_present(pte)) {
+			*large_page = total;
+			break;
+		}
+
+		if (pte->val & DMA_PTE_LARGE_PAGE) {
+			*large_page = total;
+			return pte;
+		}
+
+		parent = phys_to_virt(dma_pte_addr(pte));
+		total--;
+	}
+	return NULL;
+}
+
+/* clear last level pte, a tlb flush should be followed */
+static void dma_pte_clear_range(struct dmar_domain *domain,
+				unsigned long start_pfn,
+				unsigned long last_pfn)
+{
+	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
+	unsigned int large_page = 1;
+	struct dma_pte *first_pte, *pte;
+
+	BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
+	BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
+	BUG_ON(start_pfn > last_pfn);
+
+	/* we don't need lock here; nobody else touches the iova range */
+	do {
+		large_page = 1;
+		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
+		if (!pte) {
+			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
+			continue;
+		}
+		do {
+			dma_clear_pte(pte);
+			start_pfn += lvl_to_nr_pages(large_page);
+			pte++;
+		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
+
+		domain_flush_cache(domain, first_pte,
+				   (void *)pte - (void *)first_pte);
+
+	} while (start_pfn && start_pfn <= last_pfn);
+}
+
+/* free page table pages. last level pte should already be cleared */
+static void dma_pte_free_pagetable(struct dmar_domain *domain,
+				   unsigned long start_pfn,
+				   unsigned long last_pfn)
+{
+	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
+	struct dma_pte *first_pte, *pte;
+	int total = agaw_to_level(domain->agaw);
+	int level;
+	unsigned long tmp;
+	int large_page = 2;
+
+	BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
+	BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
+	BUG_ON(start_pfn > last_pfn);
+
+	/* We don't need lock here; nobody else touches the iova range */
+	level = 2;
+	while (level <= total) {
+		tmp = align_to_level(start_pfn, level);
+
+		/* If we can't even clear one PTE at this level, we're done */
+		if (tmp + level_size(level) - 1 > last_pfn)
+			return;
+
+		do {
+			large_page = level;
+			first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
+			if (large_page > level)
+				level = large_page + 1;
+			if (!pte) {
+				tmp = align_to_level(tmp + 1, level + 1);
+				continue;
+			}
+			do {
+				if (dma_pte_present(pte)) {
+					free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
+					dma_clear_pte(pte);
+				}
+				pte++;
+				tmp += level_size(level);
+			} while (!first_pte_in_page(pte) &&
+				 tmp + level_size(level) - 1 <= last_pfn);
+
+			domain_flush_cache(domain, first_pte,
+					   (void *)pte - (void *)first_pte);
+			
+		} while (tmp && tmp + level_size(level) - 1 <= last_pfn);
+		level++;
+	}
+	/* free pgd */
+	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
+		free_pgtable_page(domain->pgd);
+		domain->pgd = NULL;
+	}
+}
+
+/* iommu handling */
+static int iommu_alloc_root_entry(struct intel_iommu *iommu)
+{
+	struct root_entry *root;
+	unsigned long flags;
+
+	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
+	if (!root)
+		return -ENOMEM;
+
+	__iommu_flush_cache(iommu, root, ROOT_SIZE);
+
+	spin_lock_irqsave(&iommu->lock, flags);
+	iommu->root_entry = root;
+	spin_unlock_irqrestore(&iommu->lock, flags);
+
+	return 0;
+}
+
+static void iommu_set_root_entry(struct intel_iommu *iommu)
+{
+	void *addr;
+	u32 sts;
+	unsigned long flag;
+
+	addr = iommu->root_entry;
+
+	spin_lock_irqsave(&iommu->register_lock, flag);
+	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
+
+	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
+
+	/* Make sure hardware complete it */
+	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+		      readl, (sts & DMA_GSTS_RTPS), sts);
+
+	spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+static void iommu_flush_write_buffer(struct intel_iommu *iommu)
+{
+	u32 val;
+	unsigned long flag;
+
+	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
+		return;
+
+	spin_lock_irqsave(&iommu->register_lock, flag);
+	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
+
+	/* Make sure hardware complete it */
+	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+		      readl, (!(val & DMA_GSTS_WBFS)), val);
+
+	spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+/* return value determine if we need a write buffer flush */
+static void __iommu_flush_context(struct intel_iommu *iommu,
+				  u16 did, u16 source_id, u8 function_mask,
+				  u64 type)
+{
+	u64 val = 0;
+	unsigned long flag;
+
+	switch (type) {
+	case DMA_CCMD_GLOBAL_INVL:
+		val = DMA_CCMD_GLOBAL_INVL;
+		break;
+	case DMA_CCMD_DOMAIN_INVL:
+		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
+		break;
+	case DMA_CCMD_DEVICE_INVL:
+		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
+			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
+		break;
+	default:
+		BUG();
+	}
+	val |= DMA_CCMD_ICC;
+
+	spin_lock_irqsave(&iommu->register_lock, flag);
+	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
+
+	/* Make sure hardware complete it */
+	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
+		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
+
+	spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+/* return value determine if we need a write buffer flush */
+static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
+				u64 addr, unsigned int size_order, u64 type)
+{
+	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
+	u64 val = 0, val_iva = 0;
+	unsigned long flag;
+
+	switch (type) {
+	case DMA_TLB_GLOBAL_FLUSH:
+		/* global flush doesn't need set IVA_REG */
+		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
+		break;
+	case DMA_TLB_DSI_FLUSH:
+		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
+		break;
+	case DMA_TLB_PSI_FLUSH:
+		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
+		/* Note: always flush non-leaf currently */
+		val_iva = size_order | addr;
+		break;
+	default:
+		BUG();
+	}
+	/* Note: set drain read/write */
+#if 0
+	/*
+	 * This is probably to be super secure.. Looks like we can
+	 * ignore it without any impact.
+	 */
+	if (cap_read_drain(iommu->cap))
+		val |= DMA_TLB_READ_DRAIN;
+#endif
+	if (cap_write_drain(iommu->cap))
+		val |= DMA_TLB_WRITE_DRAIN;
+
+	spin_lock_irqsave(&iommu->register_lock, flag);
+	/* Note: Only uses first TLB reg currently */
+	if (val_iva)
+		dmar_writeq(iommu->reg + tlb_offset, val_iva);
+	dmar_writeq(iommu->reg + tlb_offset + 8, val);
+
+	/* Make sure hardware complete it */
+	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
+		dmar_readq, (!(val & DMA_TLB_IVT)), val);
+
+	spin_unlock_irqrestore(&iommu->register_lock, flag);
+
+	/* check IOTLB invalidation granularity */
+	if (DMA_TLB_IAIG(val) == 0)
+		printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
+	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
+		pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
+			(unsigned long long)DMA_TLB_IIRG(type),
+			(unsigned long long)DMA_TLB_IAIG(val));
+}
+
+static struct device_domain_info *iommu_support_dev_iotlb(
+	struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
+{
+	int found = 0;
+	unsigned long flags;
+	struct device_domain_info *info;
+	struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
+
+	if (!ecap_dev_iotlb_support(iommu->ecap))
+		return NULL;
+
+	if (!iommu->qi)
+		return NULL;
+
+	spin_lock_irqsave(&device_domain_lock, flags);
+	list_for_each_entry(info, &domain->devices, link)
+		if (info->bus == bus && info->devfn == devfn) {
+			found = 1;
+			break;
+		}
+	spin_unlock_irqrestore(&device_domain_lock, flags);
+
+	if (!found || !info->dev)
+		return NULL;
+
+	if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
+		return NULL;
+
+	if (!dmar_find_matched_atsr_unit(info->dev))
+		return NULL;
+
+	info->iommu = iommu;
+
+	return info;
+}
+
+static void iommu_enable_dev_iotlb(struct device_domain_info *info)
+{
+	if (!info)
+		return;
+
+	pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
+}
+
+static void iommu_disable_dev_iotlb(struct device_domain_info *info)
+{
+	if (!info->dev || !pci_ats_enabled(info->dev))
+		return;
+
+	pci_disable_ats(info->dev);
+}
+
+static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
+				  u64 addr, unsigned mask)
+{
+	u16 sid, qdep;
+	unsigned long flags;
+	struct device_domain_info *info;
+
+	spin_lock_irqsave(&device_domain_lock, flags);
+	list_for_each_entry(info, &domain->devices, link) {
+		if (!info->dev || !pci_ats_enabled(info->dev))
+			continue;
+
+		sid = info->bus << 8 | info->devfn;
+		qdep = pci_ats_queue_depth(info->dev);
+		qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
+	}
+	spin_unlock_irqrestore(&device_domain_lock, flags);
+}
+
+static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
+				  unsigned long pfn, unsigned int pages, int map)
+{
+	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
+	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
+
+	BUG_ON(pages == 0);
+
+	/*
+	 * Fallback to domain selective flush if no PSI support or the size is
+	 * too big.
+	 * PSI requires page size to be 2 ^ x, and the base address is naturally
+	 * aligned to the size
+	 */
+	if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
+		iommu->flush.flush_iotlb(iommu, did, 0, 0,
+						DMA_TLB_DSI_FLUSH);
+	else
+		iommu->flush.flush_iotlb(iommu, did, addr, mask,
+						DMA_TLB_PSI_FLUSH);
+
+	/*
+	 * In caching mode, changes of pages from non-present to present require
+	 * flush. However, device IOTLB doesn't need to be flushed in this case.
+	 */
+	if (!cap_caching_mode(iommu->cap) || !map)
+		iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
+}
+
+static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
+{
+	u32 pmen;
+	unsigned long flags;
+
+	spin_lock_irqsave(&iommu->register_lock, flags);
+	pmen = readl(iommu->reg + DMAR_PMEN_REG);
+	pmen &= ~DMA_PMEN_EPM;
+	writel(pmen, iommu->reg + DMAR_PMEN_REG);
+
+	/* wait for the protected region status bit to clear */
+	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
+		readl, !(pmen & DMA_PMEN_PRS), pmen);
+
+	spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+static int iommu_enable_translation(struct intel_iommu *iommu)
+{
+	u32 sts;
+	unsigned long flags;
+
+	spin_lock_irqsave(&iommu->register_lock, flags);
+	iommu->gcmd |= DMA_GCMD_TE;
+	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
+
+	/* Make sure hardware complete it */
+	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+		      readl, (sts & DMA_GSTS_TES), sts);
+
+	spin_unlock_irqrestore(&iommu->register_lock, flags);
+	return 0;
+}
+
+static int iommu_disable_translation(struct intel_iommu *iommu)
+{
+	u32 sts;
+	unsigned long flag;
+
+	spin_lock_irqsave(&iommu->register_lock, flag);
+	iommu->gcmd &= ~DMA_GCMD_TE;
+	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
+
+	/* Make sure hardware complete it */
+	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+		      readl, (!(sts & DMA_GSTS_TES)), sts);
+
+	spin_unlock_irqrestore(&iommu->register_lock, flag);
+	return 0;
+}
+
+
+static int iommu_init_domains(struct intel_iommu *iommu)
+{
+	unsigned long ndomains;
+	unsigned long nlongs;
+
+	ndomains = cap_ndoms(iommu->cap);
+	pr_debug("IOMMU %d: Number of Domains supportd <%ld>\n", iommu->seq_id,
+			ndomains);
+	nlongs = BITS_TO_LONGS(ndomains);
+
+	spin_lock_init(&iommu->lock);
+
+	/* TBD: there might be 64K domains,
+	 * consider other allocation for future chip
+	 */
+	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
+	if (!iommu->domain_ids) {
+		printk(KERN_ERR "Allocating domain id array failed\n");
+		return -ENOMEM;
+	}
+	iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
+			GFP_KERNEL);
+	if (!iommu->domains) {
+		printk(KERN_ERR "Allocating domain array failed\n");
+		return -ENOMEM;
+	}
+
+	/*
+	 * if Caching mode is set, then invalid translations are tagged
+	 * with domainid 0. Hence we need to pre-allocate it.
+	 */
+	if (cap_caching_mode(iommu->cap))
+		set_bit(0, iommu->domain_ids);
+	return 0;
+}
+
+
+static void domain_exit(struct dmar_domain *domain);
+static void vm_domain_exit(struct dmar_domain *domain);
+
+void free_dmar_iommu(struct intel_iommu *iommu)
+{
+	struct dmar_domain *domain;
+	int i;
+	unsigned long flags;
+
+	if ((iommu->domains) && (iommu->domain_ids)) {
+		for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
+			domain = iommu->domains[i];
+			clear_bit(i, iommu->domain_ids);
+
+			spin_lock_irqsave(&domain->iommu_lock, flags);
+			if (--domain->iommu_count == 0) {
+				if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
+					vm_domain_exit(domain);
+				else
+					domain_exit(domain);
+			}
+			spin_unlock_irqrestore(&domain->iommu_lock, flags);
+		}
+	}
+
+	if (iommu->gcmd & DMA_GCMD_TE)
+		iommu_disable_translation(iommu);
+
+	if (iommu->irq) {
+		irq_set_handler_data(iommu->irq, NULL);
+		/* This will mask the irq */
+		free_irq(iommu->irq, iommu);
+		destroy_irq(iommu->irq);
+	}
+
+	kfree(iommu->domains);
+	kfree(iommu->domain_ids);
+
+	g_iommus[iommu->seq_id] = NULL;
+
+	/* if all iommus are freed, free g_iommus */
+	for (i = 0; i < g_num_of_iommus; i++) {
+		if (g_iommus[i])
+			break;
+	}
+
+	if (i == g_num_of_iommus)
+		kfree(g_iommus);
+
+	/* free context mapping */
+	free_context_table(iommu);
+}
+
+static struct dmar_domain *alloc_domain(void)
+{
+	struct dmar_domain *domain;
+
+	domain = alloc_domain_mem();
+	if (!domain)
+		return NULL;
+
+	domain->nid = -1;
+	memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
+	domain->flags = 0;
+
+	return domain;
+}
+
+static int iommu_attach_domain(struct dmar_domain *domain,
+			       struct intel_iommu *iommu)
+{
+	int num;
+	unsigned long ndomains;
+	unsigned long flags;
+
+	ndomains = cap_ndoms(iommu->cap);
+
+	spin_lock_irqsave(&iommu->lock, flags);
+
+	num = find_first_zero_bit(iommu->domain_ids, ndomains);
+	if (num >= ndomains) {
+		spin_unlock_irqrestore(&iommu->lock, flags);
+		printk(KERN_ERR "IOMMU: no free domain ids\n");
+		return -ENOMEM;
+	}
+
+	domain->id = num;
+	set_bit(num, iommu->domain_ids);
+	set_bit(iommu->seq_id, &domain->iommu_bmp);
+	iommu->domains[num] = domain;
+	spin_unlock_irqrestore(&iommu->lock, flags);
+
+	return 0;
+}
+
+static void iommu_detach_domain(struct dmar_domain *domain,
+				struct intel_iommu *iommu)
+{
+	unsigned long flags;
+	int num, ndomains;
+	int found = 0;
+
+	spin_lock_irqsave(&iommu->lock, flags);
+	ndomains = cap_ndoms(iommu->cap);
+	for_each_set_bit(num, iommu->domain_ids, ndomains) {
+		if (iommu->domains[num] == domain) {
+			found = 1;
+			break;
+		}
+	}
+
+	if (found) {
+		clear_bit(num, iommu->domain_ids);
+		clear_bit(iommu->seq_id, &domain->iommu_bmp);
+		iommu->domains[num] = NULL;
+	}
+	spin_unlock_irqrestore(&iommu->lock, flags);
+}
+
+static struct iova_domain reserved_iova_list;
+static struct lock_class_key reserved_rbtree_key;
+
+static int dmar_init_reserved_ranges(void)
+{
+	struct pci_dev *pdev = NULL;
+	struct iova *iova;
+	int i;
+
+	init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
+
+	lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
+		&reserved_rbtree_key);
+
+	/* IOAPIC ranges shouldn't be accessed by DMA */
+	iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
+		IOVA_PFN(IOAPIC_RANGE_END));
+	if (!iova) {
+		printk(KERN_ERR "Reserve IOAPIC range failed\n");
+		return -ENODEV;
+	}
+
+	/* Reserve all PCI MMIO to avoid peer-to-peer access */
+	for_each_pci_dev(pdev) {
+		struct resource *r;
+
+		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
+			r = &pdev->resource[i];
+			if (!r->flags || !(r->flags & IORESOURCE_MEM))
+				continue;
+			iova = reserve_iova(&reserved_iova_list,
+					    IOVA_PFN(r->start),
+					    IOVA_PFN(r->end));
+			if (!iova) {
+				printk(KERN_ERR "Reserve iova failed\n");
+				return -ENODEV;
+			}
+		}
+	}
+	return 0;
+}
+
+static void domain_reserve_special_ranges(struct dmar_domain *domain)
+{
+	copy_reserved_iova(&reserved_iova_list, &domain->iovad);
+}
+
+static inline int guestwidth_to_adjustwidth(int gaw)
+{
+	int agaw;
+	int r = (gaw - 12) % 9;
+
+	if (r == 0)
+		agaw = gaw;
+	else
+		agaw = gaw + 9 - r;
+	if (agaw > 64)
+		agaw = 64;
+	return agaw;
+}
+
+static int domain_init(struct dmar_domain *domain, int guest_width)
+{
+	struct intel_iommu *iommu;
+	int adjust_width, agaw;
+	unsigned long sagaw;
+
+	init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
+	spin_lock_init(&domain->iommu_lock);
+
+	domain_reserve_special_ranges(domain);
+
+	/* calculate AGAW */
+	iommu = domain_get_iommu(domain);
+	if (guest_width > cap_mgaw(iommu->cap))
+		guest_width = cap_mgaw(iommu->cap);
+	domain->gaw = guest_width;
+	adjust_width = guestwidth_to_adjustwidth(guest_width);
+	agaw = width_to_agaw(adjust_width);
+	sagaw = cap_sagaw(iommu->cap);
+	if (!test_bit(agaw, &sagaw)) {
+		/* hardware doesn't support it, choose a bigger one */
+		pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
+		agaw = find_next_bit(&sagaw, 5, agaw);
+		if (agaw >= 5)
+			return -ENODEV;
+	}
+	domain->agaw = agaw;
+	INIT_LIST_HEAD(&domain->devices);
+
+	if (ecap_coherent(iommu->ecap))
+		domain->iommu_coherency = 1;
+	else
+		domain->iommu_coherency = 0;
+
+	if (ecap_sc_support(iommu->ecap))
+		domain->iommu_snooping = 1;
+	else
+		domain->iommu_snooping = 0;
+
+	domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
+	domain->iommu_count = 1;
+	domain->nid = iommu->node;
+
+	/* always allocate the top pgd */
+	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
+	if (!domain->pgd)
+		return -ENOMEM;
+	__iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
+	return 0;
+}
+
+static void domain_exit(struct dmar_domain *domain)
+{
+	struct dmar_drhd_unit *drhd;
+	struct intel_iommu *iommu;
+
+	/* Domain 0 is reserved, so dont process it */
+	if (!domain)
+		return;
+
+	/* Flush any lazy unmaps that may reference this domain */
+	if (!intel_iommu_strict)
+		flush_unmaps_timeout(0);
+
+	domain_remove_dev_info(domain);
+	/* destroy iovas */
+	put_iova_domain(&domain->iovad);
+
+	/* clear ptes */
+	dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
+
+	/* free page tables */
+	dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
+
+	for_each_active_iommu(iommu, drhd)
+		if (test_bit(iommu->seq_id, &domain->iommu_bmp))
+			iommu_detach_domain(domain, iommu);
+
+	free_domain_mem(domain);
+}
+
+static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
+				 u8 bus, u8 devfn, int translation)
+{
+	struct context_entry *context;
+	unsigned long flags;
+	struct intel_iommu *iommu;
+	struct dma_pte *pgd;
+	unsigned long num;
+	unsigned long ndomains;
+	int id;
+	int agaw;
+	struct device_domain_info *info = NULL;
+
+	pr_debug("Set context mapping for %02x:%02x.%d\n",
+		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+
+	BUG_ON(!domain->pgd);
+	BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
+	       translation != CONTEXT_TT_MULTI_LEVEL);
+
+	iommu = device_to_iommu(segment, bus, devfn);
+	if (!iommu)
+		return -ENODEV;
+
+	context = device_to_context_entry(iommu, bus, devfn);
+	if (!context)
+		return -ENOMEM;
+	spin_lock_irqsave(&iommu->lock, flags);
+	if (context_present(context)) {
+		spin_unlock_irqrestore(&iommu->lock, flags);
+		return 0;
+	}
+
+	id = domain->id;
+	pgd = domain->pgd;
+
+	if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
+	    domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
+		int found = 0;
+
+		/* find an available domain id for this device in iommu */
+		ndomains = cap_ndoms(iommu->cap);
+		for_each_set_bit(num, iommu->domain_ids, ndomains) {
+			if (iommu->domains[num] == domain) {
+				id = num;
+				found = 1;
+				break;
+			}
+		}
+
+		if (found == 0) {
+			num = find_first_zero_bit(iommu->domain_ids, ndomains);
+			if (num >= ndomains) {
+				spin_unlock_irqrestore(&iommu->lock, flags);
+				printk(KERN_ERR "IOMMU: no free domain ids\n");
+				return -EFAULT;
+			}
+
+			set_bit(num, iommu->domain_ids);
+			iommu->domains[num] = domain;
+			id = num;
+		}
+
+		/* Skip top levels of page tables for
+		 * iommu which has less agaw than default.
+		 * Unnecessary for PT mode.
+		 */
+		if (translation != CONTEXT_TT_PASS_THROUGH) {
+			for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
+				pgd = phys_to_virt(dma_pte_addr(pgd));
+				if (!dma_pte_present(pgd)) {
+					spin_unlock_irqrestore(&iommu->lock, flags);
+					return -ENOMEM;
+				}
+			}
+		}
+	}
+
+	context_set_domain_id(context, id);
+
+	if (translation != CONTEXT_TT_PASS_THROUGH) {
+		info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
+		translation = info ? CONTEXT_TT_DEV_IOTLB :
+				     CONTEXT_TT_MULTI_LEVEL;
+	}
+	/*
+	 * In pass through mode, AW must be programmed to indicate the largest
+	 * AGAW value supported by hardware. And ASR is ignored by hardware.
+	 */
+	if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
+		context_set_address_width(context, iommu->msagaw);
+	else {
+		context_set_address_root(context, virt_to_phys(pgd));
+		context_set_address_width(context, iommu->agaw);
+	}
+
+	context_set_translation_type(context, translation);
+	context_set_fault_enable(context);
+	context_set_present(context);
+	domain_flush_cache(domain, context, sizeof(*context));
+
+	/*
+	 * It's a non-present to present mapping. If hardware doesn't cache
+	 * non-present entry we only need to flush the write-buffer. If the
+	 * _does_ cache non-present entries, then it does so in the special
+	 * domain #0, which we have to flush:
+	 */
+	if (cap_caching_mode(iommu->cap)) {
+		iommu->flush.flush_context(iommu, 0,
+					   (((u16)bus) << 8) | devfn,
+					   DMA_CCMD_MASK_NOBIT,
+					   DMA_CCMD_DEVICE_INVL);
+		iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
+	} else {
+		iommu_flush_write_buffer(iommu);
+	}
+	iommu_enable_dev_iotlb(info);
+	spin_unlock_irqrestore(&iommu->lock, flags);
+
+	spin_lock_irqsave(&domain->iommu_lock, flags);
+	if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
+		domain->iommu_count++;
+		if (domain->iommu_count == 1)
+			domain->nid = iommu->node;
+		domain_update_iommu_cap(domain);
+	}
+	spin_unlock_irqrestore(&domain->iommu_lock, flags);
+	return 0;
+}
+
+static int
+domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
+			int translation)
+{
+	int ret;
+	struct pci_dev *tmp, *parent;
+
+	ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
+					 pdev->bus->number, pdev->devfn,
+					 translation);
+	if (ret)
+		return ret;
+
+	/* dependent device mapping */
+	tmp = pci_find_upstream_pcie_bridge(pdev);
+	if (!tmp)
+		return 0;
+	/* Secondary interface's bus number and devfn 0 */
+	parent = pdev->bus->self;
+	while (parent != tmp) {
+		ret = domain_context_mapping_one(domain,
+						 pci_domain_nr(parent->bus),
+						 parent->bus->number,
+						 parent->devfn, translation);
+		if (ret)
+			return ret;
+		parent = parent->bus->self;
+	}
+	if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
+		return domain_context_mapping_one(domain,
+					pci_domain_nr(tmp->subordinate),
+					tmp->subordinate->number, 0,
+					translation);
+	else /* this is a legacy PCI bridge */
+		return domain_context_mapping_one(domain,
+						  pci_domain_nr(tmp->bus),
+						  tmp->bus->number,
+						  tmp->devfn,
+						  translation);
+}
+
+static int domain_context_mapped(struct pci_dev *pdev)
+{
+	int ret;
+	struct pci_dev *tmp, *parent;
+	struct intel_iommu *iommu;
+
+	iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
+				pdev->devfn);
+	if (!iommu)
+		return -ENODEV;
+
+	ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
+	if (!ret)
+		return ret;
+	/* dependent device mapping */
+	tmp = pci_find_upstream_pcie_bridge(pdev);
+	if (!tmp)
+		return ret;
+	/* Secondary interface's bus number and devfn 0 */
+	parent = pdev->bus->self;
+	while (parent != tmp) {
+		ret = device_context_mapped(iommu, parent->bus->number,
+					    parent->devfn);
+		if (!ret)
+			return ret;
+		parent = parent->bus->self;
+	}
+	if (pci_is_pcie(tmp))
+		return device_context_mapped(iommu, tmp->subordinate->number,
+					     0);
+	else
+		return device_context_mapped(iommu, tmp->bus->number,
+					     tmp->devfn);
+}
+
+/* Returns a number of VTD pages, but aligned to MM page size */
+static inline unsigned long aligned_nrpages(unsigned long host_addr,
+					    size_t size)
+{
+	host_addr &= ~PAGE_MASK;
+	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
+}
+
+/* Return largest possible superpage level for a given mapping */
+static inline int hardware_largepage_caps(struct dmar_domain *domain,
+					  unsigned long iov_pfn,
+					  unsigned long phy_pfn,
+					  unsigned long pages)
+{
+	int support, level = 1;
+	unsigned long pfnmerge;
+
+	support = domain->iommu_superpage;
+
+	/* To use a large page, the virtual *and* physical addresses
+	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
+	   of them will mean we have to use smaller pages. So just
+	   merge them and check both at once. */
+	pfnmerge = iov_pfn | phy_pfn;
+
+	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
+		pages >>= VTD_STRIDE_SHIFT;
+		if (!pages)
+			break;
+		pfnmerge >>= VTD_STRIDE_SHIFT;
+		level++;
+		support--;
+	}
+	return level;
+}
+
+static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
+			    struct scatterlist *sg, unsigned long phys_pfn,
+			    unsigned long nr_pages, int prot)
+{
+	struct dma_pte *first_pte = NULL, *pte = NULL;
+	phys_addr_t uninitialized_var(pteval);
+	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
+	unsigned long sg_res;
+	unsigned int largepage_lvl = 0;
+	unsigned long lvl_pages = 0;
+
+	BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
+
+	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
+		return -EINVAL;
+
+	prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
+
+	if (sg)
+		sg_res = 0;
+	else {
+		sg_res = nr_pages + 1;
+		pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
+	}
+
+	while (nr_pages > 0) {
+		uint64_t tmp;
+
+		if (!sg_res) {
+			sg_res = aligned_nrpages(sg->offset, sg->length);
+			sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
+			sg->dma_length = sg->length;
+			pteval = page_to_phys(sg_page(sg)) | prot;
+			phys_pfn = pteval >> VTD_PAGE_SHIFT;
+		}
+
+		if (!pte) {
+			largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
+
+			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
+			if (!pte)
+				return -ENOMEM;
+			/* It is large page*/
+			if (largepage_lvl > 1)
+				pteval |= DMA_PTE_LARGE_PAGE;
+			else
+				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
+
+		}
+		/* We don't need lock here, nobody else
+		 * touches the iova range
+		 */
+		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
+		if (tmp) {
+			static int dumps = 5;
+			printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
+			       iov_pfn, tmp, (unsigned long long)pteval);
+			if (dumps) {
+				dumps--;
+				debug_dma_dump_mappings(NULL);
+			}
+			WARN_ON(1);
+		}
+
+		lvl_pages = lvl_to_nr_pages(largepage_lvl);
+
+		BUG_ON(nr_pages < lvl_pages);
+		BUG_ON(sg_res < lvl_pages);
+
+		nr_pages -= lvl_pages;
+		iov_pfn += lvl_pages;
+		phys_pfn += lvl_pages;
+		pteval += lvl_pages * VTD_PAGE_SIZE;
+		sg_res -= lvl_pages;
+
+		/* If the next PTE would be the first in a new page, then we
+		   need to flush the cache on the entries we've just written.
+		   And then we'll need to recalculate 'pte', so clear it and
+		   let it get set again in the if (!pte) block above.
+
+		   If we're done (!nr_pages) we need to flush the cache too.
+
+		   Also if we've been setting superpages, we may need to
+		   recalculate 'pte' and switch back to smaller pages for the
+		   end of the mapping, if the trailing size is not enough to
+		   use another superpage (i.e. sg_res < lvl_pages). */
+		pte++;
+		if (!nr_pages || first_pte_in_page(pte) ||
+		    (largepage_lvl > 1 && sg_res < lvl_pages)) {
+			domain_flush_cache(domain, first_pte,
+					   (void *)pte - (void *)first_pte);
+			pte = NULL;
+		}
+
+		if (!sg_res && nr_pages)
+			sg = sg_next(sg);
+	}
+	return 0;
+}
+
+static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
+				    struct scatterlist *sg, unsigned long nr_pages,
+				    int prot)
+{
+	return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
+}
+
+static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
+				     unsigned long phys_pfn, unsigned long nr_pages,
+				     int prot)
+{
+	return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
+}
+
+static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
+{
+	if (!iommu)
+		return;
+
+	clear_context_table(iommu, bus, devfn);
+	iommu->flush.flush_context(iommu, 0, 0, 0,
+					   DMA_CCMD_GLOBAL_INVL);
+	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
+}
+
+static void domain_remove_dev_info(struct dmar_domain *domain)
+{
+	struct device_domain_info *info;
+	unsigned long flags;
+	struct intel_iommu *iommu;
+
+	spin_lock_irqsave(&device_domain_lock, flags);
+	while (!list_empty(&domain->devices)) {
+		info = list_entry(domain->devices.next,
+			struct device_domain_info, link);
+		list_del(&info->link);
+		list_del(&info->global);
+		if (info->dev)
+			info->dev->dev.archdata.iommu = NULL;
+		spin_unlock_irqrestore(&device_domain_lock, flags);
+
+		iommu_disable_dev_iotlb(info);
+		iommu = device_to_iommu(info->segment, info->bus, info->devfn);
+		iommu_detach_dev(iommu, info->bus, info->devfn);
+		free_devinfo_mem(info);
+
+		spin_lock_irqsave(&device_domain_lock, flags);
+	}
+	spin_unlock_irqrestore(&device_domain_lock, flags);
+}
+
+/*
+ * find_domain
+ * Note: we use struct pci_dev->dev.archdata.iommu stores the info
+ */
+static struct dmar_domain *
+find_domain(struct pci_dev *pdev)
+{
+	struct device_domain_info *info;
+
+	/* No lock here, assumes no domain exit in normal case */
+	info = pdev->dev.archdata.iommu;
+	if (info)
+		return info->domain;
+	return NULL;
+}
+
+/* domain is initialized */
+static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
+{
+	struct dmar_domain *domain, *found = NULL;
+	struct intel_iommu *iommu;
+	struct dmar_drhd_unit *drhd;
+	struct device_domain_info *info, *tmp;
+	struct pci_dev *dev_tmp;
+	unsigned long flags;
+	int bus = 0, devfn = 0;
+	int segment;
+	int ret;
+
+	domain = find_domain(pdev);
+	if (domain)
+		return domain;
+
+	segment = pci_domain_nr(pdev->bus);
+
+	dev_tmp = pci_find_upstream_pcie_bridge(pdev);
+	if (dev_tmp) {
+		if (pci_is_pcie(dev_tmp)) {
+			bus = dev_tmp->subordinate->number;
+			devfn = 0;
+		} else {
+			bus = dev_tmp->bus->number;
+			devfn = dev_tmp->devfn;
+		}
+		spin_lock_irqsave(&device_domain_lock, flags);
+		list_for_each_entry(info, &device_domain_list, global) {
+			if (info->segment == segment &&
+			    info->bus == bus && info->devfn == devfn) {
+				found = info->domain;
+				break;
+			}
+		}
+		spin_unlock_irqrestore(&device_domain_lock, flags);
+		/* pcie-pci bridge already has a domain, uses it */
+		if (found) {
+			domain = found;
+			goto found_domain;
+		}
+	}
+
+	domain = alloc_domain();
+	if (!domain)
+		goto error;
+
+	/* Allocate new domain for the device */
+	drhd = dmar_find_matched_drhd_unit(pdev);
+	if (!drhd) {
+		printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
+			pci_name(pdev));
+		return NULL;
+	}
+	iommu = drhd->iommu;
+
+	ret = iommu_attach_domain(domain, iommu);
+	if (ret) {
+		free_domain_mem(domain);
+		goto error;
+	}
+
+	if (domain_init(domain, gaw)) {
+		domain_exit(domain);
+		goto error;
+	}
+
+	/* register pcie-to-pci device */
+	if (dev_tmp) {
+		info = alloc_devinfo_mem();
+		if (!info) {
+			domain_exit(domain);
+			goto error;
+		}
+		info->segment = segment;
+		info->bus = bus;
+		info->devfn = devfn;
+		info->dev = NULL;
+		info->domain = domain;
+		/* This domain is shared by devices under p2p bridge */
+		domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
+
+		/* pcie-to-pci bridge already has a domain, uses it */
+		found = NULL;
+		spin_lock_irqsave(&device_domain_lock, flags);
+		list_for_each_entry(tmp, &device_domain_list, global) {
+			if (tmp->segment == segment &&
+			    tmp->bus == bus && tmp->devfn == devfn) {
+				found = tmp->domain;
+				break;
+			}
+		}
+		if (found) {
+			spin_unlock_irqrestore(&device_domain_lock, flags);
+			free_devinfo_mem(info);
+			domain_exit(domain);
+			domain = found;
+		} else {
+			list_add(&info->link, &domain->devices);
+			list_add(&info->global, &device_domain_list);
+			spin_unlock_irqrestore(&device_domain_lock, flags);
+		}
+	}
+
+found_domain:
+	info = alloc_devinfo_mem();
+	if (!info)
+		goto error;
+	info->segment = segment;
+	info->bus = pdev->bus->number;
+	info->devfn = pdev->devfn;
+	info->dev = pdev;
+	info->domain = domain;
+	spin_lock_irqsave(&device_domain_lock, flags);
+	/* somebody is fast */
+	found = find_domain(pdev);
+	if (found != NULL) {
+		spin_unlock_irqrestore(&device_domain_lock, flags);
+		if (found != domain) {
+			domain_exit(domain);
+			domain = found;
+		}
+		free_devinfo_mem(info);
+		return domain;
+	}
+	list_add(&info->link, &domain->devices);
+	list_add(&info->global, &device_domain_list);
+	pdev->dev.archdata.iommu = info;
+	spin_unlock_irqrestore(&device_domain_lock, flags);
+	return domain;
+error:
+	/* recheck it here, maybe others set it */
+	return find_domain(pdev);
+}
+
+static int iommu_identity_mapping;
+#define IDENTMAP_ALL		1
+#define IDENTMAP_GFX		2
+#define IDENTMAP_AZALIA		4
+
+static int iommu_domain_identity_map(struct dmar_domain *domain,
+				     unsigned long long start,
+				     unsigned long long end)
+{
+	unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
+	unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
+
+	if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
+			  dma_to_mm_pfn(last_vpfn))) {
+		printk(KERN_ERR "IOMMU: reserve iova failed\n");
+		return -ENOMEM;
+	}
+
+	pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
+		 start, end, domain->id);
+	/*
+	 * RMRR range might have overlap with physical memory range,
+	 * clear it first
+	 */
+	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
+
+	return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
+				  last_vpfn - first_vpfn + 1,
+				  DMA_PTE_READ|DMA_PTE_WRITE);
+}
+
+static int iommu_prepare_identity_map(struct pci_dev *pdev,
+				      unsigned long long start,
+				      unsigned long long end)
+{
+	struct dmar_domain *domain;
+	int ret;
+
+	domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
+	if (!domain)
+		return -ENOMEM;
+
+	/* For _hardware_ passthrough, don't bother. But for software
+	   passthrough, we do it anyway -- it may indicate a memory
+	   range which is reserved in E820, so which didn't get set
+	   up to start with in si_domain */
+	if (domain == si_domain && hw_pass_through) {
+		printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
+		       pci_name(pdev), start, end);
+		return 0;
+	}
+
+	printk(KERN_INFO
+	       "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
+	       pci_name(pdev), start, end);
+	
+	if (end < start) {
+		WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
+			"BIOS vendor: %s; Ver: %s; Product Version: %s\n",
+			dmi_get_system_info(DMI_BIOS_VENDOR),
+			dmi_get_system_info(DMI_BIOS_VERSION),
+		     dmi_get_system_info(DMI_PRODUCT_VERSION));
+		ret = -EIO;
+		goto error;
+	}
+
+	if (end >> agaw_to_width(domain->agaw)) {
+		WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
+		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
+		     agaw_to_width(domain->agaw),
+		     dmi_get_system_info(DMI_BIOS_VENDOR),
+		     dmi_get_system_info(DMI_BIOS_VERSION),
+		     dmi_get_system_info(DMI_PRODUCT_VERSION));
+		ret = -EIO;
+		goto error;
+	}
+
+	ret = iommu_domain_identity_map(domain, start, end);
+	if (ret)
+		goto error;
+
+	/* context entry init */
+	ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
+	if (ret)
+		goto error;
+
+	return 0;
+
+ error:
+	domain_exit(domain);
+	return ret;
+}
+
+static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
+	struct pci_dev *pdev)
+{
+	if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
+		return 0;
+	return iommu_prepare_identity_map(pdev, rmrr->base_address,
+		rmrr->end_address);
+}
+
+#ifdef CONFIG_DMAR_FLOPPY_WA
+static inline void iommu_prepare_isa(void)
+{
+	struct pci_dev *pdev;
+	int ret;
+
+	pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
+	if (!pdev)
+		return;
+
+	printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
+	ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
+
+	if (ret)
+		printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
+		       "floppy might not work\n");
+
+}
+#else
+static inline void iommu_prepare_isa(void)
+{
+	return;
+}
+#endif /* !CONFIG_DMAR_FLPY_WA */
+
+static int md_domain_init(struct dmar_domain *domain, int guest_width);
+
+static int __init si_domain_work_fn(unsigned long start_pfn,
+				    unsigned long end_pfn, void *datax)
+{
+	int *ret = datax;
+
+	*ret = iommu_domain_identity_map(si_domain,
+					 (uint64_t)start_pfn << PAGE_SHIFT,
+					 (uint64_t)end_pfn << PAGE_SHIFT);
+	return *ret;
+
+}
+
+static int __init si_domain_init(int hw)
+{
+	struct dmar_drhd_unit *drhd;
+	struct intel_iommu *iommu;
+	int nid, ret = 0;
+
+	si_domain = alloc_domain();
+	if (!si_domain)
+		return -EFAULT;
+
+	pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
+
+	for_each_active_iommu(iommu, drhd) {
+		ret = iommu_attach_domain(si_domain, iommu);
+		if (ret) {
+			domain_exit(si_domain);
+			return -EFAULT;
+		}
+	}
+
+	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
+		domain_exit(si_domain);
+		return -EFAULT;
+	}
+
+	si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
+
+	if (hw)
+		return 0;
+
+	for_each_online_node(nid) {
+		work_with_active_regions(nid, si_domain_work_fn, &ret);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static void domain_remove_one_dev_info(struct dmar_domain *domain,
+					  struct pci_dev *pdev);
+static int identity_mapping(struct pci_dev *pdev)
+{
+	struct device_domain_info *info;
+
+	if (likely(!iommu_identity_mapping))
+		return 0;
+
+	info = pdev->dev.archdata.iommu;
+	if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
+		return (info->domain == si_domain);
+
+	return 0;
+}
+
+static int domain_add_dev_info(struct dmar_domain *domain,
+			       struct pci_dev *pdev,
+			       int translation)
+{
+	struct device_domain_info *info;
+	unsigned long flags;
+	int ret;
+
+	info = alloc_devinfo_mem();
+	if (!info)
+		return -ENOMEM;
+
+	ret = domain_context_mapping(domain, pdev, translation);
+	if (ret) {
+		free_devinfo_mem(info);
+		return ret;
+	}
+
+	info->segment = pci_domain_nr(pdev->bus);
+	info->bus = pdev->bus->number;
+	info->devfn = pdev->devfn;
+	info->dev = pdev;
+	info->domain = domain;
+
+	spin_lock_irqsave(&device_domain_lock, flags);
+	list_add(&info->link, &domain->devices);
+	list_add(&info->global, &device_domain_list);
+	pdev->dev.archdata.iommu = info;
+	spin_unlock_irqrestore(&device_domain_lock, flags);
+
+	return 0;
+}
+
+static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
+{
+	if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
+		return 1;
+
+	if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
+		return 1;
+
+	if (!(iommu_identity_mapping & IDENTMAP_ALL))
+		return 0;
+
+	/*
+	 * We want to start off with all devices in the 1:1 domain, and
+	 * take them out later if we find they can't access all of memory.
+	 *
+	 * However, we can't do this for PCI devices behind bridges,
+	 * because all PCI devices behind the same bridge will end up
+	 * with the same source-id on their transactions.
+	 *
+	 * Practically speaking, we can't change things around for these
+	 * devices at run-time, because we can't be sure there'll be no
+	 * DMA transactions in flight for any of their siblings.
+	 * 
+	 * So PCI devices (unless they're on the root bus) as well as
+	 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
+	 * the 1:1 domain, just in _case_ one of their siblings turns out
+	 * not to be able to map all of memory.
+	 */
+	if (!pci_is_pcie(pdev)) {
+		if (!pci_is_root_bus(pdev->bus))
+			return 0;
+		if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
+			return 0;
+	} else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
+		return 0;
+
+	/* 
+	 * At boot time, we don't yet know if devices will be 64-bit capable.
+	 * Assume that they will -- if they turn out not to be, then we can 
+	 * take them out of the 1:1 domain later.
+	 */
+	if (!startup) {
+		/*
+		 * If the device's dma_mask is less than the system's memory
+		 * size then this is not a candidate for identity mapping.
+		 */
+		u64 dma_mask = pdev->dma_mask;
+
+		if (pdev->dev.coherent_dma_mask &&
+		    pdev->dev.coherent_dma_mask < dma_mask)
+			dma_mask = pdev->dev.coherent_dma_mask;
+
+		return dma_mask >= dma_get_required_mask(&pdev->dev);
+	}
+
+	return 1;
+}
+
+static int __init iommu_prepare_static_identity_mapping(int hw)
+{
+	struct pci_dev *pdev = NULL;
+	int ret;
+
+	ret = si_domain_init(hw);
+	if (ret)
+		return -EFAULT;
+
+	for_each_pci_dev(pdev) {
+		/* Skip Host/PCI Bridge devices */
+		if (IS_BRIDGE_HOST_DEVICE(pdev))
+			continue;
+		if (iommu_should_identity_map(pdev, 1)) {
+			printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
+			       hw ? "hardware" : "software", pci_name(pdev));
+
+			ret = domain_add_dev_info(si_domain, pdev,
+						     hw ? CONTEXT_TT_PASS_THROUGH :
+						     CONTEXT_TT_MULTI_LEVEL);
+			if (ret)
+				return ret;
+		}
+	}
+
+	return 0;
+}
+
+static int __init init_dmars(void)
+{
+	struct dmar_drhd_unit *drhd;
+	struct dmar_rmrr_unit *rmrr;
+	struct pci_dev *pdev;
+	struct intel_iommu *iommu;
+	int i, ret;
+
+	/*
+	 * for each drhd
+	 *    allocate root
+	 *    initialize and program root entry to not present
+	 * endfor
+	 */
+	for_each_drhd_unit(drhd) {
+		g_num_of_iommus++;
+		/*
+		 * lock not needed as this is only incremented in the single
+		 * threaded kernel __init code path all other access are read
+		 * only
+		 */
+	}
+
+	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
+			GFP_KERNEL);
+	if (!g_iommus) {
+		printk(KERN_ERR "Allocating global iommu array failed\n");
+		ret = -ENOMEM;
+		goto error;
+	}
+
+	deferred_flush = kzalloc(g_num_of_iommus *
+		sizeof(struct deferred_flush_tables), GFP_KERNEL);
+	if (!deferred_flush) {
+		ret = -ENOMEM;
+		goto error;
+	}
+
+	for_each_drhd_unit(drhd) {
+		if (drhd->ignored)
+			continue;
+
+		iommu = drhd->iommu;
+		g_iommus[iommu->seq_id] = iommu;
+
+		ret = iommu_init_domains(iommu);
+		if (ret)
+			goto error;
+
+		/*
+		 * TBD:
+		 * we could share the same root & context tables
+		 * among all IOMMU's. Need to Split it later.
+		 */
+		ret = iommu_alloc_root_entry(iommu);
+		if (ret) {
+			printk(KERN_ERR "IOMMU: allocate root entry failed\n");
+			goto error;
+		}
+		if (!ecap_pass_through(iommu->ecap))
+			hw_pass_through = 0;
+	}
+
+	/*
+	 * Start from the sane iommu hardware state.
+	 */
+	for_each_drhd_unit(drhd) {
+		if (drhd->ignored)
+			continue;
+
+		iommu = drhd->iommu;
+
+		/*
+		 * If the queued invalidation is already initialized by us
+		 * (for example, while enabling interrupt-remapping) then
+		 * we got the things already rolling from a sane state.
+		 */
+		if (iommu->qi)
+			continue;
+
+		/*
+		 * Clear any previous faults.
+		 */
+		dmar_fault(-1, iommu);
+		/*
+		 * Disable queued invalidation if supported and already enabled
+		 * before OS handover.
+		 */
+		dmar_disable_qi(iommu);
+	}
+
+	for_each_drhd_unit(drhd) {
+		if (drhd->ignored)
+			continue;
+
+		iommu = drhd->iommu;
+
+		if (dmar_enable_qi(iommu)) {
+			/*
+			 * Queued Invalidate not enabled, use Register Based
+			 * Invalidate
+			 */
+			iommu->flush.flush_context = __iommu_flush_context;
+			iommu->flush.flush_iotlb = __iommu_flush_iotlb;
+			printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
+			       "invalidation\n",
+				iommu->seq_id,
+			       (unsigned long long)drhd->reg_base_addr);
+		} else {
+			iommu->flush.flush_context = qi_flush_context;
+			iommu->flush.flush_iotlb = qi_flush_iotlb;
+			printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
+			       "invalidation\n",
+				iommu->seq_id,
+			       (unsigned long long)drhd->reg_base_addr);
+		}
+	}
+
+	if (iommu_pass_through)
+		iommu_identity_mapping |= IDENTMAP_ALL;
+
+#ifdef CONFIG_DMAR_BROKEN_GFX_WA
+	iommu_identity_mapping |= IDENTMAP_GFX;
+#endif
+
+	check_tylersburg_isoch();
+
+	/*
+	 * If pass through is not set or not enabled, setup context entries for
+	 * identity mappings for rmrr, gfx, and isa and may fall back to static
+	 * identity mapping if iommu_identity_mapping is set.
+	 */
+	if (iommu_identity_mapping) {
+		ret = iommu_prepare_static_identity_mapping(hw_pass_through);
+		if (ret) {
+			printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
+			goto error;
+		}
+	}
+	/*
+	 * For each rmrr
+	 *   for each dev attached to rmrr
+	 *   do
+	 *     locate drhd for dev, alloc domain for dev
+	 *     allocate free domain
+	 *     allocate page table entries for rmrr
+	 *     if context not allocated for bus
+	 *           allocate and init context
+	 *           set present in root table for this bus
+	 *     init context with domain, translation etc
+	 *    endfor
+	 * endfor
+	 */
+	printk(KERN_INFO "IOMMU: Setting RMRR:\n");
+	for_each_rmrr_units(rmrr) {
+		for (i = 0; i < rmrr->devices_cnt; i++) {
+			pdev = rmrr->devices[i];
+			/*
+			 * some BIOS lists non-exist devices in DMAR
+			 * table.
+			 */
+			if (!pdev)
+				continue;
+			ret = iommu_prepare_rmrr_dev(rmrr, pdev);
+			if (ret)
+				printk(KERN_ERR
+				       "IOMMU: mapping reserved region failed\n");
+		}
+	}
+
+	iommu_prepare_isa();
+
+	/*
+	 * for each drhd
+	 *   enable fault log
+	 *   global invalidate context cache
+	 *   global invalidate iotlb
+	 *   enable translation
+	 */
+	for_each_drhd_unit(drhd) {
+		if (drhd->ignored) {
+			/*
+			 * we always have to disable PMRs or DMA may fail on
+			 * this device
+			 */
+			if (force_on)
+				iommu_disable_protect_mem_regions(drhd->iommu);
+			continue;
+		}
+		iommu = drhd->iommu;
+
+		iommu_flush_write_buffer(iommu);
+
+		ret = dmar_set_interrupt(iommu);
+		if (ret)
+			goto error;
+
+		iommu_set_root_entry(iommu);
+
+		iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
+		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
+
+		ret = iommu_enable_translation(iommu);
+		if (ret)
+			goto error;
+
+		iommu_disable_protect_mem_regions(iommu);
+	}
+
+	return 0;
+error:
+	for_each_drhd_unit(drhd) {
+		if (drhd->ignored)
+			continue;
+		iommu = drhd->iommu;
+		free_iommu(iommu);
+	}
+	kfree(g_iommus);
+	return ret;
+}
+
+/* This takes a number of _MM_ pages, not VTD pages */
+static struct iova *intel_alloc_iova(struct device *dev,
+				     struct dmar_domain *domain,
+				     unsigned long nrpages, uint64_t dma_mask)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct iova *iova = NULL;
+
+	/* Restrict dma_mask to the width that the iommu can handle */
+	dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
+
+	if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
+		/*
+		 * First try to allocate an io virtual address in
+		 * DMA_BIT_MASK(32) and if that fails then try allocating
+		 * from higher range
+		 */
+		iova = alloc_iova(&domain->iovad, nrpages,
+				  IOVA_PFN(DMA_BIT_MASK(32)), 1);
+		if (iova)
+			return iova;
+	}
+	iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
+	if (unlikely(!iova)) {
+		printk(KERN_ERR "Allocating %ld-page iova for %s failed",
+		       nrpages, pci_name(pdev));
+		return NULL;
+	}
+
+	return iova;
+}
+
+static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
+{
+	struct dmar_domain *domain;
+	int ret;
+
+	domain = get_domain_for_dev(pdev,
+			DEFAULT_DOMAIN_ADDRESS_WIDTH);
+	if (!domain) {
+		printk(KERN_ERR
+			"Allocating domain for %s failed", pci_name(pdev));
+		return NULL;
+	}
+
+	/* make sure context mapping is ok */
+	if (unlikely(!domain_context_mapped(pdev))) {
+		ret = domain_context_mapping(domain, pdev,
+					     CONTEXT_TT_MULTI_LEVEL);
+		if (ret) {
+			printk(KERN_ERR
+				"Domain context map for %s failed",
+				pci_name(pdev));
+			return NULL;
+		}
+	}
+
+	return domain;
+}
+
+static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
+{
+	struct device_domain_info *info;
+
+	/* No lock here, assumes no domain exit in normal case */
+	info = dev->dev.archdata.iommu;
+	if (likely(info))
+		return info->domain;
+
+	return __get_valid_domain_for_dev(dev);
+}
+
+static int iommu_dummy(struct pci_dev *pdev)
+{
+	return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
+}
+
+/* Check if the pdev needs to go through non-identity map and unmap process.*/
+static int iommu_no_mapping(struct device *dev)
+{
+	struct pci_dev *pdev;
+	int found;
+
+	if (unlikely(dev->bus != &pci_bus_type))
+		return 1;
+
+	pdev = to_pci_dev(dev);
+	if (iommu_dummy(pdev))
+		return 1;
+
+	if (!iommu_identity_mapping)
+		return 0;
+
+	found = identity_mapping(pdev);
+	if (found) {
+		if (iommu_should_identity_map(pdev, 0))
+			return 1;
+		else {
+			/*
+			 * 32 bit DMA is removed from si_domain and fall back
+			 * to non-identity mapping.
+			 */
+			domain_remove_one_dev_info(si_domain, pdev);
+			printk(KERN_INFO "32bit %s uses non-identity mapping\n",
+			       pci_name(pdev));
+			return 0;
+		}
+	} else {
+		/*
+		 * In case of a detached 64 bit DMA device from vm, the device
+		 * is put into si_domain for identity mapping.
+		 */
+		if (iommu_should_identity_map(pdev, 0)) {
+			int ret;
+			ret = domain_add_dev_info(si_domain, pdev,
+						  hw_pass_through ?
+						  CONTEXT_TT_PASS_THROUGH :
+						  CONTEXT_TT_MULTI_LEVEL);
+			if (!ret) {
+				printk(KERN_INFO "64bit %s uses identity mapping\n",
+				       pci_name(pdev));
+				return 1;
+			}
+		}
+	}
+
+	return 0;
+}
+
+static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
+				     size_t size, int dir, u64 dma_mask)
+{
+	struct pci_dev *pdev = to_pci_dev(hwdev);
+	struct dmar_domain *domain;
+	phys_addr_t start_paddr;
+	struct iova *iova;
+	int prot = 0;
+	int ret;
+	struct intel_iommu *iommu;
+	unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
+
+	BUG_ON(dir == DMA_NONE);
+
+	if (iommu_no_mapping(hwdev))
+		return paddr;
+
+	domain = get_valid_domain_for_dev(pdev);
+	if (!domain)
+		return 0;
+
+	iommu = domain_get_iommu(domain);
+	size = aligned_nrpages(paddr, size);
+
+	iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
+	if (!iova)
+		goto error;
+
+	/*
+	 * Check if DMAR supports zero-length reads on write only
+	 * mappings..
+	 */
+	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
+			!cap_zlr(iommu->cap))
+		prot |= DMA_PTE_READ;
+	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
+		prot |= DMA_PTE_WRITE;
+	/*
+	 * paddr - (paddr + size) might be partial page, we should map the whole
+	 * page.  Note: if two part of one page are separately mapped, we
+	 * might have two guest_addr mapping to the same host paddr, but this
+	 * is not a big problem
+	 */
+	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
+				 mm_to_dma_pfn(paddr_pfn), size, prot);
+	if (ret)
+		goto error;
+
+	/* it's a non-present to present mapping. Only flush if caching mode */
+	if (cap_caching_mode(iommu->cap))
+		iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
+	else
+		iommu_flush_write_buffer(iommu);
+
+	start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
+	start_paddr += paddr & ~PAGE_MASK;
+	return start_paddr;
+
+error:
+	if (iova)
+		__free_iova(&domain->iovad, iova);
+	printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n",
+		pci_name(pdev), size, (unsigned long long)paddr, dir);
+	return 0;
+}
+
+static dma_addr_t intel_map_page(struct device *dev, struct page *page,
+				 unsigned long offset, size_t size,
+				 enum dma_data_direction dir,
+				 struct dma_attrs *attrs)
+{
+	return __intel_map_single(dev, page_to_phys(page) + offset, size,
+				  dir, to_pci_dev(dev)->dma_mask);
+}
+
+static void flush_unmaps(void)
+{
+	int i, j;
+
+	timer_on = 0;
+
+	/* just flush them all */
+	for (i = 0; i < g_num_of_iommus; i++) {
+		struct intel_iommu *iommu = g_iommus[i];
+		if (!iommu)
+			continue;
+
+		if (!deferred_flush[i].next)
+			continue;
+
+		/* In caching mode, global flushes turn emulation expensive */
+		if (!cap_caching_mode(iommu->cap))
+			iommu->flush.flush_iotlb(iommu, 0, 0, 0,
+					 DMA_TLB_GLOBAL_FLUSH);
+		for (j = 0; j < deferred_flush[i].next; j++) {
+			unsigned long mask;
+			struct iova *iova = deferred_flush[i].iova[j];
+			struct dmar_domain *domain = deferred_flush[i].domain[j];
+
+			/* On real hardware multiple invalidations are expensive */
+			if (cap_caching_mode(iommu->cap))
+				iommu_flush_iotlb_psi(iommu, domain->id,
+				iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
+			else {
+				mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
+				iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
+						(uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
+			}
+			__free_iova(&deferred_flush[i].domain[j]->iovad, iova);
+		}
+		deferred_flush[i].next = 0;
+	}
+
+	list_size = 0;
+}
+
+static void flush_unmaps_timeout(unsigned long data)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&async_umap_flush_lock, flags);
+	flush_unmaps();
+	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
+}
+
+static void add_unmap(struct dmar_domain *dom, struct iova *iova)
+{
+	unsigned long flags;
+	int next, iommu_id;
+	struct intel_iommu *iommu;
+
+	spin_lock_irqsave(&async_umap_flush_lock, flags);
+	if (list_size == HIGH_WATER_MARK)
+		flush_unmaps();
+
+	iommu = domain_get_iommu(dom);
+	iommu_id = iommu->seq_id;
+
+	next = deferred_flush[iommu_id].next;
+	deferred_flush[iommu_id].domain[next] = dom;
+	deferred_flush[iommu_id].iova[next] = iova;
+	deferred_flush[iommu_id].next++;
+
+	if (!timer_on) {
+		mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
+		timer_on = 1;
+	}
+	list_size++;
+	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
+}
+
+static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
+			     size_t size, enum dma_data_direction dir,
+			     struct dma_attrs *attrs)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct dmar_domain *domain;
+	unsigned long start_pfn, last_pfn;
+	struct iova *iova;
+	struct intel_iommu *iommu;
+
+	if (iommu_no_mapping(dev))
+		return;
+
+	domain = find_domain(pdev);
+	BUG_ON(!domain);
+
+	iommu = domain_get_iommu(domain);
+
+	iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
+	if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
+		      (unsigned long long)dev_addr))
+		return;
+
+	start_pfn = mm_to_dma_pfn(iova->pfn_lo);
+	last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
+
+	pr_debug("Device %s unmapping: pfn %lx-%lx\n",
+		 pci_name(pdev), start_pfn, last_pfn);
+
+	/*  clear the whole page */
+	dma_pte_clear_range(domain, start_pfn, last_pfn);
+
+	/* free page tables */
+	dma_pte_free_pagetable(domain, start_pfn, last_pfn);
+
+	if (intel_iommu_strict) {
+		iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
+				      last_pfn - start_pfn + 1, 0);
+		/* free iova */
+		__free_iova(&domain->iovad, iova);
+	} else {
+		add_unmap(domain, iova);
+		/*
+		 * queue up the release of the unmap to save the 1/6th of the
+		 * cpu used up by the iotlb flush operation...
+		 */
+	}
+}
+
+static void *intel_alloc_coherent(struct device *hwdev, size_t size,
+				  dma_addr_t *dma_handle, gfp_t flags)
+{
+	void *vaddr;
+	int order;
+
+	size = PAGE_ALIGN(size);
+	order = get_order(size);
+
+	if (!iommu_no_mapping(hwdev))
+		flags &= ~(GFP_DMA | GFP_DMA32);
+	else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
+		if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
+			flags |= GFP_DMA;
+		else
+			flags |= GFP_DMA32;
+	}
+
+	vaddr = (void *)__get_free_pages(flags, order);
+	if (!vaddr)
+		return NULL;
+	memset(vaddr, 0, size);
+
+	*dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
+					 DMA_BIDIRECTIONAL,
+					 hwdev->coherent_dma_mask);
+	if (*dma_handle)
+		return vaddr;
+	free_pages((unsigned long)vaddr, order);
+	return NULL;
+}
+
+static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
+				dma_addr_t dma_handle)
+{
+	int order;
+
+	size = PAGE_ALIGN(size);
+	order = get_order(size);
+
+	intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
+	free_pages((unsigned long)vaddr, order);
+}
+
+static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
+			   int nelems, enum dma_data_direction dir,
+			   struct dma_attrs *attrs)
+{
+	struct pci_dev *pdev = to_pci_dev(hwdev);
+	struct dmar_domain *domain;
+	unsigned long start_pfn, last_pfn;
+	struct iova *iova;
+	struct intel_iommu *iommu;
+
+	if (iommu_no_mapping(hwdev))
+		return;
+
+	domain = find_domain(pdev);
+	BUG_ON(!domain);
+
+	iommu = domain_get_iommu(domain);
+
+	iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
+	if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
+		      (unsigned long long)sglist[0].dma_address))
+		return;
+
+	start_pfn = mm_to_dma_pfn(iova->pfn_lo);
+	last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
+
+	/*  clear the whole page */
+	dma_pte_clear_range(domain, start_pfn, last_pfn);
+
+	/* free page tables */
+	dma_pte_free_pagetable(domain, start_pfn, last_pfn);
+
+	if (intel_iommu_strict) {
+		iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
+				      last_pfn - start_pfn + 1, 0);
+		/* free iova */
+		__free_iova(&domain->iovad, iova);
+	} else {
+		add_unmap(domain, iova);
+		/*
+		 * queue up the release of the unmap to save the 1/6th of the
+		 * cpu used up by the iotlb flush operation...
+		 */
+	}
+}
+
+static int intel_nontranslate_map_sg(struct device *hddev,
+	struct scatterlist *sglist, int nelems, int dir)
+{
+	int i;
+	struct scatterlist *sg;
+
+	for_each_sg(sglist, sg, nelems, i) {
+		BUG_ON(!sg_page(sg));
+		sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
+		sg->dma_length = sg->length;
+	}
+	return nelems;
+}
+
+static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
+			enum dma_data_direction dir, struct dma_attrs *attrs)
+{
+	int i;
+	struct pci_dev *pdev = to_pci_dev(hwdev);
+	struct dmar_domain *domain;
+	size_t size = 0;
+	int prot = 0;
+	struct iova *iova = NULL;
+	int ret;
+	struct scatterlist *sg;
+	unsigned long start_vpfn;
+	struct intel_iommu *iommu;
+
+	BUG_ON(dir == DMA_NONE);
+	if (iommu_no_mapping(hwdev))
+		return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
+
+	domain = get_valid_domain_for_dev(pdev);
+	if (!domain)
+		return 0;
+
+	iommu = domain_get_iommu(domain);
+
+	for_each_sg(sglist, sg, nelems, i)
+		size += aligned_nrpages(sg->offset, sg->length);
+
+	iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
+				pdev->dma_mask);
+	if (!iova) {
+		sglist->dma_length = 0;
+		return 0;
+	}
+
+	/*
+	 * Check if DMAR supports zero-length reads on write only
+	 * mappings..
+	 */
+	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
+			!cap_zlr(iommu->cap))
+		prot |= DMA_PTE_READ;
+	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
+		prot |= DMA_PTE_WRITE;
+
+	start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
+
+	ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
+	if (unlikely(ret)) {
+		/*  clear the page */
+		dma_pte_clear_range(domain, start_vpfn,
+				    start_vpfn + size - 1);
+		/* free page tables */
+		dma_pte_free_pagetable(domain, start_vpfn,
+				       start_vpfn + size - 1);
+		/* free iova */
+		__free_iova(&domain->iovad, iova);
+		return 0;
+	}
+
+	/* it's a non-present to present mapping. Only flush if caching mode */
+	if (cap_caching_mode(iommu->cap))
+		iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
+	else
+		iommu_flush_write_buffer(iommu);
+
+	return nelems;
+}
+
+static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+	return !dma_addr;
+}
+
+struct dma_map_ops intel_dma_ops = {
+	.alloc_coherent = intel_alloc_coherent,
+	.free_coherent = intel_free_coherent,
+	.map_sg = intel_map_sg,
+	.unmap_sg = intel_unmap_sg,
+	.map_page = intel_map_page,
+	.unmap_page = intel_unmap_page,
+	.mapping_error = intel_mapping_error,
+};
+
+static inline int iommu_domain_cache_init(void)
+{
+	int ret = 0;
+
+	iommu_domain_cache = kmem_cache_create("iommu_domain",
+					 sizeof(struct dmar_domain),
+					 0,
+					 SLAB_HWCACHE_ALIGN,
+
+					 NULL);
+	if (!iommu_domain_cache) {
+		printk(KERN_ERR "Couldn't create iommu_domain cache\n");
+		ret = -ENOMEM;
+	}
+
+	return ret;
+}
+
+static inline int iommu_devinfo_cache_init(void)
+{
+	int ret = 0;
+
+	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
+					 sizeof(struct device_domain_info),
+					 0,
+					 SLAB_HWCACHE_ALIGN,
+					 NULL);
+	if (!iommu_devinfo_cache) {
+		printk(KERN_ERR "Couldn't create devinfo cache\n");
+		ret = -ENOMEM;
+	}
+
+	return ret;
+}
+
+static inline int iommu_iova_cache_init(void)
+{
+	int ret = 0;
+
+	iommu_iova_cache = kmem_cache_create("iommu_iova",
+					 sizeof(struct iova),
+					 0,
+					 SLAB_HWCACHE_ALIGN,
+					 NULL);
+	if (!iommu_iova_cache) {
+		printk(KERN_ERR "Couldn't create iova cache\n");
+		ret = -ENOMEM;
+	}
+
+	return ret;
+}
+
+static int __init iommu_init_mempool(void)
+{
+	int ret;
+	ret = iommu_iova_cache_init();
+	if (ret)
+		return ret;
+
+	ret = iommu_domain_cache_init();
+	if (ret)
+		goto domain_error;
+
+	ret = iommu_devinfo_cache_init();
+	if (!ret)
+		return ret;
+
+	kmem_cache_destroy(iommu_domain_cache);
+domain_error:
+	kmem_cache_destroy(iommu_iova_cache);
+
+	return -ENOMEM;
+}
+
+static void __init iommu_exit_mempool(void)
+{
+	kmem_cache_destroy(iommu_devinfo_cache);
+	kmem_cache_destroy(iommu_domain_cache);
+	kmem_cache_destroy(iommu_iova_cache);
+
+}
+
+static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
+{
+	struct dmar_drhd_unit *drhd;
+	u32 vtbar;
+	int rc;
+
+	/* We know that this device on this chipset has its own IOMMU.
+	 * If we find it under a different IOMMU, then the BIOS is lying
+	 * to us. Hope that the IOMMU for this device is actually
+	 * disabled, and it needs no translation...
+	 */
+	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
+	if (rc) {
+		/* "can't" happen */
+		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
+		return;
+	}
+	vtbar &= 0xffff0000;
+
+	/* we know that the this iommu should be at offset 0xa000 from vtbar */
+	drhd = dmar_find_matched_drhd_unit(pdev);
+	if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
+			    TAINT_FIRMWARE_WORKAROUND,
+			    "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
+		pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
+}
+DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
+
+static void __init init_no_remapping_devices(void)
+{
+	struct dmar_drhd_unit *drhd;
+
+	for_each_drhd_unit(drhd) {
+		if (!drhd->include_all) {
+			int i;
+			for (i = 0; i < drhd->devices_cnt; i++)
+				if (drhd->devices[i] != NULL)
+					break;
+			/* ignore DMAR unit if no pci devices exist */
+			if (i == drhd->devices_cnt)
+				drhd->ignored = 1;
+		}
+	}
+
+	if (dmar_map_gfx)
+		return;
+
+	for_each_drhd_unit(drhd) {
+		int i;
+		if (drhd->ignored || drhd->include_all)
+			continue;
+
+		for (i = 0; i < drhd->devices_cnt; i++)
+			if (drhd->devices[i] &&
+				!IS_GFX_DEVICE(drhd->devices[i]))
+				break;
+
+		if (i < drhd->devices_cnt)
+			continue;
+
+		/* bypass IOMMU if it is just for gfx devices */
+		drhd->ignored = 1;
+		for (i = 0; i < drhd->devices_cnt; i++) {
+			if (!drhd->devices[i])
+				continue;
+			drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
+		}
+	}
+}
+
+#ifdef CONFIG_SUSPEND
+static int init_iommu_hw(void)
+{
+	struct dmar_drhd_unit *drhd;
+	struct intel_iommu *iommu = NULL;
+
+	for_each_active_iommu(iommu, drhd)
+		if (iommu->qi)
+			dmar_reenable_qi(iommu);
+
+	for_each_iommu(iommu, drhd) {
+		if (drhd->ignored) {
+			/*
+			 * we always have to disable PMRs or DMA may fail on
+			 * this device
+			 */
+			if (force_on)
+				iommu_disable_protect_mem_regions(iommu);
+			continue;
+		}
+	
+		iommu_flush_write_buffer(iommu);
+
+		iommu_set_root_entry(iommu);
+
+		iommu->flush.flush_context(iommu, 0, 0, 0,
+					   DMA_CCMD_GLOBAL_INVL);
+		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
+					 DMA_TLB_GLOBAL_FLUSH);
+		if (iommu_enable_translation(iommu))
+			return 1;
+		iommu_disable_protect_mem_regions(iommu);
+	}
+
+	return 0;
+}
+
+static void iommu_flush_all(void)
+{
+	struct dmar_drhd_unit *drhd;
+	struct intel_iommu *iommu;
+
+	for_each_active_iommu(iommu, drhd) {
+		iommu->flush.flush_context(iommu, 0, 0, 0,
+					   DMA_CCMD_GLOBAL_INVL);
+		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
+					 DMA_TLB_GLOBAL_FLUSH);
+	}
+}
+
+static int iommu_suspend(void)
+{
+	struct dmar_drhd_unit *drhd;
+	struct intel_iommu *iommu = NULL;
+	unsigned long flag;
+
+	for_each_active_iommu(iommu, drhd) {
+		iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
+						 GFP_ATOMIC);
+		if (!iommu->iommu_state)
+			goto nomem;
+	}
+
+	iommu_flush_all();
+
+	for_each_active_iommu(iommu, drhd) {
+		iommu_disable_translation(iommu);
+
+		spin_lock_irqsave(&iommu->register_lock, flag);
+
+		iommu->iommu_state[SR_DMAR_FECTL_REG] =
+			readl(iommu->reg + DMAR_FECTL_REG);
+		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
+			readl(iommu->reg + DMAR_FEDATA_REG);
+		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
+			readl(iommu->reg + DMAR_FEADDR_REG);
+		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
+			readl(iommu->reg + DMAR_FEUADDR_REG);
+
+		spin_unlock_irqrestore(&iommu->register_lock, flag);
+	}
+	return 0;
+
+nomem:
+	for_each_active_iommu(iommu, drhd)
+		kfree(iommu->iommu_state);
+
+	return -ENOMEM;
+}
+
+static void iommu_resume(void)
+{
+	struct dmar_drhd_unit *drhd;
+	struct intel_iommu *iommu = NULL;
+	unsigned long flag;
+
+	if (init_iommu_hw()) {
+		if (force_on)
+			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
+		else
+			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
+		return;
+	}
+
+	for_each_active_iommu(iommu, drhd) {
+
+		spin_lock_irqsave(&iommu->register_lock, flag);
+
+		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
+			iommu->reg + DMAR_FECTL_REG);
+		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
+			iommu->reg + DMAR_FEDATA_REG);
+		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
+			iommu->reg + DMAR_FEADDR_REG);
+		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
+			iommu->reg + DMAR_FEUADDR_REG);
+
+		spin_unlock_irqrestore(&iommu->register_lock, flag);
+	}
+
+	for_each_active_iommu(iommu, drhd)
+		kfree(iommu->iommu_state);
+}
+
+static struct syscore_ops iommu_syscore_ops = {
+	.resume		= iommu_resume,
+	.suspend	= iommu_suspend,
+};
+
+static void __init init_iommu_pm_ops(void)
+{
+	register_syscore_ops(&iommu_syscore_ops);
+}
+
+#else
+static inline void init_iommu_pm_ops(void) {}
+#endif	/* CONFIG_PM */
+
+/*
+ * Here we only respond to action of unbound device from driver.
+ *
+ * Added device is not attached to its DMAR domain here yet. That will happen
+ * when mapping the device to iova.
+ */
+static int device_notifier(struct notifier_block *nb,
+				  unsigned long action, void *data)
+{
+	struct device *dev = data;
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct dmar_domain *domain;
+
+	if (iommu_no_mapping(dev))
+		return 0;
+
+	domain = find_domain(pdev);
+	if (!domain)
+		return 0;
+
+	if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
+		domain_remove_one_dev_info(domain, pdev);
+
+		if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
+		    !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
+		    list_empty(&domain->devices))
+			domain_exit(domain);
+	}
+
+	return 0;
+}
+
+static struct notifier_block device_nb = {
+	.notifier_call = device_notifier,
+};
+
+int __init intel_iommu_init(void)
+{
+	int ret = 0;
+
+	/* VT-d is required for a TXT/tboot launch, so enforce that */
+	force_on = tboot_force_iommu();
+
+	if (dmar_table_init()) {
+		if (force_on)
+			panic("tboot: Failed to initialize DMAR table\n");
+		return 	-ENODEV;
+	}
+
+	if (dmar_dev_scope_init()) {
+		if (force_on)
+			panic("tboot: Failed to initialize DMAR device scope\n");
+		return 	-ENODEV;
+	}
+
+	/*
+	 * Check the need for DMA-remapping initialization now.
+	 * Above initialization will also be used by Interrupt-remapping.
+	 */
+	if (no_iommu || dmar_disabled)
+		return -ENODEV;
+
+	if (iommu_init_mempool()) {
+		if (force_on)
+			panic("tboot: Failed to initialize iommu memory\n");
+		return 	-ENODEV;
+	}
+
+	if (dmar_init_reserved_ranges()) {
+		if (force_on)
+			panic("tboot: Failed to reserve iommu ranges\n");
+		return 	-ENODEV;
+	}
+
+	init_no_remapping_devices();
+
+	ret = init_dmars();
+	if (ret) {
+		if (force_on)
+			panic("tboot: Failed to initialize DMARs\n");
+		printk(KERN_ERR "IOMMU: dmar init failed\n");
+		put_iova_domain(&reserved_iova_list);
+		iommu_exit_mempool();
+		return ret;
+	}
+	printk(KERN_INFO
+	"PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
+
+	init_timer(&unmap_timer);
+#ifdef CONFIG_SWIOTLB
+	swiotlb = 0;
+#endif
+	dma_ops = &intel_dma_ops;
+
+	init_iommu_pm_ops();
+
+	register_iommu(&intel_iommu_ops);
+
+	bus_register_notifier(&pci_bus_type, &device_nb);
+
+	return 0;
+}
+
+static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
+					   struct pci_dev *pdev)
+{
+	struct pci_dev *tmp, *parent;
+
+	if (!iommu || !pdev)
+		return;
+
+	/* dependent device detach */
+	tmp = pci_find_upstream_pcie_bridge(pdev);
+	/* Secondary interface's bus number and devfn 0 */
+	if (tmp) {
+		parent = pdev->bus->self;
+		while (parent != tmp) {
+			iommu_detach_dev(iommu, parent->bus->number,
+					 parent->devfn);
+			parent = parent->bus->self;
+		}
+		if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
+			iommu_detach_dev(iommu,
+				tmp->subordinate->number, 0);
+		else /* this is a legacy PCI bridge */
+			iommu_detach_dev(iommu, tmp->bus->number,
+					 tmp->devfn);
+	}
+}
+
+static void domain_remove_one_dev_info(struct dmar_domain *domain,
+					  struct pci_dev *pdev)
+{
+	struct device_domain_info *info;
+	struct intel_iommu *iommu;
+	unsigned long flags;
+	int found = 0;
+	struct list_head *entry, *tmp;
+
+	iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
+				pdev->devfn);
+	if (!iommu)
+		return;
+
+	spin_lock_irqsave(&device_domain_lock, flags);
+	list_for_each_safe(entry, tmp, &domain->devices) {
+		info = list_entry(entry, struct device_domain_info, link);
+		if (info->segment == pci_domain_nr(pdev->bus) &&
+		    info->bus == pdev->bus->number &&
+		    info->devfn == pdev->devfn) {
+			list_del(&info->link);
+			list_del(&info->global);
+			if (info->dev)
+				info->dev->dev.archdata.iommu = NULL;
+			spin_unlock_irqrestore(&device_domain_lock, flags);
+
+			iommu_disable_dev_iotlb(info);
+			iommu_detach_dev(iommu, info->bus, info->devfn);
+			iommu_detach_dependent_devices(iommu, pdev);
+			free_devinfo_mem(info);
+
+			spin_lock_irqsave(&device_domain_lock, flags);
+
+			if (found)
+				break;
+			else
+				continue;
+		}
+
+		/* if there is no other devices under the same iommu
+		 * owned by this domain, clear this iommu in iommu_bmp
+		 * update iommu count and coherency
+		 */
+		if (iommu == device_to_iommu(info->segment, info->bus,
+					    info->devfn))
+			found = 1;
+	}
+
+	if (found == 0) {
+		unsigned long tmp_flags;
+		spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
+		clear_bit(iommu->seq_id, &domain->iommu_bmp);
+		domain->iommu_count--;
+		domain_update_iommu_cap(domain);
+		spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
+
+		if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
+		    !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
+			spin_lock_irqsave(&iommu->lock, tmp_flags);
+			clear_bit(domain->id, iommu->domain_ids);
+			iommu->domains[domain->id] = NULL;
+			spin_unlock_irqrestore(&iommu->lock, tmp_flags);
+		}
+	}
+
+	spin_unlock_irqrestore(&device_domain_lock, flags);
+}
+
+static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
+{
+	struct device_domain_info *info;
+	struct intel_iommu *iommu;
+	unsigned long flags1, flags2;
+
+	spin_lock_irqsave(&device_domain_lock, flags1);
+	while (!list_empty(&domain->devices)) {
+		info = list_entry(domain->devices.next,
+			struct device_domain_info, link);
+		list_del(&info->link);
+		list_del(&info->global);
+		if (info->dev)
+			info->dev->dev.archdata.iommu = NULL;
+
+		spin_unlock_irqrestore(&device_domain_lock, flags1);
+
+		iommu_disable_dev_iotlb(info);
+		iommu = device_to_iommu(info->segment, info->bus, info->devfn);
+		iommu_detach_dev(iommu, info->bus, info->devfn);
+		iommu_detach_dependent_devices(iommu, info->dev);
+
+		/* clear this iommu in iommu_bmp, update iommu count
+		 * and capabilities
+		 */
+		spin_lock_irqsave(&domain->iommu_lock, flags2);
+		if (test_and_clear_bit(iommu->seq_id,
+				       &domain->iommu_bmp)) {
+			domain->iommu_count--;
+			domain_update_iommu_cap(domain);
+		}
+		spin_unlock_irqrestore(&domain->iommu_lock, flags2);
+
+		free_devinfo_mem(info);
+		spin_lock_irqsave(&device_domain_lock, flags1);
+	}
+	spin_unlock_irqrestore(&device_domain_lock, flags1);
+}
+
+/* domain id for virtual machine, it won't be set in context */
+static unsigned long vm_domid;
+
+static struct dmar_domain *iommu_alloc_vm_domain(void)
+{
+	struct dmar_domain *domain;
+
+	domain = alloc_domain_mem();
+	if (!domain)
+		return NULL;
+
+	domain->id = vm_domid++;
+	domain->nid = -1;
+	memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
+	domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
+
+	return domain;
+}
+
+static int md_domain_init(struct dmar_domain *domain, int guest_width)
+{
+	int adjust_width;
+
+	init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
+	spin_lock_init(&domain->iommu_lock);
+
+	domain_reserve_special_ranges(domain);
+
+	/* calculate AGAW */
+	domain->gaw = guest_width;
+	adjust_width = guestwidth_to_adjustwidth(guest_width);
+	domain->agaw = width_to_agaw(adjust_width);
+
+	INIT_LIST_HEAD(&domain->devices);
+
+	domain->iommu_count = 0;
+	domain->iommu_coherency = 0;
+	domain->iommu_snooping = 0;
+	domain->iommu_superpage = 0;
+	domain->max_addr = 0;
+	domain->nid = -1;
+
+	/* always allocate the top pgd */
+	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
+	if (!domain->pgd)
+		return -ENOMEM;
+	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
+	return 0;
+}
+
+static void iommu_free_vm_domain(struct dmar_domain *domain)
+{
+	unsigned long flags;
+	struct dmar_drhd_unit *drhd;
+	struct intel_iommu *iommu;
+	unsigned long i;
+	unsigned long ndomains;
+
+	for_each_drhd_unit(drhd) {
+		if (drhd->ignored)
+			continue;
+		iommu = drhd->iommu;
+
+		ndomains = cap_ndoms(iommu->cap);
+		for_each_set_bit(i, iommu->domain_ids, ndomains) {
+			if (iommu->domains[i] == domain) {
+				spin_lock_irqsave(&iommu->lock, flags);
+				clear_bit(i, iommu->domain_ids);
+				iommu->domains[i] = NULL;
+				spin_unlock_irqrestore(&iommu->lock, flags);
+				break;
+			}
+		}
+	}
+}
+
+static void vm_domain_exit(struct dmar_domain *domain)
+{
+	/* Domain 0 is reserved, so dont process it */
+	if (!domain)
+		return;
+
+	vm_domain_remove_all_dev_info(domain);
+	/* destroy iovas */
+	put_iova_domain(&domain->iovad);
+
+	/* clear ptes */
+	dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
+
+	/* free page tables */
+	dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
+
+	iommu_free_vm_domain(domain);
+	free_domain_mem(domain);
+}
+
+static int intel_iommu_domain_init(struct iommu_domain *domain)
+{
+	struct dmar_domain *dmar_domain;
+
+	dmar_domain = iommu_alloc_vm_domain();
+	if (!dmar_domain) {
+		printk(KERN_ERR
+			"intel_iommu_domain_init: dmar_domain == NULL\n");
+		return -ENOMEM;
+	}
+	if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
+		printk(KERN_ERR
+			"intel_iommu_domain_init() failed\n");
+		vm_domain_exit(dmar_domain);
+		return -ENOMEM;
+	}
+	domain->priv = dmar_domain;
+
+	return 0;
+}
+
+static void intel_iommu_domain_destroy(struct iommu_domain *domain)
+{
+	struct dmar_domain *dmar_domain = domain->priv;
+
+	domain->priv = NULL;
+	vm_domain_exit(dmar_domain);
+}
+
+static int intel_iommu_attach_device(struct iommu_domain *domain,
+				     struct device *dev)
+{
+	struct dmar_domain *dmar_domain = domain->priv;
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct intel_iommu *iommu;
+	int addr_width;
+
+	/* normally pdev is not mapped */
+	if (unlikely(domain_context_mapped(pdev))) {
+		struct dmar_domain *old_domain;
+
+		old_domain = find_domain(pdev);
+		if (old_domain) {
+			if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
+			    dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
+				domain_remove_one_dev_info(old_domain, pdev);
+			else
+				domain_remove_dev_info(old_domain);
+		}
+	}
+
+	iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
+				pdev->devfn);
+	if (!iommu)
+		return -ENODEV;
+
+	/* check if this iommu agaw is sufficient for max mapped address */
+	addr_width = agaw_to_width(iommu->agaw);
+	if (addr_width > cap_mgaw(iommu->cap))
+		addr_width = cap_mgaw(iommu->cap);
+
+	if (dmar_domain->max_addr > (1LL << addr_width)) {
+		printk(KERN_ERR "%s: iommu width (%d) is not "
+		       "sufficient for the mapped address (%llx)\n",
+		       __func__, addr_width, dmar_domain->max_addr);
+		return -EFAULT;
+	}
+	dmar_domain->gaw = addr_width;
+
+	/*
+	 * Knock out extra levels of page tables if necessary
+	 */
+	while (iommu->agaw < dmar_domain->agaw) {
+		struct dma_pte *pte;
+
+		pte = dmar_domain->pgd;
+		if (dma_pte_present(pte)) {
+			dmar_domain->pgd = (struct dma_pte *)
+				phys_to_virt(dma_pte_addr(pte));
+			free_pgtable_page(pte);
+		}
+		dmar_domain->agaw--;
+	}
+
+	return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
+}
+
+static void intel_iommu_detach_device(struct iommu_domain *domain,
+				      struct device *dev)
+{
+	struct dmar_domain *dmar_domain = domain->priv;
+	struct pci_dev *pdev = to_pci_dev(dev);
+
+	domain_remove_one_dev_info(dmar_domain, pdev);
+}
+
+static int intel_iommu_map(struct iommu_domain *domain,
+			   unsigned long iova, phys_addr_t hpa,
+			   int gfp_order, int iommu_prot)
+{
+	struct dmar_domain *dmar_domain = domain->priv;
+	u64 max_addr;
+	int prot = 0;
+	size_t size;
+	int ret;
+
+	if (iommu_prot & IOMMU_READ)
+		prot |= DMA_PTE_READ;
+	if (iommu_prot & IOMMU_WRITE)
+		prot |= DMA_PTE_WRITE;
+	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
+		prot |= DMA_PTE_SNP;
+
+	size     = PAGE_SIZE << gfp_order;
+	max_addr = iova + size;
+	if (dmar_domain->max_addr < max_addr) {
+		u64 end;
+
+		/* check if minimum agaw is sufficient for mapped address */
+		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
+		if (end < max_addr) {
+			printk(KERN_ERR "%s: iommu width (%d) is not "
+			       "sufficient for the mapped address (%llx)\n",
+			       __func__, dmar_domain->gaw, max_addr);
+			return -EFAULT;
+		}
+		dmar_domain->max_addr = max_addr;
+	}
+	/* Round up size to next multiple of PAGE_SIZE, if it and
+	   the low bits of hpa would take us onto the next page */
+	size = aligned_nrpages(hpa, size);
+	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
+				 hpa >> VTD_PAGE_SHIFT, size, prot);
+	return ret;
+}
+
+static int intel_iommu_unmap(struct iommu_domain *domain,
+			     unsigned long iova, int gfp_order)
+{
+	struct dmar_domain *dmar_domain = domain->priv;
+	size_t size = PAGE_SIZE << gfp_order;
+
+	dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
+			    (iova + size - 1) >> VTD_PAGE_SHIFT);
+
+	if (dmar_domain->max_addr == iova + size)
+		dmar_domain->max_addr = iova;
+
+	return gfp_order;
+}
+
+static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
+					    unsigned long iova)
+{
+	struct dmar_domain *dmar_domain = domain->priv;
+	struct dma_pte *pte;
+	u64 phys = 0;
+
+	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
+	if (pte)
+		phys = dma_pte_addr(pte);
+
+	return phys;
+}
+
+static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
+				      unsigned long cap)
+{
+	struct dmar_domain *dmar_domain = domain->priv;
+
+	if (cap == IOMMU_CAP_CACHE_COHERENCY)
+		return dmar_domain->iommu_snooping;
+	if (cap == IOMMU_CAP_INTR_REMAP)
+		return intr_remapping_enabled;
+
+	return 0;
+}
+
+static struct iommu_ops intel_iommu_ops = {
+	.domain_init	= intel_iommu_domain_init,
+	.domain_destroy = intel_iommu_domain_destroy,
+	.attach_dev	= intel_iommu_attach_device,
+	.detach_dev	= intel_iommu_detach_device,
+	.map		= intel_iommu_map,
+	.unmap		= intel_iommu_unmap,
+	.iova_to_phys	= intel_iommu_iova_to_phys,
+	.domain_has_cap = intel_iommu_domain_has_cap,
+};
+
+static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
+{
+	/*
+	 * Mobile 4 Series Chipset neglects to set RWBF capability,
+	 * but needs it:
+	 */
+	printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
+	rwbf_quirk = 1;
+
+	/* https://bugzilla.redhat.com/show_bug.cgi?id=538163 */
+	if (dev->revision == 0x07) {
+		printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
+		dmar_map_gfx = 0;
+	}
+}
+
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
+
+#define GGC 0x52
+#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
+#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
+#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
+#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
+#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
+#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
+#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
+#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
+
+static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
+{
+	unsigned short ggc;
+
+	if (pci_read_config_word(dev, GGC, &ggc))
+		return;
+
+	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
+		printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
+		dmar_map_gfx = 0;
+	}
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
+
+/* On Tylersburg chipsets, some BIOSes have been known to enable the
+   ISOCH DMAR unit for the Azalia sound device, but not give it any
+   TLB entries, which causes it to deadlock. Check for that.  We do
+   this in a function called from init_dmars(), instead of in a PCI
+   quirk, because we don't want to print the obnoxious "BIOS broken"
+   message if VT-d is actually disabled.
+*/
+static void __init check_tylersburg_isoch(void)
+{
+	struct pci_dev *pdev;
+	uint32_t vtisochctrl;
+
+	/* If there's no Azalia in the system anyway, forget it. */
+	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
+	if (!pdev)
+		return;
+	pci_dev_put(pdev);
+
+	/* System Management Registers. Might be hidden, in which case
+	   we can't do the sanity check. But that's OK, because the
+	   known-broken BIOSes _don't_ actually hide it, so far. */
+	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
+	if (!pdev)
+		return;
+
+	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
+		pci_dev_put(pdev);
+		return;
+	}
+
+	pci_dev_put(pdev);
+
+	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
+	if (vtisochctrl & 1)
+		return;
+
+	/* Drop all bits other than the number of TLB entries */
+	vtisochctrl &= 0x1c;
+
+	/* If we have the recommended number of TLB entries (16), fine. */
+	if (vtisochctrl == 0x10)
+		return;
+
+	/* Zero TLB entries? You get to ride the short bus to school. */
+	if (!vtisochctrl) {
+		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
+		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
+		     dmi_get_system_info(DMI_BIOS_VENDOR),
+		     dmi_get_system_info(DMI_BIOS_VERSION),
+		     dmi_get_system_info(DMI_PRODUCT_VERSION));
+		iommu_identity_mapping |= IDENTMAP_AZALIA;
+		return;
+	}
+	
+	printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
+	       vtisochctrl);
+}
diff --git a/drivers/iommu/intr_remapping.c b/drivers/iommu/intr_remapping.c
new file mode 100644
index 000000000000..1a89d4a2cadf
--- /dev/null
+++ b/drivers/iommu/intr_remapping.c
@@ -0,0 +1,797 @@
+#include <linux/interrupt.h>
+#include <linux/dmar.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/jiffies.h>
+#include <linux/hpet.h>
+#include <linux/pci.h>
+#include <linux/irq.h>
+#include <asm/io_apic.h>
+#include <asm/smp.h>
+#include <asm/cpu.h>
+#include <linux/intel-iommu.h>
+#include "intr_remapping.h"
+#include <acpi/acpi.h>
+#include <asm/pci-direct.h>
+
+static struct ioapic_scope ir_ioapic[MAX_IO_APICS];
+static struct hpet_scope ir_hpet[MAX_HPET_TBS];
+static int ir_ioapic_num, ir_hpet_num;
+int intr_remapping_enabled;
+
+static int disable_intremap;
+static int disable_sourceid_checking;
+
+static __init int setup_nointremap(char *str)
+{
+	disable_intremap = 1;
+	return 0;
+}
+early_param("nointremap", setup_nointremap);
+
+static __init int setup_intremap(char *str)
+{
+	if (!str)
+		return -EINVAL;
+
+	if (!strncmp(str, "on", 2))
+		disable_intremap = 0;
+	else if (!strncmp(str, "off", 3))
+		disable_intremap = 1;
+	else if (!strncmp(str, "nosid", 5))
+		disable_sourceid_checking = 1;
+
+	return 0;
+}
+early_param("intremap", setup_intremap);
+
+static DEFINE_SPINLOCK(irq_2_ir_lock);
+
+static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
+{
+	struct irq_cfg *cfg = irq_get_chip_data(irq);
+	return cfg ? &cfg->irq_2_iommu : NULL;
+}
+
+int get_irte(int irq, struct irte *entry)
+{
+	struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
+	unsigned long flags;
+	int index;
+
+	if (!entry || !irq_iommu)
+		return -1;
+
+	spin_lock_irqsave(&irq_2_ir_lock, flags);
+
+	index = irq_iommu->irte_index + irq_iommu->sub_handle;
+	*entry = *(irq_iommu->iommu->ir_table->base + index);
+
+	spin_unlock_irqrestore(&irq_2_ir_lock, flags);
+	return 0;
+}
+
+int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
+{
+	struct ir_table *table = iommu->ir_table;
+	struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
+	u16 index, start_index;
+	unsigned int mask = 0;
+	unsigned long flags;
+	int i;
+
+	if (!count || !irq_iommu)
+		return -1;
+
+	/*
+	 * start the IRTE search from index 0.
+	 */
+	index = start_index = 0;
+
+	if (count > 1) {
+		count = __roundup_pow_of_two(count);
+		mask = ilog2(count);
+	}
+
+	if (mask > ecap_max_handle_mask(iommu->ecap)) {
+		printk(KERN_ERR
+		       "Requested mask %x exceeds the max invalidation handle"
+		       " mask value %Lx\n", mask,
+		       ecap_max_handle_mask(iommu->ecap));
+		return -1;
+	}
+
+	spin_lock_irqsave(&irq_2_ir_lock, flags);
+	do {
+		for (i = index; i < index + count; i++)
+			if  (table->base[i].present)
+				break;
+		/* empty index found */
+		if (i == index + count)
+			break;
+
+		index = (index + count) % INTR_REMAP_TABLE_ENTRIES;
+
+		if (index == start_index) {
+			spin_unlock_irqrestore(&irq_2_ir_lock, flags);
+			printk(KERN_ERR "can't allocate an IRTE\n");
+			return -1;
+		}
+	} while (1);
+
+	for (i = index; i < index + count; i++)
+		table->base[i].present = 1;
+
+	irq_iommu->iommu = iommu;
+	irq_iommu->irte_index =  index;
+	irq_iommu->sub_handle = 0;
+	irq_iommu->irte_mask = mask;
+
+	spin_unlock_irqrestore(&irq_2_ir_lock, flags);
+
+	return index;
+}
+
+static int qi_flush_iec(struct intel_iommu *iommu, int index, int mask)
+{
+	struct qi_desc desc;
+
+	desc.low = QI_IEC_IIDEX(index) | QI_IEC_TYPE | QI_IEC_IM(mask)
+		   | QI_IEC_SELECTIVE;
+	desc.high = 0;
+
+	return qi_submit_sync(&desc, iommu);
+}
+
+int map_irq_to_irte_handle(int irq, u16 *sub_handle)
+{
+	struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
+	unsigned long flags;
+	int index;
+
+	if (!irq_iommu)
+		return -1;
+
+	spin_lock_irqsave(&irq_2_ir_lock, flags);
+	*sub_handle = irq_iommu->sub_handle;
+	index = irq_iommu->irte_index;
+	spin_unlock_irqrestore(&irq_2_ir_lock, flags);
+	return index;
+}
+
+int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle)
+{
+	struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
+	unsigned long flags;
+
+	if (!irq_iommu)
+		return -1;
+
+	spin_lock_irqsave(&irq_2_ir_lock, flags);
+
+	irq_iommu->iommu = iommu;
+	irq_iommu->irte_index = index;
+	irq_iommu->sub_handle = subhandle;
+	irq_iommu->irte_mask = 0;
+
+	spin_unlock_irqrestore(&irq_2_ir_lock, flags);
+
+	return 0;
+}
+
+int modify_irte(int irq, struct irte *irte_modified)
+{
+	struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
+	struct intel_iommu *iommu;
+	unsigned long flags;
+	struct irte *irte;
+	int rc, index;
+
+	if (!irq_iommu)
+		return -1;
+
+	spin_lock_irqsave(&irq_2_ir_lock, flags);
+
+	iommu = irq_iommu->iommu;
+
+	index = irq_iommu->irte_index + irq_iommu->sub_handle;
+	irte = &iommu->ir_table->base[index];
+
+	set_64bit(&irte->low, irte_modified->low);
+	set_64bit(&irte->high, irte_modified->high);
+	__iommu_flush_cache(iommu, irte, sizeof(*irte));
+
+	rc = qi_flush_iec(iommu, index, 0);
+	spin_unlock_irqrestore(&irq_2_ir_lock, flags);
+
+	return rc;
+}
+
+struct intel_iommu *map_hpet_to_ir(u8 hpet_id)
+{
+	int i;
+
+	for (i = 0; i < MAX_HPET_TBS; i++)
+		if (ir_hpet[i].id == hpet_id)
+			return ir_hpet[i].iommu;
+	return NULL;
+}
+
+struct intel_iommu *map_ioapic_to_ir(int apic)
+{
+	int i;
+
+	for (i = 0; i < MAX_IO_APICS; i++)
+		if (ir_ioapic[i].id == apic)
+			return ir_ioapic[i].iommu;
+	return NULL;
+}
+
+struct intel_iommu *map_dev_to_ir(struct pci_dev *dev)
+{
+	struct dmar_drhd_unit *drhd;
+
+	drhd = dmar_find_matched_drhd_unit(dev);
+	if (!drhd)
+		return NULL;
+
+	return drhd->iommu;
+}
+
+static int clear_entries(struct irq_2_iommu *irq_iommu)
+{
+	struct irte *start, *entry, *end;
+	struct intel_iommu *iommu;
+	int index;
+
+	if (irq_iommu->sub_handle)
+		return 0;
+
+	iommu = irq_iommu->iommu;
+	index = irq_iommu->irte_index + irq_iommu->sub_handle;
+
+	start = iommu->ir_table->base + index;
+	end = start + (1 << irq_iommu->irte_mask);
+
+	for (entry = start; entry < end; entry++) {
+		set_64bit(&entry->low, 0);
+		set_64bit(&entry->high, 0);
+	}
+
+	return qi_flush_iec(iommu, index, irq_iommu->irte_mask);
+}
+
+int free_irte(int irq)
+{
+	struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
+	unsigned long flags;
+	int rc;
+
+	if (!irq_iommu)
+		return -1;
+
+	spin_lock_irqsave(&irq_2_ir_lock, flags);
+
+	rc = clear_entries(irq_iommu);
+
+	irq_iommu->iommu = NULL;
+	irq_iommu->irte_index = 0;
+	irq_iommu->sub_handle = 0;
+	irq_iommu->irte_mask = 0;
+
+	spin_unlock_irqrestore(&irq_2_ir_lock, flags);
+
+	return rc;
+}
+
+/*
+ * source validation type
+ */
+#define SVT_NO_VERIFY		0x0  /* no verification is required */
+#define SVT_VERIFY_SID_SQ	0x1  /* verify using SID and SQ fields */
+#define SVT_VERIFY_BUS		0x2  /* verify bus of request-id */
+
+/*
+ * source-id qualifier
+ */
+#define SQ_ALL_16	0x0  /* verify all 16 bits of request-id */
+#define SQ_13_IGNORE_1	0x1  /* verify most significant 13 bits, ignore
+			      * the third least significant bit
+			      */
+#define SQ_13_IGNORE_2	0x2  /* verify most significant 13 bits, ignore
+			      * the second and third least significant bits
+			      */
+#define SQ_13_IGNORE_3	0x3  /* verify most significant 13 bits, ignore
+			      * the least three significant bits
+			      */
+
+/*
+ * set SVT, SQ and SID fields of irte to verify
+ * source ids of interrupt requests
+ */
+static void set_irte_sid(struct irte *irte, unsigned int svt,
+			 unsigned int sq, unsigned int sid)
+{
+	if (disable_sourceid_checking)
+		svt = SVT_NO_VERIFY;
+	irte->svt = svt;
+	irte->sq = sq;
+	irte->sid = sid;
+}
+
+int set_ioapic_sid(struct irte *irte, int apic)
+{
+	int i;
+	u16 sid = 0;
+
+	if (!irte)
+		return -1;
+
+	for (i = 0; i < MAX_IO_APICS; i++) {
+		if (ir_ioapic[i].id == apic) {
+			sid = (ir_ioapic[i].bus << 8) | ir_ioapic[i].devfn;
+			break;
+		}
+	}
+
+	if (sid == 0) {
+		pr_warning("Failed to set source-id of IOAPIC (%d)\n", apic);
+		return -1;
+	}
+
+	set_irte_sid(irte, 1, 0, sid);
+
+	return 0;
+}
+
+int set_hpet_sid(struct irte *irte, u8 id)
+{
+	int i;
+	u16 sid = 0;
+
+	if (!irte)
+		return -1;
+
+	for (i = 0; i < MAX_HPET_TBS; i++) {
+		if (ir_hpet[i].id == id) {
+			sid = (ir_hpet[i].bus << 8) | ir_hpet[i].devfn;
+			break;
+		}
+	}
+
+	if (sid == 0) {
+		pr_warning("Failed to set source-id of HPET block (%d)\n", id);
+		return -1;
+	}
+
+	/*
+	 * Should really use SQ_ALL_16. Some platforms are broken.
+	 * While we figure out the right quirks for these broken platforms, use
+	 * SQ_13_IGNORE_3 for now.
+	 */
+	set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_13_IGNORE_3, sid);
+
+	return 0;
+}
+
+int set_msi_sid(struct irte *irte, struct pci_dev *dev)
+{
+	struct pci_dev *bridge;
+
+	if (!irte || !dev)
+		return -1;
+
+	/* PCIe device or Root Complex integrated PCI device */
+	if (pci_is_pcie(dev) || !dev->bus->parent) {
+		set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_ALL_16,
+			     (dev->bus->number << 8) | dev->devfn);
+		return 0;
+	}
+
+	bridge = pci_find_upstream_pcie_bridge(dev);
+	if (bridge) {
+		if (pci_is_pcie(bridge))/* this is a PCIe-to-PCI/PCIX bridge */
+			set_irte_sid(irte, SVT_VERIFY_BUS, SQ_ALL_16,
+				(bridge->bus->number << 8) | dev->bus->number);
+		else /* this is a legacy PCI bridge */
+			set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_ALL_16,
+				(bridge->bus->number << 8) | bridge->devfn);
+	}
+
+	return 0;
+}
+
+static void iommu_set_intr_remapping(struct intel_iommu *iommu, int mode)
+{
+	u64 addr;
+	u32 sts;
+	unsigned long flags;
+
+	addr = virt_to_phys((void *)iommu->ir_table->base);
+
+	spin_lock_irqsave(&iommu->register_lock, flags);
+
+	dmar_writeq(iommu->reg + DMAR_IRTA_REG,
+		    (addr) | IR_X2APIC_MODE(mode) | INTR_REMAP_TABLE_REG_SIZE);
+
+	/* Set interrupt-remapping table pointer */
+	iommu->gcmd |= DMA_GCMD_SIRTP;
+	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
+
+	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+		      readl, (sts & DMA_GSTS_IRTPS), sts);
+	spin_unlock_irqrestore(&iommu->register_lock, flags);
+
+	/*
+	 * global invalidation of interrupt entry cache before enabling
+	 * interrupt-remapping.
+	 */
+	qi_global_iec(iommu);
+
+	spin_lock_irqsave(&iommu->register_lock, flags);
+
+	/* Enable interrupt-remapping */
+	iommu->gcmd |= DMA_GCMD_IRE;
+	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
+
+	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+		      readl, (sts & DMA_GSTS_IRES), sts);
+
+	spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+
+static int setup_intr_remapping(struct intel_iommu *iommu, int mode)
+{
+	struct ir_table *ir_table;
+	struct page *pages;
+
+	ir_table = iommu->ir_table = kzalloc(sizeof(struct ir_table),
+					     GFP_ATOMIC);
+
+	if (!iommu->ir_table)
+		return -ENOMEM;
+
+	pages = alloc_pages_node(iommu->node, GFP_ATOMIC | __GFP_ZERO,
+				 INTR_REMAP_PAGE_ORDER);
+
+	if (!pages) {
+		printk(KERN_ERR "failed to allocate pages of order %d\n",
+		       INTR_REMAP_PAGE_ORDER);
+		kfree(iommu->ir_table);
+		return -ENOMEM;
+	}
+
+	ir_table->base = page_address(pages);
+
+	iommu_set_intr_remapping(iommu, mode);
+	return 0;
+}
+
+/*
+ * Disable Interrupt Remapping.
+ */
+static void iommu_disable_intr_remapping(struct intel_iommu *iommu)
+{
+	unsigned long flags;
+	u32 sts;
+
+	if (!ecap_ir_support(iommu->ecap))
+		return;
+
+	/*
+	 * global invalidation of interrupt entry cache before disabling
+	 * interrupt-remapping.
+	 */
+	qi_global_iec(iommu);
+
+	spin_lock_irqsave(&iommu->register_lock, flags);
+
+	sts = dmar_readq(iommu->reg + DMAR_GSTS_REG);
+	if (!(sts & DMA_GSTS_IRES))
+		goto end;
+
+	iommu->gcmd &= ~DMA_GCMD_IRE;
+	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
+
+	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+		      readl, !(sts & DMA_GSTS_IRES), sts);
+
+end:
+	spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+int __init intr_remapping_supported(void)
+{
+	struct dmar_drhd_unit *drhd;
+
+	if (disable_intremap)
+		return 0;
+
+	if (!dmar_ir_support())
+		return 0;
+
+	for_each_drhd_unit(drhd) {
+		struct intel_iommu *iommu = drhd->iommu;
+
+		if (!ecap_ir_support(iommu->ecap))
+			return 0;
+	}
+
+	return 1;
+}
+
+int __init enable_intr_remapping(int eim)
+{
+	struct dmar_drhd_unit *drhd;
+	int setup = 0;
+
+	if (parse_ioapics_under_ir() != 1) {
+		printk(KERN_INFO "Not enable interrupt remapping\n");
+		return -1;
+	}
+
+	for_each_drhd_unit(drhd) {
+		struct intel_iommu *iommu = drhd->iommu;
+
+		/*
+		 * If the queued invalidation is already initialized,
+		 * shouldn't disable it.
+		 */
+		if (iommu->qi)
+			continue;
+
+		/*
+		 * Clear previous faults.
+		 */
+		dmar_fault(-1, iommu);
+
+		/*
+		 * Disable intr remapping and queued invalidation, if already
+		 * enabled prior to OS handover.
+		 */
+		iommu_disable_intr_remapping(iommu);
+
+		dmar_disable_qi(iommu);
+	}
+
+	/*
+	 * check for the Interrupt-remapping support
+	 */
+	for_each_drhd_unit(drhd) {
+		struct intel_iommu *iommu = drhd->iommu;
+
+		if (!ecap_ir_support(iommu->ecap))
+			continue;
+
+		if (eim && !ecap_eim_support(iommu->ecap)) {
+			printk(KERN_INFO "DRHD %Lx: EIM not supported by DRHD, "
+			       " ecap %Lx\n", drhd->reg_base_addr, iommu->ecap);
+			return -1;
+		}
+	}
+
+	/*
+	 * Enable queued invalidation for all the DRHD's.
+	 */
+	for_each_drhd_unit(drhd) {
+		int ret;
+		struct intel_iommu *iommu = drhd->iommu;
+		ret = dmar_enable_qi(iommu);
+
+		if (ret) {
+			printk(KERN_ERR "DRHD %Lx: failed to enable queued, "
+			       " invalidation, ecap %Lx, ret %d\n",
+			       drhd->reg_base_addr, iommu->ecap, ret);
+			return -1;
+		}
+	}
+
+	/*
+	 * Setup Interrupt-remapping for all the DRHD's now.
+	 */
+	for_each_drhd_unit(drhd) {
+		struct intel_iommu *iommu = drhd->iommu;
+
+		if (!ecap_ir_support(iommu->ecap))
+			continue;
+
+		if (setup_intr_remapping(iommu, eim))
+			goto error;
+
+		setup = 1;
+	}
+
+	if (!setup)
+		goto error;
+
+	intr_remapping_enabled = 1;
+
+	return 0;
+
+error:
+	/*
+	 * handle error condition gracefully here!
+	 */
+	return -1;
+}
+
+static void ir_parse_one_hpet_scope(struct acpi_dmar_device_scope *scope,
+				      struct intel_iommu *iommu)
+{
+	struct acpi_dmar_pci_path *path;
+	u8 bus;
+	int count;
+
+	bus = scope->bus;
+	path = (struct acpi_dmar_pci_path *)(scope + 1);
+	count = (scope->length - sizeof(struct acpi_dmar_device_scope))
+		/ sizeof(struct acpi_dmar_pci_path);
+
+	while (--count > 0) {
+		/*
+		 * Access PCI directly due to the PCI
+		 * subsystem isn't initialized yet.
+		 */
+		bus = read_pci_config_byte(bus, path->dev, path->fn,
+					   PCI_SECONDARY_BUS);
+		path++;
+	}
+	ir_hpet[ir_hpet_num].bus   = bus;
+	ir_hpet[ir_hpet_num].devfn = PCI_DEVFN(path->dev, path->fn);
+	ir_hpet[ir_hpet_num].iommu = iommu;
+	ir_hpet[ir_hpet_num].id    = scope->enumeration_id;
+	ir_hpet_num++;
+}
+
+static void ir_parse_one_ioapic_scope(struct acpi_dmar_device_scope *scope,
+				      struct intel_iommu *iommu)
+{
+	struct acpi_dmar_pci_path *path;
+	u8 bus;
+	int count;
+
+	bus = scope->bus;
+	path = (struct acpi_dmar_pci_path *)(scope + 1);
+	count = (scope->length - sizeof(struct acpi_dmar_device_scope))
+		/ sizeof(struct acpi_dmar_pci_path);
+
+	while (--count > 0) {
+		/*
+		 * Access PCI directly due to the PCI
+		 * subsystem isn't initialized yet.
+		 */
+		bus = read_pci_config_byte(bus, path->dev, path->fn,
+					   PCI_SECONDARY_BUS);
+		path++;
+	}
+
+	ir_ioapic[ir_ioapic_num].bus   = bus;
+	ir_ioapic[ir_ioapic_num].devfn = PCI_DEVFN(path->dev, path->fn);
+	ir_ioapic[ir_ioapic_num].iommu = iommu;
+	ir_ioapic[ir_ioapic_num].id    = scope->enumeration_id;
+	ir_ioapic_num++;
+}
+
+static int ir_parse_ioapic_hpet_scope(struct acpi_dmar_header *header,
+				      struct intel_iommu *iommu)
+{
+	struct acpi_dmar_hardware_unit *drhd;
+	struct acpi_dmar_device_scope *scope;
+	void *start, *end;
+
+	drhd = (struct acpi_dmar_hardware_unit *)header;
+
+	start = (void *)(drhd + 1);
+	end = ((void *)drhd) + header->length;
+
+	while (start < end) {
+		scope = start;
+		if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_IOAPIC) {
+			if (ir_ioapic_num == MAX_IO_APICS) {
+				printk(KERN_WARNING "Exceeded Max IO APICS\n");
+				return -1;
+			}
+
+			printk(KERN_INFO "IOAPIC id %d under DRHD base "
+			       " 0x%Lx IOMMU %d\n", scope->enumeration_id,
+			       drhd->address, iommu->seq_id);
+
+			ir_parse_one_ioapic_scope(scope, iommu);
+		} else if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_HPET) {
+			if (ir_hpet_num == MAX_HPET_TBS) {
+				printk(KERN_WARNING "Exceeded Max HPET blocks\n");
+				return -1;
+			}
+
+			printk(KERN_INFO "HPET id %d under DRHD base"
+			       " 0x%Lx\n", scope->enumeration_id,
+			       drhd->address);
+
+			ir_parse_one_hpet_scope(scope, iommu);
+		}
+		start += scope->length;
+	}
+
+	return 0;
+}
+
+/*
+ * Finds the assocaition between IOAPIC's and its Interrupt-remapping
+ * hardware unit.
+ */
+int __init parse_ioapics_under_ir(void)
+{
+	struct dmar_drhd_unit *drhd;
+	int ir_supported = 0;
+
+	for_each_drhd_unit(drhd) {
+		struct intel_iommu *iommu = drhd->iommu;
+
+		if (ecap_ir_support(iommu->ecap)) {
+			if (ir_parse_ioapic_hpet_scope(drhd->hdr, iommu))
+				return -1;
+
+			ir_supported = 1;
+		}
+	}
+
+	if (ir_supported && ir_ioapic_num != nr_ioapics) {
+		printk(KERN_WARNING
+		       "Not all IO-APIC's listed under remapping hardware\n");
+		return -1;
+	}
+
+	return ir_supported;
+}
+
+void disable_intr_remapping(void)
+{
+	struct dmar_drhd_unit *drhd;
+	struct intel_iommu *iommu = NULL;
+
+	/*
+	 * Disable Interrupt-remapping for all the DRHD's now.
+	 */
+	for_each_iommu(iommu, drhd) {
+		if (!ecap_ir_support(iommu->ecap))
+			continue;
+
+		iommu_disable_intr_remapping(iommu);
+	}
+}
+
+int reenable_intr_remapping(int eim)
+{
+	struct dmar_drhd_unit *drhd;
+	int setup = 0;
+	struct intel_iommu *iommu = NULL;
+
+	for_each_iommu(iommu, drhd)
+		if (iommu->qi)
+			dmar_reenable_qi(iommu);
+
+	/*
+	 * Setup Interrupt-remapping for all the DRHD's now.
+	 */
+	for_each_iommu(iommu, drhd) {
+		if (!ecap_ir_support(iommu->ecap))
+			continue;
+
+		/* Set up interrupt remapping for iommu.*/
+		iommu_set_intr_remapping(iommu, eim);
+		setup = 1;
+	}
+
+	if (!setup)
+		goto error;
+
+	return 0;
+
+error:
+	/*
+	 * handle error condition gracefully here!
+	 */
+	return -1;
+}
+
diff --git a/drivers/iommu/intr_remapping.h b/drivers/iommu/intr_remapping.h
new file mode 100644
index 000000000000..5662fecfee60
--- /dev/null
+++ b/drivers/iommu/intr_remapping.h
@@ -0,0 +1,17 @@
+#include <linux/intel-iommu.h>
+
+struct ioapic_scope {
+	struct intel_iommu *iommu;
+	unsigned int id;
+	unsigned int bus;	/* PCI bus number */
+	unsigned int devfn;	/* PCI devfn number */
+};
+
+struct hpet_scope {
+	struct intel_iommu *iommu;
+	u8 id;
+	unsigned int bus;
+	unsigned int devfn;
+};
+
+#define IR_X2APIC_MODE(mode) (mode ? (1 << 11) : 0)
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
new file mode 100644
index 000000000000..c5c274ab5c5a
--- /dev/null
+++ b/drivers/iommu/iova.c
@@ -0,0 +1,435 @@
+/*
+ * Copyright © 2006-2009, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
+ */
+
+#include <linux/iova.h>
+
+void
+init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit)
+{
+	spin_lock_init(&iovad->iova_rbtree_lock);
+	iovad->rbroot = RB_ROOT;
+	iovad->cached32_node = NULL;
+	iovad->dma_32bit_pfn = pfn_32bit;
+}
+
+static struct rb_node *
+__get_cached_rbnode(struct iova_domain *iovad, unsigned long *limit_pfn)
+{
+	if ((*limit_pfn != iovad->dma_32bit_pfn) ||
+		(iovad->cached32_node == NULL))
+		return rb_last(&iovad->rbroot);
+	else {
+		struct rb_node *prev_node = rb_prev(iovad->cached32_node);
+		struct iova *curr_iova =
+			container_of(iovad->cached32_node, struct iova, node);
+		*limit_pfn = curr_iova->pfn_lo - 1;
+		return prev_node;
+	}
+}
+
+static void
+__cached_rbnode_insert_update(struct iova_domain *iovad,
+	unsigned long limit_pfn, struct iova *new)
+{
+	if (limit_pfn != iovad->dma_32bit_pfn)
+		return;
+	iovad->cached32_node = &new->node;
+}
+
+static void
+__cached_rbnode_delete_update(struct iova_domain *iovad, struct iova *free)
+{
+	struct iova *cached_iova;
+	struct rb_node *curr;
+
+	if (!iovad->cached32_node)
+		return;
+	curr = iovad->cached32_node;
+	cached_iova = container_of(curr, struct iova, node);
+
+	if (free->pfn_lo >= cached_iova->pfn_lo) {
+		struct rb_node *node = rb_next(&free->node);
+		struct iova *iova = container_of(node, struct iova, node);
+
+		/* only cache if it's below 32bit pfn */
+		if (node && iova->pfn_lo < iovad->dma_32bit_pfn)
+			iovad->cached32_node = node;
+		else
+			iovad->cached32_node = NULL;
+	}
+}
+
+/* Computes the padding size required, to make the
+ * the start address naturally aligned on its size
+ */
+static int
+iova_get_pad_size(int size, unsigned int limit_pfn)
+{
+	unsigned int pad_size = 0;
+	unsigned int order = ilog2(size);
+
+	if (order)
+		pad_size = (limit_pfn + 1) % (1 << order);
+
+	return pad_size;
+}
+
+static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
+		unsigned long size, unsigned long limit_pfn,
+			struct iova *new, bool size_aligned)
+{
+	struct rb_node *prev, *curr = NULL;
+	unsigned long flags;
+	unsigned long saved_pfn;
+	unsigned int pad_size = 0;
+
+	/* Walk the tree backwards */
+	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+	saved_pfn = limit_pfn;
+	curr = __get_cached_rbnode(iovad, &limit_pfn);
+	prev = curr;
+	while (curr) {
+		struct iova *curr_iova = container_of(curr, struct iova, node);
+
+		if (limit_pfn < curr_iova->pfn_lo)
+			goto move_left;
+		else if (limit_pfn < curr_iova->pfn_hi)
+			goto adjust_limit_pfn;
+		else {
+			if (size_aligned)
+				pad_size = iova_get_pad_size(size, limit_pfn);
+			if ((curr_iova->pfn_hi + size + pad_size) <= limit_pfn)
+				break;	/* found a free slot */
+		}
+adjust_limit_pfn:
+		limit_pfn = curr_iova->pfn_lo - 1;
+move_left:
+		prev = curr;
+		curr = rb_prev(curr);
+	}
+
+	if (!curr) {
+		if (size_aligned)
+			pad_size = iova_get_pad_size(size, limit_pfn);
+		if ((IOVA_START_PFN + size + pad_size) > limit_pfn) {
+			spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+			return -ENOMEM;
+		}
+	}
+
+	/* pfn_lo will point to size aligned address if size_aligned is set */
+	new->pfn_lo = limit_pfn - (size + pad_size) + 1;
+	new->pfn_hi = new->pfn_lo + size - 1;
+
+	/* Insert the new_iova into domain rbtree by holding writer lock */
+	/* Add new node and rebalance tree. */
+	{
+		struct rb_node **entry, *parent = NULL;
+
+		/* If we have 'prev', it's a valid place to start the
+		   insertion. Otherwise, start from the root. */
+		if (prev)
+			entry = &prev;
+		else
+			entry = &iovad->rbroot.rb_node;
+
+		/* Figure out where to put new node */
+		while (*entry) {
+			struct iova *this = container_of(*entry,
+							struct iova, node);
+			parent = *entry;
+
+			if (new->pfn_lo < this->pfn_lo)
+				entry = &((*entry)->rb_left);
+			else if (new->pfn_lo > this->pfn_lo)
+				entry = &((*entry)->rb_right);
+			else
+				BUG(); /* this should not happen */
+		}
+
+		/* Add new node and rebalance tree. */
+		rb_link_node(&new->node, parent, entry);
+		rb_insert_color(&new->node, &iovad->rbroot);
+	}
+	__cached_rbnode_insert_update(iovad, saved_pfn, new);
+
+	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+
+
+	return 0;
+}
+
+static void
+iova_insert_rbtree(struct rb_root *root, struct iova *iova)
+{
+	struct rb_node **new = &(root->rb_node), *parent = NULL;
+	/* Figure out where to put new node */
+	while (*new) {
+		struct iova *this = container_of(*new, struct iova, node);
+		parent = *new;
+
+		if (iova->pfn_lo < this->pfn_lo)
+			new = &((*new)->rb_left);
+		else if (iova->pfn_lo > this->pfn_lo)
+			new = &((*new)->rb_right);
+		else
+			BUG(); /* this should not happen */
+	}
+	/* Add new node and rebalance tree. */
+	rb_link_node(&iova->node, parent, new);
+	rb_insert_color(&iova->node, root);
+}
+
+/**
+ * alloc_iova - allocates an iova
+ * @iovad - iova domain in question
+ * @size - size of page frames to allocate
+ * @limit_pfn - max limit address
+ * @size_aligned - set if size_aligned address range is required
+ * This function allocates an iova in the range limit_pfn to IOVA_START_PFN
+ * looking from limit_pfn instead from IOVA_START_PFN. If the size_aligned
+ * flag is set then the allocated address iova->pfn_lo will be naturally
+ * aligned on roundup_power_of_two(size).
+ */
+struct iova *
+alloc_iova(struct iova_domain *iovad, unsigned long size,
+	unsigned long limit_pfn,
+	bool size_aligned)
+{
+	struct iova *new_iova;
+	int ret;
+
+	new_iova = alloc_iova_mem();
+	if (!new_iova)
+		return NULL;
+
+	/* If size aligned is set then round the size to
+	 * to next power of two.
+	 */
+	if (size_aligned)
+		size = __roundup_pow_of_two(size);
+
+	ret = __alloc_and_insert_iova_range(iovad, size, limit_pfn,
+			new_iova, size_aligned);
+
+	if (ret) {
+		free_iova_mem(new_iova);
+		return NULL;
+	}
+
+	return new_iova;
+}
+
+/**
+ * find_iova - find's an iova for a given pfn
+ * @iovad - iova domain in question.
+ * pfn - page frame number
+ * This function finds and returns an iova belonging to the
+ * given doamin which matches the given pfn.
+ */
+struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn)
+{
+	unsigned long flags;
+	struct rb_node *node;
+
+	/* Take the lock so that no other thread is manipulating the rbtree */
+	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+	node = iovad->rbroot.rb_node;
+	while (node) {
+		struct iova *iova = container_of(node, struct iova, node);
+
+		/* If pfn falls within iova's range, return iova */
+		if ((pfn >= iova->pfn_lo) && (pfn <= iova->pfn_hi)) {
+			spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+			/* We are not holding the lock while this iova
+			 * is referenced by the caller as the same thread
+			 * which called this function also calls __free_iova()
+			 * and it is by desing that only one thread can possibly
+			 * reference a particular iova and hence no conflict.
+			 */
+			return iova;
+		}
+
+		if (pfn < iova->pfn_lo)
+			node = node->rb_left;
+		else if (pfn > iova->pfn_lo)
+			node = node->rb_right;
+	}
+
+	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+	return NULL;
+}
+
+/**
+ * __free_iova - frees the given iova
+ * @iovad: iova domain in question.
+ * @iova: iova in question.
+ * Frees the given iova belonging to the giving domain
+ */
+void
+__free_iova(struct iova_domain *iovad, struct iova *iova)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+	__cached_rbnode_delete_update(iovad, iova);
+	rb_erase(&iova->node, &iovad->rbroot);
+	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+	free_iova_mem(iova);
+}
+
+/**
+ * free_iova - finds and frees the iova for a given pfn
+ * @iovad: - iova domain in question.
+ * @pfn: - pfn that is allocated previously
+ * This functions finds an iova for a given pfn and then
+ * frees the iova from that domain.
+ */
+void
+free_iova(struct iova_domain *iovad, unsigned long pfn)
+{
+	struct iova *iova = find_iova(iovad, pfn);
+	if (iova)
+		__free_iova(iovad, iova);
+
+}
+
+/**
+ * put_iova_domain - destroys the iova doamin
+ * @iovad: - iova domain in question.
+ * All the iova's in that domain are destroyed.
+ */
+void put_iova_domain(struct iova_domain *iovad)
+{
+	struct rb_node *node;
+	unsigned long flags;
+
+	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+	node = rb_first(&iovad->rbroot);
+	while (node) {
+		struct iova *iova = container_of(node, struct iova, node);
+		rb_erase(node, &iovad->rbroot);
+		free_iova_mem(iova);
+		node = rb_first(&iovad->rbroot);
+	}
+	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+}
+
+static int
+__is_range_overlap(struct rb_node *node,
+	unsigned long pfn_lo, unsigned long pfn_hi)
+{
+	struct iova *iova = container_of(node, struct iova, node);
+
+	if ((pfn_lo <= iova->pfn_hi) && (pfn_hi >= iova->pfn_lo))
+		return 1;
+	return 0;
+}
+
+static struct iova *
+__insert_new_range(struct iova_domain *iovad,
+	unsigned long pfn_lo, unsigned long pfn_hi)
+{
+	struct iova *iova;
+
+	iova = alloc_iova_mem();
+	if (!iova)
+		return iova;
+
+	iova->pfn_hi = pfn_hi;
+	iova->pfn_lo = pfn_lo;
+	iova_insert_rbtree(&iovad->rbroot, iova);
+	return iova;
+}
+
+static void
+__adjust_overlap_range(struct iova *iova,
+	unsigned long *pfn_lo, unsigned long *pfn_hi)
+{
+	if (*pfn_lo < iova->pfn_lo)
+		iova->pfn_lo = *pfn_lo;
+	if (*pfn_hi > iova->pfn_hi)
+		*pfn_lo = iova->pfn_hi + 1;
+}
+
+/**
+ * reserve_iova - reserves an iova in the given range
+ * @iovad: - iova domain pointer
+ * @pfn_lo: - lower page frame address
+ * @pfn_hi:- higher pfn adderss
+ * This function allocates reserves the address range from pfn_lo to pfn_hi so
+ * that this address is not dished out as part of alloc_iova.
+ */
+struct iova *
+reserve_iova(struct iova_domain *iovad,
+	unsigned long pfn_lo, unsigned long pfn_hi)
+{
+	struct rb_node *node;
+	unsigned long flags;
+	struct iova *iova;
+	unsigned int overlap = 0;
+
+	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+	for (node = rb_first(&iovad->rbroot); node; node = rb_next(node)) {
+		if (__is_range_overlap(node, pfn_lo, pfn_hi)) {
+			iova = container_of(node, struct iova, node);
+			__adjust_overlap_range(iova, &pfn_lo, &pfn_hi);
+			if ((pfn_lo >= iova->pfn_lo) &&
+				(pfn_hi <= iova->pfn_hi))
+				goto finish;
+			overlap = 1;
+
+		} else if (overlap)
+				break;
+	}
+
+	/* We are here either because this is the first reserver node
+	 * or need to insert remaining non overlap addr range
+	 */
+	iova = __insert_new_range(iovad, pfn_lo, pfn_hi);
+finish:
+
+	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+	return iova;
+}
+
+/**
+ * copy_reserved_iova - copies the reserved between domains
+ * @from: - source doamin from where to copy
+ * @to: - destination domin where to copy
+ * This function copies reserved iova's from one doamin to
+ * other.
+ */
+void
+copy_reserved_iova(struct iova_domain *from, struct iova_domain *to)
+{
+	unsigned long flags;
+	struct rb_node *node;
+
+	spin_lock_irqsave(&from->iova_rbtree_lock, flags);
+	for (node = rb_first(&from->rbroot); node; node = rb_next(node)) {
+		struct iova *iova = container_of(node, struct iova, node);
+		struct iova *new_iova;
+		new_iova = reserve_iova(to, iova->pfn_lo, iova->pfn_hi);
+		if (!new_iova)
+			printk(KERN_ERR "Reserve iova range %lx@%lx failed\n",
+				iova->pfn_lo, iova->pfn_lo);
+	}
+	spin_unlock_irqrestore(&from->iova_rbtree_lock, flags);
+}
diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
index 094308e41be5..825c02b40daa 100644
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -29,11 +29,6 @@ obj-$(CONFIG_PCI_MSI) += msi.o
 # Build the Hypertransport interrupt support
 obj-$(CONFIG_HT_IRQ) += htirq.o
 
-# Build Intel IOMMU support
-obj-$(CONFIG_DMAR) += dmar.o iova.o intel-iommu.o
-
-obj-$(CONFIG_INTR_REMAP) += dmar.o intr_remapping.o
-
 obj-$(CONFIG_PCI_IOV) += iov.o
 
 #
diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
deleted file mode 100644
index 3dc9befa5aec..000000000000
--- a/drivers/pci/dmar.c
+++ /dev/null
@@ -1,1461 +0,0 @@
-/*
- * Copyright (c) 2006, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
- *
- * Copyright (C) 2006-2008 Intel Corporation
- * Author: Ashok Raj <ashok.raj@intel.com>
- * Author: Shaohua Li <shaohua.li@intel.com>
- * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
- *
- * This file implements early detection/parsing of Remapping Devices
- * reported to OS through BIOS via DMA remapping reporting (DMAR) ACPI
- * tables.
- *
- * These routines are used by both DMA-remapping and Interrupt-remapping
- */
-
-#include <linux/pci.h>
-#include <linux/dmar.h>
-#include <linux/iova.h>
-#include <linux/intel-iommu.h>
-#include <linux/timer.h>
-#include <linux/irq.h>
-#include <linux/interrupt.h>
-#include <linux/tboot.h>
-#include <linux/dmi.h>
-#include <linux/slab.h>
-#include <asm/iommu_table.h>
-
-#define PREFIX "DMAR: "
-
-/* No locks are needed as DMA remapping hardware unit
- * list is constructed at boot time and hotplug of
- * these units are not supported by the architecture.
- */
-LIST_HEAD(dmar_drhd_units);
-
-static struct acpi_table_header * __initdata dmar_tbl;
-static acpi_size dmar_tbl_size;
-
-static void __init dmar_register_drhd_unit(struct dmar_drhd_unit *drhd)
-{
-	/*
-	 * add INCLUDE_ALL at the tail, so scan the list will find it at
-	 * the very end.
-	 */
-	if (drhd->include_all)
-		list_add_tail(&drhd->list, &dmar_drhd_units);
-	else
-		list_add(&drhd->list, &dmar_drhd_units);
-}
-
-static int __init dmar_parse_one_dev_scope(struct acpi_dmar_device_scope *scope,
-					   struct pci_dev **dev, u16 segment)
-{
-	struct pci_bus *bus;
-	struct pci_dev *pdev = NULL;
-	struct acpi_dmar_pci_path *path;
-	int count;
-
-	bus = pci_find_bus(segment, scope->bus);
-	path = (struct acpi_dmar_pci_path *)(scope + 1);
-	count = (scope->length - sizeof(struct acpi_dmar_device_scope))
-		/ sizeof(struct acpi_dmar_pci_path);
-
-	while (count) {
-		if (pdev)
-			pci_dev_put(pdev);
-		/*
-		 * Some BIOSes list non-exist devices in DMAR table, just
-		 * ignore it
-		 */
-		if (!bus) {
-			printk(KERN_WARNING
-			PREFIX "Device scope bus [%d] not found\n",
-			scope->bus);
-			break;
-		}
-		pdev = pci_get_slot(bus, PCI_DEVFN(path->dev, path->fn));
-		if (!pdev) {
-			printk(KERN_WARNING PREFIX
-			"Device scope device [%04x:%02x:%02x.%02x] not found\n",
-				segment, bus->number, path->dev, path->fn);
-			break;
-		}
-		path ++;
-		count --;
-		bus = pdev->subordinate;
-	}
-	if (!pdev) {
-		printk(KERN_WARNING PREFIX
-		"Device scope device [%04x:%02x:%02x.%02x] not found\n",
-		segment, scope->bus, path->dev, path->fn);
-		*dev = NULL;
-		return 0;
-	}
-	if ((scope->entry_type == ACPI_DMAR_SCOPE_TYPE_ENDPOINT && \
-			pdev->subordinate) || (scope->entry_type == \
-			ACPI_DMAR_SCOPE_TYPE_BRIDGE && !pdev->subordinate)) {
-		pci_dev_put(pdev);
-		printk(KERN_WARNING PREFIX
-			"Device scope type does not match for %s\n",
-			 pci_name(pdev));
-		return -EINVAL;
-	}
-	*dev = pdev;
-	return 0;
-}
-
-static int __init dmar_parse_dev_scope(void *start, void *end, int *cnt,
-				       struct pci_dev ***devices, u16 segment)
-{
-	struct acpi_dmar_device_scope *scope;
-	void * tmp = start;
-	int index;
-	int ret;
-
-	*cnt = 0;
-	while (start < end) {
-		scope = start;
-		if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_ENDPOINT ||
-		    scope->entry_type == ACPI_DMAR_SCOPE_TYPE_BRIDGE)
-			(*cnt)++;
-		else if (scope->entry_type != ACPI_DMAR_SCOPE_TYPE_IOAPIC) {
-			printk(KERN_WARNING PREFIX
-			       "Unsupported device scope\n");
-		}
-		start += scope->length;
-	}
-	if (*cnt == 0)
-		return 0;
-
-	*devices = kcalloc(*cnt, sizeof(struct pci_dev *), GFP_KERNEL);
-	if (!*devices)
-		return -ENOMEM;
-
-	start = tmp;
-	index = 0;
-	while (start < end) {
-		scope = start;
-		if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_ENDPOINT ||
-		    scope->entry_type == ACPI_DMAR_SCOPE_TYPE_BRIDGE) {
-			ret = dmar_parse_one_dev_scope(scope,
-				&(*devices)[index], segment);
-			if (ret) {
-				kfree(*devices);
-				return ret;
-			}
-			index ++;
-		}
-		start += scope->length;
-	}
-
-	return 0;
-}
-
-/**
- * dmar_parse_one_drhd - parses exactly one DMA remapping hardware definition
- * structure which uniquely represent one DMA remapping hardware unit
- * present in the platform
- */
-static int __init
-dmar_parse_one_drhd(struct acpi_dmar_header *header)
-{
-	struct acpi_dmar_hardware_unit *drhd;
-	struct dmar_drhd_unit *dmaru;
-	int ret = 0;
-
-	drhd = (struct acpi_dmar_hardware_unit *)header;
-	dmaru = kzalloc(sizeof(*dmaru), GFP_KERNEL);
-	if (!dmaru)
-		return -ENOMEM;
-
-	dmaru->hdr = header;
-	dmaru->reg_base_addr = drhd->address;
-	dmaru->segment = drhd->segment;
-	dmaru->include_all = drhd->flags & 0x1; /* BIT0: INCLUDE_ALL */
-
-	ret = alloc_iommu(dmaru);
-	if (ret) {
-		kfree(dmaru);
-		return ret;
-	}
-	dmar_register_drhd_unit(dmaru);
-	return 0;
-}
-
-static int __init dmar_parse_dev(struct dmar_drhd_unit *dmaru)
-{
-	struct acpi_dmar_hardware_unit *drhd;
-	int ret = 0;
-
-	drhd = (struct acpi_dmar_hardware_unit *) dmaru->hdr;
-
-	if (dmaru->include_all)
-		return 0;
-
-	ret = dmar_parse_dev_scope((void *)(drhd + 1),
-				((void *)drhd) + drhd->header.length,
-				&dmaru->devices_cnt, &dmaru->devices,
-				drhd->segment);
-	if (ret) {
-		list_del(&dmaru->list);
-		kfree(dmaru);
-	}
-	return ret;
-}
-
-#ifdef CONFIG_DMAR
-LIST_HEAD(dmar_rmrr_units);
-
-static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
-{
-	list_add(&rmrr->list, &dmar_rmrr_units);
-}
-
-
-static int __init
-dmar_parse_one_rmrr(struct acpi_dmar_header *header)
-{
-	struct acpi_dmar_reserved_memory *rmrr;
-	struct dmar_rmrr_unit *rmrru;
-
-	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
-	if (!rmrru)
-		return -ENOMEM;
-
-	rmrru->hdr = header;
-	rmrr = (struct acpi_dmar_reserved_memory *)header;
-	rmrru->base_address = rmrr->base_address;
-	rmrru->end_address = rmrr->end_address;
-
-	dmar_register_rmrr_unit(rmrru);
-	return 0;
-}
-
-static int __init
-rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
-{
-	struct acpi_dmar_reserved_memory *rmrr;
-	int ret;
-
-	rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
-	ret = dmar_parse_dev_scope((void *)(rmrr + 1),
-		((void *)rmrr) + rmrr->header.length,
-		&rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
-
-	if (ret || (rmrru->devices_cnt == 0)) {
-		list_del(&rmrru->list);
-		kfree(rmrru);
-	}
-	return ret;
-}
-
-static LIST_HEAD(dmar_atsr_units);
-
-static int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
-{
-	struct acpi_dmar_atsr *atsr;
-	struct dmar_atsr_unit *atsru;
-
-	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
-	atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
-	if (!atsru)
-		return -ENOMEM;
-
-	atsru->hdr = hdr;
-	atsru->include_all = atsr->flags & 0x1;
-
-	list_add(&atsru->list, &dmar_atsr_units);
-
-	return 0;
-}
-
-static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
-{
-	int rc;
-	struct acpi_dmar_atsr *atsr;
-
-	if (atsru->include_all)
-		return 0;
-
-	atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
-	rc = dmar_parse_dev_scope((void *)(atsr + 1),
-				(void *)atsr + atsr->header.length,
-				&atsru->devices_cnt, &atsru->devices,
-				atsr->segment);
-	if (rc || !atsru->devices_cnt) {
-		list_del(&atsru->list);
-		kfree(atsru);
-	}
-
-	return rc;
-}
-
-int dmar_find_matched_atsr_unit(struct pci_dev *dev)
-{
-	int i;
-	struct pci_bus *bus;
-	struct acpi_dmar_atsr *atsr;
-	struct dmar_atsr_unit *atsru;
-
-	dev = pci_physfn(dev);
-
-	list_for_each_entry(atsru, &dmar_atsr_units, list) {
-		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
-		if (atsr->segment == pci_domain_nr(dev->bus))
-			goto found;
-	}
-
-	return 0;
-
-found:
-	for (bus = dev->bus; bus; bus = bus->parent) {
-		struct pci_dev *bridge = bus->self;
-
-		if (!bridge || !pci_is_pcie(bridge) ||
-		    bridge->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
-			return 0;
-
-		if (bridge->pcie_type == PCI_EXP_TYPE_ROOT_PORT) {
-			for (i = 0; i < atsru->devices_cnt; i++)
-				if (atsru->devices[i] == bridge)
-					return 1;
-			break;
-		}
-	}
-
-	if (atsru->include_all)
-		return 1;
-
-	return 0;
-}
-#endif
-
-#ifdef CONFIG_ACPI_NUMA
-static int __init
-dmar_parse_one_rhsa(struct acpi_dmar_header *header)
-{
-	struct acpi_dmar_rhsa *rhsa;
-	struct dmar_drhd_unit *drhd;
-
-	rhsa = (struct acpi_dmar_rhsa *)header;
-	for_each_drhd_unit(drhd) {
-		if (drhd->reg_base_addr == rhsa->base_address) {
-			int node = acpi_map_pxm_to_node(rhsa->proximity_domain);
-
-			if (!node_online(node))
-				node = -1;
-			drhd->iommu->node = node;
-			return 0;
-		}
-	}
-	WARN_TAINT(
-		1, TAINT_FIRMWARE_WORKAROUND,
-		"Your BIOS is broken; RHSA refers to non-existent DMAR unit at %llx\n"
-		"BIOS vendor: %s; Ver: %s; Product Version: %s\n",
-		drhd->reg_base_addr,
-		dmi_get_system_info(DMI_BIOS_VENDOR),
-		dmi_get_system_info(DMI_BIOS_VERSION),
-		dmi_get_system_info(DMI_PRODUCT_VERSION));
-
-	return 0;
-}
-#endif
-
-static void __init
-dmar_table_print_dmar_entry(struct acpi_dmar_header *header)
-{
-	struct acpi_dmar_hardware_unit *drhd;
-	struct acpi_dmar_reserved_memory *rmrr;
-	struct acpi_dmar_atsr *atsr;
-	struct acpi_dmar_rhsa *rhsa;
-
-	switch (header->type) {
-	case ACPI_DMAR_TYPE_HARDWARE_UNIT:
-		drhd = container_of(header, struct acpi_dmar_hardware_unit,
-				    header);
-		printk (KERN_INFO PREFIX
-			"DRHD base: %#016Lx flags: %#x\n",
-			(unsigned long long)drhd->address, drhd->flags);
-		break;
-	case ACPI_DMAR_TYPE_RESERVED_MEMORY:
-		rmrr = container_of(header, struct acpi_dmar_reserved_memory,
-				    header);
-		printk (KERN_INFO PREFIX
-			"RMRR base: %#016Lx end: %#016Lx\n",
-			(unsigned long long)rmrr->base_address,
-			(unsigned long long)rmrr->end_address);
-		break;
-	case ACPI_DMAR_TYPE_ATSR:
-		atsr = container_of(header, struct acpi_dmar_atsr, header);
-		printk(KERN_INFO PREFIX "ATSR flags: %#x\n", atsr->flags);
-		break;
-	case ACPI_DMAR_HARDWARE_AFFINITY:
-		rhsa = container_of(header, struct acpi_dmar_rhsa, header);
-		printk(KERN_INFO PREFIX "RHSA base: %#016Lx proximity domain: %#x\n",
-		       (unsigned long long)rhsa->base_address,
-		       rhsa->proximity_domain);
-		break;
-	}
-}
-
-/**
- * dmar_table_detect - checks to see if the platform supports DMAR devices
- */
-static int __init dmar_table_detect(void)
-{
-	acpi_status status = AE_OK;
-
-	/* if we could find DMAR table, then there are DMAR devices */
-	status = acpi_get_table_with_size(ACPI_SIG_DMAR, 0,
-				(struct acpi_table_header **)&dmar_tbl,
-				&dmar_tbl_size);
-
-	if (ACPI_SUCCESS(status) && !dmar_tbl) {
-		printk (KERN_WARNING PREFIX "Unable to map DMAR\n");
-		status = AE_NOT_FOUND;
-	}
-
-	return (ACPI_SUCCESS(status) ? 1 : 0);
-}
-
-/**
- * parse_dmar_table - parses the DMA reporting table
- */
-static int __init
-parse_dmar_table(void)
-{
-	struct acpi_table_dmar *dmar;
-	struct acpi_dmar_header *entry_header;
-	int ret = 0;
-
-	/*
-	 * Do it again, earlier dmar_tbl mapping could be mapped with
-	 * fixed map.
-	 */
-	dmar_table_detect();
-
-	/*
-	 * ACPI tables may not be DMA protected by tboot, so use DMAR copy
-	 * SINIT saved in SinitMleData in TXT heap (which is DMA protected)
-	 */
-	dmar_tbl = tboot_get_dmar_table(dmar_tbl);
-
-	dmar = (struct acpi_table_dmar *)dmar_tbl;
-	if (!dmar)
-		return -ENODEV;
-
-	if (dmar->width < PAGE_SHIFT - 1) {
-		printk(KERN_WARNING PREFIX "Invalid DMAR haw\n");
-		return -EINVAL;
-	}
-
-	printk (KERN_INFO PREFIX "Host address width %d\n",
-		dmar->width + 1);
-
-	entry_header = (struct acpi_dmar_header *)(dmar + 1);
-	while (((unsigned long)entry_header) <
-			(((unsigned long)dmar) + dmar_tbl->length)) {
-		/* Avoid looping forever on bad ACPI tables */
-		if (entry_header->length == 0) {
-			printk(KERN_WARNING PREFIX
-				"Invalid 0-length structure\n");
-			ret = -EINVAL;
-			break;
-		}
-
-		dmar_table_print_dmar_entry(entry_header);
-
-		switch (entry_header->type) {
-		case ACPI_DMAR_TYPE_HARDWARE_UNIT:
-			ret = dmar_parse_one_drhd(entry_header);
-			break;
-		case ACPI_DMAR_TYPE_RESERVED_MEMORY:
-#ifdef CONFIG_DMAR
-			ret = dmar_parse_one_rmrr(entry_header);
-#endif
-			break;
-		case ACPI_DMAR_TYPE_ATSR:
-#ifdef CONFIG_DMAR
-			ret = dmar_parse_one_atsr(entry_header);
-#endif
-			break;
-		case ACPI_DMAR_HARDWARE_AFFINITY:
-#ifdef CONFIG_ACPI_NUMA
-			ret = dmar_parse_one_rhsa(entry_header);
-#endif
-			break;
-		default:
-			printk(KERN_WARNING PREFIX
-				"Unknown DMAR structure type %d\n",
-				entry_header->type);
-			ret = 0; /* for forward compatibility */
-			break;
-		}
-		if (ret)
-			break;
-
-		entry_header = ((void *)entry_header + entry_header->length);
-	}
-	return ret;
-}
-
-static int dmar_pci_device_match(struct pci_dev *devices[], int cnt,
-			  struct pci_dev *dev)
-{
-	int index;
-
-	while (dev) {
-		for (index = 0; index < cnt; index++)
-			if (dev == devices[index])
-				return 1;
-
-		/* Check our parent */
-		dev = dev->bus->self;
-	}
-
-	return 0;
-}
-
-struct dmar_drhd_unit *
-dmar_find_matched_drhd_unit(struct pci_dev *dev)
-{
-	struct dmar_drhd_unit *dmaru = NULL;
-	struct acpi_dmar_hardware_unit *drhd;
-
-	dev = pci_physfn(dev);
-
-	list_for_each_entry(dmaru, &dmar_drhd_units, list) {
-		drhd = container_of(dmaru->hdr,
-				    struct acpi_dmar_hardware_unit,
-				    header);
-
-		if (dmaru->include_all &&
-		    drhd->segment == pci_domain_nr(dev->bus))
-			return dmaru;
-
-		if (dmar_pci_device_match(dmaru->devices,
-					  dmaru->devices_cnt, dev))
-			return dmaru;
-	}
-
-	return NULL;
-}
-
-int __init dmar_dev_scope_init(void)
-{
-	struct dmar_drhd_unit *drhd, *drhd_n;
-	int ret = -ENODEV;
-
-	list_for_each_entry_safe(drhd, drhd_n, &dmar_drhd_units, list) {
-		ret = dmar_parse_dev(drhd);
-		if (ret)
-			return ret;
-	}
-
-#ifdef CONFIG_DMAR
-	{
-		struct dmar_rmrr_unit *rmrr, *rmrr_n;
-		struct dmar_atsr_unit *atsr, *atsr_n;
-
-		list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
-			ret = rmrr_parse_dev(rmrr);
-			if (ret)
-				return ret;
-		}
-
-		list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
-			ret = atsr_parse_dev(atsr);
-			if (ret)
-				return ret;
-		}
-	}
-#endif
-
-	return ret;
-}
-
-
-int __init dmar_table_init(void)
-{
-	static int dmar_table_initialized;
-	int ret;
-
-	if (dmar_table_initialized)
-		return 0;
-
-	dmar_table_initialized = 1;
-
-	ret = parse_dmar_table();
-	if (ret) {
-		if (ret != -ENODEV)
-			printk(KERN_INFO PREFIX "parse DMAR table failure.\n");
-		return ret;
-	}
-
-	if (list_empty(&dmar_drhd_units)) {
-		printk(KERN_INFO PREFIX "No DMAR devices found\n");
-		return -ENODEV;
-	}
-
-#ifdef CONFIG_DMAR
-	if (list_empty(&dmar_rmrr_units))
-		printk(KERN_INFO PREFIX "No RMRR found\n");
-
-	if (list_empty(&dmar_atsr_units))
-		printk(KERN_INFO PREFIX "No ATSR found\n");
-#endif
-
-	return 0;
-}
-
-static void warn_invalid_dmar(u64 addr, const char *message)
-{
-	WARN_TAINT_ONCE(
-		1, TAINT_FIRMWARE_WORKAROUND,
-		"Your BIOS is broken; DMAR reported at address %llx%s!\n"
-		"BIOS vendor: %s; Ver: %s; Product Version: %s\n",
-		addr, message,
-		dmi_get_system_info(DMI_BIOS_VENDOR),
-		dmi_get_system_info(DMI_BIOS_VERSION),
-		dmi_get_system_info(DMI_PRODUCT_VERSION));
-}
-
-int __init check_zero_address(void)
-{
-	struct acpi_table_dmar *dmar;
-	struct acpi_dmar_header *entry_header;
-	struct acpi_dmar_hardware_unit *drhd;
-
-	dmar = (struct acpi_table_dmar *)dmar_tbl;
-	entry_header = (struct acpi_dmar_header *)(dmar + 1);
-
-	while (((unsigned long)entry_header) <
-			(((unsigned long)dmar) + dmar_tbl->length)) {
-		/* Avoid looping forever on bad ACPI tables */
-		if (entry_header->length == 0) {
-			printk(KERN_WARNING PREFIX
-				"Invalid 0-length structure\n");
-			return 0;
-		}
-
-		if (entry_header->type == ACPI_DMAR_TYPE_HARDWARE_UNIT) {
-			void __iomem *addr;
-			u64 cap, ecap;
-
-			drhd = (void *)entry_header;
-			if (!drhd->address) {
-				warn_invalid_dmar(0, "");
-				goto failed;
-			}
-
-			addr = early_ioremap(drhd->address, VTD_PAGE_SIZE);
-			if (!addr ) {
-				printk("IOMMU: can't validate: %llx\n", drhd->address);
-				goto failed;
-			}
-			cap = dmar_readq(addr + DMAR_CAP_REG);
-			ecap = dmar_readq(addr + DMAR_ECAP_REG);
-			early_iounmap(addr, VTD_PAGE_SIZE);
-			if (cap == (uint64_t)-1 && ecap == (uint64_t)-1) {
-				warn_invalid_dmar(drhd->address,
-						  " returns all ones");
-				goto failed;
-			}
-		}
-
-		entry_header = ((void *)entry_header + entry_header->length);
-	}
-	return 1;
-
-failed:
-#ifdef CONFIG_DMAR
-	dmar_disabled = 1;
-#endif
-	return 0;
-}
-
-int __init detect_intel_iommu(void)
-{
-	int ret;
-
-	ret = dmar_table_detect();
-	if (ret)
-		ret = check_zero_address();
-	{
-#ifdef CONFIG_INTR_REMAP
-		struct acpi_table_dmar *dmar;
-
-		dmar = (struct acpi_table_dmar *) dmar_tbl;
-		if (ret && cpu_has_x2apic && dmar->flags & 0x1)
-			printk(KERN_INFO
-			       "Queued invalidation will be enabled to support "
-			       "x2apic and Intr-remapping.\n");
-#endif
-#ifdef CONFIG_DMAR
-		if (ret && !no_iommu && !iommu_detected && !dmar_disabled) {
-			iommu_detected = 1;
-			/* Make sure ACS will be enabled */
-			pci_request_acs();
-		}
-#endif
-#ifdef CONFIG_X86
-		if (ret)
-			x86_init.iommu.iommu_init = intel_iommu_init;
-#endif
-	}
-	early_acpi_os_unmap_memory(dmar_tbl, dmar_tbl_size);
-	dmar_tbl = NULL;
-
-	return ret ? 1 : -ENODEV;
-}
-
-
-int alloc_iommu(struct dmar_drhd_unit *drhd)
-{
-	struct intel_iommu *iommu;
-	int map_size;
-	u32 ver;
-	static int iommu_allocated = 0;
-	int agaw = 0;
-	int msagaw = 0;
-
-	if (!drhd->reg_base_addr) {
-		warn_invalid_dmar(0, "");
-		return -EINVAL;
-	}
-
-	iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
-	if (!iommu)
-		return -ENOMEM;
-
-	iommu->seq_id = iommu_allocated++;
-	sprintf (iommu->name, "dmar%d", iommu->seq_id);
-
-	iommu->reg = ioremap(drhd->reg_base_addr, VTD_PAGE_SIZE);
-	if (!iommu->reg) {
-		printk(KERN_ERR "IOMMU: can't map the region\n");
-		goto error;
-	}
-	iommu->cap = dmar_readq(iommu->reg + DMAR_CAP_REG);
-	iommu->ecap = dmar_readq(iommu->reg + DMAR_ECAP_REG);
-
-	if (iommu->cap == (uint64_t)-1 && iommu->ecap == (uint64_t)-1) {
-		warn_invalid_dmar(drhd->reg_base_addr, " returns all ones");
-		goto err_unmap;
-	}
-
-#ifdef CONFIG_DMAR
-	agaw = iommu_calculate_agaw(iommu);
-	if (agaw < 0) {
-		printk(KERN_ERR
-		       "Cannot get a valid agaw for iommu (seq_id = %d)\n",
-		       iommu->seq_id);
-		goto err_unmap;
-	}
-	msagaw = iommu_calculate_max_sagaw(iommu);
-	if (msagaw < 0) {
-		printk(KERN_ERR
-			"Cannot get a valid max agaw for iommu (seq_id = %d)\n",
-			iommu->seq_id);
-		goto err_unmap;
-	}
-#endif
-	iommu->agaw = agaw;
-	iommu->msagaw = msagaw;
-
-	iommu->node = -1;
-
-	/* the registers might be more than one page */
-	map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap),
-		cap_max_fault_reg_offset(iommu->cap));
-	map_size = VTD_PAGE_ALIGN(map_size);
-	if (map_size > VTD_PAGE_SIZE) {
-		iounmap(iommu->reg);
-		iommu->reg = ioremap(drhd->reg_base_addr, map_size);
-		if (!iommu->reg) {
-			printk(KERN_ERR "IOMMU: can't map the region\n");
-			goto error;
-		}
-	}
-
-	ver = readl(iommu->reg + DMAR_VER_REG);
-	pr_info("IOMMU %d: reg_base_addr %llx ver %d:%d cap %llx ecap %llx\n",
-		iommu->seq_id,
-		(unsigned long long)drhd->reg_base_addr,
-		DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver),
-		(unsigned long long)iommu->cap,
-		(unsigned long long)iommu->ecap);
-
-	spin_lock_init(&iommu->register_lock);
-
-	drhd->iommu = iommu;
-	return 0;
-
- err_unmap:
-	iounmap(iommu->reg);
- error:
-	kfree(iommu);
-	return -1;
-}
-
-void free_iommu(struct intel_iommu *iommu)
-{
-	if (!iommu)
-		return;
-
-#ifdef CONFIG_DMAR
-	free_dmar_iommu(iommu);
-#endif
-
-	if (iommu->reg)
-		iounmap(iommu->reg);
-	kfree(iommu);
-}
-
-/*
- * Reclaim all the submitted descriptors which have completed its work.
- */
-static inline void reclaim_free_desc(struct q_inval *qi)
-{
-	while (qi->desc_status[qi->free_tail] == QI_DONE ||
-	       qi->desc_status[qi->free_tail] == QI_ABORT) {
-		qi->desc_status[qi->free_tail] = QI_FREE;
-		qi->free_tail = (qi->free_tail + 1) % QI_LENGTH;
-		qi->free_cnt++;
-	}
-}
-
-static int qi_check_fault(struct intel_iommu *iommu, int index)
-{
-	u32 fault;
-	int head, tail;
-	struct q_inval *qi = iommu->qi;
-	int wait_index = (index + 1) % QI_LENGTH;
-
-	if (qi->desc_status[wait_index] == QI_ABORT)
-		return -EAGAIN;
-
-	fault = readl(iommu->reg + DMAR_FSTS_REG);
-
-	/*
-	 * If IQE happens, the head points to the descriptor associated
-	 * with the error. No new descriptors are fetched until the IQE
-	 * is cleared.
-	 */
-	if (fault & DMA_FSTS_IQE) {
-		head = readl(iommu->reg + DMAR_IQH_REG);
-		if ((head >> DMAR_IQ_SHIFT) == index) {
-			printk(KERN_ERR "VT-d detected invalid descriptor: "
-				"low=%llx, high=%llx\n",
-				(unsigned long long)qi->desc[index].low,
-				(unsigned long long)qi->desc[index].high);
-			memcpy(&qi->desc[index], &qi->desc[wait_index],
-					sizeof(struct qi_desc));
-			__iommu_flush_cache(iommu, &qi->desc[index],
-					sizeof(struct qi_desc));
-			writel(DMA_FSTS_IQE, iommu->reg + DMAR_FSTS_REG);
-			return -EINVAL;
-		}
-	}
-
-	/*
-	 * If ITE happens, all pending wait_desc commands are aborted.
-	 * No new descriptors are fetched until the ITE is cleared.
-	 */
-	if (fault & DMA_FSTS_ITE) {
-		head = readl(iommu->reg + DMAR_IQH_REG);
-		head = ((head >> DMAR_IQ_SHIFT) - 1 + QI_LENGTH) % QI_LENGTH;
-		head |= 1;
-		tail = readl(iommu->reg + DMAR_IQT_REG);
-		tail = ((tail >> DMAR_IQ_SHIFT) - 1 + QI_LENGTH) % QI_LENGTH;
-
-		writel(DMA_FSTS_ITE, iommu->reg + DMAR_FSTS_REG);
-
-		do {
-			if (qi->desc_status[head] == QI_IN_USE)
-				qi->desc_status[head] = QI_ABORT;
-			head = (head - 2 + QI_LENGTH) % QI_LENGTH;
-		} while (head != tail);
-
-		if (qi->desc_status[wait_index] == QI_ABORT)
-			return -EAGAIN;
-	}
-
-	if (fault & DMA_FSTS_ICE)
-		writel(DMA_FSTS_ICE, iommu->reg + DMAR_FSTS_REG);
-
-	return 0;
-}
-
-/*
- * Submit the queued invalidation descriptor to the remapping
- * hardware unit and wait for its completion.
- */
-int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu)
-{
-	int rc;
-	struct q_inval *qi = iommu->qi;
-	struct qi_desc *hw, wait_desc;
-	int wait_index, index;
-	unsigned long flags;
-
-	if (!qi)
-		return 0;
-
-	hw = qi->desc;
-
-restart:
-	rc = 0;
-
-	spin_lock_irqsave(&qi->q_lock, flags);
-	while (qi->free_cnt < 3) {
-		spin_unlock_irqrestore(&qi->q_lock, flags);
-		cpu_relax();
-		spin_lock_irqsave(&qi->q_lock, flags);
-	}
-
-	index = qi->free_head;
-	wait_index = (index + 1) % QI_LENGTH;
-
-	qi->desc_status[index] = qi->desc_status[wait_index] = QI_IN_USE;
-
-	hw[index] = *desc;
-
-	wait_desc.low = QI_IWD_STATUS_DATA(QI_DONE) |
-			QI_IWD_STATUS_WRITE | QI_IWD_TYPE;
-	wait_desc.high = virt_to_phys(&qi->desc_status[wait_index]);
-
-	hw[wait_index] = wait_desc;
-
-	__iommu_flush_cache(iommu, &hw[index], sizeof(struct qi_desc));
-	__iommu_flush_cache(iommu, &hw[wait_index], sizeof(struct qi_desc));
-
-	qi->free_head = (qi->free_head + 2) % QI_LENGTH;
-	qi->free_cnt -= 2;
-
-	/*
-	 * update the HW tail register indicating the presence of
-	 * new descriptors.
-	 */
-	writel(qi->free_head << DMAR_IQ_SHIFT, iommu->reg + DMAR_IQT_REG);
-
-	while (qi->desc_status[wait_index] != QI_DONE) {
-		/*
-		 * We will leave the interrupts disabled, to prevent interrupt
-		 * context to queue another cmd while a cmd is already submitted
-		 * and waiting for completion on this cpu. This is to avoid
-		 * a deadlock where the interrupt context can wait indefinitely
-		 * for free slots in the queue.
-		 */
-		rc = qi_check_fault(iommu, index);
-		if (rc)
-			break;
-
-		spin_unlock(&qi->q_lock);
-		cpu_relax();
-		spin_lock(&qi->q_lock);
-	}
-
-	qi->desc_status[index] = QI_DONE;
-
-	reclaim_free_desc(qi);
-	spin_unlock_irqrestore(&qi->q_lock, flags);
-
-	if (rc == -EAGAIN)
-		goto restart;
-
-	return rc;
-}
-
-/*
- * Flush the global interrupt entry cache.
- */
-void qi_global_iec(struct intel_iommu *iommu)
-{
-	struct qi_desc desc;
-
-	desc.low = QI_IEC_TYPE;
-	desc.high = 0;
-
-	/* should never fail */
-	qi_submit_sync(&desc, iommu);
-}
-
-void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm,
-		      u64 type)
-{
-	struct qi_desc desc;
-
-	desc.low = QI_CC_FM(fm) | QI_CC_SID(sid) | QI_CC_DID(did)
-			| QI_CC_GRAN(type) | QI_CC_TYPE;
-	desc.high = 0;
-
-	qi_submit_sync(&desc, iommu);
-}
-
-void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
-		    unsigned int size_order, u64 type)
-{
-	u8 dw = 0, dr = 0;
-
-	struct qi_desc desc;
-	int ih = 0;
-
-	if (cap_write_drain(iommu->cap))
-		dw = 1;
-
-	if (cap_read_drain(iommu->cap))
-		dr = 1;
-
-	desc.low = QI_IOTLB_DID(did) | QI_IOTLB_DR(dr) | QI_IOTLB_DW(dw)
-		| QI_IOTLB_GRAN(type) | QI_IOTLB_TYPE;
-	desc.high = QI_IOTLB_ADDR(addr) | QI_IOTLB_IH(ih)
-		| QI_IOTLB_AM(size_order);
-
-	qi_submit_sync(&desc, iommu);
-}
-
-void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qdep,
-			u64 addr, unsigned mask)
-{
-	struct qi_desc desc;
-
-	if (mask) {
-		BUG_ON(addr & ((1 << (VTD_PAGE_SHIFT + mask)) - 1));
-		addr |= (1 << (VTD_PAGE_SHIFT + mask - 1)) - 1;
-		desc.high = QI_DEV_IOTLB_ADDR(addr) | QI_DEV_IOTLB_SIZE;
-	} else
-		desc.high = QI_DEV_IOTLB_ADDR(addr);
-
-	if (qdep >= QI_DEV_IOTLB_MAX_INVS)
-		qdep = 0;
-
-	desc.low = QI_DEV_IOTLB_SID(sid) | QI_DEV_IOTLB_QDEP(qdep) |
-		   QI_DIOTLB_TYPE;
-
-	qi_submit_sync(&desc, iommu);
-}
-
-/*
- * Disable Queued Invalidation interface.
- */
-void dmar_disable_qi(struct intel_iommu *iommu)
-{
-	unsigned long flags;
-	u32 sts;
-	cycles_t start_time = get_cycles();
-
-	if (!ecap_qis(iommu->ecap))
-		return;
-
-	spin_lock_irqsave(&iommu->register_lock, flags);
-
-	sts =  dmar_readq(iommu->reg + DMAR_GSTS_REG);
-	if (!(sts & DMA_GSTS_QIES))
-		goto end;
-
-	/*
-	 * Give a chance to HW to complete the pending invalidation requests.
-	 */
-	while ((readl(iommu->reg + DMAR_IQT_REG) !=
-		readl(iommu->reg + DMAR_IQH_REG)) &&
-		(DMAR_OPERATION_TIMEOUT > (get_cycles() - start_time)))
-		cpu_relax();
-
-	iommu->gcmd &= ~DMA_GCMD_QIE;
-	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
-
-	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl,
-		      !(sts & DMA_GSTS_QIES), sts);
-end:
-	spin_unlock_irqrestore(&iommu->register_lock, flags);
-}
-
-/*
- * Enable queued invalidation.
- */
-static void __dmar_enable_qi(struct intel_iommu *iommu)
-{
-	u32 sts;
-	unsigned long flags;
-	struct q_inval *qi = iommu->qi;
-
-	qi->free_head = qi->free_tail = 0;
-	qi->free_cnt = QI_LENGTH;
-
-	spin_lock_irqsave(&iommu->register_lock, flags);
-
-	/* write zero to the tail reg */
-	writel(0, iommu->reg + DMAR_IQT_REG);
-
-	dmar_writeq(iommu->reg + DMAR_IQA_REG, virt_to_phys(qi->desc));
-
-	iommu->gcmd |= DMA_GCMD_QIE;
-	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
-
-	/* Make sure hardware complete it */
-	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl, (sts & DMA_GSTS_QIES), sts);
-
-	spin_unlock_irqrestore(&iommu->register_lock, flags);
-}
-
-/*
- * Enable Queued Invalidation interface. This is a must to support
- * interrupt-remapping. Also used by DMA-remapping, which replaces
- * register based IOTLB invalidation.
- */
-int dmar_enable_qi(struct intel_iommu *iommu)
-{
-	struct q_inval *qi;
-	struct page *desc_page;
-
-	if (!ecap_qis(iommu->ecap))
-		return -ENOENT;
-
-	/*
-	 * queued invalidation is already setup and enabled.
-	 */
-	if (iommu->qi)
-		return 0;
-
-	iommu->qi = kmalloc(sizeof(*qi), GFP_ATOMIC);
-	if (!iommu->qi)
-		return -ENOMEM;
-
-	qi = iommu->qi;
-
-
-	desc_page = alloc_pages_node(iommu->node, GFP_ATOMIC | __GFP_ZERO, 0);
-	if (!desc_page) {
-		kfree(qi);
-		iommu->qi = 0;
-		return -ENOMEM;
-	}
-
-	qi->desc = page_address(desc_page);
-
-	qi->desc_status = kmalloc(QI_LENGTH * sizeof(int), GFP_ATOMIC);
-	if (!qi->desc_status) {
-		free_page((unsigned long) qi->desc);
-		kfree(qi);
-		iommu->qi = 0;
-		return -ENOMEM;
-	}
-
-	qi->free_head = qi->free_tail = 0;
-	qi->free_cnt = QI_LENGTH;
-
-	spin_lock_init(&qi->q_lock);
-
-	__dmar_enable_qi(iommu);
-
-	return 0;
-}
-
-/* iommu interrupt handling. Most stuff are MSI-like. */
-
-enum faulttype {
-	DMA_REMAP,
-	INTR_REMAP,
-	UNKNOWN,
-};
-
-static const char *dma_remap_fault_reasons[] =
-{
-	"Software",
-	"Present bit in root entry is clear",
-	"Present bit in context entry is clear",
-	"Invalid context entry",
-	"Access beyond MGAW",
-	"PTE Write access is not set",
-	"PTE Read access is not set",
-	"Next page table ptr is invalid",
-	"Root table address invalid",
-	"Context table ptr is invalid",
-	"non-zero reserved fields in RTP",
-	"non-zero reserved fields in CTP",
-	"non-zero reserved fields in PTE",
-};
-
-static const char *intr_remap_fault_reasons[] =
-{
-	"Detected reserved fields in the decoded interrupt-remapped request",
-	"Interrupt index exceeded the interrupt-remapping table size",
-	"Present field in the IRTE entry is clear",
-	"Error accessing interrupt-remapping table pointed by IRTA_REG",
-	"Detected reserved fields in the IRTE entry",
-	"Blocked a compatibility format interrupt request",
-	"Blocked an interrupt request due to source-id verification failure",
-};
-
-#define MAX_FAULT_REASON_IDX 	(ARRAY_SIZE(fault_reason_strings) - 1)
-
-const char *dmar_get_fault_reason(u8 fault_reason, int *fault_type)
-{
-	if (fault_reason >= 0x20 && (fault_reason <= 0x20 +
-				     ARRAY_SIZE(intr_remap_fault_reasons))) {
-		*fault_type = INTR_REMAP;
-		return intr_remap_fault_reasons[fault_reason - 0x20];
-	} else if (fault_reason < ARRAY_SIZE(dma_remap_fault_reasons)) {
-		*fault_type = DMA_REMAP;
-		return dma_remap_fault_reasons[fault_reason];
-	} else {
-		*fault_type = UNKNOWN;
-		return "Unknown";
-	}
-}
-
-void dmar_msi_unmask(struct irq_data *data)
-{
-	struct intel_iommu *iommu = irq_data_get_irq_handler_data(data);
-	unsigned long flag;
-
-	/* unmask it */
-	spin_lock_irqsave(&iommu->register_lock, flag);
-	writel(0, iommu->reg + DMAR_FECTL_REG);
-	/* Read a reg to force flush the post write */
-	readl(iommu->reg + DMAR_FECTL_REG);
-	spin_unlock_irqrestore(&iommu->register_lock, flag);
-}
-
-void dmar_msi_mask(struct irq_data *data)
-{
-	unsigned long flag;
-	struct intel_iommu *iommu = irq_data_get_irq_handler_data(data);
-
-	/* mask it */
-	spin_lock_irqsave(&iommu->register_lock, flag);
-	writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
-	/* Read a reg to force flush the post write */
-	readl(iommu->reg + DMAR_FECTL_REG);
-	spin_unlock_irqrestore(&iommu->register_lock, flag);
-}
-
-void dmar_msi_write(int irq, struct msi_msg *msg)
-{
-	struct intel_iommu *iommu = irq_get_handler_data(irq);
-	unsigned long flag;
-
-	spin_lock_irqsave(&iommu->register_lock, flag);
-	writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
-	writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
-	writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
-	spin_unlock_irqrestore(&iommu->register_lock, flag);
-}
-
-void dmar_msi_read(int irq, struct msi_msg *msg)
-{
-	struct intel_iommu *iommu = irq_get_handler_data(irq);
-	unsigned long flag;
-
-	spin_lock_irqsave(&iommu->register_lock, flag);
-	msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
-	msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
-	msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
-	spin_unlock_irqrestore(&iommu->register_lock, flag);
-}
-
-static int dmar_fault_do_one(struct intel_iommu *iommu, int type,
-		u8 fault_reason, u16 source_id, unsigned long long addr)
-{
-	const char *reason;
-	int fault_type;
-
-	reason = dmar_get_fault_reason(fault_reason, &fault_type);
-
-	if (fault_type == INTR_REMAP)
-		printk(KERN_ERR "INTR-REMAP: Request device [[%02x:%02x.%d] "
-		       "fault index %llx\n"
-			"INTR-REMAP:[fault reason %02d] %s\n",
-			(source_id >> 8), PCI_SLOT(source_id & 0xFF),
-			PCI_FUNC(source_id & 0xFF), addr >> 48,
-			fault_reason, reason);
-	else
-		printk(KERN_ERR
-		       "DMAR:[%s] Request device [%02x:%02x.%d] "
-		       "fault addr %llx \n"
-		       "DMAR:[fault reason %02d] %s\n",
-		       (type ? "DMA Read" : "DMA Write"),
-		       (source_id >> 8), PCI_SLOT(source_id & 0xFF),
-		       PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
-	return 0;
-}
-
-#define PRIMARY_FAULT_REG_LEN (16)
-irqreturn_t dmar_fault(int irq, void *dev_id)
-{
-	struct intel_iommu *iommu = dev_id;
-	int reg, fault_index;
-	u32 fault_status;
-	unsigned long flag;
-
-	spin_lock_irqsave(&iommu->register_lock, flag);
-	fault_status = readl(iommu->reg + DMAR_FSTS_REG);
-	if (fault_status)
-		printk(KERN_ERR "DRHD: handling fault status reg %x\n",
-		       fault_status);
-
-	/* TBD: ignore advanced fault log currently */
-	if (!(fault_status & DMA_FSTS_PPF))
-		goto clear_rest;
-
-	fault_index = dma_fsts_fault_record_index(fault_status);
-	reg = cap_fault_reg_offset(iommu->cap);
-	while (1) {
-		u8 fault_reason;
-		u16 source_id;
-		u64 guest_addr;
-		int type;
-		u32 data;
-
-		/* highest 32 bits */
-		data = readl(iommu->reg + reg +
-				fault_index * PRIMARY_FAULT_REG_LEN + 12);
-		if (!(data & DMA_FRCD_F))
-			break;
-
-		fault_reason = dma_frcd_fault_reason(data);
-		type = dma_frcd_type(data);
-
-		data = readl(iommu->reg + reg +
-				fault_index * PRIMARY_FAULT_REG_LEN + 8);
-		source_id = dma_frcd_source_id(data);
-
-		guest_addr = dmar_readq(iommu->reg + reg +
-				fault_index * PRIMARY_FAULT_REG_LEN);
-		guest_addr = dma_frcd_page_addr(guest_addr);
-		/* clear the fault */
-		writel(DMA_FRCD_F, iommu->reg + reg +
-			fault_index * PRIMARY_FAULT_REG_LEN + 12);
-
-		spin_unlock_irqrestore(&iommu->register_lock, flag);
-
-		dmar_fault_do_one(iommu, type, fault_reason,
-				source_id, guest_addr);
-
-		fault_index++;
-		if (fault_index >= cap_num_fault_regs(iommu->cap))
-			fault_index = 0;
-		spin_lock_irqsave(&iommu->register_lock, flag);
-	}
-clear_rest:
-	/* clear all the other faults */
-	fault_status = readl(iommu->reg + DMAR_FSTS_REG);
-	writel(fault_status, iommu->reg + DMAR_FSTS_REG);
-
-	spin_unlock_irqrestore(&iommu->register_lock, flag);
-	return IRQ_HANDLED;
-}
-
-int dmar_set_interrupt(struct intel_iommu *iommu)
-{
-	int irq, ret;
-
-	/*
-	 * Check if the fault interrupt is already initialized.
-	 */
-	if (iommu->irq)
-		return 0;
-
-	irq = create_irq();
-	if (!irq) {
-		printk(KERN_ERR "IOMMU: no free vectors\n");
-		return -EINVAL;
-	}
-
-	irq_set_handler_data(irq, iommu);
-	iommu->irq = irq;
-
-	ret = arch_setup_dmar_msi(irq);
-	if (ret) {
-		irq_set_handler_data(irq, NULL);
-		iommu->irq = 0;
-		destroy_irq(irq);
-		return ret;
-	}
-
-	ret = request_irq(irq, dmar_fault, 0, iommu->name, iommu);
-	if (ret)
-		printk(KERN_ERR "IOMMU: can't request irq\n");
-	return ret;
-}
-
-int __init enable_drhd_fault_handling(void)
-{
-	struct dmar_drhd_unit *drhd;
-
-	/*
-	 * Enable fault control interrupt.
-	 */
-	for_each_drhd_unit(drhd) {
-		int ret;
-		struct intel_iommu *iommu = drhd->iommu;
-		ret = dmar_set_interrupt(iommu);
-
-		if (ret) {
-			printk(KERN_ERR "DRHD %Lx: failed to enable fault, "
-			       " interrupt, ret %d\n",
-			       (unsigned long long)drhd->reg_base_addr, ret);
-			return -1;
-		}
-
-		/*
-		 * Clear any previous faults.
-		 */
-		dmar_fault(iommu->irq, iommu);
-	}
-
-	return 0;
-}
-
-/*
- * Re-enable Queued Invalidation interface.
- */
-int dmar_reenable_qi(struct intel_iommu *iommu)
-{
-	if (!ecap_qis(iommu->ecap))
-		return -ENOENT;
-
-	if (!iommu->qi)
-		return -ENOENT;
-
-	/*
-	 * First disable queued invalidation.
-	 */
-	dmar_disable_qi(iommu);
-	/*
-	 * Then enable queued invalidation again. Since there is no pending
-	 * invalidation requests now, it's safe to re-enable queued
-	 * invalidation.
-	 */
-	__dmar_enable_qi(iommu);
-
-	return 0;
-}
-
-/*
- * Check interrupt remapping support in DMAR table description.
- */
-int __init dmar_ir_support(void)
-{
-	struct acpi_table_dmar *dmar;
-	dmar = (struct acpi_table_dmar *)dmar_tbl;
-	if (!dmar)
-		return 0;
-	return dmar->flags & 0x1;
-}
-IOMMU_INIT_POST(detect_intel_iommu);
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
deleted file mode 100644
index f02c34d26d1b..000000000000
--- a/drivers/pci/intel-iommu.c
+++ /dev/null
@@ -1,4017 +0,0 @@
-/*
- * Copyright (c) 2006, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
- *
- * Copyright (C) 2006-2008 Intel Corporation
- * Author: Ashok Raj <ashok.raj@intel.com>
- * Author: Shaohua Li <shaohua.li@intel.com>
- * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
- * Author: Fenghua Yu <fenghua.yu@intel.com>
- */
-
-#include <linux/init.h>
-#include <linux/bitmap.h>
-#include <linux/debugfs.h>
-#include <linux/slab.h>
-#include <linux/irq.h>
-#include <linux/interrupt.h>
-#include <linux/spinlock.h>
-#include <linux/pci.h>
-#include <linux/dmar.h>
-#include <linux/dma-mapping.h>
-#include <linux/mempool.h>
-#include <linux/timer.h>
-#include <linux/iova.h>
-#include <linux/iommu.h>
-#include <linux/intel-iommu.h>
-#include <linux/syscore_ops.h>
-#include <linux/tboot.h>
-#include <linux/dmi.h>
-#include <linux/pci-ats.h>
-#include <asm/cacheflush.h>
-#include <asm/iommu.h>
-#include "pci.h"
-
-#define ROOT_SIZE		VTD_PAGE_SIZE
-#define CONTEXT_SIZE		VTD_PAGE_SIZE
-
-#define IS_BRIDGE_HOST_DEVICE(pdev) \
-			    ((pdev->class >> 8) == PCI_CLASS_BRIDGE_HOST)
-#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
-#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
-#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
-
-#define IOAPIC_RANGE_START	(0xfee00000)
-#define IOAPIC_RANGE_END	(0xfeefffff)
-#define IOVA_START_ADDR		(0x1000)
-
-#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
-
-#define MAX_AGAW_WIDTH 64
-
-#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
-#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
-
-/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
-   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
-#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
-				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
-#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
-
-#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
-#define DMA_32BIT_PFN		IOVA_PFN(DMA_BIT_MASK(32))
-#define DMA_64BIT_PFN		IOVA_PFN(DMA_BIT_MASK(64))
-
-/* page table handling */
-#define LEVEL_STRIDE		(9)
-#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
-
-static inline int agaw_to_level(int agaw)
-{
-	return agaw + 2;
-}
-
-static inline int agaw_to_width(int agaw)
-{
-	return 30 + agaw * LEVEL_STRIDE;
-}
-
-static inline int width_to_agaw(int width)
-{
-	return (width - 30) / LEVEL_STRIDE;
-}
-
-static inline unsigned int level_to_offset_bits(int level)
-{
-	return (level - 1) * LEVEL_STRIDE;
-}
-
-static inline int pfn_level_offset(unsigned long pfn, int level)
-{
-	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
-}
-
-static inline unsigned long level_mask(int level)
-{
-	return -1UL << level_to_offset_bits(level);
-}
-
-static inline unsigned long level_size(int level)
-{
-	return 1UL << level_to_offset_bits(level);
-}
-
-static inline unsigned long align_to_level(unsigned long pfn, int level)
-{
-	return (pfn + level_size(level) - 1) & level_mask(level);
-}
-
-static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
-{
-	return  1 << ((lvl - 1) * LEVEL_STRIDE);
-}
-
-/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
-   are never going to work. */
-static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
-{
-	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
-}
-
-static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
-{
-	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
-}
-static inline unsigned long page_to_dma_pfn(struct page *pg)
-{
-	return mm_to_dma_pfn(page_to_pfn(pg));
-}
-static inline unsigned long virt_to_dma_pfn(void *p)
-{
-	return page_to_dma_pfn(virt_to_page(p));
-}
-
-/* global iommu list, set NULL for ignored DMAR units */
-static struct intel_iommu **g_iommus;
-
-static void __init check_tylersburg_isoch(void);
-static int rwbf_quirk;
-
-/*
- * set to 1 to panic kernel if can't successfully enable VT-d
- * (used when kernel is launched w/ TXT)
- */
-static int force_on = 0;
-
-/*
- * 0: Present
- * 1-11: Reserved
- * 12-63: Context Ptr (12 - (haw-1))
- * 64-127: Reserved
- */
-struct root_entry {
-	u64	val;
-	u64	rsvd1;
-};
-#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
-static inline bool root_present(struct root_entry *root)
-{
-	return (root->val & 1);
-}
-static inline void set_root_present(struct root_entry *root)
-{
-	root->val |= 1;
-}
-static inline void set_root_value(struct root_entry *root, unsigned long value)
-{
-	root->val |= value & VTD_PAGE_MASK;
-}
-
-static inline struct context_entry *
-get_context_addr_from_root(struct root_entry *root)
-{
-	return (struct context_entry *)
-		(root_present(root)?phys_to_virt(
-		root->val & VTD_PAGE_MASK) :
-		NULL);
-}
-
-/*
- * low 64 bits:
- * 0: present
- * 1: fault processing disable
- * 2-3: translation type
- * 12-63: address space root
- * high 64 bits:
- * 0-2: address width
- * 3-6: aval
- * 8-23: domain id
- */
-struct context_entry {
-	u64 lo;
-	u64 hi;
-};
-
-static inline bool context_present(struct context_entry *context)
-{
-	return (context->lo & 1);
-}
-static inline void context_set_present(struct context_entry *context)
-{
-	context->lo |= 1;
-}
-
-static inline void context_set_fault_enable(struct context_entry *context)
-{
-	context->lo &= (((u64)-1) << 2) | 1;
-}
-
-static inline void context_set_translation_type(struct context_entry *context,
-						unsigned long value)
-{
-	context->lo &= (((u64)-1) << 4) | 3;
-	context->lo |= (value & 3) << 2;
-}
-
-static inline void context_set_address_root(struct context_entry *context,
-					    unsigned long value)
-{
-	context->lo |= value & VTD_PAGE_MASK;
-}
-
-static inline void context_set_address_width(struct context_entry *context,
-					     unsigned long value)
-{
-	context->hi |= value & 7;
-}
-
-static inline void context_set_domain_id(struct context_entry *context,
-					 unsigned long value)
-{
-	context->hi |= (value & ((1 << 16) - 1)) << 8;
-}
-
-static inline void context_clear_entry(struct context_entry *context)
-{
-	context->lo = 0;
-	context->hi = 0;
-}
-
-/*
- * 0: readable
- * 1: writable
- * 2-6: reserved
- * 7: super page
- * 8-10: available
- * 11: snoop behavior
- * 12-63: Host physcial address
- */
-struct dma_pte {
-	u64 val;
-};
-
-static inline void dma_clear_pte(struct dma_pte *pte)
-{
-	pte->val = 0;
-}
-
-static inline void dma_set_pte_readable(struct dma_pte *pte)
-{
-	pte->val |= DMA_PTE_READ;
-}
-
-static inline void dma_set_pte_writable(struct dma_pte *pte)
-{
-	pte->val |= DMA_PTE_WRITE;
-}
-
-static inline void dma_set_pte_snp(struct dma_pte *pte)
-{
-	pte->val |= DMA_PTE_SNP;
-}
-
-static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
-{
-	pte->val = (pte->val & ~3) | (prot & 3);
-}
-
-static inline u64 dma_pte_addr(struct dma_pte *pte)
-{
-#ifdef CONFIG_64BIT
-	return pte->val & VTD_PAGE_MASK;
-#else
-	/* Must have a full atomic 64-bit read */
-	return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
-#endif
-}
-
-static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
-{
-	pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
-}
-
-static inline bool dma_pte_present(struct dma_pte *pte)
-{
-	return (pte->val & 3) != 0;
-}
-
-static inline int first_pte_in_page(struct dma_pte *pte)
-{
-	return !((unsigned long)pte & ~VTD_PAGE_MASK);
-}
-
-/*
- * This domain is a statically identity mapping domain.
- *	1. This domain creats a static 1:1 mapping to all usable memory.
- * 	2. It maps to each iommu if successful.
- *	3. Each iommu mapps to this domain if successful.
- */
-static struct dmar_domain *si_domain;
-static int hw_pass_through = 1;
-
-/* devices under the same p2p bridge are owned in one domain */
-#define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
-
-/* domain represents a virtual machine, more than one devices
- * across iommus may be owned in one domain, e.g. kvm guest.
- */
-#define DOMAIN_FLAG_VIRTUAL_MACHINE	(1 << 1)
-
-/* si_domain contains mulitple devices */
-#define DOMAIN_FLAG_STATIC_IDENTITY	(1 << 2)
-
-struct dmar_domain {
-	int	id;			/* domain id */
-	int	nid;			/* node id */
-	unsigned long iommu_bmp;	/* bitmap of iommus this domain uses*/
-
-	struct list_head devices; 	/* all devices' list */
-	struct iova_domain iovad;	/* iova's that belong to this domain */
-
-	struct dma_pte	*pgd;		/* virtual address */
-	int		gaw;		/* max guest address width */
-
-	/* adjusted guest address width, 0 is level 2 30-bit */
-	int		agaw;
-
-	int		flags;		/* flags to find out type of domain */
-
-	int		iommu_coherency;/* indicate coherency of iommu access */
-	int		iommu_snooping; /* indicate snooping control feature*/
-	int		iommu_count;	/* reference count of iommu */
-	int		iommu_superpage;/* Level of superpages supported:
-					   0 == 4KiB (no superpages), 1 == 2MiB,
-					   2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
-	spinlock_t	iommu_lock;	/* protect iommu set in domain */
-	u64		max_addr;	/* maximum mapped address */
-};
-
-/* PCI domain-device relationship */
-struct device_domain_info {
-	struct list_head link;	/* link to domain siblings */
-	struct list_head global; /* link to global list */
-	int segment;		/* PCI domain */
-	u8 bus;			/* PCI bus number */
-	u8 devfn;		/* PCI devfn number */
-	struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
-	struct intel_iommu *iommu; /* IOMMU used by this device */
-	struct dmar_domain *domain; /* pointer to domain */
-};
-
-static void flush_unmaps_timeout(unsigned long data);
-
-DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
-
-#define HIGH_WATER_MARK 250
-struct deferred_flush_tables {
-	int next;
-	struct iova *iova[HIGH_WATER_MARK];
-	struct dmar_domain *domain[HIGH_WATER_MARK];
-};
-
-static struct deferred_flush_tables *deferred_flush;
-
-/* bitmap for indexing intel_iommus */
-static int g_num_of_iommus;
-
-static DEFINE_SPINLOCK(async_umap_flush_lock);
-static LIST_HEAD(unmaps_to_do);
-
-static int timer_on;
-static long list_size;
-
-static void domain_remove_dev_info(struct dmar_domain *domain);
-
-#ifdef CONFIG_DMAR_DEFAULT_ON
-int dmar_disabled = 0;
-#else
-int dmar_disabled = 1;
-#endif /*CONFIG_DMAR_DEFAULT_ON*/
-
-static int dmar_map_gfx = 1;
-static int dmar_forcedac;
-static int intel_iommu_strict;
-static int intel_iommu_superpage = 1;
-
-#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
-static DEFINE_SPINLOCK(device_domain_lock);
-static LIST_HEAD(device_domain_list);
-
-static struct iommu_ops intel_iommu_ops;
-
-static int __init intel_iommu_setup(char *str)
-{
-	if (!str)
-		return -EINVAL;
-	while (*str) {
-		if (!strncmp(str, "on", 2)) {
-			dmar_disabled = 0;
-			printk(KERN_INFO "Intel-IOMMU: enabled\n");
-		} else if (!strncmp(str, "off", 3)) {
-			dmar_disabled = 1;
-			printk(KERN_INFO "Intel-IOMMU: disabled\n");
-		} else if (!strncmp(str, "igfx_off", 8)) {
-			dmar_map_gfx = 0;
-			printk(KERN_INFO
-				"Intel-IOMMU: disable GFX device mapping\n");
-		} else if (!strncmp(str, "forcedac", 8)) {
-			printk(KERN_INFO
-				"Intel-IOMMU: Forcing DAC for PCI devices\n");
-			dmar_forcedac = 1;
-		} else if (!strncmp(str, "strict", 6)) {
-			printk(KERN_INFO
-				"Intel-IOMMU: disable batched IOTLB flush\n");
-			intel_iommu_strict = 1;
-		} else if (!strncmp(str, "sp_off", 6)) {
-			printk(KERN_INFO
-				"Intel-IOMMU: disable supported super page\n");
-			intel_iommu_superpage = 0;
-		}
-
-		str += strcspn(str, ",");
-		while (*str == ',')
-			str++;
-	}
-	return 0;
-}
-__setup("intel_iommu=", intel_iommu_setup);
-
-static struct kmem_cache *iommu_domain_cache;
-static struct kmem_cache *iommu_devinfo_cache;
-static struct kmem_cache *iommu_iova_cache;
-
-static inline void *alloc_pgtable_page(int node)
-{
-	struct page *page;
-	void *vaddr = NULL;
-
-	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
-	if (page)
-		vaddr = page_address(page);
-	return vaddr;
-}
-
-static inline void free_pgtable_page(void *vaddr)
-{
-	free_page((unsigned long)vaddr);
-}
-
-static inline void *alloc_domain_mem(void)
-{
-	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
-}
-
-static void free_domain_mem(void *vaddr)
-{
-	kmem_cache_free(iommu_domain_cache, vaddr);
-}
-
-static inline void * alloc_devinfo_mem(void)
-{
-	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
-}
-
-static inline void free_devinfo_mem(void *vaddr)
-{
-	kmem_cache_free(iommu_devinfo_cache, vaddr);
-}
-
-struct iova *alloc_iova_mem(void)
-{
-	return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
-}
-
-void free_iova_mem(struct iova *iova)
-{
-	kmem_cache_free(iommu_iova_cache, iova);
-}
-
-
-static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
-{
-	unsigned long sagaw;
-	int agaw = -1;
-
-	sagaw = cap_sagaw(iommu->cap);
-	for (agaw = width_to_agaw(max_gaw);
-	     agaw >= 0; agaw--) {
-		if (test_bit(agaw, &sagaw))
-			break;
-	}
-
-	return agaw;
-}
-
-/*
- * Calculate max SAGAW for each iommu.
- */
-int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
-{
-	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
-}
-
-/*
- * calculate agaw for each iommu.
- * "SAGAW" may be different across iommus, use a default agaw, and
- * get a supported less agaw for iommus that don't support the default agaw.
- */
-int iommu_calculate_agaw(struct intel_iommu *iommu)
-{
-	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
-}
-
-/* This functionin only returns single iommu in a domain */
-static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
-{
-	int iommu_id;
-
-	/* si_domain and vm domain should not get here. */
-	BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
-	BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
-
-	iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
-	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
-		return NULL;
-
-	return g_iommus[iommu_id];
-}
-
-static void domain_update_iommu_coherency(struct dmar_domain *domain)
-{
-	int i;
-
-	domain->iommu_coherency = 1;
-
-	for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
-		if (!ecap_coherent(g_iommus[i]->ecap)) {
-			domain->iommu_coherency = 0;
-			break;
-		}
-	}
-}
-
-static void domain_update_iommu_snooping(struct dmar_domain *domain)
-{
-	int i;
-
-	domain->iommu_snooping = 1;
-
-	for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
-		if (!ecap_sc_support(g_iommus[i]->ecap)) {
-			domain->iommu_snooping = 0;
-			break;
-		}
-	}
-}
-
-static void domain_update_iommu_superpage(struct dmar_domain *domain)
-{
-	int i, mask = 0xf;
-
-	if (!intel_iommu_superpage) {
-		domain->iommu_superpage = 0;
-		return;
-	}
-
-	domain->iommu_superpage = 4; /* 1TiB */
-
-	for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
-		mask |= cap_super_page_val(g_iommus[i]->cap);
-		if (!mask) {
-			break;
-		}
-	}
-	domain->iommu_superpage = fls(mask);
-}
-
-/* Some capabilities may be different across iommus */
-static void domain_update_iommu_cap(struct dmar_domain *domain)
-{
-	domain_update_iommu_coherency(domain);
-	domain_update_iommu_snooping(domain);
-	domain_update_iommu_superpage(domain);
-}
-
-static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
-{
-	struct dmar_drhd_unit *drhd = NULL;
-	int i;
-
-	for_each_drhd_unit(drhd) {
-		if (drhd->ignored)
-			continue;
-		if (segment != drhd->segment)
-			continue;
-
-		for (i = 0; i < drhd->devices_cnt; i++) {
-			if (drhd->devices[i] &&
-			    drhd->devices[i]->bus->number == bus &&
-			    drhd->devices[i]->devfn == devfn)
-				return drhd->iommu;
-			if (drhd->devices[i] &&
-			    drhd->devices[i]->subordinate &&
-			    drhd->devices[i]->subordinate->number <= bus &&
-			    drhd->devices[i]->subordinate->subordinate >= bus)
-				return drhd->iommu;
-		}
-
-		if (drhd->include_all)
-			return drhd->iommu;
-	}
-
-	return NULL;
-}
-
-static void domain_flush_cache(struct dmar_domain *domain,
-			       void *addr, int size)
-{
-	if (!domain->iommu_coherency)
-		clflush_cache_range(addr, size);
-}
-
-/* Gets context entry for a given bus and devfn */
-static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
-		u8 bus, u8 devfn)
-{
-	struct root_entry *root;
-	struct context_entry *context;
-	unsigned long phy_addr;
-	unsigned long flags;
-
-	spin_lock_irqsave(&iommu->lock, flags);
-	root = &iommu->root_entry[bus];
-	context = get_context_addr_from_root(root);
-	if (!context) {
-		context = (struct context_entry *)
-				alloc_pgtable_page(iommu->node);
-		if (!context) {
-			spin_unlock_irqrestore(&iommu->lock, flags);
-			return NULL;
-		}
-		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
-		phy_addr = virt_to_phys((void *)context);
-		set_root_value(root, phy_addr);
-		set_root_present(root);
-		__iommu_flush_cache(iommu, root, sizeof(*root));
-	}
-	spin_unlock_irqrestore(&iommu->lock, flags);
-	return &context[devfn];
-}
-
-static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
-{
-	struct root_entry *root;
-	struct context_entry *context;
-	int ret;
-	unsigned long flags;
-
-	spin_lock_irqsave(&iommu->lock, flags);
-	root = &iommu->root_entry[bus];
-	context = get_context_addr_from_root(root);
-	if (!context) {
-		ret = 0;
-		goto out;
-	}
-	ret = context_present(&context[devfn]);
-out:
-	spin_unlock_irqrestore(&iommu->lock, flags);
-	return ret;
-}
-
-static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
-{
-	struct root_entry *root;
-	struct context_entry *context;
-	unsigned long flags;
-
-	spin_lock_irqsave(&iommu->lock, flags);
-	root = &iommu->root_entry[bus];
-	context = get_context_addr_from_root(root);
-	if (context) {
-		context_clear_entry(&context[devfn]);
-		__iommu_flush_cache(iommu, &context[devfn], \
-			sizeof(*context));
-	}
-	spin_unlock_irqrestore(&iommu->lock, flags);
-}
-
-static void free_context_table(struct intel_iommu *iommu)
-{
-	struct root_entry *root;
-	int i;
-	unsigned long flags;
-	struct context_entry *context;
-
-	spin_lock_irqsave(&iommu->lock, flags);
-	if (!iommu->root_entry) {
-		goto out;
-	}
-	for (i = 0; i < ROOT_ENTRY_NR; i++) {
-		root = &iommu->root_entry[i];
-		context = get_context_addr_from_root(root);
-		if (context)
-			free_pgtable_page(context);
-	}
-	free_pgtable_page(iommu->root_entry);
-	iommu->root_entry = NULL;
-out:
-	spin_unlock_irqrestore(&iommu->lock, flags);
-}
-
-static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
-				      unsigned long pfn, int large_level)
-{
-	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
-	struct dma_pte *parent, *pte = NULL;
-	int level = agaw_to_level(domain->agaw);
-	int offset, target_level;
-
-	BUG_ON(!domain->pgd);
-	BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
-	parent = domain->pgd;
-
-	/* Search pte */
-	if (!large_level)
-		target_level = 1;
-	else
-		target_level = large_level;
-
-	while (level > 0) {
-		void *tmp_page;
-
-		offset = pfn_level_offset(pfn, level);
-		pte = &parent[offset];
-		if (!large_level && (pte->val & DMA_PTE_LARGE_PAGE))
-			break;
-		if (level == target_level)
-			break;
-
-		if (!dma_pte_present(pte)) {
-			uint64_t pteval;
-
-			tmp_page = alloc_pgtable_page(domain->nid);
-
-			if (!tmp_page)
-				return NULL;
-
-			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
-			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
-			if (cmpxchg64(&pte->val, 0ULL, pteval)) {
-				/* Someone else set it while we were thinking; use theirs. */
-				free_pgtable_page(tmp_page);
-			} else {
-				dma_pte_addr(pte);
-				domain_flush_cache(domain, pte, sizeof(*pte));
-			}
-		}
-		parent = phys_to_virt(dma_pte_addr(pte));
-		level--;
-	}
-
-	return pte;
-}
-
-
-/* return address's pte at specific level */
-static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
-					 unsigned long pfn,
-					 int level, int *large_page)
-{
-	struct dma_pte *parent, *pte = NULL;
-	int total = agaw_to_level(domain->agaw);
-	int offset;
-
-	parent = domain->pgd;
-	while (level <= total) {
-		offset = pfn_level_offset(pfn, total);
-		pte = &parent[offset];
-		if (level == total)
-			return pte;
-
-		if (!dma_pte_present(pte)) {
-			*large_page = total;
-			break;
-		}
-
-		if (pte->val & DMA_PTE_LARGE_PAGE) {
-			*large_page = total;
-			return pte;
-		}
-
-		parent = phys_to_virt(dma_pte_addr(pte));
-		total--;
-	}
-	return NULL;
-}
-
-/* clear last level pte, a tlb flush should be followed */
-static void dma_pte_clear_range(struct dmar_domain *domain,
-				unsigned long start_pfn,
-				unsigned long last_pfn)
-{
-	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
-	unsigned int large_page = 1;
-	struct dma_pte *first_pte, *pte;
-
-	BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
-	BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
-	BUG_ON(start_pfn > last_pfn);
-
-	/* we don't need lock here; nobody else touches the iova range */
-	do {
-		large_page = 1;
-		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
-		if (!pte) {
-			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
-			continue;
-		}
-		do {
-			dma_clear_pte(pte);
-			start_pfn += lvl_to_nr_pages(large_page);
-			pte++;
-		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
-
-		domain_flush_cache(domain, first_pte,
-				   (void *)pte - (void *)first_pte);
-
-	} while (start_pfn && start_pfn <= last_pfn);
-}
-
-/* free page table pages. last level pte should already be cleared */
-static void dma_pte_free_pagetable(struct dmar_domain *domain,
-				   unsigned long start_pfn,
-				   unsigned long last_pfn)
-{
-	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
-	struct dma_pte *first_pte, *pte;
-	int total = agaw_to_level(domain->agaw);
-	int level;
-	unsigned long tmp;
-	int large_page = 2;
-
-	BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
-	BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
-	BUG_ON(start_pfn > last_pfn);
-
-	/* We don't need lock here; nobody else touches the iova range */
-	level = 2;
-	while (level <= total) {
-		tmp = align_to_level(start_pfn, level);
-
-		/* If we can't even clear one PTE at this level, we're done */
-		if (tmp + level_size(level) - 1 > last_pfn)
-			return;
-
-		do {
-			large_page = level;
-			first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
-			if (large_page > level)
-				level = large_page + 1;
-			if (!pte) {
-				tmp = align_to_level(tmp + 1, level + 1);
-				continue;
-			}
-			do {
-				if (dma_pte_present(pte)) {
-					free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
-					dma_clear_pte(pte);
-				}
-				pte++;
-				tmp += level_size(level);
-			} while (!first_pte_in_page(pte) &&
-				 tmp + level_size(level) - 1 <= last_pfn);
-
-			domain_flush_cache(domain, first_pte,
-					   (void *)pte - (void *)first_pte);
-			
-		} while (tmp && tmp + level_size(level) - 1 <= last_pfn);
-		level++;
-	}
-	/* free pgd */
-	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
-		free_pgtable_page(domain->pgd);
-		domain->pgd = NULL;
-	}
-}
-
-/* iommu handling */
-static int iommu_alloc_root_entry(struct intel_iommu *iommu)
-{
-	struct root_entry *root;
-	unsigned long flags;
-
-	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
-	if (!root)
-		return -ENOMEM;
-
-	__iommu_flush_cache(iommu, root, ROOT_SIZE);
-
-	spin_lock_irqsave(&iommu->lock, flags);
-	iommu->root_entry = root;
-	spin_unlock_irqrestore(&iommu->lock, flags);
-
-	return 0;
-}
-
-static void iommu_set_root_entry(struct intel_iommu *iommu)
-{
-	void *addr;
-	u32 sts;
-	unsigned long flag;
-
-	addr = iommu->root_entry;
-
-	spin_lock_irqsave(&iommu->register_lock, flag);
-	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
-
-	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
-
-	/* Make sure hardware complete it */
-	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
-		      readl, (sts & DMA_GSTS_RTPS), sts);
-
-	spin_unlock_irqrestore(&iommu->register_lock, flag);
-}
-
-static void iommu_flush_write_buffer(struct intel_iommu *iommu)
-{
-	u32 val;
-	unsigned long flag;
-
-	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
-		return;
-
-	spin_lock_irqsave(&iommu->register_lock, flag);
-	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
-
-	/* Make sure hardware complete it */
-	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
-		      readl, (!(val & DMA_GSTS_WBFS)), val);
-
-	spin_unlock_irqrestore(&iommu->register_lock, flag);
-}
-
-/* return value determine if we need a write buffer flush */
-static void __iommu_flush_context(struct intel_iommu *iommu,
-				  u16 did, u16 source_id, u8 function_mask,
-				  u64 type)
-{
-	u64 val = 0;
-	unsigned long flag;
-
-	switch (type) {
-	case DMA_CCMD_GLOBAL_INVL:
-		val = DMA_CCMD_GLOBAL_INVL;
-		break;
-	case DMA_CCMD_DOMAIN_INVL:
-		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
-		break;
-	case DMA_CCMD_DEVICE_INVL:
-		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
-			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
-		break;
-	default:
-		BUG();
-	}
-	val |= DMA_CCMD_ICC;
-
-	spin_lock_irqsave(&iommu->register_lock, flag);
-	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
-
-	/* Make sure hardware complete it */
-	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
-		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
-
-	spin_unlock_irqrestore(&iommu->register_lock, flag);
-}
-
-/* return value determine if we need a write buffer flush */
-static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
-				u64 addr, unsigned int size_order, u64 type)
-{
-	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
-	u64 val = 0, val_iva = 0;
-	unsigned long flag;
-
-	switch (type) {
-	case DMA_TLB_GLOBAL_FLUSH:
-		/* global flush doesn't need set IVA_REG */
-		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
-		break;
-	case DMA_TLB_DSI_FLUSH:
-		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
-		break;
-	case DMA_TLB_PSI_FLUSH:
-		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
-		/* Note: always flush non-leaf currently */
-		val_iva = size_order | addr;
-		break;
-	default:
-		BUG();
-	}
-	/* Note: set drain read/write */
-#if 0
-	/*
-	 * This is probably to be super secure.. Looks like we can
-	 * ignore it without any impact.
-	 */
-	if (cap_read_drain(iommu->cap))
-		val |= DMA_TLB_READ_DRAIN;
-#endif
-	if (cap_write_drain(iommu->cap))
-		val |= DMA_TLB_WRITE_DRAIN;
-
-	spin_lock_irqsave(&iommu->register_lock, flag);
-	/* Note: Only uses first TLB reg currently */
-	if (val_iva)
-		dmar_writeq(iommu->reg + tlb_offset, val_iva);
-	dmar_writeq(iommu->reg + tlb_offset + 8, val);
-
-	/* Make sure hardware complete it */
-	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
-		dmar_readq, (!(val & DMA_TLB_IVT)), val);
-
-	spin_unlock_irqrestore(&iommu->register_lock, flag);
-
-	/* check IOTLB invalidation granularity */
-	if (DMA_TLB_IAIG(val) == 0)
-		printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
-	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
-		pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
-			(unsigned long long)DMA_TLB_IIRG(type),
-			(unsigned long long)DMA_TLB_IAIG(val));
-}
-
-static struct device_domain_info *iommu_support_dev_iotlb(
-	struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
-{
-	int found = 0;
-	unsigned long flags;
-	struct device_domain_info *info;
-	struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
-
-	if (!ecap_dev_iotlb_support(iommu->ecap))
-		return NULL;
-
-	if (!iommu->qi)
-		return NULL;
-
-	spin_lock_irqsave(&device_domain_lock, flags);
-	list_for_each_entry(info, &domain->devices, link)
-		if (info->bus == bus && info->devfn == devfn) {
-			found = 1;
-			break;
-		}
-	spin_unlock_irqrestore(&device_domain_lock, flags);
-
-	if (!found || !info->dev)
-		return NULL;
-
-	if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
-		return NULL;
-
-	if (!dmar_find_matched_atsr_unit(info->dev))
-		return NULL;
-
-	info->iommu = iommu;
-
-	return info;
-}
-
-static void iommu_enable_dev_iotlb(struct device_domain_info *info)
-{
-	if (!info)
-		return;
-
-	pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
-}
-
-static void iommu_disable_dev_iotlb(struct device_domain_info *info)
-{
-	if (!info->dev || !pci_ats_enabled(info->dev))
-		return;
-
-	pci_disable_ats(info->dev);
-}
-
-static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
-				  u64 addr, unsigned mask)
-{
-	u16 sid, qdep;
-	unsigned long flags;
-	struct device_domain_info *info;
-
-	spin_lock_irqsave(&device_domain_lock, flags);
-	list_for_each_entry(info, &domain->devices, link) {
-		if (!info->dev || !pci_ats_enabled(info->dev))
-			continue;
-
-		sid = info->bus << 8 | info->devfn;
-		qdep = pci_ats_queue_depth(info->dev);
-		qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
-	}
-	spin_unlock_irqrestore(&device_domain_lock, flags);
-}
-
-static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
-				  unsigned long pfn, unsigned int pages, int map)
-{
-	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
-	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
-
-	BUG_ON(pages == 0);
-
-	/*
-	 * Fallback to domain selective flush if no PSI support or the size is
-	 * too big.
-	 * PSI requires page size to be 2 ^ x, and the base address is naturally
-	 * aligned to the size
-	 */
-	if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
-		iommu->flush.flush_iotlb(iommu, did, 0, 0,
-						DMA_TLB_DSI_FLUSH);
-	else
-		iommu->flush.flush_iotlb(iommu, did, addr, mask,
-						DMA_TLB_PSI_FLUSH);
-
-	/*
-	 * In caching mode, changes of pages from non-present to present require
-	 * flush. However, device IOTLB doesn't need to be flushed in this case.
-	 */
-	if (!cap_caching_mode(iommu->cap) || !map)
-		iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
-}
-
-static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
-{
-	u32 pmen;
-	unsigned long flags;
-
-	spin_lock_irqsave(&iommu->register_lock, flags);
-	pmen = readl(iommu->reg + DMAR_PMEN_REG);
-	pmen &= ~DMA_PMEN_EPM;
-	writel(pmen, iommu->reg + DMAR_PMEN_REG);
-
-	/* wait for the protected region status bit to clear */
-	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
-		readl, !(pmen & DMA_PMEN_PRS), pmen);
-
-	spin_unlock_irqrestore(&iommu->register_lock, flags);
-}
-
-static int iommu_enable_translation(struct intel_iommu *iommu)
-{
-	u32 sts;
-	unsigned long flags;
-
-	spin_lock_irqsave(&iommu->register_lock, flags);
-	iommu->gcmd |= DMA_GCMD_TE;
-	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
-
-	/* Make sure hardware complete it */
-	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
-		      readl, (sts & DMA_GSTS_TES), sts);
-
-	spin_unlock_irqrestore(&iommu->register_lock, flags);
-	return 0;
-}
-
-static int iommu_disable_translation(struct intel_iommu *iommu)
-{
-	u32 sts;
-	unsigned long flag;
-
-	spin_lock_irqsave(&iommu->register_lock, flag);
-	iommu->gcmd &= ~DMA_GCMD_TE;
-	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
-
-	/* Make sure hardware complete it */
-	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
-		      readl, (!(sts & DMA_GSTS_TES)), sts);
-
-	spin_unlock_irqrestore(&iommu->register_lock, flag);
-	return 0;
-}
-
-
-static int iommu_init_domains(struct intel_iommu *iommu)
-{
-	unsigned long ndomains;
-	unsigned long nlongs;
-
-	ndomains = cap_ndoms(iommu->cap);
-	pr_debug("IOMMU %d: Number of Domains supportd <%ld>\n", iommu->seq_id,
-			ndomains);
-	nlongs = BITS_TO_LONGS(ndomains);
-
-	spin_lock_init(&iommu->lock);
-
-	/* TBD: there might be 64K domains,
-	 * consider other allocation for future chip
-	 */
-	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
-	if (!iommu->domain_ids) {
-		printk(KERN_ERR "Allocating domain id array failed\n");
-		return -ENOMEM;
-	}
-	iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
-			GFP_KERNEL);
-	if (!iommu->domains) {
-		printk(KERN_ERR "Allocating domain array failed\n");
-		return -ENOMEM;
-	}
-
-	/*
-	 * if Caching mode is set, then invalid translations are tagged
-	 * with domainid 0. Hence we need to pre-allocate it.
-	 */
-	if (cap_caching_mode(iommu->cap))
-		set_bit(0, iommu->domain_ids);
-	return 0;
-}
-
-
-static void domain_exit(struct dmar_domain *domain);
-static void vm_domain_exit(struct dmar_domain *domain);
-
-void free_dmar_iommu(struct intel_iommu *iommu)
-{
-	struct dmar_domain *domain;
-	int i;
-	unsigned long flags;
-
-	if ((iommu->domains) && (iommu->domain_ids)) {
-		for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
-			domain = iommu->domains[i];
-			clear_bit(i, iommu->domain_ids);
-
-			spin_lock_irqsave(&domain->iommu_lock, flags);
-			if (--domain->iommu_count == 0) {
-				if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
-					vm_domain_exit(domain);
-				else
-					domain_exit(domain);
-			}
-			spin_unlock_irqrestore(&domain->iommu_lock, flags);
-		}
-	}
-
-	if (iommu->gcmd & DMA_GCMD_TE)
-		iommu_disable_translation(iommu);
-
-	if (iommu->irq) {
-		irq_set_handler_data(iommu->irq, NULL);
-		/* This will mask the irq */
-		free_irq(iommu->irq, iommu);
-		destroy_irq(iommu->irq);
-	}
-
-	kfree(iommu->domains);
-	kfree(iommu->domain_ids);
-
-	g_iommus[iommu->seq_id] = NULL;
-
-	/* if all iommus are freed, free g_iommus */
-	for (i = 0; i < g_num_of_iommus; i++) {
-		if (g_iommus[i])
-			break;
-	}
-
-	if (i == g_num_of_iommus)
-		kfree(g_iommus);
-
-	/* free context mapping */
-	free_context_table(iommu);
-}
-
-static struct dmar_domain *alloc_domain(void)
-{
-	struct dmar_domain *domain;
-
-	domain = alloc_domain_mem();
-	if (!domain)
-		return NULL;
-
-	domain->nid = -1;
-	memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
-	domain->flags = 0;
-
-	return domain;
-}
-
-static int iommu_attach_domain(struct dmar_domain *domain,
-			       struct intel_iommu *iommu)
-{
-	int num;
-	unsigned long ndomains;
-	unsigned long flags;
-
-	ndomains = cap_ndoms(iommu->cap);
-
-	spin_lock_irqsave(&iommu->lock, flags);
-
-	num = find_first_zero_bit(iommu->domain_ids, ndomains);
-	if (num >= ndomains) {
-		spin_unlock_irqrestore(&iommu->lock, flags);
-		printk(KERN_ERR "IOMMU: no free domain ids\n");
-		return -ENOMEM;
-	}
-
-	domain->id = num;
-	set_bit(num, iommu->domain_ids);
-	set_bit(iommu->seq_id, &domain->iommu_bmp);
-	iommu->domains[num] = domain;
-	spin_unlock_irqrestore(&iommu->lock, flags);
-
-	return 0;
-}
-
-static void iommu_detach_domain(struct dmar_domain *domain,
-				struct intel_iommu *iommu)
-{
-	unsigned long flags;
-	int num, ndomains;
-	int found = 0;
-
-	spin_lock_irqsave(&iommu->lock, flags);
-	ndomains = cap_ndoms(iommu->cap);
-	for_each_set_bit(num, iommu->domain_ids, ndomains) {
-		if (iommu->domains[num] == domain) {
-			found = 1;
-			break;
-		}
-	}
-
-	if (found) {
-		clear_bit(num, iommu->domain_ids);
-		clear_bit(iommu->seq_id, &domain->iommu_bmp);
-		iommu->domains[num] = NULL;
-	}
-	spin_unlock_irqrestore(&iommu->lock, flags);
-}
-
-static struct iova_domain reserved_iova_list;
-static struct lock_class_key reserved_rbtree_key;
-
-static int dmar_init_reserved_ranges(void)
-{
-	struct pci_dev *pdev = NULL;
-	struct iova *iova;
-	int i;
-
-	init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
-
-	lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
-		&reserved_rbtree_key);
-
-	/* IOAPIC ranges shouldn't be accessed by DMA */
-	iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
-		IOVA_PFN(IOAPIC_RANGE_END));
-	if (!iova) {
-		printk(KERN_ERR "Reserve IOAPIC range failed\n");
-		return -ENODEV;
-	}
-
-	/* Reserve all PCI MMIO to avoid peer-to-peer access */
-	for_each_pci_dev(pdev) {
-		struct resource *r;
-
-		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
-			r = &pdev->resource[i];
-			if (!r->flags || !(r->flags & IORESOURCE_MEM))
-				continue;
-			iova = reserve_iova(&reserved_iova_list,
-					    IOVA_PFN(r->start),
-					    IOVA_PFN(r->end));
-			if (!iova) {
-				printk(KERN_ERR "Reserve iova failed\n");
-				return -ENODEV;
-			}
-		}
-	}
-	return 0;
-}
-
-static void domain_reserve_special_ranges(struct dmar_domain *domain)
-{
-	copy_reserved_iova(&reserved_iova_list, &domain->iovad);
-}
-
-static inline int guestwidth_to_adjustwidth(int gaw)
-{
-	int agaw;
-	int r = (gaw - 12) % 9;
-
-	if (r == 0)
-		agaw = gaw;
-	else
-		agaw = gaw + 9 - r;
-	if (agaw > 64)
-		agaw = 64;
-	return agaw;
-}
-
-static int domain_init(struct dmar_domain *domain, int guest_width)
-{
-	struct intel_iommu *iommu;
-	int adjust_width, agaw;
-	unsigned long sagaw;
-
-	init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
-	spin_lock_init(&domain->iommu_lock);
-
-	domain_reserve_special_ranges(domain);
-
-	/* calculate AGAW */
-	iommu = domain_get_iommu(domain);
-	if (guest_width > cap_mgaw(iommu->cap))
-		guest_width = cap_mgaw(iommu->cap);
-	domain->gaw = guest_width;
-	adjust_width = guestwidth_to_adjustwidth(guest_width);
-	agaw = width_to_agaw(adjust_width);
-	sagaw = cap_sagaw(iommu->cap);
-	if (!test_bit(agaw, &sagaw)) {
-		/* hardware doesn't support it, choose a bigger one */
-		pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
-		agaw = find_next_bit(&sagaw, 5, agaw);
-		if (agaw >= 5)
-			return -ENODEV;
-	}
-	domain->agaw = agaw;
-	INIT_LIST_HEAD(&domain->devices);
-
-	if (ecap_coherent(iommu->ecap))
-		domain->iommu_coherency = 1;
-	else
-		domain->iommu_coherency = 0;
-
-	if (ecap_sc_support(iommu->ecap))
-		domain->iommu_snooping = 1;
-	else
-		domain->iommu_snooping = 0;
-
-	domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
-	domain->iommu_count = 1;
-	domain->nid = iommu->node;
-
-	/* always allocate the top pgd */
-	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
-	if (!domain->pgd)
-		return -ENOMEM;
-	__iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
-	return 0;
-}
-
-static void domain_exit(struct dmar_domain *domain)
-{
-	struct dmar_drhd_unit *drhd;
-	struct intel_iommu *iommu;
-
-	/* Domain 0 is reserved, so dont process it */
-	if (!domain)
-		return;
-
-	/* Flush any lazy unmaps that may reference this domain */
-	if (!intel_iommu_strict)
-		flush_unmaps_timeout(0);
-
-	domain_remove_dev_info(domain);
-	/* destroy iovas */
-	put_iova_domain(&domain->iovad);
-
-	/* clear ptes */
-	dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
-
-	/* free page tables */
-	dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
-
-	for_each_active_iommu(iommu, drhd)
-		if (test_bit(iommu->seq_id, &domain->iommu_bmp))
-			iommu_detach_domain(domain, iommu);
-
-	free_domain_mem(domain);
-}
-
-static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
-				 u8 bus, u8 devfn, int translation)
-{
-	struct context_entry *context;
-	unsigned long flags;
-	struct intel_iommu *iommu;
-	struct dma_pte *pgd;
-	unsigned long num;
-	unsigned long ndomains;
-	int id;
-	int agaw;
-	struct device_domain_info *info = NULL;
-
-	pr_debug("Set context mapping for %02x:%02x.%d\n",
-		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
-
-	BUG_ON(!domain->pgd);
-	BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
-	       translation != CONTEXT_TT_MULTI_LEVEL);
-
-	iommu = device_to_iommu(segment, bus, devfn);
-	if (!iommu)
-		return -ENODEV;
-
-	context = device_to_context_entry(iommu, bus, devfn);
-	if (!context)
-		return -ENOMEM;
-	spin_lock_irqsave(&iommu->lock, flags);
-	if (context_present(context)) {
-		spin_unlock_irqrestore(&iommu->lock, flags);
-		return 0;
-	}
-
-	id = domain->id;
-	pgd = domain->pgd;
-
-	if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
-	    domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
-		int found = 0;
-
-		/* find an available domain id for this device in iommu */
-		ndomains = cap_ndoms(iommu->cap);
-		for_each_set_bit(num, iommu->domain_ids, ndomains) {
-			if (iommu->domains[num] == domain) {
-				id = num;
-				found = 1;
-				break;
-			}
-		}
-
-		if (found == 0) {
-			num = find_first_zero_bit(iommu->domain_ids, ndomains);
-			if (num >= ndomains) {
-				spin_unlock_irqrestore(&iommu->lock, flags);
-				printk(KERN_ERR "IOMMU: no free domain ids\n");
-				return -EFAULT;
-			}
-
-			set_bit(num, iommu->domain_ids);
-			iommu->domains[num] = domain;
-			id = num;
-		}
-
-		/* Skip top levels of page tables for
-		 * iommu which has less agaw than default.
-		 * Unnecessary for PT mode.
-		 */
-		if (translation != CONTEXT_TT_PASS_THROUGH) {
-			for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
-				pgd = phys_to_virt(dma_pte_addr(pgd));
-				if (!dma_pte_present(pgd)) {
-					spin_unlock_irqrestore(&iommu->lock, flags);
-					return -ENOMEM;
-				}
-			}
-		}
-	}
-
-	context_set_domain_id(context, id);
-
-	if (translation != CONTEXT_TT_PASS_THROUGH) {
-		info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
-		translation = info ? CONTEXT_TT_DEV_IOTLB :
-				     CONTEXT_TT_MULTI_LEVEL;
-	}
-	/*
-	 * In pass through mode, AW must be programmed to indicate the largest
-	 * AGAW value supported by hardware. And ASR is ignored by hardware.
-	 */
-	if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
-		context_set_address_width(context, iommu->msagaw);
-	else {
-		context_set_address_root(context, virt_to_phys(pgd));
-		context_set_address_width(context, iommu->agaw);
-	}
-
-	context_set_translation_type(context, translation);
-	context_set_fault_enable(context);
-	context_set_present(context);
-	domain_flush_cache(domain, context, sizeof(*context));
-
-	/*
-	 * It's a non-present to present mapping. If hardware doesn't cache
-	 * non-present entry we only need to flush the write-buffer. If the
-	 * _does_ cache non-present entries, then it does so in the special
-	 * domain #0, which we have to flush:
-	 */
-	if (cap_caching_mode(iommu->cap)) {
-		iommu->flush.flush_context(iommu, 0,
-					   (((u16)bus) << 8) | devfn,
-					   DMA_CCMD_MASK_NOBIT,
-					   DMA_CCMD_DEVICE_INVL);
-		iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
-	} else {
-		iommu_flush_write_buffer(iommu);
-	}
-	iommu_enable_dev_iotlb(info);
-	spin_unlock_irqrestore(&iommu->lock, flags);
-
-	spin_lock_irqsave(&domain->iommu_lock, flags);
-	if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
-		domain->iommu_count++;
-		if (domain->iommu_count == 1)
-			domain->nid = iommu->node;
-		domain_update_iommu_cap(domain);
-	}
-	spin_unlock_irqrestore(&domain->iommu_lock, flags);
-	return 0;
-}
-
-static int
-domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
-			int translation)
-{
-	int ret;
-	struct pci_dev *tmp, *parent;
-
-	ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
-					 pdev->bus->number, pdev->devfn,
-					 translation);
-	if (ret)
-		return ret;
-
-	/* dependent device mapping */
-	tmp = pci_find_upstream_pcie_bridge(pdev);
-	if (!tmp)
-		return 0;
-	/* Secondary interface's bus number and devfn 0 */
-	parent = pdev->bus->self;
-	while (parent != tmp) {
-		ret = domain_context_mapping_one(domain,
-						 pci_domain_nr(parent->bus),
-						 parent->bus->number,
-						 parent->devfn, translation);
-		if (ret)
-			return ret;
-		parent = parent->bus->self;
-	}
-	if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
-		return domain_context_mapping_one(domain,
-					pci_domain_nr(tmp->subordinate),
-					tmp->subordinate->number, 0,
-					translation);
-	else /* this is a legacy PCI bridge */
-		return domain_context_mapping_one(domain,
-						  pci_domain_nr(tmp->bus),
-						  tmp->bus->number,
-						  tmp->devfn,
-						  translation);
-}
-
-static int domain_context_mapped(struct pci_dev *pdev)
-{
-	int ret;
-	struct pci_dev *tmp, *parent;
-	struct intel_iommu *iommu;
-
-	iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
-				pdev->devfn);
-	if (!iommu)
-		return -ENODEV;
-
-	ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
-	if (!ret)
-		return ret;
-	/* dependent device mapping */
-	tmp = pci_find_upstream_pcie_bridge(pdev);
-	if (!tmp)
-		return ret;
-	/* Secondary interface's bus number and devfn 0 */
-	parent = pdev->bus->self;
-	while (parent != tmp) {
-		ret = device_context_mapped(iommu, parent->bus->number,
-					    parent->devfn);
-		if (!ret)
-			return ret;
-		parent = parent->bus->self;
-	}
-	if (pci_is_pcie(tmp))
-		return device_context_mapped(iommu, tmp->subordinate->number,
-					     0);
-	else
-		return device_context_mapped(iommu, tmp->bus->number,
-					     tmp->devfn);
-}
-
-/* Returns a number of VTD pages, but aligned to MM page size */
-static inline unsigned long aligned_nrpages(unsigned long host_addr,
-					    size_t size)
-{
-	host_addr &= ~PAGE_MASK;
-	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
-}
-
-/* Return largest possible superpage level for a given mapping */
-static inline int hardware_largepage_caps(struct dmar_domain *domain,
-					  unsigned long iov_pfn,
-					  unsigned long phy_pfn,
-					  unsigned long pages)
-{
-	int support, level = 1;
-	unsigned long pfnmerge;
-
-	support = domain->iommu_superpage;
-
-	/* To use a large page, the virtual *and* physical addresses
-	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
-	   of them will mean we have to use smaller pages. So just
-	   merge them and check both at once. */
-	pfnmerge = iov_pfn | phy_pfn;
-
-	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
-		pages >>= VTD_STRIDE_SHIFT;
-		if (!pages)
-			break;
-		pfnmerge >>= VTD_STRIDE_SHIFT;
-		level++;
-		support--;
-	}
-	return level;
-}
-
-static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
-			    struct scatterlist *sg, unsigned long phys_pfn,
-			    unsigned long nr_pages, int prot)
-{
-	struct dma_pte *first_pte = NULL, *pte = NULL;
-	phys_addr_t uninitialized_var(pteval);
-	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
-	unsigned long sg_res;
-	unsigned int largepage_lvl = 0;
-	unsigned long lvl_pages = 0;
-
-	BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
-
-	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
-		return -EINVAL;
-
-	prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
-
-	if (sg)
-		sg_res = 0;
-	else {
-		sg_res = nr_pages + 1;
-		pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
-	}
-
-	while (nr_pages > 0) {
-		uint64_t tmp;
-
-		if (!sg_res) {
-			sg_res = aligned_nrpages(sg->offset, sg->length);
-			sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
-			sg->dma_length = sg->length;
-			pteval = page_to_phys(sg_page(sg)) | prot;
-			phys_pfn = pteval >> VTD_PAGE_SHIFT;
-		}
-
-		if (!pte) {
-			largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
-
-			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
-			if (!pte)
-				return -ENOMEM;
-			/* It is large page*/
-			if (largepage_lvl > 1)
-				pteval |= DMA_PTE_LARGE_PAGE;
-			else
-				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
-
-		}
-		/* We don't need lock here, nobody else
-		 * touches the iova range
-		 */
-		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
-		if (tmp) {
-			static int dumps = 5;
-			printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
-			       iov_pfn, tmp, (unsigned long long)pteval);
-			if (dumps) {
-				dumps--;
-				debug_dma_dump_mappings(NULL);
-			}
-			WARN_ON(1);
-		}
-
-		lvl_pages = lvl_to_nr_pages(largepage_lvl);
-
-		BUG_ON(nr_pages < lvl_pages);
-		BUG_ON(sg_res < lvl_pages);
-
-		nr_pages -= lvl_pages;
-		iov_pfn += lvl_pages;
-		phys_pfn += lvl_pages;
-		pteval += lvl_pages * VTD_PAGE_SIZE;
-		sg_res -= lvl_pages;
-
-		/* If the next PTE would be the first in a new page, then we
-		   need to flush the cache on the entries we've just written.
-		   And then we'll need to recalculate 'pte', so clear it and
-		   let it get set again in the if (!pte) block above.
-
-		   If we're done (!nr_pages) we need to flush the cache too.
-
-		   Also if we've been setting superpages, we may need to
-		   recalculate 'pte' and switch back to smaller pages for the
-		   end of the mapping, if the trailing size is not enough to
-		   use another superpage (i.e. sg_res < lvl_pages). */
-		pte++;
-		if (!nr_pages || first_pte_in_page(pte) ||
-		    (largepage_lvl > 1 && sg_res < lvl_pages)) {
-			domain_flush_cache(domain, first_pte,
-					   (void *)pte - (void *)first_pte);
-			pte = NULL;
-		}
-
-		if (!sg_res && nr_pages)
-			sg = sg_next(sg);
-	}
-	return 0;
-}
-
-static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
-				    struct scatterlist *sg, unsigned long nr_pages,
-				    int prot)
-{
-	return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
-}
-
-static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
-				     unsigned long phys_pfn, unsigned long nr_pages,
-				     int prot)
-{
-	return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
-}
-
-static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
-{
-	if (!iommu)
-		return;
-
-	clear_context_table(iommu, bus, devfn);
-	iommu->flush.flush_context(iommu, 0, 0, 0,
-					   DMA_CCMD_GLOBAL_INVL);
-	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
-}
-
-static void domain_remove_dev_info(struct dmar_domain *domain)
-{
-	struct device_domain_info *info;
-	unsigned long flags;
-	struct intel_iommu *iommu;
-
-	spin_lock_irqsave(&device_domain_lock, flags);
-	while (!list_empty(&domain->devices)) {
-		info = list_entry(domain->devices.next,
-			struct device_domain_info, link);
-		list_del(&info->link);
-		list_del(&info->global);
-		if (info->dev)
-			info->dev->dev.archdata.iommu = NULL;
-		spin_unlock_irqrestore(&device_domain_lock, flags);
-
-		iommu_disable_dev_iotlb(info);
-		iommu = device_to_iommu(info->segment, info->bus, info->devfn);
-		iommu_detach_dev(iommu, info->bus, info->devfn);
-		free_devinfo_mem(info);
-
-		spin_lock_irqsave(&device_domain_lock, flags);
-	}
-	spin_unlock_irqrestore(&device_domain_lock, flags);
-}
-
-/*
- * find_domain
- * Note: we use struct pci_dev->dev.archdata.iommu stores the info
- */
-static struct dmar_domain *
-find_domain(struct pci_dev *pdev)
-{
-	struct device_domain_info *info;
-
-	/* No lock here, assumes no domain exit in normal case */
-	info = pdev->dev.archdata.iommu;
-	if (info)
-		return info->domain;
-	return NULL;
-}
-
-/* domain is initialized */
-static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
-{
-	struct dmar_domain *domain, *found = NULL;
-	struct intel_iommu *iommu;
-	struct dmar_drhd_unit *drhd;
-	struct device_domain_info *info, *tmp;
-	struct pci_dev *dev_tmp;
-	unsigned long flags;
-	int bus = 0, devfn = 0;
-	int segment;
-	int ret;
-
-	domain = find_domain(pdev);
-	if (domain)
-		return domain;
-
-	segment = pci_domain_nr(pdev->bus);
-
-	dev_tmp = pci_find_upstream_pcie_bridge(pdev);
-	if (dev_tmp) {
-		if (pci_is_pcie(dev_tmp)) {
-			bus = dev_tmp->subordinate->number;
-			devfn = 0;
-		} else {
-			bus = dev_tmp->bus->number;
-			devfn = dev_tmp->devfn;
-		}
-		spin_lock_irqsave(&device_domain_lock, flags);
-		list_for_each_entry(info, &device_domain_list, global) {
-			if (info->segment == segment &&
-			    info->bus == bus && info->devfn == devfn) {
-				found = info->domain;
-				break;
-			}
-		}
-		spin_unlock_irqrestore(&device_domain_lock, flags);
-		/* pcie-pci bridge already has a domain, uses it */
-		if (found) {
-			domain = found;
-			goto found_domain;
-		}
-	}
-
-	domain = alloc_domain();
-	if (!domain)
-		goto error;
-
-	/* Allocate new domain for the device */
-	drhd = dmar_find_matched_drhd_unit(pdev);
-	if (!drhd) {
-		printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
-			pci_name(pdev));
-		return NULL;
-	}
-	iommu = drhd->iommu;
-
-	ret = iommu_attach_domain(domain, iommu);
-	if (ret) {
-		free_domain_mem(domain);
-		goto error;
-	}
-
-	if (domain_init(domain, gaw)) {
-		domain_exit(domain);
-		goto error;
-	}
-
-	/* register pcie-to-pci device */
-	if (dev_tmp) {
-		info = alloc_devinfo_mem();
-		if (!info) {
-			domain_exit(domain);
-			goto error;
-		}
-		info->segment = segment;
-		info->bus = bus;
-		info->devfn = devfn;
-		info->dev = NULL;
-		info->domain = domain;
-		/* This domain is shared by devices under p2p bridge */
-		domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
-
-		/* pcie-to-pci bridge already has a domain, uses it */
-		found = NULL;
-		spin_lock_irqsave(&device_domain_lock, flags);
-		list_for_each_entry(tmp, &device_domain_list, global) {
-			if (tmp->segment == segment &&
-			    tmp->bus == bus && tmp->devfn == devfn) {
-				found = tmp->domain;
-				break;
-			}
-		}
-		if (found) {
-			spin_unlock_irqrestore(&device_domain_lock, flags);
-			free_devinfo_mem(info);
-			domain_exit(domain);
-			domain = found;
-		} else {
-			list_add(&info->link, &domain->devices);
-			list_add(&info->global, &device_domain_list);
-			spin_unlock_irqrestore(&device_domain_lock, flags);
-		}
-	}
-
-found_domain:
-	info = alloc_devinfo_mem();
-	if (!info)
-		goto error;
-	info->segment = segment;
-	info->bus = pdev->bus->number;
-	info->devfn = pdev->devfn;
-	info->dev = pdev;
-	info->domain = domain;
-	spin_lock_irqsave(&device_domain_lock, flags);
-	/* somebody is fast */
-	found = find_domain(pdev);
-	if (found != NULL) {
-		spin_unlock_irqrestore(&device_domain_lock, flags);
-		if (found != domain) {
-			domain_exit(domain);
-			domain = found;
-		}
-		free_devinfo_mem(info);
-		return domain;
-	}
-	list_add(&info->link, &domain->devices);
-	list_add(&info->global, &device_domain_list);
-	pdev->dev.archdata.iommu = info;
-	spin_unlock_irqrestore(&device_domain_lock, flags);
-	return domain;
-error:
-	/* recheck it here, maybe others set it */
-	return find_domain(pdev);
-}
-
-static int iommu_identity_mapping;
-#define IDENTMAP_ALL		1
-#define IDENTMAP_GFX		2
-#define IDENTMAP_AZALIA		4
-
-static int iommu_domain_identity_map(struct dmar_domain *domain,
-				     unsigned long long start,
-				     unsigned long long end)
-{
-	unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
-	unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
-
-	if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
-			  dma_to_mm_pfn(last_vpfn))) {
-		printk(KERN_ERR "IOMMU: reserve iova failed\n");
-		return -ENOMEM;
-	}
-
-	pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
-		 start, end, domain->id);
-	/*
-	 * RMRR range might have overlap with physical memory range,
-	 * clear it first
-	 */
-	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
-
-	return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
-				  last_vpfn - first_vpfn + 1,
-				  DMA_PTE_READ|DMA_PTE_WRITE);
-}
-
-static int iommu_prepare_identity_map(struct pci_dev *pdev,
-				      unsigned long long start,
-				      unsigned long long end)
-{
-	struct dmar_domain *domain;
-	int ret;
-
-	domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
-	if (!domain)
-		return -ENOMEM;
-
-	/* For _hardware_ passthrough, don't bother. But for software
-	   passthrough, we do it anyway -- it may indicate a memory
-	   range which is reserved in E820, so which didn't get set
-	   up to start with in si_domain */
-	if (domain == si_domain && hw_pass_through) {
-		printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
-		       pci_name(pdev), start, end);
-		return 0;
-	}
-
-	printk(KERN_INFO
-	       "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
-	       pci_name(pdev), start, end);
-	
-	if (end < start) {
-		WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
-			"BIOS vendor: %s; Ver: %s; Product Version: %s\n",
-			dmi_get_system_info(DMI_BIOS_VENDOR),
-			dmi_get_system_info(DMI_BIOS_VERSION),
-		     dmi_get_system_info(DMI_PRODUCT_VERSION));
-		ret = -EIO;
-		goto error;
-	}
-
-	if (end >> agaw_to_width(domain->agaw)) {
-		WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
-		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
-		     agaw_to_width(domain->agaw),
-		     dmi_get_system_info(DMI_BIOS_VENDOR),
-		     dmi_get_system_info(DMI_BIOS_VERSION),
-		     dmi_get_system_info(DMI_PRODUCT_VERSION));
-		ret = -EIO;
-		goto error;
-	}
-
-	ret = iommu_domain_identity_map(domain, start, end);
-	if (ret)
-		goto error;
-
-	/* context entry init */
-	ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
-	if (ret)
-		goto error;
-
-	return 0;
-
- error:
-	domain_exit(domain);
-	return ret;
-}
-
-static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
-	struct pci_dev *pdev)
-{
-	if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
-		return 0;
-	return iommu_prepare_identity_map(pdev, rmrr->base_address,
-		rmrr->end_address);
-}
-
-#ifdef CONFIG_DMAR_FLOPPY_WA
-static inline void iommu_prepare_isa(void)
-{
-	struct pci_dev *pdev;
-	int ret;
-
-	pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
-	if (!pdev)
-		return;
-
-	printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
-	ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
-
-	if (ret)
-		printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
-		       "floppy might not work\n");
-
-}
-#else
-static inline void iommu_prepare_isa(void)
-{
-	return;
-}
-#endif /* !CONFIG_DMAR_FLPY_WA */
-
-static int md_domain_init(struct dmar_domain *domain, int guest_width);
-
-static int __init si_domain_work_fn(unsigned long start_pfn,
-				    unsigned long end_pfn, void *datax)
-{
-	int *ret = datax;
-
-	*ret = iommu_domain_identity_map(si_domain,
-					 (uint64_t)start_pfn << PAGE_SHIFT,
-					 (uint64_t)end_pfn << PAGE_SHIFT);
-	return *ret;
-
-}
-
-static int __init si_domain_init(int hw)
-{
-	struct dmar_drhd_unit *drhd;
-	struct intel_iommu *iommu;
-	int nid, ret = 0;
-
-	si_domain = alloc_domain();
-	if (!si_domain)
-		return -EFAULT;
-
-	pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
-
-	for_each_active_iommu(iommu, drhd) {
-		ret = iommu_attach_domain(si_domain, iommu);
-		if (ret) {
-			domain_exit(si_domain);
-			return -EFAULT;
-		}
-	}
-
-	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
-		domain_exit(si_domain);
-		return -EFAULT;
-	}
-
-	si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
-
-	if (hw)
-		return 0;
-
-	for_each_online_node(nid) {
-		work_with_active_regions(nid, si_domain_work_fn, &ret);
-		if (ret)
-			return ret;
-	}
-
-	return 0;
-}
-
-static void domain_remove_one_dev_info(struct dmar_domain *domain,
-					  struct pci_dev *pdev);
-static int identity_mapping(struct pci_dev *pdev)
-{
-	struct device_domain_info *info;
-
-	if (likely(!iommu_identity_mapping))
-		return 0;
-
-	info = pdev->dev.archdata.iommu;
-	if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
-		return (info->domain == si_domain);
-
-	return 0;
-}
-
-static int domain_add_dev_info(struct dmar_domain *domain,
-			       struct pci_dev *pdev,
-			       int translation)
-{
-	struct device_domain_info *info;
-	unsigned long flags;
-	int ret;
-
-	info = alloc_devinfo_mem();
-	if (!info)
-		return -ENOMEM;
-
-	ret = domain_context_mapping(domain, pdev, translation);
-	if (ret) {
-		free_devinfo_mem(info);
-		return ret;
-	}
-
-	info->segment = pci_domain_nr(pdev->bus);
-	info->bus = pdev->bus->number;
-	info->devfn = pdev->devfn;
-	info->dev = pdev;
-	info->domain = domain;
-
-	spin_lock_irqsave(&device_domain_lock, flags);
-	list_add(&info->link, &domain->devices);
-	list_add(&info->global, &device_domain_list);
-	pdev->dev.archdata.iommu = info;
-	spin_unlock_irqrestore(&device_domain_lock, flags);
-
-	return 0;
-}
-
-static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
-{
-	if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
-		return 1;
-
-	if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
-		return 1;
-
-	if (!(iommu_identity_mapping & IDENTMAP_ALL))
-		return 0;
-
-	/*
-	 * We want to start off with all devices in the 1:1 domain, and
-	 * take them out later if we find they can't access all of memory.
-	 *
-	 * However, we can't do this for PCI devices behind bridges,
-	 * because all PCI devices behind the same bridge will end up
-	 * with the same source-id on their transactions.
-	 *
-	 * Practically speaking, we can't change things around for these
-	 * devices at run-time, because we can't be sure there'll be no
-	 * DMA transactions in flight for any of their siblings.
-	 * 
-	 * So PCI devices (unless they're on the root bus) as well as
-	 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
-	 * the 1:1 domain, just in _case_ one of their siblings turns out
-	 * not to be able to map all of memory.
-	 */
-	if (!pci_is_pcie(pdev)) {
-		if (!pci_is_root_bus(pdev->bus))
-			return 0;
-		if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
-			return 0;
-	} else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
-		return 0;
-
-	/* 
-	 * At boot time, we don't yet know if devices will be 64-bit capable.
-	 * Assume that they will -- if they turn out not to be, then we can 
-	 * take them out of the 1:1 domain later.
-	 */
-	if (!startup) {
-		/*
-		 * If the device's dma_mask is less than the system's memory
-		 * size then this is not a candidate for identity mapping.
-		 */
-		u64 dma_mask = pdev->dma_mask;
-
-		if (pdev->dev.coherent_dma_mask &&
-		    pdev->dev.coherent_dma_mask < dma_mask)
-			dma_mask = pdev->dev.coherent_dma_mask;
-
-		return dma_mask >= dma_get_required_mask(&pdev->dev);
-	}
-
-	return 1;
-}
-
-static int __init iommu_prepare_static_identity_mapping(int hw)
-{
-	struct pci_dev *pdev = NULL;
-	int ret;
-
-	ret = si_domain_init(hw);
-	if (ret)
-		return -EFAULT;
-
-	for_each_pci_dev(pdev) {
-		/* Skip Host/PCI Bridge devices */
-		if (IS_BRIDGE_HOST_DEVICE(pdev))
-			continue;
-		if (iommu_should_identity_map(pdev, 1)) {
-			printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
-			       hw ? "hardware" : "software", pci_name(pdev));
-
-			ret = domain_add_dev_info(si_domain, pdev,
-						     hw ? CONTEXT_TT_PASS_THROUGH :
-						     CONTEXT_TT_MULTI_LEVEL);
-			if (ret)
-				return ret;
-		}
-	}
-
-	return 0;
-}
-
-static int __init init_dmars(void)
-{
-	struct dmar_drhd_unit *drhd;
-	struct dmar_rmrr_unit *rmrr;
-	struct pci_dev *pdev;
-	struct intel_iommu *iommu;
-	int i, ret;
-
-	/*
-	 * for each drhd
-	 *    allocate root
-	 *    initialize and program root entry to not present
-	 * endfor
-	 */
-	for_each_drhd_unit(drhd) {
-		g_num_of_iommus++;
-		/*
-		 * lock not needed as this is only incremented in the single
-		 * threaded kernel __init code path all other access are read
-		 * only
-		 */
-	}
-
-	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
-			GFP_KERNEL);
-	if (!g_iommus) {
-		printk(KERN_ERR "Allocating global iommu array failed\n");
-		ret = -ENOMEM;
-		goto error;
-	}
-
-	deferred_flush = kzalloc(g_num_of_iommus *
-		sizeof(struct deferred_flush_tables), GFP_KERNEL);
-	if (!deferred_flush) {
-		ret = -ENOMEM;
-		goto error;
-	}
-
-	for_each_drhd_unit(drhd) {
-		if (drhd->ignored)
-			continue;
-
-		iommu = drhd->iommu;
-		g_iommus[iommu->seq_id] = iommu;
-
-		ret = iommu_init_domains(iommu);
-		if (ret)
-			goto error;
-
-		/*
-		 * TBD:
-		 * we could share the same root & context tables
-		 * among all IOMMU's. Need to Split it later.
-		 */
-		ret = iommu_alloc_root_entry(iommu);
-		if (ret) {
-			printk(KERN_ERR "IOMMU: allocate root entry failed\n");
-			goto error;
-		}
-		if (!ecap_pass_through(iommu->ecap))
-			hw_pass_through = 0;
-	}
-
-	/*
-	 * Start from the sane iommu hardware state.
-	 */
-	for_each_drhd_unit(drhd) {
-		if (drhd->ignored)
-			continue;
-
-		iommu = drhd->iommu;
-
-		/*
-		 * If the queued invalidation is already initialized by us
-		 * (for example, while enabling interrupt-remapping) then
-		 * we got the things already rolling from a sane state.
-		 */
-		if (iommu->qi)
-			continue;
-
-		/*
-		 * Clear any previous faults.
-		 */
-		dmar_fault(-1, iommu);
-		/*
-		 * Disable queued invalidation if supported and already enabled
-		 * before OS handover.
-		 */
-		dmar_disable_qi(iommu);
-	}
-
-	for_each_drhd_unit(drhd) {
-		if (drhd->ignored)
-			continue;
-
-		iommu = drhd->iommu;
-
-		if (dmar_enable_qi(iommu)) {
-			/*
-			 * Queued Invalidate not enabled, use Register Based
-			 * Invalidate
-			 */
-			iommu->flush.flush_context = __iommu_flush_context;
-			iommu->flush.flush_iotlb = __iommu_flush_iotlb;
-			printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
-			       "invalidation\n",
-				iommu->seq_id,
-			       (unsigned long long)drhd->reg_base_addr);
-		} else {
-			iommu->flush.flush_context = qi_flush_context;
-			iommu->flush.flush_iotlb = qi_flush_iotlb;
-			printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
-			       "invalidation\n",
-				iommu->seq_id,
-			       (unsigned long long)drhd->reg_base_addr);
-		}
-	}
-
-	if (iommu_pass_through)
-		iommu_identity_mapping |= IDENTMAP_ALL;
-
-#ifdef CONFIG_DMAR_BROKEN_GFX_WA
-	iommu_identity_mapping |= IDENTMAP_GFX;
-#endif
-
-	check_tylersburg_isoch();
-
-	/*
-	 * If pass through is not set or not enabled, setup context entries for
-	 * identity mappings for rmrr, gfx, and isa and may fall back to static
-	 * identity mapping if iommu_identity_mapping is set.
-	 */
-	if (iommu_identity_mapping) {
-		ret = iommu_prepare_static_identity_mapping(hw_pass_through);
-		if (ret) {
-			printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
-			goto error;
-		}
-	}
-	/*
-	 * For each rmrr
-	 *   for each dev attached to rmrr
-	 *   do
-	 *     locate drhd for dev, alloc domain for dev
-	 *     allocate free domain
-	 *     allocate page table entries for rmrr
-	 *     if context not allocated for bus
-	 *           allocate and init context
-	 *           set present in root table for this bus
-	 *     init context with domain, translation etc
-	 *    endfor
-	 * endfor
-	 */
-	printk(KERN_INFO "IOMMU: Setting RMRR:\n");
-	for_each_rmrr_units(rmrr) {
-		for (i = 0; i < rmrr->devices_cnt; i++) {
-			pdev = rmrr->devices[i];
-			/*
-			 * some BIOS lists non-exist devices in DMAR
-			 * table.
-			 */
-			if (!pdev)
-				continue;
-			ret = iommu_prepare_rmrr_dev(rmrr, pdev);
-			if (ret)
-				printk(KERN_ERR
-				       "IOMMU: mapping reserved region failed\n");
-		}
-	}
-
-	iommu_prepare_isa();
-
-	/*
-	 * for each drhd
-	 *   enable fault log
-	 *   global invalidate context cache
-	 *   global invalidate iotlb
-	 *   enable translation
-	 */
-	for_each_drhd_unit(drhd) {
-		if (drhd->ignored) {
-			/*
-			 * we always have to disable PMRs or DMA may fail on
-			 * this device
-			 */
-			if (force_on)
-				iommu_disable_protect_mem_regions(drhd->iommu);
-			continue;
-		}
-		iommu = drhd->iommu;
-
-		iommu_flush_write_buffer(iommu);
-
-		ret = dmar_set_interrupt(iommu);
-		if (ret)
-			goto error;
-
-		iommu_set_root_entry(iommu);
-
-		iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
-		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
-
-		ret = iommu_enable_translation(iommu);
-		if (ret)
-			goto error;
-
-		iommu_disable_protect_mem_regions(iommu);
-	}
-
-	return 0;
-error:
-	for_each_drhd_unit(drhd) {
-		if (drhd->ignored)
-			continue;
-		iommu = drhd->iommu;
-		free_iommu(iommu);
-	}
-	kfree(g_iommus);
-	return ret;
-}
-
-/* This takes a number of _MM_ pages, not VTD pages */
-static struct iova *intel_alloc_iova(struct device *dev,
-				     struct dmar_domain *domain,
-				     unsigned long nrpages, uint64_t dma_mask)
-{
-	struct pci_dev *pdev = to_pci_dev(dev);
-	struct iova *iova = NULL;
-
-	/* Restrict dma_mask to the width that the iommu can handle */
-	dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
-
-	if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
-		/*
-		 * First try to allocate an io virtual address in
-		 * DMA_BIT_MASK(32) and if that fails then try allocating
-		 * from higher range
-		 */
-		iova = alloc_iova(&domain->iovad, nrpages,
-				  IOVA_PFN(DMA_BIT_MASK(32)), 1);
-		if (iova)
-			return iova;
-	}
-	iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
-	if (unlikely(!iova)) {
-		printk(KERN_ERR "Allocating %ld-page iova for %s failed",
-		       nrpages, pci_name(pdev));
-		return NULL;
-	}
-
-	return iova;
-}
-
-static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
-{
-	struct dmar_domain *domain;
-	int ret;
-
-	domain = get_domain_for_dev(pdev,
-			DEFAULT_DOMAIN_ADDRESS_WIDTH);
-	if (!domain) {
-		printk(KERN_ERR
-			"Allocating domain for %s failed", pci_name(pdev));
-		return NULL;
-	}
-
-	/* make sure context mapping is ok */
-	if (unlikely(!domain_context_mapped(pdev))) {
-		ret = domain_context_mapping(domain, pdev,
-					     CONTEXT_TT_MULTI_LEVEL);
-		if (ret) {
-			printk(KERN_ERR
-				"Domain context map for %s failed",
-				pci_name(pdev));
-			return NULL;
-		}
-	}
-
-	return domain;
-}
-
-static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
-{
-	struct device_domain_info *info;
-
-	/* No lock here, assumes no domain exit in normal case */
-	info = dev->dev.archdata.iommu;
-	if (likely(info))
-		return info->domain;
-
-	return __get_valid_domain_for_dev(dev);
-}
-
-static int iommu_dummy(struct pci_dev *pdev)
-{
-	return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
-}
-
-/* Check if the pdev needs to go through non-identity map and unmap process.*/
-static int iommu_no_mapping(struct device *dev)
-{
-	struct pci_dev *pdev;
-	int found;
-
-	if (unlikely(dev->bus != &pci_bus_type))
-		return 1;
-
-	pdev = to_pci_dev(dev);
-	if (iommu_dummy(pdev))
-		return 1;
-
-	if (!iommu_identity_mapping)
-		return 0;
-
-	found = identity_mapping(pdev);
-	if (found) {
-		if (iommu_should_identity_map(pdev, 0))
-			return 1;
-		else {
-			/*
-			 * 32 bit DMA is removed from si_domain and fall back
-			 * to non-identity mapping.
-			 */
-			domain_remove_one_dev_info(si_domain, pdev);
-			printk(KERN_INFO "32bit %s uses non-identity mapping\n",
-			       pci_name(pdev));
-			return 0;
-		}
-	} else {
-		/*
-		 * In case of a detached 64 bit DMA device from vm, the device
-		 * is put into si_domain for identity mapping.
-		 */
-		if (iommu_should_identity_map(pdev, 0)) {
-			int ret;
-			ret = domain_add_dev_info(si_domain, pdev,
-						  hw_pass_through ?
-						  CONTEXT_TT_PASS_THROUGH :
-						  CONTEXT_TT_MULTI_LEVEL);
-			if (!ret) {
-				printk(KERN_INFO "64bit %s uses identity mapping\n",
-				       pci_name(pdev));
-				return 1;
-			}
-		}
-	}
-
-	return 0;
-}
-
-static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
-				     size_t size, int dir, u64 dma_mask)
-{
-	struct pci_dev *pdev = to_pci_dev(hwdev);
-	struct dmar_domain *domain;
-	phys_addr_t start_paddr;
-	struct iova *iova;
-	int prot = 0;
-	int ret;
-	struct intel_iommu *iommu;
-	unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
-
-	BUG_ON(dir == DMA_NONE);
-
-	if (iommu_no_mapping(hwdev))
-		return paddr;
-
-	domain = get_valid_domain_for_dev(pdev);
-	if (!domain)
-		return 0;
-
-	iommu = domain_get_iommu(domain);
-	size = aligned_nrpages(paddr, size);
-
-	iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
-	if (!iova)
-		goto error;
-
-	/*
-	 * Check if DMAR supports zero-length reads on write only
-	 * mappings..
-	 */
-	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
-			!cap_zlr(iommu->cap))
-		prot |= DMA_PTE_READ;
-	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
-		prot |= DMA_PTE_WRITE;
-	/*
-	 * paddr - (paddr + size) might be partial page, we should map the whole
-	 * page.  Note: if two part of one page are separately mapped, we
-	 * might have two guest_addr mapping to the same host paddr, but this
-	 * is not a big problem
-	 */
-	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
-				 mm_to_dma_pfn(paddr_pfn), size, prot);
-	if (ret)
-		goto error;
-
-	/* it's a non-present to present mapping. Only flush if caching mode */
-	if (cap_caching_mode(iommu->cap))
-		iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
-	else
-		iommu_flush_write_buffer(iommu);
-
-	start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
-	start_paddr += paddr & ~PAGE_MASK;
-	return start_paddr;
-
-error:
-	if (iova)
-		__free_iova(&domain->iovad, iova);
-	printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n",
-		pci_name(pdev), size, (unsigned long long)paddr, dir);
-	return 0;
-}
-
-static dma_addr_t intel_map_page(struct device *dev, struct page *page,
-				 unsigned long offset, size_t size,
-				 enum dma_data_direction dir,
-				 struct dma_attrs *attrs)
-{
-	return __intel_map_single(dev, page_to_phys(page) + offset, size,
-				  dir, to_pci_dev(dev)->dma_mask);
-}
-
-static void flush_unmaps(void)
-{
-	int i, j;
-
-	timer_on = 0;
-
-	/* just flush them all */
-	for (i = 0; i < g_num_of_iommus; i++) {
-		struct intel_iommu *iommu = g_iommus[i];
-		if (!iommu)
-			continue;
-
-		if (!deferred_flush[i].next)
-			continue;
-
-		/* In caching mode, global flushes turn emulation expensive */
-		if (!cap_caching_mode(iommu->cap))
-			iommu->flush.flush_iotlb(iommu, 0, 0, 0,
-					 DMA_TLB_GLOBAL_FLUSH);
-		for (j = 0; j < deferred_flush[i].next; j++) {
-			unsigned long mask;
-			struct iova *iova = deferred_flush[i].iova[j];
-			struct dmar_domain *domain = deferred_flush[i].domain[j];
-
-			/* On real hardware multiple invalidations are expensive */
-			if (cap_caching_mode(iommu->cap))
-				iommu_flush_iotlb_psi(iommu, domain->id,
-				iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
-			else {
-				mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
-				iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
-						(uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
-			}
-			__free_iova(&deferred_flush[i].domain[j]->iovad, iova);
-		}
-		deferred_flush[i].next = 0;
-	}
-
-	list_size = 0;
-}
-
-static void flush_unmaps_timeout(unsigned long data)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&async_umap_flush_lock, flags);
-	flush_unmaps();
-	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
-}
-
-static void add_unmap(struct dmar_domain *dom, struct iova *iova)
-{
-	unsigned long flags;
-	int next, iommu_id;
-	struct intel_iommu *iommu;
-
-	spin_lock_irqsave(&async_umap_flush_lock, flags);
-	if (list_size == HIGH_WATER_MARK)
-		flush_unmaps();
-
-	iommu = domain_get_iommu(dom);
-	iommu_id = iommu->seq_id;
-
-	next = deferred_flush[iommu_id].next;
-	deferred_flush[iommu_id].domain[next] = dom;
-	deferred_flush[iommu_id].iova[next] = iova;
-	deferred_flush[iommu_id].next++;
-
-	if (!timer_on) {
-		mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
-		timer_on = 1;
-	}
-	list_size++;
-	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
-}
-
-static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
-			     size_t size, enum dma_data_direction dir,
-			     struct dma_attrs *attrs)
-{
-	struct pci_dev *pdev = to_pci_dev(dev);
-	struct dmar_domain *domain;
-	unsigned long start_pfn, last_pfn;
-	struct iova *iova;
-	struct intel_iommu *iommu;
-
-	if (iommu_no_mapping(dev))
-		return;
-
-	domain = find_domain(pdev);
-	BUG_ON(!domain);
-
-	iommu = domain_get_iommu(domain);
-
-	iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
-	if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
-		      (unsigned long long)dev_addr))
-		return;
-
-	start_pfn = mm_to_dma_pfn(iova->pfn_lo);
-	last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
-
-	pr_debug("Device %s unmapping: pfn %lx-%lx\n",
-		 pci_name(pdev), start_pfn, last_pfn);
-
-	/*  clear the whole page */
-	dma_pte_clear_range(domain, start_pfn, last_pfn);
-
-	/* free page tables */
-	dma_pte_free_pagetable(domain, start_pfn, last_pfn);
-
-	if (intel_iommu_strict) {
-		iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
-				      last_pfn - start_pfn + 1, 0);
-		/* free iova */
-		__free_iova(&domain->iovad, iova);
-	} else {
-		add_unmap(domain, iova);
-		/*
-		 * queue up the release of the unmap to save the 1/6th of the
-		 * cpu used up by the iotlb flush operation...
-		 */
-	}
-}
-
-static void *intel_alloc_coherent(struct device *hwdev, size_t size,
-				  dma_addr_t *dma_handle, gfp_t flags)
-{
-	void *vaddr;
-	int order;
-
-	size = PAGE_ALIGN(size);
-	order = get_order(size);
-
-	if (!iommu_no_mapping(hwdev))
-		flags &= ~(GFP_DMA | GFP_DMA32);
-	else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
-		if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
-			flags |= GFP_DMA;
-		else
-			flags |= GFP_DMA32;
-	}
-
-	vaddr = (void *)__get_free_pages(flags, order);
-	if (!vaddr)
-		return NULL;
-	memset(vaddr, 0, size);
-
-	*dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
-					 DMA_BIDIRECTIONAL,
-					 hwdev->coherent_dma_mask);
-	if (*dma_handle)
-		return vaddr;
-	free_pages((unsigned long)vaddr, order);
-	return NULL;
-}
-
-static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
-				dma_addr_t dma_handle)
-{
-	int order;
-
-	size = PAGE_ALIGN(size);
-	order = get_order(size);
-
-	intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
-	free_pages((unsigned long)vaddr, order);
-}
-
-static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
-			   int nelems, enum dma_data_direction dir,
-			   struct dma_attrs *attrs)
-{
-	struct pci_dev *pdev = to_pci_dev(hwdev);
-	struct dmar_domain *domain;
-	unsigned long start_pfn, last_pfn;
-	struct iova *iova;
-	struct intel_iommu *iommu;
-
-	if (iommu_no_mapping(hwdev))
-		return;
-
-	domain = find_domain(pdev);
-	BUG_ON(!domain);
-
-	iommu = domain_get_iommu(domain);
-
-	iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
-	if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
-		      (unsigned long long)sglist[0].dma_address))
-		return;
-
-	start_pfn = mm_to_dma_pfn(iova->pfn_lo);
-	last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
-
-	/*  clear the whole page */
-	dma_pte_clear_range(domain, start_pfn, last_pfn);
-
-	/* free page tables */
-	dma_pte_free_pagetable(domain, start_pfn, last_pfn);
-
-	if (intel_iommu_strict) {
-		iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
-				      last_pfn - start_pfn + 1, 0);
-		/* free iova */
-		__free_iova(&domain->iovad, iova);
-	} else {
-		add_unmap(domain, iova);
-		/*
-		 * queue up the release of the unmap to save the 1/6th of the
-		 * cpu used up by the iotlb flush operation...
-		 */
-	}
-}
-
-static int intel_nontranslate_map_sg(struct device *hddev,
-	struct scatterlist *sglist, int nelems, int dir)
-{
-	int i;
-	struct scatterlist *sg;
-
-	for_each_sg(sglist, sg, nelems, i) {
-		BUG_ON(!sg_page(sg));
-		sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
-		sg->dma_length = sg->length;
-	}
-	return nelems;
-}
-
-static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
-			enum dma_data_direction dir, struct dma_attrs *attrs)
-{
-	int i;
-	struct pci_dev *pdev = to_pci_dev(hwdev);
-	struct dmar_domain *domain;
-	size_t size = 0;
-	int prot = 0;
-	struct iova *iova = NULL;
-	int ret;
-	struct scatterlist *sg;
-	unsigned long start_vpfn;
-	struct intel_iommu *iommu;
-
-	BUG_ON(dir == DMA_NONE);
-	if (iommu_no_mapping(hwdev))
-		return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
-
-	domain = get_valid_domain_for_dev(pdev);
-	if (!domain)
-		return 0;
-
-	iommu = domain_get_iommu(domain);
-
-	for_each_sg(sglist, sg, nelems, i)
-		size += aligned_nrpages(sg->offset, sg->length);
-
-	iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
-				pdev->dma_mask);
-	if (!iova) {
-		sglist->dma_length = 0;
-		return 0;
-	}
-
-	/*
-	 * Check if DMAR supports zero-length reads on write only
-	 * mappings..
-	 */
-	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
-			!cap_zlr(iommu->cap))
-		prot |= DMA_PTE_READ;
-	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
-		prot |= DMA_PTE_WRITE;
-
-	start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
-
-	ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
-	if (unlikely(ret)) {
-		/*  clear the page */
-		dma_pte_clear_range(domain, start_vpfn,
-				    start_vpfn + size - 1);
-		/* free page tables */
-		dma_pte_free_pagetable(domain, start_vpfn,
-				       start_vpfn + size - 1);
-		/* free iova */
-		__free_iova(&domain->iovad, iova);
-		return 0;
-	}
-
-	/* it's a non-present to present mapping. Only flush if caching mode */
-	if (cap_caching_mode(iommu->cap))
-		iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
-	else
-		iommu_flush_write_buffer(iommu);
-
-	return nelems;
-}
-
-static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-	return !dma_addr;
-}
-
-struct dma_map_ops intel_dma_ops = {
-	.alloc_coherent = intel_alloc_coherent,
-	.free_coherent = intel_free_coherent,
-	.map_sg = intel_map_sg,
-	.unmap_sg = intel_unmap_sg,
-	.map_page = intel_map_page,
-	.unmap_page = intel_unmap_page,
-	.mapping_error = intel_mapping_error,
-};
-
-static inline int iommu_domain_cache_init(void)
-{
-	int ret = 0;
-
-	iommu_domain_cache = kmem_cache_create("iommu_domain",
-					 sizeof(struct dmar_domain),
-					 0,
-					 SLAB_HWCACHE_ALIGN,
-
-					 NULL);
-	if (!iommu_domain_cache) {
-		printk(KERN_ERR "Couldn't create iommu_domain cache\n");
-		ret = -ENOMEM;
-	}
-
-	return ret;
-}
-
-static inline int iommu_devinfo_cache_init(void)
-{
-	int ret = 0;
-
-	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
-					 sizeof(struct device_domain_info),
-					 0,
-					 SLAB_HWCACHE_ALIGN,
-					 NULL);
-	if (!iommu_devinfo_cache) {
-		printk(KERN_ERR "Couldn't create devinfo cache\n");
-		ret = -ENOMEM;
-	}
-
-	return ret;
-}
-
-static inline int iommu_iova_cache_init(void)
-{
-	int ret = 0;
-
-	iommu_iova_cache = kmem_cache_create("iommu_iova",
-					 sizeof(struct iova),
-					 0,
-					 SLAB_HWCACHE_ALIGN,
-					 NULL);
-	if (!iommu_iova_cache) {
-		printk(KERN_ERR "Couldn't create iova cache\n");
-		ret = -ENOMEM;
-	}
-
-	return ret;
-}
-
-static int __init iommu_init_mempool(void)
-{
-	int ret;
-	ret = iommu_iova_cache_init();
-	if (ret)
-		return ret;
-
-	ret = iommu_domain_cache_init();
-	if (ret)
-		goto domain_error;
-
-	ret = iommu_devinfo_cache_init();
-	if (!ret)
-		return ret;
-
-	kmem_cache_destroy(iommu_domain_cache);
-domain_error:
-	kmem_cache_destroy(iommu_iova_cache);
-
-	return -ENOMEM;
-}
-
-static void __init iommu_exit_mempool(void)
-{
-	kmem_cache_destroy(iommu_devinfo_cache);
-	kmem_cache_destroy(iommu_domain_cache);
-	kmem_cache_destroy(iommu_iova_cache);
-
-}
-
-static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
-{
-	struct dmar_drhd_unit *drhd;
-	u32 vtbar;
-	int rc;
-
-	/* We know that this device on this chipset has its own IOMMU.
-	 * If we find it under a different IOMMU, then the BIOS is lying
-	 * to us. Hope that the IOMMU for this device is actually
-	 * disabled, and it needs no translation...
-	 */
-	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
-	if (rc) {
-		/* "can't" happen */
-		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
-		return;
-	}
-	vtbar &= 0xffff0000;
-
-	/* we know that the this iommu should be at offset 0xa000 from vtbar */
-	drhd = dmar_find_matched_drhd_unit(pdev);
-	if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
-			    TAINT_FIRMWARE_WORKAROUND,
-			    "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
-		pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
-}
-DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
-
-static void __init init_no_remapping_devices(void)
-{
-	struct dmar_drhd_unit *drhd;
-
-	for_each_drhd_unit(drhd) {
-		if (!drhd->include_all) {
-			int i;
-			for (i = 0; i < drhd->devices_cnt; i++)
-				if (drhd->devices[i] != NULL)
-					break;
-			/* ignore DMAR unit if no pci devices exist */
-			if (i == drhd->devices_cnt)
-				drhd->ignored = 1;
-		}
-	}
-
-	if (dmar_map_gfx)
-		return;
-
-	for_each_drhd_unit(drhd) {
-		int i;
-		if (drhd->ignored || drhd->include_all)
-			continue;
-
-		for (i = 0; i < drhd->devices_cnt; i++)
-			if (drhd->devices[i] &&
-				!IS_GFX_DEVICE(drhd->devices[i]))
-				break;
-
-		if (i < drhd->devices_cnt)
-			continue;
-
-		/* bypass IOMMU if it is just for gfx devices */
-		drhd->ignored = 1;
-		for (i = 0; i < drhd->devices_cnt; i++) {
-			if (!drhd->devices[i])
-				continue;
-			drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
-		}
-	}
-}
-
-#ifdef CONFIG_SUSPEND
-static int init_iommu_hw(void)
-{
-	struct dmar_drhd_unit *drhd;
-	struct intel_iommu *iommu = NULL;
-
-	for_each_active_iommu(iommu, drhd)
-		if (iommu->qi)
-			dmar_reenable_qi(iommu);
-
-	for_each_iommu(iommu, drhd) {
-		if (drhd->ignored) {
-			/*
-			 * we always have to disable PMRs or DMA may fail on
-			 * this device
-			 */
-			if (force_on)
-				iommu_disable_protect_mem_regions(iommu);
-			continue;
-		}
-	
-		iommu_flush_write_buffer(iommu);
-
-		iommu_set_root_entry(iommu);
-
-		iommu->flush.flush_context(iommu, 0, 0, 0,
-					   DMA_CCMD_GLOBAL_INVL);
-		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
-					 DMA_TLB_GLOBAL_FLUSH);
-		if (iommu_enable_translation(iommu))
-			return 1;
-		iommu_disable_protect_mem_regions(iommu);
-	}
-
-	return 0;
-}
-
-static void iommu_flush_all(void)
-{
-	struct dmar_drhd_unit *drhd;
-	struct intel_iommu *iommu;
-
-	for_each_active_iommu(iommu, drhd) {
-		iommu->flush.flush_context(iommu, 0, 0, 0,
-					   DMA_CCMD_GLOBAL_INVL);
-		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
-					 DMA_TLB_GLOBAL_FLUSH);
-	}
-}
-
-static int iommu_suspend(void)
-{
-	struct dmar_drhd_unit *drhd;
-	struct intel_iommu *iommu = NULL;
-	unsigned long flag;
-
-	for_each_active_iommu(iommu, drhd) {
-		iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
-						 GFP_ATOMIC);
-		if (!iommu->iommu_state)
-			goto nomem;
-	}
-
-	iommu_flush_all();
-
-	for_each_active_iommu(iommu, drhd) {
-		iommu_disable_translation(iommu);
-
-		spin_lock_irqsave(&iommu->register_lock, flag);
-
-		iommu->iommu_state[SR_DMAR_FECTL_REG] =
-			readl(iommu->reg + DMAR_FECTL_REG);
-		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
-			readl(iommu->reg + DMAR_FEDATA_REG);
-		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
-			readl(iommu->reg + DMAR_FEADDR_REG);
-		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
-			readl(iommu->reg + DMAR_FEUADDR_REG);
-
-		spin_unlock_irqrestore(&iommu->register_lock, flag);
-	}
-	return 0;
-
-nomem:
-	for_each_active_iommu(iommu, drhd)
-		kfree(iommu->iommu_state);
-
-	return -ENOMEM;
-}
-
-static void iommu_resume(void)
-{
-	struct dmar_drhd_unit *drhd;
-	struct intel_iommu *iommu = NULL;
-	unsigned long flag;
-
-	if (init_iommu_hw()) {
-		if (force_on)
-			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
-		else
-			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
-		return;
-	}
-
-	for_each_active_iommu(iommu, drhd) {
-
-		spin_lock_irqsave(&iommu->register_lock, flag);
-
-		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
-			iommu->reg + DMAR_FECTL_REG);
-		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
-			iommu->reg + DMAR_FEDATA_REG);
-		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
-			iommu->reg + DMAR_FEADDR_REG);
-		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
-			iommu->reg + DMAR_FEUADDR_REG);
-
-		spin_unlock_irqrestore(&iommu->register_lock, flag);
-	}
-
-	for_each_active_iommu(iommu, drhd)
-		kfree(iommu->iommu_state);
-}
-
-static struct syscore_ops iommu_syscore_ops = {
-	.resume		= iommu_resume,
-	.suspend	= iommu_suspend,
-};
-
-static void __init init_iommu_pm_ops(void)
-{
-	register_syscore_ops(&iommu_syscore_ops);
-}
-
-#else
-static inline void init_iommu_pm_ops(void) {}
-#endif	/* CONFIG_PM */
-
-/*
- * Here we only respond to action of unbound device from driver.
- *
- * Added device is not attached to its DMAR domain here yet. That will happen
- * when mapping the device to iova.
- */
-static int device_notifier(struct notifier_block *nb,
-				  unsigned long action, void *data)
-{
-	struct device *dev = data;
-	struct pci_dev *pdev = to_pci_dev(dev);
-	struct dmar_domain *domain;
-
-	if (iommu_no_mapping(dev))
-		return 0;
-
-	domain = find_domain(pdev);
-	if (!domain)
-		return 0;
-
-	if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
-		domain_remove_one_dev_info(domain, pdev);
-
-		if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
-		    !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
-		    list_empty(&domain->devices))
-			domain_exit(domain);
-	}
-
-	return 0;
-}
-
-static struct notifier_block device_nb = {
-	.notifier_call = device_notifier,
-};
-
-int __init intel_iommu_init(void)
-{
-	int ret = 0;
-
-	/* VT-d is required for a TXT/tboot launch, so enforce that */
-	force_on = tboot_force_iommu();
-
-	if (dmar_table_init()) {
-		if (force_on)
-			panic("tboot: Failed to initialize DMAR table\n");
-		return 	-ENODEV;
-	}
-
-	if (dmar_dev_scope_init()) {
-		if (force_on)
-			panic("tboot: Failed to initialize DMAR device scope\n");
-		return 	-ENODEV;
-	}
-
-	/*
-	 * Check the need for DMA-remapping initialization now.
-	 * Above initialization will also be used by Interrupt-remapping.
-	 */
-	if (no_iommu || dmar_disabled)
-		return -ENODEV;
-
-	if (iommu_init_mempool()) {
-		if (force_on)
-			panic("tboot: Failed to initialize iommu memory\n");
-		return 	-ENODEV;
-	}
-
-	if (dmar_init_reserved_ranges()) {
-		if (force_on)
-			panic("tboot: Failed to reserve iommu ranges\n");
-		return 	-ENODEV;
-	}
-
-	init_no_remapping_devices();
-
-	ret = init_dmars();
-	if (ret) {
-		if (force_on)
-			panic("tboot: Failed to initialize DMARs\n");
-		printk(KERN_ERR "IOMMU: dmar init failed\n");
-		put_iova_domain(&reserved_iova_list);
-		iommu_exit_mempool();
-		return ret;
-	}
-	printk(KERN_INFO
-	"PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
-
-	init_timer(&unmap_timer);
-#ifdef CONFIG_SWIOTLB
-	swiotlb = 0;
-#endif
-	dma_ops = &intel_dma_ops;
-
-	init_iommu_pm_ops();
-
-	register_iommu(&intel_iommu_ops);
-
-	bus_register_notifier(&pci_bus_type, &device_nb);
-
-	return 0;
-}
-
-static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
-					   struct pci_dev *pdev)
-{
-	struct pci_dev *tmp, *parent;
-
-	if (!iommu || !pdev)
-		return;
-
-	/* dependent device detach */
-	tmp = pci_find_upstream_pcie_bridge(pdev);
-	/* Secondary interface's bus number and devfn 0 */
-	if (tmp) {
-		parent = pdev->bus->self;
-		while (parent != tmp) {
-			iommu_detach_dev(iommu, parent->bus->number,
-					 parent->devfn);
-			parent = parent->bus->self;
-		}
-		if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
-			iommu_detach_dev(iommu,
-				tmp->subordinate->number, 0);
-		else /* this is a legacy PCI bridge */
-			iommu_detach_dev(iommu, tmp->bus->number,
-					 tmp->devfn);
-	}
-}
-
-static void domain_remove_one_dev_info(struct dmar_domain *domain,
-					  struct pci_dev *pdev)
-{
-	struct device_domain_info *info;
-	struct intel_iommu *iommu;
-	unsigned long flags;
-	int found = 0;
-	struct list_head *entry, *tmp;
-
-	iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
-				pdev->devfn);
-	if (!iommu)
-		return;
-
-	spin_lock_irqsave(&device_domain_lock, flags);
-	list_for_each_safe(entry, tmp, &domain->devices) {
-		info = list_entry(entry, struct device_domain_info, link);
-		if (info->segment == pci_domain_nr(pdev->bus) &&
-		    info->bus == pdev->bus->number &&
-		    info->devfn == pdev->devfn) {
-			list_del(&info->link);
-			list_del(&info->global);
-			if (info->dev)
-				info->dev->dev.archdata.iommu = NULL;
-			spin_unlock_irqrestore(&device_domain_lock, flags);
-
-			iommu_disable_dev_iotlb(info);
-			iommu_detach_dev(iommu, info->bus, info->devfn);
-			iommu_detach_dependent_devices(iommu, pdev);
-			free_devinfo_mem(info);
-
-			spin_lock_irqsave(&device_domain_lock, flags);
-
-			if (found)
-				break;
-			else
-				continue;
-		}
-
-		/* if there is no other devices under the same iommu
-		 * owned by this domain, clear this iommu in iommu_bmp
-		 * update iommu count and coherency
-		 */
-		if (iommu == device_to_iommu(info->segment, info->bus,
-					    info->devfn))
-			found = 1;
-	}
-
-	if (found == 0) {
-		unsigned long tmp_flags;
-		spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
-		clear_bit(iommu->seq_id, &domain->iommu_bmp);
-		domain->iommu_count--;
-		domain_update_iommu_cap(domain);
-		spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
-
-		if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
-		    !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
-			spin_lock_irqsave(&iommu->lock, tmp_flags);
-			clear_bit(domain->id, iommu->domain_ids);
-			iommu->domains[domain->id] = NULL;
-			spin_unlock_irqrestore(&iommu->lock, tmp_flags);
-		}
-	}
-
-	spin_unlock_irqrestore(&device_domain_lock, flags);
-}
-
-static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
-{
-	struct device_domain_info *info;
-	struct intel_iommu *iommu;
-	unsigned long flags1, flags2;
-
-	spin_lock_irqsave(&device_domain_lock, flags1);
-	while (!list_empty(&domain->devices)) {
-		info = list_entry(domain->devices.next,
-			struct device_domain_info, link);
-		list_del(&info->link);
-		list_del(&info->global);
-		if (info->dev)
-			info->dev->dev.archdata.iommu = NULL;
-
-		spin_unlock_irqrestore(&device_domain_lock, flags1);
-
-		iommu_disable_dev_iotlb(info);
-		iommu = device_to_iommu(info->segment, info->bus, info->devfn);
-		iommu_detach_dev(iommu, info->bus, info->devfn);
-		iommu_detach_dependent_devices(iommu, info->dev);
-
-		/* clear this iommu in iommu_bmp, update iommu count
-		 * and capabilities
-		 */
-		spin_lock_irqsave(&domain->iommu_lock, flags2);
-		if (test_and_clear_bit(iommu->seq_id,
-				       &domain->iommu_bmp)) {
-			domain->iommu_count--;
-			domain_update_iommu_cap(domain);
-		}
-		spin_unlock_irqrestore(&domain->iommu_lock, flags2);
-
-		free_devinfo_mem(info);
-		spin_lock_irqsave(&device_domain_lock, flags1);
-	}
-	spin_unlock_irqrestore(&device_domain_lock, flags1);
-}
-
-/* domain id for virtual machine, it won't be set in context */
-static unsigned long vm_domid;
-
-static struct dmar_domain *iommu_alloc_vm_domain(void)
-{
-	struct dmar_domain *domain;
-
-	domain = alloc_domain_mem();
-	if (!domain)
-		return NULL;
-
-	domain->id = vm_domid++;
-	domain->nid = -1;
-	memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
-	domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
-
-	return domain;
-}
-
-static int md_domain_init(struct dmar_domain *domain, int guest_width)
-{
-	int adjust_width;
-
-	init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
-	spin_lock_init(&domain->iommu_lock);
-
-	domain_reserve_special_ranges(domain);
-
-	/* calculate AGAW */
-	domain->gaw = guest_width;
-	adjust_width = guestwidth_to_adjustwidth(guest_width);
-	domain->agaw = width_to_agaw(adjust_width);
-
-	INIT_LIST_HEAD(&domain->devices);
-
-	domain->iommu_count = 0;
-	domain->iommu_coherency = 0;
-	domain->iommu_snooping = 0;
-	domain->iommu_superpage = 0;
-	domain->max_addr = 0;
-	domain->nid = -1;
-
-	/* always allocate the top pgd */
-	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
-	if (!domain->pgd)
-		return -ENOMEM;
-	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
-	return 0;
-}
-
-static void iommu_free_vm_domain(struct dmar_domain *domain)
-{
-	unsigned long flags;
-	struct dmar_drhd_unit *drhd;
-	struct intel_iommu *iommu;
-	unsigned long i;
-	unsigned long ndomains;
-
-	for_each_drhd_unit(drhd) {
-		if (drhd->ignored)
-			continue;
-		iommu = drhd->iommu;
-
-		ndomains = cap_ndoms(iommu->cap);
-		for_each_set_bit(i, iommu->domain_ids, ndomains) {
-			if (iommu->domains[i] == domain) {
-				spin_lock_irqsave(&iommu->lock, flags);
-				clear_bit(i, iommu->domain_ids);
-				iommu->domains[i] = NULL;
-				spin_unlock_irqrestore(&iommu->lock, flags);
-				break;
-			}
-		}
-	}
-}
-
-static void vm_domain_exit(struct dmar_domain *domain)
-{
-	/* Domain 0 is reserved, so dont process it */
-	if (!domain)
-		return;
-
-	vm_domain_remove_all_dev_info(domain);
-	/* destroy iovas */
-	put_iova_domain(&domain->iovad);
-
-	/* clear ptes */
-	dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
-
-	/* free page tables */
-	dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
-
-	iommu_free_vm_domain(domain);
-	free_domain_mem(domain);
-}
-
-static int intel_iommu_domain_init(struct iommu_domain *domain)
-{
-	struct dmar_domain *dmar_domain;
-
-	dmar_domain = iommu_alloc_vm_domain();
-	if (!dmar_domain) {
-		printk(KERN_ERR
-			"intel_iommu_domain_init: dmar_domain == NULL\n");
-		return -ENOMEM;
-	}
-	if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
-		printk(KERN_ERR
-			"intel_iommu_domain_init() failed\n");
-		vm_domain_exit(dmar_domain);
-		return -ENOMEM;
-	}
-	domain->priv = dmar_domain;
-
-	return 0;
-}
-
-static void intel_iommu_domain_destroy(struct iommu_domain *domain)
-{
-	struct dmar_domain *dmar_domain = domain->priv;
-
-	domain->priv = NULL;
-	vm_domain_exit(dmar_domain);
-}
-
-static int intel_iommu_attach_device(struct iommu_domain *domain,
-				     struct device *dev)
-{
-	struct dmar_domain *dmar_domain = domain->priv;
-	struct pci_dev *pdev = to_pci_dev(dev);
-	struct intel_iommu *iommu;
-	int addr_width;
-
-	/* normally pdev is not mapped */
-	if (unlikely(domain_context_mapped(pdev))) {
-		struct dmar_domain *old_domain;
-
-		old_domain = find_domain(pdev);
-		if (old_domain) {
-			if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
-			    dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
-				domain_remove_one_dev_info(old_domain, pdev);
-			else
-				domain_remove_dev_info(old_domain);
-		}
-	}
-
-	iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
-				pdev->devfn);
-	if (!iommu)
-		return -ENODEV;
-
-	/* check if this iommu agaw is sufficient for max mapped address */
-	addr_width = agaw_to_width(iommu->agaw);
-	if (addr_width > cap_mgaw(iommu->cap))
-		addr_width = cap_mgaw(iommu->cap);
-
-	if (dmar_domain->max_addr > (1LL << addr_width)) {
-		printk(KERN_ERR "%s: iommu width (%d) is not "
-		       "sufficient for the mapped address (%llx)\n",
-		       __func__, addr_width, dmar_domain->max_addr);
-		return -EFAULT;
-	}
-	dmar_domain->gaw = addr_width;
-
-	/*
-	 * Knock out extra levels of page tables if necessary
-	 */
-	while (iommu->agaw < dmar_domain->agaw) {
-		struct dma_pte *pte;
-
-		pte = dmar_domain->pgd;
-		if (dma_pte_present(pte)) {
-			dmar_domain->pgd = (struct dma_pte *)
-				phys_to_virt(dma_pte_addr(pte));
-			free_pgtable_page(pte);
-		}
-		dmar_domain->agaw--;
-	}
-
-	return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
-}
-
-static void intel_iommu_detach_device(struct iommu_domain *domain,
-				      struct device *dev)
-{
-	struct dmar_domain *dmar_domain = domain->priv;
-	struct pci_dev *pdev = to_pci_dev(dev);
-
-	domain_remove_one_dev_info(dmar_domain, pdev);
-}
-
-static int intel_iommu_map(struct iommu_domain *domain,
-			   unsigned long iova, phys_addr_t hpa,
-			   int gfp_order, int iommu_prot)
-{
-	struct dmar_domain *dmar_domain = domain->priv;
-	u64 max_addr;
-	int prot = 0;
-	size_t size;
-	int ret;
-
-	if (iommu_prot & IOMMU_READ)
-		prot |= DMA_PTE_READ;
-	if (iommu_prot & IOMMU_WRITE)
-		prot |= DMA_PTE_WRITE;
-	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
-		prot |= DMA_PTE_SNP;
-
-	size     = PAGE_SIZE << gfp_order;
-	max_addr = iova + size;
-	if (dmar_domain->max_addr < max_addr) {
-		u64 end;
-
-		/* check if minimum agaw is sufficient for mapped address */
-		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
-		if (end < max_addr) {
-			printk(KERN_ERR "%s: iommu width (%d) is not "
-			       "sufficient for the mapped address (%llx)\n",
-			       __func__, dmar_domain->gaw, max_addr);
-			return -EFAULT;
-		}
-		dmar_domain->max_addr = max_addr;
-	}
-	/* Round up size to next multiple of PAGE_SIZE, if it and
-	   the low bits of hpa would take us onto the next page */
-	size = aligned_nrpages(hpa, size);
-	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
-				 hpa >> VTD_PAGE_SHIFT, size, prot);
-	return ret;
-}
-
-static int intel_iommu_unmap(struct iommu_domain *domain,
-			     unsigned long iova, int gfp_order)
-{
-	struct dmar_domain *dmar_domain = domain->priv;
-	size_t size = PAGE_SIZE << gfp_order;
-
-	dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
-			    (iova + size - 1) >> VTD_PAGE_SHIFT);
-
-	if (dmar_domain->max_addr == iova + size)
-		dmar_domain->max_addr = iova;
-
-	return gfp_order;
-}
-
-static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
-					    unsigned long iova)
-{
-	struct dmar_domain *dmar_domain = domain->priv;
-	struct dma_pte *pte;
-	u64 phys = 0;
-
-	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
-	if (pte)
-		phys = dma_pte_addr(pte);
-
-	return phys;
-}
-
-static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
-				      unsigned long cap)
-{
-	struct dmar_domain *dmar_domain = domain->priv;
-
-	if (cap == IOMMU_CAP_CACHE_COHERENCY)
-		return dmar_domain->iommu_snooping;
-	if (cap == IOMMU_CAP_INTR_REMAP)
-		return intr_remapping_enabled;
-
-	return 0;
-}
-
-static struct iommu_ops intel_iommu_ops = {
-	.domain_init	= intel_iommu_domain_init,
-	.domain_destroy = intel_iommu_domain_destroy,
-	.attach_dev	= intel_iommu_attach_device,
-	.detach_dev	= intel_iommu_detach_device,
-	.map		= intel_iommu_map,
-	.unmap		= intel_iommu_unmap,
-	.iova_to_phys	= intel_iommu_iova_to_phys,
-	.domain_has_cap = intel_iommu_domain_has_cap,
-};
-
-static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
-{
-	/*
-	 * Mobile 4 Series Chipset neglects to set RWBF capability,
-	 * but needs it:
-	 */
-	printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
-	rwbf_quirk = 1;
-
-	/* https://bugzilla.redhat.com/show_bug.cgi?id=538163 */
-	if (dev->revision == 0x07) {
-		printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
-		dmar_map_gfx = 0;
-	}
-}
-
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
-
-#define GGC 0x52
-#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
-#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
-#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
-#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
-#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
-#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
-#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
-#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
-
-static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
-{
-	unsigned short ggc;
-
-	if (pci_read_config_word(dev, GGC, &ggc))
-		return;
-
-	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
-		printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
-		dmar_map_gfx = 0;
-	}
-}
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
-
-/* On Tylersburg chipsets, some BIOSes have been known to enable the
-   ISOCH DMAR unit for the Azalia sound device, but not give it any
-   TLB entries, which causes it to deadlock. Check for that.  We do
-   this in a function called from init_dmars(), instead of in a PCI
-   quirk, because we don't want to print the obnoxious "BIOS broken"
-   message if VT-d is actually disabled.
-*/
-static void __init check_tylersburg_isoch(void)
-{
-	struct pci_dev *pdev;
-	uint32_t vtisochctrl;
-
-	/* If there's no Azalia in the system anyway, forget it. */
-	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
-	if (!pdev)
-		return;
-	pci_dev_put(pdev);
-
-	/* System Management Registers. Might be hidden, in which case
-	   we can't do the sanity check. But that's OK, because the
-	   known-broken BIOSes _don't_ actually hide it, so far. */
-	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
-	if (!pdev)
-		return;
-
-	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
-		pci_dev_put(pdev);
-		return;
-	}
-
-	pci_dev_put(pdev);
-
-	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
-	if (vtisochctrl & 1)
-		return;
-
-	/* Drop all bits other than the number of TLB entries */
-	vtisochctrl &= 0x1c;
-
-	/* If we have the recommended number of TLB entries (16), fine. */
-	if (vtisochctrl == 0x10)
-		return;
-
-	/* Zero TLB entries? You get to ride the short bus to school. */
-	if (!vtisochctrl) {
-		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
-		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
-		     dmi_get_system_info(DMI_BIOS_VENDOR),
-		     dmi_get_system_info(DMI_BIOS_VERSION),
-		     dmi_get_system_info(DMI_PRODUCT_VERSION));
-		iommu_identity_mapping |= IDENTMAP_AZALIA;
-		return;
-	}
-	
-	printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
-	       vtisochctrl);
-}
diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c
deleted file mode 100644
index 3607faf28a4d..000000000000
--- a/drivers/pci/intr_remapping.c
+++ /dev/null
@@ -1,798 +0,0 @@
-#include <linux/interrupt.h>
-#include <linux/dmar.h>
-#include <linux/spinlock.h>
-#include <linux/slab.h>
-#include <linux/jiffies.h>
-#include <linux/hpet.h>
-#include <linux/pci.h>
-#include <linux/irq.h>
-#include <asm/io_apic.h>
-#include <asm/smp.h>
-#include <asm/cpu.h>
-#include <linux/intel-iommu.h>
-#include "intr_remapping.h"
-#include <acpi/acpi.h>
-#include <asm/pci-direct.h>
-#include "pci.h"
-
-static struct ioapic_scope ir_ioapic[MAX_IO_APICS];
-static struct hpet_scope ir_hpet[MAX_HPET_TBS];
-static int ir_ioapic_num, ir_hpet_num;
-int intr_remapping_enabled;
-
-static int disable_intremap;
-static int disable_sourceid_checking;
-
-static __init int setup_nointremap(char *str)
-{
-	disable_intremap = 1;
-	return 0;
-}
-early_param("nointremap", setup_nointremap);
-
-static __init int setup_intremap(char *str)
-{
-	if (!str)
-		return -EINVAL;
-
-	if (!strncmp(str, "on", 2))
-		disable_intremap = 0;
-	else if (!strncmp(str, "off", 3))
-		disable_intremap = 1;
-	else if (!strncmp(str, "nosid", 5))
-		disable_sourceid_checking = 1;
-
-	return 0;
-}
-early_param("intremap", setup_intremap);
-
-static DEFINE_SPINLOCK(irq_2_ir_lock);
-
-static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
-{
-	struct irq_cfg *cfg = irq_get_chip_data(irq);
-	return cfg ? &cfg->irq_2_iommu : NULL;
-}
-
-int get_irte(int irq, struct irte *entry)
-{
-	struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
-	unsigned long flags;
-	int index;
-
-	if (!entry || !irq_iommu)
-		return -1;
-
-	spin_lock_irqsave(&irq_2_ir_lock, flags);
-
-	index = irq_iommu->irte_index + irq_iommu->sub_handle;
-	*entry = *(irq_iommu->iommu->ir_table->base + index);
-
-	spin_unlock_irqrestore(&irq_2_ir_lock, flags);
-	return 0;
-}
-
-int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
-{
-	struct ir_table *table = iommu->ir_table;
-	struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
-	u16 index, start_index;
-	unsigned int mask = 0;
-	unsigned long flags;
-	int i;
-
-	if (!count || !irq_iommu)
-		return -1;
-
-	/*
-	 * start the IRTE search from index 0.
-	 */
-	index = start_index = 0;
-
-	if (count > 1) {
-		count = __roundup_pow_of_two(count);
-		mask = ilog2(count);
-	}
-
-	if (mask > ecap_max_handle_mask(iommu->ecap)) {
-		printk(KERN_ERR
-		       "Requested mask %x exceeds the max invalidation handle"
-		       " mask value %Lx\n", mask,
-		       ecap_max_handle_mask(iommu->ecap));
-		return -1;
-	}
-
-	spin_lock_irqsave(&irq_2_ir_lock, flags);
-	do {
-		for (i = index; i < index + count; i++)
-			if  (table->base[i].present)
-				break;
-		/* empty index found */
-		if (i == index + count)
-			break;
-
-		index = (index + count) % INTR_REMAP_TABLE_ENTRIES;
-
-		if (index == start_index) {
-			spin_unlock_irqrestore(&irq_2_ir_lock, flags);
-			printk(KERN_ERR "can't allocate an IRTE\n");
-			return -1;
-		}
-	} while (1);
-
-	for (i = index; i < index + count; i++)
-		table->base[i].present = 1;
-
-	irq_iommu->iommu = iommu;
-	irq_iommu->irte_index =  index;
-	irq_iommu->sub_handle = 0;
-	irq_iommu->irte_mask = mask;
-
-	spin_unlock_irqrestore(&irq_2_ir_lock, flags);
-
-	return index;
-}
-
-static int qi_flush_iec(struct intel_iommu *iommu, int index, int mask)
-{
-	struct qi_desc desc;
-
-	desc.low = QI_IEC_IIDEX(index) | QI_IEC_TYPE | QI_IEC_IM(mask)
-		   | QI_IEC_SELECTIVE;
-	desc.high = 0;
-
-	return qi_submit_sync(&desc, iommu);
-}
-
-int map_irq_to_irte_handle(int irq, u16 *sub_handle)
-{
-	struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
-	unsigned long flags;
-	int index;
-
-	if (!irq_iommu)
-		return -1;
-
-	spin_lock_irqsave(&irq_2_ir_lock, flags);
-	*sub_handle = irq_iommu->sub_handle;
-	index = irq_iommu->irte_index;
-	spin_unlock_irqrestore(&irq_2_ir_lock, flags);
-	return index;
-}
-
-int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle)
-{
-	struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
-	unsigned long flags;
-
-	if (!irq_iommu)
-		return -1;
-
-	spin_lock_irqsave(&irq_2_ir_lock, flags);
-
-	irq_iommu->iommu = iommu;
-	irq_iommu->irte_index = index;
-	irq_iommu->sub_handle = subhandle;
-	irq_iommu->irte_mask = 0;
-
-	spin_unlock_irqrestore(&irq_2_ir_lock, flags);
-
-	return 0;
-}
-
-int modify_irte(int irq, struct irte *irte_modified)
-{
-	struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
-	struct intel_iommu *iommu;
-	unsigned long flags;
-	struct irte *irte;
-	int rc, index;
-
-	if (!irq_iommu)
-		return -1;
-
-	spin_lock_irqsave(&irq_2_ir_lock, flags);
-
-	iommu = irq_iommu->iommu;
-
-	index = irq_iommu->irte_index + irq_iommu->sub_handle;
-	irte = &iommu->ir_table->base[index];
-
-	set_64bit(&irte->low, irte_modified->low);
-	set_64bit(&irte->high, irte_modified->high);
-	__iommu_flush_cache(iommu, irte, sizeof(*irte));
-
-	rc = qi_flush_iec(iommu, index, 0);
-	spin_unlock_irqrestore(&irq_2_ir_lock, flags);
-
-	return rc;
-}
-
-struct intel_iommu *map_hpet_to_ir(u8 hpet_id)
-{
-	int i;
-
-	for (i = 0; i < MAX_HPET_TBS; i++)
-		if (ir_hpet[i].id == hpet_id)
-			return ir_hpet[i].iommu;
-	return NULL;
-}
-
-struct intel_iommu *map_ioapic_to_ir(int apic)
-{
-	int i;
-
-	for (i = 0; i < MAX_IO_APICS; i++)
-		if (ir_ioapic[i].id == apic)
-			return ir_ioapic[i].iommu;
-	return NULL;
-}
-
-struct intel_iommu *map_dev_to_ir(struct pci_dev *dev)
-{
-	struct dmar_drhd_unit *drhd;
-
-	drhd = dmar_find_matched_drhd_unit(dev);
-	if (!drhd)
-		return NULL;
-
-	return drhd->iommu;
-}
-
-static int clear_entries(struct irq_2_iommu *irq_iommu)
-{
-	struct irte *start, *entry, *end;
-	struct intel_iommu *iommu;
-	int index;
-
-	if (irq_iommu->sub_handle)
-		return 0;
-
-	iommu = irq_iommu->iommu;
-	index = irq_iommu->irte_index + irq_iommu->sub_handle;
-
-	start = iommu->ir_table->base + index;
-	end = start + (1 << irq_iommu->irte_mask);
-
-	for (entry = start; entry < end; entry++) {
-		set_64bit(&entry->low, 0);
-		set_64bit(&entry->high, 0);
-	}
-
-	return qi_flush_iec(iommu, index, irq_iommu->irte_mask);
-}
-
-int free_irte(int irq)
-{
-	struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
-	unsigned long flags;
-	int rc;
-
-	if (!irq_iommu)
-		return -1;
-
-	spin_lock_irqsave(&irq_2_ir_lock, flags);
-
-	rc = clear_entries(irq_iommu);
-
-	irq_iommu->iommu = NULL;
-	irq_iommu->irte_index = 0;
-	irq_iommu->sub_handle = 0;
-	irq_iommu->irte_mask = 0;
-
-	spin_unlock_irqrestore(&irq_2_ir_lock, flags);
-
-	return rc;
-}
-
-/*
- * source validation type
- */
-#define SVT_NO_VERIFY		0x0  /* no verification is required */
-#define SVT_VERIFY_SID_SQ	0x1  /* verify using SID and SQ fields */
-#define SVT_VERIFY_BUS		0x2  /* verify bus of request-id */
-
-/*
- * source-id qualifier
- */
-#define SQ_ALL_16	0x0  /* verify all 16 bits of request-id */
-#define SQ_13_IGNORE_1	0x1  /* verify most significant 13 bits, ignore
-			      * the third least significant bit
-			      */
-#define SQ_13_IGNORE_2	0x2  /* verify most significant 13 bits, ignore
-			      * the second and third least significant bits
-			      */
-#define SQ_13_IGNORE_3	0x3  /* verify most significant 13 bits, ignore
-			      * the least three significant bits
-			      */
-
-/*
- * set SVT, SQ and SID fields of irte to verify
- * source ids of interrupt requests
- */
-static void set_irte_sid(struct irte *irte, unsigned int svt,
-			 unsigned int sq, unsigned int sid)
-{
-	if (disable_sourceid_checking)
-		svt = SVT_NO_VERIFY;
-	irte->svt = svt;
-	irte->sq = sq;
-	irte->sid = sid;
-}
-
-int set_ioapic_sid(struct irte *irte, int apic)
-{
-	int i;
-	u16 sid = 0;
-
-	if (!irte)
-		return -1;
-
-	for (i = 0; i < MAX_IO_APICS; i++) {
-		if (ir_ioapic[i].id == apic) {
-			sid = (ir_ioapic[i].bus << 8) | ir_ioapic[i].devfn;
-			break;
-		}
-	}
-
-	if (sid == 0) {
-		pr_warning("Failed to set source-id of IOAPIC (%d)\n", apic);
-		return -1;
-	}
-
-	set_irte_sid(irte, 1, 0, sid);
-
-	return 0;
-}
-
-int set_hpet_sid(struct irte *irte, u8 id)
-{
-	int i;
-	u16 sid = 0;
-
-	if (!irte)
-		return -1;
-
-	for (i = 0; i < MAX_HPET_TBS; i++) {
-		if (ir_hpet[i].id == id) {
-			sid = (ir_hpet[i].bus << 8) | ir_hpet[i].devfn;
-			break;
-		}
-	}
-
-	if (sid == 0) {
-		pr_warning("Failed to set source-id of HPET block (%d)\n", id);
-		return -1;
-	}
-
-	/*
-	 * Should really use SQ_ALL_16. Some platforms are broken.
-	 * While we figure out the right quirks for these broken platforms, use
-	 * SQ_13_IGNORE_3 for now.
-	 */
-	set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_13_IGNORE_3, sid);
-
-	return 0;
-}
-
-int set_msi_sid(struct irte *irte, struct pci_dev *dev)
-{
-	struct pci_dev *bridge;
-
-	if (!irte || !dev)
-		return -1;
-
-	/* PCIe device or Root Complex integrated PCI device */
-	if (pci_is_pcie(dev) || !dev->bus->parent) {
-		set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_ALL_16,
-			     (dev->bus->number << 8) | dev->devfn);
-		return 0;
-	}
-
-	bridge = pci_find_upstream_pcie_bridge(dev);
-	if (bridge) {
-		if (pci_is_pcie(bridge))/* this is a PCIe-to-PCI/PCIX bridge */
-			set_irte_sid(irte, SVT_VERIFY_BUS, SQ_ALL_16,
-				(bridge->bus->number << 8) | dev->bus->number);
-		else /* this is a legacy PCI bridge */
-			set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_ALL_16,
-				(bridge->bus->number << 8) | bridge->devfn);
-	}
-
-	return 0;
-}
-
-static void iommu_set_intr_remapping(struct intel_iommu *iommu, int mode)
-{
-	u64 addr;
-	u32 sts;
-	unsigned long flags;
-
-	addr = virt_to_phys((void *)iommu->ir_table->base);
-
-	spin_lock_irqsave(&iommu->register_lock, flags);
-
-	dmar_writeq(iommu->reg + DMAR_IRTA_REG,
-		    (addr) | IR_X2APIC_MODE(mode) | INTR_REMAP_TABLE_REG_SIZE);
-
-	/* Set interrupt-remapping table pointer */
-	iommu->gcmd |= DMA_GCMD_SIRTP;
-	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
-
-	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
-		      readl, (sts & DMA_GSTS_IRTPS), sts);
-	spin_unlock_irqrestore(&iommu->register_lock, flags);
-
-	/*
-	 * global invalidation of interrupt entry cache before enabling
-	 * interrupt-remapping.
-	 */
-	qi_global_iec(iommu);
-
-	spin_lock_irqsave(&iommu->register_lock, flags);
-
-	/* Enable interrupt-remapping */
-	iommu->gcmd |= DMA_GCMD_IRE;
-	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
-
-	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
-		      readl, (sts & DMA_GSTS_IRES), sts);
-
-	spin_unlock_irqrestore(&iommu->register_lock, flags);
-}
-
-
-static int setup_intr_remapping(struct intel_iommu *iommu, int mode)
-{
-	struct ir_table *ir_table;
-	struct page *pages;
-
-	ir_table = iommu->ir_table = kzalloc(sizeof(struct ir_table),
-					     GFP_ATOMIC);
-
-	if (!iommu->ir_table)
-		return -ENOMEM;
-
-	pages = alloc_pages_node(iommu->node, GFP_ATOMIC | __GFP_ZERO,
-				 INTR_REMAP_PAGE_ORDER);
-
-	if (!pages) {
-		printk(KERN_ERR "failed to allocate pages of order %d\n",
-		       INTR_REMAP_PAGE_ORDER);
-		kfree(iommu->ir_table);
-		return -ENOMEM;
-	}
-
-	ir_table->base = page_address(pages);
-
-	iommu_set_intr_remapping(iommu, mode);
-	return 0;
-}
-
-/*
- * Disable Interrupt Remapping.
- */
-static void iommu_disable_intr_remapping(struct intel_iommu *iommu)
-{
-	unsigned long flags;
-	u32 sts;
-
-	if (!ecap_ir_support(iommu->ecap))
-		return;
-
-	/*
-	 * global invalidation of interrupt entry cache before disabling
-	 * interrupt-remapping.
-	 */
-	qi_global_iec(iommu);
-
-	spin_lock_irqsave(&iommu->register_lock, flags);
-
-	sts = dmar_readq(iommu->reg + DMAR_GSTS_REG);
-	if (!(sts & DMA_GSTS_IRES))
-		goto end;
-
-	iommu->gcmd &= ~DMA_GCMD_IRE;
-	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
-
-	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
-		      readl, !(sts & DMA_GSTS_IRES), sts);
-
-end:
-	spin_unlock_irqrestore(&iommu->register_lock, flags);
-}
-
-int __init intr_remapping_supported(void)
-{
-	struct dmar_drhd_unit *drhd;
-
-	if (disable_intremap)
-		return 0;
-
-	if (!dmar_ir_support())
-		return 0;
-
-	for_each_drhd_unit(drhd) {
-		struct intel_iommu *iommu = drhd->iommu;
-
-		if (!ecap_ir_support(iommu->ecap))
-			return 0;
-	}
-
-	return 1;
-}
-
-int __init enable_intr_remapping(int eim)
-{
-	struct dmar_drhd_unit *drhd;
-	int setup = 0;
-
-	if (parse_ioapics_under_ir() != 1) {
-		printk(KERN_INFO "Not enable interrupt remapping\n");
-		return -1;
-	}
-
-	for_each_drhd_unit(drhd) {
-		struct intel_iommu *iommu = drhd->iommu;
-
-		/*
-		 * If the queued invalidation is already initialized,
-		 * shouldn't disable it.
-		 */
-		if (iommu->qi)
-			continue;
-
-		/*
-		 * Clear previous faults.
-		 */
-		dmar_fault(-1, iommu);
-
-		/*
-		 * Disable intr remapping and queued invalidation, if already
-		 * enabled prior to OS handover.
-		 */
-		iommu_disable_intr_remapping(iommu);
-
-		dmar_disable_qi(iommu);
-	}
-
-	/*
-	 * check for the Interrupt-remapping support
-	 */
-	for_each_drhd_unit(drhd) {
-		struct intel_iommu *iommu = drhd->iommu;
-
-		if (!ecap_ir_support(iommu->ecap))
-			continue;
-
-		if (eim && !ecap_eim_support(iommu->ecap)) {
-			printk(KERN_INFO "DRHD %Lx: EIM not supported by DRHD, "
-			       " ecap %Lx\n", drhd->reg_base_addr, iommu->ecap);
-			return -1;
-		}
-	}
-
-	/*
-	 * Enable queued invalidation for all the DRHD's.
-	 */
-	for_each_drhd_unit(drhd) {
-		int ret;
-		struct intel_iommu *iommu = drhd->iommu;
-		ret = dmar_enable_qi(iommu);
-
-		if (ret) {
-			printk(KERN_ERR "DRHD %Lx: failed to enable queued, "
-			       " invalidation, ecap %Lx, ret %d\n",
-			       drhd->reg_base_addr, iommu->ecap, ret);
-			return -1;
-		}
-	}
-
-	/*
-	 * Setup Interrupt-remapping for all the DRHD's now.
-	 */
-	for_each_drhd_unit(drhd) {
-		struct intel_iommu *iommu = drhd->iommu;
-
-		if (!ecap_ir_support(iommu->ecap))
-			continue;
-
-		if (setup_intr_remapping(iommu, eim))
-			goto error;
-
-		setup = 1;
-	}
-
-	if (!setup)
-		goto error;
-
-	intr_remapping_enabled = 1;
-
-	return 0;
-
-error:
-	/*
-	 * handle error condition gracefully here!
-	 */
-	return -1;
-}
-
-static void ir_parse_one_hpet_scope(struct acpi_dmar_device_scope *scope,
-				      struct intel_iommu *iommu)
-{
-	struct acpi_dmar_pci_path *path;
-	u8 bus;
-	int count;
-
-	bus = scope->bus;
-	path = (struct acpi_dmar_pci_path *)(scope + 1);
-	count = (scope->length - sizeof(struct acpi_dmar_device_scope))
-		/ sizeof(struct acpi_dmar_pci_path);
-
-	while (--count > 0) {
-		/*
-		 * Access PCI directly due to the PCI
-		 * subsystem isn't initialized yet.
-		 */
-		bus = read_pci_config_byte(bus, path->dev, path->fn,
-					   PCI_SECONDARY_BUS);
-		path++;
-	}
-	ir_hpet[ir_hpet_num].bus   = bus;
-	ir_hpet[ir_hpet_num].devfn = PCI_DEVFN(path->dev, path->fn);
-	ir_hpet[ir_hpet_num].iommu = iommu;
-	ir_hpet[ir_hpet_num].id    = scope->enumeration_id;
-	ir_hpet_num++;
-}
-
-static void ir_parse_one_ioapic_scope(struct acpi_dmar_device_scope *scope,
-				      struct intel_iommu *iommu)
-{
-	struct acpi_dmar_pci_path *path;
-	u8 bus;
-	int count;
-
-	bus = scope->bus;
-	path = (struct acpi_dmar_pci_path *)(scope + 1);
-	count = (scope->length - sizeof(struct acpi_dmar_device_scope))
-		/ sizeof(struct acpi_dmar_pci_path);
-
-	while (--count > 0) {
-		/*
-		 * Access PCI directly due to the PCI
-		 * subsystem isn't initialized yet.
-		 */
-		bus = read_pci_config_byte(bus, path->dev, path->fn,
-					   PCI_SECONDARY_BUS);
-		path++;
-	}
-
-	ir_ioapic[ir_ioapic_num].bus   = bus;
-	ir_ioapic[ir_ioapic_num].devfn = PCI_DEVFN(path->dev, path->fn);
-	ir_ioapic[ir_ioapic_num].iommu = iommu;
-	ir_ioapic[ir_ioapic_num].id    = scope->enumeration_id;
-	ir_ioapic_num++;
-}
-
-static int ir_parse_ioapic_hpet_scope(struct acpi_dmar_header *header,
-				      struct intel_iommu *iommu)
-{
-	struct acpi_dmar_hardware_unit *drhd;
-	struct acpi_dmar_device_scope *scope;
-	void *start, *end;
-
-	drhd = (struct acpi_dmar_hardware_unit *)header;
-
-	start = (void *)(drhd + 1);
-	end = ((void *)drhd) + header->length;
-
-	while (start < end) {
-		scope = start;
-		if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_IOAPIC) {
-			if (ir_ioapic_num == MAX_IO_APICS) {
-				printk(KERN_WARNING "Exceeded Max IO APICS\n");
-				return -1;
-			}
-
-			printk(KERN_INFO "IOAPIC id %d under DRHD base "
-			       " 0x%Lx IOMMU %d\n", scope->enumeration_id,
-			       drhd->address, iommu->seq_id);
-
-			ir_parse_one_ioapic_scope(scope, iommu);
-		} else if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_HPET) {
-			if (ir_hpet_num == MAX_HPET_TBS) {
-				printk(KERN_WARNING "Exceeded Max HPET blocks\n");
-				return -1;
-			}
-
-			printk(KERN_INFO "HPET id %d under DRHD base"
-			       " 0x%Lx\n", scope->enumeration_id,
-			       drhd->address);
-
-			ir_parse_one_hpet_scope(scope, iommu);
-		}
-		start += scope->length;
-	}
-
-	return 0;
-}
-
-/*
- * Finds the assocaition between IOAPIC's and its Interrupt-remapping
- * hardware unit.
- */
-int __init parse_ioapics_under_ir(void)
-{
-	struct dmar_drhd_unit *drhd;
-	int ir_supported = 0;
-
-	for_each_drhd_unit(drhd) {
-		struct intel_iommu *iommu = drhd->iommu;
-
-		if (ecap_ir_support(iommu->ecap)) {
-			if (ir_parse_ioapic_hpet_scope(drhd->hdr, iommu))
-				return -1;
-
-			ir_supported = 1;
-		}
-	}
-
-	if (ir_supported && ir_ioapic_num != nr_ioapics) {
-		printk(KERN_WARNING
-		       "Not all IO-APIC's listed under remapping hardware\n");
-		return -1;
-	}
-
-	return ir_supported;
-}
-
-void disable_intr_remapping(void)
-{
-	struct dmar_drhd_unit *drhd;
-	struct intel_iommu *iommu = NULL;
-
-	/*
-	 * Disable Interrupt-remapping for all the DRHD's now.
-	 */
-	for_each_iommu(iommu, drhd) {
-		if (!ecap_ir_support(iommu->ecap))
-			continue;
-
-		iommu_disable_intr_remapping(iommu);
-	}
-}
-
-int reenable_intr_remapping(int eim)
-{
-	struct dmar_drhd_unit *drhd;
-	int setup = 0;
-	struct intel_iommu *iommu = NULL;
-
-	for_each_iommu(iommu, drhd)
-		if (iommu->qi)
-			dmar_reenable_qi(iommu);
-
-	/*
-	 * Setup Interrupt-remapping for all the DRHD's now.
-	 */
-	for_each_iommu(iommu, drhd) {
-		if (!ecap_ir_support(iommu->ecap))
-			continue;
-
-		/* Set up interrupt remapping for iommu.*/
-		iommu_set_intr_remapping(iommu, eim);
-		setup = 1;
-	}
-
-	if (!setup)
-		goto error;
-
-	return 0;
-
-error:
-	/*
-	 * handle error condition gracefully here!
-	 */
-	return -1;
-}
-
diff --git a/drivers/pci/intr_remapping.h b/drivers/pci/intr_remapping.h
deleted file mode 100644
index 5662fecfee60..000000000000
--- a/drivers/pci/intr_remapping.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#include <linux/intel-iommu.h>
-
-struct ioapic_scope {
-	struct intel_iommu *iommu;
-	unsigned int id;
-	unsigned int bus;	/* PCI bus number */
-	unsigned int devfn;	/* PCI devfn number */
-};
-
-struct hpet_scope {
-	struct intel_iommu *iommu;
-	u8 id;
-	unsigned int bus;
-	unsigned int devfn;
-};
-
-#define IR_X2APIC_MODE(mode) (mode ? (1 << 11) : 0)
diff --git a/drivers/pci/iova.c b/drivers/pci/iova.c
deleted file mode 100644
index c5c274ab5c5a..000000000000
--- a/drivers/pci/iova.c
+++ /dev/null
@@ -1,435 +0,0 @@
-/*
- * Copyright © 2006-2009, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
- *
- * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
- */
-
-#include <linux/iova.h>
-
-void
-init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit)
-{
-	spin_lock_init(&iovad->iova_rbtree_lock);
-	iovad->rbroot = RB_ROOT;
-	iovad->cached32_node = NULL;
-	iovad->dma_32bit_pfn = pfn_32bit;
-}
-
-static struct rb_node *
-__get_cached_rbnode(struct iova_domain *iovad, unsigned long *limit_pfn)
-{
-	if ((*limit_pfn != iovad->dma_32bit_pfn) ||
-		(iovad->cached32_node == NULL))
-		return rb_last(&iovad->rbroot);
-	else {
-		struct rb_node *prev_node = rb_prev(iovad->cached32_node);
-		struct iova *curr_iova =
-			container_of(iovad->cached32_node, struct iova, node);
-		*limit_pfn = curr_iova->pfn_lo - 1;
-		return prev_node;
-	}
-}
-
-static void
-__cached_rbnode_insert_update(struct iova_domain *iovad,
-	unsigned long limit_pfn, struct iova *new)
-{
-	if (limit_pfn != iovad->dma_32bit_pfn)
-		return;
-	iovad->cached32_node = &new->node;
-}
-
-static void
-__cached_rbnode_delete_update(struct iova_domain *iovad, struct iova *free)
-{
-	struct iova *cached_iova;
-	struct rb_node *curr;
-
-	if (!iovad->cached32_node)
-		return;
-	curr = iovad->cached32_node;
-	cached_iova = container_of(curr, struct iova, node);
-
-	if (free->pfn_lo >= cached_iova->pfn_lo) {
-		struct rb_node *node = rb_next(&free->node);
-		struct iova *iova = container_of(node, struct iova, node);
-
-		/* only cache if it's below 32bit pfn */
-		if (node && iova->pfn_lo < iovad->dma_32bit_pfn)
-			iovad->cached32_node = node;
-		else
-			iovad->cached32_node = NULL;
-	}
-}
-
-/* Computes the padding size required, to make the
- * the start address naturally aligned on its size
- */
-static int
-iova_get_pad_size(int size, unsigned int limit_pfn)
-{
-	unsigned int pad_size = 0;
-	unsigned int order = ilog2(size);
-
-	if (order)
-		pad_size = (limit_pfn + 1) % (1 << order);
-
-	return pad_size;
-}
-
-static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
-		unsigned long size, unsigned long limit_pfn,
-			struct iova *new, bool size_aligned)
-{
-	struct rb_node *prev, *curr = NULL;
-	unsigned long flags;
-	unsigned long saved_pfn;
-	unsigned int pad_size = 0;
-
-	/* Walk the tree backwards */
-	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
-	saved_pfn = limit_pfn;
-	curr = __get_cached_rbnode(iovad, &limit_pfn);
-	prev = curr;
-	while (curr) {
-		struct iova *curr_iova = container_of(curr, struct iova, node);
-
-		if (limit_pfn < curr_iova->pfn_lo)
-			goto move_left;
-		else if (limit_pfn < curr_iova->pfn_hi)
-			goto adjust_limit_pfn;
-		else {
-			if (size_aligned)
-				pad_size = iova_get_pad_size(size, limit_pfn);
-			if ((curr_iova->pfn_hi + size + pad_size) <= limit_pfn)
-				break;	/* found a free slot */
-		}
-adjust_limit_pfn:
-		limit_pfn = curr_iova->pfn_lo - 1;
-move_left:
-		prev = curr;
-		curr = rb_prev(curr);
-	}
-
-	if (!curr) {
-		if (size_aligned)
-			pad_size = iova_get_pad_size(size, limit_pfn);
-		if ((IOVA_START_PFN + size + pad_size) > limit_pfn) {
-			spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
-			return -ENOMEM;
-		}
-	}
-
-	/* pfn_lo will point to size aligned address if size_aligned is set */
-	new->pfn_lo = limit_pfn - (size + pad_size) + 1;
-	new->pfn_hi = new->pfn_lo + size - 1;
-
-	/* Insert the new_iova into domain rbtree by holding writer lock */
-	/* Add new node and rebalance tree. */
-	{
-		struct rb_node **entry, *parent = NULL;
-
-		/* If we have 'prev', it's a valid place to start the
-		   insertion. Otherwise, start from the root. */
-		if (prev)
-			entry = &prev;
-		else
-			entry = &iovad->rbroot.rb_node;
-
-		/* Figure out where to put new node */
-		while (*entry) {
-			struct iova *this = container_of(*entry,
-							struct iova, node);
-			parent = *entry;
-
-			if (new->pfn_lo < this->pfn_lo)
-				entry = &((*entry)->rb_left);
-			else if (new->pfn_lo > this->pfn_lo)
-				entry = &((*entry)->rb_right);
-			else
-				BUG(); /* this should not happen */
-		}
-
-		/* Add new node and rebalance tree. */
-		rb_link_node(&new->node, parent, entry);
-		rb_insert_color(&new->node, &iovad->rbroot);
-	}
-	__cached_rbnode_insert_update(iovad, saved_pfn, new);
-
-	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
-
-
-	return 0;
-}
-
-static void
-iova_insert_rbtree(struct rb_root *root, struct iova *iova)
-{
-	struct rb_node **new = &(root->rb_node), *parent = NULL;
-	/* Figure out where to put new node */
-	while (*new) {
-		struct iova *this = container_of(*new, struct iova, node);
-		parent = *new;
-
-		if (iova->pfn_lo < this->pfn_lo)
-			new = &((*new)->rb_left);
-		else if (iova->pfn_lo > this->pfn_lo)
-			new = &((*new)->rb_right);
-		else
-			BUG(); /* this should not happen */
-	}
-	/* Add new node and rebalance tree. */
-	rb_link_node(&iova->node, parent, new);
-	rb_insert_color(&iova->node, root);
-}
-
-/**
- * alloc_iova - allocates an iova
- * @iovad - iova domain in question
- * @size - size of page frames to allocate
- * @limit_pfn - max limit address
- * @size_aligned - set if size_aligned address range is required
- * This function allocates an iova in the range limit_pfn to IOVA_START_PFN
- * looking from limit_pfn instead from IOVA_START_PFN. If the size_aligned
- * flag is set then the allocated address iova->pfn_lo will be naturally
- * aligned on roundup_power_of_two(size).
- */
-struct iova *
-alloc_iova(struct iova_domain *iovad, unsigned long size,
-	unsigned long limit_pfn,
-	bool size_aligned)
-{
-	struct iova *new_iova;
-	int ret;
-
-	new_iova = alloc_iova_mem();
-	if (!new_iova)
-		return NULL;
-
-	/* If size aligned is set then round the size to
-	 * to next power of two.
-	 */
-	if (size_aligned)
-		size = __roundup_pow_of_two(size);
-
-	ret = __alloc_and_insert_iova_range(iovad, size, limit_pfn,
-			new_iova, size_aligned);
-
-	if (ret) {
-		free_iova_mem(new_iova);
-		return NULL;
-	}
-
-	return new_iova;
-}
-
-/**
- * find_iova - find's an iova for a given pfn
- * @iovad - iova domain in question.
- * pfn - page frame number
- * This function finds and returns an iova belonging to the
- * given doamin which matches the given pfn.
- */
-struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn)
-{
-	unsigned long flags;
-	struct rb_node *node;
-
-	/* Take the lock so that no other thread is manipulating the rbtree */
-	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
-	node = iovad->rbroot.rb_node;
-	while (node) {
-		struct iova *iova = container_of(node, struct iova, node);
-
-		/* If pfn falls within iova's range, return iova */
-		if ((pfn >= iova->pfn_lo) && (pfn <= iova->pfn_hi)) {
-			spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
-			/* We are not holding the lock while this iova
-			 * is referenced by the caller as the same thread
-			 * which called this function also calls __free_iova()
-			 * and it is by desing that only one thread can possibly
-			 * reference a particular iova and hence no conflict.
-			 */
-			return iova;
-		}
-
-		if (pfn < iova->pfn_lo)
-			node = node->rb_left;
-		else if (pfn > iova->pfn_lo)
-			node = node->rb_right;
-	}
-
-	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
-	return NULL;
-}
-
-/**
- * __free_iova - frees the given iova
- * @iovad: iova domain in question.
- * @iova: iova in question.
- * Frees the given iova belonging to the giving domain
- */
-void
-__free_iova(struct iova_domain *iovad, struct iova *iova)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
-	__cached_rbnode_delete_update(iovad, iova);
-	rb_erase(&iova->node, &iovad->rbroot);
-	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
-	free_iova_mem(iova);
-}
-
-/**
- * free_iova - finds and frees the iova for a given pfn
- * @iovad: - iova domain in question.
- * @pfn: - pfn that is allocated previously
- * This functions finds an iova for a given pfn and then
- * frees the iova from that domain.
- */
-void
-free_iova(struct iova_domain *iovad, unsigned long pfn)
-{
-	struct iova *iova = find_iova(iovad, pfn);
-	if (iova)
-		__free_iova(iovad, iova);
-
-}
-
-/**
- * put_iova_domain - destroys the iova doamin
- * @iovad: - iova domain in question.
- * All the iova's in that domain are destroyed.
- */
-void put_iova_domain(struct iova_domain *iovad)
-{
-	struct rb_node *node;
-	unsigned long flags;
-
-	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
-	node = rb_first(&iovad->rbroot);
-	while (node) {
-		struct iova *iova = container_of(node, struct iova, node);
-		rb_erase(node, &iovad->rbroot);
-		free_iova_mem(iova);
-		node = rb_first(&iovad->rbroot);
-	}
-	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
-}
-
-static int
-__is_range_overlap(struct rb_node *node,
-	unsigned long pfn_lo, unsigned long pfn_hi)
-{
-	struct iova *iova = container_of(node, struct iova, node);
-
-	if ((pfn_lo <= iova->pfn_hi) && (pfn_hi >= iova->pfn_lo))
-		return 1;
-	return 0;
-}
-
-static struct iova *
-__insert_new_range(struct iova_domain *iovad,
-	unsigned long pfn_lo, unsigned long pfn_hi)
-{
-	struct iova *iova;
-
-	iova = alloc_iova_mem();
-	if (!iova)
-		return iova;
-
-	iova->pfn_hi = pfn_hi;
-	iova->pfn_lo = pfn_lo;
-	iova_insert_rbtree(&iovad->rbroot, iova);
-	return iova;
-}
-
-static void
-__adjust_overlap_range(struct iova *iova,
-	unsigned long *pfn_lo, unsigned long *pfn_hi)
-{
-	if (*pfn_lo < iova->pfn_lo)
-		iova->pfn_lo = *pfn_lo;
-	if (*pfn_hi > iova->pfn_hi)
-		*pfn_lo = iova->pfn_hi + 1;
-}
-
-/**
- * reserve_iova - reserves an iova in the given range
- * @iovad: - iova domain pointer
- * @pfn_lo: - lower page frame address
- * @pfn_hi:- higher pfn adderss
- * This function allocates reserves the address range from pfn_lo to pfn_hi so
- * that this address is not dished out as part of alloc_iova.
- */
-struct iova *
-reserve_iova(struct iova_domain *iovad,
-	unsigned long pfn_lo, unsigned long pfn_hi)
-{
-	struct rb_node *node;
-	unsigned long flags;
-	struct iova *iova;
-	unsigned int overlap = 0;
-
-	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
-	for (node = rb_first(&iovad->rbroot); node; node = rb_next(node)) {
-		if (__is_range_overlap(node, pfn_lo, pfn_hi)) {
-			iova = container_of(node, struct iova, node);
-			__adjust_overlap_range(iova, &pfn_lo, &pfn_hi);
-			if ((pfn_lo >= iova->pfn_lo) &&
-				(pfn_hi <= iova->pfn_hi))
-				goto finish;
-			overlap = 1;
-
-		} else if (overlap)
-				break;
-	}
-
-	/* We are here either because this is the first reserver node
-	 * or need to insert remaining non overlap addr range
-	 */
-	iova = __insert_new_range(iovad, pfn_lo, pfn_hi);
-finish:
-
-	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
-	return iova;
-}
-
-/**
- * copy_reserved_iova - copies the reserved between domains
- * @from: - source doamin from where to copy
- * @to: - destination domin where to copy
- * This function copies reserved iova's from one doamin to
- * other.
- */
-void
-copy_reserved_iova(struct iova_domain *from, struct iova_domain *to)
-{
-	unsigned long flags;
-	struct rb_node *node;
-
-	spin_lock_irqsave(&from->iova_rbtree_lock, flags);
-	for (node = rb_first(&from->rbroot); node; node = rb_next(node)) {
-		struct iova *iova = container_of(node, struct iova, node);
-		struct iova *new_iova;
-		new_iova = reserve_iova(to, iova->pfn_lo, iova->pfn_hi);
-		if (!new_iova)
-			printk(KERN_ERR "Reserve iova range %lx@%lx failed\n",
-				iova->pfn_lo, iova->pfn_lo);
-	}
-	spin_unlock_irqrestore(&from->iova_rbtree_lock, flags);
-}
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 731e20265ace..b7bf11dd546a 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -184,8 +184,6 @@ pci_match_one_device(const struct pci_device_id *id, const struct pci_dev *dev)
 	return NULL;
 }
 
-struct pci_dev *pci_find_upstream_pcie_bridge(struct pci_dev *pdev);
-
 /* PCI slot sysfs helper code */
 #define to_pci_slot(s) container_of(s, struct pci_slot, kobj)
 
diff --git a/include/linux/pci.h b/include/linux/pci.h
index c446b5ca2d38..970bfe0941c3 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1589,5 +1589,16 @@ int pci_vpd_find_tag(const u8 *buf, unsigned int off, unsigned int len, u8 rdt);
 int pci_vpd_find_info_keyword(const u8 *buf, unsigned int off,
 			      unsigned int len, const char *kw);
 
+/**
+ * pci_find_upstream_pcie_bridge - find upstream PCIe-to-PCI bridge of a device
+ * @pdev: the PCI device
+ *
+ * if the device is PCIE, return NULL
+ * if the device isn't connected to a PCIe bridge (that is its parent is a
+ * legacy PCI bridge and the bridge is directly connected to bus 0), return its
+ * parent
+ */
+struct pci_dev *pci_find_upstream_pcie_bridge(struct pci_dev *pdev);
+
 #endif /* __KERNEL__ */
 #endif /* LINUX_PCI_H */
-- 
cgit v1.2.3


From 403f81d8ee532c976d50a5e1051f14ec78ae8db3 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 14 Jun 2011 16:44:25 +0200
Subject: iommu/amd: Move missing parts to drivers/iommu

A few parts of the driver were missing in drivers/iommu.
Move them there to have the complete driver in that
directory.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/include/asm/amd_iommu.h       |   35 -
 arch/x86/include/asm/amd_iommu_proto.h |   54 --
 arch/x86/include/asm/amd_iommu_types.h |  580 ------------
 arch/x86/kernel/Makefile               |    1 -
 arch/x86/kernel/amd_iommu_init.c       | 1572 -------------------------------
 drivers/iommu/Makefile                 |    2 +-
 drivers/iommu/amd_iommu.c              |    7 +-
 drivers/iommu/amd_iommu_init.c         | 1574 ++++++++++++++++++++++++++++++++
 drivers/iommu/amd_iommu_proto.h        |   54 ++
 drivers/iommu/amd_iommu_types.h        |  580 ++++++++++++
 include/linux/amd-iommu.h              |   35 +
 11 files changed, 2248 insertions(+), 2246 deletions(-)
 delete mode 100644 arch/x86/include/asm/amd_iommu.h
 delete mode 100644 arch/x86/include/asm/amd_iommu_proto.h
 delete mode 100644 arch/x86/include/asm/amd_iommu_types.h
 delete mode 100644 arch/x86/kernel/amd_iommu_init.c
 create mode 100644 drivers/iommu/amd_iommu_init.c
 create mode 100644 drivers/iommu/amd_iommu_proto.h
 create mode 100644 drivers/iommu/amd_iommu_types.h
 create mode 100644 include/linux/amd-iommu.h

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
deleted file mode 100644
index a6863a2dec1f..000000000000
--- a/arch/x86/include/asm/amd_iommu.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
- * Author: Joerg Roedel <joerg.roedel@amd.com>
- *         Leo Duran <leo.duran@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
- */
-
-#ifndef _ASM_X86_AMD_IOMMU_H
-#define _ASM_X86_AMD_IOMMU_H
-
-#include <linux/irqreturn.h>
-
-#ifdef CONFIG_AMD_IOMMU
-
-extern int amd_iommu_detect(void);
-
-#else
-
-static inline int amd_iommu_detect(void) { return -ENODEV; }
-
-#endif
-
-#endif /* _ASM_X86_AMD_IOMMU_H */
diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h
deleted file mode 100644
index 55d95eb789b3..000000000000
--- a/arch/x86/include/asm/amd_iommu_proto.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (C) 2009-2010 Advanced Micro Devices, Inc.
- * Author: Joerg Roedel <joerg.roedel@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
- */
-
-#ifndef _ASM_X86_AMD_IOMMU_PROTO_H
-#define _ASM_X86_AMD_IOMMU_PROTO_H
-
-#include <asm/amd_iommu_types.h>
-
-extern int amd_iommu_init_dma_ops(void);
-extern int amd_iommu_init_passthrough(void);
-extern irqreturn_t amd_iommu_int_thread(int irq, void *data);
-extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
-extern void amd_iommu_apply_erratum_63(u16 devid);
-extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu);
-extern int amd_iommu_init_devices(void);
-extern void amd_iommu_uninit_devices(void);
-extern void amd_iommu_init_notifier(void);
-extern void amd_iommu_init_api(void);
-#ifndef CONFIG_AMD_IOMMU_STATS
-
-static inline void amd_iommu_stats_init(void) { }
-
-#endif /* !CONFIG_AMD_IOMMU_STATS */
-
-static inline bool is_rd890_iommu(struct pci_dev *pdev)
-{
-	return (pdev->vendor == PCI_VENDOR_ID_ATI) &&
-	       (pdev->device == PCI_DEVICE_ID_RD890_IOMMU);
-}
-
-static inline bool iommu_feature(struct amd_iommu *iommu, u64 f)
-{
-	if (!(iommu->cap & (1 << IOMMU_CAP_EFR)))
-		return false;
-
-	return !!(iommu->features & f);
-}
-
-#endif /* _ASM_X86_AMD_IOMMU_PROTO_H  */
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
deleted file mode 100644
index 4c9982995414..000000000000
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ /dev/null
@@ -1,580 +0,0 @@
-/*
- * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
- * Author: Joerg Roedel <joerg.roedel@amd.com>
- *         Leo Duran <leo.duran@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
- */
-
-#ifndef _ASM_X86_AMD_IOMMU_TYPES_H
-#define _ASM_X86_AMD_IOMMU_TYPES_H
-
-#include <linux/types.h>
-#include <linux/mutex.h>
-#include <linux/list.h>
-#include <linux/spinlock.h>
-
-/*
- * Maximum number of IOMMUs supported
- */
-#define MAX_IOMMUS	32
-
-/*
- * some size calculation constants
- */
-#define DEV_TABLE_ENTRY_SIZE		32
-#define ALIAS_TABLE_ENTRY_SIZE		2
-#define RLOOKUP_TABLE_ENTRY_SIZE	(sizeof(void *))
-
-/* Length of the MMIO region for the AMD IOMMU */
-#define MMIO_REGION_LENGTH       0x4000
-
-/* Capability offsets used by the driver */
-#define MMIO_CAP_HDR_OFFSET	0x00
-#define MMIO_RANGE_OFFSET	0x0c
-#define MMIO_MISC_OFFSET	0x10
-
-/* Masks, shifts and macros to parse the device range capability */
-#define MMIO_RANGE_LD_MASK	0xff000000
-#define MMIO_RANGE_FD_MASK	0x00ff0000
-#define MMIO_RANGE_BUS_MASK	0x0000ff00
-#define MMIO_RANGE_LD_SHIFT	24
-#define MMIO_RANGE_FD_SHIFT	16
-#define MMIO_RANGE_BUS_SHIFT	8
-#define MMIO_GET_LD(x)  (((x) & MMIO_RANGE_LD_MASK) >> MMIO_RANGE_LD_SHIFT)
-#define MMIO_GET_FD(x)  (((x) & MMIO_RANGE_FD_MASK) >> MMIO_RANGE_FD_SHIFT)
-#define MMIO_GET_BUS(x) (((x) & MMIO_RANGE_BUS_MASK) >> MMIO_RANGE_BUS_SHIFT)
-#define MMIO_MSI_NUM(x)	((x) & 0x1f)
-
-/* Flag masks for the AMD IOMMU exclusion range */
-#define MMIO_EXCL_ENABLE_MASK 0x01ULL
-#define MMIO_EXCL_ALLOW_MASK  0x02ULL
-
-/* Used offsets into the MMIO space */
-#define MMIO_DEV_TABLE_OFFSET   0x0000
-#define MMIO_CMD_BUF_OFFSET     0x0008
-#define MMIO_EVT_BUF_OFFSET     0x0010
-#define MMIO_CONTROL_OFFSET     0x0018
-#define MMIO_EXCL_BASE_OFFSET   0x0020
-#define MMIO_EXCL_LIMIT_OFFSET  0x0028
-#define MMIO_EXT_FEATURES	0x0030
-#define MMIO_CMD_HEAD_OFFSET	0x2000
-#define MMIO_CMD_TAIL_OFFSET	0x2008
-#define MMIO_EVT_HEAD_OFFSET	0x2010
-#define MMIO_EVT_TAIL_OFFSET	0x2018
-#define MMIO_STATUS_OFFSET	0x2020
-
-
-/* Extended Feature Bits */
-#define FEATURE_PREFETCH	(1ULL<<0)
-#define FEATURE_PPR		(1ULL<<1)
-#define FEATURE_X2APIC		(1ULL<<2)
-#define FEATURE_NX		(1ULL<<3)
-#define FEATURE_GT		(1ULL<<4)
-#define FEATURE_IA		(1ULL<<6)
-#define FEATURE_GA		(1ULL<<7)
-#define FEATURE_HE		(1ULL<<8)
-#define FEATURE_PC		(1ULL<<9)
-
-/* MMIO status bits */
-#define MMIO_STATUS_COM_WAIT_INT_MASK	0x04
-
-/* event logging constants */
-#define EVENT_ENTRY_SIZE	0x10
-#define EVENT_TYPE_SHIFT	28
-#define EVENT_TYPE_MASK		0xf
-#define EVENT_TYPE_ILL_DEV	0x1
-#define EVENT_TYPE_IO_FAULT	0x2
-#define EVENT_TYPE_DEV_TAB_ERR	0x3
-#define EVENT_TYPE_PAGE_TAB_ERR	0x4
-#define EVENT_TYPE_ILL_CMD	0x5
-#define EVENT_TYPE_CMD_HARD_ERR	0x6
-#define EVENT_TYPE_IOTLB_INV_TO	0x7
-#define EVENT_TYPE_INV_DEV_REQ	0x8
-#define EVENT_DEVID_MASK	0xffff
-#define EVENT_DEVID_SHIFT	0
-#define EVENT_DOMID_MASK	0xffff
-#define EVENT_DOMID_SHIFT	0
-#define EVENT_FLAGS_MASK	0xfff
-#define EVENT_FLAGS_SHIFT	0x10
-
-/* feature control bits */
-#define CONTROL_IOMMU_EN        0x00ULL
-#define CONTROL_HT_TUN_EN       0x01ULL
-#define CONTROL_EVT_LOG_EN      0x02ULL
-#define CONTROL_EVT_INT_EN      0x03ULL
-#define CONTROL_COMWAIT_EN      0x04ULL
-#define CONTROL_PASSPW_EN       0x08ULL
-#define CONTROL_RESPASSPW_EN    0x09ULL
-#define CONTROL_COHERENT_EN     0x0aULL
-#define CONTROL_ISOC_EN         0x0bULL
-#define CONTROL_CMDBUF_EN       0x0cULL
-#define CONTROL_PPFLOG_EN       0x0dULL
-#define CONTROL_PPFINT_EN       0x0eULL
-
-/* command specific defines */
-#define CMD_COMPL_WAIT          0x01
-#define CMD_INV_DEV_ENTRY       0x02
-#define CMD_INV_IOMMU_PAGES	0x03
-#define CMD_INV_IOTLB_PAGES	0x04
-#define CMD_INV_ALL		0x08
-
-#define CMD_COMPL_WAIT_STORE_MASK	0x01
-#define CMD_COMPL_WAIT_INT_MASK		0x02
-#define CMD_INV_IOMMU_PAGES_SIZE_MASK	0x01
-#define CMD_INV_IOMMU_PAGES_PDE_MASK	0x02
-
-#define CMD_INV_IOMMU_ALL_PAGES_ADDRESS	0x7fffffffffffffffULL
-
-/* macros and definitions for device table entries */
-#define DEV_ENTRY_VALID         0x00
-#define DEV_ENTRY_TRANSLATION   0x01
-#define DEV_ENTRY_IR            0x3d
-#define DEV_ENTRY_IW            0x3e
-#define DEV_ENTRY_NO_PAGE_FAULT	0x62
-#define DEV_ENTRY_EX            0x67
-#define DEV_ENTRY_SYSMGT1       0x68
-#define DEV_ENTRY_SYSMGT2       0x69
-#define DEV_ENTRY_INIT_PASS     0xb8
-#define DEV_ENTRY_EINT_PASS     0xb9
-#define DEV_ENTRY_NMI_PASS      0xba
-#define DEV_ENTRY_LINT0_PASS    0xbe
-#define DEV_ENTRY_LINT1_PASS    0xbf
-#define DEV_ENTRY_MODE_MASK	0x07
-#define DEV_ENTRY_MODE_SHIFT	0x09
-
-/* constants to configure the command buffer */
-#define CMD_BUFFER_SIZE    8192
-#define CMD_BUFFER_UNINITIALIZED 1
-#define CMD_BUFFER_ENTRIES 512
-#define MMIO_CMD_SIZE_SHIFT 56
-#define MMIO_CMD_SIZE_512 (0x9ULL << MMIO_CMD_SIZE_SHIFT)
-
-/* constants for event buffer handling */
-#define EVT_BUFFER_SIZE		8192 /* 512 entries */
-#define EVT_LEN_MASK		(0x9ULL << 56)
-
-#define PAGE_MODE_NONE    0x00
-#define PAGE_MODE_1_LEVEL 0x01
-#define PAGE_MODE_2_LEVEL 0x02
-#define PAGE_MODE_3_LEVEL 0x03
-#define PAGE_MODE_4_LEVEL 0x04
-#define PAGE_MODE_5_LEVEL 0x05
-#define PAGE_MODE_6_LEVEL 0x06
-
-#define PM_LEVEL_SHIFT(x)	(12 + ((x) * 9))
-#define PM_LEVEL_SIZE(x)	(((x) < 6) ? \
-				  ((1ULL << PM_LEVEL_SHIFT((x))) - 1): \
-				   (0xffffffffffffffffULL))
-#define PM_LEVEL_INDEX(x, a)	(((a) >> PM_LEVEL_SHIFT((x))) & 0x1ffULL)
-#define PM_LEVEL_ENC(x)		(((x) << 9) & 0xe00ULL)
-#define PM_LEVEL_PDE(x, a)	((a) | PM_LEVEL_ENC((x)) | \
-				 IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW)
-#define PM_PTE_LEVEL(pte)	(((pte) >> 9) & 0x7ULL)
-
-#define PM_MAP_4k		0
-#define PM_ADDR_MASK		0x000ffffffffff000ULL
-#define PM_MAP_MASK(lvl)	(PM_ADDR_MASK & \
-				(~((1ULL << (12 + ((lvl) * 9))) - 1)))
-#define PM_ALIGNED(lvl, addr)	((PM_MAP_MASK(lvl) & (addr)) == (addr))
-
-/*
- * Returns the page table level to use for a given page size
- * Pagesize is expected to be a power-of-two
- */
-#define PAGE_SIZE_LEVEL(pagesize) \
-		((__ffs(pagesize) - 12) / 9)
-/*
- * Returns the number of ptes to use for a given page size
- * Pagesize is expected to be a power-of-two
- */
-#define PAGE_SIZE_PTE_COUNT(pagesize) \
-		(1ULL << ((__ffs(pagesize) - 12) % 9))
-
-/*
- * Aligns a given io-virtual address to a given page size
- * Pagesize is expected to be a power-of-two
- */
-#define PAGE_SIZE_ALIGN(address, pagesize) \
-		((address) & ~((pagesize) - 1))
-/*
- * Creates an IOMMU PTE for an address an a given pagesize
- * The PTE has no permission bits set
- * Pagesize is expected to be a power-of-two larger than 4096
- */
-#define PAGE_SIZE_PTE(address, pagesize)		\
-		(((address) | ((pagesize) - 1)) &	\
-		 (~(pagesize >> 1)) & PM_ADDR_MASK)
-
-/*
- * Takes a PTE value with mode=0x07 and returns the page size it maps
- */
-#define PTE_PAGE_SIZE(pte) \
-	(1ULL << (1 + ffz(((pte) | 0xfffULL))))
-
-#define IOMMU_PTE_P  (1ULL << 0)
-#define IOMMU_PTE_TV (1ULL << 1)
-#define IOMMU_PTE_U  (1ULL << 59)
-#define IOMMU_PTE_FC (1ULL << 60)
-#define IOMMU_PTE_IR (1ULL << 61)
-#define IOMMU_PTE_IW (1ULL << 62)
-
-#define DTE_FLAG_IOTLB	0x01
-
-#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
-#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P)
-#define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK))
-#define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07)
-
-#define IOMMU_PROT_MASK 0x03
-#define IOMMU_PROT_IR 0x01
-#define IOMMU_PROT_IW 0x02
-
-/* IOMMU capabilities */
-#define IOMMU_CAP_IOTLB   24
-#define IOMMU_CAP_NPCACHE 26
-#define IOMMU_CAP_EFR     27
-
-#define MAX_DOMAIN_ID 65536
-
-/* FIXME: move this macro to <linux/pci.h> */
-#define PCI_BUS(x) (((x) >> 8) & 0xff)
-
-/* Protection domain flags */
-#define PD_DMA_OPS_MASK		(1UL << 0) /* domain used for dma_ops */
-#define PD_DEFAULT_MASK		(1UL << 1) /* domain is a default dma_ops
-					      domain for an IOMMU */
-#define PD_PASSTHROUGH_MASK	(1UL << 2) /* domain has no page
-					      translation */
-
-extern bool amd_iommu_dump;
-#define DUMP_printk(format, arg...)					\
-	do {								\
-		if (amd_iommu_dump)						\
-			printk(KERN_INFO "AMD-Vi: " format, ## arg);	\
-	} while(0);
-
-/* global flag if IOMMUs cache non-present entries */
-extern bool amd_iommu_np_cache;
-/* Only true if all IOMMUs support device IOTLBs */
-extern bool amd_iommu_iotlb_sup;
-
-/*
- * Make iterating over all IOMMUs easier
- */
-#define for_each_iommu(iommu) \
-	list_for_each_entry((iommu), &amd_iommu_list, list)
-#define for_each_iommu_safe(iommu, next) \
-	list_for_each_entry_safe((iommu), (next), &amd_iommu_list, list)
-
-#define APERTURE_RANGE_SHIFT	27	/* 128 MB */
-#define APERTURE_RANGE_SIZE	(1ULL << APERTURE_RANGE_SHIFT)
-#define APERTURE_RANGE_PAGES	(APERTURE_RANGE_SIZE >> PAGE_SHIFT)
-#define APERTURE_MAX_RANGES	32	/* allows 4GB of DMA address space */
-#define APERTURE_RANGE_INDEX(a)	((a) >> APERTURE_RANGE_SHIFT)
-#define APERTURE_PAGE_INDEX(a)	(((a) >> 21) & 0x3fULL)
-
-/*
- * This structure contains generic data for  IOMMU protection domains
- * independent of their use.
- */
-struct protection_domain {
-	struct list_head list;  /* for list of all protection domains */
-	struct list_head dev_list; /* List of all devices in this domain */
-	spinlock_t lock;	/* mostly used to lock the page table*/
-	struct mutex api_lock;	/* protect page tables in the iommu-api path */
-	u16 id;			/* the domain id written to the device table */
-	int mode;		/* paging mode (0-6 levels) */
-	u64 *pt_root;		/* page table root pointer */
-	unsigned long flags;	/* flags to find out type of domain */
-	bool updated;		/* complete domain flush required */
-	unsigned dev_cnt;	/* devices assigned to this domain */
-	unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */
-	void *priv;		/* private data */
-
-};
-
-/*
- * This struct contains device specific data for the IOMMU
- */
-struct iommu_dev_data {
-	struct list_head list;		  /* For domain->dev_list */
-	struct device *dev;		  /* Device this data belong to */
-	struct device *alias;		  /* The Alias Device */
-	struct protection_domain *domain; /* Domain the device is bound to */
-	atomic_t bind;			  /* Domain attach reverent count */
-};
-
-/*
- * For dynamic growth the aperture size is split into ranges of 128MB of
- * DMA address space each. This struct represents one such range.
- */
-struct aperture_range {
-
-	/* address allocation bitmap */
-	unsigned long *bitmap;
-
-	/*
-	 * Array of PTE pages for the aperture. In this array we save all the
-	 * leaf pages of the domain page table used for the aperture. This way
-	 * we don't need to walk the page table to find a specific PTE. We can
-	 * just calculate its address in constant time.
-	 */
-	u64 *pte_pages[64];
-
-	unsigned long offset;
-};
-
-/*
- * Data container for a dma_ops specific protection domain
- */
-struct dma_ops_domain {
-	struct list_head list;
-
-	/* generic protection domain information */
-	struct protection_domain domain;
-
-	/* size of the aperture for the mappings */
-	unsigned long aperture_size;
-
-	/* address we start to search for free addresses */
-	unsigned long next_address;
-
-	/* address space relevant data */
-	struct aperture_range *aperture[APERTURE_MAX_RANGES];
-
-	/* This will be set to true when TLB needs to be flushed */
-	bool need_flush;
-
-	/*
-	 * if this is a preallocated domain, keep the device for which it was
-	 * preallocated in this variable
-	 */
-	u16 target_dev;
-};
-
-/*
- * Structure where we save information about one hardware AMD IOMMU in the
- * system.
- */
-struct amd_iommu {
-	struct list_head list;
-
-	/* Index within the IOMMU array */
-	int index;
-
-	/* locks the accesses to the hardware */
-	spinlock_t lock;
-
-	/* Pointer to PCI device of this IOMMU */
-	struct pci_dev *dev;
-
-	/* physical address of MMIO space */
-	u64 mmio_phys;
-	/* virtual address of MMIO space */
-	u8 *mmio_base;
-
-	/* capabilities of that IOMMU read from ACPI */
-	u32 cap;
-
-	/* flags read from acpi table */
-	u8 acpi_flags;
-
-	/* Extended features */
-	u64 features;
-
-	/*
-	 * Capability pointer. There could be more than one IOMMU per PCI
-	 * device function if there are more than one AMD IOMMU capability
-	 * pointers.
-	 */
-	u16 cap_ptr;
-
-	/* pci domain of this IOMMU */
-	u16 pci_seg;
-
-	/* first device this IOMMU handles. read from PCI */
-	u16 first_device;
-	/* last device this IOMMU handles. read from PCI */
-	u16 last_device;
-
-	/* start of exclusion range of that IOMMU */
-	u64 exclusion_start;
-	/* length of exclusion range of that IOMMU */
-	u64 exclusion_length;
-
-	/* command buffer virtual address */
-	u8 *cmd_buf;
-	/* size of command buffer */
-	u32 cmd_buf_size;
-
-	/* size of event buffer */
-	u32 evt_buf_size;
-	/* event buffer virtual address */
-	u8 *evt_buf;
-	/* MSI number for event interrupt */
-	u16 evt_msi_num;
-
-	/* true if interrupts for this IOMMU are already enabled */
-	bool int_enabled;
-
-	/* if one, we need to send a completion wait command */
-	bool need_sync;
-
-	/* default dma_ops domain for that IOMMU */
-	struct dma_ops_domain *default_dom;
-
-	/*
-	 * We can't rely on the BIOS to restore all values on reinit, so we
-	 * need to stash them
-	 */
-
-	/* The iommu BAR */
-	u32 stored_addr_lo;
-	u32 stored_addr_hi;
-
-	/*
-	 * Each iommu has 6 l1s, each of which is documented as having 0x12
-	 * registers
-	 */
-	u32 stored_l1[6][0x12];
-
-	/* The l2 indirect registers */
-	u32 stored_l2[0x83];
-};
-
-/*
- * List with all IOMMUs in the system. This list is not locked because it is
- * only written and read at driver initialization or suspend time
- */
-extern struct list_head amd_iommu_list;
-
-/*
- * Array with pointers to each IOMMU struct
- * The indices are referenced in the protection domains
- */
-extern struct amd_iommu *amd_iommus[MAX_IOMMUS];
-
-/* Number of IOMMUs present in the system */
-extern int amd_iommus_present;
-
-/*
- * Declarations for the global list of all protection domains
- */
-extern spinlock_t amd_iommu_pd_lock;
-extern struct list_head amd_iommu_pd_list;
-
-/*
- * Structure defining one entry in the device table
- */
-struct dev_table_entry {
-	u32 data[8];
-};
-
-/*
- * One entry for unity mappings parsed out of the ACPI table.
- */
-struct unity_map_entry {
-	struct list_head list;
-
-	/* starting device id this entry is used for (including) */
-	u16 devid_start;
-	/* end device id this entry is used for (including) */
-	u16 devid_end;
-
-	/* start address to unity map (including) */
-	u64 address_start;
-	/* end address to unity map (including) */
-	u64 address_end;
-
-	/* required protection */
-	int prot;
-};
-
-/*
- * List of all unity mappings. It is not locked because as runtime it is only
- * read. It is created at ACPI table parsing time.
- */
-extern struct list_head amd_iommu_unity_map;
-
-/*
- * Data structures for device handling
- */
-
-/*
- * Device table used by hardware. Read and write accesses by software are
- * locked with the amd_iommu_pd_table lock.
- */
-extern struct dev_table_entry *amd_iommu_dev_table;
-
-/*
- * Alias table to find requestor ids to device ids. Not locked because only
- * read on runtime.
- */
-extern u16 *amd_iommu_alias_table;
-
-/*
- * Reverse lookup table to find the IOMMU which translates a specific device.
- */
-extern struct amd_iommu **amd_iommu_rlookup_table;
-
-/* size of the dma_ops aperture as power of 2 */
-extern unsigned amd_iommu_aperture_order;
-
-/* largest PCI device id we expect translation requests for */
-extern u16 amd_iommu_last_bdf;
-
-/* allocation bitmap for domain ids */
-extern unsigned long *amd_iommu_pd_alloc_bitmap;
-
-/*
- * If true, the addresses will be flushed on unmap time, not when
- * they are reused
- */
-extern bool amd_iommu_unmap_flush;
-
-/* takes bus and device/function and returns the device id
- * FIXME: should that be in generic PCI code? */
-static inline u16 calc_devid(u8 bus, u8 devfn)
-{
-	return (((u16)bus) << 8) | devfn;
-}
-
-#ifdef CONFIG_AMD_IOMMU_STATS
-
-struct __iommu_counter {
-	char *name;
-	struct dentry *dent;
-	u64 value;
-};
-
-#define DECLARE_STATS_COUNTER(nm) \
-	static struct __iommu_counter nm = {	\
-		.name = #nm,			\
-	}
-
-#define INC_STATS_COUNTER(name)		name.value += 1
-#define ADD_STATS_COUNTER(name, x)	name.value += (x)
-#define SUB_STATS_COUNTER(name, x)	name.value -= (x)
-
-#else /* CONFIG_AMD_IOMMU_STATS */
-
-#define DECLARE_STATS_COUNTER(name)
-#define INC_STATS_COUNTER(name)
-#define ADD_STATS_COUNTER(name, x)
-#define SUB_STATS_COUNTER(name, x)
-
-#endif /* CONFIG_AMD_IOMMU_STATS */
-
-#endif /* _ASM_X86_AMD_IOMMU_TYPES_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index aef02989c930..11817ff85399 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -123,7 +123,6 @@ ifeq ($(CONFIG_X86_64),y)
 
 	obj-$(CONFIG_GART_IOMMU)	+= amd_gart_64.o aperture_64.o
 	obj-$(CONFIG_CALGARY_IOMMU)	+= pci-calgary_64.o tce_64.o
-	obj-$(CONFIG_AMD_IOMMU)		+= amd_iommu_init.o
 
 	obj-$(CONFIG_PCI_MMCONFIG)	+= mmconf-fam10h_64.o
 	obj-y				+= vsmp_64.o
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
deleted file mode 100644
index bfc8453bd98d..000000000000
--- a/arch/x86/kernel/amd_iommu_init.c
+++ /dev/null
@@ -1,1572 +0,0 @@
-/*
- * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
- * Author: Joerg Roedel <joerg.roedel@amd.com>
- *         Leo Duran <leo.duran@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
- */
-
-#include <linux/pci.h>
-#include <linux/acpi.h>
-#include <linux/list.h>
-#include <linux/slab.h>
-#include <linux/syscore_ops.h>
-#include <linux/interrupt.h>
-#include <linux/msi.h>
-#include <asm/pci-direct.h>
-#include <asm/amd_iommu_proto.h>
-#include <asm/amd_iommu_types.h>
-#include <asm/amd_iommu.h>
-#include <asm/iommu.h>
-#include <asm/gart.h>
-#include <asm/x86_init.h>
-#include <asm/iommu_table.h>
-/*
- * definitions for the ACPI scanning code
- */
-#define IVRS_HEADER_LENGTH 48
-
-#define ACPI_IVHD_TYPE                  0x10
-#define ACPI_IVMD_TYPE_ALL              0x20
-#define ACPI_IVMD_TYPE                  0x21
-#define ACPI_IVMD_TYPE_RANGE            0x22
-
-#define IVHD_DEV_ALL                    0x01
-#define IVHD_DEV_SELECT                 0x02
-#define IVHD_DEV_SELECT_RANGE_START     0x03
-#define IVHD_DEV_RANGE_END              0x04
-#define IVHD_DEV_ALIAS                  0x42
-#define IVHD_DEV_ALIAS_RANGE            0x43
-#define IVHD_DEV_EXT_SELECT             0x46
-#define IVHD_DEV_EXT_SELECT_RANGE       0x47
-
-#define IVHD_FLAG_HT_TUN_EN_MASK        0x01
-#define IVHD_FLAG_PASSPW_EN_MASK        0x02
-#define IVHD_FLAG_RESPASSPW_EN_MASK     0x04
-#define IVHD_FLAG_ISOC_EN_MASK          0x08
-
-#define IVMD_FLAG_EXCL_RANGE            0x08
-#define IVMD_FLAG_UNITY_MAP             0x01
-
-#define ACPI_DEVFLAG_INITPASS           0x01
-#define ACPI_DEVFLAG_EXTINT             0x02
-#define ACPI_DEVFLAG_NMI                0x04
-#define ACPI_DEVFLAG_SYSMGT1            0x10
-#define ACPI_DEVFLAG_SYSMGT2            0x20
-#define ACPI_DEVFLAG_LINT0              0x40
-#define ACPI_DEVFLAG_LINT1              0x80
-#define ACPI_DEVFLAG_ATSDIS             0x10000000
-
-/*
- * ACPI table definitions
- *
- * These data structures are laid over the table to parse the important values
- * out of it.
- */
-
-/*
- * structure describing one IOMMU in the ACPI table. Typically followed by one
- * or more ivhd_entrys.
- */
-struct ivhd_header {
-	u8 type;
-	u8 flags;
-	u16 length;
-	u16 devid;
-	u16 cap_ptr;
-	u64 mmio_phys;
-	u16 pci_seg;
-	u16 info;
-	u32 reserved;
-} __attribute__((packed));
-
-/*
- * A device entry describing which devices a specific IOMMU translates and
- * which requestor ids they use.
- */
-struct ivhd_entry {
-	u8 type;
-	u16 devid;
-	u8 flags;
-	u32 ext;
-} __attribute__((packed));
-
-/*
- * An AMD IOMMU memory definition structure. It defines things like exclusion
- * ranges for devices and regions that should be unity mapped.
- */
-struct ivmd_header {
-	u8 type;
-	u8 flags;
-	u16 length;
-	u16 devid;
-	u16 aux;
-	u64 resv;
-	u64 range_start;
-	u64 range_length;
-} __attribute__((packed));
-
-bool amd_iommu_dump;
-
-static int __initdata amd_iommu_detected;
-static bool __initdata amd_iommu_disabled;
-
-u16 amd_iommu_last_bdf;			/* largest PCI device id we have
-					   to handle */
-LIST_HEAD(amd_iommu_unity_map);		/* a list of required unity mappings
-					   we find in ACPI */
-bool amd_iommu_unmap_flush;		/* if true, flush on every unmap */
-
-LIST_HEAD(amd_iommu_list);		/* list of all AMD IOMMUs in the
-					   system */
-
-/* Array to assign indices to IOMMUs*/
-struct amd_iommu *amd_iommus[MAX_IOMMUS];
-int amd_iommus_present;
-
-/* IOMMUs have a non-present cache? */
-bool amd_iommu_np_cache __read_mostly;
-bool amd_iommu_iotlb_sup __read_mostly = true;
-
-/*
- * The ACPI table parsing functions set this variable on an error
- */
-static int __initdata amd_iommu_init_err;
-
-/*
- * List of protection domains - used during resume
- */
-LIST_HEAD(amd_iommu_pd_list);
-spinlock_t amd_iommu_pd_lock;
-
-/*
- * Pointer to the device table which is shared by all AMD IOMMUs
- * it is indexed by the PCI device id or the HT unit id and contains
- * information about the domain the device belongs to as well as the
- * page table root pointer.
- */
-struct dev_table_entry *amd_iommu_dev_table;
-
-/*
- * The alias table is a driver specific data structure which contains the
- * mappings of the PCI device ids to the actual requestor ids on the IOMMU.
- * More than one device can share the same requestor id.
- */
-u16 *amd_iommu_alias_table;
-
-/*
- * The rlookup table is used to find the IOMMU which is responsible
- * for a specific device. It is also indexed by the PCI device id.
- */
-struct amd_iommu **amd_iommu_rlookup_table;
-
-/*
- * AMD IOMMU allows up to 2^16 differend protection domains. This is a bitmap
- * to know which ones are already in use.
- */
-unsigned long *amd_iommu_pd_alloc_bitmap;
-
-static u32 dev_table_size;	/* size of the device table */
-static u32 alias_table_size;	/* size of the alias table */
-static u32 rlookup_table_size;	/* size if the rlookup table */
-
-/*
- * This function flushes all internal caches of
- * the IOMMU used by this driver.
- */
-extern void iommu_flush_all_caches(struct amd_iommu *iommu);
-
-static inline void update_last_devid(u16 devid)
-{
-	if (devid > amd_iommu_last_bdf)
-		amd_iommu_last_bdf = devid;
-}
-
-static inline unsigned long tbl_size(int entry_size)
-{
-	unsigned shift = PAGE_SHIFT +
-			 get_order(((int)amd_iommu_last_bdf + 1) * entry_size);
-
-	return 1UL << shift;
-}
-
-/* Access to l1 and l2 indexed register spaces */
-
-static u32 iommu_read_l1(struct amd_iommu *iommu, u16 l1, u8 address)
-{
-	u32 val;
-
-	pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
-	pci_read_config_dword(iommu->dev, 0xfc, &val);
-	return val;
-}
-
-static void iommu_write_l1(struct amd_iommu *iommu, u16 l1, u8 address, u32 val)
-{
-	pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16 | 1 << 31));
-	pci_write_config_dword(iommu->dev, 0xfc, val);
-	pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
-}
-
-static u32 iommu_read_l2(struct amd_iommu *iommu, u8 address)
-{
-	u32 val;
-
-	pci_write_config_dword(iommu->dev, 0xf0, address);
-	pci_read_config_dword(iommu->dev, 0xf4, &val);
-	return val;
-}
-
-static void iommu_write_l2(struct amd_iommu *iommu, u8 address, u32 val)
-{
-	pci_write_config_dword(iommu->dev, 0xf0, (address | 1 << 8));
-	pci_write_config_dword(iommu->dev, 0xf4, val);
-}
-
-/****************************************************************************
- *
- * AMD IOMMU MMIO register space handling functions
- *
- * These functions are used to program the IOMMU device registers in
- * MMIO space required for that driver.
- *
- ****************************************************************************/
-
-/*
- * This function set the exclusion range in the IOMMU. DMA accesses to the
- * exclusion range are passed through untranslated
- */
-static void iommu_set_exclusion_range(struct amd_iommu *iommu)
-{
-	u64 start = iommu->exclusion_start & PAGE_MASK;
-	u64 limit = (start + iommu->exclusion_length) & PAGE_MASK;
-	u64 entry;
-
-	if (!iommu->exclusion_start)
-		return;
-
-	entry = start | MMIO_EXCL_ENABLE_MASK;
-	memcpy_toio(iommu->mmio_base + MMIO_EXCL_BASE_OFFSET,
-			&entry, sizeof(entry));
-
-	entry = limit;
-	memcpy_toio(iommu->mmio_base + MMIO_EXCL_LIMIT_OFFSET,
-			&entry, sizeof(entry));
-}
-
-/* Programs the physical address of the device table into the IOMMU hardware */
-static void __init iommu_set_device_table(struct amd_iommu *iommu)
-{
-	u64 entry;
-
-	BUG_ON(iommu->mmio_base == NULL);
-
-	entry = virt_to_phys(amd_iommu_dev_table);
-	entry |= (dev_table_size >> 12) - 1;
-	memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET,
-			&entry, sizeof(entry));
-}
-
-/* Generic functions to enable/disable certain features of the IOMMU. */
-static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
-{
-	u32 ctrl;
-
-	ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
-	ctrl |= (1 << bit);
-	writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
-}
-
-static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
-{
-	u32 ctrl;
-
-	ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
-	ctrl &= ~(1 << bit);
-	writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
-}
-
-/* Function to enable the hardware */
-static void iommu_enable(struct amd_iommu *iommu)
-{
-	static const char * const feat_str[] = {
-		"PreF", "PPR", "X2APIC", "NX", "GT", "[5]",
-		"IA", "GA", "HE", "PC", NULL
-	};
-	int i;
-
-	printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx",
-	       dev_name(&iommu->dev->dev), iommu->cap_ptr);
-
-	if (iommu->cap & (1 << IOMMU_CAP_EFR)) {
-		printk(KERN_CONT " extended features: ");
-		for (i = 0; feat_str[i]; ++i)
-			if (iommu_feature(iommu, (1ULL << i)))
-				printk(KERN_CONT " %s", feat_str[i]);
-	}
-	printk(KERN_CONT "\n");
-
-	iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
-}
-
-static void iommu_disable(struct amd_iommu *iommu)
-{
-	/* Disable command buffer */
-	iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
-
-	/* Disable event logging and event interrupts */
-	iommu_feature_disable(iommu, CONTROL_EVT_INT_EN);
-	iommu_feature_disable(iommu, CONTROL_EVT_LOG_EN);
-
-	/* Disable IOMMU hardware itself */
-	iommu_feature_disable(iommu, CONTROL_IOMMU_EN);
-}
-
-/*
- * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
- * the system has one.
- */
-static u8 * __init iommu_map_mmio_space(u64 address)
-{
-	u8 *ret;
-
-	if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu")) {
-		pr_err("AMD-Vi: Can not reserve memory region %llx for mmio\n",
-			address);
-		pr_err("AMD-Vi: This is a BIOS bug. Please contact your hardware vendor\n");
-		return NULL;
-	}
-
-	ret = ioremap_nocache(address, MMIO_REGION_LENGTH);
-	if (ret != NULL)
-		return ret;
-
-	release_mem_region(address, MMIO_REGION_LENGTH);
-
-	return NULL;
-}
-
-static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
-{
-	if (iommu->mmio_base)
-		iounmap(iommu->mmio_base);
-	release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH);
-}
-
-/****************************************************************************
- *
- * The functions below belong to the first pass of AMD IOMMU ACPI table
- * parsing. In this pass we try to find out the highest device id this
- * code has to handle. Upon this information the size of the shared data
- * structures is determined later.
- *
- ****************************************************************************/
-
-/*
- * This function calculates the length of a given IVHD entry
- */
-static inline int ivhd_entry_length(u8 *ivhd)
-{
-	return 0x04 << (*ivhd >> 6);
-}
-
-/*
- * This function reads the last device id the IOMMU has to handle from the PCI
- * capability header for this IOMMU
- */
-static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr)
-{
-	u32 cap;
-
-	cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
-	update_last_devid(calc_devid(MMIO_GET_BUS(cap), MMIO_GET_LD(cap)));
-
-	return 0;
-}
-
-/*
- * After reading the highest device id from the IOMMU PCI capability header
- * this function looks if there is a higher device id defined in the ACPI table
- */
-static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
-{
-	u8 *p = (void *)h, *end = (void *)h;
-	struct ivhd_entry *dev;
-
-	p += sizeof(*h);
-	end += h->length;
-
-	find_last_devid_on_pci(PCI_BUS(h->devid),
-			PCI_SLOT(h->devid),
-			PCI_FUNC(h->devid),
-			h->cap_ptr);
-
-	while (p < end) {
-		dev = (struct ivhd_entry *)p;
-		switch (dev->type) {
-		case IVHD_DEV_SELECT:
-		case IVHD_DEV_RANGE_END:
-		case IVHD_DEV_ALIAS:
-		case IVHD_DEV_EXT_SELECT:
-			/* all the above subfield types refer to device ids */
-			update_last_devid(dev->devid);
-			break;
-		default:
-			break;
-		}
-		p += ivhd_entry_length(p);
-	}
-
-	WARN_ON(p != end);
-
-	return 0;
-}
-
-/*
- * Iterate over all IVHD entries in the ACPI table and find the highest device
- * id which we need to handle. This is the first of three functions which parse
- * the ACPI table. So we check the checksum here.
- */
-static int __init find_last_devid_acpi(struct acpi_table_header *table)
-{
-	int i;
-	u8 checksum = 0, *p = (u8 *)table, *end = (u8 *)table;
-	struct ivhd_header *h;
-
-	/*
-	 * Validate checksum here so we don't need to do it when
-	 * we actually parse the table
-	 */
-	for (i = 0; i < table->length; ++i)
-		checksum += p[i];
-	if (checksum != 0) {
-		/* ACPI table corrupt */
-		amd_iommu_init_err = -ENODEV;
-		return 0;
-	}
-
-	p += IVRS_HEADER_LENGTH;
-
-	end += table->length;
-	while (p < end) {
-		h = (struct ivhd_header *)p;
-		switch (h->type) {
-		case ACPI_IVHD_TYPE:
-			find_last_devid_from_ivhd(h);
-			break;
-		default:
-			break;
-		}
-		p += h->length;
-	}
-	WARN_ON(p != end);
-
-	return 0;
-}
-
-/****************************************************************************
- *
- * The following functions belong the the code path which parses the ACPI table
- * the second time. In this ACPI parsing iteration we allocate IOMMU specific
- * data structures, initialize the device/alias/rlookup table and also
- * basically initialize the hardware.
- *
- ****************************************************************************/
-
-/*
- * Allocates the command buffer. This buffer is per AMD IOMMU. We can
- * write commands to that buffer later and the IOMMU will execute them
- * asynchronously
- */
-static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
-{
-	u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
-			get_order(CMD_BUFFER_SIZE));
-
-	if (cmd_buf == NULL)
-		return NULL;
-
-	iommu->cmd_buf_size = CMD_BUFFER_SIZE | CMD_BUFFER_UNINITIALIZED;
-
-	return cmd_buf;
-}
-
-/*
- * This function resets the command buffer if the IOMMU stopped fetching
- * commands from it.
- */
-void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu)
-{
-	iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
-
-	writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
-	writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
-
-	iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
-}
-
-/*
- * This function writes the command buffer address to the hardware and
- * enables it.
- */
-static void iommu_enable_command_buffer(struct amd_iommu *iommu)
-{
-	u64 entry;
-
-	BUG_ON(iommu->cmd_buf == NULL);
-
-	entry = (u64)virt_to_phys(iommu->cmd_buf);
-	entry |= MMIO_CMD_SIZE_512;
-
-	memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
-		    &entry, sizeof(entry));
-
-	amd_iommu_reset_cmd_buffer(iommu);
-	iommu->cmd_buf_size &= ~(CMD_BUFFER_UNINITIALIZED);
-}
-
-static void __init free_command_buffer(struct amd_iommu *iommu)
-{
-	free_pages((unsigned long)iommu->cmd_buf,
-		   get_order(iommu->cmd_buf_size & ~(CMD_BUFFER_UNINITIALIZED)));
-}
-
-/* allocates the memory where the IOMMU will log its events to */
-static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
-{
-	iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
-						get_order(EVT_BUFFER_SIZE));
-
-	if (iommu->evt_buf == NULL)
-		return NULL;
-
-	iommu->evt_buf_size = EVT_BUFFER_SIZE;
-
-	return iommu->evt_buf;
-}
-
-static void iommu_enable_event_buffer(struct amd_iommu *iommu)
-{
-	u64 entry;
-
-	BUG_ON(iommu->evt_buf == NULL);
-
-	entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
-
-	memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
-		    &entry, sizeof(entry));
-
-	/* set head and tail to zero manually */
-	writel(0x00, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
-	writel(0x00, iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
-
-	iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
-}
-
-static void __init free_event_buffer(struct amd_iommu *iommu)
-{
-	free_pages((unsigned long)iommu->evt_buf, get_order(EVT_BUFFER_SIZE));
-}
-
-/* sets a specific bit in the device table entry. */
-static void set_dev_entry_bit(u16 devid, u8 bit)
-{
-	int i = (bit >> 5) & 0x07;
-	int _bit = bit & 0x1f;
-
-	amd_iommu_dev_table[devid].data[i] |= (1 << _bit);
-}
-
-static int get_dev_entry_bit(u16 devid, u8 bit)
-{
-	int i = (bit >> 5) & 0x07;
-	int _bit = bit & 0x1f;
-
-	return (amd_iommu_dev_table[devid].data[i] & (1 << _bit)) >> _bit;
-}
-
-
-void amd_iommu_apply_erratum_63(u16 devid)
-{
-	int sysmgt;
-
-	sysmgt = get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1) |
-		 (get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2) << 1);
-
-	if (sysmgt == 0x01)
-		set_dev_entry_bit(devid, DEV_ENTRY_IW);
-}
-
-/* Writes the specific IOMMU for a device into the rlookup table */
-static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
-{
-	amd_iommu_rlookup_table[devid] = iommu;
-}
-
-/*
- * This function takes the device specific flags read from the ACPI
- * table and sets up the device table entry with that information
- */
-static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu,
-					   u16 devid, u32 flags, u32 ext_flags)
-{
-	if (flags & ACPI_DEVFLAG_INITPASS)
-		set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS);
-	if (flags & ACPI_DEVFLAG_EXTINT)
-		set_dev_entry_bit(devid, DEV_ENTRY_EINT_PASS);
-	if (flags & ACPI_DEVFLAG_NMI)
-		set_dev_entry_bit(devid, DEV_ENTRY_NMI_PASS);
-	if (flags & ACPI_DEVFLAG_SYSMGT1)
-		set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1);
-	if (flags & ACPI_DEVFLAG_SYSMGT2)
-		set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2);
-	if (flags & ACPI_DEVFLAG_LINT0)
-		set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS);
-	if (flags & ACPI_DEVFLAG_LINT1)
-		set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS);
-
-	amd_iommu_apply_erratum_63(devid);
-
-	set_iommu_for_device(iommu, devid);
-}
-
-/*
- * Reads the device exclusion range from ACPI and initialize IOMMU with
- * it
- */
-static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
-{
-	struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
-
-	if (!(m->flags & IVMD_FLAG_EXCL_RANGE))
-		return;
-
-	if (iommu) {
-		/*
-		 * We only can configure exclusion ranges per IOMMU, not
-		 * per device. But we can enable the exclusion range per
-		 * device. This is done here
-		 */
-		set_dev_entry_bit(m->devid, DEV_ENTRY_EX);
-		iommu->exclusion_start = m->range_start;
-		iommu->exclusion_length = m->range_length;
-	}
-}
-
-/*
- * This function reads some important data from the IOMMU PCI space and
- * initializes the driver data structure with it. It reads the hardware
- * capabilities and the first/last device entries
- */
-static void __init init_iommu_from_pci(struct amd_iommu *iommu)
-{
-	int cap_ptr = iommu->cap_ptr;
-	u32 range, misc, low, high;
-	int i, j;
-
-	pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
-			      &iommu->cap);
-	pci_read_config_dword(iommu->dev, cap_ptr + MMIO_RANGE_OFFSET,
-			      &range);
-	pci_read_config_dword(iommu->dev, cap_ptr + MMIO_MISC_OFFSET,
-			      &misc);
-
-	iommu->first_device = calc_devid(MMIO_GET_BUS(range),
-					 MMIO_GET_FD(range));
-	iommu->last_device = calc_devid(MMIO_GET_BUS(range),
-					MMIO_GET_LD(range));
-	iommu->evt_msi_num = MMIO_MSI_NUM(misc);
-
-	if (!(iommu->cap & (1 << IOMMU_CAP_IOTLB)))
-		amd_iommu_iotlb_sup = false;
-
-	/* read extended feature bits */
-	low  = readl(iommu->mmio_base + MMIO_EXT_FEATURES);
-	high = readl(iommu->mmio_base + MMIO_EXT_FEATURES + 4);
-
-	iommu->features = ((u64)high << 32) | low;
-
-	if (!is_rd890_iommu(iommu->dev))
-		return;
-
-	/*
-	 * Some rd890 systems may not be fully reconfigured by the BIOS, so
-	 * it's necessary for us to store this information so it can be
-	 * reprogrammed on resume
-	 */
-
-	pci_read_config_dword(iommu->dev, iommu->cap_ptr + 4,
-			      &iommu->stored_addr_lo);
-	pci_read_config_dword(iommu->dev, iommu->cap_ptr + 8,
-			      &iommu->stored_addr_hi);
-
-	/* Low bit locks writes to configuration space */
-	iommu->stored_addr_lo &= ~1;
-
-	for (i = 0; i < 6; i++)
-		for (j = 0; j < 0x12; j++)
-			iommu->stored_l1[i][j] = iommu_read_l1(iommu, i, j);
-
-	for (i = 0; i < 0x83; i++)
-		iommu->stored_l2[i] = iommu_read_l2(iommu, i);
-}
-
-/*
- * Takes a pointer to an AMD IOMMU entry in the ACPI table and
- * initializes the hardware and our data structures with it.
- */
-static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
-					struct ivhd_header *h)
-{
-	u8 *p = (u8 *)h;
-	u8 *end = p, flags = 0;
-	u16 devid = 0, devid_start = 0, devid_to = 0;
-	u32 dev_i, ext_flags = 0;
-	bool alias = false;
-	struct ivhd_entry *e;
-
-	/*
-	 * First save the recommended feature enable bits from ACPI
-	 */
-	iommu->acpi_flags = h->flags;
-
-	/*
-	 * Done. Now parse the device entries
-	 */
-	p += sizeof(struct ivhd_header);
-	end += h->length;
-
-
-	while (p < end) {
-		e = (struct ivhd_entry *)p;
-		switch (e->type) {
-		case IVHD_DEV_ALL:
-
-			DUMP_printk("  DEV_ALL\t\t\t first devid: %02x:%02x.%x"
-				    " last device %02x:%02x.%x flags: %02x\n",
-				    PCI_BUS(iommu->first_device),
-				    PCI_SLOT(iommu->first_device),
-				    PCI_FUNC(iommu->first_device),
-				    PCI_BUS(iommu->last_device),
-				    PCI_SLOT(iommu->last_device),
-				    PCI_FUNC(iommu->last_device),
-				    e->flags);
-
-			for (dev_i = iommu->first_device;
-					dev_i <= iommu->last_device; ++dev_i)
-				set_dev_entry_from_acpi(iommu, dev_i,
-							e->flags, 0);
-			break;
-		case IVHD_DEV_SELECT:
-
-			DUMP_printk("  DEV_SELECT\t\t\t devid: %02x:%02x.%x "
-				    "flags: %02x\n",
-				    PCI_BUS(e->devid),
-				    PCI_SLOT(e->devid),
-				    PCI_FUNC(e->devid),
-				    e->flags);
-
-			devid = e->devid;
-			set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
-			break;
-		case IVHD_DEV_SELECT_RANGE_START:
-
-			DUMP_printk("  DEV_SELECT_RANGE_START\t "
-				    "devid: %02x:%02x.%x flags: %02x\n",
-				    PCI_BUS(e->devid),
-				    PCI_SLOT(e->devid),
-				    PCI_FUNC(e->devid),
-				    e->flags);
-
-			devid_start = e->devid;
-			flags = e->flags;
-			ext_flags = 0;
-			alias = false;
-			break;
-		case IVHD_DEV_ALIAS:
-
-			DUMP_printk("  DEV_ALIAS\t\t\t devid: %02x:%02x.%x "
-				    "flags: %02x devid_to: %02x:%02x.%x\n",
-				    PCI_BUS(e->devid),
-				    PCI_SLOT(e->devid),
-				    PCI_FUNC(e->devid),
-				    e->flags,
-				    PCI_BUS(e->ext >> 8),
-				    PCI_SLOT(e->ext >> 8),
-				    PCI_FUNC(e->ext >> 8));
-
-			devid = e->devid;
-			devid_to = e->ext >> 8;
-			set_dev_entry_from_acpi(iommu, devid   , e->flags, 0);
-			set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0);
-			amd_iommu_alias_table[devid] = devid_to;
-			break;
-		case IVHD_DEV_ALIAS_RANGE:
-
-			DUMP_printk("  DEV_ALIAS_RANGE\t\t "
-				    "devid: %02x:%02x.%x flags: %02x "
-				    "devid_to: %02x:%02x.%x\n",
-				    PCI_BUS(e->devid),
-				    PCI_SLOT(e->devid),
-				    PCI_FUNC(e->devid),
-				    e->flags,
-				    PCI_BUS(e->ext >> 8),
-				    PCI_SLOT(e->ext >> 8),
-				    PCI_FUNC(e->ext >> 8));
-
-			devid_start = e->devid;
-			flags = e->flags;
-			devid_to = e->ext >> 8;
-			ext_flags = 0;
-			alias = true;
-			break;
-		case IVHD_DEV_EXT_SELECT:
-
-			DUMP_printk("  DEV_EXT_SELECT\t\t devid: %02x:%02x.%x "
-				    "flags: %02x ext: %08x\n",
-				    PCI_BUS(e->devid),
-				    PCI_SLOT(e->devid),
-				    PCI_FUNC(e->devid),
-				    e->flags, e->ext);
-
-			devid = e->devid;
-			set_dev_entry_from_acpi(iommu, devid, e->flags,
-						e->ext);
-			break;
-		case IVHD_DEV_EXT_SELECT_RANGE:
-
-			DUMP_printk("  DEV_EXT_SELECT_RANGE\t devid: "
-				    "%02x:%02x.%x flags: %02x ext: %08x\n",
-				    PCI_BUS(e->devid),
-				    PCI_SLOT(e->devid),
-				    PCI_FUNC(e->devid),
-				    e->flags, e->ext);
-
-			devid_start = e->devid;
-			flags = e->flags;
-			ext_flags = e->ext;
-			alias = false;
-			break;
-		case IVHD_DEV_RANGE_END:
-
-			DUMP_printk("  DEV_RANGE_END\t\t devid: %02x:%02x.%x\n",
-				    PCI_BUS(e->devid),
-				    PCI_SLOT(e->devid),
-				    PCI_FUNC(e->devid));
-
-			devid = e->devid;
-			for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
-				if (alias) {
-					amd_iommu_alias_table[dev_i] = devid_to;
-					set_dev_entry_from_acpi(iommu,
-						devid_to, flags, ext_flags);
-				}
-				set_dev_entry_from_acpi(iommu, dev_i,
-							flags, ext_flags);
-			}
-			break;
-		default:
-			break;
-		}
-
-		p += ivhd_entry_length(p);
-	}
-}
-
-/* Initializes the device->iommu mapping for the driver */
-static int __init init_iommu_devices(struct amd_iommu *iommu)
-{
-	u32 i;
-
-	for (i = iommu->first_device; i <= iommu->last_device; ++i)
-		set_iommu_for_device(iommu, i);
-
-	return 0;
-}
-
-static void __init free_iommu_one(struct amd_iommu *iommu)
-{
-	free_command_buffer(iommu);
-	free_event_buffer(iommu);
-	iommu_unmap_mmio_space(iommu);
-}
-
-static void __init free_iommu_all(void)
-{
-	struct amd_iommu *iommu, *next;
-
-	for_each_iommu_safe(iommu, next) {
-		list_del(&iommu->list);
-		free_iommu_one(iommu);
-		kfree(iommu);
-	}
-}
-
-/*
- * This function clues the initialization function for one IOMMU
- * together and also allocates the command buffer and programs the
- * hardware. It does NOT enable the IOMMU. This is done afterwards.
- */
-static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
-{
-	spin_lock_init(&iommu->lock);
-
-	/* Add IOMMU to internal data structures */
-	list_add_tail(&iommu->list, &amd_iommu_list);
-	iommu->index             = amd_iommus_present++;
-
-	if (unlikely(iommu->index >= MAX_IOMMUS)) {
-		WARN(1, "AMD-Vi: System has more IOMMUs than supported by this driver\n");
-		return -ENOSYS;
-	}
-
-	/* Index is fine - add IOMMU to the array */
-	amd_iommus[iommu->index] = iommu;
-
-	/*
-	 * Copy data from ACPI table entry to the iommu struct
-	 */
-	iommu->dev = pci_get_bus_and_slot(PCI_BUS(h->devid), h->devid & 0xff);
-	if (!iommu->dev)
-		return 1;
-
-	iommu->cap_ptr = h->cap_ptr;
-	iommu->pci_seg = h->pci_seg;
-	iommu->mmio_phys = h->mmio_phys;
-	iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys);
-	if (!iommu->mmio_base)
-		return -ENOMEM;
-
-	iommu->cmd_buf = alloc_command_buffer(iommu);
-	if (!iommu->cmd_buf)
-		return -ENOMEM;
-
-	iommu->evt_buf = alloc_event_buffer(iommu);
-	if (!iommu->evt_buf)
-		return -ENOMEM;
-
-	iommu->int_enabled = false;
-
-	init_iommu_from_pci(iommu);
-	init_iommu_from_acpi(iommu, h);
-	init_iommu_devices(iommu);
-
-	if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE))
-		amd_iommu_np_cache = true;
-
-	return pci_enable_device(iommu->dev);
-}
-
-/*
- * Iterates over all IOMMU entries in the ACPI table, allocates the
- * IOMMU structure and initializes it with init_iommu_one()
- */
-static int __init init_iommu_all(struct acpi_table_header *table)
-{
-	u8 *p = (u8 *)table, *end = (u8 *)table;
-	struct ivhd_header *h;
-	struct amd_iommu *iommu;
-	int ret;
-
-	end += table->length;
-	p += IVRS_HEADER_LENGTH;
-
-	while (p < end) {
-		h = (struct ivhd_header *)p;
-		switch (*p) {
-		case ACPI_IVHD_TYPE:
-
-			DUMP_printk("device: %02x:%02x.%01x cap: %04x "
-				    "seg: %d flags: %01x info %04x\n",
-				    PCI_BUS(h->devid), PCI_SLOT(h->devid),
-				    PCI_FUNC(h->devid), h->cap_ptr,
-				    h->pci_seg, h->flags, h->info);
-			DUMP_printk("       mmio-addr: %016llx\n",
-				    h->mmio_phys);
-
-			iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);
-			if (iommu == NULL) {
-				amd_iommu_init_err = -ENOMEM;
-				return 0;
-			}
-
-			ret = init_iommu_one(iommu, h);
-			if (ret) {
-				amd_iommu_init_err = ret;
-				return 0;
-			}
-			break;
-		default:
-			break;
-		}
-		p += h->length;
-
-	}
-	WARN_ON(p != end);
-
-	return 0;
-}
-
-/****************************************************************************
- *
- * The following functions initialize the MSI interrupts for all IOMMUs
- * in the system. Its a bit challenging because there could be multiple
- * IOMMUs per PCI BDF but we can call pci_enable_msi(x) only once per
- * pci_dev.
- *
- ****************************************************************************/
-
-static int iommu_setup_msi(struct amd_iommu *iommu)
-{
-	int r;
-
-	if (pci_enable_msi(iommu->dev))
-		return 1;
-
-	r = request_threaded_irq(iommu->dev->irq,
-				 amd_iommu_int_handler,
-				 amd_iommu_int_thread,
-				 0, "AMD-Vi",
-				 iommu->dev);
-
-	if (r) {
-		pci_disable_msi(iommu->dev);
-		return 1;
-	}
-
-	iommu->int_enabled = true;
-	iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
-
-	return 0;
-}
-
-static int iommu_init_msi(struct amd_iommu *iommu)
-{
-	if (iommu->int_enabled)
-		return 0;
-
-	if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
-		return iommu_setup_msi(iommu);
-
-	return 1;
-}
-
-/****************************************************************************
- *
- * The next functions belong to the third pass of parsing the ACPI
- * table. In this last pass the memory mapping requirements are
- * gathered (like exclusion and unity mapping reanges).
- *
- ****************************************************************************/
-
-static void __init free_unity_maps(void)
-{
-	struct unity_map_entry *entry, *next;
-
-	list_for_each_entry_safe(entry, next, &amd_iommu_unity_map, list) {
-		list_del(&entry->list);
-		kfree(entry);
-	}
-}
-
-/* called when we find an exclusion range definition in ACPI */
-static int __init init_exclusion_range(struct ivmd_header *m)
-{
-	int i;
-
-	switch (m->type) {
-	case ACPI_IVMD_TYPE:
-		set_device_exclusion_range(m->devid, m);
-		break;
-	case ACPI_IVMD_TYPE_ALL:
-		for (i = 0; i <= amd_iommu_last_bdf; ++i)
-			set_device_exclusion_range(i, m);
-		break;
-	case ACPI_IVMD_TYPE_RANGE:
-		for (i = m->devid; i <= m->aux; ++i)
-			set_device_exclusion_range(i, m);
-		break;
-	default:
-		break;
-	}
-
-	return 0;
-}
-
-/* called for unity map ACPI definition */
-static int __init init_unity_map_range(struct ivmd_header *m)
-{
-	struct unity_map_entry *e = 0;
-	char *s;
-
-	e = kzalloc(sizeof(*e), GFP_KERNEL);
-	if (e == NULL)
-		return -ENOMEM;
-
-	switch (m->type) {
-	default:
-		kfree(e);
-		return 0;
-	case ACPI_IVMD_TYPE:
-		s = "IVMD_TYPEi\t\t\t";
-		e->devid_start = e->devid_end = m->devid;
-		break;
-	case ACPI_IVMD_TYPE_ALL:
-		s = "IVMD_TYPE_ALL\t\t";
-		e->devid_start = 0;
-		e->devid_end = amd_iommu_last_bdf;
-		break;
-	case ACPI_IVMD_TYPE_RANGE:
-		s = "IVMD_TYPE_RANGE\t\t";
-		e->devid_start = m->devid;
-		e->devid_end = m->aux;
-		break;
-	}
-	e->address_start = PAGE_ALIGN(m->range_start);
-	e->address_end = e->address_start + PAGE_ALIGN(m->range_length);
-	e->prot = m->flags >> 1;
-
-	DUMP_printk("%s devid_start: %02x:%02x.%x devid_end: %02x:%02x.%x"
-		    " range_start: %016llx range_end: %016llx flags: %x\n", s,
-		    PCI_BUS(e->devid_start), PCI_SLOT(e->devid_start),
-		    PCI_FUNC(e->devid_start), PCI_BUS(e->devid_end),
-		    PCI_SLOT(e->devid_end), PCI_FUNC(e->devid_end),
-		    e->address_start, e->address_end, m->flags);
-
-	list_add_tail(&e->list, &amd_iommu_unity_map);
-
-	return 0;
-}
-
-/* iterates over all memory definitions we find in the ACPI table */
-static int __init init_memory_definitions(struct acpi_table_header *table)
-{
-	u8 *p = (u8 *)table, *end = (u8 *)table;
-	struct ivmd_header *m;
-
-	end += table->length;
-	p += IVRS_HEADER_LENGTH;
-
-	while (p < end) {
-		m = (struct ivmd_header *)p;
-		if (m->flags & IVMD_FLAG_EXCL_RANGE)
-			init_exclusion_range(m);
-		else if (m->flags & IVMD_FLAG_UNITY_MAP)
-			init_unity_map_range(m);
-
-		p += m->length;
-	}
-
-	return 0;
-}
-
-/*
- * Init the device table to not allow DMA access for devices and
- * suppress all page faults
- */
-static void init_device_table(void)
-{
-	u32 devid;
-
-	for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) {
-		set_dev_entry_bit(devid, DEV_ENTRY_VALID);
-		set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION);
-	}
-}
-
-static void iommu_init_flags(struct amd_iommu *iommu)
-{
-	iommu->acpi_flags & IVHD_FLAG_HT_TUN_EN_MASK ?
-		iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) :
-		iommu_feature_disable(iommu, CONTROL_HT_TUN_EN);
-
-	iommu->acpi_flags & IVHD_FLAG_PASSPW_EN_MASK ?
-		iommu_feature_enable(iommu, CONTROL_PASSPW_EN) :
-		iommu_feature_disable(iommu, CONTROL_PASSPW_EN);
-
-	iommu->acpi_flags & IVHD_FLAG_RESPASSPW_EN_MASK ?
-		iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) :
-		iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN);
-
-	iommu->acpi_flags & IVHD_FLAG_ISOC_EN_MASK ?
-		iommu_feature_enable(iommu, CONTROL_ISOC_EN) :
-		iommu_feature_disable(iommu, CONTROL_ISOC_EN);
-
-	/*
-	 * make IOMMU memory accesses cache coherent
-	 */
-	iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
-}
-
-static void iommu_apply_resume_quirks(struct amd_iommu *iommu)
-{
-	int i, j;
-	u32 ioc_feature_control;
-	struct pci_dev *pdev = NULL;
-
-	/* RD890 BIOSes may not have completely reconfigured the iommu */
-	if (!is_rd890_iommu(iommu->dev))
-		return;
-
-	/*
-	 * First, we need to ensure that the iommu is enabled. This is
-	 * controlled by a register in the northbridge
-	 */
-	pdev = pci_get_bus_and_slot(iommu->dev->bus->number, PCI_DEVFN(0, 0));
-
-	if (!pdev)
-		return;
-
-	/* Select Northbridge indirect register 0x75 and enable writing */
-	pci_write_config_dword(pdev, 0x60, 0x75 | (1 << 7));
-	pci_read_config_dword(pdev, 0x64, &ioc_feature_control);
-
-	/* Enable the iommu */
-	if (!(ioc_feature_control & 0x1))
-		pci_write_config_dword(pdev, 0x64, ioc_feature_control | 1);
-
-	pci_dev_put(pdev);
-
-	/* Restore the iommu BAR */
-	pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
-			       iommu->stored_addr_lo);
-	pci_write_config_dword(iommu->dev, iommu->cap_ptr + 8,
-			       iommu->stored_addr_hi);
-
-	/* Restore the l1 indirect regs for each of the 6 l1s */
-	for (i = 0; i < 6; i++)
-		for (j = 0; j < 0x12; j++)
-			iommu_write_l1(iommu, i, j, iommu->stored_l1[i][j]);
-
-	/* Restore the l2 indirect regs */
-	for (i = 0; i < 0x83; i++)
-		iommu_write_l2(iommu, i, iommu->stored_l2[i]);
-
-	/* Lock PCI setup registers */
-	pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
-			       iommu->stored_addr_lo | 1);
-}
-
-/*
- * This function finally enables all IOMMUs found in the system after
- * they have been initialized
- */
-static void enable_iommus(void)
-{
-	struct amd_iommu *iommu;
-
-	for_each_iommu(iommu) {
-		iommu_disable(iommu);
-		iommu_init_flags(iommu);
-		iommu_set_device_table(iommu);
-		iommu_enable_command_buffer(iommu);
-		iommu_enable_event_buffer(iommu);
-		iommu_set_exclusion_range(iommu);
-		iommu_init_msi(iommu);
-		iommu_enable(iommu);
-		iommu_flush_all_caches(iommu);
-	}
-}
-
-static void disable_iommus(void)
-{
-	struct amd_iommu *iommu;
-
-	for_each_iommu(iommu)
-		iommu_disable(iommu);
-}
-
-/*
- * Suspend/Resume support
- * disable suspend until real resume implemented
- */
-
-static void amd_iommu_resume(void)
-{
-	struct amd_iommu *iommu;
-
-	for_each_iommu(iommu)
-		iommu_apply_resume_quirks(iommu);
-
-	/* re-load the hardware */
-	enable_iommus();
-
-	/*
-	 * we have to flush after the IOMMUs are enabled because a
-	 * disabled IOMMU will never execute the commands we send
-	 */
-	for_each_iommu(iommu)
-		iommu_flush_all_caches(iommu);
-}
-
-static int amd_iommu_suspend(void)
-{
-	/* disable IOMMUs to go out of the way for BIOS */
-	disable_iommus();
-
-	return 0;
-}
-
-static struct syscore_ops amd_iommu_syscore_ops = {
-	.suspend = amd_iommu_suspend,
-	.resume = amd_iommu_resume,
-};
-
-/*
- * This is the core init function for AMD IOMMU hardware in the system.
- * This function is called from the generic x86 DMA layer initialization
- * code.
- *
- * This function basically parses the ACPI table for AMD IOMMU (IVRS)
- * three times:
- *
- *	1 pass) Find the highest PCI device id the driver has to handle.
- *		Upon this information the size of the data structures is
- *		determined that needs to be allocated.
- *
- *	2 pass) Initialize the data structures just allocated with the
- *		information in the ACPI table about available AMD IOMMUs
- *		in the system. It also maps the PCI devices in the
- *		system to specific IOMMUs
- *
- *	3 pass) After the basic data structures are allocated and
- *		initialized we update them with information about memory
- *		remapping requirements parsed out of the ACPI table in
- *		this last pass.
- *
- * After that the hardware is initialized and ready to go. In the last
- * step we do some Linux specific things like registering the driver in
- * the dma_ops interface and initializing the suspend/resume support
- * functions. Finally it prints some information about AMD IOMMUs and
- * the driver state and enables the hardware.
- */
-static int __init amd_iommu_init(void)
-{
-	int i, ret = 0;
-
-	/*
-	 * First parse ACPI tables to find the largest Bus/Dev/Func
-	 * we need to handle. Upon this information the shared data
-	 * structures for the IOMMUs in the system will be allocated
-	 */
-	if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0)
-		return -ENODEV;
-
-	ret = amd_iommu_init_err;
-	if (ret)
-		goto out;
-
-	dev_table_size     = tbl_size(DEV_TABLE_ENTRY_SIZE);
-	alias_table_size   = tbl_size(ALIAS_TABLE_ENTRY_SIZE);
-	rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE);
-
-	ret = -ENOMEM;
-
-	/* Device table - directly used by all IOMMUs */
-	amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
-				      get_order(dev_table_size));
-	if (amd_iommu_dev_table == NULL)
-		goto out;
-
-	/*
-	 * Alias table - map PCI Bus/Dev/Func to Bus/Dev/Func the
-	 * IOMMU see for that device
-	 */
-	amd_iommu_alias_table = (void *)__get_free_pages(GFP_KERNEL,
-			get_order(alias_table_size));
-	if (amd_iommu_alias_table == NULL)
-		goto free;
-
-	/* IOMMU rlookup table - find the IOMMU for a specific device */
-	amd_iommu_rlookup_table = (void *)__get_free_pages(
-			GFP_KERNEL | __GFP_ZERO,
-			get_order(rlookup_table_size));
-	if (amd_iommu_rlookup_table == NULL)
-		goto free;
-
-	amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(
-					    GFP_KERNEL | __GFP_ZERO,
-					    get_order(MAX_DOMAIN_ID/8));
-	if (amd_iommu_pd_alloc_bitmap == NULL)
-		goto free;
-
-	/* init the device table */
-	init_device_table();
-
-	/*
-	 * let all alias entries point to itself
-	 */
-	for (i = 0; i <= amd_iommu_last_bdf; ++i)
-		amd_iommu_alias_table[i] = i;
-
-	/*
-	 * never allocate domain 0 because its used as the non-allocated and
-	 * error value placeholder
-	 */
-	amd_iommu_pd_alloc_bitmap[0] = 1;
-
-	spin_lock_init(&amd_iommu_pd_lock);
-
-	/*
-	 * now the data structures are allocated and basically initialized
-	 * start the real acpi table scan
-	 */
-	ret = -ENODEV;
-	if (acpi_table_parse("IVRS", init_iommu_all) != 0)
-		goto free;
-
-	if (amd_iommu_init_err) {
-		ret = amd_iommu_init_err;
-		goto free;
-	}
-
-	if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
-		goto free;
-
-	if (amd_iommu_init_err) {
-		ret = amd_iommu_init_err;
-		goto free;
-	}
-
-	ret = amd_iommu_init_devices();
-	if (ret)
-		goto free;
-
-	enable_iommus();
-
-	if (iommu_pass_through)
-		ret = amd_iommu_init_passthrough();
-	else
-		ret = amd_iommu_init_dma_ops();
-
-	if (ret)
-		goto free_disable;
-
-	amd_iommu_init_api();
-
-	amd_iommu_init_notifier();
-
-	register_syscore_ops(&amd_iommu_syscore_ops);
-
-	if (iommu_pass_through)
-		goto out;
-
-	if (amd_iommu_unmap_flush)
-		printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n");
-	else
-		printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n");
-
-	x86_platform.iommu_shutdown = disable_iommus;
-out:
-	return ret;
-
-free_disable:
-	disable_iommus();
-
-free:
-	amd_iommu_uninit_devices();
-
-	free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
-		   get_order(MAX_DOMAIN_ID/8));
-
-	free_pages((unsigned long)amd_iommu_rlookup_table,
-		   get_order(rlookup_table_size));
-
-	free_pages((unsigned long)amd_iommu_alias_table,
-		   get_order(alias_table_size));
-
-	free_pages((unsigned long)amd_iommu_dev_table,
-		   get_order(dev_table_size));
-
-	free_iommu_all();
-
-	free_unity_maps();
-
-#ifdef CONFIG_GART_IOMMU
-	/*
-	 * We failed to initialize the AMD IOMMU - try fallback to GART
-	 * if possible.
-	 */
-	gart_iommu_init();
-
-#endif
-
-	goto out;
-}
-
-/****************************************************************************
- *
- * Early detect code. This code runs at IOMMU detection time in the DMA
- * layer. It just looks if there is an IVRS ACPI table to detect AMD
- * IOMMUs
- *
- ****************************************************************************/
-static int __init early_amd_iommu_detect(struct acpi_table_header *table)
-{
-	return 0;
-}
-
-int __init amd_iommu_detect(void)
-{
-	if (no_iommu || (iommu_detected && !gart_iommu_aperture))
-		return -ENODEV;
-
-	if (amd_iommu_disabled)
-		return -ENODEV;
-
-	if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
-		iommu_detected = 1;
-		amd_iommu_detected = 1;
-		x86_init.iommu.iommu_init = amd_iommu_init;
-
-		/* Make sure ACS will be enabled */
-		pci_request_acs();
-		return 1;
-	}
-	return -ENODEV;
-}
-
-/****************************************************************************
- *
- * Parsing functions for the AMD IOMMU specific kernel command line
- * options.
- *
- ****************************************************************************/
-
-static int __init parse_amd_iommu_dump(char *str)
-{
-	amd_iommu_dump = true;
-
-	return 1;
-}
-
-static int __init parse_amd_iommu_options(char *str)
-{
-	for (; *str; ++str) {
-		if (strncmp(str, "fullflush", 9) == 0)
-			amd_iommu_unmap_flush = true;
-		if (strncmp(str, "off", 3) == 0)
-			amd_iommu_disabled = true;
-	}
-
-	return 1;
-}
-
-__setup("amd_iommu_dump", parse_amd_iommu_dump);
-__setup("amd_iommu=", parse_amd_iommu_options);
-
-IOMMU_INIT_FINISH(amd_iommu_detect,
-		  gart_iommu_hole_init,
-		  0,
-		  0);
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 49e9c0f46bd5..4d4d77df7cac 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -1,5 +1,5 @@
 obj-$(CONFIG_IOMMU_API) += iommu.o
 obj-$(CONFIG_MSM_IOMMU) += msm_iommu.o msm_iommu_dev.o
-obj-$(CONFIG_AMD_IOMMU) += amd_iommu.o
+obj-$(CONFIG_AMD_IOMMU) += amd_iommu.o amd_iommu_init.o
 obj-$(CONFIG_DMAR) += dmar.o iova.o intel-iommu.o
 obj-$(CONFIG_INTR_REMAP) += dmar.o intr_remapping.o
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 7c3a95e54ec5..5aa12eaabd21 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -27,13 +27,14 @@
 #include <linux/iommu-helper.h>
 #include <linux/iommu.h>
 #include <linux/delay.h>
+#include <linux/amd-iommu.h>
 #include <asm/proto.h>
 #include <asm/iommu.h>
 #include <asm/gart.h>
 #include <asm/dma.h>
-#include <asm/amd_iommu_proto.h>
-#include <asm/amd_iommu_types.h>
-#include <asm/amd_iommu.h>
+
+#include "amd_iommu_proto.h"
+#include "amd_iommu_types.h"
 
 #define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
 
diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
new file mode 100644
index 000000000000..82d2410f4205
--- /dev/null
+++ b/drivers/iommu/amd_iommu_init.c
@@ -0,0 +1,1574 @@
+/*
+ * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <joerg.roedel@amd.com>
+ *         Leo Duran <leo.duran@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/pci.h>
+#include <linux/acpi.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/syscore_ops.h>
+#include <linux/interrupt.h>
+#include <linux/msi.h>
+#include <linux/amd-iommu.h>
+#include <asm/pci-direct.h>
+#include <asm/iommu.h>
+#include <asm/gart.h>
+#include <asm/x86_init.h>
+#include <asm/iommu_table.h>
+
+#include "amd_iommu_proto.h"
+#include "amd_iommu_types.h"
+
+/*
+ * definitions for the ACPI scanning code
+ */
+#define IVRS_HEADER_LENGTH 48
+
+#define ACPI_IVHD_TYPE                  0x10
+#define ACPI_IVMD_TYPE_ALL              0x20
+#define ACPI_IVMD_TYPE                  0x21
+#define ACPI_IVMD_TYPE_RANGE            0x22
+
+#define IVHD_DEV_ALL                    0x01
+#define IVHD_DEV_SELECT                 0x02
+#define IVHD_DEV_SELECT_RANGE_START     0x03
+#define IVHD_DEV_RANGE_END              0x04
+#define IVHD_DEV_ALIAS                  0x42
+#define IVHD_DEV_ALIAS_RANGE            0x43
+#define IVHD_DEV_EXT_SELECT             0x46
+#define IVHD_DEV_EXT_SELECT_RANGE       0x47
+
+#define IVHD_FLAG_HT_TUN_EN_MASK        0x01
+#define IVHD_FLAG_PASSPW_EN_MASK        0x02
+#define IVHD_FLAG_RESPASSPW_EN_MASK     0x04
+#define IVHD_FLAG_ISOC_EN_MASK          0x08
+
+#define IVMD_FLAG_EXCL_RANGE            0x08
+#define IVMD_FLAG_UNITY_MAP             0x01
+
+#define ACPI_DEVFLAG_INITPASS           0x01
+#define ACPI_DEVFLAG_EXTINT             0x02
+#define ACPI_DEVFLAG_NMI                0x04
+#define ACPI_DEVFLAG_SYSMGT1            0x10
+#define ACPI_DEVFLAG_SYSMGT2            0x20
+#define ACPI_DEVFLAG_LINT0              0x40
+#define ACPI_DEVFLAG_LINT1              0x80
+#define ACPI_DEVFLAG_ATSDIS             0x10000000
+
+/*
+ * ACPI table definitions
+ *
+ * These data structures are laid over the table to parse the important values
+ * out of it.
+ */
+
+/*
+ * structure describing one IOMMU in the ACPI table. Typically followed by one
+ * or more ivhd_entrys.
+ */
+struct ivhd_header {
+	u8 type;
+	u8 flags;
+	u16 length;
+	u16 devid;
+	u16 cap_ptr;
+	u64 mmio_phys;
+	u16 pci_seg;
+	u16 info;
+	u32 reserved;
+} __attribute__((packed));
+
+/*
+ * A device entry describing which devices a specific IOMMU translates and
+ * which requestor ids they use.
+ */
+struct ivhd_entry {
+	u8 type;
+	u16 devid;
+	u8 flags;
+	u32 ext;
+} __attribute__((packed));
+
+/*
+ * An AMD IOMMU memory definition structure. It defines things like exclusion
+ * ranges for devices and regions that should be unity mapped.
+ */
+struct ivmd_header {
+	u8 type;
+	u8 flags;
+	u16 length;
+	u16 devid;
+	u16 aux;
+	u64 resv;
+	u64 range_start;
+	u64 range_length;
+} __attribute__((packed));
+
+bool amd_iommu_dump;
+
+static int __initdata amd_iommu_detected;
+static bool __initdata amd_iommu_disabled;
+
+u16 amd_iommu_last_bdf;			/* largest PCI device id we have
+					   to handle */
+LIST_HEAD(amd_iommu_unity_map);		/* a list of required unity mappings
+					   we find in ACPI */
+bool amd_iommu_unmap_flush;		/* if true, flush on every unmap */
+
+LIST_HEAD(amd_iommu_list);		/* list of all AMD IOMMUs in the
+					   system */
+
+/* Array to assign indices to IOMMUs*/
+struct amd_iommu *amd_iommus[MAX_IOMMUS];
+int amd_iommus_present;
+
+/* IOMMUs have a non-present cache? */
+bool amd_iommu_np_cache __read_mostly;
+bool amd_iommu_iotlb_sup __read_mostly = true;
+
+/*
+ * The ACPI table parsing functions set this variable on an error
+ */
+static int __initdata amd_iommu_init_err;
+
+/*
+ * List of protection domains - used during resume
+ */
+LIST_HEAD(amd_iommu_pd_list);
+spinlock_t amd_iommu_pd_lock;
+
+/*
+ * Pointer to the device table which is shared by all AMD IOMMUs
+ * it is indexed by the PCI device id or the HT unit id and contains
+ * information about the domain the device belongs to as well as the
+ * page table root pointer.
+ */
+struct dev_table_entry *amd_iommu_dev_table;
+
+/*
+ * The alias table is a driver specific data structure which contains the
+ * mappings of the PCI device ids to the actual requestor ids on the IOMMU.
+ * More than one device can share the same requestor id.
+ */
+u16 *amd_iommu_alias_table;
+
+/*
+ * The rlookup table is used to find the IOMMU which is responsible
+ * for a specific device. It is also indexed by the PCI device id.
+ */
+struct amd_iommu **amd_iommu_rlookup_table;
+
+/*
+ * AMD IOMMU allows up to 2^16 differend protection domains. This is a bitmap
+ * to know which ones are already in use.
+ */
+unsigned long *amd_iommu_pd_alloc_bitmap;
+
+static u32 dev_table_size;	/* size of the device table */
+static u32 alias_table_size;	/* size of the alias table */
+static u32 rlookup_table_size;	/* size if the rlookup table */
+
+/*
+ * This function flushes all internal caches of
+ * the IOMMU used by this driver.
+ */
+extern void iommu_flush_all_caches(struct amd_iommu *iommu);
+
+static inline void update_last_devid(u16 devid)
+{
+	if (devid > amd_iommu_last_bdf)
+		amd_iommu_last_bdf = devid;
+}
+
+static inline unsigned long tbl_size(int entry_size)
+{
+	unsigned shift = PAGE_SHIFT +
+			 get_order(((int)amd_iommu_last_bdf + 1) * entry_size);
+
+	return 1UL << shift;
+}
+
+/* Access to l1 and l2 indexed register spaces */
+
+static u32 iommu_read_l1(struct amd_iommu *iommu, u16 l1, u8 address)
+{
+	u32 val;
+
+	pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
+	pci_read_config_dword(iommu->dev, 0xfc, &val);
+	return val;
+}
+
+static void iommu_write_l1(struct amd_iommu *iommu, u16 l1, u8 address, u32 val)
+{
+	pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16 | 1 << 31));
+	pci_write_config_dword(iommu->dev, 0xfc, val);
+	pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
+}
+
+static u32 iommu_read_l2(struct amd_iommu *iommu, u8 address)
+{
+	u32 val;
+
+	pci_write_config_dword(iommu->dev, 0xf0, address);
+	pci_read_config_dword(iommu->dev, 0xf4, &val);
+	return val;
+}
+
+static void iommu_write_l2(struct amd_iommu *iommu, u8 address, u32 val)
+{
+	pci_write_config_dword(iommu->dev, 0xf0, (address | 1 << 8));
+	pci_write_config_dword(iommu->dev, 0xf4, val);
+}
+
+/****************************************************************************
+ *
+ * AMD IOMMU MMIO register space handling functions
+ *
+ * These functions are used to program the IOMMU device registers in
+ * MMIO space required for that driver.
+ *
+ ****************************************************************************/
+
+/*
+ * This function set the exclusion range in the IOMMU. DMA accesses to the
+ * exclusion range are passed through untranslated
+ */
+static void iommu_set_exclusion_range(struct amd_iommu *iommu)
+{
+	u64 start = iommu->exclusion_start & PAGE_MASK;
+	u64 limit = (start + iommu->exclusion_length) & PAGE_MASK;
+	u64 entry;
+
+	if (!iommu->exclusion_start)
+		return;
+
+	entry = start | MMIO_EXCL_ENABLE_MASK;
+	memcpy_toio(iommu->mmio_base + MMIO_EXCL_BASE_OFFSET,
+			&entry, sizeof(entry));
+
+	entry = limit;
+	memcpy_toio(iommu->mmio_base + MMIO_EXCL_LIMIT_OFFSET,
+			&entry, sizeof(entry));
+}
+
+/* Programs the physical address of the device table into the IOMMU hardware */
+static void __init iommu_set_device_table(struct amd_iommu *iommu)
+{
+	u64 entry;
+
+	BUG_ON(iommu->mmio_base == NULL);
+
+	entry = virt_to_phys(amd_iommu_dev_table);
+	entry |= (dev_table_size >> 12) - 1;
+	memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET,
+			&entry, sizeof(entry));
+}
+
+/* Generic functions to enable/disable certain features of the IOMMU. */
+static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
+{
+	u32 ctrl;
+
+	ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
+	ctrl |= (1 << bit);
+	writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
+}
+
+static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
+{
+	u32 ctrl;
+
+	ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
+	ctrl &= ~(1 << bit);
+	writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
+}
+
+/* Function to enable the hardware */
+static void iommu_enable(struct amd_iommu *iommu)
+{
+	static const char * const feat_str[] = {
+		"PreF", "PPR", "X2APIC", "NX", "GT", "[5]",
+		"IA", "GA", "HE", "PC", NULL
+	};
+	int i;
+
+	printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx",
+	       dev_name(&iommu->dev->dev), iommu->cap_ptr);
+
+	if (iommu->cap & (1 << IOMMU_CAP_EFR)) {
+		printk(KERN_CONT " extended features: ");
+		for (i = 0; feat_str[i]; ++i)
+			if (iommu_feature(iommu, (1ULL << i)))
+				printk(KERN_CONT " %s", feat_str[i]);
+	}
+	printk(KERN_CONT "\n");
+
+	iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
+}
+
+static void iommu_disable(struct amd_iommu *iommu)
+{
+	/* Disable command buffer */
+	iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
+
+	/* Disable event logging and event interrupts */
+	iommu_feature_disable(iommu, CONTROL_EVT_INT_EN);
+	iommu_feature_disable(iommu, CONTROL_EVT_LOG_EN);
+
+	/* Disable IOMMU hardware itself */
+	iommu_feature_disable(iommu, CONTROL_IOMMU_EN);
+}
+
+/*
+ * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
+ * the system has one.
+ */
+static u8 * __init iommu_map_mmio_space(u64 address)
+{
+	u8 *ret;
+
+	if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu")) {
+		pr_err("AMD-Vi: Can not reserve memory region %llx for mmio\n",
+			address);
+		pr_err("AMD-Vi: This is a BIOS bug. Please contact your hardware vendor\n");
+		return NULL;
+	}
+
+	ret = ioremap_nocache(address, MMIO_REGION_LENGTH);
+	if (ret != NULL)
+		return ret;
+
+	release_mem_region(address, MMIO_REGION_LENGTH);
+
+	return NULL;
+}
+
+static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
+{
+	if (iommu->mmio_base)
+		iounmap(iommu->mmio_base);
+	release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH);
+}
+
+/****************************************************************************
+ *
+ * The functions below belong to the first pass of AMD IOMMU ACPI table
+ * parsing. In this pass we try to find out the highest device id this
+ * code has to handle. Upon this information the size of the shared data
+ * structures is determined later.
+ *
+ ****************************************************************************/
+
+/*
+ * This function calculates the length of a given IVHD entry
+ */
+static inline int ivhd_entry_length(u8 *ivhd)
+{
+	return 0x04 << (*ivhd >> 6);
+}
+
+/*
+ * This function reads the last device id the IOMMU has to handle from the PCI
+ * capability header for this IOMMU
+ */
+static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr)
+{
+	u32 cap;
+
+	cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
+	update_last_devid(calc_devid(MMIO_GET_BUS(cap), MMIO_GET_LD(cap)));
+
+	return 0;
+}
+
+/*
+ * After reading the highest device id from the IOMMU PCI capability header
+ * this function looks if there is a higher device id defined in the ACPI table
+ */
+static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
+{
+	u8 *p = (void *)h, *end = (void *)h;
+	struct ivhd_entry *dev;
+
+	p += sizeof(*h);
+	end += h->length;
+
+	find_last_devid_on_pci(PCI_BUS(h->devid),
+			PCI_SLOT(h->devid),
+			PCI_FUNC(h->devid),
+			h->cap_ptr);
+
+	while (p < end) {
+		dev = (struct ivhd_entry *)p;
+		switch (dev->type) {
+		case IVHD_DEV_SELECT:
+		case IVHD_DEV_RANGE_END:
+		case IVHD_DEV_ALIAS:
+		case IVHD_DEV_EXT_SELECT:
+			/* all the above subfield types refer to device ids */
+			update_last_devid(dev->devid);
+			break;
+		default:
+			break;
+		}
+		p += ivhd_entry_length(p);
+	}
+
+	WARN_ON(p != end);
+
+	return 0;
+}
+
+/*
+ * Iterate over all IVHD entries in the ACPI table and find the highest device
+ * id which we need to handle. This is the first of three functions which parse
+ * the ACPI table. So we check the checksum here.
+ */
+static int __init find_last_devid_acpi(struct acpi_table_header *table)
+{
+	int i;
+	u8 checksum = 0, *p = (u8 *)table, *end = (u8 *)table;
+	struct ivhd_header *h;
+
+	/*
+	 * Validate checksum here so we don't need to do it when
+	 * we actually parse the table
+	 */
+	for (i = 0; i < table->length; ++i)
+		checksum += p[i];
+	if (checksum != 0) {
+		/* ACPI table corrupt */
+		amd_iommu_init_err = -ENODEV;
+		return 0;
+	}
+
+	p += IVRS_HEADER_LENGTH;
+
+	end += table->length;
+	while (p < end) {
+		h = (struct ivhd_header *)p;
+		switch (h->type) {
+		case ACPI_IVHD_TYPE:
+			find_last_devid_from_ivhd(h);
+			break;
+		default:
+			break;
+		}
+		p += h->length;
+	}
+	WARN_ON(p != end);
+
+	return 0;
+}
+
+/****************************************************************************
+ *
+ * The following functions belong the the code path which parses the ACPI table
+ * the second time. In this ACPI parsing iteration we allocate IOMMU specific
+ * data structures, initialize the device/alias/rlookup table and also
+ * basically initialize the hardware.
+ *
+ ****************************************************************************/
+
+/*
+ * Allocates the command buffer. This buffer is per AMD IOMMU. We can
+ * write commands to that buffer later and the IOMMU will execute them
+ * asynchronously
+ */
+static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
+{
+	u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+			get_order(CMD_BUFFER_SIZE));
+
+	if (cmd_buf == NULL)
+		return NULL;
+
+	iommu->cmd_buf_size = CMD_BUFFER_SIZE | CMD_BUFFER_UNINITIALIZED;
+
+	return cmd_buf;
+}
+
+/*
+ * This function resets the command buffer if the IOMMU stopped fetching
+ * commands from it.
+ */
+void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu)
+{
+	iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
+
+	writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
+	writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
+
+	iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
+}
+
+/*
+ * This function writes the command buffer address to the hardware and
+ * enables it.
+ */
+static void iommu_enable_command_buffer(struct amd_iommu *iommu)
+{
+	u64 entry;
+
+	BUG_ON(iommu->cmd_buf == NULL);
+
+	entry = (u64)virt_to_phys(iommu->cmd_buf);
+	entry |= MMIO_CMD_SIZE_512;
+
+	memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
+		    &entry, sizeof(entry));
+
+	amd_iommu_reset_cmd_buffer(iommu);
+	iommu->cmd_buf_size &= ~(CMD_BUFFER_UNINITIALIZED);
+}
+
+static void __init free_command_buffer(struct amd_iommu *iommu)
+{
+	free_pages((unsigned long)iommu->cmd_buf,
+		   get_order(iommu->cmd_buf_size & ~(CMD_BUFFER_UNINITIALIZED)));
+}
+
+/* allocates the memory where the IOMMU will log its events to */
+static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
+{
+	iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+						get_order(EVT_BUFFER_SIZE));
+
+	if (iommu->evt_buf == NULL)
+		return NULL;
+
+	iommu->evt_buf_size = EVT_BUFFER_SIZE;
+
+	return iommu->evt_buf;
+}
+
+static void iommu_enable_event_buffer(struct amd_iommu *iommu)
+{
+	u64 entry;
+
+	BUG_ON(iommu->evt_buf == NULL);
+
+	entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
+
+	memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
+		    &entry, sizeof(entry));
+
+	/* set head and tail to zero manually */
+	writel(0x00, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
+	writel(0x00, iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
+
+	iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
+}
+
+static void __init free_event_buffer(struct amd_iommu *iommu)
+{
+	free_pages((unsigned long)iommu->evt_buf, get_order(EVT_BUFFER_SIZE));
+}
+
+/* sets a specific bit in the device table entry. */
+static void set_dev_entry_bit(u16 devid, u8 bit)
+{
+	int i = (bit >> 5) & 0x07;
+	int _bit = bit & 0x1f;
+
+	amd_iommu_dev_table[devid].data[i] |= (1 << _bit);
+}
+
+static int get_dev_entry_bit(u16 devid, u8 bit)
+{
+	int i = (bit >> 5) & 0x07;
+	int _bit = bit & 0x1f;
+
+	return (amd_iommu_dev_table[devid].data[i] & (1 << _bit)) >> _bit;
+}
+
+
+void amd_iommu_apply_erratum_63(u16 devid)
+{
+	int sysmgt;
+
+	sysmgt = get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1) |
+		 (get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2) << 1);
+
+	if (sysmgt == 0x01)
+		set_dev_entry_bit(devid, DEV_ENTRY_IW);
+}
+
+/* Writes the specific IOMMU for a device into the rlookup table */
+static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
+{
+	amd_iommu_rlookup_table[devid] = iommu;
+}
+
+/*
+ * This function takes the device specific flags read from the ACPI
+ * table and sets up the device table entry with that information
+ */
+static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu,
+					   u16 devid, u32 flags, u32 ext_flags)
+{
+	if (flags & ACPI_DEVFLAG_INITPASS)
+		set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS);
+	if (flags & ACPI_DEVFLAG_EXTINT)
+		set_dev_entry_bit(devid, DEV_ENTRY_EINT_PASS);
+	if (flags & ACPI_DEVFLAG_NMI)
+		set_dev_entry_bit(devid, DEV_ENTRY_NMI_PASS);
+	if (flags & ACPI_DEVFLAG_SYSMGT1)
+		set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1);
+	if (flags & ACPI_DEVFLAG_SYSMGT2)
+		set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2);
+	if (flags & ACPI_DEVFLAG_LINT0)
+		set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS);
+	if (flags & ACPI_DEVFLAG_LINT1)
+		set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS);
+
+	amd_iommu_apply_erratum_63(devid);
+
+	set_iommu_for_device(iommu, devid);
+}
+
+/*
+ * Reads the device exclusion range from ACPI and initialize IOMMU with
+ * it
+ */
+static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
+{
+	struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
+
+	if (!(m->flags & IVMD_FLAG_EXCL_RANGE))
+		return;
+
+	if (iommu) {
+		/*
+		 * We only can configure exclusion ranges per IOMMU, not
+		 * per device. But we can enable the exclusion range per
+		 * device. This is done here
+		 */
+		set_dev_entry_bit(m->devid, DEV_ENTRY_EX);
+		iommu->exclusion_start = m->range_start;
+		iommu->exclusion_length = m->range_length;
+	}
+}
+
+/*
+ * This function reads some important data from the IOMMU PCI space and
+ * initializes the driver data structure with it. It reads the hardware
+ * capabilities and the first/last device entries
+ */
+static void __init init_iommu_from_pci(struct amd_iommu *iommu)
+{
+	int cap_ptr = iommu->cap_ptr;
+	u32 range, misc, low, high;
+	int i, j;
+
+	pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
+			      &iommu->cap);
+	pci_read_config_dword(iommu->dev, cap_ptr + MMIO_RANGE_OFFSET,
+			      &range);
+	pci_read_config_dword(iommu->dev, cap_ptr + MMIO_MISC_OFFSET,
+			      &misc);
+
+	iommu->first_device = calc_devid(MMIO_GET_BUS(range),
+					 MMIO_GET_FD(range));
+	iommu->last_device = calc_devid(MMIO_GET_BUS(range),
+					MMIO_GET_LD(range));
+	iommu->evt_msi_num = MMIO_MSI_NUM(misc);
+
+	if (!(iommu->cap & (1 << IOMMU_CAP_IOTLB)))
+		amd_iommu_iotlb_sup = false;
+
+	/* read extended feature bits */
+	low  = readl(iommu->mmio_base + MMIO_EXT_FEATURES);
+	high = readl(iommu->mmio_base + MMIO_EXT_FEATURES + 4);
+
+	iommu->features = ((u64)high << 32) | low;
+
+	if (!is_rd890_iommu(iommu->dev))
+		return;
+
+	/*
+	 * Some rd890 systems may not be fully reconfigured by the BIOS, so
+	 * it's necessary for us to store this information so it can be
+	 * reprogrammed on resume
+	 */
+
+	pci_read_config_dword(iommu->dev, iommu->cap_ptr + 4,
+			      &iommu->stored_addr_lo);
+	pci_read_config_dword(iommu->dev, iommu->cap_ptr + 8,
+			      &iommu->stored_addr_hi);
+
+	/* Low bit locks writes to configuration space */
+	iommu->stored_addr_lo &= ~1;
+
+	for (i = 0; i < 6; i++)
+		for (j = 0; j < 0x12; j++)
+			iommu->stored_l1[i][j] = iommu_read_l1(iommu, i, j);
+
+	for (i = 0; i < 0x83; i++)
+		iommu->stored_l2[i] = iommu_read_l2(iommu, i);
+}
+
+/*
+ * Takes a pointer to an AMD IOMMU entry in the ACPI table and
+ * initializes the hardware and our data structures with it.
+ */
+static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
+					struct ivhd_header *h)
+{
+	u8 *p = (u8 *)h;
+	u8 *end = p, flags = 0;
+	u16 devid = 0, devid_start = 0, devid_to = 0;
+	u32 dev_i, ext_flags = 0;
+	bool alias = false;
+	struct ivhd_entry *e;
+
+	/*
+	 * First save the recommended feature enable bits from ACPI
+	 */
+	iommu->acpi_flags = h->flags;
+
+	/*
+	 * Done. Now parse the device entries
+	 */
+	p += sizeof(struct ivhd_header);
+	end += h->length;
+
+
+	while (p < end) {
+		e = (struct ivhd_entry *)p;
+		switch (e->type) {
+		case IVHD_DEV_ALL:
+
+			DUMP_printk("  DEV_ALL\t\t\t first devid: %02x:%02x.%x"
+				    " last device %02x:%02x.%x flags: %02x\n",
+				    PCI_BUS(iommu->first_device),
+				    PCI_SLOT(iommu->first_device),
+				    PCI_FUNC(iommu->first_device),
+				    PCI_BUS(iommu->last_device),
+				    PCI_SLOT(iommu->last_device),
+				    PCI_FUNC(iommu->last_device),
+				    e->flags);
+
+			for (dev_i = iommu->first_device;
+					dev_i <= iommu->last_device; ++dev_i)
+				set_dev_entry_from_acpi(iommu, dev_i,
+							e->flags, 0);
+			break;
+		case IVHD_DEV_SELECT:
+
+			DUMP_printk("  DEV_SELECT\t\t\t devid: %02x:%02x.%x "
+				    "flags: %02x\n",
+				    PCI_BUS(e->devid),
+				    PCI_SLOT(e->devid),
+				    PCI_FUNC(e->devid),
+				    e->flags);
+
+			devid = e->devid;
+			set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
+			break;
+		case IVHD_DEV_SELECT_RANGE_START:
+
+			DUMP_printk("  DEV_SELECT_RANGE_START\t "
+				    "devid: %02x:%02x.%x flags: %02x\n",
+				    PCI_BUS(e->devid),
+				    PCI_SLOT(e->devid),
+				    PCI_FUNC(e->devid),
+				    e->flags);
+
+			devid_start = e->devid;
+			flags = e->flags;
+			ext_flags = 0;
+			alias = false;
+			break;
+		case IVHD_DEV_ALIAS:
+
+			DUMP_printk("  DEV_ALIAS\t\t\t devid: %02x:%02x.%x "
+				    "flags: %02x devid_to: %02x:%02x.%x\n",
+				    PCI_BUS(e->devid),
+				    PCI_SLOT(e->devid),
+				    PCI_FUNC(e->devid),
+				    e->flags,
+				    PCI_BUS(e->ext >> 8),
+				    PCI_SLOT(e->ext >> 8),
+				    PCI_FUNC(e->ext >> 8));
+
+			devid = e->devid;
+			devid_to = e->ext >> 8;
+			set_dev_entry_from_acpi(iommu, devid   , e->flags, 0);
+			set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0);
+			amd_iommu_alias_table[devid] = devid_to;
+			break;
+		case IVHD_DEV_ALIAS_RANGE:
+
+			DUMP_printk("  DEV_ALIAS_RANGE\t\t "
+				    "devid: %02x:%02x.%x flags: %02x "
+				    "devid_to: %02x:%02x.%x\n",
+				    PCI_BUS(e->devid),
+				    PCI_SLOT(e->devid),
+				    PCI_FUNC(e->devid),
+				    e->flags,
+				    PCI_BUS(e->ext >> 8),
+				    PCI_SLOT(e->ext >> 8),
+				    PCI_FUNC(e->ext >> 8));
+
+			devid_start = e->devid;
+			flags = e->flags;
+			devid_to = e->ext >> 8;
+			ext_flags = 0;
+			alias = true;
+			break;
+		case IVHD_DEV_EXT_SELECT:
+
+			DUMP_printk("  DEV_EXT_SELECT\t\t devid: %02x:%02x.%x "
+				    "flags: %02x ext: %08x\n",
+				    PCI_BUS(e->devid),
+				    PCI_SLOT(e->devid),
+				    PCI_FUNC(e->devid),
+				    e->flags, e->ext);
+
+			devid = e->devid;
+			set_dev_entry_from_acpi(iommu, devid, e->flags,
+						e->ext);
+			break;
+		case IVHD_DEV_EXT_SELECT_RANGE:
+
+			DUMP_printk("  DEV_EXT_SELECT_RANGE\t devid: "
+				    "%02x:%02x.%x flags: %02x ext: %08x\n",
+				    PCI_BUS(e->devid),
+				    PCI_SLOT(e->devid),
+				    PCI_FUNC(e->devid),
+				    e->flags, e->ext);
+
+			devid_start = e->devid;
+			flags = e->flags;
+			ext_flags = e->ext;
+			alias = false;
+			break;
+		case IVHD_DEV_RANGE_END:
+
+			DUMP_printk("  DEV_RANGE_END\t\t devid: %02x:%02x.%x\n",
+				    PCI_BUS(e->devid),
+				    PCI_SLOT(e->devid),
+				    PCI_FUNC(e->devid));
+
+			devid = e->devid;
+			for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
+				if (alias) {
+					amd_iommu_alias_table[dev_i] = devid_to;
+					set_dev_entry_from_acpi(iommu,
+						devid_to, flags, ext_flags);
+				}
+				set_dev_entry_from_acpi(iommu, dev_i,
+							flags, ext_flags);
+			}
+			break;
+		default:
+			break;
+		}
+
+		p += ivhd_entry_length(p);
+	}
+}
+
+/* Initializes the device->iommu mapping for the driver */
+static int __init init_iommu_devices(struct amd_iommu *iommu)
+{
+	u32 i;
+
+	for (i = iommu->first_device; i <= iommu->last_device; ++i)
+		set_iommu_for_device(iommu, i);
+
+	return 0;
+}
+
+static void __init free_iommu_one(struct amd_iommu *iommu)
+{
+	free_command_buffer(iommu);
+	free_event_buffer(iommu);
+	iommu_unmap_mmio_space(iommu);
+}
+
+static void __init free_iommu_all(void)
+{
+	struct amd_iommu *iommu, *next;
+
+	for_each_iommu_safe(iommu, next) {
+		list_del(&iommu->list);
+		free_iommu_one(iommu);
+		kfree(iommu);
+	}
+}
+
+/*
+ * This function clues the initialization function for one IOMMU
+ * together and also allocates the command buffer and programs the
+ * hardware. It does NOT enable the IOMMU. This is done afterwards.
+ */
+static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
+{
+	spin_lock_init(&iommu->lock);
+
+	/* Add IOMMU to internal data structures */
+	list_add_tail(&iommu->list, &amd_iommu_list);
+	iommu->index             = amd_iommus_present++;
+
+	if (unlikely(iommu->index >= MAX_IOMMUS)) {
+		WARN(1, "AMD-Vi: System has more IOMMUs than supported by this driver\n");
+		return -ENOSYS;
+	}
+
+	/* Index is fine - add IOMMU to the array */
+	amd_iommus[iommu->index] = iommu;
+
+	/*
+	 * Copy data from ACPI table entry to the iommu struct
+	 */
+	iommu->dev = pci_get_bus_and_slot(PCI_BUS(h->devid), h->devid & 0xff);
+	if (!iommu->dev)
+		return 1;
+
+	iommu->cap_ptr = h->cap_ptr;
+	iommu->pci_seg = h->pci_seg;
+	iommu->mmio_phys = h->mmio_phys;
+	iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys);
+	if (!iommu->mmio_base)
+		return -ENOMEM;
+
+	iommu->cmd_buf = alloc_command_buffer(iommu);
+	if (!iommu->cmd_buf)
+		return -ENOMEM;
+
+	iommu->evt_buf = alloc_event_buffer(iommu);
+	if (!iommu->evt_buf)
+		return -ENOMEM;
+
+	iommu->int_enabled = false;
+
+	init_iommu_from_pci(iommu);
+	init_iommu_from_acpi(iommu, h);
+	init_iommu_devices(iommu);
+
+	if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE))
+		amd_iommu_np_cache = true;
+
+	return pci_enable_device(iommu->dev);
+}
+
+/*
+ * Iterates over all IOMMU entries in the ACPI table, allocates the
+ * IOMMU structure and initializes it with init_iommu_one()
+ */
+static int __init init_iommu_all(struct acpi_table_header *table)
+{
+	u8 *p = (u8 *)table, *end = (u8 *)table;
+	struct ivhd_header *h;
+	struct amd_iommu *iommu;
+	int ret;
+
+	end += table->length;
+	p += IVRS_HEADER_LENGTH;
+
+	while (p < end) {
+		h = (struct ivhd_header *)p;
+		switch (*p) {
+		case ACPI_IVHD_TYPE:
+
+			DUMP_printk("device: %02x:%02x.%01x cap: %04x "
+				    "seg: %d flags: %01x info %04x\n",
+				    PCI_BUS(h->devid), PCI_SLOT(h->devid),
+				    PCI_FUNC(h->devid), h->cap_ptr,
+				    h->pci_seg, h->flags, h->info);
+			DUMP_printk("       mmio-addr: %016llx\n",
+				    h->mmio_phys);
+
+			iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);
+			if (iommu == NULL) {
+				amd_iommu_init_err = -ENOMEM;
+				return 0;
+			}
+
+			ret = init_iommu_one(iommu, h);
+			if (ret) {
+				amd_iommu_init_err = ret;
+				return 0;
+			}
+			break;
+		default:
+			break;
+		}
+		p += h->length;
+
+	}
+	WARN_ON(p != end);
+
+	return 0;
+}
+
+/****************************************************************************
+ *
+ * The following functions initialize the MSI interrupts for all IOMMUs
+ * in the system. Its a bit challenging because there could be multiple
+ * IOMMUs per PCI BDF but we can call pci_enable_msi(x) only once per
+ * pci_dev.
+ *
+ ****************************************************************************/
+
+static int iommu_setup_msi(struct amd_iommu *iommu)
+{
+	int r;
+
+	if (pci_enable_msi(iommu->dev))
+		return 1;
+
+	r = request_threaded_irq(iommu->dev->irq,
+				 amd_iommu_int_handler,
+				 amd_iommu_int_thread,
+				 0, "AMD-Vi",
+				 iommu->dev);
+
+	if (r) {
+		pci_disable_msi(iommu->dev);
+		return 1;
+	}
+
+	iommu->int_enabled = true;
+	iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
+
+	return 0;
+}
+
+static int iommu_init_msi(struct amd_iommu *iommu)
+{
+	if (iommu->int_enabled)
+		return 0;
+
+	if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
+		return iommu_setup_msi(iommu);
+
+	return 1;
+}
+
+/****************************************************************************
+ *
+ * The next functions belong to the third pass of parsing the ACPI
+ * table. In this last pass the memory mapping requirements are
+ * gathered (like exclusion and unity mapping reanges).
+ *
+ ****************************************************************************/
+
+static void __init free_unity_maps(void)
+{
+	struct unity_map_entry *entry, *next;
+
+	list_for_each_entry_safe(entry, next, &amd_iommu_unity_map, list) {
+		list_del(&entry->list);
+		kfree(entry);
+	}
+}
+
+/* called when we find an exclusion range definition in ACPI */
+static int __init init_exclusion_range(struct ivmd_header *m)
+{
+	int i;
+
+	switch (m->type) {
+	case ACPI_IVMD_TYPE:
+		set_device_exclusion_range(m->devid, m);
+		break;
+	case ACPI_IVMD_TYPE_ALL:
+		for (i = 0; i <= amd_iommu_last_bdf; ++i)
+			set_device_exclusion_range(i, m);
+		break;
+	case ACPI_IVMD_TYPE_RANGE:
+		for (i = m->devid; i <= m->aux; ++i)
+			set_device_exclusion_range(i, m);
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+/* called for unity map ACPI definition */
+static int __init init_unity_map_range(struct ivmd_header *m)
+{
+	struct unity_map_entry *e = 0;
+	char *s;
+
+	e = kzalloc(sizeof(*e), GFP_KERNEL);
+	if (e == NULL)
+		return -ENOMEM;
+
+	switch (m->type) {
+	default:
+		kfree(e);
+		return 0;
+	case ACPI_IVMD_TYPE:
+		s = "IVMD_TYPEi\t\t\t";
+		e->devid_start = e->devid_end = m->devid;
+		break;
+	case ACPI_IVMD_TYPE_ALL:
+		s = "IVMD_TYPE_ALL\t\t";
+		e->devid_start = 0;
+		e->devid_end = amd_iommu_last_bdf;
+		break;
+	case ACPI_IVMD_TYPE_RANGE:
+		s = "IVMD_TYPE_RANGE\t\t";
+		e->devid_start = m->devid;
+		e->devid_end = m->aux;
+		break;
+	}
+	e->address_start = PAGE_ALIGN(m->range_start);
+	e->address_end = e->address_start + PAGE_ALIGN(m->range_length);
+	e->prot = m->flags >> 1;
+
+	DUMP_printk("%s devid_start: %02x:%02x.%x devid_end: %02x:%02x.%x"
+		    " range_start: %016llx range_end: %016llx flags: %x\n", s,
+		    PCI_BUS(e->devid_start), PCI_SLOT(e->devid_start),
+		    PCI_FUNC(e->devid_start), PCI_BUS(e->devid_end),
+		    PCI_SLOT(e->devid_end), PCI_FUNC(e->devid_end),
+		    e->address_start, e->address_end, m->flags);
+
+	list_add_tail(&e->list, &amd_iommu_unity_map);
+
+	return 0;
+}
+
+/* iterates over all memory definitions we find in the ACPI table */
+static int __init init_memory_definitions(struct acpi_table_header *table)
+{
+	u8 *p = (u8 *)table, *end = (u8 *)table;
+	struct ivmd_header *m;
+
+	end += table->length;
+	p += IVRS_HEADER_LENGTH;
+
+	while (p < end) {
+		m = (struct ivmd_header *)p;
+		if (m->flags & IVMD_FLAG_EXCL_RANGE)
+			init_exclusion_range(m);
+		else if (m->flags & IVMD_FLAG_UNITY_MAP)
+			init_unity_map_range(m);
+
+		p += m->length;
+	}
+
+	return 0;
+}
+
+/*
+ * Init the device table to not allow DMA access for devices and
+ * suppress all page faults
+ */
+static void init_device_table(void)
+{
+	u32 devid;
+
+	for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) {
+		set_dev_entry_bit(devid, DEV_ENTRY_VALID);
+		set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION);
+	}
+}
+
+static void iommu_init_flags(struct amd_iommu *iommu)
+{
+	iommu->acpi_flags & IVHD_FLAG_HT_TUN_EN_MASK ?
+		iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) :
+		iommu_feature_disable(iommu, CONTROL_HT_TUN_EN);
+
+	iommu->acpi_flags & IVHD_FLAG_PASSPW_EN_MASK ?
+		iommu_feature_enable(iommu, CONTROL_PASSPW_EN) :
+		iommu_feature_disable(iommu, CONTROL_PASSPW_EN);
+
+	iommu->acpi_flags & IVHD_FLAG_RESPASSPW_EN_MASK ?
+		iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) :
+		iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN);
+
+	iommu->acpi_flags & IVHD_FLAG_ISOC_EN_MASK ?
+		iommu_feature_enable(iommu, CONTROL_ISOC_EN) :
+		iommu_feature_disable(iommu, CONTROL_ISOC_EN);
+
+	/*
+	 * make IOMMU memory accesses cache coherent
+	 */
+	iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
+}
+
+static void iommu_apply_resume_quirks(struct amd_iommu *iommu)
+{
+	int i, j;
+	u32 ioc_feature_control;
+	struct pci_dev *pdev = NULL;
+
+	/* RD890 BIOSes may not have completely reconfigured the iommu */
+	if (!is_rd890_iommu(iommu->dev))
+		return;
+
+	/*
+	 * First, we need to ensure that the iommu is enabled. This is
+	 * controlled by a register in the northbridge
+	 */
+	pdev = pci_get_bus_and_slot(iommu->dev->bus->number, PCI_DEVFN(0, 0));
+
+	if (!pdev)
+		return;
+
+	/* Select Northbridge indirect register 0x75 and enable writing */
+	pci_write_config_dword(pdev, 0x60, 0x75 | (1 << 7));
+	pci_read_config_dword(pdev, 0x64, &ioc_feature_control);
+
+	/* Enable the iommu */
+	if (!(ioc_feature_control & 0x1))
+		pci_write_config_dword(pdev, 0x64, ioc_feature_control | 1);
+
+	pci_dev_put(pdev);
+
+	/* Restore the iommu BAR */
+	pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
+			       iommu->stored_addr_lo);
+	pci_write_config_dword(iommu->dev, iommu->cap_ptr + 8,
+			       iommu->stored_addr_hi);
+
+	/* Restore the l1 indirect regs for each of the 6 l1s */
+	for (i = 0; i < 6; i++)
+		for (j = 0; j < 0x12; j++)
+			iommu_write_l1(iommu, i, j, iommu->stored_l1[i][j]);
+
+	/* Restore the l2 indirect regs */
+	for (i = 0; i < 0x83; i++)
+		iommu_write_l2(iommu, i, iommu->stored_l2[i]);
+
+	/* Lock PCI setup registers */
+	pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
+			       iommu->stored_addr_lo | 1);
+}
+
+/*
+ * This function finally enables all IOMMUs found in the system after
+ * they have been initialized
+ */
+static void enable_iommus(void)
+{
+	struct amd_iommu *iommu;
+
+	for_each_iommu(iommu) {
+		iommu_disable(iommu);
+		iommu_init_flags(iommu);
+		iommu_set_device_table(iommu);
+		iommu_enable_command_buffer(iommu);
+		iommu_enable_event_buffer(iommu);
+		iommu_set_exclusion_range(iommu);
+		iommu_init_msi(iommu);
+		iommu_enable(iommu);
+		iommu_flush_all_caches(iommu);
+	}
+}
+
+static void disable_iommus(void)
+{
+	struct amd_iommu *iommu;
+
+	for_each_iommu(iommu)
+		iommu_disable(iommu);
+}
+
+/*
+ * Suspend/Resume support
+ * disable suspend until real resume implemented
+ */
+
+static void amd_iommu_resume(void)
+{
+	struct amd_iommu *iommu;
+
+	for_each_iommu(iommu)
+		iommu_apply_resume_quirks(iommu);
+
+	/* re-load the hardware */
+	enable_iommus();
+
+	/*
+	 * we have to flush after the IOMMUs are enabled because a
+	 * disabled IOMMU will never execute the commands we send
+	 */
+	for_each_iommu(iommu)
+		iommu_flush_all_caches(iommu);
+}
+
+static int amd_iommu_suspend(void)
+{
+	/* disable IOMMUs to go out of the way for BIOS */
+	disable_iommus();
+
+	return 0;
+}
+
+static struct syscore_ops amd_iommu_syscore_ops = {
+	.suspend = amd_iommu_suspend,
+	.resume = amd_iommu_resume,
+};
+
+/*
+ * This is the core init function for AMD IOMMU hardware in the system.
+ * This function is called from the generic x86 DMA layer initialization
+ * code.
+ *
+ * This function basically parses the ACPI table for AMD IOMMU (IVRS)
+ * three times:
+ *
+ *	1 pass) Find the highest PCI device id the driver has to handle.
+ *		Upon this information the size of the data structures is
+ *		determined that needs to be allocated.
+ *
+ *	2 pass) Initialize the data structures just allocated with the
+ *		information in the ACPI table about available AMD IOMMUs
+ *		in the system. It also maps the PCI devices in the
+ *		system to specific IOMMUs
+ *
+ *	3 pass) After the basic data structures are allocated and
+ *		initialized we update them with information about memory
+ *		remapping requirements parsed out of the ACPI table in
+ *		this last pass.
+ *
+ * After that the hardware is initialized and ready to go. In the last
+ * step we do some Linux specific things like registering the driver in
+ * the dma_ops interface and initializing the suspend/resume support
+ * functions. Finally it prints some information about AMD IOMMUs and
+ * the driver state and enables the hardware.
+ */
+static int __init amd_iommu_init(void)
+{
+	int i, ret = 0;
+
+	/*
+	 * First parse ACPI tables to find the largest Bus/Dev/Func
+	 * we need to handle. Upon this information the shared data
+	 * structures for the IOMMUs in the system will be allocated
+	 */
+	if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0)
+		return -ENODEV;
+
+	ret = amd_iommu_init_err;
+	if (ret)
+		goto out;
+
+	dev_table_size     = tbl_size(DEV_TABLE_ENTRY_SIZE);
+	alias_table_size   = tbl_size(ALIAS_TABLE_ENTRY_SIZE);
+	rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE);
+
+	ret = -ENOMEM;
+
+	/* Device table - directly used by all IOMMUs */
+	amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+				      get_order(dev_table_size));
+	if (amd_iommu_dev_table == NULL)
+		goto out;
+
+	/*
+	 * Alias table - map PCI Bus/Dev/Func to Bus/Dev/Func the
+	 * IOMMU see for that device
+	 */
+	amd_iommu_alias_table = (void *)__get_free_pages(GFP_KERNEL,
+			get_order(alias_table_size));
+	if (amd_iommu_alias_table == NULL)
+		goto free;
+
+	/* IOMMU rlookup table - find the IOMMU for a specific device */
+	amd_iommu_rlookup_table = (void *)__get_free_pages(
+			GFP_KERNEL | __GFP_ZERO,
+			get_order(rlookup_table_size));
+	if (amd_iommu_rlookup_table == NULL)
+		goto free;
+
+	amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(
+					    GFP_KERNEL | __GFP_ZERO,
+					    get_order(MAX_DOMAIN_ID/8));
+	if (amd_iommu_pd_alloc_bitmap == NULL)
+		goto free;
+
+	/* init the device table */
+	init_device_table();
+
+	/*
+	 * let all alias entries point to itself
+	 */
+	for (i = 0; i <= amd_iommu_last_bdf; ++i)
+		amd_iommu_alias_table[i] = i;
+
+	/*
+	 * never allocate domain 0 because its used as the non-allocated and
+	 * error value placeholder
+	 */
+	amd_iommu_pd_alloc_bitmap[0] = 1;
+
+	spin_lock_init(&amd_iommu_pd_lock);
+
+	/*
+	 * now the data structures are allocated and basically initialized
+	 * start the real acpi table scan
+	 */
+	ret = -ENODEV;
+	if (acpi_table_parse("IVRS", init_iommu_all) != 0)
+		goto free;
+
+	if (amd_iommu_init_err) {
+		ret = amd_iommu_init_err;
+		goto free;
+	}
+
+	if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
+		goto free;
+
+	if (amd_iommu_init_err) {
+		ret = amd_iommu_init_err;
+		goto free;
+	}
+
+	ret = amd_iommu_init_devices();
+	if (ret)
+		goto free;
+
+	enable_iommus();
+
+	if (iommu_pass_through)
+		ret = amd_iommu_init_passthrough();
+	else
+		ret = amd_iommu_init_dma_ops();
+
+	if (ret)
+		goto free_disable;
+
+	amd_iommu_init_api();
+
+	amd_iommu_init_notifier();
+
+	register_syscore_ops(&amd_iommu_syscore_ops);
+
+	if (iommu_pass_through)
+		goto out;
+
+	if (amd_iommu_unmap_flush)
+		printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n");
+	else
+		printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n");
+
+	x86_platform.iommu_shutdown = disable_iommus;
+out:
+	return ret;
+
+free_disable:
+	disable_iommus();
+
+free:
+	amd_iommu_uninit_devices();
+
+	free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
+		   get_order(MAX_DOMAIN_ID/8));
+
+	free_pages((unsigned long)amd_iommu_rlookup_table,
+		   get_order(rlookup_table_size));
+
+	free_pages((unsigned long)amd_iommu_alias_table,
+		   get_order(alias_table_size));
+
+	free_pages((unsigned long)amd_iommu_dev_table,
+		   get_order(dev_table_size));
+
+	free_iommu_all();
+
+	free_unity_maps();
+
+#ifdef CONFIG_GART_IOMMU
+	/*
+	 * We failed to initialize the AMD IOMMU - try fallback to GART
+	 * if possible.
+	 */
+	gart_iommu_init();
+
+#endif
+
+	goto out;
+}
+
+/****************************************************************************
+ *
+ * Early detect code. This code runs at IOMMU detection time in the DMA
+ * layer. It just looks if there is an IVRS ACPI table to detect AMD
+ * IOMMUs
+ *
+ ****************************************************************************/
+static int __init early_amd_iommu_detect(struct acpi_table_header *table)
+{
+	return 0;
+}
+
+int __init amd_iommu_detect(void)
+{
+	if (no_iommu || (iommu_detected && !gart_iommu_aperture))
+		return -ENODEV;
+
+	if (amd_iommu_disabled)
+		return -ENODEV;
+
+	if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
+		iommu_detected = 1;
+		amd_iommu_detected = 1;
+		x86_init.iommu.iommu_init = amd_iommu_init;
+
+		/* Make sure ACS will be enabled */
+		pci_request_acs();
+		return 1;
+	}
+	return -ENODEV;
+}
+
+/****************************************************************************
+ *
+ * Parsing functions for the AMD IOMMU specific kernel command line
+ * options.
+ *
+ ****************************************************************************/
+
+static int __init parse_amd_iommu_dump(char *str)
+{
+	amd_iommu_dump = true;
+
+	return 1;
+}
+
+static int __init parse_amd_iommu_options(char *str)
+{
+	for (; *str; ++str) {
+		if (strncmp(str, "fullflush", 9) == 0)
+			amd_iommu_unmap_flush = true;
+		if (strncmp(str, "off", 3) == 0)
+			amd_iommu_disabled = true;
+	}
+
+	return 1;
+}
+
+__setup("amd_iommu_dump", parse_amd_iommu_dump);
+__setup("amd_iommu=", parse_amd_iommu_options);
+
+IOMMU_INIT_FINISH(amd_iommu_detect,
+		  gart_iommu_hole_init,
+		  0,
+		  0);
diff --git a/drivers/iommu/amd_iommu_proto.h b/drivers/iommu/amd_iommu_proto.h
new file mode 100644
index 000000000000..7ffaa64410b0
--- /dev/null
+++ b/drivers/iommu/amd_iommu_proto.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (C) 2009-2010 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <joerg.roedel@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#ifndef _ASM_X86_AMD_IOMMU_PROTO_H
+#define _ASM_X86_AMD_IOMMU_PROTO_H
+
+#include "amd_iommu_types.h"
+
+extern int amd_iommu_init_dma_ops(void);
+extern int amd_iommu_init_passthrough(void);
+extern irqreturn_t amd_iommu_int_thread(int irq, void *data);
+extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
+extern void amd_iommu_apply_erratum_63(u16 devid);
+extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu);
+extern int amd_iommu_init_devices(void);
+extern void amd_iommu_uninit_devices(void);
+extern void amd_iommu_init_notifier(void);
+extern void amd_iommu_init_api(void);
+#ifndef CONFIG_AMD_IOMMU_STATS
+
+static inline void amd_iommu_stats_init(void) { }
+
+#endif /* !CONFIG_AMD_IOMMU_STATS */
+
+static inline bool is_rd890_iommu(struct pci_dev *pdev)
+{
+	return (pdev->vendor == PCI_VENDOR_ID_ATI) &&
+	       (pdev->device == PCI_DEVICE_ID_RD890_IOMMU);
+}
+
+static inline bool iommu_feature(struct amd_iommu *iommu, u64 f)
+{
+	if (!(iommu->cap & (1 << IOMMU_CAP_EFR)))
+		return false;
+
+	return !!(iommu->features & f);
+}
+
+#endif /* _ASM_X86_AMD_IOMMU_PROTO_H  */
diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h
new file mode 100644
index 000000000000..4c9982995414
--- /dev/null
+++ b/drivers/iommu/amd_iommu_types.h
@@ -0,0 +1,580 @@
+/*
+ * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <joerg.roedel@amd.com>
+ *         Leo Duran <leo.duran@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#ifndef _ASM_X86_AMD_IOMMU_TYPES_H
+#define _ASM_X86_AMD_IOMMU_TYPES_H
+
+#include <linux/types.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+/*
+ * Maximum number of IOMMUs supported
+ */
+#define MAX_IOMMUS	32
+
+/*
+ * some size calculation constants
+ */
+#define DEV_TABLE_ENTRY_SIZE		32
+#define ALIAS_TABLE_ENTRY_SIZE		2
+#define RLOOKUP_TABLE_ENTRY_SIZE	(sizeof(void *))
+
+/* Length of the MMIO region for the AMD IOMMU */
+#define MMIO_REGION_LENGTH       0x4000
+
+/* Capability offsets used by the driver */
+#define MMIO_CAP_HDR_OFFSET	0x00
+#define MMIO_RANGE_OFFSET	0x0c
+#define MMIO_MISC_OFFSET	0x10
+
+/* Masks, shifts and macros to parse the device range capability */
+#define MMIO_RANGE_LD_MASK	0xff000000
+#define MMIO_RANGE_FD_MASK	0x00ff0000
+#define MMIO_RANGE_BUS_MASK	0x0000ff00
+#define MMIO_RANGE_LD_SHIFT	24
+#define MMIO_RANGE_FD_SHIFT	16
+#define MMIO_RANGE_BUS_SHIFT	8
+#define MMIO_GET_LD(x)  (((x) & MMIO_RANGE_LD_MASK) >> MMIO_RANGE_LD_SHIFT)
+#define MMIO_GET_FD(x)  (((x) & MMIO_RANGE_FD_MASK) >> MMIO_RANGE_FD_SHIFT)
+#define MMIO_GET_BUS(x) (((x) & MMIO_RANGE_BUS_MASK) >> MMIO_RANGE_BUS_SHIFT)
+#define MMIO_MSI_NUM(x)	((x) & 0x1f)
+
+/* Flag masks for the AMD IOMMU exclusion range */
+#define MMIO_EXCL_ENABLE_MASK 0x01ULL
+#define MMIO_EXCL_ALLOW_MASK  0x02ULL
+
+/* Used offsets into the MMIO space */
+#define MMIO_DEV_TABLE_OFFSET   0x0000
+#define MMIO_CMD_BUF_OFFSET     0x0008
+#define MMIO_EVT_BUF_OFFSET     0x0010
+#define MMIO_CONTROL_OFFSET     0x0018
+#define MMIO_EXCL_BASE_OFFSET   0x0020
+#define MMIO_EXCL_LIMIT_OFFSET  0x0028
+#define MMIO_EXT_FEATURES	0x0030
+#define MMIO_CMD_HEAD_OFFSET	0x2000
+#define MMIO_CMD_TAIL_OFFSET	0x2008
+#define MMIO_EVT_HEAD_OFFSET	0x2010
+#define MMIO_EVT_TAIL_OFFSET	0x2018
+#define MMIO_STATUS_OFFSET	0x2020
+
+
+/* Extended Feature Bits */
+#define FEATURE_PREFETCH	(1ULL<<0)
+#define FEATURE_PPR		(1ULL<<1)
+#define FEATURE_X2APIC		(1ULL<<2)
+#define FEATURE_NX		(1ULL<<3)
+#define FEATURE_GT		(1ULL<<4)
+#define FEATURE_IA		(1ULL<<6)
+#define FEATURE_GA		(1ULL<<7)
+#define FEATURE_HE		(1ULL<<8)
+#define FEATURE_PC		(1ULL<<9)
+
+/* MMIO status bits */
+#define MMIO_STATUS_COM_WAIT_INT_MASK	0x04
+
+/* event logging constants */
+#define EVENT_ENTRY_SIZE	0x10
+#define EVENT_TYPE_SHIFT	28
+#define EVENT_TYPE_MASK		0xf
+#define EVENT_TYPE_ILL_DEV	0x1
+#define EVENT_TYPE_IO_FAULT	0x2
+#define EVENT_TYPE_DEV_TAB_ERR	0x3
+#define EVENT_TYPE_PAGE_TAB_ERR	0x4
+#define EVENT_TYPE_ILL_CMD	0x5
+#define EVENT_TYPE_CMD_HARD_ERR	0x6
+#define EVENT_TYPE_IOTLB_INV_TO	0x7
+#define EVENT_TYPE_INV_DEV_REQ	0x8
+#define EVENT_DEVID_MASK	0xffff
+#define EVENT_DEVID_SHIFT	0
+#define EVENT_DOMID_MASK	0xffff
+#define EVENT_DOMID_SHIFT	0
+#define EVENT_FLAGS_MASK	0xfff
+#define EVENT_FLAGS_SHIFT	0x10
+
+/* feature control bits */
+#define CONTROL_IOMMU_EN        0x00ULL
+#define CONTROL_HT_TUN_EN       0x01ULL
+#define CONTROL_EVT_LOG_EN      0x02ULL
+#define CONTROL_EVT_INT_EN      0x03ULL
+#define CONTROL_COMWAIT_EN      0x04ULL
+#define CONTROL_PASSPW_EN       0x08ULL
+#define CONTROL_RESPASSPW_EN    0x09ULL
+#define CONTROL_COHERENT_EN     0x0aULL
+#define CONTROL_ISOC_EN         0x0bULL
+#define CONTROL_CMDBUF_EN       0x0cULL
+#define CONTROL_PPFLOG_EN       0x0dULL
+#define CONTROL_PPFINT_EN       0x0eULL
+
+/* command specific defines */
+#define CMD_COMPL_WAIT          0x01
+#define CMD_INV_DEV_ENTRY       0x02
+#define CMD_INV_IOMMU_PAGES	0x03
+#define CMD_INV_IOTLB_PAGES	0x04
+#define CMD_INV_ALL		0x08
+
+#define CMD_COMPL_WAIT_STORE_MASK	0x01
+#define CMD_COMPL_WAIT_INT_MASK		0x02
+#define CMD_INV_IOMMU_PAGES_SIZE_MASK	0x01
+#define CMD_INV_IOMMU_PAGES_PDE_MASK	0x02
+
+#define CMD_INV_IOMMU_ALL_PAGES_ADDRESS	0x7fffffffffffffffULL
+
+/* macros and definitions for device table entries */
+#define DEV_ENTRY_VALID         0x00
+#define DEV_ENTRY_TRANSLATION   0x01
+#define DEV_ENTRY_IR            0x3d
+#define DEV_ENTRY_IW            0x3e
+#define DEV_ENTRY_NO_PAGE_FAULT	0x62
+#define DEV_ENTRY_EX            0x67
+#define DEV_ENTRY_SYSMGT1       0x68
+#define DEV_ENTRY_SYSMGT2       0x69
+#define DEV_ENTRY_INIT_PASS     0xb8
+#define DEV_ENTRY_EINT_PASS     0xb9
+#define DEV_ENTRY_NMI_PASS      0xba
+#define DEV_ENTRY_LINT0_PASS    0xbe
+#define DEV_ENTRY_LINT1_PASS    0xbf
+#define DEV_ENTRY_MODE_MASK	0x07
+#define DEV_ENTRY_MODE_SHIFT	0x09
+
+/* constants to configure the command buffer */
+#define CMD_BUFFER_SIZE    8192
+#define CMD_BUFFER_UNINITIALIZED 1
+#define CMD_BUFFER_ENTRIES 512
+#define MMIO_CMD_SIZE_SHIFT 56
+#define MMIO_CMD_SIZE_512 (0x9ULL << MMIO_CMD_SIZE_SHIFT)
+
+/* constants for event buffer handling */
+#define EVT_BUFFER_SIZE		8192 /* 512 entries */
+#define EVT_LEN_MASK		(0x9ULL << 56)
+
+#define PAGE_MODE_NONE    0x00
+#define PAGE_MODE_1_LEVEL 0x01
+#define PAGE_MODE_2_LEVEL 0x02
+#define PAGE_MODE_3_LEVEL 0x03
+#define PAGE_MODE_4_LEVEL 0x04
+#define PAGE_MODE_5_LEVEL 0x05
+#define PAGE_MODE_6_LEVEL 0x06
+
+#define PM_LEVEL_SHIFT(x)	(12 + ((x) * 9))
+#define PM_LEVEL_SIZE(x)	(((x) < 6) ? \
+				  ((1ULL << PM_LEVEL_SHIFT((x))) - 1): \
+				   (0xffffffffffffffffULL))
+#define PM_LEVEL_INDEX(x, a)	(((a) >> PM_LEVEL_SHIFT((x))) & 0x1ffULL)
+#define PM_LEVEL_ENC(x)		(((x) << 9) & 0xe00ULL)
+#define PM_LEVEL_PDE(x, a)	((a) | PM_LEVEL_ENC((x)) | \
+				 IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW)
+#define PM_PTE_LEVEL(pte)	(((pte) >> 9) & 0x7ULL)
+
+#define PM_MAP_4k		0
+#define PM_ADDR_MASK		0x000ffffffffff000ULL
+#define PM_MAP_MASK(lvl)	(PM_ADDR_MASK & \
+				(~((1ULL << (12 + ((lvl) * 9))) - 1)))
+#define PM_ALIGNED(lvl, addr)	((PM_MAP_MASK(lvl) & (addr)) == (addr))
+
+/*
+ * Returns the page table level to use for a given page size
+ * Pagesize is expected to be a power-of-two
+ */
+#define PAGE_SIZE_LEVEL(pagesize) \
+		((__ffs(pagesize) - 12) / 9)
+/*
+ * Returns the number of ptes to use for a given page size
+ * Pagesize is expected to be a power-of-two
+ */
+#define PAGE_SIZE_PTE_COUNT(pagesize) \
+		(1ULL << ((__ffs(pagesize) - 12) % 9))
+
+/*
+ * Aligns a given io-virtual address to a given page size
+ * Pagesize is expected to be a power-of-two
+ */
+#define PAGE_SIZE_ALIGN(address, pagesize) \
+		((address) & ~((pagesize) - 1))
+/*
+ * Creates an IOMMU PTE for an address an a given pagesize
+ * The PTE has no permission bits set
+ * Pagesize is expected to be a power-of-two larger than 4096
+ */
+#define PAGE_SIZE_PTE(address, pagesize)		\
+		(((address) | ((pagesize) - 1)) &	\
+		 (~(pagesize >> 1)) & PM_ADDR_MASK)
+
+/*
+ * Takes a PTE value with mode=0x07 and returns the page size it maps
+ */
+#define PTE_PAGE_SIZE(pte) \
+	(1ULL << (1 + ffz(((pte) | 0xfffULL))))
+
+#define IOMMU_PTE_P  (1ULL << 0)
+#define IOMMU_PTE_TV (1ULL << 1)
+#define IOMMU_PTE_U  (1ULL << 59)
+#define IOMMU_PTE_FC (1ULL << 60)
+#define IOMMU_PTE_IR (1ULL << 61)
+#define IOMMU_PTE_IW (1ULL << 62)
+
+#define DTE_FLAG_IOTLB	0x01
+
+#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
+#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P)
+#define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK))
+#define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07)
+
+#define IOMMU_PROT_MASK 0x03
+#define IOMMU_PROT_IR 0x01
+#define IOMMU_PROT_IW 0x02
+
+/* IOMMU capabilities */
+#define IOMMU_CAP_IOTLB   24
+#define IOMMU_CAP_NPCACHE 26
+#define IOMMU_CAP_EFR     27
+
+#define MAX_DOMAIN_ID 65536
+
+/* FIXME: move this macro to <linux/pci.h> */
+#define PCI_BUS(x) (((x) >> 8) & 0xff)
+
+/* Protection domain flags */
+#define PD_DMA_OPS_MASK		(1UL << 0) /* domain used for dma_ops */
+#define PD_DEFAULT_MASK		(1UL << 1) /* domain is a default dma_ops
+					      domain for an IOMMU */
+#define PD_PASSTHROUGH_MASK	(1UL << 2) /* domain has no page
+					      translation */
+
+extern bool amd_iommu_dump;
+#define DUMP_printk(format, arg...)					\
+	do {								\
+		if (amd_iommu_dump)						\
+			printk(KERN_INFO "AMD-Vi: " format, ## arg);	\
+	} while(0);
+
+/* global flag if IOMMUs cache non-present entries */
+extern bool amd_iommu_np_cache;
+/* Only true if all IOMMUs support device IOTLBs */
+extern bool amd_iommu_iotlb_sup;
+
+/*
+ * Make iterating over all IOMMUs easier
+ */
+#define for_each_iommu(iommu) \
+	list_for_each_entry((iommu), &amd_iommu_list, list)
+#define for_each_iommu_safe(iommu, next) \
+	list_for_each_entry_safe((iommu), (next), &amd_iommu_list, list)
+
+#define APERTURE_RANGE_SHIFT	27	/* 128 MB */
+#define APERTURE_RANGE_SIZE	(1ULL << APERTURE_RANGE_SHIFT)
+#define APERTURE_RANGE_PAGES	(APERTURE_RANGE_SIZE >> PAGE_SHIFT)
+#define APERTURE_MAX_RANGES	32	/* allows 4GB of DMA address space */
+#define APERTURE_RANGE_INDEX(a)	((a) >> APERTURE_RANGE_SHIFT)
+#define APERTURE_PAGE_INDEX(a)	(((a) >> 21) & 0x3fULL)
+
+/*
+ * This structure contains generic data for  IOMMU protection domains
+ * independent of their use.
+ */
+struct protection_domain {
+	struct list_head list;  /* for list of all protection domains */
+	struct list_head dev_list; /* List of all devices in this domain */
+	spinlock_t lock;	/* mostly used to lock the page table*/
+	struct mutex api_lock;	/* protect page tables in the iommu-api path */
+	u16 id;			/* the domain id written to the device table */
+	int mode;		/* paging mode (0-6 levels) */
+	u64 *pt_root;		/* page table root pointer */
+	unsigned long flags;	/* flags to find out type of domain */
+	bool updated;		/* complete domain flush required */
+	unsigned dev_cnt;	/* devices assigned to this domain */
+	unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */
+	void *priv;		/* private data */
+
+};
+
+/*
+ * This struct contains device specific data for the IOMMU
+ */
+struct iommu_dev_data {
+	struct list_head list;		  /* For domain->dev_list */
+	struct device *dev;		  /* Device this data belong to */
+	struct device *alias;		  /* The Alias Device */
+	struct protection_domain *domain; /* Domain the device is bound to */
+	atomic_t bind;			  /* Domain attach reverent count */
+};
+
+/*
+ * For dynamic growth the aperture size is split into ranges of 128MB of
+ * DMA address space each. This struct represents one such range.
+ */
+struct aperture_range {
+
+	/* address allocation bitmap */
+	unsigned long *bitmap;
+
+	/*
+	 * Array of PTE pages for the aperture. In this array we save all the
+	 * leaf pages of the domain page table used for the aperture. This way
+	 * we don't need to walk the page table to find a specific PTE. We can
+	 * just calculate its address in constant time.
+	 */
+	u64 *pte_pages[64];
+
+	unsigned long offset;
+};
+
+/*
+ * Data container for a dma_ops specific protection domain
+ */
+struct dma_ops_domain {
+	struct list_head list;
+
+	/* generic protection domain information */
+	struct protection_domain domain;
+
+	/* size of the aperture for the mappings */
+	unsigned long aperture_size;
+
+	/* address we start to search for free addresses */
+	unsigned long next_address;
+
+	/* address space relevant data */
+	struct aperture_range *aperture[APERTURE_MAX_RANGES];
+
+	/* This will be set to true when TLB needs to be flushed */
+	bool need_flush;
+
+	/*
+	 * if this is a preallocated domain, keep the device for which it was
+	 * preallocated in this variable
+	 */
+	u16 target_dev;
+};
+
+/*
+ * Structure where we save information about one hardware AMD IOMMU in the
+ * system.
+ */
+struct amd_iommu {
+	struct list_head list;
+
+	/* Index within the IOMMU array */
+	int index;
+
+	/* locks the accesses to the hardware */
+	spinlock_t lock;
+
+	/* Pointer to PCI device of this IOMMU */
+	struct pci_dev *dev;
+
+	/* physical address of MMIO space */
+	u64 mmio_phys;
+	/* virtual address of MMIO space */
+	u8 *mmio_base;
+
+	/* capabilities of that IOMMU read from ACPI */
+	u32 cap;
+
+	/* flags read from acpi table */
+	u8 acpi_flags;
+
+	/* Extended features */
+	u64 features;
+
+	/*
+	 * Capability pointer. There could be more than one IOMMU per PCI
+	 * device function if there are more than one AMD IOMMU capability
+	 * pointers.
+	 */
+	u16 cap_ptr;
+
+	/* pci domain of this IOMMU */
+	u16 pci_seg;
+
+	/* first device this IOMMU handles. read from PCI */
+	u16 first_device;
+	/* last device this IOMMU handles. read from PCI */
+	u16 last_device;
+
+	/* start of exclusion range of that IOMMU */
+	u64 exclusion_start;
+	/* length of exclusion range of that IOMMU */
+	u64 exclusion_length;
+
+	/* command buffer virtual address */
+	u8 *cmd_buf;
+	/* size of command buffer */
+	u32 cmd_buf_size;
+
+	/* size of event buffer */
+	u32 evt_buf_size;
+	/* event buffer virtual address */
+	u8 *evt_buf;
+	/* MSI number for event interrupt */
+	u16 evt_msi_num;
+
+	/* true if interrupts for this IOMMU are already enabled */
+	bool int_enabled;
+
+	/* if one, we need to send a completion wait command */
+	bool need_sync;
+
+	/* default dma_ops domain for that IOMMU */
+	struct dma_ops_domain *default_dom;
+
+	/*
+	 * We can't rely on the BIOS to restore all values on reinit, so we
+	 * need to stash them
+	 */
+
+	/* The iommu BAR */
+	u32 stored_addr_lo;
+	u32 stored_addr_hi;
+
+	/*
+	 * Each iommu has 6 l1s, each of which is documented as having 0x12
+	 * registers
+	 */
+	u32 stored_l1[6][0x12];
+
+	/* The l2 indirect registers */
+	u32 stored_l2[0x83];
+};
+
+/*
+ * List with all IOMMUs in the system. This list is not locked because it is
+ * only written and read at driver initialization or suspend time
+ */
+extern struct list_head amd_iommu_list;
+
+/*
+ * Array with pointers to each IOMMU struct
+ * The indices are referenced in the protection domains
+ */
+extern struct amd_iommu *amd_iommus[MAX_IOMMUS];
+
+/* Number of IOMMUs present in the system */
+extern int amd_iommus_present;
+
+/*
+ * Declarations for the global list of all protection domains
+ */
+extern spinlock_t amd_iommu_pd_lock;
+extern struct list_head amd_iommu_pd_list;
+
+/*
+ * Structure defining one entry in the device table
+ */
+struct dev_table_entry {
+	u32 data[8];
+};
+
+/*
+ * One entry for unity mappings parsed out of the ACPI table.
+ */
+struct unity_map_entry {
+	struct list_head list;
+
+	/* starting device id this entry is used for (including) */
+	u16 devid_start;
+	/* end device id this entry is used for (including) */
+	u16 devid_end;
+
+	/* start address to unity map (including) */
+	u64 address_start;
+	/* end address to unity map (including) */
+	u64 address_end;
+
+	/* required protection */
+	int prot;
+};
+
+/*
+ * List of all unity mappings. It is not locked because as runtime it is only
+ * read. It is created at ACPI table parsing time.
+ */
+extern struct list_head amd_iommu_unity_map;
+
+/*
+ * Data structures for device handling
+ */
+
+/*
+ * Device table used by hardware. Read and write accesses by software are
+ * locked with the amd_iommu_pd_table lock.
+ */
+extern struct dev_table_entry *amd_iommu_dev_table;
+
+/*
+ * Alias table to find requestor ids to device ids. Not locked because only
+ * read on runtime.
+ */
+extern u16 *amd_iommu_alias_table;
+
+/*
+ * Reverse lookup table to find the IOMMU which translates a specific device.
+ */
+extern struct amd_iommu **amd_iommu_rlookup_table;
+
+/* size of the dma_ops aperture as power of 2 */
+extern unsigned amd_iommu_aperture_order;
+
+/* largest PCI device id we expect translation requests for */
+extern u16 amd_iommu_last_bdf;
+
+/* allocation bitmap for domain ids */
+extern unsigned long *amd_iommu_pd_alloc_bitmap;
+
+/*
+ * If true, the addresses will be flushed on unmap time, not when
+ * they are reused
+ */
+extern bool amd_iommu_unmap_flush;
+
+/* takes bus and device/function and returns the device id
+ * FIXME: should that be in generic PCI code? */
+static inline u16 calc_devid(u8 bus, u8 devfn)
+{
+	return (((u16)bus) << 8) | devfn;
+}
+
+#ifdef CONFIG_AMD_IOMMU_STATS
+
+struct __iommu_counter {
+	char *name;
+	struct dentry *dent;
+	u64 value;
+};
+
+#define DECLARE_STATS_COUNTER(nm) \
+	static struct __iommu_counter nm = {	\
+		.name = #nm,			\
+	}
+
+#define INC_STATS_COUNTER(name)		name.value += 1
+#define ADD_STATS_COUNTER(name, x)	name.value += (x)
+#define SUB_STATS_COUNTER(name, x)	name.value -= (x)
+
+#else /* CONFIG_AMD_IOMMU_STATS */
+
+#define DECLARE_STATS_COUNTER(name)
+#define INC_STATS_COUNTER(name)
+#define ADD_STATS_COUNTER(name, x)
+#define SUB_STATS_COUNTER(name, x)
+
+#endif /* CONFIG_AMD_IOMMU_STATS */
+
+#endif /* _ASM_X86_AMD_IOMMU_TYPES_H */
diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h
new file mode 100644
index 000000000000..a6863a2dec1f
--- /dev/null
+++ b/include/linux/amd-iommu.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <joerg.roedel@amd.com>
+ *         Leo Duran <leo.duran@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#ifndef _ASM_X86_AMD_IOMMU_H
+#define _ASM_X86_AMD_IOMMU_H
+
+#include <linux/irqreturn.h>
+
+#ifdef CONFIG_AMD_IOMMU
+
+extern int amd_iommu_detect(void);
+
+#else
+
+static inline int amd_iommu_detect(void) { return -ENODEV; }
+
+#endif
+
+#endif /* _ASM_X86_AMD_IOMMU_H */
-- 
cgit v1.2.3


From cbb49c2665eebfd1fa2e491403684d0542662137 Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Tue, 21 Jun 2011 10:59:34 -0600
Subject: dt: Add default match table for bus ids

No need for most platforms to define their own bus table when calling
of_platform_populate().  Supply a stock one.

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
---
 drivers/of/platform.c       | 8 ++++++++
 include/linux/of_platform.h | 2 ++
 2 files changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/of/platform.c b/drivers/of/platform.c
index 63d3cb73bdb9..bb483ef32e00 100644
--- a/drivers/of/platform.c
+++ b/drivers/of/platform.c
@@ -22,6 +22,14 @@
 #include <linux/of_platform.h>
 #include <linux/platform_device.h>
 
+const struct of_device_id of_default_bus_match_table[] = {
+	{ .compatible = "simple-bus", },
+#ifdef CONFIG_ARM_AMBA
+	{ .compatible = "arm,amba-bus", },
+#endif /* CONFIG_ARM_AMBA */
+	{} /* Empty terminated list */
+};
+
 static int of_dev_node_match(struct device *dev, void *data)
 {
 	return dev->of_node == data;
diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h
index fb51ae38cea7..8f023f889a8a 100644
--- a/include/linux/of_platform.h
+++ b/include/linux/of_platform.h
@@ -40,6 +40,8 @@ struct of_platform_driver
 #define	to_of_platform_driver(drv) \
 	container_of(drv,struct of_platform_driver, driver)
 
+extern const struct of_device_id of_default_bus_match_table[];
+
 /* Platform drivers register/unregister */
 extern struct platform_device *of_device_alloc(struct device_node *np,
 					 const char *bus_id,
-- 
cgit v1.2.3


From 29d4f8a4974aacf46b028fa92f9dd3ffdba3e614 Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Tue, 21 Jun 2011 10:59:34 -0600
Subject: dt: add of_platform_populate() for creating device from the device
 tree

of_platform_populate() is similar to of_platform_bus_probe() except
that it strictly enforces that all device nodes must have a compatible
property, and it can be used to register devices (not buses) which are
children of the root node.

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
---
 drivers/of/platform.c       | 54 +++++++++++++++++++++++++++++++++++++++++----
 include/linux/of_platform.h |  3 +++
 2 files changed, 53 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/of/platform.c b/drivers/of/platform.c
index bb483ef32e00..1f4a5d3af76e 100644
--- a/drivers/of/platform.c
+++ b/drivers/of/platform.c
@@ -229,19 +229,26 @@ EXPORT_SYMBOL(of_platform_device_create);
  */
 static int of_platform_bus_create(struct device_node *bus,
 				  const struct of_device_id *matches,
-				  struct device *parent)
+				  struct device *parent, bool strict)
 {
 	struct device_node *child;
 	struct platform_device *dev;
 	int rc = 0;
 
+	/* Make sure it has a compatible property */
+	if (strict && (!of_get_property(bus, "compatible", NULL))) {
+		pr_debug("%s() - skipping %s, no compatible prop\n",
+			 __func__, bus->full_name);
+		return 0;
+	}
+
 	dev = of_platform_device_create(bus, NULL, parent);
 	if (!dev || !of_match_node(matches, bus))
 		return 0;
 
 	for_each_child_of_node(bus, child) {
 		pr_debug("   create child: %s\n", child->full_name);
-		rc = of_platform_bus_create(child, matches, &dev->dev);
+		rc = of_platform_bus_create(child, matches, &dev->dev, strict);
 		if (rc) {
 			of_node_put(child);
 			break;
@@ -275,11 +282,11 @@ int of_platform_bus_probe(struct device_node *root,
 
 	/* Do a self check of bus type, if there's a match, create children */
 	if (of_match_node(matches, root)) {
-		rc = of_platform_bus_create(root, matches, parent);
+		rc = of_platform_bus_create(root, matches, parent, false);
 	} else for_each_child_of_node(root, child) {
 		if (!of_match_node(matches, child))
 			continue;
-		rc = of_platform_bus_create(child, matches, parent);
+		rc = of_platform_bus_create(child, matches, parent, false);
 		if (rc)
 			break;
 	}
@@ -288,4 +295,43 @@ int of_platform_bus_probe(struct device_node *root,
 	return rc;
 }
 EXPORT_SYMBOL(of_platform_bus_probe);
+
+/**
+ * of_platform_populate() - Populate platform_devices from device tree data
+ * @root: parent of the first level to probe or NULL for the root of the tree
+ * @matches: match table, NULL to use the default
+ * @parent: parent to hook devices from, NULL for toplevel
+ *
+ * Similar to of_platform_bus_probe(), this function walks the device tree
+ * and creates devices from nodes.  It differs in that it follows the modern
+ * convention of requiring all device nodes to have a 'compatible' property,
+ * and it is suitable for creating devices which are children of the root
+ * node (of_platform_bus_probe will only create children of the root which
+ * are selected by the @matches argument).
+ *
+ * New board support should be using this function instead of
+ * of_platform_bus_probe().
+ *
+ * Returns 0 on success, < 0 on failure.
+ */
+int of_platform_populate(struct device_node *root,
+			const struct of_device_id *matches,
+			struct device *parent)
+{
+	struct device_node *child;
+	int rc = 0;
+
+	root = root ? of_node_get(root) : of_find_node_by_path("/");
+	if (!root)
+		return -EINVAL;
+
+	for_each_child_of_node(root, child) {
+		rc = of_platform_bus_create(child, matches, parent, true);
+		if (rc)
+			break;
+	}
+
+	of_node_put(root);
+	return rc;
+}
 #endif /* !CONFIG_SPARC */
diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h
index 8f023f889a8a..839023351176 100644
--- a/include/linux/of_platform.h
+++ b/include/linux/of_platform.h
@@ -57,6 +57,9 @@ extern struct platform_device *of_platform_device_create(struct device_node *np,
 extern int of_platform_bus_probe(struct device_node *root,
 				 const struct of_device_id *matches,
 				 struct device *parent);
+extern int of_platform_populate(struct device_node *root,
+				const struct of_device_id *matches,
+				struct device *parent);
 #endif /* !CONFIG_SPARC */
 
 #endif /* CONFIG_OF_DEVICE */
-- 
cgit v1.2.3


From 15c3597d6ea47e8f3caf068887c4666165df63ad Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Tue, 21 Jun 2011 10:59:35 -0600
Subject: dt/platform: allow device name to be overridden

Some platform code has specific requirements on the naming of devices.
This patch allows callers of of_platform_populate() to provide a
device name lookup table.

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
---
 drivers/of/platform.c       | 73 ++++++++++++++++++++++++++++++++++++++-------
 include/linux/of_platform.h | 35 ++++++++++++++++++++++
 2 files changed, 98 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/of/platform.c b/drivers/of/platform.c
index 4192ddc62638..e75af391e286 100644
--- a/drivers/of/platform.c
+++ b/drivers/of/platform.c
@@ -177,17 +177,20 @@ struct platform_device *of_device_alloc(struct device_node *np,
 EXPORT_SYMBOL(of_device_alloc);
 
 /**
- * of_platform_device_create - Alloc, initialize and register an of_device
+ * of_platform_device_create_pdata - Alloc, initialize and register an of_device
  * @np: pointer to node to create device for
  * @bus_id: name to assign device
+ * @platform_data: pointer to populate platform_data pointer with
  * @parent: Linux device model parent device.
  *
  * Returns pointer to created platform device, or NULL if a device was not
  * registered.  Unavailable devices will not get registered.
  */
-struct platform_device *of_platform_device_create(struct device_node *np,
-					    const char *bus_id,
-					    struct device *parent)
+struct platform_device *of_platform_device_create_pdata(
+					struct device_node *np,
+					const char *bus_id,
+					void *platform_data,
+					struct device *parent)
 {
 	struct platform_device *dev;
 
@@ -203,6 +206,7 @@ struct platform_device *of_platform_device_create(struct device_node *np,
 #endif
 	dev->dev.coherent_dma_mask = DMA_BIT_MASK(32);
 	dev->dev.bus = &platform_bus_type;
+	dev->dev.platform_data = platform_data;
 
 	/* We do not fill the DMA ops for platform devices by default.
 	 * This is currently the responsibility of the platform code
@@ -216,6 +220,22 @@ struct platform_device *of_platform_device_create(struct device_node *np,
 
 	return dev;
 }
+
+/**
+ * of_platform_device_create - Alloc, initialize and register an of_device
+ * @np: pointer to node to create device for
+ * @bus_id: name to assign device
+ * @parent: Linux device model parent device.
+ *
+ * Returns pointer to created platform device, or NULL if a device was not
+ * registered.  Unavailable devices will not get registered.
+ */
+struct platform_device *of_platform_device_create(struct device_node *np,
+					    const char *bus_id,
+					    struct device *parent)
+{
+	return of_platform_device_create_pdata(np, bus_id, NULL, parent);
+}
 EXPORT_SYMBOL(of_platform_device_create);
 
 #ifdef CONFIG_ARM_AMBA
@@ -283,6 +303,28 @@ static struct amba_device *of_amba_device_create(struct device_node *node,
 }
 #endif /* CONFIG_ARM_AMBA */
 
+/**
+ * of_devname_lookup() - Given a device node, lookup the preferred Linux name
+ */
+static const struct of_dev_auxdata *of_dev_lookup(const struct of_dev_auxdata *lookup,
+				 struct device_node *np)
+{
+	struct resource res;
+	if (lookup) {
+		for(; lookup->name != NULL; lookup++) {
+			if (!of_device_is_compatible(np, lookup->compatible))
+				continue;
+			if (of_address_to_resource(np, 0, &res))
+				continue;
+			if (res.start != lookup->phys_addr)
+				continue;
+			pr_debug("%s: devname=%s\n", np->full_name, lookup->name);
+			return lookup;
+		}
+	}
+	return NULL;
+}
+
 /**
  * of_platform_bus_create() - Create a device for a node and its children.
  * @bus: device node of the bus to instantiate
@@ -295,10 +337,14 @@ static struct amba_device *of_amba_device_create(struct device_node *node,
  */
 static int of_platform_bus_create(struct device_node *bus,
 				  const struct of_device_id *matches,
+				  const struct of_dev_auxdata *lookup,
 				  struct device *parent, bool strict)
 {
+	const struct of_dev_auxdata *auxdata;
 	struct device_node *child;
 	struct platform_device *dev;
+	const char *bus_id = NULL;
+	void *platform_data = NULL;
 	int rc = 0;
 
 	/* Make sure it has a compatible property */
@@ -308,18 +354,24 @@ static int of_platform_bus_create(struct device_node *bus,
 		return 0;
 	}
 
+	auxdata = of_dev_lookup(lookup, bus);
+	if (auxdata) {
+		bus_id = auxdata->name;
+		platform_data = auxdata->platform_data;
+	}
+
 	if (of_device_is_compatible(bus, "arm,primecell")) {
-		of_amba_device_create(bus, NULL, NULL, parent);
+		of_amba_device_create(bus, bus_id, platform_data, parent);
 		return 0;
 	}
 
-	dev = of_platform_device_create(bus, NULL, parent);
+	dev = of_platform_device_create_pdata(bus, bus_id, platform_data, parent);
 	if (!dev || !of_match_node(matches, bus))
 		return 0;
 
 	for_each_child_of_node(bus, child) {
 		pr_debug("   create child: %s\n", child->full_name);
-		rc = of_platform_bus_create(child, matches, &dev->dev, strict);
+		rc = of_platform_bus_create(child, matches, lookup, &dev->dev, strict);
 		if (rc) {
 			of_node_put(child);
 			break;
@@ -353,11 +405,11 @@ int of_platform_bus_probe(struct device_node *root,
 
 	/* Do a self check of bus type, if there's a match, create children */
 	if (of_match_node(matches, root)) {
-		rc = of_platform_bus_create(root, matches, parent, false);
+		rc = of_platform_bus_create(root, matches, NULL, parent, false);
 	} else for_each_child_of_node(root, child) {
 		if (!of_match_node(matches, child))
 			continue;
-		rc = of_platform_bus_create(child, matches, parent, false);
+		rc = of_platform_bus_create(child, matches, NULL, parent, false);
 		if (rc)
 			break;
 	}
@@ -387,6 +439,7 @@ EXPORT_SYMBOL(of_platform_bus_probe);
  */
 int of_platform_populate(struct device_node *root,
 			const struct of_device_id *matches,
+			const struct of_dev_auxdata *lookup,
 			struct device *parent)
 {
 	struct device_node *child;
@@ -397,7 +450,7 @@ int of_platform_populate(struct device_node *root,
 		return -EINVAL;
 
 	for_each_child_of_node(root, child) {
-		rc = of_platform_bus_create(child, matches, parent, true);
+		rc = of_platform_bus_create(child, matches, lookup, parent, true);
 		if (rc)
 			break;
 	}
diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h
index 839023351176..5a6f458a4bb7 100644
--- a/include/linux/of_platform.h
+++ b/include/linux/of_platform.h
@@ -19,6 +19,40 @@
 #include <linux/of_device.h>
 #include <linux/platform_device.h>
 
+/**
+ * struct of_dev_auxdata - lookup table entry for device names & platform_data
+ * @compatible: compatible value of node to match against node
+ * @phys_addr: Start address of registers to match against node
+ * @name: Name to assign for matching nodes
+ * @platform_data: platform_data to assign for matching nodes
+ *
+ * This lookup table allows the caller of of_platform_populate() to override
+ * the names of devices when creating devices from the device tree.  The table
+ * should be terminated with an empty entry.  It also allows the platform_data
+ * pointer to be set.
+ *
+ * The reason for this functionality is that some Linux infrastructure uses
+ * the device name to look up a specific device, but the Linux-specific names
+ * are not encoded into the device tree, so the kernel needs to provide specific
+ * values.
+ *
+ * Note: Using an auxdata lookup table should be considered a last resort when
+ * converting a platform to use the DT.  Normally the automatically generated
+ * device name will not matter, and drivers should obtain data from the device
+ * node instead of from an anonymouns platform_data pointer.
+ */
+struct of_dev_auxdata {
+	char *compatible;
+	resource_size_t phys_addr;
+	char *name;
+	void *platform_data;
+};
+
+/* Macro to simplify populating a lookup table */
+#define OF_DEV_AUXDATA(_compat,_phys,_name,_pdata) \
+	{ .compatible = _compat, .phys_addr = _phys, .name = _name, \
+	  .platform_data = _pdata }
+
 /**
  * of_platform_driver - Legacy of-aware driver for platform devices.
  *
@@ -59,6 +93,7 @@ extern int of_platform_bus_probe(struct device_node *root,
 				 struct device *parent);
 extern int of_platform_populate(struct device_node *root,
 				const struct of_device_id *matches,
+				const struct of_dev_auxdata *lookup,
 				struct device *parent);
 #endif /* !CONFIG_SPARC */
 
-- 
cgit v1.2.3


From f4b5ad26bcb983c493e131ff34b2fa60100c82e5 Mon Sep 17 00:00:00 2001
From: Michael Chan <mchan@broadcom.com>
Date: Mon, 20 Jun 2011 15:15:56 +0000
Subject: cnic, bnx2i: Add support for new devices - 57800, 57810, and 57840

And change iSCSI RQ doorbell size from 16B to 64B to match new firmware.

Signed-off-by: Michael Chan <mchan@broadcom.com>
Signed-off-by: Eddie Wai <eddie.wai@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/cnic.h                        | 14 +++++++++++++-
 drivers/scsi/bnx2i/57xx_iscsi_constants.h |  2 +-
 drivers/scsi/bnx2i/bnx2i.h                |  2 +-
 drivers/scsi/bnx2i/bnx2i_init.c           | 21 +++++++++++++++------
 include/linux/pci_ids.h                   |  9 +++++++++
 5 files changed, 39 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/cnic.h b/drivers/net/cnic.h
index 330ef9350413..7a2928f82d40 100644
--- a/drivers/net/cnic.h
+++ b/drivers/net/cnic.h
@@ -384,6 +384,9 @@ struct bnx2x_bd_chain_next {
 #define BNX2X_CHIP_NUM_57712E		0x1663
 #define BNX2X_CHIP_NUM_57713		0x1651
 #define BNX2X_CHIP_NUM_57713E		0x1652
+#define BNX2X_CHIP_NUM_57800		0x168a
+#define BNX2X_CHIP_NUM_57810		0x168e
+#define BNX2X_CHIP_NUM_57840		0x168d
 
 #define BNX2X_CHIP_NUM(x)		(x >> 16)
 #define BNX2X_CHIP_IS_57710(x)		\
@@ -402,10 +405,19 @@ struct bnx2x_bd_chain_next {
 	(BNX2X_CHIP_NUM(x) == BNX2X_CHIP_NUM_57713)
 #define BNX2X_CHIP_IS_57713E(x)		\
 	(BNX2X_CHIP_NUM(x) == BNX2X_CHIP_NUM_57713E)
+#define BNX2X_CHIP_IS_57800(x)		\
+	(BNX2X_CHIP_NUM(x) == BNX2X_CHIP_NUM_57800)
+#define BNX2X_CHIP_IS_57810(x)		\
+	(BNX2X_CHIP_NUM(x) == BNX2X_CHIP_NUM_57810)
+#define BNX2X_CHIP_IS_57840(x)		\
+	(BNX2X_CHIP_NUM(x) == BNX2X_CHIP_NUM_57840)
 #define BNX2X_CHIP_IS_E2(x)		\
 	(BNX2X_CHIP_IS_57712(x) || BNX2X_CHIP_IS_57712E(x) || \
 	 BNX2X_CHIP_IS_57713(x) || BNX2X_CHIP_IS_57713E(x))
-#define BNX2X_CHIP_IS_E2_PLUS(x) BNX2X_CHIP_IS_E2(x)
+#define BNX2X_CHIP_IS_E3(x)			\
+	(BNX2X_CHIP_IS_57800(x) || BNX2X_CHIP_IS_57810(x) || \
+	 BNX2X_CHIP_IS_57840(x))
+#define BNX2X_CHIP_IS_E2_PLUS(x) (BNX2X_CHIP_IS_E2(x) || BNX2X_CHIP_IS_E3(x))
 
 #define IS_E1H_OFFSET       		BNX2X_CHIP_IS_E1H(cp->chip_id)
 
diff --git a/drivers/scsi/bnx2i/57xx_iscsi_constants.h b/drivers/scsi/bnx2i/57xx_iscsi_constants.h
index 30e6bdbd65af..15673cc786ff 100644
--- a/drivers/scsi/bnx2i/57xx_iscsi_constants.h
+++ b/drivers/scsi/bnx2i/57xx_iscsi_constants.h
@@ -125,7 +125,7 @@
 
 /* SQ/RQ/CQ DB structure sizes */
 #define ISCSI_SQ_DB_SIZE    (16)
-#define ISCSI_RQ_DB_SIZE    (16)
+#define ISCSI_RQ_DB_SIZE    (64)
 #define ISCSI_CQ_DB_SIZE    (80)
 
 #define ISCSI_SQN_TO_NOTIFY_NOT_VALID                                   0xFFFF
diff --git a/drivers/scsi/bnx2i/bnx2i.h b/drivers/scsi/bnx2i/bnx2i.h
index 6bdd25a93db9..e7cb7ecf6847 100644
--- a/drivers/scsi/bnx2i/bnx2i.h
+++ b/drivers/scsi/bnx2i/bnx2i.h
@@ -478,7 +478,7 @@ struct bnx2i_5771x_cq_db {
 
 struct bnx2i_5771x_sq_rq_db {
 	u16 prod_idx;
-	u8 reserved0[14]; /* Pad structure size to 16 bytes */
+	u8 reserved0[62]; /* Pad structure size to 64 bytes */
 };
 
 
diff --git a/drivers/scsi/bnx2i/bnx2i_init.c b/drivers/scsi/bnx2i/bnx2i_init.c
index 6adbdc34a9a5..6973413e91ec 100644
--- a/drivers/scsi/bnx2i/bnx2i_init.c
+++ b/drivers/scsi/bnx2i/bnx2i_init.c
@@ -30,7 +30,7 @@ MODULE_AUTHOR("Anil Veerabhadrappa <anilgv@broadcom.com> and "
 	      "Eddie Wai <eddie.wai@broadcom.com>");
 
 MODULE_DESCRIPTION("Broadcom NetXtreme II BCM5706/5708/5709/57710/57711/57712"
-		   " iSCSI Driver");
+		   "/57800/57810/57840 iSCSI Driver");
 MODULE_LICENSE("GPL");
 MODULE_VERSION(DRV_MODULE_VERSION);
 
@@ -88,11 +88,20 @@ void bnx2i_identify_device(struct bnx2i_hba *hba)
 	    (hba->pci_did == PCI_DEVICE_ID_NX2_5709S)) {
 		set_bit(BNX2I_NX2_DEV_5709, &hba->cnic_dev_type);
 		hba->mail_queue_access = BNX2I_MQ_BIN_MODE;
-	} else if (hba->pci_did == PCI_DEVICE_ID_NX2_57710  ||
-		   hba->pci_did == PCI_DEVICE_ID_NX2_57711  ||
-		   hba->pci_did == PCI_DEVICE_ID_NX2_57711E ||
-		   hba->pci_did == PCI_DEVICE_ID_NX2_57712  ||
-		   hba->pci_did == PCI_DEVICE_ID_NX2_57712E)
+	} else if (hba->pci_did == PCI_DEVICE_ID_NX2_57710    ||
+		   hba->pci_did == PCI_DEVICE_ID_NX2_57711    ||
+		   hba->pci_did == PCI_DEVICE_ID_NX2_57711E   ||
+		   hba->pci_did == PCI_DEVICE_ID_NX2_57712    ||
+		   hba->pci_did == PCI_DEVICE_ID_NX2_57712E   ||
+		   hba->pci_did == PCI_DEVICE_ID_NX2_57800    ||
+		   hba->pci_did == PCI_DEVICE_ID_NX2_57800_MF ||
+		   hba->pci_did == PCI_DEVICE_ID_NX2_57800_VF ||
+		   hba->pci_did == PCI_DEVICE_ID_NX2_57810    ||
+		   hba->pci_did == PCI_DEVICE_ID_NX2_57810_MF ||
+		   hba->pci_did == PCI_DEVICE_ID_NX2_57810_VF ||
+		   hba->pci_did == PCI_DEVICE_ID_NX2_57840    ||
+		   hba->pci_did == PCI_DEVICE_ID_NX2_57840_MF ||
+		   hba->pci_did == PCI_DEVICE_ID_NX2_57840_VF)
 		set_bit(BNX2I_NX2_DEV_57710, &hba->cnic_dev_type);
 	else
 		printk(KERN_ALERT "bnx2i: unknown device, 0x%x\n",
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index a311008af5e1..d76bd9e4c784 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2102,6 +2102,9 @@
 #define PCI_DEVICE_ID_TIGON3_5761E	0x1680
 #define PCI_DEVICE_ID_TIGON3_5761	0x1681
 #define PCI_DEVICE_ID_TIGON3_5764	0x1684
+#define PCI_DEVICE_ID_NX2_57800		0x168a
+#define PCI_DEVICE_ID_NX2_57840		0x168d
+#define PCI_DEVICE_ID_NX2_57810		0x168e
 #define PCI_DEVICE_ID_TIGON3_5787M	0x1693
 #define PCI_DEVICE_ID_TIGON3_5782	0x1696
 #define PCI_DEVICE_ID_TIGON3_5784	0x1698
@@ -2109,11 +2112,17 @@
 #define PCI_DEVICE_ID_TIGON3_5787	0x169b
 #define PCI_DEVICE_ID_TIGON3_5788	0x169c
 #define PCI_DEVICE_ID_TIGON3_5789	0x169d
+#define PCI_DEVICE_ID_NX2_57800_MF	0x16a5
 #define PCI_DEVICE_ID_TIGON3_5702X	0x16a6
 #define PCI_DEVICE_ID_TIGON3_5703X	0x16a7
 #define PCI_DEVICE_ID_TIGON3_5704S	0x16a8
+#define PCI_DEVICE_ID_NX2_57800_VF	0x16a9
 #define PCI_DEVICE_ID_NX2_5706S		0x16aa
+#define PCI_DEVICE_ID_NX2_57840_MF	0x16ab
 #define PCI_DEVICE_ID_NX2_5708S		0x16ac
+#define PCI_DEVICE_ID_NX2_57840_VF	0x16ad
+#define PCI_DEVICE_ID_NX2_57810_MF	0x16ae
+#define PCI_DEVICE_ID_NX2_57810_VF	0x16af
 #define PCI_DEVICE_ID_TIGON3_5702A3	0x16c6
 #define PCI_DEVICE_ID_TIGON3_5703A3	0x16c7
 #define PCI_DEVICE_ID_TIGON3_5781	0x16dd
-- 
cgit v1.2.3


From 314b4778ed579f29b6d46ba90dbf31314c13805f Mon Sep 17 00:00:00 2001
From: John Fastabend <john.r.fastabend@intel.com>
Date: Tue, 21 Jun 2011 07:34:37 +0000
Subject: net: dcbnl, add multicast group for DCB

Now that dcbnl is being used in many cases by more
than a single agent it is beneficial to be notified
when some entity either driver or user space has
changed the DCB attributes.

Today applications either end up polling the interface
or relying on a user space database to maintain the DCB
state and post events. Polling is a poor solution for
obvious reasons. And relying on a user space database
has its own downside. Namely it has created strange
boot dependencies requiring the database be populated
before any applications dependent on DCB attributes
starts or the application goes into a polling loop.
Populating the database requires negotiating link
setting with the peer and can take anywhere from less
than a second up to a few seconds depending on the switch
implementation.

Perhaps more importantly if another application or an
embedded agent sets a DCB link attribute the database
has no way of knowing other than polling the kernel.
This prevents applications from responding quickly to
changes in link events which at least in the FCoE case
and probably any other protocols expecting a lossless
link may result in IO errors.

By adding a multicast group for DCB we have clean way
to disseminate kernel DCB link attributes up to user
space. Avoiding the need for user space to maintain
a coherant database and disperse events that potentially
do not reflect the current link state.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rtnetlink.h |   2 +
 include/net/dcbnl.h       |   2 +
 net/dcb/dcbnl.c           | 229 +++++++++++++++++++++++++++++-----------------
 3 files changed, 150 insertions(+), 83 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index bbad657a3725..c81226a9a35c 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -585,6 +585,8 @@ enum rtnetlink_groups {
 #define RTNLGRP_PHONET_IFADDR	RTNLGRP_PHONET_IFADDR
 	RTNLGRP_PHONET_ROUTE,
 #define RTNLGRP_PHONET_ROUTE	RTNLGRP_PHONET_ROUTE
+	RTNLGRP_DCB,
+#define RTNLGRP_DCB		RTNLGRP_DCB
 	__RTNLGRP_MAX
 };
 #define RTNLGRP_MAX	(__RTNLGRP_MAX - 1)
diff --git a/include/net/dcbnl.h b/include/net/dcbnl.h
index e5983c9053dc..b3cf10d9b828 100644
--- a/include/net/dcbnl.h
+++ b/include/net/dcbnl.h
@@ -31,6 +31,8 @@ struct dcb_app_type {
 u8 dcb_setapp(struct net_device *, struct dcb_app *);
 u8 dcb_getapp(struct net_device *, struct dcb_app *);
 
+int dcbnl_notify(struct net_device *dev, int event, int cmd, u32 seq, u32 pid);
+
 /*
  * Ops struct for the netlink callbacks.  Used by DCB-enabled drivers through
  * the netdevice struct.
diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c
index 3a6d97d5280c..ffba32692bdb 100644
--- a/net/dcb/dcbnl.c
+++ b/net/dcb/dcbnl.c
@@ -1166,64 +1166,6 @@ err:
 	return ret;
 }
 
-/* Handle IEEE 802.1Qaz SET commands. If any requested operation can not
- * be completed the entire msg is aborted and error value is returned.
- * No attempt is made to reconcile the case where only part of the
- * cmd can be completed.
- */
-static int dcbnl_ieee_set(struct net_device *netdev, struct nlattr **tb,
-			  u32 pid, u32 seq, u16 flags)
-{
-	const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
-	struct nlattr *ieee[DCB_ATTR_IEEE_MAX + 1];
-	int err = -EOPNOTSUPP;
-
-	if (!ops)
-		goto err;
-
-	err = nla_parse_nested(ieee, DCB_ATTR_IEEE_MAX,
-			       tb[DCB_ATTR_IEEE], dcbnl_ieee_policy);
-	if (err)
-		goto err;
-
-	if (ieee[DCB_ATTR_IEEE_ETS] && ops->ieee_setets) {
-		struct ieee_ets *ets = nla_data(ieee[DCB_ATTR_IEEE_ETS]);
-		err = ops->ieee_setets(netdev, ets);
-		if (err)
-			goto err;
-	}
-
-	if (ieee[DCB_ATTR_IEEE_PFC] && ops->ieee_setpfc) {
-		struct ieee_pfc *pfc = nla_data(ieee[DCB_ATTR_IEEE_PFC]);
-		err = ops->ieee_setpfc(netdev, pfc);
-		if (err)
-			goto err;
-	}
-
-	if (ieee[DCB_ATTR_IEEE_APP_TABLE]) {
-		struct nlattr *attr;
-		int rem;
-
-		nla_for_each_nested(attr, ieee[DCB_ATTR_IEEE_APP_TABLE], rem) {
-			struct dcb_app *app_data;
-			if (nla_type(attr) != DCB_ATTR_IEEE_APP)
-				continue;
-			app_data = nla_data(attr);
-			if (ops->ieee_setapp)
-				err = ops->ieee_setapp(netdev, app_data);
-			else
-				err = dcb_setapp(netdev, app_data);
-			if (err)
-				goto err;
-		}
-	}
-
-err:
-	dcbnl_reply(err, RTM_SETDCB, DCB_CMD_IEEE_SET, DCB_ATTR_IEEE,
-		    pid, seq, flags);
-	return err;
-}
-
 static int dcbnl_build_peer_app(struct net_device *netdev, struct sk_buff* skb,
 				int app_nested_type, int app_info_type,
 				int app_entry_type)
@@ -1279,30 +1221,13 @@ nla_put_failure:
 }
 
 /* Handle IEEE 802.1Qaz GET commands. */
-static int dcbnl_ieee_get(struct net_device *netdev, struct nlattr **tb,
-			  u32 pid, u32 seq, u16 flags)
+static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev)
 {
-	struct sk_buff *skb;
-	struct nlmsghdr *nlh;
-	struct dcbmsg *dcb;
 	struct nlattr *ieee, *app;
 	struct dcb_app_type *itr;
 	const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
 	int dcbx;
-	int err;
-
-	if (!ops)
-		return -EOPNOTSUPP;
-
-	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
-	if (!skb)
-		return -ENOBUFS;
-
-	nlh = NLMSG_NEW(skb, pid, seq, RTM_GETDCB, sizeof(*dcb), flags);
-
-	dcb = NLMSG_DATA(nlh);
-	dcb->dcb_family = AF_UNSPEC;
-	dcb->cmd = DCB_CMD_IEEE_GET;
+	int err = -EMSGSIZE;
 
 	NLA_PUT_STRING(skb, DCB_ATTR_IFNAME, netdev->name);
 
@@ -1378,16 +1303,154 @@ static int dcbnl_ieee_get(struct net_device *netdev, struct nlattr **tb,
 		if (err)
 			goto nla_put_failure;
 	}
-	nlmsg_end(skb, nlh);
 
-	return rtnl_unicast(skb, &init_net, pid);
+	return 0;
+
 nla_put_failure:
-	nlmsg_cancel(skb, nlh);
-nlmsg_failure:
-	kfree_skb(skb);
-	return -1;
+	return err;
 }
 
+int dcbnl_notify(struct net_device *dev, int event, int cmd,
+			u32 seq, u32 pid)
+{
+	struct net *net = dev_net(dev);
+	struct sk_buff *skb;
+	struct nlmsghdr *nlh;
+	struct dcbmsg *dcb;
+	const struct dcbnl_rtnl_ops *ops = dev->dcbnl_ops;
+	int err;
+
+	if (!ops)
+		return -EOPNOTSUPP;
+
+	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!skb)
+		return -ENOBUFS;
+
+	nlh = nlmsg_put(skb, pid, 0, event, sizeof(*dcb), 0);
+	if (nlh == NULL) {
+		kfree(skb);
+		return -EMSGSIZE;
+	}
+
+	dcb = NLMSG_DATA(nlh);
+	dcb->dcb_family = AF_UNSPEC;
+	dcb->cmd = cmd;
+
+	err = dcbnl_ieee_fill(skb, dev);
+	if (err < 0) {
+		/* Report error to broadcast listeners */
+		nlmsg_cancel(skb, nlh);
+		kfree_skb(skb);
+		rtnl_set_sk_err(net, RTNLGRP_DCB, err);
+	} else {
+		/* End nlmsg and notify broadcast listeners */
+		nlmsg_end(skb, nlh);
+		rtnl_notify(skb, net, 0, RTNLGRP_DCB, NULL, GFP_KERNEL);
+	}
+
+	return err;
+}
+EXPORT_SYMBOL(dcbnl_notify);
+
+/* Handle IEEE 802.1Qaz SET commands. If any requested operation can not
+ * be completed the entire msg is aborted and error value is returned.
+ * No attempt is made to reconcile the case where only part of the
+ * cmd can be completed.
+ */
+static int dcbnl_ieee_set(struct net_device *netdev, struct nlattr **tb,
+			  u32 pid, u32 seq, u16 flags)
+{
+	const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
+	struct nlattr *ieee[DCB_ATTR_IEEE_MAX + 1];
+	int err = -EOPNOTSUPP;
+
+	if (!ops)
+		return err;
+
+	err = nla_parse_nested(ieee, DCB_ATTR_IEEE_MAX,
+			       tb[DCB_ATTR_IEEE], dcbnl_ieee_policy);
+	if (err)
+		return err;
+
+	if (ieee[DCB_ATTR_IEEE_ETS] && ops->ieee_setets) {
+		struct ieee_ets *ets = nla_data(ieee[DCB_ATTR_IEEE_ETS]);
+		err = ops->ieee_setets(netdev, ets);
+		if (err)
+			goto err;
+	}
+
+	if (ieee[DCB_ATTR_IEEE_PFC] && ops->ieee_setpfc) {
+		struct ieee_pfc *pfc = nla_data(ieee[DCB_ATTR_IEEE_PFC]);
+		err = ops->ieee_setpfc(netdev, pfc);
+		if (err)
+			goto err;
+	}
+
+	if (ieee[DCB_ATTR_IEEE_APP_TABLE]) {
+		struct nlattr *attr;
+		int rem;
+
+		nla_for_each_nested(attr, ieee[DCB_ATTR_IEEE_APP_TABLE], rem) {
+			struct dcb_app *app_data;
+			if (nla_type(attr) != DCB_ATTR_IEEE_APP)
+				continue;
+			app_data = nla_data(attr);
+			if (ops->ieee_setapp)
+				err = ops->ieee_setapp(netdev, app_data);
+			else
+				err = dcb_setapp(netdev, app_data);
+			if (err)
+				goto err;
+		}
+	}
+
+err:
+	dcbnl_reply(err, RTM_SETDCB, DCB_CMD_IEEE_SET, DCB_ATTR_IEEE,
+		    pid, seq, flags);
+	dcbnl_notify(netdev, RTM_SETDCB, DCB_CMD_IEEE_SET, seq, 0);
+	return err;
+}
+
+static int dcbnl_ieee_get(struct net_device *netdev, struct nlattr **tb,
+			  u32 pid, u32 seq, u16 flags)
+{
+	struct net *net = dev_net(netdev);
+	struct sk_buff *skb;
+	struct nlmsghdr *nlh;
+	struct dcbmsg *dcb;
+	const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
+	int err;
+
+	if (!ops)
+		return -EOPNOTSUPP;
+
+	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!skb)
+		return -ENOBUFS;
+
+	nlh = nlmsg_put(skb, pid, seq, RTM_GETDCB, sizeof(*dcb), flags);
+	if (nlh == NULL) {
+		kfree(skb);
+		return -EMSGSIZE;
+	}
+
+	dcb = NLMSG_DATA(nlh);
+	dcb->dcb_family = AF_UNSPEC;
+	dcb->cmd = DCB_CMD_IEEE_GET;
+
+	err = dcbnl_ieee_fill(skb, netdev);
+
+	if (err < 0) {
+		nlmsg_cancel(skb, nlh);
+		kfree_skb(skb);
+	} else {
+		nlmsg_end(skb, nlh);
+		err = rtnl_unicast(skb, net, pid);
+	}
+
+	return err;
+}
 /* DCBX configuration */
 static int dcbnl_getdcbx(struct net_device *netdev, struct nlattr **tb,
 			 u32 pid, u32 seq, u16 flags)
-- 
cgit v1.2.3


From f9ae7e4b515c4d56baf6e0e84ebee2e03ae57a25 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.r.fastabend@intel.com>
Date: Tue, 21 Jun 2011 07:34:48 +0000
Subject: dcb: Add ieee_dcb_delapp() and dcb op to delete app entry

Now that we allow multiple IEEE App entries we need a way
to remove specific entries. To do this add the ieee_dcb_delapp()
routine.

Additionaly drivers may need to remove the APP entry from
their firmware tables. Add dcb ops routine to handle this.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dcbnl.h |  2 ++
 include/net/dcbnl.h   |  2 ++
 net/dcb/dcbnl.c       | 90 +++++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 92 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dcbnl.h b/include/linux/dcbnl.h
index c52280047e2c..66a67235e729 100644
--- a/include/linux/dcbnl.h
+++ b/include/linux/dcbnl.h
@@ -203,6 +203,7 @@ struct dcbmsg {
  * @DCB_CMD_GFEATCFG: get DCBX features flags
  * @DCB_CMD_SFEATCFG: set DCBX features negotiation flags
  * @DCB_CMD_CEE_GET: get CEE aggregated configuration
+ * @DCB_CMD_IEEE_DEL: delete IEEE 802.1Qaz configuration
  */
 enum dcbnl_commands {
 	DCB_CMD_UNDEFINED,
@@ -246,6 +247,7 @@ enum dcbnl_commands {
 	DCB_CMD_SFEATCFG,
 
 	DCB_CMD_CEE_GET,
+	DCB_CMD_IEEE_DEL,
 
 	__DCB_CMD_ENUM_MAX,
 	DCB_CMD_MAX = __DCB_CMD_ENUM_MAX - 1,
diff --git a/include/net/dcbnl.h b/include/net/dcbnl.h
index c53a4e06a16a..62a268531d4a 100644
--- a/include/net/dcbnl.h
+++ b/include/net/dcbnl.h
@@ -31,6 +31,7 @@ struct dcb_app_type {
 u8 dcb_setapp(struct net_device *, struct dcb_app *);
 u8 dcb_getapp(struct net_device *, struct dcb_app *);
 int dcb_ieee_setapp(struct net_device *, struct dcb_app *);
+int dcb_ieee_delapp(struct net_device *, struct dcb_app *);
 
 int dcbnl_notify(struct net_device *dev, int event, int cmd, u32 seq, u32 pid);
 
@@ -46,6 +47,7 @@ struct dcbnl_rtnl_ops {
 	int (*ieee_setpfc) (struct net_device *, struct ieee_pfc *);
 	int (*ieee_getapp) (struct net_device *, struct dcb_app *);
 	int (*ieee_setapp) (struct net_device *, struct dcb_app *);
+	int (*ieee_delapp) (struct net_device *, struct dcb_app *);
 	int (*ieee_peer_getets) (struct net_device *, struct ieee_ets *);
 	int (*ieee_peer_getpfc) (struct net_device *, struct ieee_pfc *);
 
diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c
index 3e3b51c4a84a..196084f310d2 100644
--- a/net/dcb/dcbnl.c
+++ b/net/dcb/dcbnl.c
@@ -1451,6 +1451,52 @@ static int dcbnl_ieee_get(struct net_device *netdev, struct nlattr **tb,
 
 	return err;
 }
+
+static int dcbnl_ieee_del(struct net_device *netdev, struct nlattr **tb,
+			  u32 pid, u32 seq, u16 flags)
+{
+	const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
+	struct nlattr *ieee[DCB_ATTR_IEEE_MAX + 1];
+	int err = -EOPNOTSUPP;
+
+	if (!ops)
+		return -EOPNOTSUPP;
+
+	if (!tb[DCB_ATTR_IEEE])
+		return -EINVAL;
+
+	err = nla_parse_nested(ieee, DCB_ATTR_IEEE_MAX,
+			       tb[DCB_ATTR_IEEE], dcbnl_ieee_policy);
+	if (err)
+		return err;
+
+	if (ieee[DCB_ATTR_IEEE_APP_TABLE]) {
+		struct nlattr *attr;
+		int rem;
+
+		nla_for_each_nested(attr, ieee[DCB_ATTR_IEEE_APP_TABLE], rem) {
+			struct dcb_app *app_data;
+
+			if (nla_type(attr) != DCB_ATTR_IEEE_APP)
+				continue;
+			app_data = nla_data(attr);
+			if (ops->ieee_delapp)
+				err = ops->ieee_delapp(netdev, app_data);
+			else
+				err = dcb_ieee_delapp(netdev, app_data);
+			if (err)
+				goto err;
+		}
+	}
+
+err:
+	dcbnl_reply(err, RTM_SETDCB, DCB_CMD_IEEE_DEL, DCB_ATTR_IEEE,
+		    pid, seq, flags);
+	dcbnl_notify(netdev, RTM_SETDCB, DCB_CMD_IEEE_DEL, seq, 0);
+	return err;
+}
+
+
 /* DCBX configuration */
 static int dcbnl_getdcbx(struct net_device *netdev, struct nlattr **tb,
 			 u32 pid, u32 seq, u16 flags)
@@ -1765,11 +1811,15 @@ static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 		goto out;
 	case DCB_CMD_IEEE_SET:
 		ret = dcbnl_ieee_set(netdev, tb, pid, nlh->nlmsg_seq,
-				 nlh->nlmsg_flags);
+				     nlh->nlmsg_flags);
 		goto out;
 	case DCB_CMD_IEEE_GET:
 		ret = dcbnl_ieee_get(netdev, tb, pid, nlh->nlmsg_seq,
-				 nlh->nlmsg_flags);
+				     nlh->nlmsg_flags);
+		goto out;
+	case DCB_CMD_IEEE_DEL:
+		ret = dcbnl_ieee_del(netdev, tb, pid, nlh->nlmsg_seq,
+				     nlh->nlmsg_flags);
 		goto out;
 	case DCB_CMD_GDCBX:
 		ret = dcbnl_getdcbx(netdev, tb, pid, nlh->nlmsg_seq,
@@ -1924,6 +1974,42 @@ out:
 }
 EXPORT_SYMBOL(dcb_ieee_setapp);
 
+/**
+ * dcb_ieee_delapp - delete IEEE dcb application data from list
+ *
+ * This removes a matching APP data from the APP list
+ */
+int dcb_ieee_delapp(struct net_device *dev, struct dcb_app *del)
+{
+	struct dcb_app_type *itr;
+	struct dcb_app_type event;
+	int err = -ENOENT;
+
+	memcpy(&event.name, dev->name, sizeof(event.name));
+	memcpy(&event.app, del, sizeof(event.app));
+
+	spin_lock(&dcb_lock);
+	/* Search for existing match and remove it. */
+	list_for_each_entry(itr, &dcb_app_list, list) {
+		if (itr->app.selector == del->selector &&
+		    itr->app.protocol == del->protocol &&
+		    itr->app.priority == del->priority &&
+		    (strncmp(itr->name, dev->name, IFNAMSIZ) == 0)) {
+			list_del(&itr->list);
+			kfree(itr);
+			err = 0;
+			goto out;
+		}
+	}
+
+out:
+	spin_unlock(&dcb_lock);
+	if (!err)
+		call_dcbevent_notifiers(DCB_APP_EVENT, &event);
+	return err;
+}
+EXPORT_SYMBOL(dcb_ieee_delapp);
+
 static void dcb_flushapp(void)
 {
 	struct dcb_app_type *app;
-- 
cgit v1.2.3


From b7f080cfe223b3b7424872639d153695615a9255 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 16 Jun 2011 11:01:34 +0000
Subject: net: remove mm.h inclusion from netdevice.h

Remove linux/mm.h inclusion from netdevice.h -- it's unused (I've checked manually).

To prevent mm.h inclusion via other channels also extract "enum dma_data_direction"
definition into separate header. This tiny piece is what gluing netdevice.h with mm.h
via "netdevice.h => dmaengine.h => dma-mapping.h => scatterlist.h => mm.h".
Removal of mm.h from scatterlist.h was tried and was found not feasible
on most archs, so the link was cutoff earlier.

Hope people are OK with tiny include file.

Note, that mm_types.h is still dragged in, but it is a separate story.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/arm/mach-davinci/board-mityomapl138.c        |  1 +
 arch/arm/mach-davinci/dm646x.c                    |  1 +
 arch/arm/mach-davinci/pm.c                        |  1 +
 arch/arm/mach-imx/dma-v1.c                        |  1 +
 arch/arm/mach-imx/mach-mx31_3ds.c                 |  1 +
 arch/arm/mach-iop13xx/setup.c                     |  1 +
 arch/arm/mach-mxs/devices/platform-auart.c        |  1 +
 arch/arm/mach-mxs/devices/platform-dma.c          |  1 +
 arch/arm/mach-mxs/devices/platform-fec.c          |  1 +
 arch/arm/plat-mxc/devices/platform-fec.c          |  1 +
 arch/arm/plat-mxc/devices/platform-fsl-usb2-udc.c |  1 +
 arch/arm/plat-mxc/devices/platform-imx-fb.c       |  1 +
 arch/arm/plat-mxc/devices/platform-ipu-core.c     |  1 +
 arch/arm/plat-mxc/devices/platform-mxc-ehci.c     |  1 +
 arch/arm/plat-mxc/devices/platform-mxc-mmc.c      |  1 +
 arch/arm/plat-nomadik/include/plat/ste_dma40.h    |  1 +
 arch/x86/kernel/tboot.c                           |  1 +
 crypto/async_tx/raid6test.c                       |  1 +
 drivers/dma/coh901318.c                           |  1 +
 drivers/dma/dmaengine.c                           |  1 +
 drivers/dma/dmatest.c                             |  1 +
 drivers/dma/ipu/ipu_idmac.c                       |  1 +
 drivers/dma/ste_dma40.c                           |  1 +
 drivers/media/dvb/mantis/mantis_ca.c              |  1 +
 drivers/media/dvb/mantis/mantis_evm.c             |  1 +
 drivers/media/dvb/mantis/mantis_hif.c             |  1 +
 drivers/media/dvb/mantis/mantis_ioc.c             |  1 +
 drivers/media/dvb/mantis/mantis_pcmcia.c          |  1 +
 drivers/media/dvb/mantis/mantis_uart.c            |  1 +
 drivers/media/dvb/mantis/mantis_vp1034.c          |  1 +
 drivers/mmc/host/tmio_mmc_dma.c                   |  1 +
 drivers/mtd/nand/atmel_nand.c                     |  1 +
 drivers/net/arm/ks8695net.c                       |  1 +
 drivers/net/bnx2x/bnx2x.h                         |  1 +
 drivers/net/can/janz-ican3.c                      |  1 +
 drivers/net/can/softing/softing_fw.c              |  1 +
 drivers/net/can/softing/softing_main.c            |  1 +
 drivers/net/ethoc.c                               |  1 +
 drivers/net/fec_mpc52xx.c                         |  1 +
 drivers/net/greth.c                               |  1 +
 drivers/net/irda/pxaficp_ir.c                     |  1 +
 drivers/net/ks8851_mll.c                          |  1 +
 drivers/net/sgiseeq.c                             |  1 +
 drivers/net/stmmac/dwmac1000_core.c               |  1 +
 drivers/net/stmmac/dwmac1000_dma.c                |  1 +
 drivers/net/stmmac/dwmac100_core.c                |  1 +
 drivers/net/stmmac/dwmac100_dma.c                 |  1 +
 drivers/net/stmmac/stmmac_ethtool.c               |  1 +
 drivers/net/stmmac/stmmac_mdio.c                  |  1 +
 drivers/net/usb/cdc-phonet.c                      |  1 +
 drivers/net/vxge/vxge-config.h                    |  1 +
 drivers/net/wireless/ath/ath5k/base.c             |  1 +
 drivers/net/wireless/ath/ath9k/beacon.c           |  1 +
 drivers/net/wireless/ath/ath9k/init.c             |  1 +
 drivers/net/wireless/ath/ath9k/recv.c             |  1 +
 drivers/net/wireless/ath/ath9k/xmit.c             |  1 +
 drivers/staging/pohmelfs/crypto.c                 |  1 +
 drivers/tty/serial/ifx6x60.c                      |  1 +
 drivers/usb/gadget/f_phonet.c                     |  1 +
 include/crypto/if_alg.h                           |  1 +
 include/linux/dma-direction.h                     | 13 +++++++++++++
 include/linux/dma-mapping.h                       | 10 +---------
 include/linux/dmaengine.h                         |  4 +++-
 include/linux/netdevice.h                         |  1 -
 net/sched/sch_netem.c                             |  1 +
 security/apparmor/lib.c                           |  1 +
 66 files changed, 79 insertions(+), 11 deletions(-)
 create mode 100644 include/linux/dma-direction.h

(limited to 'include/linux')

diff --git a/arch/arm/mach-davinci/board-mityomapl138.c b/arch/arm/mach-davinci/board-mityomapl138.c
index 606a6f27ed6c..5f5d78308873 100644
--- a/arch/arm/mach-davinci/board-mityomapl138.c
+++ b/arch/arm/mach-davinci/board-mityomapl138.c
@@ -20,6 +20,7 @@
 #include <linux/spi/spi.h>
 #include <linux/spi/flash.h>
 
+#include <asm/io.h>
 #include <asm/mach-types.h>
 #include <asm/mach/arch.h>
 #include <mach/common.h>
diff --git a/arch/arm/mach-davinci/dm646x.c b/arch/arm/mach-davinci/dm646x.c
index 1e0f809644bb..e00d61e2efbe 100644
--- a/arch/arm/mach-davinci/dm646x.c
+++ b/arch/arm/mach-davinci/dm646x.c
@@ -8,6 +8,7 @@
  * is licensed "as is" without any warranty of any kind, whether express
  * or implied.
  */
+#include <linux/dma-mapping.h>
 #include <linux/init.h>
 #include <linux/clk.h>
 #include <linux/serial_8250.h>
diff --git a/arch/arm/mach-davinci/pm.c b/arch/arm/mach-davinci/pm.c
index 1bd73a04be20..04c49f7543ef 100644
--- a/arch/arm/mach-davinci/pm.c
+++ b/arch/arm/mach-davinci/pm.c
@@ -17,6 +17,7 @@
 
 #include <asm/cacheflush.h>
 #include <asm/delay.h>
+#include <asm/io.h>
 
 #include <mach/da8xx.h>
 #include <mach/sram.h>
diff --git a/arch/arm/mach-imx/dma-v1.c b/arch/arm/mach-imx/dma-v1.c
index 236f1495efad..f8aa5be0eb15 100644
--- a/arch/arm/mach-imx/dma-v1.c
+++ b/arch/arm/mach-imx/dma-v1.c
@@ -26,6 +26,7 @@
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/interrupt.h>
+#include <linux/err.h>
 #include <linux/errno.h>
 #include <linux/clk.h>
 #include <linux/scatterlist.h>
diff --git a/arch/arm/mach-imx/mach-mx31_3ds.c b/arch/arm/mach-imx/mach-mx31_3ds.c
index 9b982449cb52..b5ecc26d08a4 100644
--- a/arch/arm/mach-imx/mach-mx31_3ds.c
+++ b/arch/arm/mach-imx/mach-mx31_3ds.c
@@ -13,6 +13,7 @@
  */
 
 #include <linux/delay.h>
+#include <linux/dma-mapping.h>
 #include <linux/types.h>
 #include <linux/init.h>
 #include <linux/clk.h>
diff --git a/arch/arm/mach-iop13xx/setup.c b/arch/arm/mach-iop13xx/setup.c
index 5c147fb66a01..a5b989728b9e 100644
--- a/arch/arm/mach-iop13xx/setup.c
+++ b/arch/arm/mach-iop13xx/setup.c
@@ -17,6 +17,7 @@
  *
  */
 
+#include <linux/dma-mapping.h>
 #include <linux/serial_8250.h>
 #include <linux/io.h>
 #ifdef CONFIG_MTD_PHYSMAP
diff --git a/arch/arm/mach-mxs/devices/platform-auart.c b/arch/arm/mach-mxs/devices/platform-auart.c
index 796606cce0ce..27608f5d2ac8 100644
--- a/arch/arm/mach-mxs/devices/platform-auart.c
+++ b/arch/arm/mach-mxs/devices/platform-auart.c
@@ -6,6 +6,7 @@
  * the terms of the GNU General Public License version 2 as published by the
  * Free Software Foundation.
  */
+#include <linux/dma-mapping.h>
 #include <asm/sizes.h>
 #include <mach/mx23.h>
 #include <mach/mx28.h>
diff --git a/arch/arm/mach-mxs/devices/platform-dma.c b/arch/arm/mach-mxs/devices/platform-dma.c
index 295c4424d5d9..6a0202b1016c 100644
--- a/arch/arm/mach-mxs/devices/platform-dma.c
+++ b/arch/arm/mach-mxs/devices/platform-dma.c
@@ -6,6 +6,7 @@
  * Free Software Foundation.
  */
 #include <linux/compiler.h>
+#include <linux/dma-mapping.h>
 #include <linux/err.h>
 #include <linux/init.h>
 
diff --git a/arch/arm/mach-mxs/devices/platform-fec.c b/arch/arm/mach-mxs/devices/platform-fec.c
index 9859cf283335..ae96a4fd8f14 100644
--- a/arch/arm/mach-mxs/devices/platform-fec.c
+++ b/arch/arm/mach-mxs/devices/platform-fec.c
@@ -6,6 +6,7 @@
  * the terms of the GNU General Public License version 2 as published by the
  * Free Software Foundation.
  */
+#include <linux/dma-mapping.h>
 #include <asm/sizes.h>
 #include <mach/mx28.h>
 #include <mach/devices-common.h>
diff --git a/arch/arm/plat-mxc/devices/platform-fec.c b/arch/arm/plat-mxc/devices/platform-fec.c
index ccc789e21daa..4fc6ffc2a13e 100644
--- a/arch/arm/plat-mxc/devices/platform-fec.c
+++ b/arch/arm/plat-mxc/devices/platform-fec.c
@@ -6,6 +6,7 @@
  * the terms of the GNU General Public License version 2 as published by the
  * Free Software Foundation.
  */
+#include <linux/dma-mapping.h>
 #include <asm/sizes.h>
 #include <mach/hardware.h>
 #include <mach/devices-common.h>
diff --git a/arch/arm/plat-mxc/devices/platform-fsl-usb2-udc.c b/arch/arm/plat-mxc/devices/platform-fsl-usb2-udc.c
index 59c33f6e401c..23ce08e6ffd2 100644
--- a/arch/arm/plat-mxc/devices/platform-fsl-usb2-udc.c
+++ b/arch/arm/plat-mxc/devices/platform-fsl-usb2-udc.c
@@ -6,6 +6,7 @@
  * the terms of the GNU General Public License version 2 as published by the
  * Free Software Foundation.
  */
+#include <linux/dma-mapping.h>
 #include <mach/hardware.h>
 #include <mach/devices-common.h>
 
diff --git a/arch/arm/plat-mxc/devices/platform-imx-fb.c b/arch/arm/plat-mxc/devices/platform-imx-fb.c
index 79a1cb18a5b0..2b0b5e0aa998 100644
--- a/arch/arm/plat-mxc/devices/platform-imx-fb.c
+++ b/arch/arm/plat-mxc/devices/platform-imx-fb.c
@@ -6,6 +6,7 @@
  * the terms of the GNU General Public License version 2 as published by the
  * Free Software Foundation.
  */
+#include <linux/dma-mapping.h>
 #include <mach/hardware.h>
 #include <mach/devices-common.h>
 
diff --git a/arch/arm/plat-mxc/devices/platform-ipu-core.c b/arch/arm/plat-mxc/devices/platform-ipu-core.c
index edf65034aea5..79d340ae0af1 100644
--- a/arch/arm/plat-mxc/devices/platform-ipu-core.c
+++ b/arch/arm/plat-mxc/devices/platform-ipu-core.c
@@ -6,6 +6,7 @@
  * the terms of the GNU General Public License version 2 as published by the
  * Free Software Foundation.
  */
+#include <linux/dma-mapping.h>
 #include <mach/hardware.h>
 #include <mach/devices-common.h>
 
diff --git a/arch/arm/plat-mxc/devices/platform-mxc-ehci.c b/arch/arm/plat-mxc/devices/platform-mxc-ehci.c
index cc488f4b6204..e1763e03e7cb 100644
--- a/arch/arm/plat-mxc/devices/platform-mxc-ehci.c
+++ b/arch/arm/plat-mxc/devices/platform-mxc-ehci.c
@@ -6,6 +6,7 @@
  * the terms of the GNU General Public License version 2 as published by the
  * Free Software Foundation.
  */
+#include <linux/dma-mapping.h>
 #include <mach/hardware.h>
 #include <mach/devices-common.h>
 
diff --git a/arch/arm/plat-mxc/devices/platform-mxc-mmc.c b/arch/arm/plat-mxc/devices/platform-mxc-mmc.c
index 90d762f6f93b..540d3a7d92df 100644
--- a/arch/arm/plat-mxc/devices/platform-mxc-mmc.c
+++ b/arch/arm/plat-mxc/devices/platform-mxc-mmc.c
@@ -6,6 +6,7 @@
  * the terms of the GNU General Public License version 2 as published by the
  * Free Software Foundation.
  */
+#include <linux/dma-mapping.h>
 #include <mach/hardware.h>
 #include <mach/devices-common.h>
 
diff --git a/arch/arm/plat-nomadik/include/plat/ste_dma40.h b/arch/arm/plat-nomadik/include/plat/ste_dma40.h
index c44886062f8e..685c78716d95 100644
--- a/arch/arm/plat-nomadik/include/plat/ste_dma40.h
+++ b/arch/arm/plat-nomadik/include/plat/ste_dma40.h
@@ -10,6 +10,7 @@
 #define STE_DMA40_H
 
 #include <linux/dmaengine.h>
+#include <linux/scatterlist.h>
 #include <linux/workqueue.h>
 #include <linux/interrupt.h>
 
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 30ac65df7d4e..e07a2fc876b9 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -36,6 +36,7 @@
 #include <asm/bootparam.h>
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
+#include <asm/swiotlb.h>
 #include <asm/fixmap.h>
 #include <asm/proto.h>
 #include <asm/setup.h>
diff --git a/crypto/async_tx/raid6test.c b/crypto/async_tx/raid6test.c
index c1321935ebcc..c88ff9e3fd30 100644
--- a/crypto/async_tx/raid6test.c
+++ b/crypto/async_tx/raid6test.c
@@ -21,6 +21,7 @@
  */
 #include <linux/async_tx.h>
 #include <linux/gfp.h>
+#include <linux/mm.h>
 #include <linux/random.h>
 
 #undef pr
diff --git a/drivers/dma/coh901318.c b/drivers/dma/coh901318.c
index af8c0b5ed70f..a92d95eac86b 100644
--- a/drivers/dma/coh901318.c
+++ b/drivers/dma/coh901318.c
@@ -11,6 +11,7 @@
 #include <linux/module.h>
 #include <linux/kernel.h> /* printk() */
 #include <linux/fs.h> /* everything... */
+#include <linux/scatterlist.h>
 #include <linux/slab.h> /* kmalloc() */
 #include <linux/dmaengine.h>
 #include <linux/platform_device.h>
diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
index 8bcb15fb959d..48694c34d96b 100644
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@ -45,6 +45,7 @@
  * See Documentation/dmaengine.txt for more details
  */
 
+#include <linux/dma-mapping.h>
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/mm.h>
diff --git a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c
index b4f5c32b6a47..765f5ff22304 100644
--- a/drivers/dma/dmatest.c
+++ b/drivers/dma/dmatest.c
@@ -8,6 +8,7 @@
  * published by the Free Software Foundation.
  */
 #include <linux/delay.h>
+#include <linux/dma-mapping.h>
 #include <linux/dmaengine.h>
 #include <linux/init.h>
 #include <linux/kthread.h>
diff --git a/drivers/dma/ipu/ipu_idmac.c b/drivers/dma/ipu/ipu_idmac.c
index c1a125e7d1df..fd7d2b308cf2 100644
--- a/drivers/dma/ipu/ipu_idmac.c
+++ b/drivers/dma/ipu/ipu_idmac.c
@@ -9,6 +9,7 @@
  * published by the Free Software Foundation.
  */
 
+#include <linux/dma-mapping.h>
 #include <linux/init.h>
 #include <linux/platform_device.h>
 #include <linux/err.h>
diff --git a/drivers/dma/ste_dma40.c b/drivers/dma/ste_dma40.c
index 8f222d4db7de..29d1addbe0cf 100644
--- a/drivers/dma/ste_dma40.c
+++ b/drivers/dma/ste_dma40.c
@@ -6,6 +6,7 @@
  * License terms: GNU General Public License (GPL) version 2
  */
 
+#include <linux/dma-mapping.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/dmaengine.h>
diff --git a/drivers/media/dvb/mantis/mantis_ca.c b/drivers/media/dvb/mantis/mantis_ca.c
index 330216febd78..3d7046909009 100644
--- a/drivers/media/dvb/mantis/mantis_ca.c
+++ b/drivers/media/dvb/mantis/mantis_ca.c
@@ -22,6 +22,7 @@
 #include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/interrupt.h>
+#include <asm/io.h>
 
 #include "dmxdev.h"
 #include "dvbdev.h"
diff --git a/drivers/media/dvb/mantis/mantis_evm.c b/drivers/media/dvb/mantis/mantis_evm.c
index 9f73c2cfc9ea..36f2256ebb0e 100644
--- a/drivers/media/dvb/mantis/mantis_evm.c
+++ b/drivers/media/dvb/mantis/mantis_evm.c
@@ -23,6 +23,7 @@
 #include <linux/signal.h>
 #include <linux/sched.h>
 #include <linux/interrupt.h>
+#include <asm/io.h>
 
 #include "dmxdev.h"
 #include "dvbdev.h"
diff --git a/drivers/media/dvb/mantis/mantis_hif.c b/drivers/media/dvb/mantis/mantis_hif.c
index 5772ebb3a69e..672cf4d2462d 100644
--- a/drivers/media/dvb/mantis/mantis_hif.c
+++ b/drivers/media/dvb/mantis/mantis_hif.c
@@ -23,6 +23,7 @@
 #include <linux/sched.h>
 
 #include <linux/interrupt.h>
+#include <asm/io.h>
 
 #include "dmxdev.h"
 #include "dvbdev.h"
diff --git a/drivers/media/dvb/mantis/mantis_ioc.c b/drivers/media/dvb/mantis/mantis_ioc.c
index 479086dbb9a8..24fcdc63d6d5 100644
--- a/drivers/media/dvb/mantis/mantis_ioc.c
+++ b/drivers/media/dvb/mantis/mantis_ioc.c
@@ -24,6 +24,7 @@
 #include <linux/signal.h>
 #include <linux/sched.h>
 #include <linux/interrupt.h>
+#include <asm/io.h>
 
 #include "dmxdev.h"
 #include "dvbdev.h"
diff --git a/drivers/media/dvb/mantis/mantis_pcmcia.c b/drivers/media/dvb/mantis/mantis_pcmcia.c
index 5cb545b913f6..2f188c089666 100644
--- a/drivers/media/dvb/mantis/mantis_pcmcia.c
+++ b/drivers/media/dvb/mantis/mantis_pcmcia.c
@@ -23,6 +23,7 @@
 #include <linux/signal.h>
 #include <linux/sched.h>
 #include <linux/interrupt.h>
+#include <asm/io.h>
 
 #include "dmxdev.h"
 #include "dvbdev.h"
diff --git a/drivers/media/dvb/mantis/mantis_uart.c b/drivers/media/dvb/mantis/mantis_uart.c
index f807c8ba26e4..18340dafa426 100644
--- a/drivers/media/dvb/mantis/mantis_uart.c
+++ b/drivers/media/dvb/mantis/mantis_uart.c
@@ -20,6 +20,7 @@
 
 #include <linux/kernel.h>
 #include <linux/spinlock.h>
+#include <asm/io.h>
 
 #include <linux/signal.h>
 #include <linux/sched.h>
diff --git a/drivers/media/dvb/mantis/mantis_vp1034.c b/drivers/media/dvb/mantis/mantis_vp1034.c
index 26bc0cbe84d4..430ae84ce528 100644
--- a/drivers/media/dvb/mantis/mantis_vp1034.c
+++ b/drivers/media/dvb/mantis/mantis_vp1034.c
@@ -21,6 +21,7 @@
 #include <linux/signal.h>
 #include <linux/sched.h>
 #include <linux/interrupt.h>
+#include <asm/io.h>
 
 #include "dmxdev.h"
 #include "dvbdev.h"
diff --git a/drivers/mmc/host/tmio_mmc_dma.c b/drivers/mmc/host/tmio_mmc_dma.c
index 25f1ad6cbe09..bf12513d6297 100644
--- a/drivers/mmc/host/tmio_mmc_dma.c
+++ b/drivers/mmc/host/tmio_mmc_dma.c
@@ -11,6 +11,7 @@
  */
 
 #include <linux/device.h>
+#include <linux/dma-mapping.h>
 #include <linux/dmaengine.h>
 #include <linux/mfd/tmio.h>
 #include <linux/mmc/host.h>
diff --git a/drivers/mtd/nand/atmel_nand.c b/drivers/mtd/nand/atmel_nand.c
index b300705d41cb..1b90fd56bef1 100644
--- a/drivers/mtd/nand/atmel_nand.c
+++ b/drivers/mtd/nand/atmel_nand.c
@@ -22,6 +22,7 @@
  *
  */
 
+#include <linux/dma-mapping.h>
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
diff --git a/drivers/net/arm/ks8695net.c b/drivers/net/arm/ks8695net.c
index bb62b3f51837..c827a6097d02 100644
--- a/drivers/net/arm/ks8695net.c
+++ b/drivers/net/arm/ks8695net.c
@@ -16,6 +16,7 @@
  *		  Vincent Sanders <vince@simtec.co.uk>
  */
 
+#include <linux/dma-mapping.h>
 #include <linux/module.h>
 #include <linux/ioport.h>
 #include <linux/netdevice.h>
diff --git a/drivers/net/bnx2x/bnx2x.h b/drivers/net/bnx2x/bnx2x.h
index 69fc7280be30..eff78a439eeb 100644
--- a/drivers/net/bnx2x/bnx2x.h
+++ b/drivers/net/bnx2x/bnx2x.h
@@ -14,6 +14,7 @@
 #ifndef BNX2X_H
 #define BNX2X_H
 #include <linux/netdevice.h>
+#include <linux/dma-mapping.h>
 #include <linux/types.h>
 
 /* compilation time flags */
diff --git a/drivers/net/can/janz-ican3.c b/drivers/net/can/janz-ican3.c
index f1942cab35f6..32778d56d330 100644
--- a/drivers/net/can/janz-ican3.c
+++ b/drivers/net/can/janz-ican3.c
@@ -22,6 +22,7 @@
 #include <linux/can/error.h>
 
 #include <linux/mfd/janz.h>
+#include <asm/io.h>
 
 /* the DPM has 64k of memory, organized into 256x 256 byte pages */
 #define DPM_NUM_PAGES		256
diff --git a/drivers/net/can/softing/softing_fw.c b/drivers/net/can/softing/softing_fw.c
index b520784fb197..310596175676 100644
--- a/drivers/net/can/softing/softing_fw.c
+++ b/drivers/net/can/softing/softing_fw.c
@@ -20,6 +20,7 @@
 #include <linux/firmware.h>
 #include <linux/sched.h>
 #include <asm/div64.h>
+#include <asm/io.h>
 
 #include "softing.h"
 
diff --git a/drivers/net/can/softing/softing_main.c b/drivers/net/can/softing/softing_main.c
index 60a49e5a2a53..954b2959b6b2 100644
--- a/drivers/net/can/softing/softing_main.c
+++ b/drivers/net/can/softing/softing_main.c
@@ -21,6 +21,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
+#include <asm/io.h>
 
 #include "softing.h"
 
diff --git a/drivers/net/ethoc.c b/drivers/net/ethoc.c
index 0e8cc75bb5f4..0da6295d9da6 100644
--- a/drivers/net/ethoc.c
+++ b/drivers/net/ethoc.c
@@ -11,6 +11,7 @@
  * Written by Thierry Reding <thierry.reding@avionic-design.de>
  */
 
+#include <linux/dma-mapping.h>
 #include <linux/etherdevice.h>
 #include <linux/crc32.h>
 #include <linux/interrupt.h>
diff --git a/drivers/net/fec_mpc52xx.c b/drivers/net/fec_mpc52xx.c
index cecc3b1eb486..381bdea97d5f 100644
--- a/drivers/net/fec_mpc52xx.c
+++ b/drivers/net/fec_mpc52xx.c
@@ -14,6 +14,7 @@
  *
  */
 
+#include <linux/dma-mapping.h>
 #include <linux/module.h>
 
 #include <linux/kernel.h>
diff --git a/drivers/net/greth.c b/drivers/net/greth.c
index 69b86d7fac85..82c3767ec5f8 100644
--- a/drivers/net/greth.c
+++ b/drivers/net/greth.c
@@ -22,6 +22,7 @@
  *               Marko Isomaki
  */
 
+#include <linux/dma-mapping.h>
 #include <linux/module.h>
 #include <linux/uaccess.h>
 #include <linux/init.h>
diff --git a/drivers/net/irda/pxaficp_ir.c b/drivers/net/irda/pxaficp_ir.c
index b1d1ce3dd8b5..d0851dfa0378 100644
--- a/drivers/net/irda/pxaficp_ir.c
+++ b/drivers/net/irda/pxaficp_ir.c
@@ -12,6 +12,7 @@
  * Infra-red driver (SIR/FIR) for the PXA2xx embedded microprocessor
  *
  */
+#include <linux/dma-mapping.h>
 #include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/netdevice.h>
diff --git a/drivers/net/ks8851_mll.c b/drivers/net/ks8851_mll.c
index aefbdd896d6a..a82378231fc5 100644
--- a/drivers/net/ks8851_mll.c
+++ b/drivers/net/ks8851_mll.c
@@ -35,6 +35,7 @@
 #include <linux/platform_device.h>
 #include <linux/delay.h>
 #include <linux/slab.h>
+#include <asm/io.h>
 
 #define	DRV_NAME	"ks8851_mll"
 
diff --git a/drivers/net/sgiseeq.c b/drivers/net/sgiseeq.c
index 54415c7b84a2..52fb7ed9f365 100644
--- a/drivers/net/sgiseeq.c
+++ b/drivers/net/sgiseeq.c
@@ -6,6 +6,7 @@
 
 #undef DEBUG
 
+#include <linux/dma-mapping.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/slab.h>
diff --git a/drivers/net/stmmac/dwmac1000_core.c b/drivers/net/stmmac/dwmac1000_core.c
index f20455cbfbbc..0f63b3c83c19 100644
--- a/drivers/net/stmmac/dwmac1000_core.c
+++ b/drivers/net/stmmac/dwmac1000_core.c
@@ -28,6 +28,7 @@
 
 #include <linux/crc32.h>
 #include <linux/slab.h>
+#include <asm/io.h>
 #include "dwmac1000.h"
 
 static void dwmac1000_core_init(void __iomem *ioaddr)
diff --git a/drivers/net/stmmac/dwmac1000_dma.c b/drivers/net/stmmac/dwmac1000_dma.c
index 2c47712d45d0..3dbeea619085 100644
--- a/drivers/net/stmmac/dwmac1000_dma.c
+++ b/drivers/net/stmmac/dwmac1000_dma.c
@@ -26,6 +26,7 @@
   Author: Giuseppe Cavallaro <peppe.cavallaro@st.com>
 *******************************************************************************/
 
+#include <asm/io.h>
 #include "dwmac1000.h"
 #include "dwmac_dma.h"
 
diff --git a/drivers/net/stmmac/dwmac100_core.c b/drivers/net/stmmac/dwmac100_core.c
index c724fc36a24f..743a58017637 100644
--- a/drivers/net/stmmac/dwmac100_core.c
+++ b/drivers/net/stmmac/dwmac100_core.c
@@ -29,6 +29,7 @@
 *******************************************************************************/
 
 #include <linux/crc32.h>
+#include <asm/io.h>
 #include "dwmac100.h"
 
 static void dwmac100_core_init(void __iomem *ioaddr)
diff --git a/drivers/net/stmmac/dwmac100_dma.c b/drivers/net/stmmac/dwmac100_dma.c
index e3e224b7d9e2..627f656b0f3c 100644
--- a/drivers/net/stmmac/dwmac100_dma.c
+++ b/drivers/net/stmmac/dwmac100_dma.c
@@ -28,6 +28,7 @@
   Author: Giuseppe Cavallaro <peppe.cavallaro@st.com>
 *******************************************************************************/
 
+#include <asm/io.h>
 #include "dwmac100.h"
 #include "dwmac_dma.h"
 
diff --git a/drivers/net/stmmac/stmmac_ethtool.c b/drivers/net/stmmac/stmmac_ethtool.c
index 720c5a1531bc..7ed8fb6c2117 100644
--- a/drivers/net/stmmac/stmmac_ethtool.c
+++ b/drivers/net/stmmac/stmmac_ethtool.c
@@ -27,6 +27,7 @@
 #include <linux/interrupt.h>
 #include <linux/mii.h>
 #include <linux/phy.h>
+#include <asm/io.h>
 
 #include "stmmac.h"
 #include "dwmac_dma.h"
diff --git a/drivers/net/stmmac/stmmac_mdio.c b/drivers/net/stmmac/stmmac_mdio.c
index 234b4068a1fc..29a6bb6b8058 100644
--- a/drivers/net/stmmac/stmmac_mdio.c
+++ b/drivers/net/stmmac/stmmac_mdio.c
@@ -27,6 +27,7 @@
 #include <linux/mii.h>
 #include <linux/phy.h>
 #include <linux/slab.h>
+#include <asm/io.h>
 
 #include "stmmac.h"
 
diff --git a/drivers/net/usb/cdc-phonet.c b/drivers/net/usb/cdc-phonet.c
index f967913e11bc..a60d0069cc45 100644
--- a/drivers/net/usb/cdc-phonet.c
+++ b/drivers/net/usb/cdc-phonet.c
@@ -21,6 +21,7 @@
  */
 
 #include <linux/kernel.h>
+#include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/gfp.h>
 #include <linux/usb.h>
diff --git a/drivers/net/vxge/vxge-config.h b/drivers/net/vxge/vxge-config.h
index 6219006d9d2e..dd362584f5ca 100644
--- a/drivers/net/vxge/vxge-config.h
+++ b/drivers/net/vxge/vxge-config.h
@@ -16,6 +16,7 @@
 #include <linux/hardirq.h>
 #include <linux/list.h>
 #include <linux/slab.h>
+#include <asm/io.h>
 
 #ifndef VXGE_CACHE_LINE_SIZE
 #define VXGE_CACHE_LINE_SIZE 128
diff --git a/drivers/net/wireless/ath/ath5k/base.c b/drivers/net/wireless/ath/ath5k/base.c
index b6c5d3715b96..779a1d270243 100644
--- a/drivers/net/wireless/ath/ath5k/base.c
+++ b/drivers/net/wireless/ath/ath5k/base.c
@@ -42,6 +42,7 @@
 
 #include <linux/module.h>
 #include <linux/delay.h>
+#include <linux/dma-mapping.h>
 #include <linux/hardirq.h>
 #include <linux/if.h>
 #include <linux/io.h>
diff --git a/drivers/net/wireless/ath/ath9k/beacon.c b/drivers/net/wireless/ath/ath9k/beacon.c
index 0174cdb65a83..4c790b5f7937 100644
--- a/drivers/net/wireless/ath/ath9k/beacon.c
+++ b/drivers/net/wireless/ath/ath9k/beacon.c
@@ -14,6 +14,7 @@
  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  */
 
+#include <linux/dma-mapping.h>
 #include "ath9k.h"
 
 #define FUDGE 2
diff --git a/drivers/net/wireless/ath/ath9k/init.c b/drivers/net/wireless/ath/ath9k/init.c
index d4b166cfdf60..1202bb0a5534 100644
--- a/drivers/net/wireless/ath/ath9k/init.c
+++ b/drivers/net/wireless/ath/ath9k/init.c
@@ -14,6 +14,7 @@
  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  */
 
+#include <linux/dma-mapping.h>
 #include <linux/slab.h>
 #include <linux/ath9k_platform.h>
 
diff --git a/drivers/net/wireless/ath/ath9k/recv.c b/drivers/net/wireless/ath/ath9k/recv.c
index 07e35e59c9e3..c4c5d9f455bb 100644
--- a/drivers/net/wireless/ath/ath9k/recv.c
+++ b/drivers/net/wireless/ath/ath9k/recv.c
@@ -14,6 +14,7 @@
  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  */
 
+#include <linux/dma-mapping.h>
 #include "ath9k.h"
 #include "ar9003_mac.h"
 
diff --git a/drivers/net/wireless/ath/ath9k/xmit.c b/drivers/net/wireless/ath/ath9k/xmit.c
index ec012b4317af..7e79bbaf2ba7 100644
--- a/drivers/net/wireless/ath/ath9k/xmit.c
+++ b/drivers/net/wireless/ath/ath9k/xmit.c
@@ -14,6 +14,7 @@
  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  */
 
+#include <linux/dma-mapping.h>
 #include "ath9k.h"
 #include "ar9003_mac.h"
 
diff --git a/drivers/staging/pohmelfs/crypto.c b/drivers/staging/pohmelfs/crypto.c
index 5cca24fcf6ca..ad92771dce57 100644
--- a/drivers/staging/pohmelfs/crypto.c
+++ b/drivers/staging/pohmelfs/crypto.c
@@ -17,6 +17,7 @@
 #include <linux/highmem.h>
 #include <linux/kthread.h>
 #include <linux/pagemap.h>
+#include <linux/scatterlist.h>
 #include <linux/slab.h>
 
 #include "netfs.h"
diff --git a/drivers/tty/serial/ifx6x60.c b/drivers/tty/serial/ifx6x60.c
index 5315525220fb..426434e5eb7c 100644
--- a/drivers/tty/serial/ifx6x60.c
+++ b/drivers/tty/serial/ifx6x60.c
@@ -36,6 +36,7 @@
  *	you need to use this driver for another platform.
  *
  *****************************************************************************/
+#include <linux/dma-mapping.h>
 #include <linux/module.h>
 #include <linux/termios.h>
 #include <linux/tty.h>
diff --git a/drivers/usb/gadget/f_phonet.c b/drivers/usb/gadget/f_phonet.c
index 5e1495097ec3..f22fc685ddfd 100644
--- a/drivers/usb/gadget/f_phonet.c
+++ b/drivers/usb/gadget/f_phonet.c
@@ -20,6 +20,7 @@
  * 02110-1301 USA
  */
 
+#include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/kernel.h>
 #include <linux/device.h>
diff --git a/include/crypto/if_alg.h b/include/crypto/if_alg.h
index c5813c87de06..d61c11170213 100644
--- a/include/crypto/if_alg.h
+++ b/include/crypto/if_alg.h
@@ -16,6 +16,7 @@
 #include <linux/compiler.h>
 #include <linux/completion.h>
 #include <linux/if_alg.h>
+#include <linux/scatterlist.h>
 #include <linux/types.h>
 #include <net/sock.h>
 
diff --git a/include/linux/dma-direction.h b/include/linux/dma-direction.h
new file mode 100644
index 000000000000..95b6a82f5951
--- /dev/null
+++ b/include/linux/dma-direction.h
@@ -0,0 +1,13 @@
+#ifndef _LINUX_DMA_DIRECTION_H
+#define _LINUX_DMA_DIRECTION_H
+/*
+ * These definitions mirror those in pci.h, so they can be used
+ * interchangeably with their PCI_ counterparts.
+ */
+enum dma_data_direction {
+	DMA_BIDIRECTIONAL = 0,
+	DMA_TO_DEVICE = 1,
+	DMA_FROM_DEVICE = 2,
+	DMA_NONE = 3,
+};
+#endif
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index ba8319ae5fcc..1a167c48d84d 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -4,17 +4,9 @@
 #include <linux/device.h>
 #include <linux/err.h>
 #include <linux/dma-attrs.h>
+#include <linux/dma-direction.h>
 #include <linux/scatterlist.h>
 
-/* These definitions mirror those in pci.h, so they can be used
- * interchangeably with their PCI_ counterparts */
-enum dma_data_direction {
-	DMA_BIDIRECTIONAL = 0,
-	DMA_TO_DEVICE = 1,
-	DMA_FROM_DEVICE = 2,
-	DMA_NONE = 3,
-};
-
 struct dma_map_ops {
 	void* (*alloc_coherent)(struct device *dev, size_t size,
 				dma_addr_t *dma_handle, gfp_t gfp);
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index eee7addec282..8fbf40e0713c 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -23,7 +23,9 @@
 
 #include <linux/device.h>
 #include <linux/uio.h>
-#include <linux/dma-mapping.h>
+#include <linux/dma-direction.h>
+
+struct scatterlist;
 
 /**
  * typedef dma_cookie_t - an opaque DMA cookie
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 22a8ceca0ed0..011eb89e9582 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -34,7 +34,6 @@
 #include <linux/pm_qos_params.h>
 #include <linux/timer.h>
 #include <linux/delay.h>
-#include <linux/mm.h>
 #include <asm/atomic.h>
 #include <asm/cache.h>
 #include <asm/byteorder.h>
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 69c35f6cd13f..eb3b9a86c6ed 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -13,6 +13,7 @@
  *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
  */
 
+#include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/types.h>
diff --git a/security/apparmor/lib.c b/security/apparmor/lib.c
index 506d2baf6147..b82e383beb77 100644
--- a/security/apparmor/lib.c
+++ b/security/apparmor/lib.c
@@ -12,6 +12,7 @@
  * License.
  */
 
+#include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/vmalloc.h>
-- 
cgit v1.2.3


From d21142ece414ce1088cfcae760689aa60d6fee80 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 17 Jun 2011 16:50:34 +0200
Subject: ptrace: kill task_ptrace()

task_ptrace(task) simply dereferences task->ptrace and isn't even used
consistently only adding confusion.  Kill it and directly access
->ptrace instead.

This doesn't introduce any behavior change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 include/linux/ptrace.h    | 11 -----------
 include/linux/tracehook.h | 16 ++++++++--------
 kernel/exit.c             |  8 ++++----
 kernel/signal.c           | 14 +++++++-------
 mm/oom_kill.c             |  3 +--
 5 files changed, 20 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 4f224f169524..3ff20b322598 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -145,17 +145,6 @@ int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr,
 int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
 			    unsigned long data);
 
-/**
- * task_ptrace - return %PT_* flags that apply to a task
- * @task:	pointer to &task_struct in question
- *
- * Returns the %PT_* flags that apply to @task.
- */
-static inline int task_ptrace(struct task_struct *task)
-{
-	return task->ptrace;
-}
-
 /**
  * ptrace_event - possibly stop for a ptrace event notification
  * @mask:	%PT_* bit to check in @current->ptrace
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 15745cdd32ce..a3e838784f43 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -63,7 +63,7 @@ struct linux_binprm;
  */
 static inline int tracehook_expect_breakpoints(struct task_struct *task)
 {
-	return (task_ptrace(task) & PT_PTRACED) != 0;
+	return (task->ptrace & PT_PTRACED) != 0;
 }
 
 /*
@@ -71,7 +71,7 @@ static inline int tracehook_expect_breakpoints(struct task_struct *task)
  */
 static inline void ptrace_report_syscall(struct pt_regs *regs)
 {
-	int ptrace = task_ptrace(current);
+	int ptrace = current->ptrace;
 
 	if (!(ptrace & PT_PTRACED))
 		return;
@@ -155,7 +155,7 @@ static inline void tracehook_report_syscall_exit(struct pt_regs *regs, int step)
 static inline int tracehook_unsafe_exec(struct task_struct *task)
 {
 	int unsafe = 0;
-	int ptrace = task_ptrace(task);
+	int ptrace = task->ptrace;
 	if (ptrace & PT_PTRACED) {
 		if (ptrace & PT_PTRACE_CAP)
 			unsafe |= LSM_UNSAFE_PTRACE_CAP;
@@ -178,7 +178,7 @@ static inline int tracehook_unsafe_exec(struct task_struct *task)
  */
 static inline struct task_struct *tracehook_tracer_task(struct task_struct *tsk)
 {
-	if (task_ptrace(tsk) & PT_PTRACED)
+	if (tsk->ptrace & PT_PTRACED)
 		return rcu_dereference(tsk->parent);
 	return NULL;
 }
@@ -202,7 +202,7 @@ static inline void tracehook_report_exec(struct linux_binfmt *fmt,
 					 struct pt_regs *regs)
 {
 	if (!ptrace_event(PT_TRACE_EXEC, PTRACE_EVENT_EXEC, 0) &&
-	    unlikely(task_ptrace(current) & PT_PTRACED))
+	    unlikely(current->ptrace & PT_PTRACED))
 		send_sig(SIGTRAP, current, 0);
 }
 
@@ -285,7 +285,7 @@ static inline void tracehook_report_clone(struct pt_regs *regs,
 					  unsigned long clone_flags,
 					  pid_t pid, struct task_struct *child)
 {
-	if (unlikely(task_ptrace(child))) {
+	if (unlikely(child->ptrace)) {
 		/*
 		 * It doesn't matter who attached/attaching to this
 		 * task, the pending SIGSTOP is right in any case.
@@ -403,7 +403,7 @@ static inline void tracehook_signal_handler(int sig, siginfo_t *info,
 static inline int tracehook_consider_ignored_signal(struct task_struct *task,
 						    int sig)
 {
-	return (task_ptrace(task) & PT_PTRACED) != 0;
+	return (task->ptrace & PT_PTRACED) != 0;
 }
 
 /**
@@ -422,7 +422,7 @@ static inline int tracehook_consider_ignored_signal(struct task_struct *task,
 static inline int tracehook_consider_fatal_signal(struct task_struct *task,
 						  int sig)
 {
-	return (task_ptrace(task) & PT_PTRACED) != 0;
+	return (task->ptrace & PT_PTRACED) != 0;
 }
 
 #define DEATH_REAP			-1
diff --git a/kernel/exit.c b/kernel/exit.c
index 289f59d686bf..e5cc05644609 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -765,7 +765,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
 	p->exit_signal = SIGCHLD;
 
 	/* If it has exited notify the new parent about this child's death. */
-	if (!task_ptrace(p) &&
+	if (!p->ptrace &&
 	    p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
 		do_notify_parent(p, p->exit_signal);
 		if (task_detached(p)) {
@@ -795,7 +795,7 @@ static void forget_original_parent(struct task_struct *father)
 		do {
 			t->real_parent = reaper;
 			if (t->parent == father) {
-				BUG_ON(task_ptrace(t));
+				BUG_ON(t->ptrace);
 				t->parent = t->real_parent;
 			}
 			if (t->pdeath_signal)
@@ -1565,7 +1565,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
 		 * Notification and reaping will be cascaded to the real
 		 * parent when the ptracer detaches.
 		 */
-		if (likely(!ptrace) && unlikely(task_ptrace(p))) {
+		if (likely(!ptrace) && unlikely(p->ptrace)) {
 			/* it will become visible, clear notask_error */
 			wo->notask_error = 0;
 			return 0;
@@ -1608,7 +1608,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
 		 * own children, it should create a separate process which
 		 * takes the role of real parent.
 		 */
-		if (likely(!ptrace) && task_ptrace(p) &&
+		if (likely(!ptrace) && p->ptrace &&
 		    same_thread_group(p->parent, p->real_parent))
 			return 0;
 
diff --git a/kernel/signal.c b/kernel/signal.c
index 97e575a3387e..0f3370872506 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1592,7 +1592,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
  	/* do_notify_parent_cldstop should have been called instead.  */
  	BUG_ON(task_is_stopped_or_traced(tsk));
 
-	BUG_ON(!task_ptrace(tsk) &&
+	BUG_ON(!tsk->ptrace &&
 	       (tsk->group_leader != tsk || !thread_group_empty(tsk)));
 
 	info.si_signo = sig;
@@ -1631,7 +1631,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 
 	psig = tsk->parent->sighand;
 	spin_lock_irqsave(&psig->siglock, flags);
-	if (!task_ptrace(tsk) && sig == SIGCHLD &&
+	if (!tsk->ptrace && sig == SIGCHLD &&
 	    (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
 	     (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) {
 		/*
@@ -1731,7 +1731,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
 
 static inline int may_ptrace_stop(void)
 {
-	if (!likely(task_ptrace(current)))
+	if (!likely(current->ptrace))
 		return 0;
 	/*
 	 * Are we in the middle of do_coredump?
@@ -1989,7 +1989,7 @@ static bool do_signal_stop(int signr)
 		if (!(sig->flags & SIGNAL_STOP_STOPPED))
 			sig->group_exit_code = signr;
 		else
-			WARN_ON_ONCE(!task_ptrace(current));
+			WARN_ON_ONCE(!current->ptrace);
 
 		sig->group_stop_count = 0;
 
@@ -2014,7 +2014,7 @@ static bool do_signal_stop(int signr)
 		}
 	}
 
-	if (likely(!task_ptrace(current))) {
+	if (likely(!current->ptrace)) {
 		int notify = 0;
 
 		/*
@@ -2093,7 +2093,7 @@ static void do_jobctl_trap(void)
 static int ptrace_signal(int signr, siginfo_t *info,
 			 struct pt_regs *regs, void *cookie)
 {
-	if (!task_ptrace(current))
+	if (!current->ptrace)
 		return signr;
 
 	ptrace_signal_deliver(regs, cookie);
@@ -2179,7 +2179,7 @@ relock:
 		do_notify_parent_cldstop(current, false, why);
 
 		leader = current->group_leader;
-		if (task_ptrace(leader) && !real_parent_is_ptracer(leader))
+		if (leader->ptrace && !real_parent_is_ptracer(leader))
 			do_notify_parent_cldstop(leader, true, why);
 
 		read_unlock(&tasklist_lock);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index e4b0991ca351..b0be989d4365 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -339,8 +339,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
 				 * then wait for it to finish before killing
 				 * some other task unnecessarily.
 				 */
-				if (!(task_ptrace(p->group_leader) &
-							PT_TRACE_EXIT))
+				if (!(p->group_leader->ptrace & PT_TRACE_EXIT))
 					return ERR_PTR(-1UL);
 			}
 		}
-- 
cgit v1.2.3


From 643ad8388e189dfd14ef76972cf7dc394b3cbebd Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 17 Jun 2011 16:50:35 +0200
Subject: ptrace: introduce ptrace_event_enabled() and simplify ptrace_event()
 and tracehook_prepare_clone()

This patch implements ptrace_event_enabled() which tests whether a
given PTRACE_EVENT_* is enabled and use it to simplify ptrace_event()
and tracehook_prepare_clone().

PT_EVENT_FLAG() macro is added which calculates PT_TRACE_* flag from
PTRACE_EVENT_*.  This is used to define PT_TRACE_* flags and by
ptrace_event_enabled() to find the matching flag.

This is used to make ptrace_event() and tracehook_prepare_clone()
simpler.

* ptrace_event() callers were responsible for providing mask to test
  whether the event was enabled.  This patch implements
  ptrace_event_enabled() and make ptrace_event() drop @mask and
  determine whether the event is enabled from @event.  Note that
  @event is constant and this conversion doesn't add runtime overhead.

  All conversions except tracehook_report_clone_complete() are
  trivial.  tracehook_report_clone_complete() used to use 0 for @mask
  (always enabled) but now tests whether the specified event is
  enabled.  This doesn't cause any behavior difference as it's
  guaranteed that the event specified by @trace is enabled.

* tracehook_prepare_clone() now only determines which event is
  applicable and use ptrace_event_enabled() for enable test.

This doesn't introduce any behavior change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 include/linux/ptrace.h    | 46 ++++++++++++++++++++++++++++++++--------------
 include/linux/tracehook.h | 26 +++++++++++++-------------
 2 files changed, 45 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 3ff20b322598..18feac6f441e 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -90,12 +90,17 @@
 #define PT_DTRACE	0x00000002	/* delayed trace (used on m68k, i386) */
 #define PT_TRACESYSGOOD	0x00000004
 #define PT_PTRACE_CAP	0x00000008	/* ptracer can follow suid-exec */
-#define PT_TRACE_FORK	0x00000010
-#define PT_TRACE_VFORK	0x00000020
-#define PT_TRACE_CLONE	0x00000040
-#define PT_TRACE_EXEC	0x00000080
-#define PT_TRACE_VFORK_DONE	0x00000100
-#define PT_TRACE_EXIT	0x00000200
+
+/* PT_TRACE_* event enable flags */
+#define PT_EVENT_FLAG_SHIFT	4
+#define PT_EVENT_FLAG(event)	(1 << (PT_EVENT_FLAG_SHIFT + (event) - 1))
+
+#define PT_TRACE_FORK		PT_EVENT_FLAG(PTRACE_EVENT_FORK)
+#define PT_TRACE_VFORK		PT_EVENT_FLAG(PTRACE_EVENT_VFORK)
+#define PT_TRACE_CLONE		PT_EVENT_FLAG(PTRACE_EVENT_CLONE)
+#define PT_TRACE_EXEC		PT_EVENT_FLAG(PTRACE_EVENT_EXEC)
+#define PT_TRACE_VFORK_DONE	PT_EVENT_FLAG(PTRACE_EVENT_VFORK_DONE)
+#define PT_TRACE_EXIT		PT_EVENT_FLAG(PTRACE_EVENT_EXIT)
 
 #define PT_TRACE_MASK	0x000003f4
 
@@ -145,26 +150,39 @@ int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr,
 int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
 			    unsigned long data);
 
+/**
+ * ptrace_event_enabled - test whether a ptrace event is enabled
+ * @task: ptracee of interest
+ * @event: %PTRACE_EVENT_* to test
+ *
+ * Test whether @event is enabled for ptracee @task.
+ *
+ * Returns %true if @event is enabled, %false otherwise.
+ */
+static inline bool ptrace_event_enabled(struct task_struct *task, int event)
+{
+	return task->ptrace & PT_EVENT_FLAG(event);
+}
+
 /**
  * ptrace_event - possibly stop for a ptrace event notification
- * @mask:	%PT_* bit to check in @current->ptrace
- * @event:	%PTRACE_EVENT_* value to report if @mask is set
+ * @event:	%PTRACE_EVENT_* value to report
  * @message:	value for %PTRACE_GETEVENTMSG to return
  *
- * This checks the @mask bit to see if ptrace wants stops for this event.
- * If so we stop, reporting @event and @message to the ptrace parent.
+ * Check whether @event is enabled and, if so, report @event and @message
+ * to the ptrace parent.
  *
  * Returns nonzero if we did a ptrace notification, zero if not.
  *
  * Called without locks.
  */
-static inline int ptrace_event(int mask, int event, unsigned long message)
+static inline int ptrace_event(int event, unsigned long message)
 {
-	if (mask && likely(!(current->ptrace & mask)))
-		return 0;
+	if (likely(!ptrace_event_enabled(current, event)))
+		return false;
 	current->ptrace_message = message;
 	ptrace_notify((event << 8) | SIGTRAP);
-	return 1;
+	return true;
 }
 
 /**
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index a3e838784f43..7d38571b0c05 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -201,7 +201,7 @@ static inline void tracehook_report_exec(struct linux_binfmt *fmt,
 					 struct linux_binprm *bprm,
 					 struct pt_regs *regs)
 {
-	if (!ptrace_event(PT_TRACE_EXEC, PTRACE_EVENT_EXEC, 0) &&
+	if (!ptrace_event(PTRACE_EVENT_EXEC, 0) &&
 	    unlikely(current->ptrace & PT_PTRACED))
 		send_sig(SIGTRAP, current, 0);
 }
@@ -218,7 +218,7 @@ static inline void tracehook_report_exec(struct linux_binfmt *fmt,
  */
 static inline void tracehook_report_exit(long *exit_code)
 {
-	ptrace_event(PT_TRACE_EXIT, PTRACE_EVENT_EXIT, *exit_code);
+	ptrace_event(PTRACE_EVENT_EXIT, *exit_code);
 }
 
 /**
@@ -232,19 +232,19 @@ static inline void tracehook_report_exit(long *exit_code)
  */
 static inline int tracehook_prepare_clone(unsigned clone_flags)
 {
+	int event = 0;
+
 	if (clone_flags & CLONE_UNTRACED)
 		return 0;
 
-	if (clone_flags & CLONE_VFORK) {
-		if (current->ptrace & PT_TRACE_VFORK)
-			return PTRACE_EVENT_VFORK;
-	} else if ((clone_flags & CSIGNAL) != SIGCHLD) {
-		if (current->ptrace & PT_TRACE_CLONE)
-			return PTRACE_EVENT_CLONE;
-	} else if (current->ptrace & PT_TRACE_FORK)
-		return PTRACE_EVENT_FORK;
+	if (clone_flags & CLONE_VFORK)
+		event = PTRACE_EVENT_VFORK;
+	else if ((clone_flags & CSIGNAL) != SIGCHLD)
+		event = PTRACE_EVENT_CLONE;
+	else
+		event = PTRACE_EVENT_FORK;
 
-	return 0;
+	return ptrace_event_enabled(current, event) ? event : 0;
 }
 
 /**
@@ -318,7 +318,7 @@ static inline void tracehook_report_clone_complete(int trace,
 						   struct task_struct *child)
 {
 	if (unlikely(trace))
-		ptrace_event(0, trace, pid);
+		ptrace_event(trace, pid);
 }
 
 /**
@@ -336,7 +336,7 @@ static inline void tracehook_report_clone_complete(int trace,
 static inline void tracehook_report_vfork_done(struct task_struct *child,
 					       pid_t pid)
 {
-	ptrace_event(PT_TRACE_VFORK_DONE, PTRACE_EVENT_VFORK_DONE, pid);
+	ptrace_event(PTRACE_EVENT_VFORK_DONE, pid);
 }
 
 /**
-- 
cgit v1.2.3


From f3c04b934d429b1ace21866f011b66de328c0dc9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 17 Jun 2011 16:50:36 +0200
Subject: ptrace: move SIGTRAP on exec(2) logic to ptrace_event()

Move SIGTRAP on exec(2) logic from tracehook_report_exec() to
ptrace_event().  This is part of changes to make ptrace_event()
smarter and handle ptrace event related details in one place.

This doesn't introduce any behavior change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 include/linux/ptrace.h    | 16 ++++++++--------
 include/linux/tracehook.h |  4 +---
 2 files changed, 9 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 18feac6f441e..b546fd6c3506 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -172,17 +172,17 @@ static inline bool ptrace_event_enabled(struct task_struct *task, int event)
  * Check whether @event is enabled and, if so, report @event and @message
  * to the ptrace parent.
  *
- * Returns nonzero if we did a ptrace notification, zero if not.
- *
  * Called without locks.
  */
-static inline int ptrace_event(int event, unsigned long message)
+static inline void ptrace_event(int event, unsigned long message)
 {
-	if (likely(!ptrace_event_enabled(current, event)))
-		return false;
-	current->ptrace_message = message;
-	ptrace_notify((event << 8) | SIGTRAP);
-	return true;
+	if (unlikely(ptrace_event_enabled(current, event))) {
+		current->ptrace_message = message;
+		ptrace_notify((event << 8) | SIGTRAP);
+	} else if (event == PTRACE_EVENT_EXEC && unlikely(current->ptrace)) {
+		/* legacy EXEC report via SIGTRAP */
+		send_sig(SIGTRAP, current, 0);
+	}
 }
 
 /**
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 7d38571b0c05..3b68aa842a92 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -201,9 +201,7 @@ static inline void tracehook_report_exec(struct linux_binfmt *fmt,
 					 struct linux_binprm *bprm,
 					 struct pt_regs *regs)
 {
-	if (!ptrace_event(PTRACE_EVENT_EXEC, 0) &&
-	    unlikely(current->ptrace & PT_PTRACED))
-		send_sig(SIGTRAP, current, 0);
+	ptrace_event(PTRACE_EVENT_EXEC, 0);
 }
 
 /**
-- 
cgit v1.2.3


From a288eecce5253cc1565d400a52b9b476a157e040 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 17 Jun 2011 16:50:37 +0200
Subject: ptrace: kill trivial tracehooks

At this point, tracehooks aren't useful to mainline kernel and mostly
just add an extra layer of obfuscation.  Although they have comments,
without actual in-kernel users, it is difficult to tell what are their
assumptions and they're actually trying to achieve.  To mainline
kernel, they just aren't worth keeping around.

This patch kills the following trivial tracehooks.

* Ones testing whether task is ptraced.  Replace with ->ptrace test.

	tracehook_expect_breakpoints()
	tracehook_consider_ignored_signal()
	tracehook_consider_fatal_signal()

* ptrace_event() wrappers.  Call directly.

	tracehook_report_exec()
	tracehook_report_exit()
	tracehook_report_vfork_done()

* ptrace_release_task() wrapper.  Call directly.

	tracehook_finish_release_task()

* noop

	tracehook_prepare_release_task()
	tracehook_report_death()

This doesn't introduce any behavior change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 arch/s390/kernel/traps.c  |   4 +-
 fs/exec.c                 |   2 +-
 include/linux/tracehook.h | 156 ----------------------------------------------
 kernel/exit.c             |   7 +--
 kernel/fork.c             |   2 +-
 kernel/signal.c           |   8 +--
 mm/nommu.c                |   3 +-
 7 files changed, 11 insertions(+), 171 deletions(-)

(limited to 'include/linux')

diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c
index a65d2e82f61d..a63d34c3611e 100644
--- a/arch/s390/kernel/traps.c
+++ b/arch/s390/kernel/traps.c
@@ -331,7 +331,7 @@ void __kprobes do_per_trap(struct pt_regs *regs)
 {
 	if (notify_die(DIE_SSTEP, "sstep", regs, 0, 0, SIGTRAP) == NOTIFY_STOP)
 		return;
-	if (tracehook_consider_fatal_signal(current, SIGTRAP))
+	if (current->ptrace)
 		force_sig(SIGTRAP, current);
 }
 
@@ -425,7 +425,7 @@ static void __kprobes illegal_op(struct pt_regs *regs, long pgm_int_code,
 		if (get_user(*((__u16 *) opcode), (__u16 __user *) location))
 			return;
 		if (*((__u16 *) opcode) == S390_BREAKPOINT_U16) {
-			if (tracehook_consider_fatal_signal(current, SIGTRAP))
+			if (current->ptrace)
 				force_sig(SIGTRAP, current);
 			else
 				signal = SIGILL;
diff --git a/fs/exec.c b/fs/exec.c
index a9f2b3631bdb..b37030d0a50b 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1384,7 +1384,7 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
 			bprm->recursion_depth = depth;
 			if (retval >= 0) {
 				if (depth == 0)
-					tracehook_report_exec(fmt, bprm, regs);
+					ptrace_event(PTRACE_EVENT_EXEC, 0);
 				put_binfmt(fmt);
 				allow_write_access(bprm->file);
 				if (bprm->file)
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 3b68aa842a92..8b06d4f2b814 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -51,21 +51,6 @@
 #include <linux/security.h>
 struct linux_binprm;
 
-/**
- * tracehook_expect_breakpoints - guess if task memory might be touched
- * @task:		current task, making a new mapping
- *
- * Return nonzero if @task is expected to want breakpoint insertion in
- * its memory at some point.  A zero return is no guarantee it won't
- * be done, but this is a hint that it's known to be likely.
- *
- * May be called with @task->mm->mmap_sem held for writing.
- */
-static inline int tracehook_expect_breakpoints(struct task_struct *task)
-{
-	return (task->ptrace & PT_PTRACED) != 0;
-}
-
 /*
  * ptrace report for syscall entry and exit looks identical.
  */
@@ -183,42 +168,6 @@ static inline struct task_struct *tracehook_tracer_task(struct task_struct *tsk)
 	return NULL;
 }
 
-/**
- * tracehook_report_exec - a successful exec was completed
- * @fmt:		&struct linux_binfmt that performed the exec
- * @bprm:		&struct linux_binprm containing exec details
- * @regs:		user-mode register state
- *
- * An exec just completed, we are shortly going to return to user mode.
- * The freshly initialized register state can be seen and changed in @regs.
- * The name, file and other pointers in @bprm are still on hand to be
- * inspected, but will be freed as soon as this returns.
- *
- * Called with no locks, but with some kernel resources held live
- * and a reference on @fmt->module.
- */
-static inline void tracehook_report_exec(struct linux_binfmt *fmt,
-					 struct linux_binprm *bprm,
-					 struct pt_regs *regs)
-{
-	ptrace_event(PTRACE_EVENT_EXEC, 0);
-}
-
-/**
- * tracehook_report_exit - task has begun to exit
- * @exit_code:		pointer to value destined for @current->exit_code
- *
- * @exit_code points to the value passed to do_exit(), which tracing
- * might change here.  This is almost the first thing in do_exit(),
- * before freeing any resources or setting the %PF_EXITING flag.
- *
- * Called with no locks held.
- */
-static inline void tracehook_report_exit(long *exit_code)
-{
-	ptrace_event(PTRACE_EVENT_EXIT, *exit_code);
-}
-
 /**
  * tracehook_prepare_clone - prepare for new child to be cloned
  * @clone_flags:	%CLONE_* flags from clone/fork/vfork system call
@@ -319,52 +268,6 @@ static inline void tracehook_report_clone_complete(int trace,
 		ptrace_event(trace, pid);
 }
 
-/**
- * tracehook_report_vfork_done - vfork parent's child has exited or exec'd
- * @child:		child task, already running
- * @pid:		new child's PID in the parent's namespace
- *
- * Called after a %CLONE_VFORK parent has waited for the child to complete.
- * The clone/vfork system call will return immediately after this.
- * The @child pointer may be invalid if a self-reaping child died and
- * tracehook_report_clone() took no action to prevent it from self-reaping.
- *
- * Called with no locks held.
- */
-static inline void tracehook_report_vfork_done(struct task_struct *child,
-					       pid_t pid)
-{
-	ptrace_event(PTRACE_EVENT_VFORK_DONE, pid);
-}
-
-/**
- * tracehook_prepare_release_task - task is being reaped, clean up tracing
- * @task:		task in %EXIT_DEAD state
- *
- * This is called in release_task() just before @task gets finally reaped
- * and freed.  This would be the ideal place to remove and clean up any
- * tracing-related state for @task.
- *
- * Called with no locks held.
- */
-static inline void tracehook_prepare_release_task(struct task_struct *task)
-{
-}
-
-/**
- * tracehook_finish_release_task - final tracing clean-up
- * @task:		task in %EXIT_DEAD state
- *
- * This is called in release_task() when @task is being in the middle of
- * being reaped.  After this, there must be no tracing entanglements.
- *
- * Called with write_lock_irq(&tasklist_lock) held.
- */
-static inline void tracehook_finish_release_task(struct task_struct *task)
-{
-	ptrace_release_task(task);
-}
-
 /**
  * tracehook_signal_handler - signal handler setup is complete
  * @sig:		number of signal being delivered
@@ -388,41 +291,6 @@ static inline void tracehook_signal_handler(int sig, siginfo_t *info,
 		ptrace_notify(SIGTRAP);
 }
 
-/**
- * tracehook_consider_ignored_signal - suppress short-circuit of ignored signal
- * @task:		task receiving the signal
- * @sig:		signal number being sent
- *
- * Return zero iff tracing doesn't care to examine this ignored signal,
- * so it can short-circuit normal delivery and never even get queued.
- *
- * Called with @task->sighand->siglock held.
- */
-static inline int tracehook_consider_ignored_signal(struct task_struct *task,
-						    int sig)
-{
-	return (task->ptrace & PT_PTRACED) != 0;
-}
-
-/**
- * tracehook_consider_fatal_signal - suppress special handling of fatal signal
- * @task:		task receiving the signal
- * @sig:		signal number being sent
- *
- * Return nonzero to prevent special handling of this termination signal.
- * Normally handler for signal is %SIG_DFL.  It can be %SIG_IGN if @sig is
- * ignored, in which case force_sig() is about to reset it to %SIG_DFL.
- * When this returns zero, this signal might cause a quick termination
- * that does not give the debugger a chance to intercept the signal.
- *
- * Called with or without @task->sighand->siglock held.
- */
-static inline int tracehook_consider_fatal_signal(struct task_struct *task,
-						  int sig)
-{
-	return (task->ptrace & PT_PTRACED) != 0;
-}
-
 #define DEATH_REAP			-1
 #define DEATH_DELAYED_GROUP_LEADER	-2
 
@@ -457,30 +325,6 @@ static inline int tracehook_notify_death(struct task_struct *task,
 	return task->ptrace ? SIGCHLD : DEATH_DELAYED_GROUP_LEADER;
 }
 
-/**
- * tracehook_report_death - task is dead and ready to be reaped
- * @task:		@current task now exiting
- * @signal:		return value from tracheook_notify_death()
- * @death_cookie:	value passed back from tracehook_notify_death()
- * @group_dead:		nonzero if this was the last thread in the group to die
- *
- * Thread has just become a zombie or is about to self-reap.  If positive,
- * @signal is the signal number just sent to the parent (usually %SIGCHLD).
- * If @signal is %DEATH_REAP, this thread will self-reap.  If @signal is
- * %DEATH_DELAYED_GROUP_LEADER, this is a delayed_group_leader() zombie.
- * The @death_cookie was passed back by tracehook_notify_death().
- *
- * If normal reaping is not inhibited, @task->exit_state might be changing
- * in parallel.
- *
- * Called without locks.
- */
-static inline void tracehook_report_death(struct task_struct *task,
-					  int signal, void *death_cookie,
-					  int group_dead)
-{
-}
-
 #ifdef TIF_NOTIFY_RESUME
 /**
  * set_notify_resume - cause tracehook_notify_resume() to be called
diff --git a/kernel/exit.c b/kernel/exit.c
index e5cc05644609..d49134a7f250 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -169,7 +169,6 @@ void release_task(struct task_struct * p)
 	struct task_struct *leader;
 	int zap_leader;
 repeat:
-	tracehook_prepare_release_task(p);
 	/* don't need to get the RCU readlock here - the process is dead and
 	 * can't be modifying its own credentials. But shut RCU-lockdep up */
 	rcu_read_lock();
@@ -179,7 +178,7 @@ repeat:
 	proc_flush_task(p);
 
 	write_lock_irq(&tasklist_lock);
-	tracehook_finish_release_task(p);
+	ptrace_release_task(p);
 	__exit_signal(p);
 
 	/*
@@ -868,8 +867,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 		wake_up_process(tsk->signal->group_exit_task);
 	write_unlock_irq(&tasklist_lock);
 
-	tracehook_report_death(tsk, signal, cookie, group_dead);
-
 	/* If the process is dead, release it - nobody will wait for it */
 	if (signal == DEATH_REAP)
 		release_task(tsk);
@@ -924,7 +921,7 @@ NORET_TYPE void do_exit(long code)
 	 */
 	set_fs(USER_DS);
 
-	tracehook_report_exit(&code);
+	ptrace_event(PTRACE_EVENT_EXIT, code);
 
 	validate_creds_for_do_exit(tsk);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 0276c30401a0..d4f0dff9d617 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1527,7 +1527,7 @@ long do_fork(unsigned long clone_flags,
 			freezer_do_not_count();
 			wait_for_completion(&vfork);
 			freezer_count();
-			tracehook_report_vfork_done(p, nr);
+			ptrace_event(PTRACE_EVENT_VFORK_DONE, nr);
 		}
 	} else {
 		nr = PTR_ERR(p);
diff --git a/kernel/signal.c b/kernel/signal.c
index 0f3370872506..1550aee34f42 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -87,7 +87,7 @@ static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns)
 	/*
 	 * Tracers may want to know about even ignored signals.
 	 */
-	return !tracehook_consider_ignored_signal(t, sig);
+	return !t->ptrace;
 }
 
 /*
@@ -493,7 +493,8 @@ int unhandled_signal(struct task_struct *tsk, int sig)
 		return 1;
 	if (handler != SIG_IGN && handler != SIG_DFL)
 		return 0;
-	return !tracehook_consider_fatal_signal(tsk, sig);
+	/* if ptraced, let the tracer determine */
+	return !tsk->ptrace;
 }
 
 /*
@@ -981,8 +982,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
 	if (sig_fatal(p, sig) &&
 	    !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
 	    !sigismember(&t->real_blocked, sig) &&
-	    (sig == SIGKILL ||
-	     !tracehook_consider_fatal_signal(t, sig))) {
+	    (sig == SIGKILL || !t->ptrace)) {
 		/*
 		 * This signal will be fatal to the whole group.
 		 */
diff --git a/mm/nommu.c b/mm/nommu.c
index 1fd0c51b10a6..54ae707bdae8 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -22,7 +22,6 @@
 #include <linux/pagemap.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
-#include <linux/tracehook.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/mount.h>
@@ -1087,7 +1086,7 @@ static unsigned long determine_vm_flags(struct file *file,
 	 * it's being traced - otherwise breakpoints set in it may interfere
 	 * with another untraced process
 	 */
-	if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current))
+	if ((flags & MAP_PRIVATE) && current->ptrace)
 		vm_flags &= ~VM_MAYSHARE;
 
 	return vm_flags;
-- 
cgit v1.2.3


From 4b9d33e6d83cc05a8005a8f9a8b9677fa0f53626 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 17 Jun 2011 16:50:38 +0200
Subject: ptrace: kill clone/exec tracehooks

At this point, tracehooks aren't useful to mainline kernel and mostly
just add an extra layer of obfuscation.  Although they have comments,
without actual in-kernel users, it is difficult to tell what are their
assumptions and they're actually trying to achieve.  To mainline
kernel, they just aren't worth keeping around.

This patch kills the following clone and exec related tracehooks.

	tracehook_prepare_clone()
	tracehook_finish_clone()
	tracehook_report_clone()
	tracehook_report_clone_complete()
	tracehook_unsafe_exec()

The changes are mostly trivial - logic is moved to the caller and
comments are merged and adjusted appropriately.

The only exception is in check_unsafe_exec() where LSM_UNSAFE_PTRACE*
are OR'd to bprm->unsafe instead of setting it, which produces the
same result as the field is always zero on entry.  It also tests
p->ptrace instead of (p->ptrace & PT_PTRACED) for consistency, which
also gives the same result.

This doesn't introduce any behavior change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 fs/exec.c                 |   7 ++-
 include/linux/tracehook.h | 121 ----------------------------------------------
 kernel/fork.c             |  41 ++++++++++++----
 3 files changed, 38 insertions(+), 131 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exec.c b/fs/exec.c
index b37030d0a50b..8dca45b0dae8 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1224,7 +1224,12 @@ int check_unsafe_exec(struct linux_binprm *bprm)
 	unsigned n_fs;
 	int res = 0;
 
-	bprm->unsafe = tracehook_unsafe_exec(p);
+	if (p->ptrace) {
+		if (p->ptrace & PT_PTRACE_CAP)
+			bprm->unsafe |= LSM_UNSAFE_PTRACE_CAP;
+		else
+			bprm->unsafe |= LSM_UNSAFE_PTRACE;
+	}
 
 	n_fs = 1;
 	spin_lock(&p->fs->lock);
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 8b06d4f2b814..bcc4ca762aee 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -129,27 +129,6 @@ static inline void tracehook_report_syscall_exit(struct pt_regs *regs, int step)
 	ptrace_report_syscall(regs);
 }
 
-/**
- * tracehook_unsafe_exec - check for exec declared unsafe due to tracing
- * @task:		current task doing exec
- *
- * Return %LSM_UNSAFE_* bits applied to an exec because of tracing.
- *
- * @task->signal->cred_guard_mutex is held by the caller through the do_execve().
- */
-static inline int tracehook_unsafe_exec(struct task_struct *task)
-{
-	int unsafe = 0;
-	int ptrace = task->ptrace;
-	if (ptrace & PT_PTRACED) {
-		if (ptrace & PT_PTRACE_CAP)
-			unsafe |= LSM_UNSAFE_PTRACE_CAP;
-		else
-			unsafe |= LSM_UNSAFE_PTRACE;
-	}
-	return unsafe;
-}
-
 /**
  * tracehook_tracer_task - return the task that is tracing the given task
  * @tsk:		task to consider
@@ -168,106 +147,6 @@ static inline struct task_struct *tracehook_tracer_task(struct task_struct *tsk)
 	return NULL;
 }
 
-/**
- * tracehook_prepare_clone - prepare for new child to be cloned
- * @clone_flags:	%CLONE_* flags from clone/fork/vfork system call
- *
- * This is called before a new user task is to be cloned.
- * Its return value will be passed to tracehook_finish_clone().
- *
- * Called with no locks held.
- */
-static inline int tracehook_prepare_clone(unsigned clone_flags)
-{
-	int event = 0;
-
-	if (clone_flags & CLONE_UNTRACED)
-		return 0;
-
-	if (clone_flags & CLONE_VFORK)
-		event = PTRACE_EVENT_VFORK;
-	else if ((clone_flags & CSIGNAL) != SIGCHLD)
-		event = PTRACE_EVENT_CLONE;
-	else
-		event = PTRACE_EVENT_FORK;
-
-	return ptrace_event_enabled(current, event) ? event : 0;
-}
-
-/**
- * tracehook_finish_clone - new child created and being attached
- * @child:		new child task
- * @clone_flags:	%CLONE_* flags from clone/fork/vfork system call
- * @trace:		return value from tracehook_prepare_clone()
- *
- * This is called immediately after adding @child to its parent's children list.
- * The @trace value is that returned by tracehook_prepare_clone().
- *
- * Called with current's siglock and write_lock_irq(&tasklist_lock) held.
- */
-static inline void tracehook_finish_clone(struct task_struct *child,
-					  unsigned long clone_flags, int trace)
-{
-	ptrace_init_task(child, (clone_flags & CLONE_PTRACE) || trace);
-}
-
-/**
- * tracehook_report_clone - in parent, new child is about to start running
- * @regs:		parent's user register state
- * @clone_flags:	flags from parent's system call
- * @pid:		new child's PID in the parent's namespace
- * @child:		new child task
- *
- * Called after a child is set up, but before it has been started running.
- * This is not a good place to block, because the child has not started
- * yet.  Suspend the child here if desired, and then block in
- * tracehook_report_clone_complete().  This must prevent the child from
- * self-reaping if tracehook_report_clone_complete() uses the @child
- * pointer; otherwise it might have died and been released by the time
- * tracehook_report_clone_complete() is called.
- *
- * Called with no locks held, but the child cannot run until this returns.
- */
-static inline void tracehook_report_clone(struct pt_regs *regs,
-					  unsigned long clone_flags,
-					  pid_t pid, struct task_struct *child)
-{
-	if (unlikely(child->ptrace)) {
-		/*
-		 * It doesn't matter who attached/attaching to this
-		 * task, the pending SIGSTOP is right in any case.
-		 */
-		sigaddset(&child->pending.signal, SIGSTOP);
-		set_tsk_thread_flag(child, TIF_SIGPENDING);
-	}
-}
-
-/**
- * tracehook_report_clone_complete - new child is running
- * @trace:		return value from tracehook_prepare_clone()
- * @regs:		parent's user register state
- * @clone_flags:	flags from parent's system call
- * @pid:		new child's PID in the parent's namespace
- * @child:		child task, already running
- *
- * This is called just after the child has started running.  This is
- * just before the clone/fork syscall returns, or blocks for vfork
- * child completion if @clone_flags has the %CLONE_VFORK bit set.
- * The @child pointer may be invalid if a self-reaping child died and
- * tracehook_report_clone() took no action to prevent it from self-reaping.
- *
- * Called with no locks held.
- */
-static inline void tracehook_report_clone_complete(int trace,
-						   struct pt_regs *regs,
-						   unsigned long clone_flags,
-						   pid_t pid,
-						   struct task_struct *child)
-{
-	if (unlikely(trace))
-		ptrace_event(trace, pid);
-}
-
 /**
  * tracehook_signal_handler - signal handler setup is complete
  * @sig:		number of signal being delivered
diff --git a/kernel/fork.c b/kernel/fork.c
index d4f0dff9d617..3c72a5b321a7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1340,7 +1340,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	}
 
 	if (likely(p->pid)) {
-		tracehook_finish_clone(p, clone_flags, trace);
+		ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
 
 		if (thread_group_leader(p)) {
 			if (is_child_reaper(pid))
@@ -1481,10 +1481,22 @@ long do_fork(unsigned long clone_flags,
 	}
 
 	/*
-	 * When called from kernel_thread, don't do user tracing stuff.
+	 * Determine whether and which event to report to ptracer.  When
+	 * called from kernel_thread or CLONE_UNTRACED is explicitly
+	 * requested, no event is reported; otherwise, report if the event
+	 * for the type of forking is enabled.
 	 */
-	if (likely(user_mode(regs)))
-		trace = tracehook_prepare_clone(clone_flags);
+	if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED)) {
+		if (clone_flags & CLONE_VFORK)
+			trace = PTRACE_EVENT_VFORK;
+		else if ((clone_flags & CSIGNAL) != SIGCHLD)
+			trace = PTRACE_EVENT_CLONE;
+		else
+			trace = PTRACE_EVENT_FORK;
+
+		if (likely(!ptrace_event_enabled(current, trace)))
+			trace = 0;
+	}
 
 	p = copy_process(clone_flags, stack_start, regs, stack_size,
 			 child_tidptr, NULL, trace);
@@ -1508,20 +1520,31 @@ long do_fork(unsigned long clone_flags,
 		}
 
 		audit_finish_fork(p);
-		tracehook_report_clone(regs, clone_flags, nr, p);
+
+		/*
+		 * Child is ready but hasn't started running yet.  Queue
+		 * SIGSTOP if it's gonna be ptraced - it doesn't matter who
+		 * attached/attaching to this task, the pending SIGSTOP is
+		 * right in any case.
+		 */
+		if (unlikely(p->ptrace)) {
+			sigaddset(&p->pending.signal, SIGSTOP);
+			set_tsk_thread_flag(p, TIF_SIGPENDING);
+		}
 
 		/*
 		 * We set PF_STARTING at creation in case tracing wants to
 		 * use this to distinguish a fully live task from one that
-		 * hasn't gotten to tracehook_report_clone() yet.  Now we
-		 * clear it and set the child going.
+		 * hasn't finished SIGSTOP raising yet.  Now we clear it
+		 * and set the child going.
 		 */
 		p->flags &= ~PF_STARTING;
 
 		wake_up_new_task(p);
 
-		tracehook_report_clone_complete(trace, regs,
-						clone_flags, nr, p);
+		/* forking complete and child started to run, tell ptracer */
+		if (unlikely(trace))
+			ptrace_event(trace, nr);
 
 		if (clone_flags & CLONE_VFORK) {
 			freezer_do_not_count();
-- 
cgit v1.2.3


From 06d984737bac0545fe20bb5447ee488b95adb531 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 17 Jun 2011 16:50:40 +0200
Subject: ptrace: s/tracehook_tracer_task()/ptrace_parent()/

tracehook.h is on the way out.  Rename tracehook_tracer_task() to
ptrace_parent() and move it from tracehook.h to ptrace.h.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: John Johansen <john.johansen@canonical.com>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 fs/proc/array.c            |  2 +-
 fs/proc/base.c             |  2 +-
 include/linux/ptrace.h     | 18 ++++++++++++++++++
 include/linux/tracehook.h  | 18 ------------------
 security/apparmor/domain.c |  2 +-
 security/selinux/hooks.c   |  4 ++--
 6 files changed, 23 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 9b45ee84fbcc..3a1dafd228d1 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -172,7 +172,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 		task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0;
 	tpid = 0;
 	if (pid_alive(p)) {
-		struct task_struct *tracer = tracehook_tracer_task(p);
+		struct task_struct *tracer = ptrace_parent(p);
 		if (tracer)
 			tpid = task_pid_nr_ns(tracer, ns);
 	}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 14def991d9dd..c883dad74b9a 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -216,7 +216,7 @@ static struct mm_struct *__check_mem_permission(struct task_struct *task)
 	if (task_is_stopped_or_traced(task)) {
 		int match;
 		rcu_read_lock();
-		match = (tracehook_tracer_task(task) == current);
+		match = (ptrace_parent(task) == current);
 		rcu_read_unlock();
 		if (match && ptrace_may_access(task, PTRACE_MODE_ATTACH))
 			return mm;
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index b546fd6c3506..bb157bdd0c55 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -150,6 +150,24 @@ int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr,
 int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
 			    unsigned long data);
 
+/**
+ * ptrace_parent - return the task that is tracing the given task
+ * @task: task to consider
+ *
+ * Returns %NULL if no one is tracing @task, or the &struct task_struct
+ * pointer to its tracer.
+ *
+ * Must called under rcu_read_lock().  The pointer returned might be kept
+ * live only by RCU.  During exec, this may be called with task_lock() held
+ * on @task, still held from when check_unsafe_exec() was called.
+ */
+static inline struct task_struct *ptrace_parent(struct task_struct *task)
+{
+	if (unlikely(task->ptrace))
+		return rcu_dereference(task->parent);
+	return NULL;
+}
+
 /**
  * ptrace_event_enabled - test whether a ptrace event is enabled
  * @task: ptracee of interest
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index bcc4ca762aee..7a1bd12aeffa 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -129,24 +129,6 @@ static inline void tracehook_report_syscall_exit(struct pt_regs *regs, int step)
 	ptrace_report_syscall(regs);
 }
 
-/**
- * tracehook_tracer_task - return the task that is tracing the given task
- * @tsk:		task to consider
- *
- * Returns NULL if no one is tracing @task, or the &struct task_struct
- * pointer to its tracer.
- *
- * Must called under rcu_read_lock().  The pointer returned might be kept
- * live only by RCU.  During exec, this may be called with task_lock()
- * held on @task, still held from when tracehook_unsafe_exec() was called.
- */
-static inline struct task_struct *tracehook_tracer_task(struct task_struct *tsk)
-{
-	if (tsk->ptrace & PT_PTRACED)
-		return rcu_dereference(tsk->parent);
-	return NULL;
-}
-
 /**
  * tracehook_signal_handler - signal handler setup is complete
  * @sig:		number of signal being delivered
diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c
index c825c6e0b636..7312bf9f7afc 100644
--- a/security/apparmor/domain.c
+++ b/security/apparmor/domain.c
@@ -67,7 +67,7 @@ static int may_change_ptraced_domain(struct task_struct *task,
 	int error = 0;
 
 	rcu_read_lock();
-	tracer = tracehook_tracer_task(task);
+	tracer = ptrace_parent(task);
 	if (tracer) {
 		/* released below */
 		cred = get_task_cred(tracer);
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index a0d38459d650..fc07d18ed6fc 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2048,7 +2048,7 @@ static int selinux_bprm_set_creds(struct linux_binprm *bprm)
 			u32 ptsid = 0;
 
 			rcu_read_lock();
-			tracer = tracehook_tracer_task(current);
+			tracer = ptrace_parent(current);
 			if (likely(tracer != NULL)) {
 				sec = __task_cred(tracer)->security;
 				ptsid = sec->sid;
@@ -5314,7 +5314,7 @@ static int selinux_setprocattr(struct task_struct *p,
 		   Otherwise, leave SID unchanged and fail. */
 		ptsid = 0;
 		task_lock(p);
-		tracer = tracehook_tracer_task(p);
+		tracer = ptrace_parent(p);
 		if (tracer)
 			ptsid = task_sid(tracer);
 		task_unlock(p);
-- 
cgit v1.2.3


From 670dc2833d144375eac36ad74111495a825a9288 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 20 Jun 2011 13:40:46 +0200
Subject: netlink: advertise incomplete dumps

Consider the following situation:
 * a dump that would show 8 entries, four in the first
   round, and four in the second
 * between the first and second rounds, 6 entries are
   removed
 * now the second round will not show any entry, and
   even if there is a sequence/generation counter the
   application will not know

To solve this problem, add a new flag NLM_F_DUMP_INTR
to the netlink header that indicates the dump wasn't
consistent, this flag can also be set on the MSG_DONE
message that terminates the dump, and as such above
situation can be detected.

To achieve this, add a sequence counter to the netlink
callback struct. Of course, netlink code still needs
to use this new functionality. The correct way to do
that is to always set cb->seq when a dumpit callback
is invoked and call nl_dump_check_consistent() for
each new message. The core code will also call this
function for the final MSG_DONE message.

To make it usable with generic netlink, a new function
genlmsg_nlhdr() is needed to obtain the netlink header
from the genetlink user header.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 include/linux/netlink.h  |  2 ++
 include/net/genetlink.h  | 32 ++++++++++++++++++++++++++++++++
 include/net/netlink.h    | 24 ++++++++++++++++++++++++
 net/netlink/af_netlink.c |  2 ++
 4 files changed, 60 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 4c4ac3f3ce5a..8d1bcec5cc06 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -48,6 +48,7 @@ struct nlmsghdr {
 #define NLM_F_MULTI		2	/* Multipart message, terminated by NLMSG_DONE */
 #define NLM_F_ACK		4	/* Reply with ack, with zero or error code */
 #define NLM_F_ECHO		8	/* Echo this request 		*/
+#define NLM_F_DUMP_INTR		16	/* Dump was inconsistent due to sequence change */
 
 /* Modifiers to GET request */
 #define NLM_F_ROOT	0x100	/* specify tree	root	*/
@@ -221,6 +222,7 @@ struct netlink_callback {
 					struct netlink_callback *cb);
 	int			(*done)(struct netlink_callback *cb);
 	int			family;
+	unsigned int		prev_seq, seq;
 	long			args[6];
 };
 
diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index d420f28b6d60..82d8d09faa44 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -159,6 +159,38 @@ static inline void *genlmsg_put(struct sk_buff *skb, u32 pid, u32 seq,
 	return (char *) hdr + GENL_HDRLEN;
 }
 
+/**
+ * genlmsg_nlhdr - Obtain netlink header from user specified header
+ * @user_hdr: user header as returned from genlmsg_put()
+ * @family: generic netlink family
+ *
+ * Returns pointer to netlink header.
+ */
+static inline struct nlmsghdr *genlmsg_nlhdr(void *user_hdr,
+					     struct genl_family *family)
+{
+	return (struct nlmsghdr *)((char *)user_hdr -
+				   family->hdrsize -
+				   GENL_HDRLEN -
+				   NLMSG_HDRLEN);
+}
+
+/**
+ * genl_dump_check_consistent - check if sequence is consistent and advertise if not
+ * @cb: netlink callback structure that stores the sequence number
+ * @user_hdr: user header as returned from genlmsg_put()
+ * @family: generic netlink family
+ *
+ * Cf. nl_dump_check_consistent(), this just provides a wrapper to make it
+ * simpler to use with generic netlink.
+ */
+static inline void genl_dump_check_consistent(struct netlink_callback *cb,
+					      void *user_hdr,
+					      struct genl_family *family)
+{
+	nl_dump_check_consistent(cb, genlmsg_nlhdr(user_hdr, family));
+}
+
 /**
  * genlmsg_put_reply - Add generic netlink header to a reply message
  * @skb: socket buffer holding the message
diff --git a/include/net/netlink.h b/include/net/netlink.h
index 02740a94f108..98c185441bee 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -638,6 +638,30 @@ static inline int nlmsg_unicast(struct sock *sk, struct sk_buff *skb, u32 pid)
 	     nlmsg_ok(pos, rem); \
 	     pos = nlmsg_next(pos, &(rem)))
 
+/**
+ * nl_dump_check_consistent - check if sequence is consistent and advertise if not
+ * @cb: netlink callback structure that stores the sequence number
+ * @nlh: netlink message header to write the flag to
+ *
+ * This function checks if the sequence (generation) number changed during dump
+ * and if it did, advertises it in the netlink message header.
+ *
+ * The correct way to use it is to set cb->seq to the generation counter when
+ * all locks for dumping have been acquired, and then call this function for
+ * each message that is generated.
+ *
+ * Note that due to initialisation concerns, 0 is an invalid sequence number
+ * and must not be used by code that uses this functionality.
+ */
+static inline void
+nl_dump_check_consistent(struct netlink_callback *cb,
+			 struct nlmsghdr *nlh)
+{
+	if (cb->prev_seq && cb->seq != cb->prev_seq)
+		nlh->nlmsg_flags |= NLM_F_DUMP_INTR;
+	cb->prev_seq = cb->seq;
+}
+
 /**************************************************************************
  * Netlink Attributes
  **************************************************************************/
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 6ef64adf7362..a7ec8512f552 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1693,6 +1693,8 @@ static int netlink_dump(struct sock *sk)
 	if (!nlh)
 		goto errout_skb;
 
+	nl_dump_check_consistent(cb, nlh);
+
 	memcpy(nlmsg_data(nlh), &len, sizeof(len));
 
 	if (sk_filter(sk, skb))
-- 
cgit v1.2.3


From 3762561aa8afb0bd9fb60d3d847961f9945f8143 Mon Sep 17 00:00:00 2001
From: Gabor Juhos <juhosg@openwrt.org>
Date: Tue, 21 Jun 2011 11:23:23 +0200
Subject: ath9k: add MAC revision detection for AR9330

The AR9330 1.0 and 1.1 are using the same revision,
thus it is not possible to distinguish the two chips.
The platform setup code can distinguish the chips based
on the SoC revision.

Add a callback function to ath9k_platform_data in order
to allow getting the revision number from the platform code.

Signed-off-by: Gabor Juhos <juhosg@openwrt.org>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/net/wireless/ath/ath9k/hw.c   | 9 +++++++++
 drivers/net/wireless/ath/ath9k/hw.h   | 1 +
 drivers/net/wireless/ath/ath9k/init.c | 1 +
 include/linux/ath9k_platform.h        | 1 +
 4 files changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/wireless/ath/ath9k/hw.c b/drivers/net/wireless/ath/ath9k/hw.c
index a3881b64f766..9840e0051084 100644
--- a/drivers/net/wireless/ath/ath9k/hw.c
+++ b/drivers/net/wireless/ath/ath9k/hw.c
@@ -251,6 +251,15 @@ static void ath9k_hw_read_revisions(struct ath_hw *ah)
 	case AR5416_AR9100_DEVID:
 		ah->hw_version.macVersion = AR_SREV_VERSION_9100;
 		break;
+	case AR9300_DEVID_AR9330:
+		ah->hw_version.macVersion = AR_SREV_VERSION_9330;
+		if (ah->get_mac_revision) {
+			ah->hw_version.macRev = ah->get_mac_revision();
+		} else {
+			val = REG_READ(ah, AR_SREV);
+			ah->hw_version.macRev = MS(val, AR_SREV_REVISION2);
+		}
+		return;
 	case AR9300_DEVID_AR9340:
 		ah->hw_version.macVersion = AR_SREV_VERSION_9340;
 		val = REG_READ(ah, AR_SREV);
diff --git a/drivers/net/wireless/ath/ath9k/hw.h b/drivers/net/wireless/ath/ath9k/hw.h
index 14c5857a020c..0749fa8c3a58 100644
--- a/drivers/net/wireless/ath/ath9k/hw.h
+++ b/drivers/net/wireless/ath/ath9k/hw.h
@@ -862,6 +862,7 @@ struct ath_hw {
 	u32 ent_mode;
 
 	bool is_clk_25mhz;
+	int (*get_mac_revision)(void);
 };
 
 struct ath_bus_ops {
diff --git a/drivers/net/wireless/ath/ath9k/init.c b/drivers/net/wireless/ath/ath9k/init.c
index d4b166cfdf60..661211291773 100644
--- a/drivers/net/wireless/ath/ath9k/init.c
+++ b/drivers/net/wireless/ath/ath9k/init.c
@@ -574,6 +574,7 @@ static int ath9k_init_softc(u16 devid, struct ath_softc *sc, u16 subsysid,
 		sc->sc_ah->gpio_val = pdata->gpio_val;
 		sc->sc_ah->led_pin = pdata->led_pin;
 		ah->is_clk_25mhz = pdata->is_clk_25mhz;
+		ah->get_mac_revision = pdata->get_mac_revision;
 	}
 
 	common = ath9k_hw_common(ah);
diff --git a/include/linux/ath9k_platform.h b/include/linux/ath9k_platform.h
index 60a7c49dcb49..c207607acada 100644
--- a/include/linux/ath9k_platform.h
+++ b/include/linux/ath9k_platform.h
@@ -30,6 +30,7 @@ struct ath9k_platform_data {
 	u32 gpio_val;
 
 	bool is_clk_25mhz;
+	int (*get_mac_revision)(void);
 };
 
 #endif /* _LINUX_ATH9K_PLATFORM_H */
-- 
cgit v1.2.3


From 7d95847c9b3631d31f657d8cede153b518ed9e2e Mon Sep 17 00:00:00 2001
From: Gabor Juhos <juhosg@openwrt.org>
Date: Tue, 21 Jun 2011 11:23:51 +0200
Subject: ath9k: add external_reset callback to ath9k_platfom_data for AR9330

The patch adds a callback to ath9k_platform_data. If the
callback is provided by the platform code, then it can be
used to hard reset the WMAC device.

The callback is required for doing a hard reset of the AR9330
chips to get them working again after a hang.

Signed-off-by: Gabor Juhos <juhosg@openwrt.org>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/net/wireless/ath/ath9k/hw.c   | 35 +++++++++++++++++++++++++++++++++++
 drivers/net/wireless/ath/ath9k/hw.h   |  1 +
 drivers/net/wireless/ath/ath9k/init.c |  1 +
 include/linux/ath9k_platform.h        |  1 +
 4 files changed, 38 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/wireless/ath/ath9k/hw.c b/drivers/net/wireless/ath/ath9k/hw.c
index 4da5284c7773..839ba64c9e49 100644
--- a/drivers/net/wireless/ath/ath9k/hw.c
+++ b/drivers/net/wireless/ath/ath9k/hw.c
@@ -1161,6 +1161,41 @@ static bool ath9k_hw_set_reset(struct ath_hw *ah, int type)
 			rst_flags |= AR_RTC_RC_MAC_COLD;
 	}
 
+	if (AR_SREV_9330(ah)) {
+		int npend = 0;
+		int i;
+
+		/* AR9330 WAR:
+		 * call external reset function to reset WMAC if:
+		 * - doing a cold reset
+		 * - we have pending frames in the TX queues
+		 */
+
+		for (i = 0; i < AR_NUM_QCU; i++) {
+			npend = ath9k_hw_numtxpending(ah, i);
+			if (npend)
+				break;
+		}
+
+		if (ah->external_reset &&
+		    (npend || type == ATH9K_RESET_COLD)) {
+			int reset_err = 0;
+
+			ath_dbg(ath9k_hw_common(ah), ATH_DBG_RESET,
+				"reset MAC via external reset\n");
+
+			reset_err = ah->external_reset();
+			if (reset_err) {
+				ath_err(ath9k_hw_common(ah),
+					"External reset failed, err=%d\n",
+					reset_err);
+				return false;
+			}
+
+			REG_WRITE(ah, AR_RTC_RESET, 1);
+		}
+	}
+
 	REG_WRITE(ah, AR_RTC_RC, rst_flags);
 
 	REGWRITE_BUFFER_FLUSH(ah);
diff --git a/drivers/net/wireless/ath/ath9k/hw.h b/drivers/net/wireless/ath/ath9k/hw.h
index 0749fa8c3a58..818acdd1ba90 100644
--- a/drivers/net/wireless/ath/ath9k/hw.h
+++ b/drivers/net/wireless/ath/ath9k/hw.h
@@ -863,6 +863,7 @@ struct ath_hw {
 
 	bool is_clk_25mhz;
 	int (*get_mac_revision)(void);
+	int (*external_reset)(void);
 };
 
 struct ath_bus_ops {
diff --git a/drivers/net/wireless/ath/ath9k/init.c b/drivers/net/wireless/ath/ath9k/init.c
index 1851c222fff0..50103b2792b5 100644
--- a/drivers/net/wireless/ath/ath9k/init.c
+++ b/drivers/net/wireless/ath/ath9k/init.c
@@ -575,6 +575,7 @@ static int ath9k_init_softc(u16 devid, struct ath_softc *sc, u16 subsysid,
 		sc->sc_ah->led_pin = pdata->led_pin;
 		ah->is_clk_25mhz = pdata->is_clk_25mhz;
 		ah->get_mac_revision = pdata->get_mac_revision;
+		ah->external_reset = pdata->external_reset;
 	}
 
 	common = ath9k_hw_common(ah);
diff --git a/include/linux/ath9k_platform.h b/include/linux/ath9k_platform.h
index c207607acada..6e3f54f37844 100644
--- a/include/linux/ath9k_platform.h
+++ b/include/linux/ath9k_platform.h
@@ -31,6 +31,7 @@ struct ath9k_platform_data {
 
 	bool is_clk_25mhz;
 	int (*get_mac_revision)(void);
+	int (*external_reset)(void);
 };
 
 #endif /* _LINUX_ATH9K_PLATFORM_H */
-- 
cgit v1.2.3


From a7de915383a6d5c05663f9badbd10d5a87bc1586 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 22 Jun 2011 22:53:27 +0200
Subject: genirq: Remove unused CHECK_IRQ_PER_CPU()

No more users. Kill it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irq.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 8b4538446636..68258f0369db 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -96,11 +96,6 @@ enum {
 
 #define IRQ_NO_BALANCING_MASK	(IRQ_PER_CPU | IRQ_NO_BALANCING)
 
-static inline __deprecated bool CHECK_IRQ_PER_CPU(unsigned int status)
-{
-	return status & IRQ_PER_CPU;
-}
-
 /*
  * Return value for chip->irq_set_affinity()
  *
-- 
cgit v1.2.3


From d902db1eb60387040fe541573083e47469db50ac Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 8 Jun 2011 19:31:56 +0200
Subject: sched: Generalize sleep inside spinlock detection

The sleeping inside spinlock detection is actually used
for more general sleeping inside atomic sections
debugging: preemption disabled, rcu read side critical
sections, interrupts, interrupt disabled, etc...

Change the name of the config and its help section to
reflect its more general role.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Ingo Molnar <mingo@elte.hu>
---
 Documentation/DocBook/kernel-hacking.tmpl  | 2 +-
 Documentation/SubmitChecklist              | 2 +-
 Documentation/development-process/4.Coding | 2 +-
 Documentation/ja_JP/SubmitChecklist        | 2 +-
 Documentation/zh_CN/SubmitChecklist        | 2 +-
 include/linux/kernel.h                     | 2 +-
 kernel/sched.c                             | 2 +-
 lib/Kconfig.debug                          | 8 +++++---
 8 files changed, 12 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/DocBook/kernel-hacking.tmpl b/Documentation/DocBook/kernel-hacking.tmpl
index 7b3f49363413..07a9c48de5a2 100644
--- a/Documentation/DocBook/kernel-hacking.tmpl
+++ b/Documentation/DocBook/kernel-hacking.tmpl
@@ -409,7 +409,7 @@ cond_resched(); /* Will sleep */
 
   <para>
    You should always compile your kernel
-   <symbol>CONFIG_DEBUG_SPINLOCK_SLEEP</symbol> on, and it will warn
+   <symbol>CONFIG_DEBUG_ATOMIC_SLEEP</symbol> on, and it will warn
    you if you break these rules.  If you <emphasis>do</emphasis> break
    the rules, you will eventually lock up your box.
   </para>
diff --git a/Documentation/SubmitChecklist b/Documentation/SubmitChecklist
index da0382daa395..7b13be41c085 100644
--- a/Documentation/SubmitChecklist
+++ b/Documentation/SubmitChecklist
@@ -53,7 +53,7 @@ kernel patches.
 
 12: Has been tested with CONFIG_PREEMPT, CONFIG_DEBUG_PREEMPT,
     CONFIG_DEBUG_SLAB, CONFIG_DEBUG_PAGEALLOC, CONFIG_DEBUG_MUTEXES,
-    CONFIG_DEBUG_SPINLOCK, CONFIG_DEBUG_SPINLOCK_SLEEP all simultaneously
+    CONFIG_DEBUG_SPINLOCK, CONFIG_DEBUG_ATOMIC_SLEEP all simultaneously
     enabled.
 
 13: Has been build- and runtime tested with and without CONFIG_SMP and
diff --git a/Documentation/development-process/4.Coding b/Documentation/development-process/4.Coding
index f3f1a469443c..83f5f5b365a3 100644
--- a/Documentation/development-process/4.Coding
+++ b/Documentation/development-process/4.Coding
@@ -244,7 +244,7 @@ testing purposes.  In particular, you should turn on:
  - DEBUG_SLAB can find a variety of memory allocation and use errors; it
    should be used on most development kernels.
 
- - DEBUG_SPINLOCK, DEBUG_SPINLOCK_SLEEP, and DEBUG_MUTEXES will find a
+ - DEBUG_SPINLOCK, DEBUG_ATOMIC_SLEEP, and DEBUG_MUTEXES will find a
    number of common locking errors.
 
 There are quite a few other debugging options, some of which will be
diff --git a/Documentation/ja_JP/SubmitChecklist b/Documentation/ja_JP/SubmitChecklist
index 2df4576f1173..cb5507b1ac81 100644
--- a/Documentation/ja_JP/SubmitChecklist
+++ b/Documentation/ja_JP/SubmitChecklist
@@ -68,7 +68,7 @@ Linux カーネルパッチ投稿者向けチェックリスト
 
 12: CONFIG_PREEMPT, CONFIG_DEBUG_PREEMPT, CONFIG_DEBUG_SLAB,
     CONFIG_DEBUG_PAGEALLOC, CONFIG_DEBUG_MUTEXES, CONFIG_DEBUG_SPINLOCK,
-    CONFIG_DEBUG_SPINLOCK_SLEEP これら全てを同時に有効にして動作確認を
+    CONFIG_DEBUG_ATOMIC_SLEEP これら全てを同時に有効にして動作確認を
     行ってください。
 
 13: CONFIG_SMP, CONFIG_PREEMPT を有効にした場合と無効にした場合の両方で
diff --git a/Documentation/zh_CN/SubmitChecklist b/Documentation/zh_CN/SubmitChecklist
index 951415bbab0c..4c741d6bc048 100644
--- a/Documentation/zh_CN/SubmitChecklist
+++ b/Documentation/zh_CN/SubmitChecklist
@@ -67,7 +67,7 @@ Linux
 
 12���Ѿ�ͨ��CONFIG_PREEMPT, CONFIG_DEBUG_PREEMPT,
     CONFIG_DEBUG_SLAB, CONFIG_DEBUG_PAGEALLOC, CONFIG_DEBUG_MUTEXES,
-    CONFIG_DEBUG_SPINLOCK, CONFIG_DEBUG_SPINLOCK_SLEEP���ԣ�����ͬʱ��
+    CONFIG_DEBUG_SPINLOCK, CONFIG_DEBUG_ATOMIC_SLEEP���ԣ�����ͬʱ��
     ʹ�ܡ�
 
 13���Ѿ�����������ʹ�û��߲�ʹ�� CONFIG_SMP �� CONFIG_PREEMPT����ִ��ʱ�䡣
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index fb0e7329fee1..24b489f66592 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -121,7 +121,7 @@ extern int _cond_resched(void);
 # define might_resched() do { } while (0)
 #endif
 
-#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
   void __might_sleep(const char *file, int line, int preempt_offset);
 /**
  * might_sleep - annotation for functions that can sleep
diff --git a/kernel/sched.c b/kernel/sched.c
index 90ad7cf2b290..a5f318b8d659 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8018,7 +8018,7 @@ void __init sched_init(void)
 	scheduler_running = 1;
 }
 
-#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 static inline int preempt_count_equals(int preempt_offset)
 {
 	int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index a7dd7b547fea..81a4f3302bc8 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -648,13 +648,15 @@ config TRACE_IRQFLAGS
 	  Enables hooks to interrupt enabling and disabling for
 	  either tracing or lock debugging.
 
-config DEBUG_SPINLOCK_SLEEP
-	bool "Spinlock debugging: sleep-inside-spinlock checking"
+config DEBUG_ATOMIC_SLEEP
+	bool "Sleep inside atomic section checking"
 	select PREEMPT_COUNT
 	depends on DEBUG_KERNEL
 	help
 	  If you say Y here, various routines which may sleep will become very
-	  noisy if they are called with a spinlock held.
+	  noisy if they are called inside atomic sections: when a spinlock is
+	  held, inside an rcu read side critical section, inside preempt disabled
+	  sections, inside an interrupt, etc...
 
 config DEBUG_LOCKING_API_SELFTESTS
 	bool "Locking API boot-time self-tests"
-- 
cgit v1.2.3


From 01dc9cc314ad1009f5b7356399b386015fdb0400 Mon Sep 17 00:00:00 2001
From: David Wagner <david.wagner@free-electrons.com>
Date: Mon, 20 Jun 2011 17:34:19 +0200
Subject: UBI: clarify the volume notification types' doc

I realized the new descriptions of ADDED and REMOVED could also be
misleading: they can also be triggered after using a userland util
(ubi{mk,rm}vol).

Artem: amend the commentaries

Signed-off-by David Wagner <david.wagner@free-electrons.com>
Signed-off-by: Artem Bityutskiy <dedekind1@gmail.com>
---
 include/linux/mtd/ubi.h | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mtd/ubi.h b/include/linux/mtd/ubi.h
index 15da0e99f48a..db4836bed514 100644
--- a/include/linux/mtd/ubi.h
+++ b/include/linux/mtd/ubi.h
@@ -155,12 +155,14 @@ struct ubi_device_info {
 };
 
 /*
- * enum - volume notification types.
- * @UBI_VOLUME_ADDED: volume has been added
- * @UBI_VOLUME_REMOVED: start volume volume
- * @UBI_VOLUME_RESIZED: volume size has been re-sized
- * @UBI_VOLUME_RENAMED: volume name has been re-named
- * @UBI_VOLUME_UPDATED: volume name has been updated
+ * Volume notification types.
+ * @UBI_VOLUME_ADDED: a volume has been added (an UBI device was attached or a
+ *                    volume was created)
+ * @UBI_VOLUME_REMOVED: a volume has been removed (an UBI device was detached
+ *			or a volume was removed)
+ * @UBI_VOLUME_RESIZED: a volume has been re-sized
+ * @UBI_VOLUME_RENAMED: a volume has been re-named
+ * @UBI_VOLUME_UPDATED: data has been written to a volume
  *
  * These constants define which type of event has happened when a volume
  * notification function is invoked.
-- 
cgit v1.2.3


From 39785eb1d3e6c58cc8bf8f6990956a58037ecc75 Mon Sep 17 00:00:00 2001
From: Timur Tabi <timur@freescale.com>
Date: Thu, 23 Jun 2011 20:20:26 +0000
Subject: fsl-diu-fb: remove check for pixel clock ranges

The Freescale DIU framebuffer driver defines two constants, MIN_PIX_CLK and
MAX_PIX_CLK, that are supposed to represent the lower and upper limits of
the pixel clock.  These values, however, are true only for one platform
clock rate (533MHz) and only for the MPC8610.  So the actual range for
the pixel clock is chip-specific, which means the current values are almost
always wrong.  The chance of an out-of-range pixel clock being used are also
remote.

Rather than try to detect an out-of-range clock in the DIU driver, we depend
on the board-specific pixel clock function (e.g. p1022ds_set_pixel_clock)
to clamp the pixel clock to a supported value.

Signed-off-by: Timur Tabi <timur@freescale.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 drivers/video/fsl-diu-fb.c | 16 ----------------
 include/linux/fsl-diu-fb.h |  6 ------
 2 files changed, 22 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/video/fsl-diu-fb.c b/drivers/video/fsl-diu-fb.c
index bedf5be27f05..0acc7d65aeaa 100644
--- a/drivers/video/fsl-diu-fb.c
+++ b/drivers/video/fsl-diu-fb.c
@@ -555,8 +555,6 @@ static void adjust_aoi_size_position(struct fb_var_screeninfo *var,
 static int fsl_diu_check_var(struct fb_var_screeninfo *var,
 				struct fb_info *info)
 {
-	unsigned long htotal, vtotal;
-
 	pr_debug("check_var xres: %d\n", var->xres);
 	pr_debug("check_var yres: %d\n", var->yres);
 
@@ -635,20 +633,6 @@ static int fsl_diu_check_var(struct fb_var_screeninfo *var,
 
 		break;
 	}
-	/* If the pixclock is below the minimum spec'd value then set to
-	 * refresh rate for 60Hz since this is supported by most monitors.
-	 * Refer to Documentation/fb/ for calculations.
-	 */
-	if ((var->pixclock < MIN_PIX_CLK) || (var->pixclock > MAX_PIX_CLK)) {
-		htotal = var->xres + var->right_margin + var->hsync_len +
-		    var->left_margin;
-		vtotal = var->yres + var->lower_margin + var->vsync_len +
-		    var->upper_margin;
-		var->pixclock = (vtotal * htotal * 6UL) / 100UL;
-		var->pixclock = KHZ2PICOS(var->pixclock);
-		pr_debug("pixclock set for 60Hz refresh = %u ps\n",
-			var->pixclock);
-	}
 
 	var->height = -1;
 	var->width = -1;
diff --git a/include/linux/fsl-diu-fb.h b/include/linux/fsl-diu-fb.h
index 781d4671415f..daa9952d2174 100644
--- a/include/linux/fsl-diu-fb.h
+++ b/include/linux/fsl-diu-fb.h
@@ -24,12 +24,6 @@
  * See mpc8610fb_set_par(), map_video_memory(), and unmap_video_memory()
  */
 #define MEM_ALLOC_THRESHOLD (1024*768*4+32)
-/* Minimum value that the pixel clock can be set to in pico seconds
- * This is determined by platform clock/3 where the minimum platform
- * clock is 533MHz. This gives 5629 pico seconds.
- */
-#define MIN_PIX_CLK 5629
-#define MAX_PIX_CLK 96096
 
 #include <linux/types.h>
 
-- 
cgit v1.2.3


From 53c8f9f199b239668e6b1a907735ee323a0d1ccd Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 22 Jun 2011 23:08:18 +0200
Subject: make do_notify_parent() return bool

- change do_notify_parent() to return a boolean, true if the task should
  be reaped because its parent ignores SIGCHLD.

- update the only caller which checks the returned value, exit_notify().

This temporary uglifies exit_notify() even more, will be cleanuped by
the next change.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
---
 include/linux/sched.h |  4 ++--
 kernel/exit.c         |  9 ++++++---
 kernel/signal.c       | 17 +++++++++--------
 3 files changed, 17 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 87f7ca7ed6f6..0df7231d9ee0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2145,7 +2145,7 @@ static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, s
 	spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
 
 	return ret;
-}	
+}
 
 extern void block_all_signals(int (*notifier)(void *priv), void *priv,
 			      sigset_t *mask);
@@ -2160,7 +2160,7 @@ extern int kill_pid_info_as_uid(int, struct siginfo *, struct pid *, uid_t, uid_
 extern int kill_pgrp(struct pid *pid, int sig, int priv);
 extern int kill_pid(struct pid *pid, int sig, int priv);
 extern int kill_proc_info(int, struct siginfo *, pid_t);
-extern int do_notify_parent(struct task_struct *, int);
+extern bool do_notify_parent(struct task_struct *, int);
 extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
 extern void force_sig(int, struct task_struct *);
 extern int send_sig(int, struct task_struct *, int);
diff --git a/kernel/exit.c b/kernel/exit.c
index d49134a7f250..34d135f4fccc 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -820,6 +820,7 @@ static void forget_original_parent(struct task_struct *father)
 static void exit_notify(struct task_struct *tsk, int group_dead)
 {
 	int signal;
+	bool autoreap;
 	void *cookie;
 
 	/*
@@ -858,9 +859,11 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 
 	signal = tracehook_notify_death(tsk, &cookie, group_dead);
 	if (signal >= 0)
-		signal = do_notify_parent(tsk, signal);
+		autoreap = do_notify_parent(tsk, signal);
+	else
+		autoreap = (signal == DEATH_REAP);
 
-	tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE;
+	tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
 
 	/* mt-exec, de_thread() is waiting for group leader */
 	if (unlikely(tsk->signal->notify_count < 0))
@@ -868,7 +871,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 	write_unlock_irq(&tasklist_lock);
 
 	/* If the process is dead, release it - nobody will wait for it */
-	if (signal == DEATH_REAP)
+	if (autoreap)
 		release_task(tsk);
 }
 
diff --git a/kernel/signal.c b/kernel/signal.c
index 1550aee34f42..d52e82cd62bb 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1577,15 +1577,15 @@ ret:
  * Let a parent know about the death of a child.
  * For a stopped/continued status change, use do_notify_parent_cldstop instead.
  *
- * Returns -1 if our parent ignored us and so we've switched to
- * self-reaping, or else @sig.
+ * Returns true if our parent ignored us and so we've switched to
+ * self-reaping.
  */
-int do_notify_parent(struct task_struct *tsk, int sig)
+bool do_notify_parent(struct task_struct *tsk, int sig)
 {
 	struct siginfo info;
 	unsigned long flags;
 	struct sighand_struct *psig;
-	int ret = sig;
+	bool autoreap = false;
 
 	BUG_ON(sig == -1);
 
@@ -1649,16 +1649,17 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 		 * is implementation-defined: we do (if you don't want
 		 * it, just use SIG_IGN instead).
 		 */
-		ret = tsk->exit_signal = -1;
+		autoreap = true;
+		tsk->exit_signal = -1;
 		if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
-			sig = -1;
+			sig = 0;
 	}
-	if (valid_signal(sig) && sig > 0)
+	if (valid_signal(sig) && sig)
 		__group_send_sig_info(sig, &info, tsk->parent);
 	__wake_up_parent(tsk, tsk->parent);
 	spin_unlock_irqrestore(&psig->siglock, flags);
 
-	return ret;
+	return autoreap;
 }
 
 /**
-- 
cgit v1.2.3


From 45cdf5cc0703c537194588c63d53bad1f2539d36 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 23 Jun 2011 19:06:50 +0200
Subject: kill tracehook_notify_death()

Kill tracehook_notify_death(), reimplement the logic in its caller,
exit_notify().

Also, change the exec_id's check to use thread_group_leader() instead
of task_detached(), this is more clear. This logic only applies to
the exiting leader, a sub-thread must never change its exit_signal.

Note: when the traced group leader exits the exit_signal-or-SIGCHLD
logic looks really strange:

	- we notify the tracer even if !thread_group_empty() but
	   do_wait(WEXITED) can't work until all threads exit

	- if the tracer is real_parent, it is not clear why can't
	  we use ->exit_signal event if !thread_group_empty()

-v2: do not try to fix the 2nd oddity to avoid the subtle behavior
     change mixed with reorganization, suggested by Tejun.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Tejun Heo <tj@kernel.org>
---
 include/linux/tracehook.h | 34 ----------------------------------
 kernel/exit.c             | 21 +++++++++++++--------
 2 files changed, 13 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 7a1bd12aeffa..a71a2927a6a0 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -152,40 +152,6 @@ static inline void tracehook_signal_handler(int sig, siginfo_t *info,
 		ptrace_notify(SIGTRAP);
 }
 
-#define DEATH_REAP			-1
-#define DEATH_DELAYED_GROUP_LEADER	-2
-
-/**
- * tracehook_notify_death - task is dead, ready to notify parent
- * @task:		@current task now exiting
- * @death_cookie:	value to pass to tracehook_report_death()
- * @group_dead:		nonzero if this was the last thread in the group to die
- *
- * A return value >= 0 means call do_notify_parent() with that signal
- * number.  Negative return value can be %DEATH_REAP to self-reap right
- * now, or %DEATH_DELAYED_GROUP_LEADER to a zombie without notifying our
- * parent.  Note that a return value of 0 means a do_notify_parent() call
- * that sends no signal, but still wakes up a parent blocked in wait*().
- *
- * Called with write_lock_irq(&tasklist_lock) held.
- */
-static inline int tracehook_notify_death(struct task_struct *task,
-					 void **death_cookie, int group_dead)
-{
-	if (task_detached(task))
-		return task->ptrace ? SIGCHLD : DEATH_REAP;
-
-	/*
-	 * If something other than our normal parent is ptracing us, then
-	 * send it a SIGCHLD instead of honoring exit_signal.  exit_signal
-	 * only has special meaning to our real parent.
-	 */
-	if (thread_group_empty(task) && !ptrace_reparented(task))
-		return task->exit_signal;
-
-	return task->ptrace ? SIGCHLD : DEATH_DELAYED_GROUP_LEADER;
-}
-
 #ifdef TIF_NOTIFY_RESUME
 /**
  * set_notify_resume - cause tracehook_notify_resume() to be called
diff --git a/kernel/exit.c b/kernel/exit.c
index 34d135f4fccc..bb08e938ca74 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -819,9 +819,7 @@ static void forget_original_parent(struct task_struct *father)
  */
 static void exit_notify(struct task_struct *tsk, int group_dead)
 {
-	int signal;
 	bool autoreap;
-	void *cookie;
 
 	/*
 	 * This does two things:
@@ -852,16 +850,23 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 	 * we have changed execution domain as these two values started
 	 * the same after a fork.
 	 */
-	if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) &&
+	if (thread_group_leader(tsk) && tsk->exit_signal != SIGCHLD &&
 	    (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
 	     tsk->self_exec_id != tsk->parent_exec_id))
 		tsk->exit_signal = SIGCHLD;
 
-	signal = tracehook_notify_death(tsk, &cookie, group_dead);
-	if (signal >= 0)
-		autoreap = do_notify_parent(tsk, signal);
-	else
-		autoreap = (signal == DEATH_REAP);
+	if (unlikely(tsk->ptrace)) {
+		int sig = thread_group_leader(tsk) &&
+				thread_group_empty(tsk) &&
+				!ptrace_reparented(tsk) ?
+			tsk->exit_signal : SIGCHLD;
+		autoreap = do_notify_parent(tsk, sig);
+	} else if (thread_group_leader(tsk)) {
+		autoreap = thread_group_empty(tsk) &&
+			do_notify_parent(tsk, tsk->exit_signal);
+	} else {
+		autoreap = true;
+	}
 
 	tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
 
-- 
cgit v1.2.3


From 8677347378044ab564470bced2275520efb3670d Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 22 Jun 2011 23:09:09 +0200
Subject: make do_notify_parent() __must_check, update the callers

Change other callers of do_notify_parent() to check the value it
returns, this makes the subsequent task_detached() unnecessary.
Mark do_notify_parent() as __must_check.

Use thread_group_leader() instead of !task_detached() to check
if we need to notify the real parent in wait_task_zombie().

Remove the stale comment in release_task(). "just for sanity" is
no longer true, we have to set EXIT_DEAD to avoid the races with
do_wait().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
---
 include/linux/sched.h |  2 +-
 kernel/exit.c         | 29 ++++++++---------------------
 2 files changed, 9 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0df7231d9ee0..0cb4f097f76c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2160,7 +2160,7 @@ extern int kill_pid_info_as_uid(int, struct siginfo *, struct pid *, uid_t, uid_
 extern int kill_pgrp(struct pid *pid, int sig, int priv);
 extern int kill_pid(struct pid *pid, int sig, int priv);
 extern int kill_proc_info(int, struct siginfo *, pid_t);
-extern bool do_notify_parent(struct task_struct *, int);
+extern __must_check bool do_notify_parent(struct task_struct *, int);
 extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
 extern void force_sig(int, struct task_struct *);
 extern int send_sig(int, struct task_struct *, int);
diff --git a/kernel/exit.c b/kernel/exit.c
index bb08e938ca74..f68d137ffeb4 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -190,21 +190,12 @@ repeat:
 	leader = p->group_leader;
 	if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
 		BUG_ON(task_detached(leader));
-		do_notify_parent(leader, leader->exit_signal);
 		/*
 		 * If we were the last child thread and the leader has
 		 * exited already, and the leader's parent ignores SIGCHLD,
 		 * then we are the one who should release the leader.
-		 *
-		 * do_notify_parent() will have marked it self-reaping in
-		 * that case.
-		 */
-		zap_leader = task_detached(leader);
-
-		/*
-		 * This maintains the invariant that release_task()
-		 * only runs on a task in EXIT_DEAD, just for sanity.
 		 */
+		zap_leader = do_notify_parent(leader, leader->exit_signal);
 		if (zap_leader)
 			leader->exit_state = EXIT_DEAD;
 	}
@@ -766,8 +757,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
 	/* If it has exited notify the new parent about this child's death. */
 	if (!p->ptrace &&
 	    p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
-		do_notify_parent(p, p->exit_signal);
-		if (task_detached(p)) {
+		if (do_notify_parent(p, p->exit_signal)) {
 			p->exit_state = EXIT_DEAD;
 			list_move_tail(&p->sibling, dead);
 		}
@@ -1351,16 +1341,13 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 		/* We dropped tasklist, ptracer could die and untrace */
 		ptrace_unlink(p);
 		/*
-		 * If this is not a detached task, notify the parent.
-		 * If it's still not detached after that, don't release
-		 * it now.
+		 * If this is not a sub-thread, notify the parent.
+		 * If parent wants a zombie, don't release it now.
 		 */
-		if (!task_detached(p)) {
-			do_notify_parent(p, p->exit_signal);
-			if (!task_detached(p)) {
-				p->exit_state = EXIT_ZOMBIE;
-				p = NULL;
-			}
+		if (thread_group_leader(p) &&
+		    !do_notify_parent(p, p->exit_signal)) {
+			p->exit_state = EXIT_ZOMBIE;
+			p = NULL;
 		}
 		write_unlock_irq(&tasklist_lock);
 	}
-- 
cgit v1.2.3


From e550f14dc6322e794d4e70825f63c9c99177ae8b Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 22 Jun 2011 23:09:54 +0200
Subject: kill task_detached()

Upadate the last user of task_detached(), wait_task_zombie(), to
use thread_group_leader() and kill task_detached().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Tejun Heo <tj@kernel.org>
---
 include/linux/sched.h | 5 -----
 kernel/exit.c         | 5 ++---
 2 files changed, 2 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0cb4f097f76c..39acee2c8929 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2318,11 +2318,6 @@ static inline int thread_group_empty(struct task_struct *p)
 #define delay_group_leader(p) \
 		(thread_group_leader(p) && !thread_group_empty(p))
 
-static inline int task_detached(struct task_struct *p)
-{
-	return p->exit_signal == -1;
-}
-
 /*
  * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
  * subscriptions and synchronises with wait4().  Also used in procfs.  Also
diff --git a/kernel/exit.c b/kernel/exit.c
index 2b1ba8048a14..9fa99702645d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -189,7 +189,6 @@ repeat:
 	zap_leader = 0;
 	leader = p->group_leader;
 	if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
-		BUG_ON(task_detached(leader));
 		/*
 		 * If we were the last child thread and the leader has
 		 * exited already, and the leader's parent ignores SIGCHLD,
@@ -1231,9 +1230,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 	traced = ptrace_reparented(p);
 	/*
 	 * It can be ptraced but not reparented, check
-	 * !task_detached() to filter out sub-threads.
+	 * thread_group_leader() to filter out sub-threads.
 	 */
-	if (likely(!traced) && likely(!task_detached(p))) {
+	if (likely(!traced) && thread_group_leader(p)) {
 		struct signal_struct *psig;
 		struct signal_struct *sig;
 		unsigned long maxrss;
-- 
cgit v1.2.3


From 087806b1281563e4ae7a5bce3155f894af5f4118 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 22 Jun 2011 23:10:26 +0200
Subject: redefine thread_group_leader() as exit_signal >= 0

Change de_thread() to set old_leader->exit_signal = -1. This is
good for the consistency, it is no longer the leader and all
sub-threads have exit_signal = -1 set by copy_process(CLONE_THREAD).

And this allows us to micro-optimize thread_group_leader(), it can
simply check exit_signal >= 0. This also makes sense because we
should move ->group_leader from task_struct to signal_struct.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Tejun Heo <tj@kernel.org>
---
 fs/exec.c             | 1 +
 include/linux/sched.h | 6 ++++--
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exec.c b/fs/exec.c
index 8dca45b0dae8..c3d517bfdd27 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -963,6 +963,7 @@ static int de_thread(struct task_struct *tsk)
 		leader->group_leader = tsk;
 
 		tsk->exit_signal = SIGCHLD;
+		leader->exit_signal = -1;
 
 		BUG_ON(leader->exit_state != EXIT_ZOMBIE);
 		leader->exit_state = EXIT_DEAD;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 39acee2c8929..b38ed51d5c64 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2284,8 +2284,10 @@ static inline int get_nr_threads(struct task_struct *tsk)
 	return tsk->signal->nr_threads;
 }
 
-/* de_thread depends on thread_group_leader not being a pid based check */
-#define thread_group_leader(p)	(p == p->group_leader)
+static inline bool thread_group_leader(struct task_struct *p)
+{
+	return p->exit_signal >= 0;
+}
 
 /* Do to the insanities of de_thread it is possible for a process
  * to have the pid of the thread group leader without actually being
-- 
cgit v1.2.3


From 0347e17739095c58c0194fed6a61aced3536d258 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 24 Jun 2011 17:34:06 +0200
Subject: ptrace: ptrace_reparented() should check same_thread_group()

ptrace_reparented() naively does parent != real_parent, this means
it returns true even if the tracer _is_ the real parent. This is per
process thing, not per-thread. The only reason ->real_parent can
point to the non-leader thread is that we have __WNOTHREAD.

Change it to check !same_thread_group(parent, real_parent).

It has two callers, and in both cases the current check does not
look right.

exit_notify: we should respect ->exit_signal if the exiting leader
is traced by any thread from the parent thread group. It is the
child of the whole group, and we are going to send the signal to
the whole group.

wait_task_zombie: without __WNOTHREAD do_wait() should do the same
for any thread, only sys_ptrace() is "bound" to the single thread.
However do_wait(WEXITED) succeeds but does not release a traced
natural child unless the caller is the tracer.

Test-case:

	void *tfunc(void *arg)
	{
		assert(ptrace(PTRACE_ATTACH, (long)arg, 0,0) == 0);
		pause();
		return NULL;
	}

	int main(void)
	{
		pthread_t thr;
		pid_t pid, stat, ret;

		pid = fork();
		if (!pid) {
			pause();
			assert(0);
		}

		assert(pthread_create(&thr, NULL, tfunc, (void*)(long)pid) == 0);

		assert(waitpid(-1, &stat, 0) == pid);
		assert(WIFSTOPPED(stat));

		kill(pid, SIGKILL);

		assert(waitpid(-1, &stat, 0) == pid);
		assert(WIFSIGNALED(stat) && WTERMSIG(stat) == SIGKILL);

		ret = waitpid(pid, &stat, 0);
		if (ret < 0)
			return 0;

		printf("WTF? %d is dead, but: wait=%d stat=%x\n",
				pid, ret, stat);

		return 1;
	}

Note that the main thread simply does

	pid = fork();
	kill(pid, SIGKILL);

and then without the patch wait4(WEXITED) succeeds twice and reports
WTERMSIG(stat) == SIGKILL.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
---
 include/linux/ptrace.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index bb157bdd0c55..eae381d584f9 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -136,7 +136,7 @@ extern bool ptrace_may_access(struct task_struct *task, unsigned int mode);
 
 static inline int ptrace_reparented(struct task_struct *child)
 {
-	return child->real_parent != child->parent;
+	return !same_thread_group(child->real_parent, child->parent);
 }
 
 static inline void ptrace_unlink(struct task_struct *child)
-- 
cgit v1.2.3


From 04b7dcf979d71e870683c804802e44287a802760 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 22 Jun 2011 10:06:59 +0200
Subject: wireless: unify QoS control field definitions

Move all that mac80211 has into the generic
ieee80211.h header file and use them. At the
same time move them from mask+shift to just
bits and rename them for consistent names.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/net/wireless/ath/carl9170/rx.c  |  2 +-
 drivers/net/wireless/libertas_tf/main.c |  2 +-
 include/linux/ieee80211.h               | 18 +++++++++++++-----
 net/mac80211/rx.c                       |  2 +-
 net/mac80211/wme.c                      |  3 +--
 net/mac80211/wme.h                      |  5 -----
 6 files changed, 17 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/wireless/ath/carl9170/rx.c b/drivers/net/wireless/ath/carl9170/rx.c
index ec21ea9fd8d5..dc99030ea8b6 100644
--- a/drivers/net/wireless/ath/carl9170/rx.c
+++ b/drivers/net/wireless/ath/carl9170/rx.c
@@ -472,7 +472,7 @@ static struct sk_buff *carl9170_rx_copy_data(u8 *buf, int len)
 		u8 *qc = ieee80211_get_qos_ctl(hdr);
 		reserved += NET_IP_ALIGN;
 
-		if (*qc & IEEE80211_QOS_CONTROL_A_MSDU_PRESENT)
+		if (*qc & IEEE80211_QOS_CTL_A_MSDU_PRESENT)
 			reserved += NET_IP_ALIGN;
 	}
 
diff --git a/drivers/net/wireless/libertas_tf/main.c b/drivers/net/wireless/libertas_tf/main.c
index d4005081f1df..2aa4de7cad24 100644
--- a/drivers/net/wireless/libertas_tf/main.c
+++ b/drivers/net/wireless/libertas_tf/main.c
@@ -585,7 +585,7 @@ int lbtf_rx(struct lbtf_private *priv, struct sk_buff *skb)
 	need_padding ^= ieee80211_has_a4(hdr->frame_control);
 	need_padding ^= ieee80211_is_data_qos(hdr->frame_control) &&
 			(*ieee80211_get_qos_ctl(hdr) &
-			 IEEE80211_QOS_CONTROL_A_MSDU_PRESENT);
+			 IEEE80211_QOS_CTL_A_MSDU_PRESENT);
 
 	if (need_padding) {
 		memmove(skb->data + 2, skb->data, skb->len);
diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index bf56b6f78270..a26108e4d924 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -117,8 +117,19 @@
 #define IEEE80211_MAX_MESH_ID_LEN	32
 
 #define IEEE80211_QOS_CTL_LEN		2
-#define IEEE80211_QOS_CTL_TID_MASK	0x000F
-#define IEEE80211_QOS_CTL_TAG1D_MASK	0x0007
+/* 1d tag mask */
+#define IEEE80211_QOS_CTL_TAG1D_MASK		0x0007
+/* TID mask */
+#define IEEE80211_QOS_CTL_TID_MASK		0x000f
+/* EOSP */
+#define IEEE80211_QOS_CTL_EOSP			0x0010
+/* ACK policy */
+#define IEEE80211_QOS_CTL_ACK_POLICY_NORMAL	0x0000
+#define IEEE80211_QOS_CTL_ACK_POLICY_NOACK	0x0020
+#define IEEE80211_QOS_CTL_ACK_POLICY_NO_EXPL	0x0040
+#define IEEE80211_QOS_CTL_ACK_POLICY_BLOCKACK	0x0060
+/* A-MSDU 802.11n */
+#define IEEE80211_QOS_CTL_A_MSDU_PRESENT	0x0080
 
 /* U-APSD queue for WMM IEs sent by AP */
 #define IEEE80211_WMM_IE_AP_QOSINFO_UAPSD	(1<<7)
@@ -1423,9 +1434,6 @@ enum ieee80211_sa_query_action {
 };
 
 
-/* A-MSDU 802.11n */
-#define IEEE80211_QOS_CONTROL_A_MSDU_PRESENT 0x0080
-
 /* cipher suite selectors */
 #define WLAN_CIPHER_SUITE_USE_GROUP	0x000FAC00
 #define WLAN_CIPHER_SUITE_WEP40		0x000FAC01
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 7fa8c6be7bf0..b5493ecd1e93 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -338,7 +338,7 @@ static void ieee80211_parse_qos(struct ieee80211_rx_data *rx)
 		u8 *qc = ieee80211_get_qos_ctl(hdr);
 		/* frame has qos control */
 		tid = *qc & IEEE80211_QOS_CTL_TID_MASK;
-		if (*qc & IEEE80211_QOS_CONTROL_A_MSDU_PRESENT)
+		if (*qc & IEEE80211_QOS_CTL_A_MSDU_PRESENT)
 			status->rx_flags |= IEEE80211_RX_AMSDU;
 	} else {
 		/*
diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c
index 28bc084dbfb9..7a49532f14cb 100644
--- a/net/mac80211/wme.c
+++ b/net/mac80211/wme.c
@@ -151,8 +151,7 @@ void ieee80211_set_qos_hdr(struct ieee80211_local *local, struct sk_buff *skb)
 		tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
 
 		if (unlikely(local->wifi_wme_noack_test))
-			ack_policy |= QOS_CONTROL_ACK_POLICY_NOACK <<
-					QOS_CONTROL_ACK_POLICY_SHIFT;
+			ack_policy |= IEEE80211_QOS_CTL_ACK_POLICY_NOACK;
 		/* qos header is 2 bytes, second reserved */
 		*p++ = ack_policy | tid;
 		*p = 0;
diff --git a/net/mac80211/wme.h b/net/mac80211/wme.h
index 6053b1c9feee..faead6d02026 100644
--- a/net/mac80211/wme.h
+++ b/net/mac80211/wme.h
@@ -13,11 +13,6 @@
 #include <linux/netdevice.h>
 #include "ieee80211_i.h"
 
-#define QOS_CONTROL_ACK_POLICY_NORMAL 0
-#define QOS_CONTROL_ACK_POLICY_NOACK 1
-
-#define QOS_CONTROL_ACK_POLICY_SHIFT 5
-
 extern const int ieee802_1d_to_ac[8];
 
 u16 ieee80211_select_queue(struct ieee80211_sub_if_data *sdata,
-- 
cgit v1.2.3


From 6d3321e8e2b3bf6a5892e2ef673c7bf536e3f904 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Thu, 23 Jun 2011 11:19:26 -0700
Subject: x86, mtrr: lock stop machine during MTRR rendezvous sequence

MTRR rendezvous sequence using stop_one_cpu_nowait() can potentially
happen in parallel with another system wide rendezvous using
stop_machine(). This can lead to deadlock (The order in which
works are queued can be different on different cpu's. Some cpu's
will be running the first rendezvous handler and others will be running
the second rendezvous handler. Each set waiting for the other set to join
for the system wide rendezvous, leading to a deadlock).

MTRR rendezvous sequence is not implemented using stop_machine() as this
gets called both from the process context aswell as the cpu online paths
(where the cpu has not come online and the interrupts are disabled etc).
stop_machine() works with only online cpus.

For now, take the stop_machine mutex in the MTRR rendezvous sequence that
gets called from an online cpu (here we are in the process context
and can potentially sleep while taking the mutex). And the MTRR rendezvous
that gets triggered during cpu online doesn't need to take this stop_machine
lock (as the stop_machine() already ensures that there is no cpu hotplug
going on in parallel by doing get_online_cpus())

    TBD: Pursue a cleaner solution of extending the stop_machine()
         infrastructure to handle the case where the calling cpu is
         still not online and use this for MTRR rendezvous sequence.

fixes: https://bugzilla.novell.com/show_bug.cgi?id=672008

Reported-by: Vadim Kotelnikov <vadimuzzz@inbox.ru>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Link: http://lkml.kernel.org/r/20110623182056.807230326@sbsiddha-MOBL3.sc.intel.com
Cc: stable@kernel.org # 2.6.35+, backport a week or two after this gets more testing in mainline
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/cpu/mtrr/main.c | 23 +++++++++++++++++++++++
 include/linux/stop_machine.h    |  2 ++
 kernel/stop_machine.c           |  2 +-
 3 files changed, 26 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 929739a653d1..3d17bc7f06e6 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -248,6 +248,25 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
 	unsigned long flags;
 	int cpu;
 
+#ifdef CONFIG_SMP
+	/*
+	 * If this cpu is not yet active, we are in the cpu online path. There
+	 * can be no stop_machine() in parallel, as stop machine ensures this
+	 * by using get_online_cpus(). We can skip taking the stop_cpus_mutex,
+	 * as we don't need it and also we can't afford to block while waiting
+	 * for the mutex.
+	 *
+	 * If this cpu is active, we need to prevent stop_machine() happening
+	 * in parallel by taking the stop cpus mutex.
+	 *
+	 * Also, this is called in the context of cpu online path or in the
+	 * context where cpu hotplug is prevented. So checking the active status
+	 * of the raw_smp_processor_id() is safe.
+	 */
+	if (cpu_active(raw_smp_processor_id()))
+		mutex_lock(&stop_cpus_mutex);
+#endif
+
 	preempt_disable();
 
 	data.smp_reg = reg;
@@ -330,6 +349,10 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
 
 	local_irq_restore(flags);
 	preempt_enable();
+#ifdef CONFIG_SMP
+	if (cpu_active(raw_smp_processor_id()))
+		mutex_unlock(&stop_cpus_mutex);
+#endif
 }
 
 /**
diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index 092dc9b1ce7d..14d3524d1274 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -27,6 +27,8 @@ struct cpu_stop_work {
 	struct cpu_stop_done	*done;
 };
 
+extern struct mutex stop_cpus_mutex;
+
 int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg);
 void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
 			 struct cpu_stop_work *work_buf);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index e3516b29076c..0cae1cc323dc 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -132,8 +132,8 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
 	cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), work_buf);
 }
 
+DEFINE_MUTEX(stop_cpus_mutex);
 /* static data for stop_cpus */
-static DEFINE_MUTEX(stop_cpus_mutex);
 static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work);
 
 int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
-- 
cgit v1.2.3


From 06c3df49521c1b112b777cc4946e5de057c814ba Mon Sep 17 00:00:00 2001
From: Jamie Iles <jamie@jamieiles.com>
Date: Mon, 6 Jun 2011 12:43:07 +0100
Subject: clocksource: apb: Share APB timer code with other platforms

The APB timers are an IP block from Synopsys (DesignWare APB timers)
and are also found in other systems including ARM SoC's.  This patch
adds functions for creating clock_event_devices and clocksources from
APB timers but does not do the resource allocation.  This is handled
in a higher layer to allow the timers to be created from multiple
methods such as platform_devices.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Jacob Pan <jacob.jun.pan@linux.intel.com>
Signed-off-by: Jamie Iles <jamie@jamieiles.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 arch/x86/Kconfig                   |   1 +
 arch/x86/include/asm/apb_timer.h   |  22 +-
 arch/x86/kernel/apb_timer.c        | 409 +++++++------------------------------
 drivers/clocksource/Kconfig        |   3 +
 drivers/clocksource/Makefile       |   1 +
 drivers/clocksource/dw_apb_timer.c | 401 ++++++++++++++++++++++++++++++++++++
 include/linux/dw_apb_timer.h       |  56 +++++
 7 files changed, 533 insertions(+), 360 deletions(-)
 create mode 100644 drivers/clocksource/dw_apb_timer.c
 create mode 100644 include/linux/dw_apb_timer.h

(limited to 'include/linux')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index da349723d411..10ce62f17ea1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -617,6 +617,7 @@ config HPET_EMULATE_RTC
 config APB_TIMER
        def_bool y if MRST
        prompt "Langwell APB Timer Support" if X86_MRST
+       select DW_APB_TIMER
        help
          APB timer is the replacement for 8254, HPET on X86 MID platforms.
          The APBT provides a stable time base on SMP
diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h
index 2fefa501d3ba..230000f124a7 100644
--- a/arch/x86/include/asm/apb_timer.h
+++ b/arch/x86/include/asm/apb_timer.h
@@ -18,24 +18,6 @@
 
 #ifdef CONFIG_APB_TIMER
 
-/* Langwell DW APB timer registers */
-#define APBTMR_N_LOAD_COUNT    0x00
-#define APBTMR_N_CURRENT_VALUE 0x04
-#define APBTMR_N_CONTROL       0x08
-#define APBTMR_N_EOI           0x0c
-#define APBTMR_N_INT_STATUS    0x10
-
-#define APBTMRS_INT_STATUS     0xa0
-#define APBTMRS_EOI            0xa4
-#define APBTMRS_RAW_INT_STATUS 0xa8
-#define APBTMRS_COMP_VERSION   0xac
-#define APBTMRS_REG_SIZE       0x14
-
-/* register bits */
-#define APBTMR_CONTROL_ENABLE  (1<<0)
-#define APBTMR_CONTROL_MODE_PERIODIC   (1<<1) /*1: periodic 0:free running */
-#define APBTMR_CONTROL_INT     (1<<2)
-
 /* default memory mapped register base */
 #define LNW_SCU_ADDR           0xFF100000
 #define LNW_EXT_TIMER_OFFSET   0x1B800
@@ -43,8 +25,8 @@
 #define LNW_EXT_TIMER_PGOFFSET         0x800
 
 /* APBT clock speed range from PCLK to fabric base, 25-100MHz */
-#define APBT_MAX_FREQ          50
-#define APBT_MIN_FREQ          1
+#define APBT_MAX_FREQ          50000000
+#define APBT_MIN_FREQ          1000000
 #define APBT_MMAP_SIZE         1024
 
 #define APBT_DEV_USED  1
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index 289e92862fd9..033d3ecc5744 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -27,15 +27,12 @@
  * timer, but by default APB timer has higher rating than local APIC timers.
  */
 
-#include <linux/clocksource.h>
-#include <linux/clockchips.h>
 #include <linux/delay.h>
+#include <linux/dw_apb_timer.h>
 #include <linux/errno.h>
 #include <linux/init.h>
-#include <linux/sysdev.h>
 #include <linux/slab.h>
 #include <linux/pm.h>
-#include <linux/pci.h>
 #include <linux/sfi.h>
 #include <linux/interrupt.h>
 #include <linux/cpu.h>
@@ -45,75 +42,46 @@
 #include <asm/apb_timer.h>
 #include <asm/mrst.h>
 
-#define APBT_MASK			CLOCKSOURCE_MASK(32)
-#define APBT_SHIFT			22
 #define APBT_CLOCKEVENT_RATING		110
 #define APBT_CLOCKSOURCE_RATING		250
-#define APBT_MIN_DELTA_USEC		200
 
-#define EVT_TO_APBT_DEV(evt) container_of(evt, struct apbt_dev, evt)
 #define APBT_CLOCKEVENT0_NUM   (0)
-#define APBT_CLOCKEVENT1_NUM   (1)
 #define APBT_CLOCKSOURCE_NUM   (2)
 
-static unsigned long apbt_address;
+static phys_addr_t apbt_address;
 static int apb_timer_block_enabled;
 static void __iomem *apbt_virt_address;
-static int phy_cs_timer_id;
 
 /*
  * Common DW APB timer info
  */
-static uint64_t apbt_freq;
-
-static void apbt_set_mode(enum clock_event_mode mode,
-			  struct clock_event_device *evt);
-static int apbt_next_event(unsigned long delta,
-			   struct clock_event_device *evt);
-static cycle_t apbt_read_clocksource(struct clocksource *cs);
-static void apbt_restart_clocksource(struct clocksource *cs);
+static unsigned long apbt_freq;
 
 struct apbt_dev {
-	struct clock_event_device evt;
-	unsigned int num;
-	int cpu;
-	unsigned int irq;
-	unsigned int tick;
-	unsigned int count;
-	unsigned int flags;
-	char name[10];
+	struct dw_apb_clock_event_device	*timer;
+	unsigned int				num;
+	int					cpu;
+	unsigned int				irq;
+	char					name[10];
 };
 
-static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev);
+static struct dw_apb_clocksource *clocksource_apbt;
 
-#ifdef CONFIG_SMP
-static unsigned int apbt_num_timers_used;
-static struct apbt_dev *apbt_devs;
-#endif
-
-static	inline unsigned long apbt_readl_reg(unsigned long a)
+static inline void __iomem *adev_virt_addr(struct apbt_dev *adev)
 {
-	return readl(apbt_virt_address + a);
+	return apbt_virt_address + adev->num * APBTMRS_REG_SIZE;
 }
 
-static inline void apbt_writel_reg(unsigned long d, unsigned long a)
-{
-	writel(d, apbt_virt_address + a);
-}
-
-static inline unsigned long apbt_readl(int n, unsigned long a)
-{
-	return readl(apbt_virt_address + a + n * APBTMRS_REG_SIZE);
-}
+static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev);
 
-static inline void apbt_writel(int n, unsigned long d, unsigned long a)
-{
-	writel(d, apbt_virt_address + a + n * APBTMRS_REG_SIZE);
-}
+#ifdef CONFIG_SMP
+static unsigned int apbt_num_timers_used;
+#endif
 
 static inline void apbt_set_mapping(void)
 {
 	struct sfi_timer_table_entry *mtmr;
+	int phy_cs_timer_id = 0;
 
 	if (apbt_virt_address) {
 		pr_debug("APBT base already mapped\n");
@@ -125,21 +93,18 @@ static inline void apbt_set_mapping(void)
 		       APBT_CLOCKEVENT0_NUM);
 		return;
 	}
-	apbt_address = (unsigned long)mtmr->phys_addr;
+	apbt_address = (phys_addr_t)mtmr->phys_addr;
 	if (!apbt_address) {
 		printk(KERN_WARNING "No timer base from SFI, use default\n");
 		apbt_address = APBT_DEFAULT_BASE;
 	}
 	apbt_virt_address = ioremap_nocache(apbt_address, APBT_MMAP_SIZE);
-	if (apbt_virt_address) {
-		pr_debug("Mapped APBT physical addr %p at virtual addr %p\n",\
-			 (void *)apbt_address, (void *)apbt_virt_address);
-	} else {
-		pr_debug("Failed mapping APBT phy address at %p\n",\
-			 (void *)apbt_address);
+	if (!apbt_virt_address) {
+		pr_debug("Failed mapping APBT phy address at %lu\n",\
+			 (unsigned long)apbt_address);
 		goto panic_noapbt;
 	}
-	apbt_freq = mtmr->freq_hz / USEC_PER_SEC;
+	apbt_freq = mtmr->freq_hz;
 	sfi_free_mtmr(mtmr);
 
 	/* Now figure out the physical timer id for clocksource device */
@@ -148,9 +113,14 @@ static inline void apbt_set_mapping(void)
 		goto panic_noapbt;
 
 	/* Now figure out the physical timer id */
-	phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff)
-		/ APBTMRS_REG_SIZE;
-	pr_debug("Use timer %d for clocksource\n", phy_cs_timer_id);
+	pr_debug("Use timer %d for clocksource\n",
+		 (int)(mtmr->phys_addr & 0xff) / APBTMRS_REG_SIZE);
+	phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff) /
+		APBTMRS_REG_SIZE;
+
+	clocksource_apbt = dw_apb_clocksource_init(APBT_CLOCKSOURCE_RATING,
+		"apbt0", apbt_virt_address + phy_cs_timer_id *
+		APBTMRS_REG_SIZE, apbt_freq);
 	return;
 
 panic_noapbt:
@@ -172,82 +142,6 @@ static inline int is_apbt_capable(void)
 	return apbt_virt_address ? 1 : 0;
 }
 
-static struct clocksource clocksource_apbt = {
-	.name		= "apbt",
-	.rating		= APBT_CLOCKSOURCE_RATING,
-	.read		= apbt_read_clocksource,
-	.mask		= APBT_MASK,
-	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
-	.resume		= apbt_restart_clocksource,
-};
-
-/* boot APB clock event device */
-static struct clock_event_device apbt_clockevent = {
-	.name		= "apbt0",
-	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
-	.set_mode	= apbt_set_mode,
-	.set_next_event = apbt_next_event,
-	.shift		= APBT_SHIFT,
-	.irq		= 0,
-	.rating		= APBT_CLOCKEVENT_RATING,
-};
-
-/*
- * start count down from 0xffff_ffff. this is done by toggling the enable bit
- * then load initial load count to ~0.
- */
-static void apbt_start_counter(int n)
-{
-	unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
-
-	ctrl &= ~APBTMR_CONTROL_ENABLE;
-	apbt_writel(n, ctrl, APBTMR_N_CONTROL);
-	apbt_writel(n, ~0, APBTMR_N_LOAD_COUNT);
-	/* enable, mask interrupt */
-	ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC;
-	ctrl |= (APBTMR_CONTROL_ENABLE | APBTMR_CONTROL_INT);
-	apbt_writel(n, ctrl, APBTMR_N_CONTROL);
-	/* read it once to get cached counter value initialized */
-	apbt_read_clocksource(&clocksource_apbt);
-}
-
-static irqreturn_t apbt_interrupt_handler(int irq, void *data)
-{
-	struct apbt_dev *dev = (struct apbt_dev *)data;
-	struct clock_event_device *aevt = &dev->evt;
-
-	if (!aevt->event_handler) {
-		printk(KERN_INFO "Spurious APBT timer interrupt on %d\n",
-		       dev->num);
-		return IRQ_NONE;
-	}
-	aevt->event_handler(aevt);
-	return IRQ_HANDLED;
-}
-
-static void apbt_restart_clocksource(struct clocksource *cs)
-{
-	apbt_start_counter(phy_cs_timer_id);
-}
-
-static void apbt_enable_int(int n)
-{
-	unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
-	/* clear pending intr */
-	apbt_readl(n, APBTMR_N_EOI);
-	ctrl &= ~APBTMR_CONTROL_INT;
-	apbt_writel(n, ctrl, APBTMR_N_CONTROL);
-}
-
-static void apbt_disable_int(int n)
-{
-	unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
-
-	ctrl |= APBTMR_CONTROL_INT;
-	apbt_writel(n, ctrl, APBTMR_N_CONTROL);
-}
-
-
 static int __init apbt_clockevent_register(void)
 {
 	struct sfi_timer_table_entry *mtmr;
@@ -260,45 +154,21 @@ static int __init apbt_clockevent_register(void)
 		return -ENODEV;
 	}
 
-	/*
-	 * We need to calculate the scaled math multiplication factor for
-	 * nanosecond to apbt tick conversion.
-	 * mult = (nsec/cycle)*2^APBT_SHIFT
-	 */
-	apbt_clockevent.mult = div_sc((unsigned long) mtmr->freq_hz
-				      , NSEC_PER_SEC, APBT_SHIFT);
-
-	/* Calculate the min / max delta */
-	apbt_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
-							   &apbt_clockevent);
-	apbt_clockevent.min_delta_ns = clockevent_delta2ns(
-		APBT_MIN_DELTA_USEC*apbt_freq,
-		&apbt_clockevent);
-	/*
-	 * Start apbt with the boot cpu mask and make it
-	 * global if not used for per cpu timer.
-	 */
-	apbt_clockevent.cpumask = cpumask_of(smp_processor_id());
 	adev->num = smp_processor_id();
-	memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device));
+	adev->timer = dw_apb_clockevent_init(smp_processor_id(), "apbt0",
+		mrst_timer_options == MRST_TIMER_LAPIC_APBT ?
+		APBT_CLOCKEVENT_RATING - 100 : APBT_CLOCKEVENT_RATING,
+		adev_virt_addr(adev), 0, apbt_freq);
+	/* Firmware does EOI handling for us. */
+	adev->timer->eoi = NULL;
 
 	if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
-		adev->evt.rating = APBT_CLOCKEVENT_RATING - 100;
-		global_clock_event = &adev->evt;
+		global_clock_event = &adev->timer->ced;
 		printk(KERN_DEBUG "%s clockevent registered as global\n",
 		       global_clock_event->name);
 	}
 
-	if (request_irq(apbt_clockevent.irq, apbt_interrupt_handler,
-			IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
-			apbt_clockevent.name, adev)) {
-		printk(KERN_ERR "Failed request IRQ for APBT%d\n",
-		       apbt_clockevent.irq);
-	}
-
-	clockevents_register_device(&adev->evt);
-	/* Start APBT 0 interrupts */
-	apbt_enable_int(APBT_CLOCKEVENT0_NUM);
+	dw_apb_clockevent_register(adev->timer);
 
 	sfi_free_mtmr(mtmr);
 	return 0;
@@ -316,52 +186,34 @@ static void apbt_setup_irq(struct apbt_dev *adev)
 	irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
 	/* APB timer irqs are set up as mp_irqs, timer is edge type */
 	__irq_set_handler(adev->irq, handle_edge_irq, 0, "edge");
-
-	if (system_state == SYSTEM_BOOTING) {
-		if (request_irq(adev->irq, apbt_interrupt_handler,
-					IRQF_TIMER | IRQF_DISABLED |
-					IRQF_NOBALANCING,
-					adev->name, adev)) {
-			printk(KERN_ERR "Failed request IRQ for APBT%d\n",
-			       adev->num);
-		}
-	} else
-		enable_irq(adev->irq);
 }
 
 /* Should be called with per cpu */
 void apbt_setup_secondary_clock(void)
 {
 	struct apbt_dev *adev;
-	struct clock_event_device *aevt;
 	int cpu;
 
 	/* Don't register boot CPU clockevent */
 	cpu = smp_processor_id();
 	if (!cpu)
 		return;
-	/*
-	 * We need to calculate the scaled math multiplication factor for
-	 * nanosecond to apbt tick conversion.
-	 * mult = (nsec/cycle)*2^APBT_SHIFT
-	 */
-	printk(KERN_INFO "Init per CPU clockevent %d\n", cpu);
-	adev = &per_cpu(cpu_apbt_dev, cpu);
-	aevt = &adev->evt;
 
-	memcpy(aevt, &apbt_clockevent, sizeof(*aevt));
-	aevt->cpumask = cpumask_of(cpu);
-	aevt->name = adev->name;
-	aevt->mode = CLOCK_EVT_MODE_UNUSED;
+	adev = &__get_cpu_var(cpu_apbt_dev);
+	if (!adev->timer) {
+		adev->timer = dw_apb_clockevent_init(cpu, adev->name,
+			APBT_CLOCKEVENT_RATING, adev_virt_addr(adev),
+			adev->irq, apbt_freq);
+		adev->timer->eoi = NULL;
+	} else {
+		dw_apb_clockevent_resume(adev->timer);
+	}
 
-	printk(KERN_INFO "Registering CPU %d clockevent device %s, mask %08x\n",
-	       cpu, aevt->name, *(u32 *)aevt->cpumask);
+	printk(KERN_INFO "Registering CPU %d clockevent device %s, cpu %08x\n",
+	       cpu, adev->name, adev->cpu);
 
 	apbt_setup_irq(adev);
-
-	clockevents_register_device(aevt);
-
-	apbt_enable_int(cpu);
+	dw_apb_clockevent_register(adev->timer);
 
 	return;
 }
@@ -384,13 +236,12 @@ static int apbt_cpuhp_notify(struct notifier_block *n,
 
 	switch (action & 0xf) {
 	case CPU_DEAD:
-		disable_irq(adev->irq);
-		apbt_disable_int(cpu);
+		dw_apb_clockevent_pause(adev->timer);
 		if (system_state == SYSTEM_RUNNING) {
 			pr_debug("skipping APBT CPU %lu offline\n", cpu);
 		} else if (adev) {
 			pr_debug("APBT clockevent for cpu %lu offline\n", cpu);
-			free_irq(adev->irq, adev);
+			dw_apb_clockevent_stop(adev->timer);
 		}
 		break;
 	default:
@@ -415,116 +266,16 @@ void apbt_setup_secondary_clock(void) {}
 
 #endif /* CONFIG_SMP */
 
-static void apbt_set_mode(enum clock_event_mode mode,
-			  struct clock_event_device *evt)
-{
-	unsigned long ctrl;
-	uint64_t delta;
-	int timer_num;
-	struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
-
-	BUG_ON(!apbt_virt_address);
-
-	timer_num = adev->num;
-	pr_debug("%s CPU %d timer %d mode=%d\n",
-		 __func__, first_cpu(*evt->cpumask), timer_num, mode);
-
-	switch (mode) {
-	case CLOCK_EVT_MODE_PERIODIC:
-		delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * apbt_clockevent.mult;
-		delta >>= apbt_clockevent.shift;
-		ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
-		ctrl |= APBTMR_CONTROL_MODE_PERIODIC;
-		apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
-		/*
-		 * DW APB p. 46, have to disable timer before load counter,
-		 * may cause sync problem.
-		 */
-		ctrl &= ~APBTMR_CONTROL_ENABLE;
-		apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
-		udelay(1);
-		pr_debug("Setting clock period %d for HZ %d\n", (int)delta, HZ);
-		apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT);
-		ctrl |= APBTMR_CONTROL_ENABLE;
-		apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
-		break;
-		/* APB timer does not have one-shot mode, use free running mode */
-	case CLOCK_EVT_MODE_ONESHOT:
-		ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
-		/*
-		 * set free running mode, this mode will let timer reload max
-		 * timeout which will give time (3min on 25MHz clock) to rearm
-		 * the next event, therefore emulate the one-shot mode.
-		 */
-		ctrl &= ~APBTMR_CONTROL_ENABLE;
-		ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC;
-
-		apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
-		/* write again to set free running mode */
-		apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
-
-		/*
-		 * DW APB p. 46, load counter with all 1s before starting free
-		 * running mode.
-		 */
-		apbt_writel(timer_num, ~0, APBTMR_N_LOAD_COUNT);
-		ctrl &= ~APBTMR_CONTROL_INT;
-		ctrl |= APBTMR_CONTROL_ENABLE;
-		apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
-		break;
-
-	case CLOCK_EVT_MODE_UNUSED:
-	case CLOCK_EVT_MODE_SHUTDOWN:
-		apbt_disable_int(timer_num);
-		ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
-		ctrl &= ~APBTMR_CONTROL_ENABLE;
-		apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
-		break;
-
-	case CLOCK_EVT_MODE_RESUME:
-		apbt_enable_int(timer_num);
-		break;
-	}
-}
-
-static int apbt_next_event(unsigned long delta,
-			   struct clock_event_device *evt)
-{
-	unsigned long ctrl;
-	int timer_num;
-
-	struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
-
-	timer_num = adev->num;
-	/* Disable timer */
-	ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
-	ctrl &= ~APBTMR_CONTROL_ENABLE;
-	apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
-	/* write new count */
-	apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT);
-	ctrl |= APBTMR_CONTROL_ENABLE;
-	apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
-	return 0;
-}
-
-static cycle_t apbt_read_clocksource(struct clocksource *cs)
-{
-	unsigned long current_count;
-
-	current_count = apbt_readl(phy_cs_timer_id, APBTMR_N_CURRENT_VALUE);
-	return (cycle_t)~current_count;
-}
-
 static int apbt_clocksource_register(void)
 {
 	u64 start, now;
 	cycle_t t1;
 
 	/* Start the counter, use timer 2 as source, timer 0/1 for event */
-	apbt_start_counter(phy_cs_timer_id);
+	dw_apb_clocksource_start(clocksource_apbt);
 
 	/* Verify whether apbt counter works */
-	t1 = apbt_read_clocksource(&clocksource_apbt);
+	t1 = dw_apb_clocksource_read(clocksource_apbt);
 	rdtscll(start);
 
 	/*
@@ -539,10 +290,10 @@ static int apbt_clocksource_register(void)
 	} while ((now - start) < 200000UL);
 
 	/* APBT is the only always on clocksource, it has to work! */
-	if (t1 == apbt_read_clocksource(&clocksource_apbt))
+	if (t1 == dw_apb_clocksource_read(clocksource_apbt))
 		panic("APBT counter not counting. APBT disabled\n");
 
-	clocksource_register_khz(&clocksource_apbt, (u32)apbt_freq*1000);
+	dw_apb_clocksource_register(clocksource_apbt);
 
 	return 0;
 }
@@ -566,10 +317,7 @@ void __init apbt_time_init(void)
 	if (apb_timer_block_enabled)
 		return;
 	apbt_set_mapping();
-	if (apbt_virt_address) {
-		pr_debug("Found APBT version 0x%lx\n",\
-			 apbt_readl_reg(APBTMRS_COMP_VERSION));
-	} else
+	if (!apbt_virt_address)
 		goto out_noapbt;
 	/*
 	 * Read the frequency and check for a sane value, for ESL model
@@ -577,7 +325,7 @@ void __init apbt_time_init(void)
 	 */
 
 	if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) {
-		pr_debug("APBT has invalid freq 0x%llx\n", apbt_freq);
+		pr_debug("APBT has invalid freq 0x%lx\n", apbt_freq);
 		goto out_noapbt;
 	}
 	if (apbt_clocksource_register()) {
@@ -603,30 +351,20 @@ void __init apbt_time_init(void)
 	} else {
 		percpu_timer = 0;
 		apbt_num_timers_used = 1;
-		adev = &per_cpu(cpu_apbt_dev, 0);
-		adev->flags &= ~APBT_DEV_USED;
 	}
 	pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used);
 
 	/* here we set up per CPU timer data structure */
-	apbt_devs = kzalloc(sizeof(struct apbt_dev) * apbt_num_timers_used,
-			    GFP_KERNEL);
-	if (!apbt_devs) {
-		printk(KERN_ERR "Failed to allocate APB timer devices\n");
-		return;
-	}
 	for (i = 0; i < apbt_num_timers_used; i++) {
 		adev = &per_cpu(cpu_apbt_dev, i);
 		adev->num = i;
 		adev->cpu = i;
 		p_mtmr = sfi_get_mtmr(i);
-		if (p_mtmr) {
-			adev->tick = p_mtmr->freq_hz;
+		if (p_mtmr)
 			adev->irq = p_mtmr->irq;
-		} else
+		else
 			printk(KERN_ERR "Failed to get timer for cpu %d\n", i);
-		adev->count = 0;
-		sprintf(adev->name, "apbt%d", i);
+		snprintf(adev->name, sizeof(adev->name) - 1, "apbt%d", i);
 	}
 #endif
 
@@ -638,17 +376,8 @@ out_noapbt:
 	panic("failed to enable APB timer\n");
 }
 
-static inline void apbt_disable(int n)
-{
-	if (is_apbt_capable()) {
-		unsigned long ctrl =  apbt_readl(n, APBTMR_N_CONTROL);
-		ctrl &= ~APBTMR_CONTROL_ENABLE;
-		apbt_writel(n, ctrl, APBTMR_N_CONTROL);
-	}
-}
-
 /* called before apb_timer_enable, use early map */
-unsigned long apbt_quick_calibrate()
+unsigned long apbt_quick_calibrate(void)
 {
 	int i, scale;
 	u64 old, new;
@@ -657,31 +386,31 @@ unsigned long apbt_quick_calibrate()
 	u32 loop, shift;
 
 	apbt_set_mapping();
-	apbt_start_counter(phy_cs_timer_id);
+	dw_apb_clocksource_start(clocksource_apbt);
 
 	/* check if the timer can count down, otherwise return */
-	old = apbt_read_clocksource(&clocksource_apbt);
+	old = dw_apb_clocksource_read(clocksource_apbt);
 	i = 10000;
 	while (--i) {
-		if (old != apbt_read_clocksource(&clocksource_apbt))
+		if (old != dw_apb_clocksource_read(clocksource_apbt))
 			break;
 	}
 	if (!i)
 		goto failed;
 
 	/* count 16 ms */
-	loop = (apbt_freq * 1000) << 4;
+	loop = (apbt_freq / 1000) << 4;
 
 	/* restart the timer to ensure it won't get to 0 in the calibration */
-	apbt_start_counter(phy_cs_timer_id);
+	dw_apb_clocksource_start(clocksource_apbt);
 
-	old = apbt_read_clocksource(&clocksource_apbt);
+	old = dw_apb_clocksource_read(clocksource_apbt);
 	old += loop;
 
 	t1 = __native_read_tsc();
 
 	do {
-		new = apbt_read_clocksource(&clocksource_apbt);
+		new = dw_apb_clocksource_read(clocksource_apbt);
 	} while (new < old);
 
 	t2 = __native_read_tsc();
@@ -693,7 +422,7 @@ unsigned long apbt_quick_calibrate()
 		return 0;
 	}
 	scale = (int)div_u64((t2 - t1), loop >> shift);
-	khz = (scale * apbt_freq * 1000) >> shift;
+	khz = (scale * (apbt_freq / 1000)) >> shift;
 	printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz);
 	return khz;
 failed:
diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig
index 96c921910469..141417fcce30 100644
--- a/drivers/clocksource/Kconfig
+++ b/drivers/clocksource/Kconfig
@@ -3,3 +3,6 @@ config CLKSRC_I8253
 
 config CLKSRC_MMIO
 	bool
+
+config DW_APB_TIMER
+	bool
diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile
index b995942a5060..4c3d8fda136a 100644
--- a/drivers/clocksource/Makefile
+++ b/drivers/clocksource/Makefile
@@ -8,3 +8,4 @@ obj-$(CONFIG_SH_TIMER_MTU2)	+= sh_mtu2.o
 obj-$(CONFIG_SH_TIMER_TMU)	+= sh_tmu.o
 obj-$(CONFIG_CLKSRC_I8253)	+= i8253.o
 obj-$(CONFIG_CLKSRC_MMIO)	+= mmio.o
+obj-$(CONFIG_DW_APB_TIMER)	+= dw_apb_timer.o
diff --git a/drivers/clocksource/dw_apb_timer.c b/drivers/clocksource/dw_apb_timer.c
new file mode 100644
index 000000000000..580f870541a3
--- /dev/null
+++ b/drivers/clocksource/dw_apb_timer.c
@@ -0,0 +1,401 @@
+/*
+ * (C) Copyright 2009 Intel Corporation
+ * Author: Jacob Pan (jacob.jun.pan@intel.com)
+ *
+ * Shared with ARM platforms, Jamie Iles, Picochip 2011
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Support for the Synopsys DesignWare APB Timers.
+ */
+#include <linux/dw_apb_timer.h>
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+
+#define APBT_MIN_PERIOD			4
+#define APBT_MIN_DELTA_USEC		200
+
+#define APBTMR_N_LOAD_COUNT		0x00
+#define APBTMR_N_CURRENT_VALUE		0x04
+#define APBTMR_N_CONTROL		0x08
+#define APBTMR_N_EOI			0x0c
+#define APBTMR_N_INT_STATUS		0x10
+
+#define APBTMRS_INT_STATUS		0xa0
+#define APBTMRS_EOI			0xa4
+#define APBTMRS_RAW_INT_STATUS		0xa8
+#define APBTMRS_COMP_VERSION		0xac
+
+#define APBTMR_CONTROL_ENABLE		(1 << 0)
+/* 1: periodic, 0:free running. */
+#define APBTMR_CONTROL_MODE_PERIODIC	(1 << 1)
+#define APBTMR_CONTROL_INT		(1 << 2)
+
+static inline struct dw_apb_clock_event_device *
+ced_to_dw_apb_ced(struct clock_event_device *evt)
+{
+	return container_of(evt, struct dw_apb_clock_event_device, ced);
+}
+
+static inline struct dw_apb_clocksource *
+clocksource_to_dw_apb_clocksource(struct clocksource *cs)
+{
+	return container_of(cs, struct dw_apb_clocksource, cs);
+}
+
+static unsigned long apbt_readl(struct dw_apb_timer *timer, unsigned long offs)
+{
+	return readl(timer->base + offs);
+}
+
+static void apbt_writel(struct dw_apb_timer *timer, unsigned long val,
+		 unsigned long offs)
+{
+	writel(val, timer->base + offs);
+}
+
+static void apbt_disable_int(struct dw_apb_timer *timer)
+{
+	unsigned long ctrl = apbt_readl(timer, APBTMR_N_CONTROL);
+
+	ctrl |= APBTMR_CONTROL_INT;
+	apbt_writel(timer, ctrl, APBTMR_N_CONTROL);
+}
+
+/**
+ * dw_apb_clockevent_pause() - stop the clock_event_device from running
+ *
+ * @dw_ced:	The APB clock to stop generating events.
+ */
+void dw_apb_clockevent_pause(struct dw_apb_clock_event_device *dw_ced)
+{
+	disable_irq(dw_ced->timer.irq);
+	apbt_disable_int(&dw_ced->timer);
+}
+
+static void apbt_eoi(struct dw_apb_timer *timer)
+{
+	apbt_readl(timer, APBTMR_N_EOI);
+}
+
+static irqreturn_t dw_apb_clockevent_irq(int irq, void *data)
+{
+	struct clock_event_device *evt = data;
+	struct dw_apb_clock_event_device *dw_ced = ced_to_dw_apb_ced(evt);
+
+	if (!evt->event_handler) {
+		pr_info("Spurious APBT timer interrupt %d", irq);
+		return IRQ_NONE;
+	}
+
+	if (dw_ced->eoi)
+		dw_ced->eoi(&dw_ced->timer);
+
+	evt->event_handler(evt);
+	return IRQ_HANDLED;
+}
+
+static void apbt_enable_int(struct dw_apb_timer *timer)
+{
+	unsigned long ctrl = apbt_readl(timer, APBTMR_N_CONTROL);
+	/* clear pending intr */
+	apbt_readl(timer, APBTMR_N_EOI);
+	ctrl &= ~APBTMR_CONTROL_INT;
+	apbt_writel(timer, ctrl, APBTMR_N_CONTROL);
+}
+
+static void apbt_set_mode(enum clock_event_mode mode,
+			  struct clock_event_device *evt)
+{
+	unsigned long ctrl;
+	unsigned long period;
+	struct dw_apb_clock_event_device *dw_ced = ced_to_dw_apb_ced(evt);
+
+	pr_debug("%s CPU %d mode=%d\n", __func__, first_cpu(*evt->cpumask),
+		 mode);
+
+	switch (mode) {
+	case CLOCK_EVT_MODE_PERIODIC:
+		period = DIV_ROUND_UP(dw_ced->timer.freq, HZ);
+		ctrl = apbt_readl(&dw_ced->timer, APBTMR_N_CONTROL);
+		ctrl |= APBTMR_CONTROL_MODE_PERIODIC;
+		apbt_writel(&dw_ced->timer, ctrl, APBTMR_N_CONTROL);
+		/*
+		 * DW APB p. 46, have to disable timer before load counter,
+		 * may cause sync problem.
+		 */
+		ctrl &= ~APBTMR_CONTROL_ENABLE;
+		apbt_writel(&dw_ced->timer, ctrl, APBTMR_N_CONTROL);
+		udelay(1);
+		pr_debug("Setting clock period %lu for HZ %d\n", period, HZ);
+		apbt_writel(&dw_ced->timer, period, APBTMR_N_LOAD_COUNT);
+		ctrl |= APBTMR_CONTROL_ENABLE;
+		apbt_writel(&dw_ced->timer, ctrl, APBTMR_N_CONTROL);
+		break;
+
+	case CLOCK_EVT_MODE_ONESHOT:
+		ctrl = apbt_readl(&dw_ced->timer, APBTMR_N_CONTROL);
+		/*
+		 * set free running mode, this mode will let timer reload max
+		 * timeout which will give time (3min on 25MHz clock) to rearm
+		 * the next event, therefore emulate the one-shot mode.
+		 */
+		ctrl &= ~APBTMR_CONTROL_ENABLE;
+		ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC;
+
+		apbt_writel(&dw_ced->timer, ctrl, APBTMR_N_CONTROL);
+		/* write again to set free running mode */
+		apbt_writel(&dw_ced->timer, ctrl, APBTMR_N_CONTROL);
+
+		/*
+		 * DW APB p. 46, load counter with all 1s before starting free
+		 * running mode.
+		 */
+		apbt_writel(&dw_ced->timer, ~0, APBTMR_N_LOAD_COUNT);
+		ctrl &= ~APBTMR_CONTROL_INT;
+		ctrl |= APBTMR_CONTROL_ENABLE;
+		apbt_writel(&dw_ced->timer, ctrl, APBTMR_N_CONTROL);
+		break;
+
+	case CLOCK_EVT_MODE_UNUSED:
+	case CLOCK_EVT_MODE_SHUTDOWN:
+		ctrl = apbt_readl(&dw_ced->timer, APBTMR_N_CONTROL);
+		ctrl &= ~APBTMR_CONTROL_ENABLE;
+		apbt_writel(&dw_ced->timer, ctrl, APBTMR_N_CONTROL);
+		break;
+
+	case CLOCK_EVT_MODE_RESUME:
+		apbt_enable_int(&dw_ced->timer);
+		break;
+	}
+}
+
+static int apbt_next_event(unsigned long delta,
+			   struct clock_event_device *evt)
+{
+	unsigned long ctrl;
+	struct dw_apb_clock_event_device *dw_ced = ced_to_dw_apb_ced(evt);
+
+	/* Disable timer */
+	ctrl = apbt_readl(&dw_ced->timer, APBTMR_N_CONTROL);
+	ctrl &= ~APBTMR_CONTROL_ENABLE;
+	apbt_writel(&dw_ced->timer, ctrl, APBTMR_N_CONTROL);
+	/* write new count */
+	apbt_writel(&dw_ced->timer, delta, APBTMR_N_LOAD_COUNT);
+	ctrl |= APBTMR_CONTROL_ENABLE;
+	apbt_writel(&dw_ced->timer, ctrl, APBTMR_N_CONTROL);
+
+	return 0;
+}
+
+/**
+ * dw_apb_clockevent_init() - use an APB timer as a clock_event_device
+ *
+ * @cpu:	The CPU the events will be targeted at.
+ * @name:	The name used for the timer and the IRQ for it.
+ * @rating:	The rating to give the timer.
+ * @base:	I/O base for the timer registers.
+ * @irq:	The interrupt number to use for the timer.
+ * @freq:	The frequency that the timer counts at.
+ *
+ * This creates a clock_event_device for using with the generic clock layer
+ * but does not start and register it.  This should be done with
+ * dw_apb_clockevent_register() as the next step.  If this is the first time
+ * it has been called for a timer then the IRQ will be requested, if not it
+ * just be enabled to allow CPU hotplug to avoid repeatedly requesting and
+ * releasing the IRQ.
+ */
+struct dw_apb_clock_event_device *
+dw_apb_clockevent_init(int cpu, const char *name, unsigned rating,
+		       void __iomem *base, int irq, unsigned long freq)
+{
+	struct dw_apb_clock_event_device *dw_ced =
+		kzalloc(sizeof(*dw_ced), GFP_KERNEL);
+	int err;
+
+	if (!dw_ced)
+		return NULL;
+
+	dw_ced->timer.base = base;
+	dw_ced->timer.irq = irq;
+	dw_ced->timer.freq = freq;
+
+	clockevents_calc_mult_shift(&dw_ced->ced, freq, APBT_MIN_PERIOD);
+	dw_ced->ced.max_delta_ns = clockevent_delta2ns(0x7fffffff,
+						       &dw_ced->ced);
+	dw_ced->ced.min_delta_ns = clockevent_delta2ns(5000, &dw_ced->ced);
+	dw_ced->ced.cpumask = cpumask_of(cpu);
+	dw_ced->ced.features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT;
+	dw_ced->ced.set_mode = apbt_set_mode;
+	dw_ced->ced.set_next_event = apbt_next_event;
+	dw_ced->ced.irq = dw_ced->timer.irq;
+	dw_ced->ced.rating = rating;
+	dw_ced->ced.name = name;
+
+	dw_ced->irqaction.name		= dw_ced->ced.name;
+	dw_ced->irqaction.handler	= dw_apb_clockevent_irq;
+	dw_ced->irqaction.dev_id	= &dw_ced->ced;
+	dw_ced->irqaction.irq		= irq;
+	dw_ced->irqaction.flags		= IRQF_TIMER | IRQF_IRQPOLL |
+					  IRQF_NOBALANCING |
+					  IRQF_DISABLED;
+
+	dw_ced->eoi = apbt_eoi;
+	err = setup_irq(irq, &dw_ced->irqaction);
+	if (err) {
+		pr_err("failed to request timer irq\n");
+		kfree(dw_ced);
+		dw_ced = NULL;
+	}
+
+	return dw_ced;
+}
+
+/**
+ * dw_apb_clockevent_resume() - resume a clock that has been paused.
+ *
+ * @dw_ced:	The APB clock to resume.
+ */
+void dw_apb_clockevent_resume(struct dw_apb_clock_event_device *dw_ced)
+{
+	enable_irq(dw_ced->timer.irq);
+}
+
+/**
+ * dw_apb_clockevent_stop() - stop the clock_event_device and release the IRQ.
+ *
+ * @dw_ced:	The APB clock to stop generating the events.
+ */
+void dw_apb_clockevent_stop(struct dw_apb_clock_event_device *dw_ced)
+{
+	free_irq(dw_ced->timer.irq, &dw_ced->ced);
+}
+
+/**
+ * dw_apb_clockevent_register() - register the clock with the generic layer
+ *
+ * @dw_ced:	The APB clock to register as a clock_event_device.
+ */
+void dw_apb_clockevent_register(struct dw_apb_clock_event_device *dw_ced)
+{
+	apbt_writel(&dw_ced->timer, 0, APBTMR_N_CONTROL);
+	clockevents_register_device(&dw_ced->ced);
+	apbt_enable_int(&dw_ced->timer);
+}
+
+/**
+ * dw_apb_clocksource_start() - start the clocksource counting.
+ *
+ * @dw_cs:	The clocksource to start.
+ *
+ * This is used to start the clocksource before registration and can be used
+ * to enable calibration of timers.
+ */
+void dw_apb_clocksource_start(struct dw_apb_clocksource *dw_cs)
+{
+	/*
+	 * start count down from 0xffff_ffff. this is done by toggling the
+	 * enable bit then load initial load count to ~0.
+	 */
+	unsigned long ctrl = apbt_readl(&dw_cs->timer, APBTMR_N_CONTROL);
+
+	ctrl &= ~APBTMR_CONTROL_ENABLE;
+	apbt_writel(&dw_cs->timer, ctrl, APBTMR_N_CONTROL);
+	apbt_writel(&dw_cs->timer, ~0, APBTMR_N_LOAD_COUNT);
+	/* enable, mask interrupt */
+	ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC;
+	ctrl |= (APBTMR_CONTROL_ENABLE | APBTMR_CONTROL_INT);
+	apbt_writel(&dw_cs->timer, ctrl, APBTMR_N_CONTROL);
+	/* read it once to get cached counter value initialized */
+	dw_apb_clocksource_read(dw_cs);
+}
+
+static cycle_t __apbt_read_clocksource(struct clocksource *cs)
+{
+	unsigned long current_count;
+	struct dw_apb_clocksource *dw_cs =
+		clocksource_to_dw_apb_clocksource(cs);
+
+	current_count = apbt_readl(&dw_cs->timer, APBTMR_N_CURRENT_VALUE);
+
+	return (cycle_t)~current_count;
+}
+
+static void apbt_restart_clocksource(struct clocksource *cs)
+{
+	struct dw_apb_clocksource *dw_cs =
+		clocksource_to_dw_apb_clocksource(cs);
+
+	dw_apb_clocksource_start(dw_cs);
+}
+
+/**
+ * dw_apb_clocksource_init() - use an APB timer as a clocksource.
+ *
+ * @rating:	The rating to give the clocksource.
+ * @name:	The name for the clocksource.
+ * @base:	The I/O base for the timer registers.
+ * @freq:	The frequency that the timer counts at.
+ *
+ * This creates a clocksource using an APB timer but does not yet register it
+ * with the clocksource system.  This should be done with
+ * dw_apb_clocksource_register() as the next step.
+ */
+struct dw_apb_clocksource *
+dw_apb_clocksource_init(unsigned rating, char *name, void __iomem *base,
+			unsigned long freq)
+{
+	struct dw_apb_clocksource *dw_cs = kzalloc(sizeof(*dw_cs), GFP_KERNEL);
+
+	if (!dw_cs)
+		return NULL;
+
+	dw_cs->timer.base = base;
+	dw_cs->timer.freq = freq;
+	dw_cs->cs.name = name;
+	dw_cs->cs.rating = rating;
+	dw_cs->cs.read = __apbt_read_clocksource;
+	dw_cs->cs.mask = CLOCKSOURCE_MASK(32);
+	dw_cs->cs.flags = CLOCK_SOURCE_IS_CONTINUOUS;
+	dw_cs->cs.resume = apbt_restart_clocksource;
+
+	return dw_cs;
+}
+
+/**
+ * dw_apb_clocksource_register() - register the APB clocksource.
+ *
+ * @dw_cs:	The clocksource to register.
+ */
+void dw_apb_clocksource_register(struct dw_apb_clocksource *dw_cs)
+{
+	clocksource_register_hz(&dw_cs->cs, dw_cs->timer.freq);
+}
+
+/**
+ * dw_apb_clocksource_read() - read the current value of a clocksource.
+ *
+ * @dw_cs:	The clocksource to read.
+ */
+cycle_t dw_apb_clocksource_read(struct dw_apb_clocksource *dw_cs)
+{
+	return (cycle_t)~apbt_readl(&dw_cs->timer, APBTMR_N_CURRENT_VALUE);
+}
+
+/**
+ * dw_apb_clocksource_unregister() - unregister and free a clocksource.
+ *
+ * @dw_cs:	The clocksource to unregister/free.
+ */
+void dw_apb_clocksource_unregister(struct dw_apb_clocksource *dw_cs)
+{
+	clocksource_unregister(&dw_cs->cs);
+
+	kfree(dw_cs);
+}
diff --git a/include/linux/dw_apb_timer.h b/include/linux/dw_apb_timer.h
new file mode 100644
index 000000000000..49638ea3b776
--- /dev/null
+++ b/include/linux/dw_apb_timer.h
@@ -0,0 +1,56 @@
+/*
+ * (C) Copyright 2009 Intel Corporation
+ * Author: Jacob Pan (jacob.jun.pan@intel.com)
+ *
+ * Shared with ARM platforms, Jamie Iles, Picochip 2011
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Support for the Synopsys DesignWare APB Timers.
+ */
+#ifndef __DW_APB_TIMER_H__
+#define __DW_APB_TIMER_H__
+
+#include <linux/clockchips.h>
+#include <linux/clocksource.h>
+#include <linux/interrupt.h>
+
+#define APBTMRS_REG_SIZE       0x14
+
+struct dw_apb_timer {
+	void __iomem				*base;
+	unsigned long				freq;
+	int					irq;
+};
+
+struct dw_apb_clock_event_device {
+	struct clock_event_device		ced;
+	struct dw_apb_timer			timer;
+	struct irqaction			irqaction;
+	void					(*eoi)(struct dw_apb_timer *);
+};
+
+struct dw_apb_clocksource {
+	struct dw_apb_timer			timer;
+	struct clocksource			cs;
+};
+
+void dw_apb_clockevent_register(struct dw_apb_clock_event_device *dw_ced);
+void dw_apb_clockevent_pause(struct dw_apb_clock_event_device *dw_ced);
+void dw_apb_clockevent_resume(struct dw_apb_clock_event_device *dw_ced);
+void dw_apb_clockevent_stop(struct dw_apb_clock_event_device *dw_ced);
+
+struct dw_apb_clock_event_device *
+dw_apb_clockevent_init(int cpu, const char *name, unsigned rating,
+		       void __iomem *base, int irq, unsigned long freq);
+struct dw_apb_clocksource *
+dw_apb_clocksource_init(unsigned rating, char *name, void __iomem *base,
+			unsigned long freq);
+void dw_apb_clocksource_register(struct dw_apb_clocksource *dw_cs);
+void dw_apb_clocksource_start(struct dw_apb_clocksource *dw_cs);
+cycle_t dw_apb_clocksource_read(struct dw_apb_clocksource *dw_cs);
+void dw_apb_clocksource_unregister(struct dw_apb_clocksource *dw_cs);
+
+#endif /* __DW_APB_TIMER_H__ */
-- 
cgit v1.2.3


From f740e6cd0cb5e7468e46831aeb4d9c30e03d5ebc Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 23 Jun 2011 11:19:28 -0700
Subject: stop_machine: implement stop_machine_from_inactive_cpu()

Currently, mtrr wants stop_machine functionality while a CPU is being
brought up.  As stop_machine() requires the calling CPU to be active,
mtrr implements its own stop_machine using stop_one_cpu() on each
online CPU.  This doesn't only unnecessarily duplicate complex logic
but also introduces a possibility of deadlock when it races against
the generic stop_machine().

This patch implements stop_machine_from_inactive_cpu() to serve such
use cases.  Its functionality is basically the same as stop_machine();
however, it should be called from a CPU which isn't active and doesn't
depend on working scheduling on the calling CPU.

This is achieved by using busy loops for synchronization and
open-coding stop_cpus queuing and waiting with direct invocation of
fn() for local CPU inbetween.

Signed-off-by: Tejun Heo <tj@kernel.org>
Link: http://lkml.kernel.org/r/20110623182056.982526827@sbsiddha-MOBL3.sc.intel.com
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 include/linux/stop_machine.h | 14 ++++++++--
 kernel/stop_machine.c        | 62 +++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 73 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index 14d3524d1274..e0f2da25d751 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -126,15 +126,19 @@ int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus);
  */
 int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus);
 
+int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data,
+				   const struct cpumask *cpus);
+
 #else	 /* CONFIG_STOP_MACHINE && CONFIG_SMP */
 
 static inline int __stop_machine(int (*fn)(void *), void *data,
 				 const struct cpumask *cpus)
 {
+	unsigned long flags;
 	int ret;
-	local_irq_disable();
+	local_irq_save(flags);
 	ret = fn(data);
-	local_irq_enable();
+	local_irq_restore(flags);
 	return ret;
 }
 
@@ -144,5 +148,11 @@ static inline int stop_machine(int (*fn)(void *), void *data,
 	return __stop_machine(fn, data, cpus);
 }
 
+static inline int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data,
+						 const struct cpumask *cpus)
+{
+	return __stop_machine(fn, data, cpus);
+}
+
 #endif	/* CONFIG_STOP_MACHINE && CONFIG_SMP */
 #endif	/* _LINUX_STOP_MACHINE */
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 4c89ee9fc56b..e8f05b14cd43 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -439,8 +439,15 @@ static int stop_machine_cpu_stop(void *data)
 	struct stop_machine_data *smdata = data;
 	enum stopmachine_state curstate = STOPMACHINE_NONE;
 	int cpu = smp_processor_id(), err = 0;
+	unsigned long flags;
 	bool is_active;
 
+	/*
+	 * When called from stop_machine_from_inactive_cpu(), irq might
+	 * already be disabled.  Save the state and restore it on exit.
+	 */
+	local_save_flags(flags);
+
 	if (!smdata->active_cpus)
 		is_active = cpu == cpumask_first(cpu_online_mask);
 	else
@@ -468,7 +475,7 @@ static int stop_machine_cpu_stop(void *data)
 		}
 	} while (curstate != STOPMACHINE_EXIT);
 
-	local_irq_enable();
+	local_irq_restore(flags);
 	return err;
 }
 
@@ -495,4 +502,57 @@ int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
 }
 EXPORT_SYMBOL_GPL(stop_machine);
 
+/**
+ * stop_machine_from_inactive_cpu - stop_machine() from inactive CPU
+ * @fn: the function to run
+ * @data: the data ptr for the @fn()
+ * @cpus: the cpus to run the @fn() on (NULL = any online cpu)
+ *
+ * This is identical to stop_machine() but can be called from a CPU which
+ * is not active.  The local CPU is in the process of hotplug (so no other
+ * CPU hotplug can start) and not marked active and doesn't have enough
+ * context to sleep.
+ *
+ * This function provides stop_machine() functionality for such state by
+ * using busy-wait for synchronization and executing @fn directly for local
+ * CPU.
+ *
+ * CONTEXT:
+ * Local CPU is inactive.  Temporarily stops all active CPUs.
+ *
+ * RETURNS:
+ * 0 if all executions of @fn returned 0, any non zero return value if any
+ * returned non zero.
+ */
+int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data,
+				  const struct cpumask *cpus)
+{
+	struct stop_machine_data smdata = { .fn = fn, .data = data,
+					    .active_cpus = cpus };
+	struct cpu_stop_done done;
+	int ret;
+
+	/* Local CPU must be inactive and CPU hotplug in progress. */
+	BUG_ON(cpu_active(raw_smp_processor_id()));
+	smdata.num_threads = num_active_cpus() + 1;	/* +1 for local */
+
+	/* No proper task established and can't sleep - busy wait for lock. */
+	while (!mutex_trylock(&stop_cpus_mutex))
+		cpu_relax();
+
+	/* Schedule work on other CPUs and execute directly for local CPU */
+	set_state(&smdata, STOPMACHINE_PREPARE);
+	cpu_stop_init_done(&done, num_active_cpus());
+	queue_stop_cpus_work(cpu_active_mask, stop_machine_cpu_stop, &smdata,
+			     &done);
+	ret = stop_machine_cpu_stop(&smdata);
+
+	/* Busy wait for completion. */
+	while (!completion_done(&done.completion))
+		cpu_relax();
+
+	mutex_unlock(&stop_cpus_mutex);
+	return ret ?: done.ret;
+}
+
 #endif	/* CONFIG_STOP_MACHINE */
-- 
cgit v1.2.3


From 192d8857427dd23707d5f0b86ca990c3af6f2d74 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Thu, 23 Jun 2011 11:19:29 -0700
Subject: x86, mtrr: use stop_machine APIs for doing MTRR rendezvous

MTRR rendezvous sequence is not implemened using stop_machine() before, as this
gets called both from the process context aswell as the cpu online paths
(where the cpu has not come online and the interrupts are disabled etc).

Now that we have a new stop_machine_from_inactive_cpu() API, use it for
rendezvous during mtrr init of a logical processor that is coming online.

For the rest (runtime MTRR modification, system boot, resume paths), use
stop_machine() to implement the rendezvous sequence. This will consolidate and
cleanup the code.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Link: http://lkml.kernel.org/r/20110623182057.076997177@sbsiddha-MOBL3.sc.intel.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/cpu/mtrr/main.c | 192 +++++++++-------------------------------
 include/linux/stop_machine.h    |   2 -
 kernel/stop_machine.c           |   2 +-
 3 files changed, 42 insertions(+), 154 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 3d17bc7f06e6..707b6377adf1 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -137,55 +137,43 @@ static void __init init_table(void)
 }
 
 struct set_mtrr_data {
-	atomic_t	count;
-	atomic_t	gate;
 	unsigned long	smp_base;
 	unsigned long	smp_size;
 	unsigned int	smp_reg;
 	mtrr_type	smp_type;
 };
 
-static DEFINE_PER_CPU(struct cpu_stop_work, mtrr_work);
-
 /**
- * mtrr_work_handler - Synchronisation handler. Executed by "other" CPUs.
+ * mtrr_rendezvous_handler - Work done in the synchronization handler. Executed
+ * by all the CPUs.
  * @info: pointer to mtrr configuration data
  *
  * Returns nothing.
  */
-static int mtrr_work_handler(void *info)
+static int mtrr_rendezvous_handler(void *info)
 {
 #ifdef CONFIG_SMP
 	struct set_mtrr_data *data = info;
-	unsigned long flags;
-
-	atomic_dec(&data->count);
-	while (!atomic_read(&data->gate))
-		cpu_relax();
-
-	local_irq_save(flags);
-
-	atomic_dec(&data->count);
-	while (atomic_read(&data->gate))
-		cpu_relax();
 
-	/*  The master has cleared me to execute  */
+	/*
+	 * We use this same function to initialize the mtrrs during boot,
+	 * resume, runtime cpu online and on an explicit request to set a
+	 * specific MTRR.
+	 *
+	 * During boot or suspend, the state of the boot cpu's mtrrs has been
+	 * saved, and we want to replicate that across all the cpus that come
+	 * online (either at the end of boot or resume or during a runtime cpu
+	 * online). If we're doing that, @reg is set to something special and on
+	 * all the cpu's we do mtrr_if->set_all() (On the logical cpu that
+	 * started the boot/resume sequence, this might be a duplicate
+	 * set_all()).
+	 */
 	if (data->smp_reg != ~0U) {
 		mtrr_if->set(data->smp_reg, data->smp_base,
 			     data->smp_size, data->smp_type);
-	} else if (mtrr_aps_delayed_init) {
-		/*
-		 * Initialize the MTRRs inaddition to the synchronisation.
-		 */
+	} else if (mtrr_aps_delayed_init || !cpu_online(smp_processor_id())) {
 		mtrr_if->set_all();
 	}
-
-	atomic_dec(&data->count);
-	while (!atomic_read(&data->gate))
-		cpu_relax();
-
-	atomic_dec(&data->count);
-	local_irq_restore(flags);
 #endif
 	return 0;
 }
@@ -223,20 +211,11 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
  * 14. Wait for buddies to catch up
  * 15. Enable interrupts.
  *
- * What does that mean for us? Well, first we set data.count to the number
- * of CPUs. As each CPU announces that it started the rendezvous handler by
- * decrementing the count, We reset data.count and set the data.gate flag
- * allowing all the cpu's to proceed with the work. As each cpu disables
- * interrupts, it'll decrement data.count once. We wait until it hits 0 and
- * proceed. We clear the data.gate flag and reset data.count. Meanwhile, they
- * are waiting for that flag to be cleared. Once it's cleared, each
- * CPU goes through the transition of updating MTRRs.
- * The CPU vendors may each do it differently,
- * so we call mtrr_if->set() callback and let them take care of it.
- * When they're done, they again decrement data->count and wait for data.gate
- * to be set.
- * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag
- * Everyone then enables interrupts and we all continue on.
+ * What does that mean for us? Well, stop_machine() will ensure that
+ * the rendezvous handler is started on each CPU. And in lockstep they
+ * do the state transition of disabling interrupts, updating MTRR's
+ * (the CPU vendors may each do it differently, so we call mtrr_if->set()
+ * callback and let them take care of it.) and enabling interrupts.
  *
  * Note that the mechanism is the same for UP systems, too; all the SMP stuff
  * becomes nops.
@@ -244,115 +223,26 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
 static void
 set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type)
 {
-	struct set_mtrr_data data;
-	unsigned long flags;
-	int cpu;
-
-#ifdef CONFIG_SMP
-	/*
-	 * If this cpu is not yet active, we are in the cpu online path. There
-	 * can be no stop_machine() in parallel, as stop machine ensures this
-	 * by using get_online_cpus(). We can skip taking the stop_cpus_mutex,
-	 * as we don't need it and also we can't afford to block while waiting
-	 * for the mutex.
-	 *
-	 * If this cpu is active, we need to prevent stop_machine() happening
-	 * in parallel by taking the stop cpus mutex.
-	 *
-	 * Also, this is called in the context of cpu online path or in the
-	 * context where cpu hotplug is prevented. So checking the active status
-	 * of the raw_smp_processor_id() is safe.
-	 */
-	if (cpu_active(raw_smp_processor_id()))
-		mutex_lock(&stop_cpus_mutex);
-#endif
-
-	preempt_disable();
-
-	data.smp_reg = reg;
-	data.smp_base = base;
-	data.smp_size = size;
-	data.smp_type = type;
-	atomic_set(&data.count, num_booting_cpus() - 1);
-
-	/* Make sure data.count is visible before unleashing other CPUs */
-	smp_wmb();
-	atomic_set(&data.gate, 0);
-
-	/* Start the ball rolling on other CPUs */
-	for_each_online_cpu(cpu) {
-		struct cpu_stop_work *work = &per_cpu(mtrr_work, cpu);
-
-		if (cpu == smp_processor_id())
-			continue;
+	struct set_mtrr_data data = { .smp_reg = reg,
+				      .smp_base = base,
+				      .smp_size = size,
+				      .smp_type = type
+				    };
 
-		stop_one_cpu_nowait(cpu, mtrr_work_handler, &data, work);
-	}
-
-
-	while (atomic_read(&data.count))
-		cpu_relax();
-
-	/* Ok, reset count and toggle gate */
-	atomic_set(&data.count, num_booting_cpus() - 1);
-	smp_wmb();
-	atomic_set(&data.gate, 1);
-
-	local_irq_save(flags);
-
-	while (atomic_read(&data.count))
-		cpu_relax();
-
-	/* Ok, reset count and toggle gate */
-	atomic_set(&data.count, num_booting_cpus() - 1);
-	smp_wmb();
-	atomic_set(&data.gate, 0);
-
-	/* Do our MTRR business */
-
-	/*
-	 * HACK!
-	 *
-	 * We use this same function to initialize the mtrrs during boot,
-	 * resume, runtime cpu online and on an explicit request to set a
-	 * specific MTRR.
-	 *
-	 * During boot or suspend, the state of the boot cpu's mtrrs has been
-	 * saved, and we want to replicate that across all the cpus that come
-	 * online (either at the end of boot or resume or during a runtime cpu
-	 * online). If we're doing that, @reg is set to something special and on
-	 * this cpu we still do mtrr_if->set_all(). During boot/resume, this
-	 * is unnecessary if at this point we are still on the cpu that started
-	 * the boot/resume sequence. But there is no guarantee that we are still
-	 * on the same cpu. So we do mtrr_if->set_all() on this cpu aswell to be
-	 * sure that we are in sync with everyone else.
-	 */
-	if (reg != ~0U)
-		mtrr_if->set(reg, base, size, type);
-	else
-		mtrr_if->set_all();
-
-	/* Wait for the others */
-	while (atomic_read(&data.count))
-		cpu_relax();
-
-	atomic_set(&data.count, num_booting_cpus() - 1);
-	smp_wmb();
-	atomic_set(&data.gate, 1);
-
-	/*
-	 * Wait here for everyone to have seen the gate change
-	 * So we're the last ones to touch 'data'
-	 */
-	while (atomic_read(&data.count))
-		cpu_relax();
+	stop_machine(mtrr_rendezvous_handler, &data, cpu_online_mask);
+}
 
-	local_irq_restore(flags);
-	preempt_enable();
-#ifdef CONFIG_SMP
-	if (cpu_active(raw_smp_processor_id()))
-		mutex_unlock(&stop_cpus_mutex);
-#endif
+static void set_mtrr_from_inactive_cpu(unsigned int reg, unsigned long base,
+				      unsigned long size, mtrr_type type)
+{
+	struct set_mtrr_data data = { .smp_reg = reg,
+				      .smp_base = base,
+				      .smp_size = size,
+				      .smp_type = type
+				    };
+
+	stop_machine_from_inactive_cpu(mtrr_rendezvous_handler, &data,
+				       cpu_callout_mask);
 }
 
 /**
@@ -806,7 +696,7 @@ void mtrr_ap_init(void)
 	 *   2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug
 	 *      lock to prevent mtrr entry changes
 	 */
-	set_mtrr(~0U, 0, 0, 0);
+	set_mtrr_from_inactive_cpu(~0U, 0, 0, 0);
 }
 
 /**
diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index e0f2da25d751..4a9d0c7edc65 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -27,8 +27,6 @@ struct cpu_stop_work {
 	struct cpu_stop_done	*done;
 };
 
-extern struct mutex stop_cpus_mutex;
-
 int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg);
 void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
 			 struct cpu_stop_work *work_buf);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index e8f05b14cd43..c1124752e1d3 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -132,8 +132,8 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
 	cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), work_buf);
 }
 
-DEFINE_MUTEX(stop_cpus_mutex);
 /* static data for stop_cpus */
+static DEFINE_MUTEX(stop_cpus_mutex);
 static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work);
 
 static void queue_stop_cpus_work(const struct cpumask *cpumask,
-- 
cgit v1.2.3


From a6b0919140b49e0871584362ae0cf1d18c476058 Mon Sep 17 00:00:00 2001
From: John Bonesio <bones@secretlab.ca>
Date: Mon, 27 Jun 2011 16:49:57 -0700
Subject: of/gpio: Add new method for getting gpios under different property
 names

This patch adds a new routine, of_get_named_gpio_flags(), which takes the
property name as a parameter rather than assuming "gpios".

of_get_gpio_flags() is modified to call of_get_named_gpio_flags() with "gpios"
as the property parameter.

Signed-off-by: John Bonesio <bones@secretlab.ca>
[grant.likely: Tidied up whitespace and tweaked kerneldoc comments.]
Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
---
 drivers/of/gpio.c       | 11 ++++++-----
 include/linux/of_gpio.h | 42 +++++++++++++++++++++++++++++++++++++-----
 2 files changed, 43 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/of/gpio.c b/drivers/of/gpio.c
index 905960338fb2..3007662ac614 100644
--- a/drivers/of/gpio.c
+++ b/drivers/of/gpio.c
@@ -21,8 +21,9 @@
 #include <linux/slab.h>
 
 /**
- * of_get_gpio_flags - Get a GPIO number and flags to use with GPIO API
+ * of_get_named_gpio_flags() - Get a GPIO number and flags to use with GPIO API
  * @np:		device node to get GPIO from
+ * @propname:	property name containing gpio specifier(s)
  * @index:	index of the GPIO
  * @flags:	a flags pointer to fill in
  *
@@ -30,8 +31,8 @@
  * value on the error condition. If @flags is not NULL the function also fills
  * in flags for the GPIO.
  */
-int of_get_gpio_flags(struct device_node *np, int index,
-		      enum of_gpio_flags *flags)
+int of_get_named_gpio_flags(struct device_node *np, const char *propname,
+                           int index, enum of_gpio_flags *flags)
 {
 	int ret;
 	struct device_node *gpio_np;
@@ -40,7 +41,7 @@ int of_get_gpio_flags(struct device_node *np, int index,
 	const void *gpio_spec;
 	const __be32 *gpio_cells;
 
-	ret = of_parse_phandles_with_args(np, "gpios", "#gpio-cells", index,
+	ret = of_parse_phandles_with_args(np, propname, "#gpio-cells", index,
 					  &gpio_np, &gpio_spec);
 	if (ret) {
 		pr_debug("%s: can't parse gpios property\n", __func__);
@@ -79,7 +80,7 @@ err0:
 	pr_debug("%s exited with status %d\n", __func__, ret);
 	return ret;
 }
-EXPORT_SYMBOL(of_get_gpio_flags);
+EXPORT_SYMBOL(of_get_named_gpio_flags);
 
 /**
  * of_gpio_count - Count GPIOs for a device
diff --git a/include/linux/of_gpio.h b/include/linux/of_gpio.h
index 6598c04dab01..aec8025c786a 100644
--- a/include/linux/of_gpio.h
+++ b/include/linux/of_gpio.h
@@ -46,8 +46,9 @@ static inline struct of_mm_gpio_chip *to_of_mm_gpio_chip(struct gpio_chip *gc)
 	return container_of(gc, struct of_mm_gpio_chip, gc);
 }
 
-extern int of_get_gpio_flags(struct device_node *np, int index,
-			     enum of_gpio_flags *flags);
+extern int of_get_named_gpio_flags(struct device_node *np,
+		const char *list_name, int index, enum of_gpio_flags *flags);
+
 extern unsigned int of_gpio_count(struct device_node *np);
 
 extern int of_mm_gpiochip_add(struct device_node *np,
@@ -60,8 +61,8 @@ extern struct gpio_chip *of_node_to_gpiochip(struct device_node *np);
 #else /* CONFIG_OF_GPIO */
 
 /* Drivers may not strictly depend on the GPIO support, so let them link. */
-static inline int of_get_gpio_flags(struct device_node *np, int index,
-				    enum of_gpio_flags *flags)
+static inline int of_get_named_gpio_flags(struct device_node *np,
+		const char *list_name, int index, enum of_gpio_flags *flags)
 {
 	return -ENOSYS;
 }
@@ -77,7 +78,38 @@ static inline void of_gpiochip_remove(struct gpio_chip *gc) { }
 #endif /* CONFIG_OF_GPIO */
 
 /**
- * of_get_gpio - Get a GPIO number to use with GPIO API
+ * of_get_gpio_flags() - Get a GPIO number and flags to use with GPIO API
+ * @np:		device node to get GPIO from
+ * @index:	index of the GPIO
+ * @flags:	a flags pointer to fill in
+ *
+ * Returns GPIO number to use with Linux generic GPIO API, or one of the errno
+ * value on the error condition. If @flags is not NULL the function also fills
+ * in flags for the GPIO.
+ */
+static inline int of_get_gpio_flags(struct device_node *np, int index,
+		      enum of_gpio_flags *flags)
+{
+	return of_get_named_gpio_flags(np, "gpios", index, flags);
+}
+
+/**
+ * of_get_named_gpio() - Get a GPIO number to use with GPIO API
+ * @np:		device node to get GPIO from
+ * @propname:	Name of property containing gpio specifier(s)
+ * @index:	index of the GPIO
+ *
+ * Returns GPIO number to use with Linux generic GPIO API, or one of the errno
+ * value on the error condition.
+ */
+static inline int of_get_named_gpio(struct device_node *np,
+                                   const char *propname, int index)
+{
+	return of_get_named_gpio_flags(np, propname, index, NULL);
+}
+
+/**
+ * of_get_gpio() - Get a GPIO number to use with GPIO API
  * @np:		device node to get GPIO from
  * @index:	index of the GPIO
  *
-- 
cgit v1.2.3


From f457a46f179df41b0f6d80dee33b6e629945f276 Mon Sep 17 00:00:00 2001
From: Mike Christie <michaelc@cs.wisc.edu>
Date: Fri, 24 Jun 2011 15:11:53 -0500
Subject: [SCSI] iscsi_ibft, be2iscsi, iscsi_boot: fix boot kobj data lifetime
 management

be2iscsi passes the boot functions its phba object which is
allocated in the shost, but iscsi_ibft passes in a object
allocated for each item to display. The problem is that
iscsi_boot_sysfs was managing the lifetime of the object
passed in and doing a kfree on release. This causes a double
free for be2iscsi which frees the shost in its pci_remove.

This patch fixes the problem by adding a release callback
which the drivers can call kfree or a put() type of function
(needed for be2iscsi which will do a get/put on the shost).

Signed-off-by: Mike Christie <michaelc@cs.wisc.edu>
Signed-off-by: James Bottomley <JBottomley@Parallels.com>
---
 drivers/firmware/iscsi_ibft.c    |  14 ++-
 drivers/scsi/be2iscsi/be_main.c  | 190 ++++++++++++++++++++-------------------
 drivers/scsi/iscsi_boot_sysfs.c  |  28 ++++--
 include/linux/iscsi_boot_sysfs.h |  16 +++-
 4 files changed, 142 insertions(+), 106 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/iscsi_ibft.c b/drivers/firmware/iscsi_ibft.c
index ce33f4626957..c811cb107904 100644
--- a/drivers/firmware/iscsi_ibft.c
+++ b/drivers/firmware/iscsi_ibft.c
@@ -566,6 +566,11 @@ static mode_t __init ibft_check_initiator_for(void *data, int type)
 	return rc;
 }
 
+static void ibft_kobj_release(void *data)
+{
+	kfree(data);
+}
+
 /*
  * Helper function for ibft_register_kobjects.
  */
@@ -595,7 +600,8 @@ static int __init ibft_create_kobject(struct acpi_table_ibft *header,
 		boot_kobj = iscsi_boot_create_initiator(boot_kset, hdr->index,
 						ibft_kobj,
 						ibft_attr_show_initiator,
-						ibft_check_initiator_for);
+						ibft_check_initiator_for,
+						ibft_kobj_release);
 		if (!boot_kobj) {
 			rc = -ENOMEM;
 			goto free_ibft_obj;
@@ -610,7 +616,8 @@ static int __init ibft_create_kobject(struct acpi_table_ibft *header,
 		boot_kobj = iscsi_boot_create_ethernet(boot_kset, hdr->index,
 						       ibft_kobj,
 						       ibft_attr_show_nic,
-						       ibft_check_nic_for);
+						       ibft_check_nic_for,
+						       ibft_kobj_release);
 		if (!boot_kobj) {
 			rc = -ENOMEM;
 			goto free_ibft_obj;
@@ -625,7 +632,8 @@ static int __init ibft_create_kobject(struct acpi_table_ibft *header,
 		boot_kobj = iscsi_boot_create_target(boot_kset, hdr->index,
 						     ibft_kobj,
 						     ibft_attr_show_target,
-						     ibft_check_tgt_for);
+						     ibft_check_tgt_for,
+						     ibft_kobj_release);
 		if (!boot_kobj) {
 			rc = -ENOMEM;
 			goto free_ibft_obj;
diff --git a/drivers/scsi/be2iscsi/be_main.c b/drivers/scsi/be2iscsi/be_main.c
index 2e214390c63b..0a9bdfa3d939 100644
--- a/drivers/scsi/be2iscsi/be_main.c
+++ b/drivers/scsi/be2iscsi/be_main.c
@@ -215,73 +215,62 @@ unlock:
 static ssize_t beiscsi_show_boot_tgt_info(void *data, int type, char *buf)
 {
 	struct beiscsi_hba *phba = data;
+	struct mgmt_session_info *boot_sess = &phba->boot_sess;
+	struct mgmt_conn_info *boot_conn = &boot_sess->conn_list[0];
 	char *str = buf;
 	int rc;
 
 	switch (type) {
 	case ISCSI_BOOT_TGT_NAME:
 		rc = sprintf(buf, "%.*s\n",
-				(int)strlen(phba->boot_sess.target_name),
-				(char *)&phba->boot_sess.target_name);
+			    (int)strlen(boot_sess->target_name),
+			    (char *)&boot_sess->target_name);
 		break;
 	case ISCSI_BOOT_TGT_IP_ADDR:
-		if (phba->boot_sess.conn_list[0].dest_ipaddr.ip_type == 0x1)
+		if (boot_conn->dest_ipaddr.ip_type == 0x1)
 			rc = sprintf(buf, "%pI4\n",
-				(char *)&phba->boot_sess.conn_list[0].
-				dest_ipaddr.ip_address);
+				(char *)&boot_conn->dest_ipaddr.ip_address);
 		else
 			rc = sprintf(str, "%pI6\n",
-				(char *)&phba->boot_sess.conn_list[0].
-				dest_ipaddr.ip_address);
+				(char *)&boot_conn->dest_ipaddr.ip_address);
 		break;
 	case ISCSI_BOOT_TGT_PORT:
-		rc = sprintf(str, "%d\n", phba->boot_sess.conn_list[0].
-				  dest_port);
+		rc = sprintf(str, "%d\n", boot_conn->dest_port);
 		break;
 
 	case ISCSI_BOOT_TGT_CHAP_NAME:
 		rc = sprintf(str,  "%.*s\n",
-				      phba->boot_sess.conn_list[0].
-				      negotiated_login_options.auth_data.chap.
-				      target_chap_name_length,
-				      (char *)&phba->boot_sess.conn_list[0].
-				      negotiated_login_options.auth_data.chap.
-				      target_chap_name);
+			     boot_conn->negotiated_login_options.auth_data.chap.
+			     target_chap_name_length,
+			     (char *)&boot_conn->negotiated_login_options.
+			     auth_data.chap.target_chap_name);
 		break;
 	case ISCSI_BOOT_TGT_CHAP_SECRET:
 		rc = sprintf(str,  "%.*s\n",
-				      phba->boot_sess.conn_list[0].
-				      negotiated_login_options.auth_data.chap.
-				      target_secret_length,
-				      (char *)&phba->boot_sess.conn_list[0].
-				      negotiated_login_options.auth_data.chap.
-				      target_secret);
-
+			     boot_conn->negotiated_login_options.auth_data.chap.
+			     target_secret_length,
+			     (char *)&boot_conn->negotiated_login_options.
+			     auth_data.chap.target_secret);
 		break;
 	case ISCSI_BOOT_TGT_REV_CHAP_NAME:
 		rc = sprintf(str,  "%.*s\n",
-				      phba->boot_sess.conn_list[0].
-				      negotiated_login_options.auth_data.chap.
-				      intr_chap_name_length,
-				      (char *)&phba->boot_sess.conn_list[0].
-				      negotiated_login_options.auth_data.chap.
-				      intr_chap_name);
-
+			     boot_conn->negotiated_login_options.auth_data.chap.
+			     intr_chap_name_length,
+			     (char *)&boot_conn->negotiated_login_options.
+			     auth_data.chap.intr_chap_name);
 		break;
 	case ISCSI_BOOT_TGT_REV_CHAP_SECRET:
-			rc = sprintf(str,  "%.*s\n",
-				      phba->boot_sess.conn_list[0].
-				      negotiated_login_options.auth_data.chap.
-				      intr_secret_length,
-				      (char *)&phba->boot_sess.conn_list[0].
-				      negotiated_login_options.auth_data.chap.
-				      intr_secret);
+		rc = sprintf(str,  "%.*s\n",
+			     boot_conn->negotiated_login_options.auth_data.chap.
+			     intr_secret_length,
+			     (char *)&boot_conn->negotiated_login_options.
+			     auth_data.chap.intr_secret);
 		break;
 	case ISCSI_BOOT_TGT_FLAGS:
-			rc = sprintf(str, "2\n");
+		rc = sprintf(str, "2\n");
 		break;
 	case ISCSI_BOOT_TGT_NIC_ASSOC:
-			rc = sprintf(str, "0\n");
+		rc = sprintf(str, "0\n");
 		break;
 	default:
 		rc = -ENOSYS;
@@ -315,10 +304,10 @@ static ssize_t beiscsi_show_boot_eth_info(void *data, int type, char *buf)
 
 	switch (type) {
 	case ISCSI_BOOT_ETH_FLAGS:
-			rc = sprintf(str, "2\n");
+		rc = sprintf(str, "2\n");
 		break;
 	case ISCSI_BOOT_ETH_INDEX:
-			rc = sprintf(str, "0\n");
+		rc = sprintf(str, "0\n");
 		break;
 	case ISCSI_BOOT_ETH_MAC:
 		rc  = beiscsi_get_macaddr(buf, phba);
@@ -391,39 +380,6 @@ static mode_t beiscsi_eth_get_attr_visibility(void *data, int type)
 	return rc;
 }
 
-static int beiscsi_setup_boot_info(struct beiscsi_hba *phba)
-{
-	struct iscsi_boot_kobj *boot_kobj;
-
-	phba->boot_kset = iscsi_boot_create_host_kset(phba->shost->host_no);
-	if (!phba->boot_kset)
-		return -ENOMEM;
-
-	/* get boot info using mgmt cmd */
-	boot_kobj = iscsi_boot_create_target(phba->boot_kset, 0, phba,
-					     beiscsi_show_boot_tgt_info,
-					     beiscsi_tgt_get_attr_visibility);
-	if (!boot_kobj)
-		goto free_kset;
-
-	boot_kobj = iscsi_boot_create_initiator(phba->boot_kset, 0, phba,
-					     beiscsi_show_boot_ini_info,
-					     beiscsi_ini_get_attr_visibility);
-	if (!boot_kobj)
-		goto free_kset;
-
-	boot_kobj = iscsi_boot_create_ethernet(phba->boot_kset, 0, phba,
-					     beiscsi_show_boot_eth_info,
-					     beiscsi_eth_get_attr_visibility);
-	if (!boot_kobj)
-		goto free_kset;
-	return 0;
-
-free_kset:
-	iscsi_boot_destroy_kset(phba->boot_kset);
-	return -ENOMEM;
-}
-
 /*------------------- PCI Driver operations and data ----------------- */
 static DEFINE_PCI_DEVICE_TABLE(beiscsi_pci_id_table) = {
 	{ PCI_DEVICE(BE_VENDOR_ID, BE_DEVICE_ID1) },
@@ -482,14 +438,6 @@ static struct beiscsi_hba *beiscsi_hba_alloc(struct pci_dev *pcidev)
 	if (iscsi_host_add(shost, &phba->pcidev->dev))
 		goto free_devices;
 
-	if (beiscsi_setup_boot_info(phba))
-		/*
-		 * log error but continue, because we may not be using
-		 * iscsi boot.
-		 */
-		shost_printk(KERN_ERR, phba->shost, "Could not set up "
-		"iSCSI boot info.");
-
 	return phba;
 
 free_devices:
@@ -3510,6 +3458,7 @@ static int beiscsi_get_boot_info(struct beiscsi_hba *phba)
 	unsigned int tag, wrb_num;
 	unsigned short status, extd_status;
 	struct be_queue_info *mccq = &phba->ctrl.mcc_obj.q;
+	int ret = -ENOMEM;
 
 	tag = beiscsi_get_boot_target(phba);
 	if (!tag) {
@@ -3534,8 +3483,7 @@ static int beiscsi_get_boot_info(struct beiscsi_hba *phba)
 	boot_resp = embedded_payload(wrb);
 
 	if (boot_resp->boot_session_handle < 0) {
-		printk(KERN_ERR "No Boot Session for this pci_func,"
-			"session Hndl = %d\n", boot_resp->boot_session_handle);
+		shost_printk(KERN_INFO, phba->shost, "No Boot Session.\n");
 		return -ENXIO;
 	}
 
@@ -3573,14 +3521,70 @@ static int beiscsi_get_boot_info(struct beiscsi_hba *phba)
 	wrb = queue_get_wrb(mccq, wrb_num);
 	free_mcc_tag(&phba->ctrl, tag);
 	session_resp = nonemb_cmd.va ;
+
 	memcpy(&phba->boot_sess, &session_resp->session_info,
 	       sizeof(struct mgmt_session_info));
-	pci_free_consistent(phba->ctrl.pdev, nonemb_cmd.size,
-		    nonemb_cmd.va, nonemb_cmd.dma);
-	return 0;
+	ret = 0;
+
 boot_freemem:
 	pci_free_consistent(phba->ctrl.pdev, nonemb_cmd.size,
 		    nonemb_cmd.va, nonemb_cmd.dma);
+	return ret;
+}
+
+static void beiscsi_boot_release(void *data)
+{
+	struct beiscsi_hba *phba = data;
+
+	scsi_host_put(phba->shost);
+}
+
+static int beiscsi_setup_boot_info(struct beiscsi_hba *phba)
+{
+	struct iscsi_boot_kobj *boot_kobj;
+
+	/* get boot info using mgmt cmd */
+	if (beiscsi_get_boot_info(phba))
+		/* Try to see if we can carry on without this */
+		return 0;
+
+	phba->boot_kset = iscsi_boot_create_host_kset(phba->shost->host_no);
+	if (!phba->boot_kset)
+		return -ENOMEM;
+
+	/* get a ref because the show function will ref the phba */
+	if (!scsi_host_get(phba->shost))
+		goto free_kset;
+	boot_kobj = iscsi_boot_create_target(phba->boot_kset, 0, phba,
+					     beiscsi_show_boot_tgt_info,
+					     beiscsi_tgt_get_attr_visibility,
+					     beiscsi_boot_release);
+	if (!boot_kobj)
+		goto put_shost;
+
+	if (!scsi_host_get(phba->shost))
+		goto free_kset;
+	boot_kobj = iscsi_boot_create_initiator(phba->boot_kset, 0, phba,
+						beiscsi_show_boot_ini_info,
+						beiscsi_ini_get_attr_visibility,
+						beiscsi_boot_release);
+	if (!boot_kobj)
+		goto put_shost;
+
+	if (!scsi_host_get(phba->shost))
+		goto free_kset;
+	boot_kobj = iscsi_boot_create_ethernet(phba->boot_kset, 0, phba,
+					       beiscsi_show_boot_eth_info,
+					       beiscsi_eth_get_attr_visibility,
+					       beiscsi_boot_release);
+	if (!boot_kobj)
+		goto put_shost;
+	return 0;
+
+put_shost:
+	scsi_host_put(phba->shost);
+free_kset:
+	iscsi_boot_destroy_kset(phba->boot_kset);
 	return -ENOMEM;
 }
 
@@ -4307,11 +4311,15 @@ static int __devinit beiscsi_dev_probe(struct pci_dev *pcidev,
 		goto free_blkenbld;
 	}
 	hwi_enable_intr(phba);
-	ret = beiscsi_get_boot_info(phba);
-	if (ret < 0) {
-		shost_printk(KERN_ERR, phba->shost, "beiscsi_dev_probe-"
-			     "No Boot Devices !!!!!\n");
-	}
+
+	if (beiscsi_setup_boot_info(phba))
+		/*
+		 * log error but continue, because we may not be using
+		 * iscsi boot.
+		 */
+		shost_printk(KERN_ERR, phba->shost, "Could not set up "
+			     "iSCSI boot info.");
+
 	SE_DEBUG(DBG_LVL_8, "\n\n\n SUCCESS - DRIVER LOADED\n\n\n");
 	return 0;
 
diff --git a/drivers/scsi/iscsi_boot_sysfs.c b/drivers/scsi/iscsi_boot_sysfs.c
index 4274ce93d8f2..89700cbca16e 100644
--- a/drivers/scsi/iscsi_boot_sysfs.c
+++ b/drivers/scsi/iscsi_boot_sysfs.c
@@ -64,7 +64,8 @@ static void iscsi_boot_kobj_release(struct kobject *kobj)
 	struct iscsi_boot_kobj *boot_kobj =
 			container_of(kobj, struct iscsi_boot_kobj, kobj);
 
-	kfree(boot_kobj->data);
+	if (boot_kobj->release)
+		boot_kobj->release(boot_kobj->data);
 	kfree(boot_kobj);
 }
 
@@ -305,7 +306,8 @@ iscsi_boot_create_kobj(struct iscsi_boot_kset *boot_kset,
 		       struct attribute_group *attr_group,
 		       const char *name, int index, void *data,
 		       ssize_t (*show) (void *data, int type, char *buf),
-		       mode_t (*is_visible) (void *data, int type))
+		       mode_t (*is_visible) (void *data, int type),
+		       void (*release) (void *data))
 {
 	struct iscsi_boot_kobj *boot_kobj;
 
@@ -323,6 +325,7 @@ iscsi_boot_create_kobj(struct iscsi_boot_kset *boot_kset,
 	boot_kobj->data = data;
 	boot_kobj->show = show;
 	boot_kobj->is_visible = is_visible;
+	boot_kobj->release = release;
 
 	if (sysfs_create_group(&boot_kobj->kobj, attr_group)) {
 		/*
@@ -331,7 +334,7 @@ iscsi_boot_create_kobj(struct iscsi_boot_kset *boot_kset,
 		 * the boot kobj was not setup and the normal release
 		 * path is not being run.
 		 */
-		boot_kobj->data = NULL;
+		boot_kobj->release = NULL;
 		kobject_put(&boot_kobj->kobj);
 		return NULL;
 	}
@@ -357,6 +360,7 @@ static void iscsi_boot_remove_kobj(struct iscsi_boot_kobj *boot_kobj)
  * @data: driver specific data for target
  * @show: attr show function
  * @is_visible: attr visibility function
+ * @release: release function
  *
  * Note: The boot sysfs lib will free the data passed in for the caller
  * when all refs to the target kobject have been released.
@@ -365,10 +369,12 @@ struct iscsi_boot_kobj *
 iscsi_boot_create_target(struct iscsi_boot_kset *boot_kset, int index,
 			 void *data,
 			 ssize_t (*show) (void *data, int type, char *buf),
-			 mode_t (*is_visible) (void *data, int type))
+			 mode_t (*is_visible) (void *data, int type),
+			 void (*release) (void *data))
 {
 	return iscsi_boot_create_kobj(boot_kset, &iscsi_boot_target_attr_group,
-				      "target%d", index, data, show, is_visible);
+				      "target%d", index, data, show, is_visible,
+				      release);
 }
 EXPORT_SYMBOL_GPL(iscsi_boot_create_target);
 
@@ -379,6 +385,7 @@ EXPORT_SYMBOL_GPL(iscsi_boot_create_target);
  * @data: driver specific data
  * @show: attr show function
  * @is_visible: attr visibility function
+ * @release: release function
  *
  * Note: The boot sysfs lib will free the data passed in for the caller
  * when all refs to the initiator kobject have been released.
@@ -387,12 +394,13 @@ struct iscsi_boot_kobj *
 iscsi_boot_create_initiator(struct iscsi_boot_kset *boot_kset, int index,
 			    void *data,
 			    ssize_t (*show) (void *data, int type, char *buf),
-			    mode_t (*is_visible) (void *data, int type))
+			    mode_t (*is_visible) (void *data, int type),
+			    void (*release) (void *data))
 {
 	return iscsi_boot_create_kobj(boot_kset,
 				      &iscsi_boot_initiator_attr_group,
 				      "initiator", index, data, show,
-				      is_visible);
+				      is_visible, release);
 }
 EXPORT_SYMBOL_GPL(iscsi_boot_create_initiator);
 
@@ -403,6 +411,7 @@ EXPORT_SYMBOL_GPL(iscsi_boot_create_initiator);
  * @data: driver specific data
  * @show: attr show function
  * @is_visible: attr visibility function
+ * @release: release function
  *
  * Note: The boot sysfs lib will free the data passed in for the caller
  * when all refs to the ethernet kobject have been released.
@@ -411,12 +420,13 @@ struct iscsi_boot_kobj *
 iscsi_boot_create_ethernet(struct iscsi_boot_kset *boot_kset, int index,
 			   void *data,
 			   ssize_t (*show) (void *data, int type, char *buf),
-			   mode_t (*is_visible) (void *data, int type))
+			   mode_t (*is_visible) (void *data, int type),
+			   void (*release) (void *data))
 {
 	return iscsi_boot_create_kobj(boot_kset,
 				      &iscsi_boot_ethernet_attr_group,
 				      "ethernet%d", index, data, show,
-				      is_visible);
+				      is_visible, release);
 }
 EXPORT_SYMBOL_GPL(iscsi_boot_create_ethernet);
 
diff --git a/include/linux/iscsi_boot_sysfs.h b/include/linux/iscsi_boot_sysfs.h
index f1e6c184f14f..f0a2f8b0aa13 100644
--- a/include/linux/iscsi_boot_sysfs.h
+++ b/include/linux/iscsi_boot_sysfs.h
@@ -92,6 +92,13 @@ struct iscsi_boot_kobj {
 	 * properties.
 	 */
 	mode_t (*is_visible) (void *data, int type);
+
+	/*
+	 * Driver specific release function.
+	 *
+	 * The function should free the data passed in.
+	 */
+	void (*release) (void *data);
 };
 
 struct iscsi_boot_kset {
@@ -103,18 +110,21 @@ struct iscsi_boot_kobj *
 iscsi_boot_create_initiator(struct iscsi_boot_kset *boot_kset, int index,
 			    void *data,
 			    ssize_t (*show) (void *data, int type, char *buf),
-			    mode_t (*is_visible) (void *data, int type));
+			    mode_t (*is_visible) (void *data, int type),
+			    void (*release) (void *data));
 
 struct iscsi_boot_kobj *
 iscsi_boot_create_ethernet(struct iscsi_boot_kset *boot_kset, int index,
 			   void *data,
 			   ssize_t (*show) (void *data, int type, char *buf),
-			   mode_t (*is_visible) (void *data, int type));
+			   mode_t (*is_visible) (void *data, int type),
+			   void (*release) (void *data));
 struct iscsi_boot_kobj *
 iscsi_boot_create_target(struct iscsi_boot_kset *boot_kset, int index,
 			 void *data,
 			 ssize_t (*show) (void *data, int type, char *buf),
-			 mode_t (*is_visible) (void *data, int type));
+			 mode_t (*is_visible) (void *data, int type),
+			 void (*release) (void *data));
 
 struct iscsi_boot_kset *iscsi_boot_create_kset(const char *set_name);
 struct iscsi_boot_kset *iscsi_boot_create_host_kset(unsigned int hostno);
-- 
cgit v1.2.3


From 15b493d11fcce3c5547e3d7fb6d90e11ffe12777 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Tue, 28 Jun 2011 09:48:48 +0200
Subject: drbd: fix limit define, we support 1 PiByte now

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 include/linux/drbd_limits.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h
index 246f576c981d..447c36752385 100644
--- a/include/linux/drbd_limits.h
+++ b/include/linux/drbd_limits.h
@@ -117,10 +117,10 @@
 /* drbdsetup XY resize -d Z
  * you are free to reduce the device size to nothing, if you want to.
  * the upper limit with 64bit kernel, enough ram and flexible meta data
- * is 16 TB, currently. */
+ * is 1 PiB, currently. */
 /* DRBD_MAX_SECTORS */
 #define DRBD_DISK_SIZE_SECT_MIN  0
-#define DRBD_DISK_SIZE_SECT_MAX  (16 * (2LLU << 30))
+#define DRBD_DISK_SIZE_SECT_MAX  (1 * (2LLU << 40))
 #define DRBD_DISK_SIZE_SECT_DEF  0 /* = disabled = no user size... */
 
 #define DRBD_ON_IO_ERROR_DEF EP_PASS_ON
-- 
cgit v1.2.3


From 131ad62d8fc06d9d0a5c61d9526876352c2f2bbd Mon Sep 17 00:00:00 2001
From: Mr Dash Four <mr.dash.four@googlemail.com>
Date: Thu, 30 Jun 2011 13:31:57 +0200
Subject: netfilter: add SELinux context support to AUDIT target

In this revision the conversion of secid to SELinux context and adding it
to the audit log is moved from xt_AUDIT.c to audit.c with the aid of a
separate helper function - audit_log_secctx - which does both the conversion
and logging of SELinux context, thus also preventing internal secid number
being leaked to userspace. If conversion is not successful an error is raised.

With the introduction of this helper function the work done in xt_AUDIT.c is
much more simplified. It also opens the possibility of this helper function
being used by other modules (including auditd itself), if desired. With this
addition, typical (raw auditd) output after applying the patch would be:

type=NETFILTER_PKT msg=audit(1305852240.082:31012): action=0 hook=1 len=52 inif=? outif=eth0 saddr=10.1.1.7 daddr=10.1.2.1 ipid=16312 proto=6 sport=56150 dport=22 obj=system_u:object_r:ssh_client_packet_t:s0
type=NETFILTER_PKT msg=audit(1306772064.079:56): action=0 hook=3 len=48 inif=eth0 outif=? smac=00:05:5d:7c:27:0b dmac=00:02:b3:0a:7f:81 macproto=0x0800 saddr=10.1.2.1 daddr=10.1.1.7 ipid=462 proto=6 sport=22 dport=3561 obj=system_u:object_r:ssh_server_packet_t:s0

Acked-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Mr Dash Four <mr.dash.four@googlemail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/audit.h    |  7 +++++++
 kernel/audit.c           | 29 +++++++++++++++++++++++++++++
 net/netfilter/xt_AUDIT.c |  5 +++++
 3 files changed, 41 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 9d339eb27881..0c8006129fb2 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -613,6 +613,12 @@ extern void		    audit_log_d_path(struct audit_buffer *ab,
 extern void		    audit_log_key(struct audit_buffer *ab,
 					  char *key);
 extern void		    audit_log_lost(const char *message);
+#ifdef CONFIG_SECURITY
+extern void 		    audit_log_secctx(struct audit_buffer *ab, u32 secid);
+#else
+#define audit_log_secctx(b,s) do { ; } while (0)
+#endif
+
 extern int		    audit_update_lsm_rules(void);
 
 				/* Private API (for audit.c only) */
@@ -635,6 +641,7 @@ extern int audit_enabled;
 #define audit_log_untrustedstring(a,s) do { ; } while (0)
 #define audit_log_d_path(b, p, d) do { ; } while (0)
 #define audit_log_key(b, k) do { ; } while (0)
+#define audit_log_secctx(b,s) do { ; } while (0)
 #define audit_enabled 0
 #endif
 #endif
diff --git a/kernel/audit.c b/kernel/audit.c
index 939500317066..52501b5d4902 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -55,6 +55,9 @@
 #include <net/sock.h>
 #include <net/netlink.h>
 #include <linux/skbuff.h>
+#ifdef CONFIG_SECURITY
+#include <linux/security.h>
+#endif
 #include <linux/netlink.h>
 #include <linux/freezer.h>
 #include <linux/tty.h>
@@ -1502,6 +1505,32 @@ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
 	}
 }
 
+#ifdef CONFIG_SECURITY
+/**
+ * audit_log_secctx - Converts and logs SELinux context
+ * @ab: audit_buffer
+ * @secid: security number
+ *
+ * This is a helper function that calls security_secid_to_secctx to convert
+ * secid to secctx and then adds the (converted) SELinux context to the audit
+ * log by calling audit_log_format, thus also preventing leak of internal secid
+ * to userspace. If secid cannot be converted audit_panic is called.
+ */
+void audit_log_secctx(struct audit_buffer *ab, u32 secid)
+{
+	u32 len;
+	char *secctx;
+
+	if (security_secid_to_secctx(secid, &secctx, &len)) {
+		audit_panic("Cannot convert secid to context");
+	} else {
+		audit_log_format(ab, " obj=%s", secctx);
+		security_release_secctx(secctx, len);
+	}
+}
+EXPORT_SYMBOL(audit_log_secctx);
+#endif
+
 EXPORT_SYMBOL(audit_log_start);
 EXPORT_SYMBOL(audit_log_end);
 EXPORT_SYMBOL(audit_log_format);
diff --git a/net/netfilter/xt_AUDIT.c b/net/netfilter/xt_AUDIT.c
index 363a99ec0637..4bca15a0c385 100644
--- a/net/netfilter/xt_AUDIT.c
+++ b/net/netfilter/xt_AUDIT.c
@@ -163,6 +163,11 @@ audit_tg(struct sk_buff *skb, const struct xt_action_param *par)
 		break;
 	}
 
+#ifdef CONFIG_NETWORK_SECMARK
+	if (skb->secmark)
+		audit_log_secctx(ab, skb->secmark);
+#endif
+
 	audit_log_end(ab);
 
 errout:
-- 
cgit v1.2.3


From a3b853633d78c3930b513ee219df48637ac82eed Mon Sep 17 00:00:00 2001
From: Thomas Abraham <thomas.abraham@linaro.org>
Date: Thu, 30 Jun 2011 21:26:10 +0530
Subject: dt: add helper functions to read u32 and string property values

Add helper functions to retrieve unsigned integer and string property
values from properties of a device node. These helper functions can be
used to lookup a property in a device node, perform error checking and
read the property value.

[grant.likely@secretlab.ca: Proposal and initial implementation]
Signed-off-by: Thomas Abraham <thomas.abraham@linaro.org>
[grant.likely: some word smithing and be more defensive validating the string]
Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
---
 drivers/of/base.c  | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/of.h |  4 ++++
 2 files changed, 62 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/of/base.c b/drivers/of/base.c
index 632ebae7f17a..b8b65fddbeb5 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -595,6 +595,64 @@ struct device_node *of_find_node_by_phandle(phandle handle)
 }
 EXPORT_SYMBOL(of_find_node_by_phandle);
 
+/**
+ * of_property_read_u32 - Find and read a 32 bit integer from a property
+ * @np:		device node from which the property value is to be read.
+ * @propname:	name of the property to be searched.
+ * @out_value:	pointer to return value, modified only if return value is 0.
+ *
+ * Search for a property in a device node and read a 32-bit value from
+ * it. Returns 0 on success, -EINVAL if the property does not exist,
+ * -ENODATA if property does not have a value, and -EOVERFLOW if the
+ * property data isn't large enough.
+ *
+ * The out_value is modified only if a valid u32 value can be decoded.
+ */
+int of_property_read_u32(struct device_node *np, char *propname, u32 *out_value)
+{
+	struct property *prop = of_find_property(np, propname, NULL);
+
+	if (!prop)
+		return -EINVAL;
+	if (!prop->value)
+		return -ENODATA;
+	if (sizeof(*out_value) > prop->length)
+		return -EOVERFLOW;
+	*out_value = be32_to_cpup(prop->value);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(of_property_read_u32);
+
+/**
+ * of_property_read_string - Find and read a string from a property
+ * @np:		device node from which the property value is to be read.
+ * @propname:	name of the property to be searched.
+ * @out_string:	pointer to null terminated return string, modified only if
+ *		return value is 0.
+ *
+ * Search for a property in a device tree node and retrieve a null
+ * terminated string value (pointer to data, not a copy). Returns 0 on
+ * success, -EINVAL if the property does not exist, -ENODATA if property
+ * does not have a value, and -EILSEQ if the string is not null-terminated
+ * within the length of the property data.
+ *
+ * The out_string pointer is modified only if a valid string can be decoded.
+ */
+int of_property_read_string(struct device_node *np, char *propname,
+				char **out_string)
+{
+	struct property *prop = of_find_property(np, propname, NULL);
+	if (!prop)
+		return -EINVAL;
+	if (!prop->value)
+		return -ENODATA;
+	if (strnlen(prop->value, prop->length) >= prop->length)
+		return -EILSEQ;
+	*out_string = prop->value;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(of_property_read_string);
+
 /**
  * of_parse_phandle - Resolve a phandle property to a device_node pointer
  * @np: Pointer to device node holding phandle property
diff --git a/include/linux/of.h b/include/linux/of.h
index bfc0ed1b0ced..4fc4c1b8d5d5 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -195,6 +195,10 @@ extern struct device_node *of_find_node_with_property(
 extern struct property *of_find_property(const struct device_node *np,
 					 const char *name,
 					 int *lenp);
+extern int of_property_read_u32(struct device_node *np, char *propname,
+					u32 *out_value);
+extern int of_property_read_string(struct device_node *np, char *propname,
+					char **out_string);
 extern int of_device_is_compatible(const struct device_node *device,
 				   const char *);
 extern int of_device_is_available(const struct device_node *device);
-- 
cgit v1.2.3


From 0209bcd4d9ee66569d4ea76f9ab2de3a9c740c71 Mon Sep 17 00:00:00 2001
From: Amit Kumar Salecha <amit.salecha@qlogic.com>
Date: Wed, 29 Jun 2011 20:00:49 +0000
Subject: net: add external loopback test in ethtool self test

External loopback test can be performed by application without any driver
support on normal Ethernet cards.
But on CNA devices, where multiple functions share same physical port.
Here internal loopback test and external loopback test can be initiated by
multiple functions at same time. To co exist all functions, firmware need
to regulate what test can be run by which function. So before performing external
loopback test, command need to send to firmware, which will quiescent other functions.

User may not want to run external loopback test always. As special cable need to be
connected for this test.
So adding explicit flag in ethtool self test, which will specify interface
to perform external loopback test.
 ETH_TEST_FL_EXTERNAL_LB: Application set to request external loopback test
 ETH_TEST_FL_EXTERNAL_LB_DONE: Driver ack if test performed

Signed-off-by: Amit Kumar Salecha <amit.salecha@qlogic.com>
Reviewed-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 048d0fa38d03..c6e427ab65fe 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -310,9 +310,21 @@ struct ethtool_sset_info {
 				   __u32's, etc. */
 };
 
+/**
+ * enum ethtool_test_flags - flags definition of ethtool_test
+ * @ETH_TEST_FL_OFFLINE: if set perform online and offline tests, otherwise
+ *	only online tests.
+ * @ETH_TEST_FL_FAILED: Driver set this flag if test fails.
+ * @ETH_TEST_FL_EXTERNAL_LB: Application request to perform external loopback
+ *	test.
+ * @ETH_TEST_FL_EXTERNAL_LB_DONE: Driver performed the external loopback test
+ */
+
 enum ethtool_test_flags {
-	ETH_TEST_FL_OFFLINE	= (1 << 0),	/* online / offline */
-	ETH_TEST_FL_FAILED	= (1 << 1),	/* test passed / failed */
+	ETH_TEST_FL_OFFLINE	= (1 << 0),
+	ETH_TEST_FL_FAILED	= (1 << 1),
+	ETH_TEST_FL_EXTERNAL_LB	= (1 << 2),
+	ETH_TEST_FL_EXTERNAL_LB_DONE	= (1 << 3),
 };
 
 /* for requesting NIC test and getting results*/
-- 
cgit v1.2.3


From e6220bdc9485c5ea972f9e0e6d062a05934bb74b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 9 Jun 2011 13:08:25 +0000
Subject: i8253: Create common clockevent implementation

arm, mips and x86 implement i8253 based clockevents. All the same code
copied. Create a common implementation in drivers/clocksource/i8253.c.

About time to rename drivers/clocksource/ to something else.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/20110609130621.921710458@linutronix.de
---
 drivers/clocksource/Kconfig |   5 ++-
 drivers/clocksource/i8253.c | 101 +++++++++++++++++++++++++++++++++++++++++---
 include/linux/i8253.h       |   2 +
 3 files changed, 100 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig
index 330343bcb67e..d8d3e02b912c 100644
--- a/drivers/clocksource/Kconfig
+++ b/drivers/clocksource/Kconfig
@@ -1,11 +1,14 @@
 config CLKSRC_I8253
 	bool
 
+config CLKEVT_I8253
+	bool
+
 config I8253_LOCK
 	bool
 
 config CLKBLD_I8253
-	def_bool y if CLKSRC_I8253 || I8253_LOCK
+	def_bool y if CLKSRC_I8253 || CLKEVT_I8253 || I8253_LOCK
 
 config CLKSRC_MMIO
 	bool
diff --git a/drivers/clocksource/i8253.c b/drivers/clocksource/i8253.c
index e594f52eb88e..27c49e60b7d6 100644
--- a/drivers/clocksource/i8253.c
+++ b/drivers/clocksource/i8253.c
@@ -1,13 +1,14 @@
 /*
  * i8253 PIT clocksource
  */
-#include <linux/clocksource.h>
+#include <linux/clockchips.h>
 #include <linux/init.h>
 #include <linux/io.h>
 #include <linux/spinlock.h>
 #include <linux/timex.h>
 #include <linux/module.h>
 #include <linux/i8253.h>
+#include <linux/smp.h>
 
 /*
  * Protects access to I/O ports
@@ -47,15 +48,15 @@ static cycle_t i8253_read(struct clocksource *cs)
 	 * count), it cannot be newer.
 	 */
 	jifs = jiffies;
-	outb_pit(0x00, PIT_MODE);	/* latch the count ASAP */
-	count = inb_pit(PIT_CH0);	/* read the latched count */
-	count |= inb_pit(PIT_CH0) << 8;
+	outb_p(0x00, PIT_MODE);	/* latch the count ASAP */
+	count = inb_p(PIT_CH0);	/* read the latched count */
+	count |= inb_p(PIT_CH0) << 8;
 
 	/* VIA686a test code... reset the latch if count > max + 1 */
 	if (count > LATCH) {
-		outb_pit(0x34, PIT_MODE);
-		outb_pit(PIT_LATCH & 0xff, PIT_CH0);
-		outb_pit(PIT_LATCH >> 8, PIT_CH0);
+		outb_p(0x34, PIT_MODE);
+		outb_p(PIT_LATCH & 0xff, PIT_CH0);
+		outb_p(PIT_LATCH >> 8, PIT_CH0);
 		count = PIT_LATCH - 1;
 	}
 
@@ -97,3 +98,89 @@ int __init clocksource_i8253_init(void)
 	return clocksource_register_hz(&i8253_cs, PIT_TICK_RATE);
 }
 #endif
+
+#ifdef CONFIG_CLKEVT_I8253
+/*
+ * Initialize the PIT timer.
+ *
+ * This is also called after resume to bring the PIT into operation again.
+ */
+static void init_pit_timer(enum clock_event_mode mode,
+			   struct clock_event_device *evt)
+{
+	raw_spin_lock(&i8253_lock);
+
+	switch (mode) {
+	case CLOCK_EVT_MODE_PERIODIC:
+		/* binary, mode 2, LSB/MSB, ch 0 */
+		outb_p(0x34, PIT_MODE);
+		outb_p(LATCH & 0xff , PIT_CH0);	/* LSB */
+		outb_p(LATCH >> 8 , PIT_CH0);		/* MSB */
+		break;
+
+	case CLOCK_EVT_MODE_SHUTDOWN:
+	case CLOCK_EVT_MODE_UNUSED:
+		if (evt->mode == CLOCK_EVT_MODE_PERIODIC ||
+		    evt->mode == CLOCK_EVT_MODE_ONESHOT) {
+			outb_p(0x30, PIT_MODE);
+			outb_p(0, PIT_CH0);
+			outb_p(0, PIT_CH0);
+		}
+		break;
+
+	case CLOCK_EVT_MODE_ONESHOT:
+		/* One shot setup */
+		outb_p(0x38, PIT_MODE);
+		break;
+
+	case CLOCK_EVT_MODE_RESUME:
+		/* Nothing to do here */
+		break;
+	}
+	raw_spin_unlock(&i8253_lock);
+}
+
+/*
+ * Program the next event in oneshot mode
+ *
+ * Delta is given in PIT ticks
+ */
+static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
+{
+	raw_spin_lock(&i8253_lock);
+	outb_p(delta & 0xff , PIT_CH0);	/* LSB */
+	outb_p(delta >> 8 , PIT_CH0);		/* MSB */
+	raw_spin_unlock(&i8253_lock);
+
+	return 0;
+}
+
+/*
+ * On UP the PIT can serve all of the possible timer functions. On SMP systems
+ * it can be solely used for the global tick.
+ */
+struct clock_event_device i8253_clockevent = {
+	.name		= "pit",
+	.features	= CLOCK_EVT_FEAT_PERIODIC,
+	.set_mode	= init_pit_timer,
+	.set_next_event = pit_next_event,
+};
+
+/*
+ * Initialize the conversion factor and the min/max deltas of the clock event
+ * structure and register the clock event source with the framework.
+ */
+void __init clockevent_i8253_init(bool oneshot)
+{
+	if (oneshot)
+		i8253_clockevent.features |= CLOCK_EVT_FEAT_ONESHOT;
+	/*
+	 * Start pit with the boot cpu mask. x86 might make it global
+	 * when it is used as broadcast device later.
+	 */
+	i8253_clockevent.cpumask = cpumask_of(smp_processor_id());
+
+	clockevents_config_and_register(&i8253_clockevent, PIT_TICK_RATE,
+					0xF, 0x7FFF);
+}
+#endif
diff --git a/include/linux/i8253.h b/include/linux/i8253.h
index 76039c86ab58..487d50c7db39 100644
--- a/include/linux/i8253.h
+++ b/include/linux/i8253.h
@@ -24,6 +24,8 @@
 #define outb_pit        outb_p
 
 extern raw_spinlock_t i8253_lock;
+extern struct clock_event_device i8253_clockevent;
+extern void clockevent_i8253_init(bool oneshot);
 
 extern void setup_pit_timer(void);
 
-- 
cgit v1.2.3


From 01898e3e29ea8242d81923da11ce88ba71290a48 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 9 Jun 2011 13:08:30 +0000
Subject: i8253: Cleanup outb/inb magic

Remove the hysterical outb/inb_pit defines and use outb_p/inb_p in the
code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/20110609130622.348437125@linutronix.de
---
 arch/x86/kernel/apm_32.c | 6 +++---
 include/linux/i8253.h    | 3 ---
 2 files changed, 3 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index a30740e6890e..0371c484bb8a 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -1220,11 +1220,11 @@ static void reinit_timer(void)
 
 	raw_spin_lock_irqsave(&i8253_lock, flags);
 	/* set the clock to HZ */
-	outb_pit(0x34, PIT_MODE);		/* binary, mode 2, LSB/MSB, ch 0 */
+	outb_p(0x34, PIT_MODE);		/* binary, mode 2, LSB/MSB, ch 0 */
 	udelay(10);
-	outb_pit(LATCH & 0xff, PIT_CH0);	/* LSB */
+	outb_p(LATCH & 0xff, PIT_CH0);	/* LSB */
 	udelay(10);
-	outb_pit(LATCH >> 8, PIT_CH0);	/* MSB */
+	outb_p(LATCH >> 8, PIT_CH0);	/* MSB */
 	udelay(10);
 	raw_spin_unlock_irqrestore(&i8253_lock, flags);
 #endif
diff --git a/include/linux/i8253.h b/include/linux/i8253.h
index 487d50c7db39..e6bb36a97519 100644
--- a/include/linux/i8253.h
+++ b/include/linux/i8253.h
@@ -20,9 +20,6 @@
 
 #define PIT_LATCH	((PIT_TICK_RATE + HZ/2) / HZ)
 
-#define inb_pit         inb_p
-#define outb_pit        outb_p
-
 extern raw_spinlock_t i8253_lock;
 extern struct clock_event_device i8253_clockevent;
 extern void clockevent_i8253_init(bool oneshot);
-- 
cgit v1.2.3


From 28009ce4a8130af7260a9271901b4419834ad152 Mon Sep 17 00:00:00 2001
From: Richard Kennedy <richard@rsk.demon.co.uk>
Date: Tue, 7 Jun 2011 16:33:38 +0100
Subject: perf: Remove 64-bit alignment padding from perf_event_context

Reorder perf_event_context to remove 8 bytes of 64 bit alignment padding
shrinking its size to 192 bytes, allowing it to fit into a smaller slab
and use one fewer cache lines.

Signed-off-by: Richard Kennedy <richard@rsk.demon.co.uk>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1307460819.1950.5.camel@castor.rsk
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_event.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e76a41010e1f..2f7b5d42ab41 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -894,8 +894,8 @@ struct perf_event_context {
 	u64				parent_gen;
 	u64				generation;
 	int				pin_count;
-	struct rcu_head			rcu_head;
 	int				nr_cgroups; /* cgroup events present */
+	struct rcu_head			rcu_head;
 };
 
 /*
-- 
cgit v1.2.3


From a8b0ca17b80e92faab46ee7179ba9e99ccb61233 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 27 Jun 2011 14:41:57 +0200
Subject: perf: Remove the nmi parameter from the swevent and overflow
 interface

The nmi parameter indicated if we could do wakeups from the current
context, if not, we would set some state and self-IPI and let the
resulting interrupt do the wakeup.

For the various event classes:

  - hardware: nmi=0; PMI is in fact an NMI or we run irq_work_run from
    the PMI-tail (ARM etc.)
  - tracepoint: nmi=0; since tracepoint could be from NMI context.
  - software: nmi=[0,1]; some, like the schedule thing cannot
    perform wakeups, and hence need 0.

As one can see, there is very little nmi=1 usage, and the down-side of
not using it is that on some platforms some software events can have a
jiffy delay in wakeup (when arch_irq_work_raise isn't implemented).

The up-side however is that we can remove the nmi parameter and save a
bunch of conditionals in fast paths.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Michael Cree <mcree@orcon.net.nz>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Deng-Cheng Zhu <dengcheng.zhu@gmail.com>
Cc: Anton Blanchard <anton@samba.org>
Cc: Eric B Munson <emunson@mgebm.net>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: David S. Miller <davem@davemloft.net>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Jason Wessel <jason.wessel@windriver.com>
Cc: Don Zickus <dzickus@redhat.com>
Link: http://lkml.kernel.org/n/tip-agjev8eu666tvknpb3iaj0fg@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/alpha/kernel/perf_event.c            |  2 +-
 arch/arm/kernel/perf_event_v6.c           |  2 +-
 arch/arm/kernel/perf_event_v7.c           |  2 +-
 arch/arm/kernel/perf_event_xscale.c       |  4 +-
 arch/arm/kernel/ptrace.c                  |  2 +-
 arch/arm/kernel/swp_emulate.c             |  2 +-
 arch/arm/mm/fault.c                       |  6 +--
 arch/mips/kernel/perf_event.c             |  2 +-
 arch/mips/kernel/traps.c                  |  8 ++--
 arch/mips/kernel/unaligned.c              |  5 +--
 arch/mips/math-emu/cp1emu.c               |  3 +-
 arch/mips/mm/fault.c                      |  8 ++--
 arch/powerpc/include/asm/emulated_ops.h   |  4 +-
 arch/powerpc/kernel/perf_event.c          |  6 +--
 arch/powerpc/kernel/perf_event_fsl_emb.c  |  6 +--
 arch/powerpc/kernel/ptrace.c              |  2 +-
 arch/powerpc/mm/fault.c                   |  6 +--
 arch/s390/mm/fault.c                      |  6 +--
 arch/sh/kernel/ptrace_32.c                |  2 +-
 arch/sh/kernel/traps_32.c                 |  2 +-
 arch/sh/kernel/traps_64.c                 |  8 ++--
 arch/sh/math-emu/math.c                   |  2 +-
 arch/sh/mm/fault_32.c                     |  6 +--
 arch/sh/mm/tlbflush_64.c                  |  6 +--
 arch/sparc/kernel/perf_event.c            |  2 +-
 arch/sparc/kernel/unaligned_32.c          |  4 +-
 arch/sparc/kernel/unaligned_64.c          | 12 +++---
 arch/sparc/kernel/visemul.c               |  2 +-
 arch/sparc/math-emu/math_32.c             |  2 +-
 arch/sparc/math-emu/math_64.c             |  2 +-
 arch/sparc/mm/fault_32.c                  |  8 ++--
 arch/sparc/mm/fault_64.c                  |  8 ++--
 arch/x86/kernel/cpu/perf_event.c          |  2 +-
 arch/x86/kernel/cpu/perf_event_intel.c    |  2 +-
 arch/x86/kernel/cpu/perf_event_intel_ds.c |  4 +-
 arch/x86/kernel/cpu/perf_event_p4.c       |  2 +-
 arch/x86/kernel/kgdb.c                    |  2 +-
 arch/x86/kernel/ptrace.c                  |  2 +-
 arch/x86/mm/fault.c                       |  6 +--
 include/linux/perf_event.h                | 18 ++++-----
 kernel/events/core.c                      | 63 ++++++++++++++-----------------
 kernel/events/internal.h                  |  1 -
 kernel/events/ring_buffer.c               | 10 ++---
 kernel/sched.c                            |  2 +-
 kernel/watchdog.c                         |  2 +-
 samples/hw_breakpoint/data_breakpoint.c   |  2 +-
 46 files changed, 119 insertions(+), 141 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/perf_event.c b/arch/alpha/kernel/perf_event.c
index 90561c45e7d8..8e47709160f8 100644
--- a/arch/alpha/kernel/perf_event.c
+++ b/arch/alpha/kernel/perf_event.c
@@ -847,7 +847,7 @@ static void alpha_perf_event_irq_handler(unsigned long la_ptr,
 	data.period = event->hw.last_period;
 
 	if (alpha_perf_event_set_period(event, hwc, idx)) {
-		if (perf_event_overflow(event, 1, &data, regs)) {
+		if (perf_event_overflow(event, &data, regs)) {
 			/* Interrupts coming too quickly; "throttle" the
 			 * counter, i.e., disable it for a little while.
 			 */
diff --git a/arch/arm/kernel/perf_event_v6.c b/arch/arm/kernel/perf_event_v6.c
index f1e8dd94afe8..38dc4da1d612 100644
--- a/arch/arm/kernel/perf_event_v6.c
+++ b/arch/arm/kernel/perf_event_v6.c
@@ -479,7 +479,7 @@ armv6pmu_handle_irq(int irq_num,
 		if (!armpmu_event_set_period(event, hwc, idx))
 			continue;
 
-		if (perf_event_overflow(event, 0, &data, regs))
+		if (perf_event_overflow(event, &data, regs))
 			armpmu->disable(hwc, idx);
 	}
 
diff --git a/arch/arm/kernel/perf_event_v7.c b/arch/arm/kernel/perf_event_v7.c
index 4960686afb58..6e5f8752303b 100644
--- a/arch/arm/kernel/perf_event_v7.c
+++ b/arch/arm/kernel/perf_event_v7.c
@@ -787,7 +787,7 @@ static irqreturn_t armv7pmu_handle_irq(int irq_num, void *dev)
 		if (!armpmu_event_set_period(event, hwc, idx))
 			continue;
 
-		if (perf_event_overflow(event, 0, &data, regs))
+		if (perf_event_overflow(event, &data, regs))
 			armpmu->disable(hwc, idx);
 	}
 
diff --git a/arch/arm/kernel/perf_event_xscale.c b/arch/arm/kernel/perf_event_xscale.c
index 39affbe4fdb2..99b6b85c7e49 100644
--- a/arch/arm/kernel/perf_event_xscale.c
+++ b/arch/arm/kernel/perf_event_xscale.c
@@ -251,7 +251,7 @@ xscale1pmu_handle_irq(int irq_num, void *dev)
 		if (!armpmu_event_set_period(event, hwc, idx))
 			continue;
 
-		if (perf_event_overflow(event, 0, &data, regs))
+		if (perf_event_overflow(event, &data, regs))
 			armpmu->disable(hwc, idx);
 	}
 
@@ -583,7 +583,7 @@ xscale2pmu_handle_irq(int irq_num, void *dev)
 		if (!armpmu_event_set_period(event, hwc, idx))
 			continue;
 
-		if (perf_event_overflow(event, 0, &data, regs))
+		if (perf_event_overflow(event, &data, regs))
 			armpmu->disable(hwc, idx);
 	}
 
diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c
index 97260060bf26..0c9b1054f790 100644
--- a/arch/arm/kernel/ptrace.c
+++ b/arch/arm/kernel/ptrace.c
@@ -396,7 +396,7 @@ static long ptrace_hbp_idx_to_num(int idx)
 /*
  * Handle hitting a HW-breakpoint.
  */
-static void ptrace_hbptriggered(struct perf_event *bp, int unused,
+static void ptrace_hbptriggered(struct perf_event *bp,
 				     struct perf_sample_data *data,
 				     struct pt_regs *regs)
 {
diff --git a/arch/arm/kernel/swp_emulate.c b/arch/arm/kernel/swp_emulate.c
index 40ee7e5045e4..5f452f8fde05 100644
--- a/arch/arm/kernel/swp_emulate.c
+++ b/arch/arm/kernel/swp_emulate.c
@@ -183,7 +183,7 @@ static int swp_handler(struct pt_regs *regs, unsigned int instr)
 	unsigned int address, destreg, data, type;
 	unsigned int res = 0;
 
-	perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, 0, regs, regs->ARM_pc);
+	perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, regs->ARM_pc);
 
 	if (current->pid != previous_pid) {
 		pr_debug("\"%s\" (%ld) uses deprecated SWP{B} instruction\n",
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index bc0e1d88fd3b..9ea4f7ddd665 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -318,11 +318,11 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 	fault = __do_page_fault(mm, addr, fsr, tsk);
 	up_read(&mm->mmap_sem);
 
-	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, addr);
+	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
 	if (fault & VM_FAULT_MAJOR)
-		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, regs, addr);
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, addr);
 	else if (fault & VM_FAULT_MINOR)
-		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, regs, addr);
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, addr);
 
 	/*
 	 * Handle the "normal" case first - VM_FAULT_MAJOR / VM_FAULT_MINOR
diff --git a/arch/mips/kernel/perf_event.c b/arch/mips/kernel/perf_event.c
index a8244854d3dc..d0deaab9ace2 100644
--- a/arch/mips/kernel/perf_event.c
+++ b/arch/mips/kernel/perf_event.c
@@ -527,7 +527,7 @@ handle_associated_event(struct cpu_hw_events *cpuc,
 	if (!mipspmu_event_set_period(event, hwc, idx))
 		return;
 
-	if (perf_event_overflow(event, 0, data, regs))
+	if (perf_event_overflow(event, data, regs))
 		mipspmu->disable_event(idx);
 }
 
diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
index e9b3af27d844..b7517e3abc85 100644
--- a/arch/mips/kernel/traps.c
+++ b/arch/mips/kernel/traps.c
@@ -578,12 +578,12 @@ static int simulate_llsc(struct pt_regs *regs, unsigned int opcode)
 {
 	if ((opcode & OPCODE) == LL) {
 		perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS,
-				1, 0, regs, 0);
+				1, regs, 0);
 		return simulate_ll(regs, opcode);
 	}
 	if ((opcode & OPCODE) == SC) {
 		perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS,
-				1, 0, regs, 0);
+				1, regs, 0);
 		return simulate_sc(regs, opcode);
 	}
 
@@ -602,7 +602,7 @@ static int simulate_rdhwr(struct pt_regs *regs, unsigned int opcode)
 		int rd = (opcode & RD) >> 11;
 		int rt = (opcode & RT) >> 16;
 		perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS,
-				1, 0, regs, 0);
+				1, regs, 0);
 		switch (rd) {
 		case 0:		/* CPU number */
 			regs->regs[rt] = smp_processor_id();
@@ -640,7 +640,7 @@ static int simulate_sync(struct pt_regs *regs, unsigned int opcode)
 {
 	if ((opcode & OPCODE) == SPEC0 && (opcode & FUNC) == SYNC) {
 		perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS,
-				1, 0, regs, 0);
+				1, regs, 0);
 		return 0;
 	}
 
diff --git a/arch/mips/kernel/unaligned.c b/arch/mips/kernel/unaligned.c
index cfea1adfa153..eb319b580353 100644
--- a/arch/mips/kernel/unaligned.c
+++ b/arch/mips/kernel/unaligned.c
@@ -111,8 +111,7 @@ static void emulate_load_store_insn(struct pt_regs *regs,
 	unsigned long value;
 	unsigned int res;
 
-	perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS,
-		      1, 0, regs, 0);
+	perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, 0);
 
 	/*
 	 * This load never faults.
@@ -517,7 +516,7 @@ asmlinkage void do_ade(struct pt_regs *regs)
 	mm_segment_t seg;
 
 	perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS,
-			1, 0, regs, regs->cp0_badvaddr);
+			1, regs, regs->cp0_badvaddr);
 	/*
 	 * Did we catch a fault trying to load an instruction?
 	 * Or are we running in MIPS16 mode?
diff --git a/arch/mips/math-emu/cp1emu.c b/arch/mips/math-emu/cp1emu.c
index d32cb0503110..dbf2f93a5091 100644
--- a/arch/mips/math-emu/cp1emu.c
+++ b/arch/mips/math-emu/cp1emu.c
@@ -272,8 +272,7 @@ static int cop1Emulate(struct pt_regs *xcp, struct mips_fpu_struct *ctx,
 	}
 
       emul:
-	perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS,
-			1, 0, xcp, 0);
+	perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, xcp, 0);
 	MIPS_FPU_EMU_INC_STATS(emulated);
 	switch (MIPSInst_OPCODE(ir)) {
 	case ldc1_op:{
diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c
index 137ee76a0045..937cf3368164 100644
--- a/arch/mips/mm/fault.c
+++ b/arch/mips/mm/fault.c
@@ -145,7 +145,7 @@ good_area:
 	 * the fault.
 	 */
 	fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0);
-	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
+	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
 	if (unlikely(fault & VM_FAULT_ERROR)) {
 		if (fault & VM_FAULT_OOM)
 			goto out_of_memory;
@@ -154,12 +154,10 @@ good_area:
 		BUG();
 	}
 	if (fault & VM_FAULT_MAJOR) {
-		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ,
-				1, 0, regs, address);
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
 		tsk->maj_flt++;
 	} else {
-		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN,
-				1, 0, regs, address);
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
 		tsk->min_flt++;
 	}
 
diff --git a/arch/powerpc/include/asm/emulated_ops.h b/arch/powerpc/include/asm/emulated_ops.h
index 45921672b97a..2cc41c715d2b 100644
--- a/arch/powerpc/include/asm/emulated_ops.h
+++ b/arch/powerpc/include/asm/emulated_ops.h
@@ -78,14 +78,14 @@ extern void ppc_warn_emulated_print(const char *type);
 #define PPC_WARN_EMULATED(type, regs)					\
 	do {								\
 		perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS,		\
-			1, 0, regs, 0);					\
+			1, regs, 0);					\
 		__PPC_WARN_EMULATED(type);				\
 	} while (0)
 
 #define PPC_WARN_ALIGNMENT(type, regs)					\
 	do {								\
 		perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS,		\
-			1, 0, regs, regs->dar);				\
+			1, regs, regs->dar);				\
 		__PPC_WARN_EMULATED(type);				\
 	} while (0)
 
diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c
index 822f63008ae1..14967de98876 100644
--- a/arch/powerpc/kernel/perf_event.c
+++ b/arch/powerpc/kernel/perf_event.c
@@ -1207,7 +1207,7 @@ struct pmu power_pmu = {
  * here so there is no possibility of being interrupted.
  */
 static void record_and_restart(struct perf_event *event, unsigned long val,
-			       struct pt_regs *regs, int nmi)
+			       struct pt_regs *regs)
 {
 	u64 period = event->hw.sample_period;
 	s64 prev, delta, left;
@@ -1258,7 +1258,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
 		if (event->attr.sample_type & PERF_SAMPLE_ADDR)
 			perf_get_data_addr(regs, &data.addr);
 
-		if (perf_event_overflow(event, nmi, &data, regs))
+		if (perf_event_overflow(event, &data, regs))
 			power_pmu_stop(event, 0);
 	}
 }
@@ -1346,7 +1346,7 @@ static void perf_event_interrupt(struct pt_regs *regs)
 		if ((int)val < 0) {
 			/* event has overflowed */
 			found = 1;
-			record_and_restart(event, val, regs, nmi);
+			record_and_restart(event, val, regs);
 		}
 	}
 
diff --git a/arch/powerpc/kernel/perf_event_fsl_emb.c b/arch/powerpc/kernel/perf_event_fsl_emb.c
index b0dc8f7069cd..0a6d2a9d569c 100644
--- a/arch/powerpc/kernel/perf_event_fsl_emb.c
+++ b/arch/powerpc/kernel/perf_event_fsl_emb.c
@@ -568,7 +568,7 @@ static struct pmu fsl_emb_pmu = {
  * here so there is no possibility of being interrupted.
  */
 static void record_and_restart(struct perf_event *event, unsigned long val,
-			       struct pt_regs *regs, int nmi)
+			       struct pt_regs *regs)
 {
 	u64 period = event->hw.sample_period;
 	s64 prev, delta, left;
@@ -616,7 +616,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
 		perf_sample_data_init(&data, 0);
 		data.period = event->hw.last_period;
 
-		if (perf_event_overflow(event, nmi, &data, regs))
+		if (perf_event_overflow(event, &data, regs))
 			fsl_emb_pmu_stop(event, 0);
 	}
 }
@@ -644,7 +644,7 @@ static void perf_event_interrupt(struct pt_regs *regs)
 			if (event) {
 				/* event has overflowed */
 				found = 1;
-				record_and_restart(event, val, regs, nmi);
+				record_and_restart(event, val, regs);
 			} else {
 				/*
 				 * Disabled counter is negative,
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index cb22024f2b42..3177617af2ef 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -882,7 +882,7 @@ void user_disable_single_step(struct task_struct *task)
 }
 
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
-void ptrace_triggered(struct perf_event *bp, int nmi,
+void ptrace_triggered(struct perf_event *bp,
 		      struct perf_sample_data *data, struct pt_regs *regs)
 {
 	struct perf_event_attr attr;
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 54f4fb994e99..dbc48254c6cc 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -173,7 +173,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 		die("Weird page fault", regs, SIGSEGV);
 	}
 
-	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
+	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
 
 	/* When running in the kernel we expect faults to occur only to
 	 * addresses in user space.  All other faults represent errors in the
@@ -319,7 +319,7 @@ good_area:
 	}
 	if (ret & VM_FAULT_MAJOR) {
 		current->maj_flt++;
-		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
 				     regs, address);
 #ifdef CONFIG_PPC_SMLPAR
 		if (firmware_has_feature(FW_FEATURE_CMO)) {
@@ -330,7 +330,7 @@ good_area:
 #endif
 	} else {
 		current->min_flt++;
-		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
 				     regs, address);
 	}
 	up_read(&mm->mmap_sem);
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index fe103e891e7a..095f782a5512 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -299,7 +299,7 @@ static inline int do_exception(struct pt_regs *regs, int access,
 		goto out;
 
 	address = trans_exc_code & __FAIL_ADDR_MASK;
-	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
+	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
 	flags = FAULT_FLAG_ALLOW_RETRY;
 	if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400)
 		flags |= FAULT_FLAG_WRITE;
@@ -345,11 +345,11 @@ retry:
 	if (flags & FAULT_FLAG_ALLOW_RETRY) {
 		if (fault & VM_FAULT_MAJOR) {
 			tsk->maj_flt++;
-			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
+			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
 				      regs, address);
 		} else {
 			tsk->min_flt++;
-			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
+			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
 				      regs, address);
 		}
 		if (fault & VM_FAULT_RETRY) {
diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c
index 3d7b209b2178..8051976100a6 100644
--- a/arch/sh/kernel/ptrace_32.c
+++ b/arch/sh/kernel/ptrace_32.c
@@ -63,7 +63,7 @@ static inline int put_stack_long(struct task_struct *task, int offset,
 	return 0;
 }
 
-void ptrace_triggered(struct perf_event *bp, int nmi,
+void ptrace_triggered(struct perf_event *bp,
 		      struct perf_sample_data *data, struct pt_regs *regs)
 {
 	struct perf_event_attr attr;
diff --git a/arch/sh/kernel/traps_32.c b/arch/sh/kernel/traps_32.c
index b51a17104b5f..d9006f8ffc14 100644
--- a/arch/sh/kernel/traps_32.c
+++ b/arch/sh/kernel/traps_32.c
@@ -393,7 +393,7 @@ int handle_unaligned_access(insn_size_t instruction, struct pt_regs *regs,
 	 */
 	if (!expected) {
 		unaligned_fixups_notify(current, instruction, regs);
-		perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, 0,
+		perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1,
 			      regs, address);
 	}
 
diff --git a/arch/sh/kernel/traps_64.c b/arch/sh/kernel/traps_64.c
index 6713ca97e553..67110be83fd7 100644
--- a/arch/sh/kernel/traps_64.c
+++ b/arch/sh/kernel/traps_64.c
@@ -434,7 +434,7 @@ static int misaligned_load(struct pt_regs *regs,
 		return error;
 	}
 
-	perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, 0, regs, address);
+	perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, address);
 
 	destreg = (opcode >> 4) & 0x3f;
 	if (user_mode(regs)) {
@@ -512,7 +512,7 @@ static int misaligned_store(struct pt_regs *regs,
 		return error;
 	}
 
-	perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, 0, regs, address);
+	perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, address);
 
 	srcreg = (opcode >> 4) & 0x3f;
 	if (user_mode(regs)) {
@@ -588,7 +588,7 @@ static int misaligned_fpu_load(struct pt_regs *regs,
 		return error;
 	}
 
-	perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, 0, regs, address);
+	perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, address);
 
 	destreg = (opcode >> 4) & 0x3f;
 	if (user_mode(regs)) {
@@ -665,7 +665,7 @@ static int misaligned_fpu_store(struct pt_regs *regs,
 		return error;
 	}
 
-	perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, 0, regs, address);
+	perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, address);
 
 	srcreg = (opcode >> 4) & 0x3f;
 	if (user_mode(regs)) {
diff --git a/arch/sh/math-emu/math.c b/arch/sh/math-emu/math.c
index f76a5090d5d1..977195210653 100644
--- a/arch/sh/math-emu/math.c
+++ b/arch/sh/math-emu/math.c
@@ -620,7 +620,7 @@ int do_fpu_inst(unsigned short inst, struct pt_regs *regs)
 	struct task_struct *tsk = current;
 	struct sh_fpu_soft_struct *fpu = &(tsk->thread.xstate->softfpu);
 
-	perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, 0, regs, 0);
+	perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, 0);
 
 	if (!(task_thread_info(tsk)->status & TS_USEDFPU)) {
 		/* initialize once. */
diff --git a/arch/sh/mm/fault_32.c b/arch/sh/mm/fault_32.c
index d4c34d757f0d..7bebd044f2a1 100644
--- a/arch/sh/mm/fault_32.c
+++ b/arch/sh/mm/fault_32.c
@@ -160,7 +160,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 	if ((regs->sr & SR_IMASK) != SR_IMASK)
 		local_irq_enable();
 
-	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
+	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
 
 	/*
 	 * If we're in an interrupt, have no user context or are running
@@ -210,11 +210,11 @@ good_area:
 	}
 	if (fault & VM_FAULT_MAJOR) {
 		tsk->maj_flt++;
-		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
 				     regs, address);
 	} else {
 		tsk->min_flt++;
-		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
 				     regs, address);
 	}
 
diff --git a/arch/sh/mm/tlbflush_64.c b/arch/sh/mm/tlbflush_64.c
index 7f5810f5dfdc..e3430e093d43 100644
--- a/arch/sh/mm/tlbflush_64.c
+++ b/arch/sh/mm/tlbflush_64.c
@@ -116,7 +116,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long writeaccess,
 	/* Not an IO address, so reenable interrupts */
 	local_irq_enable();
 
-	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
+	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
 
 	/*
 	 * If we're in an interrupt or have no user
@@ -200,11 +200,11 @@ good_area:
 
 	if (fault & VM_FAULT_MAJOR) {
 		tsk->maj_flt++;
-		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
 				     regs, address);
 	} else {
 		tsk->min_flt++;
-		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
 				     regs, address);
 	}
 
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index 2cb0e1c001e2..0b32f2e9e08d 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -1277,7 +1277,7 @@ static int __kprobes perf_event_nmi_handler(struct notifier_block *self,
 		if (!sparc_perf_event_set_period(event, hwc, idx))
 			continue;
 
-		if (perf_event_overflow(event, 1, &data, regs))
+		if (perf_event_overflow(event, &data, regs))
 			sparc_pmu_stop(event, 0);
 	}
 
diff --git a/arch/sparc/kernel/unaligned_32.c b/arch/sparc/kernel/unaligned_32.c
index 4491f4cb2695..7efbb2f9e77f 100644
--- a/arch/sparc/kernel/unaligned_32.c
+++ b/arch/sparc/kernel/unaligned_32.c
@@ -247,7 +247,7 @@ asmlinkage void kernel_unaligned_trap(struct pt_regs *regs, unsigned int insn)
 		unsigned long addr = compute_effective_address(regs, insn);
 		int err;
 
-		perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, 0, regs, addr);
+		perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, addr);
 		switch (dir) {
 		case load:
 			err = do_int_load(fetch_reg_addr(((insn>>25)&0x1f),
@@ -338,7 +338,7 @@ asmlinkage void user_unaligned_trap(struct pt_regs *regs, unsigned int insn)
 		}
 
 		addr = compute_effective_address(regs, insn);
-		perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, 0, regs, addr);
+		perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, addr);
 		switch(dir) {
 		case load:
 			err = do_int_load(fetch_reg_addr(((insn>>25)&0x1f),
diff --git a/arch/sparc/kernel/unaligned_64.c b/arch/sparc/kernel/unaligned_64.c
index b2b019ea8caa..35cff1673aa4 100644
--- a/arch/sparc/kernel/unaligned_64.c
+++ b/arch/sparc/kernel/unaligned_64.c
@@ -317,7 +317,7 @@ asmlinkage void kernel_unaligned_trap(struct pt_regs *regs, unsigned int insn)
 
 		addr = compute_effective_address(regs, insn,
 						 ((insn >> 25) & 0x1f));
-		perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, 0, regs, addr);
+		perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, addr);
 		switch (asi) {
 		case ASI_NL:
 		case ASI_AIUPL:
@@ -384,7 +384,7 @@ int handle_popc(u32 insn, struct pt_regs *regs)
 	int ret, i, rd = ((insn >> 25) & 0x1f);
 	int from_kernel = (regs->tstate & TSTATE_PRIV) != 0;
 	                        
-	perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, 0, regs, 0);
+	perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, 0);
 	if (insn & 0x2000) {
 		maybe_flush_windows(0, 0, rd, from_kernel);
 		value = sign_extend_imm13(insn);
@@ -431,7 +431,7 @@ int handle_ldf_stq(u32 insn, struct pt_regs *regs)
 	int asi = decode_asi(insn, regs);
 	int flag = (freg < 32) ? FPRS_DL : FPRS_DU;
 
-	perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, 0, regs, 0);
+	perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, 0);
 
 	save_and_clear_fpu();
 	current_thread_info()->xfsr[0] &= ~0x1c000;
@@ -554,7 +554,7 @@ void handle_ld_nf(u32 insn, struct pt_regs *regs)
 	int from_kernel = (regs->tstate & TSTATE_PRIV) != 0;
 	unsigned long *reg;
 	                        
-	perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, 0, regs, 0);
+	perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, 0);
 
 	maybe_flush_windows(0, 0, rd, from_kernel);
 	reg = fetch_reg_addr(rd, regs);
@@ -586,7 +586,7 @@ void handle_lddfmna(struct pt_regs *regs, unsigned long sfar, unsigned long sfsr
 
 	if (tstate & TSTATE_PRIV)
 		die_if_kernel("lddfmna from kernel", regs);
-	perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, 0, regs, sfar);
+	perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, sfar);
 	if (test_thread_flag(TIF_32BIT))
 		pc = (u32)pc;
 	if (get_user(insn, (u32 __user *) pc) != -EFAULT) {
@@ -647,7 +647,7 @@ void handle_stdfmna(struct pt_regs *regs, unsigned long sfar, unsigned long sfsr
 
 	if (tstate & TSTATE_PRIV)
 		die_if_kernel("stdfmna from kernel", regs);
-	perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, 0, regs, sfar);
+	perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, sfar);
 	if (test_thread_flag(TIF_32BIT))
 		pc = (u32)pc;
 	if (get_user(insn, (u32 __user *) pc) != -EFAULT) {
diff --git a/arch/sparc/kernel/visemul.c b/arch/sparc/kernel/visemul.c
index 36357717d691..32b626c9d815 100644
--- a/arch/sparc/kernel/visemul.c
+++ b/arch/sparc/kernel/visemul.c
@@ -802,7 +802,7 @@ int vis_emul(struct pt_regs *regs, unsigned int insn)
 
 	BUG_ON(regs->tstate & TSTATE_PRIV);
 
-	perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, 0, regs, 0);
+	perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, 0);
 
 	if (test_thread_flag(TIF_32BIT))
 		pc = (u32)pc;
diff --git a/arch/sparc/math-emu/math_32.c b/arch/sparc/math-emu/math_32.c
index a3fccde894ec..aa4d55b0bdf0 100644
--- a/arch/sparc/math-emu/math_32.c
+++ b/arch/sparc/math-emu/math_32.c
@@ -164,7 +164,7 @@ int do_mathemu(struct pt_regs *regs, struct task_struct *fpt)
 	int retcode = 0;                               /* assume all succeed */
 	unsigned long insn;
 
-	perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, 0, regs, 0);
+	perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, 0);
 
 #ifdef DEBUG_MATHEMU
 	printk("In do_mathemu()... pc is %08lx\n", regs->pc);
diff --git a/arch/sparc/math-emu/math_64.c b/arch/sparc/math-emu/math_64.c
index 56d2c44747b8..e575bd2fe381 100644
--- a/arch/sparc/math-emu/math_64.c
+++ b/arch/sparc/math-emu/math_64.c
@@ -184,7 +184,7 @@ int do_mathemu(struct pt_regs *regs, struct fpustate *f)
 
 	if (tstate & TSTATE_PRIV)
 		die_if_kernel("unfinished/unimplemented FPop from kernel", regs);
-	perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, 0, regs, 0);
+	perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, 0);
 	if (test_thread_flag(TIF_32BIT))
 		pc = (u32)pc;
 	if (get_user(insn, (u32 __user *) pc) != -EFAULT) {
diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c
index 7543ddbdadb2..aa1c1b1ce5cc 100644
--- a/arch/sparc/mm/fault_32.c
+++ b/arch/sparc/mm/fault_32.c
@@ -251,7 +251,7 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write,
         if (in_atomic() || !mm)
                 goto no_context;
 
-	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
+	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
 
 	down_read(&mm->mmap_sem);
 
@@ -301,12 +301,10 @@ good_area:
 	}
 	if (fault & VM_FAULT_MAJOR) {
 		current->maj_flt++;
-		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
-			      regs, address);
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
 	} else {
 		current->min_flt++;
-		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
-			      regs, address);
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
 	}
 	up_read(&mm->mmap_sem);
 	return;
diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c
index f92ce56a8b22..504c0622f729 100644
--- a/arch/sparc/mm/fault_64.c
+++ b/arch/sparc/mm/fault_64.c
@@ -325,7 +325,7 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs)
 	if (in_atomic() || !mm)
 		goto intr_or_no_mm;
 
-	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
+	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
 
 	if (!down_read_trylock(&mm->mmap_sem)) {
 		if ((regs->tstate & TSTATE_PRIV) &&
@@ -433,12 +433,10 @@ good_area:
 	}
 	if (fault & VM_FAULT_MAJOR) {
 		current->maj_flt++;
-		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
-			      regs, address);
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
 	} else {
 		current->min_flt++;
-		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
-			      regs, address);
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
 	}
 	up_read(&mm->mmap_sem);
 
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 8a57f9aa8e36..5b86ec51534c 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1339,7 +1339,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
 		if (!x86_perf_event_set_period(event))
 			continue;
 
-		if (perf_event_overflow(event, 1, &data, regs))
+		if (perf_event_overflow(event, &data, regs))
 			x86_pmu_stop(event, 0);
 	}
 
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 41178c826c48..d38b0020f775 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1003,7 +1003,7 @@ again:
 
 		data.period = event->hw.last_period;
 
-		if (perf_event_overflow(event, 1, &data, regs))
+		if (perf_event_overflow(event, &data, regs))
 			x86_pmu_stop(event, 0);
 	}
 
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index bab491b8ee25..0941f93f2940 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -340,7 +340,7 @@ static int intel_pmu_drain_bts_buffer(void)
 	 */
 	perf_prepare_sample(&header, &data, event, &regs);
 
-	if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1))
+	if (perf_output_begin(&handle, event, header.size * (top - at), 1))
 		return 1;
 
 	for (; at < top; at++) {
@@ -616,7 +616,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
 	else
 		regs.flags &= ~PERF_EFLAGS_EXACT;
 
-	if (perf_event_overflow(event, 1, &data, &regs))
+	if (perf_event_overflow(event, &data, &regs))
 		x86_pmu_stop(event, 0);
 }
 
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index f76fddf63381..d6e6a67b9608 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -970,7 +970,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
 
 		if (!x86_perf_event_set_period(event))
 			continue;
-		if (perf_event_overflow(event, 1, &data, regs))
+		if (perf_event_overflow(event, &data, regs))
 			x86_pmu_stop(event, 0);
 	}
 
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 5f9ecff328b5..98da6a7b5e82 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -608,7 +608,7 @@ int kgdb_arch_init(void)
 	return register_die_notifier(&kgdb_notifier);
 }
 
-static void kgdb_hw_overflow_handler(struct perf_event *event, int nmi,
+static void kgdb_hw_overflow_handler(struct perf_event *event,
 		struct perf_sample_data *data, struct pt_regs *regs)
 {
 	struct task_struct *tsk = current;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 807c2a2b80f1..11db2e9b860a 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -528,7 +528,7 @@ static int genregs_set(struct task_struct *target,
 	return ret;
 }
 
-static void ptrace_triggered(struct perf_event *bp, int nmi,
+static void ptrace_triggered(struct perf_event *bp,
 			     struct perf_sample_data *data,
 			     struct pt_regs *regs)
 {
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 2dbf6bf4c7e5..4d09df054e39 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1059,7 +1059,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	if (unlikely(error_code & PF_RSVD))
 		pgtable_bad(regs, error_code, address);
 
-	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
+	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
 
 	/*
 	 * If we're in an interrupt, have no user context or are running
@@ -1161,11 +1161,11 @@ good_area:
 	if (flags & FAULT_FLAG_ALLOW_RETRY) {
 		if (fault & VM_FAULT_MAJOR) {
 			tsk->maj_flt++;
-			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
+			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
 				      regs, address);
 		} else {
 			tsk->min_flt++;
-			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
+			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
 				      regs, address);
 		}
 		if (fault & VM_FAULT_RETRY) {
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2f7b5d42ab41..0946a8bc098d 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -682,7 +682,7 @@ enum perf_event_active_state {
 struct file;
 struct perf_sample_data;
 
-typedef void (*perf_overflow_handler_t)(struct perf_event *, int,
+typedef void (*perf_overflow_handler_t)(struct perf_event *,
 					struct perf_sample_data *,
 					struct pt_regs *regs);
 
@@ -925,7 +925,6 @@ struct perf_output_handle {
 	unsigned long			size;
 	void				*addr;
 	int				page;
-	int				nmi;
 	int				sample;
 };
 
@@ -993,7 +992,7 @@ extern void perf_prepare_sample(struct perf_event_header *header,
 				struct perf_event *event,
 				struct pt_regs *regs);
 
-extern int perf_event_overflow(struct perf_event *event, int nmi,
+extern int perf_event_overflow(struct perf_event *event,
 				 struct perf_sample_data *data,
 				 struct pt_regs *regs);
 
@@ -1012,7 +1011,7 @@ static inline int is_software_event(struct perf_event *event)
 
 extern struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
 
-extern void __perf_sw_event(u32, u64, int, struct pt_regs *, u64);
+extern void __perf_sw_event(u32, u64, struct pt_regs *, u64);
 
 #ifndef perf_arch_fetch_caller_regs
 static inline void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip) { }
@@ -1034,7 +1033,7 @@ static inline void perf_fetch_caller_regs(struct pt_regs *regs)
 }
 
 static __always_inline void
-perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
+perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
 {
 	struct pt_regs hot_regs;
 
@@ -1043,7 +1042,7 @@ perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
 			perf_fetch_caller_regs(&hot_regs);
 			regs = &hot_regs;
 		}
-		__perf_sw_event(event_id, nr, nmi, regs, addr);
+		__perf_sw_event(event_id, nr, regs, addr);
 	}
 }
 
@@ -1057,7 +1056,7 @@ static inline void perf_event_task_sched_in(struct task_struct *task)
 
 static inline void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next)
 {
-	perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
+	perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, NULL, 0);
 
 	__perf_event_task_sched_out(task, next);
 }
@@ -1119,7 +1118,7 @@ extern void perf_bp_event(struct perf_event *event, void *data);
 
 extern int perf_output_begin(struct perf_output_handle *handle,
 			     struct perf_event *event, unsigned int size,
-			     int nmi, int sample);
+			     int sample);
 extern void perf_output_end(struct perf_output_handle *handle);
 extern void perf_output_copy(struct perf_output_handle *handle,
 			     const void *buf, unsigned int len);
@@ -1143,8 +1142,7 @@ static inline int perf_event_task_disable(void)				{ return -EINVAL; }
 static inline int perf_event_task_enable(void)				{ return -EINVAL; }
 
 static inline void
-perf_sw_event(u32 event_id, u64 nr, int nmi,
-		     struct pt_regs *regs, u64 addr)			{ }
+perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)	{ }
 static inline void
 perf_bp_event(struct perf_event *event, void *data)			{ }
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 270e32f9fc06..dbd1ca75bd3c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3972,7 +3972,7 @@ void perf_prepare_sample(struct perf_event_header *header,
 	}
 }
 
-static void perf_event_output(struct perf_event *event, int nmi,
+static void perf_event_output(struct perf_event *event,
 				struct perf_sample_data *data,
 				struct pt_regs *regs)
 {
@@ -3984,7 +3984,7 @@ static void perf_event_output(struct perf_event *event, int nmi,
 
 	perf_prepare_sample(&header, data, event, regs);
 
-	if (perf_output_begin(&handle, event, header.size, nmi, 1))
+	if (perf_output_begin(&handle, event, header.size, 1))
 		goto exit;
 
 	perf_output_sample(&handle, &header, data, event);
@@ -4024,7 +4024,7 @@ perf_event_read_event(struct perf_event *event,
 	int ret;
 
 	perf_event_header__init_id(&read_event.header, &sample, event);
-	ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
+	ret = perf_output_begin(&handle, event, read_event.header.size, 0);
 	if (ret)
 		return;
 
@@ -4067,7 +4067,7 @@ static void perf_event_task_output(struct perf_event *event,
 	perf_event_header__init_id(&task_event->event_id.header, &sample, event);
 
 	ret = perf_output_begin(&handle, event,
-				task_event->event_id.header.size, 0, 0);
+				task_event->event_id.header.size, 0);
 	if (ret)
 		goto out;
 
@@ -4204,7 +4204,7 @@ static void perf_event_comm_output(struct perf_event *event,
 
 	perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
 	ret = perf_output_begin(&handle, event,
-				comm_event->event_id.header.size, 0, 0);
+				comm_event->event_id.header.size, 0);
 
 	if (ret)
 		goto out;
@@ -4351,7 +4351,7 @@ static void perf_event_mmap_output(struct perf_event *event,
 
 	perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
 	ret = perf_output_begin(&handle, event,
-				mmap_event->event_id.header.size, 0, 0);
+				mmap_event->event_id.header.size, 0);
 	if (ret)
 		goto out;
 
@@ -4546,7 +4546,7 @@ static void perf_log_throttle(struct perf_event *event, int enable)
 	perf_event_header__init_id(&throttle_event.header, &sample, event);
 
 	ret = perf_output_begin(&handle, event,
-				throttle_event.header.size, 1, 0);
+				throttle_event.header.size, 0);
 	if (ret)
 		return;
 
@@ -4559,7 +4559,7 @@ static void perf_log_throttle(struct perf_event *event, int enable)
  * Generic event overflow handling, sampling.
  */
 
-static int __perf_event_overflow(struct perf_event *event, int nmi,
+static int __perf_event_overflow(struct perf_event *event,
 				   int throttle, struct perf_sample_data *data,
 				   struct pt_regs *regs)
 {
@@ -4602,34 +4602,28 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
 	if (events && atomic_dec_and_test(&event->event_limit)) {
 		ret = 1;
 		event->pending_kill = POLL_HUP;
-		if (nmi) {
-			event->pending_disable = 1;
-			irq_work_queue(&event->pending);
-		} else
-			perf_event_disable(event);
+		event->pending_disable = 1;
+		irq_work_queue(&event->pending);
 	}
 
 	if (event->overflow_handler)
-		event->overflow_handler(event, nmi, data, regs);
+		event->overflow_handler(event, data, regs);
 	else
-		perf_event_output(event, nmi, data, regs);
+		perf_event_output(event, data, regs);
 
 	if (event->fasync && event->pending_kill) {
-		if (nmi) {
-			event->pending_wakeup = 1;
-			irq_work_queue(&event->pending);
-		} else
-			perf_event_wakeup(event);
+		event->pending_wakeup = 1;
+		irq_work_queue(&event->pending);
 	}
 
 	return ret;
 }
 
-int perf_event_overflow(struct perf_event *event, int nmi,
+int perf_event_overflow(struct perf_event *event,
 			  struct perf_sample_data *data,
 			  struct pt_regs *regs)
 {
-	return __perf_event_overflow(event, nmi, 1, data, regs);
+	return __perf_event_overflow(event, 1, data, regs);
 }
 
 /*
@@ -4678,7 +4672,7 @@ again:
 }
 
 static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
-				    int nmi, struct perf_sample_data *data,
+				    struct perf_sample_data *data,
 				    struct pt_regs *regs)
 {
 	struct hw_perf_event *hwc = &event->hw;
@@ -4692,7 +4686,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
 		return;
 
 	for (; overflow; overflow--) {
-		if (__perf_event_overflow(event, nmi, throttle,
+		if (__perf_event_overflow(event, throttle,
 					    data, regs)) {
 			/*
 			 * We inhibit the overflow from happening when
@@ -4705,7 +4699,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
 }
 
 static void perf_swevent_event(struct perf_event *event, u64 nr,
-			       int nmi, struct perf_sample_data *data,
+			       struct perf_sample_data *data,
 			       struct pt_regs *regs)
 {
 	struct hw_perf_event *hwc = &event->hw;
@@ -4719,12 +4713,12 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
 		return;
 
 	if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
-		return perf_swevent_overflow(event, 1, nmi, data, regs);
+		return perf_swevent_overflow(event, 1, data, regs);
 
 	if (local64_add_negative(nr, &hwc->period_left))
 		return;
 
-	perf_swevent_overflow(event, 0, nmi, data, regs);
+	perf_swevent_overflow(event, 0, data, regs);
 }
 
 static int perf_exclude_event(struct perf_event *event,
@@ -4812,7 +4806,7 @@ find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
 }
 
 static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
-				    u64 nr, int nmi,
+				    u64 nr,
 				    struct perf_sample_data *data,
 				    struct pt_regs *regs)
 {
@@ -4828,7 +4822,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
 
 	hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
 		if (perf_swevent_match(event, type, event_id, data, regs))
-			perf_swevent_event(event, nr, nmi, data, regs);
+			perf_swevent_event(event, nr, data, regs);
 	}
 end:
 	rcu_read_unlock();
@@ -4849,8 +4843,7 @@ inline void perf_swevent_put_recursion_context(int rctx)
 	put_recursion_context(swhash->recursion, rctx);
 }
 
-void __perf_sw_event(u32 event_id, u64 nr, int nmi,
-			    struct pt_regs *regs, u64 addr)
+void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
 {
 	struct perf_sample_data data;
 	int rctx;
@@ -4862,7 +4855,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
 
 	perf_sample_data_init(&data, addr);
 
-	do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
+	do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
 
 	perf_swevent_put_recursion_context(rctx);
 	preempt_enable_notrace();
@@ -5110,7 +5103,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
 
 	hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
 		if (perf_tp_event_match(event, &data, regs))
-			perf_swevent_event(event, count, 1, &data, regs);
+			perf_swevent_event(event, count, &data, regs);
 	}
 
 	perf_swevent_put_recursion_context(rctx);
@@ -5203,7 +5196,7 @@ void perf_bp_event(struct perf_event *bp, void *data)
 	perf_sample_data_init(&sample, bp->attr.bp_addr);
 
 	if (!bp->hw.state && !perf_exclude_event(bp, regs))
-		perf_swevent_event(bp, 1, 1, &sample, regs);
+		perf_swevent_event(bp, 1, &sample, regs);
 }
 #endif
 
@@ -5232,7 +5225,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
 
 	if (regs && !perf_exclude_event(event, regs)) {
 		if (!(event->attr.exclude_idle && current->pid == 0))
-			if (perf_event_overflow(event, 0, &data, regs))
+			if (perf_event_overflow(event, &data, regs))
 				ret = HRTIMER_NORESTART;
 	}
 
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 114f27f3a624..09097dd8116c 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -27,7 +27,6 @@ struct ring_buffer {
 	void				*data_pages[0];
 };
 
-
 extern void rb_free(struct ring_buffer *rb);
 extern struct ring_buffer *
 rb_alloc(int nr_pages, long watermark, int cpu, int flags);
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index fc2701c99207..8b3b73630fa4 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -38,11 +38,8 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
 {
 	atomic_set(&handle->rb->poll, POLL_IN);
 
-	if (handle->nmi) {
-		handle->event->pending_wakeup = 1;
-		irq_work_queue(&handle->event->pending);
-	} else
-		perf_event_wakeup(handle->event);
+	handle->event->pending_wakeup = 1;
+	irq_work_queue(&handle->event->pending);
 }
 
 /*
@@ -102,7 +99,7 @@ out:
 
 int perf_output_begin(struct perf_output_handle *handle,
 		      struct perf_event *event, unsigned int size,
-		      int nmi, int sample)
+		      int sample)
 {
 	struct ring_buffer *rb;
 	unsigned long tail, offset, head;
@@ -127,7 +124,6 @@ int perf_output_begin(struct perf_output_handle *handle,
 
 	handle->rb	= rb;
 	handle->event	= event;
-	handle->nmi	= nmi;
 	handle->sample	= sample;
 
 	if (!rb->nr_pages)
diff --git a/kernel/sched.c b/kernel/sched.c
index 3f2e502d609b..d08d110b8976 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2220,7 +2220,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 
 	if (task_cpu(p) != new_cpu) {
 		p->se.nr_migrations++;
-		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0);
+		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
 	}
 
 	__set_task_cpu(p, new_cpu);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 752b75ba662b..a6708e677a0a 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -211,7 +211,7 @@ static struct perf_event_attr wd_hw_attr = {
 };
 
 /* Callback function for perf event subsystem */
-static void watchdog_overflow_callback(struct perf_event *event, int nmi,
+static void watchdog_overflow_callback(struct perf_event *event,
 		 struct perf_sample_data *data,
 		 struct pt_regs *regs)
 {
diff --git a/samples/hw_breakpoint/data_breakpoint.c b/samples/hw_breakpoint/data_breakpoint.c
index 063653955f9f..7b164d3200ff 100644
--- a/samples/hw_breakpoint/data_breakpoint.c
+++ b/samples/hw_breakpoint/data_breakpoint.c
@@ -41,7 +41,7 @@ module_param_string(ksym, ksym_name, KSYM_NAME_LEN, S_IRUGO);
 MODULE_PARM_DESC(ksym, "Kernel symbol to monitor; this module will report any"
 			" write operations on the kernel symbol");
 
-static void sample_hbp_handler(struct perf_event *bp, int nmi,
+static void sample_hbp_handler(struct perf_event *bp,
 			       struct perf_sample_data *data,
 			       struct pt_regs *regs)
 {
-- 
cgit v1.2.3


From a7ac67ea021b4603095d2aa458bc41641238f22c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 27 Jun 2011 16:47:16 +0200
Subject: perf: Remove the perf_output_begin(.sample) argument

Since only samples call perf_output_sample() its much saner (and more
correct) to put the sample logic in there than in the
perf_output_begin()/perf_output_end() pair.

Saves a useless argument, reduces conditionals and shrinks
struct perf_output_handle, win!

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/n/tip-2crpvsx3cqu67q3zqjbnlpsc@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c |  2 +-
 include/linux/perf_event.h                |  4 +---
 kernel/events/core.c                      | 26 ++++++++++++++++++++------
 kernel/events/ring_buffer.c               | 19 +------------------
 4 files changed, 23 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 0941f93f2940..1b1ef3addcfd 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -340,7 +340,7 @@ static int intel_pmu_drain_bts_buffer(void)
 	 */
 	perf_prepare_sample(&header, &data, event, &regs);
 
-	if (perf_output_begin(&handle, event, header.size * (top - at), 1))
+	if (perf_output_begin(&handle, event, header.size * (top - at)))
 		return 1;
 
 	for (; at < top; at++) {
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 0946a8bc098d..771b0b2845e4 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -925,7 +925,6 @@ struct perf_output_handle {
 	unsigned long			size;
 	void				*addr;
 	int				page;
-	int				sample;
 };
 
 #ifdef CONFIG_PERF_EVENTS
@@ -1117,8 +1116,7 @@ extern void perf_bp_event(struct perf_event *event, void *data);
 #endif
 
 extern int perf_output_begin(struct perf_output_handle *handle,
-			     struct perf_event *event, unsigned int size,
-			     int sample);
+			     struct perf_event *event, unsigned int size);
 extern void perf_output_end(struct perf_output_handle *handle);
 extern void perf_output_copy(struct perf_output_handle *handle,
 			     const void *buf, unsigned int len);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index dbd1ca75bd3c..81de28dcca8c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3928,6 +3928,20 @@ void perf_output_sample(struct perf_output_handle *handle,
 			perf_output_put(handle, raw);
 		}
 	}
+
+	if (!event->attr.watermark) {
+		int wakeup_events = event->attr.wakeup_events;
+
+		if (wakeup_events) {
+			struct ring_buffer *rb = handle->rb;
+			int events = local_inc_return(&rb->events);
+
+			if (events >= wakeup_events) {
+				local_sub(wakeup_events, &rb->events);
+				local_inc(&rb->wakeup);
+			}
+		}
+	}
 }
 
 void perf_prepare_sample(struct perf_event_header *header,
@@ -3984,7 +3998,7 @@ static void perf_event_output(struct perf_event *event,
 
 	perf_prepare_sample(&header, data, event, regs);
 
-	if (perf_output_begin(&handle, event, header.size, 1))
+	if (perf_output_begin(&handle, event, header.size))
 		goto exit;
 
 	perf_output_sample(&handle, &header, data, event);
@@ -4024,7 +4038,7 @@ perf_event_read_event(struct perf_event *event,
 	int ret;
 
 	perf_event_header__init_id(&read_event.header, &sample, event);
-	ret = perf_output_begin(&handle, event, read_event.header.size, 0);
+	ret = perf_output_begin(&handle, event, read_event.header.size);
 	if (ret)
 		return;
 
@@ -4067,7 +4081,7 @@ static void perf_event_task_output(struct perf_event *event,
 	perf_event_header__init_id(&task_event->event_id.header, &sample, event);
 
 	ret = perf_output_begin(&handle, event,
-				task_event->event_id.header.size, 0);
+				task_event->event_id.header.size);
 	if (ret)
 		goto out;
 
@@ -4204,7 +4218,7 @@ static void perf_event_comm_output(struct perf_event *event,
 
 	perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
 	ret = perf_output_begin(&handle, event,
-				comm_event->event_id.header.size, 0);
+				comm_event->event_id.header.size);
 
 	if (ret)
 		goto out;
@@ -4351,7 +4365,7 @@ static void perf_event_mmap_output(struct perf_event *event,
 
 	perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
 	ret = perf_output_begin(&handle, event,
-				mmap_event->event_id.header.size, 0);
+				mmap_event->event_id.header.size);
 	if (ret)
 		goto out;
 
@@ -4546,7 +4560,7 @@ static void perf_log_throttle(struct perf_event *event, int enable)
 	perf_event_header__init_id(&throttle_event.header, &sample, event);
 
 	ret = perf_output_begin(&handle, event,
-				throttle_event.header.size, 0);
+				throttle_event.header.size);
 	if (ret)
 		return;
 
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 8b3b73630fa4..a2a29205cc0f 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -98,8 +98,7 @@ out:
 }
 
 int perf_output_begin(struct perf_output_handle *handle,
-		      struct perf_event *event, unsigned int size,
-		      int sample)
+		      struct perf_event *event, unsigned int size)
 {
 	struct ring_buffer *rb;
 	unsigned long tail, offset, head;
@@ -124,7 +123,6 @@ int perf_output_begin(struct perf_output_handle *handle,
 
 	handle->rb	= rb;
 	handle->event	= event;
-	handle->sample	= sample;
 
 	if (!rb->nr_pages)
 		goto out;
@@ -192,21 +190,6 @@ void perf_output_copy(struct perf_output_handle *handle,
 
 void perf_output_end(struct perf_output_handle *handle)
 {
-	struct perf_event *event = handle->event;
-	struct ring_buffer *rb = handle->rb;
-
-	if (handle->sample && !event->attr.watermark) {
-		int wakeup_events = event->attr.wakeup_events;
-
-		if (wakeup_events) {
-			int events = local_inc_return(&rb->events);
-			if (events >= wakeup_events) {
-				local_sub(wakeup_events, &rb->events);
-				local_inc(&rb->wakeup);
-			}
-		}
-	}
-
 	perf_output_put_handle(handle);
 	rcu_read_unlock();
 }
-- 
cgit v1.2.3


From efc9f05df2dd171280dcb736a4d973ffefd5508e Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Mon, 6 Jun 2011 16:57:03 +0200
Subject: perf_events: Update Intel extra regs shared constraints management

This patch improves the code managing the extra shared registers
used for offcore_response events on Intel Nehalem/Westmere. The
idea is to use static allocation instead of dynamic allocation.
This simplifies greatly the get and put constraint routines for
those events.

The patch also renames per_core to shared_regs because the same
data structure gets used whether or not HT is on. When HT is
off, those events still need to coordination because they use
a extra MSR that has to be shared within an event group.

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110606145703.GA7258@quad
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c       |  78 +++++++---
 arch/x86/kernel/cpu/perf_event_intel.c | 260 ++++++++++++++++-----------------
 include/linux/perf_event.h             |  14 +-
 3 files changed, 200 insertions(+), 152 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 5b86ec51534c..019fda7489e7 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -44,6 +44,29 @@ do {								\
 } while (0)
 #endif
 
+/*
+ *          |   NHM/WSM    |      SNB     |
+ * register -------------------------------
+ *          |  HT  | no HT |  HT  | no HT |
+ *-----------------------------------------
+ * offcore  | core | core  | cpu  | core  |
+ * lbr_sel  | core | core  | cpu  | core  |
+ * ld_lat   | cpu  | core  | cpu  | core  |
+ *-----------------------------------------
+ *
+ * Given that there is a small number of shared regs,
+ * we can pre-allocate their slot in the per-cpu
+ * per-core reg tables.
+ */
+enum extra_reg_type {
+	EXTRA_REG_NONE  = -1,	/* not used */
+
+	EXTRA_REG_RSP_0 = 0,	/* offcore_response_0 */
+	EXTRA_REG_RSP_1 = 1,	/* offcore_response_1 */
+
+	EXTRA_REG_MAX		/* number of entries needed */
+};
+
 /*
  * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
  */
@@ -132,11 +155,10 @@ struct cpu_hw_events {
 	struct perf_branch_entry	lbr_entries[MAX_LBR_ENTRIES];
 
 	/*
-	 * Intel percore register state.
-	 * Coordinate shared resources between HT threads.
+	 * manage shared (per-core, per-cpu) registers
+	 * used on Intel NHM/WSM/SNB
 	 */
-	int				percore_used; /* Used by this CPU? */
-	struct intel_percore		*per_core;
+	struct intel_shared_regs	*shared_regs;
 
 	/*
 	 * AMD specific bits
@@ -186,27 +208,46 @@ struct cpu_hw_events {
 #define for_each_event_constraint(e, c)	\
 	for ((e) = (c); (e)->weight; (e)++)
 
+/*
+ * Per register state.
+ */
+struct er_account {
+	raw_spinlock_t		lock;	/* per-core: protect structure */
+	u64			config;	/* extra MSR config */
+	u64			reg;	/* extra MSR number */
+	atomic_t		ref;	/* reference count */
+};
+
 /*
  * Extra registers for specific events.
+ *
  * Some events need large masks and require external MSRs.
- * Define a mapping to these extra registers.
+ * Those extra MSRs end up being shared for all events on
+ * a PMU and sometimes between PMU of sibling HT threads.
+ * In either case, the kernel needs to handle conflicting
+ * accesses to those extra, shared, regs. The data structure
+ * to manage those registers is stored in cpu_hw_event.
  */
 struct extra_reg {
 	unsigned int		event;
 	unsigned int		msr;
 	u64			config_mask;
 	u64			valid_mask;
+	int			idx;  /* per_xxx->regs[] reg index */
 };
 
-#define EVENT_EXTRA_REG(e, ms, m, vm) {	\
+#define EVENT_EXTRA_REG(e, ms, m, vm, i) {	\
 	.event = (e),		\
 	.msr = (ms),		\
 	.config_mask = (m),	\
 	.valid_mask = (vm),	\
+	.idx = EXTRA_REG_##i	\
 	}
-#define INTEL_EVENT_EXTRA_REG(event, msr, vm)	\
-	EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm)
-#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0)
+
+#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx)	\
+	EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx)
+
+#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0)
 
 union perf_capabilities {
 	struct {
@@ -253,7 +294,6 @@ struct x86_pmu {
 	void		(*put_event_constraints)(struct cpu_hw_events *cpuc,
 						 struct perf_event *event);
 	struct event_constraint *event_constraints;
-	struct event_constraint *percore_constraints;
 	void		(*quirks)(void);
 	int		perfctr_second_write;
 
@@ -400,10 +440,10 @@ static inline unsigned int x86_pmu_event_addr(int index)
  */
 static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
 {
+	struct hw_perf_event_extra *reg;
 	struct extra_reg *er;
 
-	event->hw.extra_reg = 0;
-	event->hw.extra_config = 0;
+	reg = &event->hw.extra_reg;
 
 	if (!x86_pmu.extra_regs)
 		return 0;
@@ -413,8 +453,10 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
 			continue;
 		if (event->attr.config1 & ~er->valid_mask)
 			return -EINVAL;
-		event->hw.extra_reg = er->msr;
-		event->hw.extra_config = event->attr.config1;
+
+		reg->idx = er->idx;
+		reg->config = event->attr.config1;
+		reg->reg = er->msr;
 		break;
 	}
 	return 0;
@@ -713,6 +755,9 @@ static int __x86_pmu_event_init(struct perf_event *event)
 	event->hw.last_cpu = -1;
 	event->hw.last_tag = ~0ULL;
 
+	/* mark unused */
+	event->hw.extra_reg.idx = EXTRA_REG_NONE;
+
 	return x86_pmu.hw_config(event);
 }
 
@@ -754,8 +799,8 @@ static void x86_pmu_disable(struct pmu *pmu)
 static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
 					  u64 enable_mask)
 {
-	if (hwc->extra_reg)
-		wrmsrl(hwc->extra_reg, hwc->extra_config);
+	if (hwc->extra_reg.reg)
+		wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config);
 	wrmsrl(hwc->config_base, hwc->config | enable_mask);
 }
 
@@ -1692,7 +1737,6 @@ static int validate_group(struct perf_event *event)
 	fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
 	if (!fake_cpuc)
 		goto out;
-
 	/*
 	 * the event is not yet connected with its
 	 * siblings therefore we must first collect
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index d38b0020f775..6ad95baff856 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1,25 +1,15 @@
 #ifdef CONFIG_CPU_SUP_INTEL
 
-#define MAX_EXTRA_REGS 2
-
-/*
- * Per register state.
- */
-struct er_account {
-	int			ref;		/* reference count */
-	unsigned int		extra_reg;	/* extra MSR number */
-	u64			extra_config;	/* extra MSR config */
-};
-
 /*
- * Per core state
- * This used to coordinate shared registers for HT threads.
+ * Per core/cpu state
+ *
+ * Used to coordinate shared registers between HT threads or
+ * among events on a single PMU.
  */
-struct intel_percore {
-	raw_spinlock_t		lock;		/* protect structure */
-	struct er_account	regs[MAX_EXTRA_REGS];
-	int			refcnt;		/* number of threads */
-	unsigned		core_id;
+struct intel_shared_regs {
+	struct er_account       regs[EXTRA_REG_MAX];
+	int                     refcnt;		/* per-core: #HT threads */
+	unsigned                core_id;	/* per-core: core id */
 };
 
 /*
@@ -88,16 +78,10 @@ static struct event_constraint intel_nehalem_event_constraints[] __read_mostly =
 
 static struct extra_reg intel_nehalem_extra_regs[] __read_mostly =
 {
-	INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff),
+	INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
 	EVENT_EXTRA_END
 };
 
-static struct event_constraint intel_nehalem_percore_constraints[] __read_mostly =
-{
-	INTEL_EVENT_CONSTRAINT(0xb7, 0),
-	EVENT_CONSTRAINT_END
-};
-
 static struct event_constraint intel_westmere_event_constraints[] __read_mostly =
 {
 	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
@@ -125,18 +109,11 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly =
 
 static struct extra_reg intel_westmere_extra_regs[] __read_mostly =
 {
-	INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff),
-	INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff),
+	INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
+	INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1),
 	EVENT_EXTRA_END
 };
 
-static struct event_constraint intel_westmere_percore_constraints[] __read_mostly =
-{
-	INTEL_EVENT_CONSTRAINT(0xb7, 0),
-	INTEL_EVENT_CONSTRAINT(0xbb, 0),
-	EVENT_CONSTRAINT_END
-};
-
 static struct event_constraint intel_gen_event_constraints[] __read_mostly =
 {
 	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
@@ -1037,65 +1014,89 @@ intel_bts_constraints(struct perf_event *event)
 	return NULL;
 }
 
+/*
+ * manage allocation of shared extra msr for certain events
+ *
+ * sharing can be:
+ * per-cpu: to be shared between the various events on a single PMU
+ * per-core: per-cpu + shared by HT threads
+ */
 static struct event_constraint *
-intel_percore_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc,
+				   struct hw_perf_event_extra *reg)
 {
-	struct hw_perf_event *hwc = &event->hw;
-	unsigned int e = hwc->config & ARCH_PERFMON_EVENTSEL_EVENT;
-	struct event_constraint *c;
-	struct intel_percore *pc;
+	struct event_constraint *c = &emptyconstraint;
 	struct er_account *era;
-	int i;
-	int free_slot;
-	int found;
 
-	if (!x86_pmu.percore_constraints || hwc->extra_alloc)
-		return NULL;
+	/* already allocated shared msr */
+	if (reg->alloc || !cpuc->shared_regs)
+		return &unconstrained;
 
-	for (c = x86_pmu.percore_constraints; c->cmask; c++) {
-		if (e != c->code)
-			continue;
+	era = &cpuc->shared_regs->regs[reg->idx];
+
+	raw_spin_lock(&era->lock);
+
+	if (!atomic_read(&era->ref) || era->config == reg->config) {
+
+		/* lock in msr value */
+		era->config = reg->config;
+		era->reg = reg->reg;
+
+		/* one more user */
+		atomic_inc(&era->ref);
+
+		/* no need to reallocate during incremental event scheduling */
+		reg->alloc = 1;
 
 		/*
-		 * Allocate resource per core.
+		 * All events using extra_reg are unconstrained.
+		 * Avoids calling x86_get_event_constraints()
+		 *
+		 * Must revisit if extra_reg controlling events
+		 * ever have constraints. Worst case we go through
+		 * the regular event constraint table.
 		 */
-		pc = cpuc->per_core;
-		if (!pc)
-			break;
-		c = &emptyconstraint;
-		raw_spin_lock(&pc->lock);
-		free_slot = -1;
-		found = 0;
-		for (i = 0; i < MAX_EXTRA_REGS; i++) {
-			era = &pc->regs[i];
-			if (era->ref > 0 && hwc->extra_reg == era->extra_reg) {
-				/* Allow sharing same config */
-				if (hwc->extra_config == era->extra_config) {
-					era->ref++;
-					cpuc->percore_used = 1;
-					hwc->extra_alloc = 1;
-					c = NULL;
-				}
-				/* else conflict */
-				found = 1;
-				break;
-			} else if (era->ref == 0 && free_slot == -1)
-				free_slot = i;
-		}
-		if (!found && free_slot != -1) {
-			era = &pc->regs[free_slot];
-			era->ref = 1;
-			era->extra_reg = hwc->extra_reg;
-			era->extra_config = hwc->extra_config;
-			cpuc->percore_used = 1;
-			hwc->extra_alloc = 1;
-			c = NULL;
-		}
-		raw_spin_unlock(&pc->lock);
-		return c;
+		c = &unconstrained;
 	}
+	raw_spin_unlock(&era->lock);
 
-	return NULL;
+	return c;
+}
+
+static void
+__intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc,
+				   struct hw_perf_event_extra *reg)
+{
+	struct er_account *era;
+
+	/*
+	 * only put constraint if extra reg was actually
+	 * allocated. Also takes care of event which do
+	 * not use an extra shared reg
+	 */
+	if (!reg->alloc)
+		return;
+
+	era = &cpuc->shared_regs->regs[reg->idx];
+
+	/* one fewer user */
+	atomic_dec(&era->ref);
+
+	/* allocate again next time */
+	reg->alloc = 0;
+}
+
+static struct event_constraint *
+intel_shared_regs_constraints(struct cpu_hw_events *cpuc,
+			      struct perf_event *event)
+{
+	struct event_constraint *c = NULL;
+	struct hw_perf_event_extra *xreg;
+
+	xreg = &event->hw.extra_reg;
+	if (xreg->idx != EXTRA_REG_NONE)
+		c = __intel_shared_reg_get_constraints(cpuc, xreg);
+	return c;
 }
 
 static struct event_constraint *
@@ -1111,49 +1112,28 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
 	if (c)
 		return c;
 
-	c = intel_percore_constraints(cpuc, event);
+	c = intel_shared_regs_constraints(cpuc, event);
 	if (c)
 		return c;
 
 	return x86_get_event_constraints(cpuc, event);
 }
 
-static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
+static void
+intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
 					struct perf_event *event)
 {
-	struct extra_reg *er;
-	struct intel_percore *pc;
-	struct er_account *era;
-	struct hw_perf_event *hwc = &event->hw;
-	int i, allref;
-
-	if (!cpuc->percore_used)
-		return;
+	struct hw_perf_event_extra *reg;
 
-	for (er = x86_pmu.extra_regs; er->msr; er++) {
-		if (er->event != (hwc->config & er->config_mask))
-			continue;
+	reg = &event->hw.extra_reg;
+	if (reg->idx != EXTRA_REG_NONE)
+		__intel_shared_reg_put_constraints(cpuc, reg);
+}
 
-		pc = cpuc->per_core;
-		raw_spin_lock(&pc->lock);
-		for (i = 0; i < MAX_EXTRA_REGS; i++) {
-			era = &pc->regs[i];
-			if (era->ref > 0 &&
-			    era->extra_config == hwc->extra_config &&
-			    era->extra_reg == er->msr) {
-				era->ref--;
-				hwc->extra_alloc = 0;
-				break;
-			}
-		}
-		allref = 0;
-		for (i = 0; i < MAX_EXTRA_REGS; i++)
-			allref += pc->regs[i].ref;
-		if (allref == 0)
-			cpuc->percore_used = 0;
-		raw_spin_unlock(&pc->lock);
-		break;
-	}
+static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
+					struct perf_event *event)
+{
+	intel_put_shared_regs_event_constraints(cpuc, event);
 }
 
 static int intel_pmu_hw_config(struct perf_event *event)
@@ -1231,20 +1211,36 @@ static __initconst const struct x86_pmu core_pmu = {
 	.event_constraints	= intel_core_event_constraints,
 };
 
+static struct intel_shared_regs *allocate_shared_regs(int cpu)
+{
+	struct intel_shared_regs *regs;
+	int i;
+
+	regs = kzalloc_node(sizeof(struct intel_shared_regs),
+			    GFP_KERNEL, cpu_to_node(cpu));
+	if (regs) {
+		/*
+		 * initialize the locks to keep lockdep happy
+		 */
+		for (i = 0; i < EXTRA_REG_MAX; i++)
+			raw_spin_lock_init(&regs->regs[i].lock);
+
+		regs->core_id = -1;
+	}
+	return regs;
+}
+
 static int intel_pmu_cpu_prepare(int cpu)
 {
 	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
 
-	if (!cpu_has_ht_siblings())
+	if (!x86_pmu.extra_regs)
 		return NOTIFY_OK;
 
-	cpuc->per_core = kzalloc_node(sizeof(struct intel_percore),
-				      GFP_KERNEL, cpu_to_node(cpu));
-	if (!cpuc->per_core)
+	cpuc->shared_regs = allocate_shared_regs(cpu);
+	if (!cpuc->shared_regs)
 		return NOTIFY_BAD;
 
-	raw_spin_lock_init(&cpuc->per_core->lock);
-	cpuc->per_core->core_id = -1;
 	return NOTIFY_OK;
 }
 
@@ -1260,32 +1256,34 @@ static void intel_pmu_cpu_starting(int cpu)
 	 */
 	intel_pmu_lbr_reset();
 
-	if (!cpu_has_ht_siblings())
+	if (!cpuc->shared_regs)
 		return;
 
 	for_each_cpu(i, topology_thread_cpumask(cpu)) {
-		struct intel_percore *pc = per_cpu(cpu_hw_events, i).per_core;
+		struct intel_shared_regs *pc;
 
+		pc = per_cpu(cpu_hw_events, i).shared_regs;
 		if (pc && pc->core_id == core_id) {
-			kfree(cpuc->per_core);
-			cpuc->per_core = pc;
+			kfree(cpuc->shared_regs);
+			cpuc->shared_regs = pc;
 			break;
 		}
 	}
 
-	cpuc->per_core->core_id = core_id;
-	cpuc->per_core->refcnt++;
+	cpuc->shared_regs->core_id = core_id;
+	cpuc->shared_regs->refcnt++;
 }
 
 static void intel_pmu_cpu_dying(int cpu)
 {
 	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
-	struct intel_percore *pc = cpuc->per_core;
+	struct intel_shared_regs *pc;
 
+	pc = cpuc->shared_regs;
 	if (pc) {
 		if (pc->core_id == -1 || --pc->refcnt == 0)
 			kfree(pc);
-		cpuc->per_core = NULL;
+		cpuc->shared_regs = NULL;
 	}
 
 	fini_debug_store_on_cpu(cpu);
@@ -1436,7 +1434,6 @@ static __init int intel_pmu_init(void)
 
 		x86_pmu.event_constraints = intel_nehalem_event_constraints;
 		x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints;
-		x86_pmu.percore_constraints = intel_nehalem_percore_constraints;
 		x86_pmu.enable_all = intel_pmu_nhm_enable_all;
 		x86_pmu.extra_regs = intel_nehalem_extra_regs;
 
@@ -1481,7 +1478,6 @@ static __init int intel_pmu_init(void)
 		intel_pmu_lbr_init_nhm();
 
 		x86_pmu.event_constraints = intel_westmere_event_constraints;
-		x86_pmu.percore_constraints = intel_westmere_percore_constraints;
 		x86_pmu.enable_all = intel_pmu_nhm_enable_all;
 		x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
 		x86_pmu.extra_regs = intel_westmere_extra_regs;
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 771b0b2845e4..069315eefb22 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -536,6 +536,16 @@ struct perf_branch_stack {
 
 struct task_struct;
 
+/*
+ * extra PMU register associated with an event
+ */
+struct hw_perf_event_extra {
+	u64		config;	/* register value */
+	unsigned int	reg;	/* register address or index */
+	int		alloc;	/* extra register already allocated */
+	int		idx;	/* index in shared_regs->regs[] */
+};
+
 /**
  * struct hw_perf_event - performance event hardware details:
  */
@@ -549,9 +559,7 @@ struct hw_perf_event {
 			unsigned long	event_base;
 			int		idx;
 			int		last_cpu;
-			unsigned int	extra_reg;
-			u64		extra_config;
-			int		extra_alloc;
+			struct hw_perf_event_extra extra_reg;
 		};
 		struct { /* software */
 			struct hrtimer	hrtimer;
-- 
cgit v1.2.3


From 89d6c0b5bdbb1927775584dcf532d98b3efe1477 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 22 Apr 2011 23:37:06 +0200
Subject: perf, arch: Add generic NODE cache events

Add a NODE level to the generic cache events which is used to measure
local vs remote memory accesses. Like all other cache events, an
ACCESS is HIT+MISS, if there is no way to distinguish between reads
and writes do reads only etc..

The below needs filling out for !x86 (which I filled out with
unsupported events).

I'm fairly sure ARM can leave it like that since it doesn't strike me as
an architecture that even has NUMA support. SH might have something since
it does appear to have some NUMA bits.

Sparc64, PowerPC and MIPS certainly want a good look there since they
clearly are NUMA capable.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: David Miller <davem@davemloft.net>
Cc: Anton Blanchard <anton@samba.org>
Cc: David Daney <ddaney@caviumnetworks.com>
Cc: Deng-Cheng Zhu <dengcheng.zhu@gmail.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Stephane Eranian <eranian@google.com>
Link: http://lkml.kernel.org/r/1303508226.4865.8.camel@laptop
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/arm/kernel/perf_event_v6.c        | 28 ++++++++++++++++
 arch/arm/kernel/perf_event_v7.c        | 28 ++++++++++++++++
 arch/arm/kernel/perf_event_xscale.c    | 14 ++++++++
 arch/mips/kernel/perf_event_mipsxx.c   | 28 ++++++++++++++++
 arch/powerpc/kernel/e500-pmu.c         |  5 +++
 arch/powerpc/kernel/mpc7450-pmu.c      |  5 +++
 arch/powerpc/kernel/power4-pmu.c       |  5 +++
 arch/powerpc/kernel/power5+-pmu.c      |  5 +++
 arch/powerpc/kernel/power5-pmu.c       |  5 +++
 arch/powerpc/kernel/power6-pmu.c       |  5 +++
 arch/powerpc/kernel/power7-pmu.c       |  5 +++
 arch/powerpc/kernel/ppc970-pmu.c       |  5 +++
 arch/sh/kernel/cpu/sh4/perf_event.c    | 15 +++++++++
 arch/sh/kernel/cpu/sh4a/perf_event.c   | 15 +++++++++
 arch/sparc/kernel/perf_event.c         | 42 ++++++++++++++++++++++++
 arch/x86/kernel/cpu/perf_event_amd.c   | 14 ++++++++
 arch/x86/kernel/cpu/perf_event_intel.c | 59 +++++++++++++++++++++++++++++++++-
 arch/x86/kernel/cpu/perf_event_p4.c    | 14 ++++++++
 include/linux/perf_event.h             |  3 +-
 19 files changed, 298 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/kernel/perf_event_v6.c b/arch/arm/kernel/perf_event_v6.c
index 38dc4da1d612..dd7f3b9f4cb3 100644
--- a/arch/arm/kernel/perf_event_v6.c
+++ b/arch/arm/kernel/perf_event_v6.c
@@ -173,6 +173,20 @@ static const unsigned armv6_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
 			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
 		},
 	},
+	[C(NODE)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
 };
 
 enum armv6mpcore_perf_types {
@@ -310,6 +324,20 @@ static const unsigned armv6mpcore_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
 			[C(RESULT_MISS)]    = CACHE_OP_UNSUPPORTED,
 		},
 	},
+	[C(NODE)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]  = CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]    = CACHE_OP_UNSUPPORTED,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]  = CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]    = CACHE_OP_UNSUPPORTED,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]  = CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]    = CACHE_OP_UNSUPPORTED,
+		},
+	},
 };
 
 static inline unsigned long
diff --git a/arch/arm/kernel/perf_event_v7.c b/arch/arm/kernel/perf_event_v7.c
index 6e5f8752303b..e20ca9cafef5 100644
--- a/arch/arm/kernel/perf_event_v7.c
+++ b/arch/arm/kernel/perf_event_v7.c
@@ -255,6 +255,20 @@ static const unsigned armv7_a8_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
 			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
 		},
 	},
+	[C(NODE)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
 };
 
 /*
@@ -371,6 +385,20 @@ static const unsigned armv7_a9_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
 			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
 		},
 	},
+	[C(NODE)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
 };
 
 /*
diff --git a/arch/arm/kernel/perf_event_xscale.c b/arch/arm/kernel/perf_event_xscale.c
index 99b6b85c7e49..3c4397491d08 100644
--- a/arch/arm/kernel/perf_event_xscale.c
+++ b/arch/arm/kernel/perf_event_xscale.c
@@ -144,6 +144,20 @@ static const unsigned xscale_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
 			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
 		},
 	},
+	[C(NODE)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
 };
 
 #define	XSCALE_PMU_ENABLE	0x001
diff --git a/arch/mips/kernel/perf_event_mipsxx.c b/arch/mips/kernel/perf_event_mipsxx.c
index 75266ff4cc33..e5ad09a9baf7 100644
--- a/arch/mips/kernel/perf_event_mipsxx.c
+++ b/arch/mips/kernel/perf_event_mipsxx.c
@@ -377,6 +377,20 @@ static const struct mips_perf_event mipsxxcore_cache_map
 		[C(RESULT_MISS)]	= { UNSUPPORTED_PERF_EVENT_ID },
 	},
 },
+[C(NODE)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)]	= { UNSUPPORTED_PERF_EVENT_ID },
+		[C(RESULT_MISS)]	= { UNSUPPORTED_PERF_EVENT_ID },
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)]	= { UNSUPPORTED_PERF_EVENT_ID },
+		[C(RESULT_MISS)]	= { UNSUPPORTED_PERF_EVENT_ID },
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)]	= { UNSUPPORTED_PERF_EVENT_ID },
+		[C(RESULT_MISS)]	= { UNSUPPORTED_PERF_EVENT_ID },
+	},
+},
 };
 
 /* 74K core has completely different cache event map. */
@@ -480,6 +494,20 @@ static const struct mips_perf_event mipsxx74Kcore_cache_map
 		[C(RESULT_MISS)]	= { UNSUPPORTED_PERF_EVENT_ID },
 	},
 },
+[C(NODE)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)]	= { UNSUPPORTED_PERF_EVENT_ID },
+		[C(RESULT_MISS)]	= { UNSUPPORTED_PERF_EVENT_ID },
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)]	= { UNSUPPORTED_PERF_EVENT_ID },
+		[C(RESULT_MISS)]	= { UNSUPPORTED_PERF_EVENT_ID },
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)]	= { UNSUPPORTED_PERF_EVENT_ID },
+		[C(RESULT_MISS)]	= { UNSUPPORTED_PERF_EVENT_ID },
+	},
+},
 };
 
 #ifdef CONFIG_MIPS_MT_SMP
diff --git a/arch/powerpc/kernel/e500-pmu.c b/arch/powerpc/kernel/e500-pmu.c
index b150b510510f..cb2e2949c8d1 100644
--- a/arch/powerpc/kernel/e500-pmu.c
+++ b/arch/powerpc/kernel/e500-pmu.c
@@ -75,6 +75,11 @@ static int e500_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 		[C(OP_WRITE)] = {	-1,		-1	},
 		[C(OP_PREFETCH)] = {	-1,		-1	},
 	},
+	[C(NODE)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	-1,		-1 	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
 };
 
 static int num_events = 128;
diff --git a/arch/powerpc/kernel/mpc7450-pmu.c b/arch/powerpc/kernel/mpc7450-pmu.c
index 2cc5e0301d0b..845a58478890 100644
--- a/arch/powerpc/kernel/mpc7450-pmu.c
+++ b/arch/powerpc/kernel/mpc7450-pmu.c
@@ -388,6 +388,11 @@ static int mpc7450_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 		[C(OP_WRITE)] = {	-1,		-1	},
 		[C(OP_PREFETCH)] = {	-1,		-1	},
 	},
+	[C(NODE)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	-1,		-1	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
 };
 
 struct power_pmu mpc7450_pmu = {
diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c
index ead8b3c2649e..e9dbc2d35c9c 100644
--- a/arch/powerpc/kernel/power4-pmu.c
+++ b/arch/powerpc/kernel/power4-pmu.c
@@ -587,6 +587,11 @@ static int power4_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 		[C(OP_WRITE)] = {	-1,		-1	},
 		[C(OP_PREFETCH)] = {	-1,		-1	},
 	},
+	[C(NODE)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	-1,		-1	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
 };
 
 static struct power_pmu power4_pmu = {
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c
index eca0ac595cb6..f58a2bd41b59 100644
--- a/arch/powerpc/kernel/power5+-pmu.c
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -653,6 +653,11 @@ static int power5p_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 		[C(OP_WRITE)] = {	-1,		-1		},
 		[C(OP_PREFETCH)] = {	-1,		-1		},
 	},
+	[C(NODE)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	-1,		-1		},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	-1,		-1		},
+	},
 };
 
 static struct power_pmu power5p_pmu = {
diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c
index d5ff0f64a5e6..b1acab684142 100644
--- a/arch/powerpc/kernel/power5-pmu.c
+++ b/arch/powerpc/kernel/power5-pmu.c
@@ -595,6 +595,11 @@ static int power5_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 		[C(OP_WRITE)] = {	-1,		-1		},
 		[C(OP_PREFETCH)] = {	-1,		-1		},
 	},
+	[C(NODE)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	-1,		-1		},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	-1,		-1		},
+	},
 };
 
 static struct power_pmu power5_pmu = {
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c
index 31603927e376..b24a3a23d073 100644
--- a/arch/powerpc/kernel/power6-pmu.c
+++ b/arch/powerpc/kernel/power6-pmu.c
@@ -516,6 +516,11 @@ static int power6_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 		[C(OP_WRITE)] = {	-1,		-1		},
 		[C(OP_PREFETCH)] = {	-1,		-1		},
 	},
+	[C(NODE)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	-1,		-1		},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	-1,		-1		},
+	},
 };
 
 static struct power_pmu power6_pmu = {
diff --git a/arch/powerpc/kernel/power7-pmu.c b/arch/powerpc/kernel/power7-pmu.c
index 593740fcb799..6d9dccb2ea59 100644
--- a/arch/powerpc/kernel/power7-pmu.c
+++ b/arch/powerpc/kernel/power7-pmu.c
@@ -342,6 +342,11 @@ static int power7_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 		[C(OP_WRITE)] = {	-1,		-1	},
 		[C(OP_PREFETCH)] = {	-1,		-1	},
 	},
+	[C(NODE)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	-1,		-1	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
 };
 
 static struct power_pmu power7_pmu = {
diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c
index 9a6e093858fe..b121de9658eb 100644
--- a/arch/powerpc/kernel/ppc970-pmu.c
+++ b/arch/powerpc/kernel/ppc970-pmu.c
@@ -467,6 +467,11 @@ static int ppc970_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 		[C(OP_WRITE)] = {	-1,		-1	},
 		[C(OP_PREFETCH)] = {	-1,		-1	},
 	},
+	[C(NODE)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	-1,		-1	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
 };
 
 static struct power_pmu ppc970_pmu = {
diff --git a/arch/sh/kernel/cpu/sh4/perf_event.c b/arch/sh/kernel/cpu/sh4/perf_event.c
index 748955df018d..fa4f724b295a 100644
--- a/arch/sh/kernel/cpu/sh4/perf_event.c
+++ b/arch/sh/kernel/cpu/sh4/perf_event.c
@@ -180,6 +180,21 @@ static const int sh7750_cache_events
 			[ C(RESULT_MISS)   ] = -1,
 		},
 	},
+
+	[ C(NODE) ] = {
+		[ C(OP_READ) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS)   ] = -1,
+		},
+		[ C(OP_WRITE) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS)   ] = -1,
+		},
+		[ C(OP_PREFETCH) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS)   ] = -1,
+		},
+	},
 };
 
 static int sh7750_event_map(int event)
diff --git a/arch/sh/kernel/cpu/sh4a/perf_event.c b/arch/sh/kernel/cpu/sh4a/perf_event.c
index 17e6bebfede0..84a2c396ceee 100644
--- a/arch/sh/kernel/cpu/sh4a/perf_event.c
+++ b/arch/sh/kernel/cpu/sh4a/perf_event.c
@@ -205,6 +205,21 @@ static const int sh4a_cache_events
 			[ C(RESULT_MISS)   ] = -1,
 		},
 	},
+
+	[ C(NODE) ] = {
+		[ C(OP_READ) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS)   ] = -1,
+		},
+		[ C(OP_WRITE) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS)   ] = -1,
+		},
+		[ C(OP_PREFETCH) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS)   ] = -1,
+		},
+	},
 };
 
 static int sh4a_event_map(int event)
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index 0b32f2e9e08d..62a034318b18 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -246,6 +246,20 @@ static const cache_map_t ultra3_cache_map = {
 		[ C(RESULT_MISS)   ] = { CACHE_OP_UNSUPPORTED },
 	},
 },
+[C(NODE)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = { CACHE_OP_UNSUPPORTED },
+		[C(RESULT_MISS)  ] = { CACHE_OP_UNSUPPORTED },
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = { CACHE_OP_UNSUPPORTED },
+		[ C(RESULT_MISS)   ] = { CACHE_OP_UNSUPPORTED },
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = { CACHE_OP_UNSUPPORTED },
+		[ C(RESULT_MISS)   ] = { CACHE_OP_UNSUPPORTED },
+	},
+},
 };
 
 static const struct sparc_pmu ultra3_pmu = {
@@ -361,6 +375,20 @@ static const cache_map_t niagara1_cache_map = {
 		[ C(RESULT_MISS)   ] = { CACHE_OP_UNSUPPORTED },
 	},
 },
+[C(NODE)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = { CACHE_OP_UNSUPPORTED },
+		[C(RESULT_MISS)  ] = { CACHE_OP_UNSUPPORTED },
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = { CACHE_OP_UNSUPPORTED },
+		[ C(RESULT_MISS)   ] = { CACHE_OP_UNSUPPORTED },
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = { CACHE_OP_UNSUPPORTED },
+		[ C(RESULT_MISS)   ] = { CACHE_OP_UNSUPPORTED },
+	},
+},
 };
 
 static const struct sparc_pmu niagara1_pmu = {
@@ -473,6 +501,20 @@ static const cache_map_t niagara2_cache_map = {
 		[ C(RESULT_MISS)   ] = { CACHE_OP_UNSUPPORTED },
 	},
 },
+[C(NODE)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = { CACHE_OP_UNSUPPORTED },
+		[C(RESULT_MISS)  ] = { CACHE_OP_UNSUPPORTED },
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = { CACHE_OP_UNSUPPORTED },
+		[ C(RESULT_MISS)   ] = { CACHE_OP_UNSUPPORTED },
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = { CACHE_OP_UNSUPPORTED },
+		[ C(RESULT_MISS)   ] = { CACHE_OP_UNSUPPORTED },
+	},
+},
 };
 
 static const struct sparc_pmu niagara2_pmu = {
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index fe29c1d2219e..941caa2e449b 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -89,6 +89,20 @@ static __initconst const u64 amd_hw_cache_event_ids
 		[ C(RESULT_MISS)   ] = -1,
 	},
  },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0xb8e9, /* CPU Request to Memory, l+r */
+		[ C(RESULT_MISS)   ] = 0x98e9, /* CPU Request to Memory, r   */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
 };
 
 /*
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 5c448622468c..bf6f92f7c19d 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -226,6 +226,21 @@ static __initconst const u64 snb_hw_cache_event_ids
 		[ C(RESULT_MISS)   ] = -1,
 	},
  },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+
 };
 
 static __initconst const u64 westmere_hw_cache_event_ids
@@ -327,6 +342,20 @@ static __initconst const u64 westmere_hw_cache_event_ids
 		[ C(RESULT_MISS)   ] = -1,
 	},
  },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x01b7,
+		[ C(RESULT_MISS)   ] = 0x01b7,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x01b7,
+		[ C(RESULT_MISS)   ] = 0x01b7,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x01b7,
+		[ C(RESULT_MISS)   ] = 0x01b7,
+	},
+ },
 };
 
 /*
@@ -379,7 +408,21 @@ static __initconst const u64 nehalem_hw_cache_extra_regs
 		[ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS,
 		[ C(RESULT_MISS)   ] = NHM_DMND_PREFETCH|NHM_L3_MISS,
 	},
- }
+ },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_ALL_DRAM,
+		[ C(RESULT_MISS)   ] = NHM_DMND_READ|NHM_REMOTE_DRAM,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_ALL_DRAM,
+		[ C(RESULT_MISS)   ] = NHM_DMND_WRITE|NHM_REMOTE_DRAM,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_ALL_DRAM,
+		[ C(RESULT_MISS)   ] = NHM_DMND_PREFETCH|NHM_REMOTE_DRAM,
+	},
+ },
 };
 
 static __initconst const u64 nehalem_hw_cache_event_ids
@@ -481,6 +524,20 @@ static __initconst const u64 nehalem_hw_cache_event_ids
 		[ C(RESULT_MISS)   ] = -1,
 	},
  },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x01b7,
+		[ C(RESULT_MISS)   ] = 0x01b7,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x01b7,
+		[ C(RESULT_MISS)   ] = 0x01b7,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x01b7,
+		[ C(RESULT_MISS)   ] = 0x01b7,
+	},
+ },
 };
 
 static __initconst const u64 core2_hw_cache_event_ids
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index d6e6a67b9608..fb901c5080f7 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -554,6 +554,20 @@ static __initconst const u64 p4_hw_cache_event_ids
 		[ C(RESULT_MISS)   ] = -1,
 	},
  },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
 };
 
 static u64 p4_general_events[PERF_COUNT_HW_MAX] = {
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 069315eefb22..a5f54b973bdb 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -61,7 +61,7 @@ enum perf_hw_id {
 /*
  * Generalized hardware cache events:
  *
- *       { L1-D, L1-I, LLC, ITLB, DTLB, BPU } x
+ *       { L1-D, L1-I, LLC, ITLB, DTLB, BPU, NODE } x
  *       { read, write, prefetch } x
  *       { accesses, misses }
  */
@@ -72,6 +72,7 @@ enum perf_hw_cache_id {
 	PERF_COUNT_HW_CACHE_DTLB		= 3,
 	PERF_COUNT_HW_CACHE_ITLB		= 4,
 	PERF_COUNT_HW_CACHE_BPU			= 5,
+	PERF_COUNT_HW_CACHE_NODE		= 6,
 
 	PERF_COUNT_HW_CACHE_MAX,		/* non-ABI */
 };
-- 
cgit v1.2.3


From 4dc0da86967d5463708631d02a70cfed5b104884 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 29 Jun 2011 18:42:35 +0300
Subject: perf: Add context field to perf_event

The perf_event overflow handler does not receive any caller-derived
argument, so many callers need to resort to looking up the perf_event
in their local data structure.  This is ugly and doesn't scale if a
single callback services many perf_events.

Fix by adding a context parameter to perf_event_create_kernel_counter()
(and derived hardware breakpoints APIs) and storing it in the perf_event.
The field can be accessed from the callback as event->overflow_handler_context.
All callers are updated.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1309362157-6596-2-git-send-email-avi@redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/arm/kernel/ptrace.c                |  3 ++-
 arch/powerpc/kernel/ptrace.c            |  2 +-
 arch/sh/kernel/ptrace_32.c              |  3 ++-
 arch/x86/kernel/kgdb.c                  |  2 +-
 arch/x86/kernel/ptrace.c                |  3 ++-
 drivers/oprofile/oprofile_perf.c        |  2 +-
 include/linux/hw_breakpoint.h           | 10 ++++++++--
 include/linux/perf_event.h              |  4 +++-
 kernel/events/core.c                    | 21 +++++++++++++++------
 kernel/events/hw_breakpoint.c           | 10 +++++++---
 kernel/watchdog.c                       |  2 +-
 samples/hw_breakpoint/data_breakpoint.c |  2 +-
 12 files changed, 44 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c
index 0c9b1054f790..5c199610719f 100644
--- a/arch/arm/kernel/ptrace.c
+++ b/arch/arm/kernel/ptrace.c
@@ -479,7 +479,8 @@ static struct perf_event *ptrace_hbp_create(struct task_struct *tsk, int type)
 	attr.bp_type	= type;
 	attr.disabled	= 1;
 
-	return register_user_hw_breakpoint(&attr, ptrace_hbptriggered, tsk);
+	return register_user_hw_breakpoint(&attr, ptrace_hbptriggered, NULL,
+					   tsk);
 }
 
 static int ptrace_gethbpregs(struct task_struct *tsk, long num,
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index 3177617af2ef..05b7dd217f60 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -973,7 +973,7 @@ int ptrace_set_debugreg(struct task_struct *task, unsigned long addr,
 								&attr.bp_type);
 
 	thread->ptrace_bps[0] = bp = register_user_hw_breakpoint(&attr,
-							ptrace_triggered, task);
+					       ptrace_triggered, NULL, task);
 	if (IS_ERR(bp)) {
 		thread->ptrace_bps[0] = NULL;
 		ptrace_put_breakpoints(task);
diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c
index 8051976100a6..92b3c276339a 100644
--- a/arch/sh/kernel/ptrace_32.c
+++ b/arch/sh/kernel/ptrace_32.c
@@ -91,7 +91,8 @@ static int set_single_step(struct task_struct *tsk, unsigned long addr)
 		attr.bp_len = HW_BREAKPOINT_LEN_2;
 		attr.bp_type = HW_BREAKPOINT_R;
 
-		bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk);
+		bp = register_user_hw_breakpoint(&attr, ptrace_triggered,
+						 NULL, tsk);
 		if (IS_ERR(bp))
 			return PTR_ERR(bp);
 
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 98da6a7b5e82..00354d4919a9 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -638,7 +638,7 @@ void kgdb_arch_late(void)
 	for (i = 0; i < HBP_NUM; i++) {
 		if (breakinfo[i].pev)
 			continue;
-		breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL);
+		breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL, NULL);
 		if (IS_ERR((void * __force)breakinfo[i].pev)) {
 			printk(KERN_ERR "kgdb: Could not allocate hw"
 			       "breakpoints\nDisabling the kernel debugger\n");
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 11db2e9b860a..82528799c5de 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -715,7 +715,8 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
 		attr.bp_type = HW_BREAKPOINT_W;
 		attr.disabled = 1;
 
-		bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk);
+		bp = register_user_hw_breakpoint(&attr, ptrace_triggered,
+						 NULL, tsk);
 
 		/*
 		 * CHECKME: the previous code returned -EIO if the addr wasn't
diff --git a/drivers/oprofile/oprofile_perf.c b/drivers/oprofile/oprofile_perf.c
index 9046f7b2ed79..59acf9ef78a4 100644
--- a/drivers/oprofile/oprofile_perf.c
+++ b/drivers/oprofile/oprofile_perf.c
@@ -79,7 +79,7 @@ static int op_create_counter(int cpu, int event)
 
 	pevent = perf_event_create_kernel_counter(&counter_config[event].attr,
 						  cpu, NULL,
-						  op_overflow_handler);
+						  op_overflow_handler, NULL);
 
 	if (IS_ERR(pevent))
 		return PTR_ERR(pevent);
diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h
index d1e55fed2c7d..6ae9c631a1be 100644
--- a/include/linux/hw_breakpoint.h
+++ b/include/linux/hw_breakpoint.h
@@ -73,6 +73,7 @@ static inline unsigned long hw_breakpoint_len(struct perf_event *bp)
 extern struct perf_event *
 register_user_hw_breakpoint(struct perf_event_attr *attr,
 			    perf_overflow_handler_t triggered,
+			    void *context,
 			    struct task_struct *tsk);
 
 /* FIXME: only change from the attr, and don't unregister */
@@ -85,11 +86,13 @@ modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr);
 extern struct perf_event *
 register_wide_hw_breakpoint_cpu(struct perf_event_attr *attr,
 				perf_overflow_handler_t	triggered,
+				void *context,
 				int cpu);
 
 extern struct perf_event * __percpu *
 register_wide_hw_breakpoint(struct perf_event_attr *attr,
-			    perf_overflow_handler_t triggered);
+			    perf_overflow_handler_t triggered,
+			    void *context);
 
 extern int register_perf_hw_breakpoint(struct perf_event *bp);
 extern int __register_perf_hw_breakpoint(struct perf_event *bp);
@@ -115,6 +118,7 @@ static inline int __init init_hw_breakpoint(void) { return 0; }
 static inline struct perf_event *
 register_user_hw_breakpoint(struct perf_event_attr *attr,
 			    perf_overflow_handler_t triggered,
+			    void *context,
 			    struct task_struct *tsk)	{ return NULL; }
 static inline int
 modify_user_hw_breakpoint(struct perf_event *bp,
@@ -122,10 +126,12 @@ modify_user_hw_breakpoint(struct perf_event *bp,
 static inline struct perf_event *
 register_wide_hw_breakpoint_cpu(struct perf_event_attr *attr,
 				perf_overflow_handler_t	 triggered,
+				void *context,
 				int cpu)		{ return NULL; }
 static inline struct perf_event * __percpu *
 register_wide_hw_breakpoint(struct perf_event_attr *attr,
-			    perf_overflow_handler_t triggered)	{ return NULL; }
+			    perf_overflow_handler_t triggered,
+			    void *context)		{ return NULL; }
 static inline int
 register_perf_hw_breakpoint(struct perf_event *bp)	{ return -ENOSYS; }
 static inline int
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index a5f54b973bdb..2a08cacb1628 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -839,6 +839,7 @@ struct perf_event {
 	u64				id;
 
 	perf_overflow_handler_t		overflow_handler;
+	void				*overflow_handler_context;
 
 #ifdef CONFIG_EVENT_TRACING
 	struct ftrace_event_call	*tp_event;
@@ -960,7 +961,8 @@ extern struct perf_event *
 perf_event_create_kernel_counter(struct perf_event_attr *attr,
 				int cpu,
 				struct task_struct *task,
-				perf_overflow_handler_t callback);
+				perf_overflow_handler_t callback,
+				void *context);
 extern u64 perf_event_read_value(struct perf_event *event,
 				 u64 *enabled, u64 *running);
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 81de28dcca8c..ba8e0f4a20e6 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5745,7 +5745,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 		 struct task_struct *task,
 		 struct perf_event *group_leader,
 		 struct perf_event *parent_event,
-		 perf_overflow_handler_t overflow_handler)
+		 perf_overflow_handler_t overflow_handler,
+		 void *context)
 {
 	struct pmu *pmu;
 	struct perf_event *event;
@@ -5803,10 +5804,13 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 #endif
 	}
 
-	if (!overflow_handler && parent_event)
+	if (!overflow_handler && parent_event) {
 		overflow_handler = parent_event->overflow_handler;
+		context = parent_event->overflow_handler_context;
+	}
 
 	event->overflow_handler	= overflow_handler;
+	event->overflow_handler_context = context;
 
 	if (attr->disabled)
 		event->state = PERF_EVENT_STATE_OFF;
@@ -6073,7 +6077,8 @@ SYSCALL_DEFINE5(perf_event_open,
 		}
 	}
 
-	event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL);
+	event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
+				 NULL, NULL);
 	if (IS_ERR(event)) {
 		err = PTR_ERR(event);
 		goto err_task;
@@ -6258,7 +6263,8 @@ err_fd:
 struct perf_event *
 perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 				 struct task_struct *task,
-				 perf_overflow_handler_t overflow_handler)
+				 perf_overflow_handler_t overflow_handler,
+				 void *context)
 {
 	struct perf_event_context *ctx;
 	struct perf_event *event;
@@ -6268,7 +6274,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 	 * Get the target context (task or percpu):
 	 */
 
-	event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler);
+	event = perf_event_alloc(attr, cpu, task, NULL, NULL,
+				 overflow_handler, context);
 	if (IS_ERR(event)) {
 		err = PTR_ERR(event);
 		goto err;
@@ -6552,7 +6559,7 @@ inherit_event(struct perf_event *parent_event,
 					   parent_event->cpu,
 					   child,
 					   group_leader, parent_event,
-					   NULL);
+				           NULL, NULL);
 	if (IS_ERR(child_event))
 		return child_event;
 	get_ctx(child_ctx);
@@ -6579,6 +6586,8 @@ inherit_event(struct perf_event *parent_event,
 
 	child_event->ctx = child_ctx;
 	child_event->overflow_handler = parent_event->overflow_handler;
+	child_event->overflow_handler_context
+		= parent_event->overflow_handler_context;
 
 	/*
 	 * Precalculate sample_data sizes
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 086adf25a55e..b7971d6f38bf 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -431,9 +431,11 @@ int register_perf_hw_breakpoint(struct perf_event *bp)
 struct perf_event *
 register_user_hw_breakpoint(struct perf_event_attr *attr,
 			    perf_overflow_handler_t triggered,
+			    void *context,
 			    struct task_struct *tsk)
 {
-	return perf_event_create_kernel_counter(attr, -1, tsk, triggered);
+	return perf_event_create_kernel_counter(attr, -1, tsk, triggered,
+						context);
 }
 EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
 
@@ -502,7 +504,8 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
  */
 struct perf_event * __percpu *
 register_wide_hw_breakpoint(struct perf_event_attr *attr,
-			    perf_overflow_handler_t triggered)
+			    perf_overflow_handler_t triggered,
+			    void *context)
 {
 	struct perf_event * __percpu *cpu_events, **pevent, *bp;
 	long err;
@@ -515,7 +518,8 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
 	get_online_cpus();
 	for_each_online_cpu(cpu) {
 		pevent = per_cpu_ptr(cpu_events, cpu);
-		bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered);
+		bp = perf_event_create_kernel_counter(attr, cpu, NULL,
+						      triggered, context);
 
 		*pevent = bp;
 
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index a6708e677a0a..a933e3a0398b 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -375,7 +375,7 @@ static int watchdog_nmi_enable(int cpu)
 	hw_nmi_watchdog_set_attr(wd_attr);
 
 	/* Try to register using hardware perf events */
-	event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback);
+	event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
 	if (!IS_ERR(event)) {
 		printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
 		goto out_save;
diff --git a/samples/hw_breakpoint/data_breakpoint.c b/samples/hw_breakpoint/data_breakpoint.c
index 7b164d3200ff..ef7f32291852 100644
--- a/samples/hw_breakpoint/data_breakpoint.c
+++ b/samples/hw_breakpoint/data_breakpoint.c
@@ -60,7 +60,7 @@ static int __init hw_break_module_init(void)
 	attr.bp_len = HW_BREAKPOINT_LEN_4;
 	attr.bp_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
 
-	sample_hbp = register_wide_hw_breakpoint(&attr, sample_hbp_handler);
+	sample_hbp = register_wide_hw_breakpoint(&attr, sample_hbp_handler, NULL);
 	if (IS_ERR((void __force *)sample_hbp)) {
 		ret = PTR_ERR((void __force *)sample_hbp);
 		goto fail;
-- 
cgit v1.2.3


From 26ca5c11fb45ae2b2ac7e3574b8db6b3a3c7d350 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 29 Jun 2011 18:42:37 +0300
Subject: perf: export perf_event_refresh() to modules

KVM needs one-shot samples, since a PMC programmed to -X will fire after X
events and then again after 2^40 events (i.e. variable period).

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1309362157-6596-4-git-send-email-avi@redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_event.h | 5 +++++
 kernel/events/core.c       | 3 ++-
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2a08cacb1628..3f2711ccf910 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -955,6 +955,7 @@ extern void perf_pmu_disable(struct pmu *pmu);
 extern void perf_pmu_enable(struct pmu *pmu);
 extern int perf_event_task_disable(void);
 extern int perf_event_task_enable(void);
+extern int perf_event_refresh(struct perf_event *event, int refresh);
 extern void perf_event_update_userpage(struct perf_event *event);
 extern int perf_event_release_kernel(struct perf_event *event);
 extern struct perf_event *
@@ -1149,6 +1150,10 @@ static inline void perf_event_delayed_put(struct task_struct *task)	{ }
 static inline void perf_event_print_debug(void)				{ }
 static inline int perf_event_task_disable(void)				{ return -EINVAL; }
 static inline int perf_event_task_enable(void)				{ return -EINVAL; }
+static inline int perf_event_refresh(struct perf_event *event, int refresh)
+{
+	return -EINVAL;
+}
 
 static inline void
 perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)	{ }
diff --git a/kernel/events/core.c b/kernel/events/core.c
index ba8e0f4a20e6..0567e32d71aa 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1764,7 +1764,7 @@ out:
 	raw_spin_unlock_irq(&ctx->lock);
 }
 
-static int perf_event_refresh(struct perf_event *event, int refresh)
+int perf_event_refresh(struct perf_event *event, int refresh)
 {
 	/*
 	 * not supported on inherited events
@@ -1777,6 +1777,7 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(perf_event_refresh);
 
 static void ctx_sched_out(struct perf_event_context *ctx,
 			  struct perf_cpu_context *cpuctx,
-- 
cgit v1.2.3


From 564b905ab10d17fb42f86aa8b7b9b796276d1336 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Thu, 23 Jun 2011 01:52:55 +0200
Subject: PM / Domains: Rename struct dev_power_domain to struct dev_pm_domain

The naming convention used by commit 7538e3db6e015e890825fbd9f86599b
(PM: Add support for device power domains), which introduced the
struct dev_power_domain type for representing device power domains,
evidently confuses some developers who tend to think that objects
of this type must correspond to "power domains" as defined by
hardware, which is not the case.  Namely, at the kernel level, a
struct dev_power_domain object can represent arbitrary set of devices
that are mutually dependent power management-wise and need not belong
to one hardware power domain.  To avoid that confusion, rename struct
dev_power_domain to struct dev_pm_domain and rename the related
pointers in struct device and struct pm_clk_notifier_block from
pwr_domain to pm_domain.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Kevin Hilman <khilman@ti.com>
---
 Documentation/power/devices.txt          |  8 ++++----
 arch/arm/mach-omap1/pm_bus.c             |  8 ++++----
 arch/arm/mach-shmobile/pm_runtime.c      |  8 ++++----
 arch/arm/plat-omap/omap_device.c         |  4 ++--
 arch/sh/kernel/cpu/shmobile/pm_runtime.c |  6 +++---
 drivers/base/power/clock_ops.c           | 14 +++++++-------
 drivers/base/power/main.c                | 30 +++++++++++++++---------------
 drivers/base/power/runtime.c             | 12 ++++++------
 include/linux/device.h                   |  4 ++--
 include/linux/pm.h                       |  2 +-
 include/linux/pm_runtime.h               |  2 +-
 11 files changed, 49 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/power/devices.txt b/Documentation/power/devices.txt
index 64565aac6e40..85c6f980b642 100644
--- a/Documentation/power/devices.txt
+++ b/Documentation/power/devices.txt
@@ -506,8 +506,8 @@ routines.  Nevertheless, different callback pointers are used in case there is a
 situation where it actually matters.
 
 
-Device Power Domains
---------------------
+Device Power Management Domains
+-------------------------------
 Sometimes devices share reference clocks or other power resources.  In those
 cases it generally is not possible to put devices into low-power states
 individually.  Instead, a set of devices sharing a power resource can be put
@@ -516,8 +516,8 @@ power resource.  Of course, they also need to be put into the full-power state
 together, by turning the shared power resource on.  A set of devices with this
 property is often referred to as a power domain.
 
-Support for power domains is provided through the pwr_domain field of struct
-device.  This field is a pointer to an object of type struct dev_power_domain,
+Support for power domains is provided through the pm_domain field of struct
+device.  This field is a pointer to an object of type struct dev_pm_domain,
 defined in include/linux/pm.h, providing a set of power management callbacks
 analogous to the subsystem-level and device driver callbacks that are executed
 for the given device during all power transitions, instead of the respective
diff --git a/arch/arm/mach-omap1/pm_bus.c b/arch/arm/mach-omap1/pm_bus.c
index 334fb8871bc3..212f33103712 100644
--- a/arch/arm/mach-omap1/pm_bus.c
+++ b/arch/arm/mach-omap1/pm_bus.c
@@ -49,20 +49,20 @@ static int omap1_pm_runtime_resume(struct device *dev)
 	return pm_generic_runtime_resume(dev);
 }
 
-static struct dev_power_domain default_power_domain = {
+static struct dev_pm_domain default_pm_domain = {
 	.ops = {
 		.runtime_suspend = omap1_pm_runtime_suspend,
 		.runtime_resume = omap1_pm_runtime_resume,
 		USE_PLATFORM_PM_SLEEP_OPS
 	},
 };
-#define OMAP1_PWR_DOMAIN (&default_power_domain)
+#define OMAP1_PM_DOMAIN (&default_pm_domain)
 #else
-#define OMAP1_PWR_DOMAIN NULL
+#define OMAP1_PM_DOMAIN NULL
 #endif /* CONFIG_PM_RUNTIME */
 
 static struct pm_clk_notifier_block platform_bus_notifier = {
-	.pwr_domain = OMAP1_PWR_DOMAIN,
+	.pm_domain = OMAP1_PM_DOMAIN,
 	.con_ids = { "ick", "fck", NULL, },
 };
 
diff --git a/arch/arm/mach-shmobile/pm_runtime.c b/arch/arm/mach-shmobile/pm_runtime.c
index 2d1b67a59e4a..99802d28e5d3 100644
--- a/arch/arm/mach-shmobile/pm_runtime.c
+++ b/arch/arm/mach-shmobile/pm_runtime.c
@@ -28,7 +28,7 @@ static int default_platform_runtime_idle(struct device *dev)
 	return pm_runtime_suspend(dev);
 }
 
-static struct dev_power_domain default_power_domain = {
+static struct dev_pm_domain default_pm_domain = {
 	.ops = {
 		.runtime_suspend = pm_runtime_clk_suspend,
 		.runtime_resume = pm_runtime_clk_resume,
@@ -37,16 +37,16 @@ static struct dev_power_domain default_power_domain = {
 	},
 };
 
-#define DEFAULT_PWR_DOMAIN_PTR	(&default_power_domain)
+#define DEFAULT_PM_DOMAIN_PTR	(&default_pm_domain)
 
 #else
 
-#define DEFAULT_PWR_DOMAIN_PTR	NULL
+#define DEFAULT_PM_DOMAIN_PTR	NULL
 
 #endif /* CONFIG_PM_RUNTIME */
 
 static struct pm_clk_notifier_block platform_bus_notifier = {
-	.pwr_domain = DEFAULT_PWR_DOMAIN_PTR,
+	.pm_domain = DEFAULT_PM_DOMAIN_PTR,
 	.con_ids = { NULL, },
 };
 
diff --git a/arch/arm/plat-omap/omap_device.c b/arch/arm/plat-omap/omap_device.c
index 49fc0df0c21f..d21579b2c11e 100644
--- a/arch/arm/plat-omap/omap_device.c
+++ b/arch/arm/plat-omap/omap_device.c
@@ -564,7 +564,7 @@ static int _od_runtime_resume(struct device *dev)
 	return pm_generic_runtime_resume(dev);
 }
 
-static struct dev_power_domain omap_device_power_domain = {
+static struct dev_pm_domain omap_device_pm_domain = {
 	.ops = {
 		.runtime_suspend = _od_runtime_suspend,
 		.runtime_idle = _od_runtime_idle,
@@ -586,7 +586,7 @@ int omap_device_register(struct omap_device *od)
 	pr_debug("omap_device: %s: registering\n", od->pdev.name);
 
 	od->pdev.dev.parent = &omap_device_parent;
-	od->pdev.dev.pwr_domain = &omap_device_power_domain;
+	od->pdev.dev.pm_domain = &omap_device_pm_domain;
 	return platform_device_register(&od->pdev);
 }
 
diff --git a/arch/sh/kernel/cpu/shmobile/pm_runtime.c b/arch/sh/kernel/cpu/shmobile/pm_runtime.c
index 64c807c39208..bf280c812d2f 100644
--- a/arch/sh/kernel/cpu/shmobile/pm_runtime.c
+++ b/arch/sh/kernel/cpu/shmobile/pm_runtime.c
@@ -256,7 +256,7 @@ out:
 	return ret;
 }
 
-static struct dev_power_domain default_power_domain = {
+static struct dev_pm_domain default_pm_domain = {
 	.ops = {
 		.runtime_suspend = default_platform_runtime_suspend,
 		.runtime_resume = default_platform_runtime_resume,
@@ -285,7 +285,7 @@ static int platform_bus_notify(struct notifier_block *nb,
 		hwblk_disable(hwblk_info, hwblk);
 		/* make sure driver re-inits itself once */
 		__set_bit(PDEV_ARCHDATA_FLAG_INIT, &pdev->archdata.flags);
-		dev->pwr_domain = &default_power_domain;
+		dev->pm_domain = &default_pm_domain;
 		break;
 	/* TODO: add BUS_NOTIFY_BIND_DRIVER and increase idle count */
 	case BUS_NOTIFY_BOUND_DRIVER:
@@ -299,7 +299,7 @@ static int platform_bus_notify(struct notifier_block *nb,
 		__set_bit(PDEV_ARCHDATA_FLAG_INIT, &pdev->archdata.flags);
 		break;
 	case BUS_NOTIFY_DEL_DEVICE:
-		dev->pwr_domain = NULL;
+		dev->pm_domain = NULL;
 		break;
 	}
 	return 0;
diff --git a/drivers/base/power/clock_ops.c b/drivers/base/power/clock_ops.c
index ad367c4139b1..c5624818259e 100644
--- a/drivers/base/power/clock_ops.c
+++ b/drivers/base/power/clock_ops.c
@@ -278,11 +278,11 @@ int pm_runtime_clk_resume(struct device *dev)
  *
  * For this function to work, @nb must be a member of an object of type
  * struct pm_clk_notifier_block containing all of the requisite data.
- * Specifically, the pwr_domain member of that object is copied to the device's
- * pwr_domain field and its con_ids member is used to populate the device's list
+ * Specifically, the pm_domain member of that object is copied to the device's
+ * pm_domain field and its con_ids member is used to populate the device's list
  * of runtime PM clocks, depending on @action.
  *
- * If the device's pwr_domain field is already populated with a value different
+ * If the device's pm_domain field is already populated with a value different
  * from the one stored in the struct pm_clk_notifier_block object, the function
  * does nothing.
  */
@@ -300,14 +300,14 @@ static int pm_runtime_clk_notify(struct notifier_block *nb,
 
 	switch (action) {
 	case BUS_NOTIFY_ADD_DEVICE:
-		if (dev->pwr_domain)
+		if (dev->pm_domain)
 			break;
 
 		error = pm_runtime_clk_init(dev);
 		if (error)
 			break;
 
-		dev->pwr_domain = clknb->pwr_domain;
+		dev->pm_domain = clknb->pm_domain;
 		if (clknb->con_ids[0]) {
 			for (con_id = clknb->con_ids; *con_id; con_id++)
 				pm_runtime_clk_add(dev, *con_id);
@@ -317,10 +317,10 @@ static int pm_runtime_clk_notify(struct notifier_block *nb,
 
 		break;
 	case BUS_NOTIFY_DEL_DEVICE:
-		if (dev->pwr_domain != clknb->pwr_domain)
+		if (dev->pm_domain != clknb->pm_domain)
 			break;
 
-		dev->pwr_domain = NULL;
+		dev->pm_domain = NULL;
 		pm_runtime_clk_destroy(dev);
 		break;
 	}
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 06f09bf89cb2..85b591a5429a 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -425,9 +425,9 @@ static int device_resume_noirq(struct device *dev, pm_message_t state)
 	TRACE_DEVICE(dev);
 	TRACE_RESUME(0);
 
-	if (dev->pwr_domain) {
+	if (dev->pm_domain) {
 		pm_dev_dbg(dev, state, "EARLY power domain ");
-		error = pm_noirq_op(dev, &dev->pwr_domain->ops, state);
+		error = pm_noirq_op(dev, &dev->pm_domain->ops, state);
 	} else if (dev->type && dev->type->pm) {
 		pm_dev_dbg(dev, state, "EARLY type ");
 		error = pm_noirq_op(dev, dev->type->pm, state);
@@ -521,9 +521,9 @@ static int device_resume(struct device *dev, pm_message_t state, bool async)
 	if (!dev->power.is_suspended)
 		goto Unlock;
 
-	if (dev->pwr_domain) {
+	if (dev->pm_domain) {
 		pm_dev_dbg(dev, state, "power domain ");
-		error = pm_op(dev, &dev->pwr_domain->ops, state);
+		error = pm_op(dev, &dev->pm_domain->ops, state);
 		goto End;
 	}
 
@@ -641,10 +641,10 @@ static void device_complete(struct device *dev, pm_message_t state)
 {
 	device_lock(dev);
 
-	if (dev->pwr_domain) {
+	if (dev->pm_domain) {
 		pm_dev_dbg(dev, state, "completing power domain ");
-		if (dev->pwr_domain->ops.complete)
-			dev->pwr_domain->ops.complete(dev);
+		if (dev->pm_domain->ops.complete)
+			dev->pm_domain->ops.complete(dev);
 	} else if (dev->type && dev->type->pm) {
 		pm_dev_dbg(dev, state, "completing type ");
 		if (dev->type->pm->complete)
@@ -744,9 +744,9 @@ static int device_suspend_noirq(struct device *dev, pm_message_t state)
 {
 	int error;
 
-	if (dev->pwr_domain) {
+	if (dev->pm_domain) {
 		pm_dev_dbg(dev, state, "LATE power domain ");
-		error = pm_noirq_op(dev, &dev->pwr_domain->ops, state);
+		error = pm_noirq_op(dev, &dev->pm_domain->ops, state);
 		if (error)
 			return error;
 	} else if (dev->type && dev->type->pm) {
@@ -853,9 +853,9 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async)
 		goto Unlock;
 	}
 
-	if (dev->pwr_domain) {
+	if (dev->pm_domain) {
 		pm_dev_dbg(dev, state, "power domain ");
-		error = pm_op(dev, &dev->pwr_domain->ops, state);
+		error = pm_op(dev, &dev->pm_domain->ops, state);
 		goto End;
 	}
 
@@ -982,11 +982,11 @@ static int device_prepare(struct device *dev, pm_message_t state)
 
 	device_lock(dev);
 
-	if (dev->pwr_domain) {
+	if (dev->pm_domain) {
 		pm_dev_dbg(dev, state, "preparing power domain ");
-		if (dev->pwr_domain->ops.prepare)
-			error = dev->pwr_domain->ops.prepare(dev);
-		suspend_report_result(dev->pwr_domain->ops.prepare, error);
+		if (dev->pm_domain->ops.prepare)
+			error = dev->pm_domain->ops.prepare(dev);
+		suspend_report_result(dev->pm_domain->ops.prepare, error);
 		if (error)
 			goto End;
 	} else if (dev->type && dev->type->pm) {
diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 0d4587b15c55..5f5c4236f006 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -213,8 +213,8 @@ static int rpm_idle(struct device *dev, int rpmflags)
 
 	dev->power.idle_notification = true;
 
-	if (dev->pwr_domain)
-		callback = dev->pwr_domain->ops.runtime_idle;
+	if (dev->pm_domain)
+		callback = dev->pm_domain->ops.runtime_idle;
 	else if (dev->type && dev->type->pm)
 		callback = dev->type->pm->runtime_idle;
 	else if (dev->class && dev->class->pm)
@@ -374,8 +374,8 @@ static int rpm_suspend(struct device *dev, int rpmflags)
 
 	__update_runtime_status(dev, RPM_SUSPENDING);
 
-	if (dev->pwr_domain)
-		callback = dev->pwr_domain->ops.runtime_suspend;
+	if (dev->pm_domain)
+		callback = dev->pm_domain->ops.runtime_suspend;
 	else if (dev->type && dev->type->pm)
 		callback = dev->type->pm->runtime_suspend;
 	else if (dev->class && dev->class->pm)
@@ -573,8 +573,8 @@ static int rpm_resume(struct device *dev, int rpmflags)
 
 	__update_runtime_status(dev, RPM_RESUMING);
 
-	if (dev->pwr_domain)
-		callback = dev->pwr_domain->ops.runtime_resume;
+	if (dev->pm_domain)
+		callback = dev->pm_domain->ops.runtime_resume;
 	else if (dev->type && dev->type->pm)
 		callback = dev->type->pm->runtime_resume;
 	else if (dev->class && dev->class->pm)
diff --git a/include/linux/device.h b/include/linux/device.h
index e4f62d8896b7..160d4ddb2499 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -516,7 +516,7 @@ struct device_dma_parameters {
  * 		minimizes board-specific #ifdefs in drivers.
  * @power:	For device power management.
  * 		See Documentation/power/devices.txt for details.
- * @pwr_domain:	Provide callbacks that are executed during system suspend,
+ * @pm_domain:	Provide callbacks that are executed during system suspend,
  * 		hibernation, system resume and during runtime PM transitions
  * 		along with subsystem-level and driver-level callbacks.
  * @numa_node:	NUMA node this device is close to.
@@ -567,7 +567,7 @@ struct device {
 	void		*platform_data;	/* Platform specific data, device
 					   core doesn't touch it */
 	struct dev_pm_info	power;
-	struct dev_power_domain	*pwr_domain;
+	struct dev_pm_domain	*pm_domain;
 
 #ifdef CONFIG_NUMA
 	int		numa_node;	/* NUMA node this device is close to */
diff --git a/include/linux/pm.h b/include/linux/pm.h
index 411e4f4be52b..e3963208aa93 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -472,7 +472,7 @@ extern void update_pm_runtime_accounting(struct device *dev);
  * hibernation, system resume and during runtime PM transitions along with
  * subsystem-level and driver-level callbacks.
  */
-struct dev_power_domain {
+struct dev_pm_domain {
 	struct dev_pm_ops	ops;
 };
 
diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
index 878cf84baeb1..ef91904c7110 100644
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -247,7 +247,7 @@ static inline void pm_runtime_dont_use_autosuspend(struct device *dev)
 
 struct pm_clk_notifier_block {
 	struct notifier_block nb;
-	struct dev_power_domain *pwr_domain;
+	struct dev_pm_domain *pm_domain;
 	char *con_ids[];
 };
 
-- 
cgit v1.2.3


From dc6e4e56e6ef473a696a1ab24f80b79b9aceb92d Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Thu, 23 Jun 2011 01:53:04 +0200
Subject: PM: subsys_data in struct dev_pm_info need not depend on RM_RUNTIME

The subsys_data field of struct dev_pm_info, introduced by commit
1d2b71f61b6a10216274e27b717becf9ae101fc7 (PM / Runtime: Add subsystem
data field to struct dev_pm_info), is going to be used even if
CONFIG_PM_RUNTIME is not set, so move it from under the #ifdef.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 include/linux/pm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pm.h b/include/linux/pm.h
index e3963208aa93..7e8f0763d1ec 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -461,8 +461,8 @@ struct dev_pm_info {
 	unsigned long		active_jiffies;
 	unsigned long		suspended_jiffies;
 	unsigned long		accounting_timestamp;
-	void			*subsys_data;  /* Owned by the subsystem. */
 #endif
+	void			*subsys_data;  /* Owned by the subsystem. */
 };
 
 extern void update_pm_runtime_accounting(struct device *dev);
-- 
cgit v1.2.3


From f721889ff65afa6243c463832c74dee3bed418d5 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Fri, 1 Jul 2011 22:12:45 +0200
Subject: PM / Domains: Support for generic I/O PM domains (v8)

Introduce common headers, helper functions and callbacks allowing
platforms to use simple generic power domains for runtime power
management.

Introduce struct generic_pm_domain to be used for representing
power domains that each contain a number of devices and may be
parent domains or subdomains with respect to other power domains.
Among other things, this structure includes callbacks to be
provided by platforms for performing specific tasks related to
power management (i.e. ->stop_device() may disable a device's
clocks, while ->start_device() may enable them, ->power_off() is
supposed to remove power from the entire power domain
and ->power_on() is supposed to restore it).

Introduce functions that can be used as power domain runtime PM
callbacks, pm_genpd_runtime_suspend() and pm_genpd_runtime_resume(),
as well as helper functions for the initialization of a power
domain represented by a struct generic_power_domain object,
adding a device to or removing a device from it and adding or
removing subdomains.

Introduce configuration option CONFIG_PM_GENERIC_DOMAINS to be
selected by the platforms that want to use the new code.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
Reviewed-by: Kevin Hilman <khilman@ti.com>
---
 drivers/base/power/Makefile |   1 +
 drivers/base/power/domain.c | 494 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/pm_domain.h   |  78 +++++++
 kernel/power/Kconfig        |   4 +
 4 files changed, 577 insertions(+)
 create mode 100644 drivers/base/power/domain.c
 create mode 100644 include/linux/pm_domain.h

(limited to 'include/linux')

diff --git a/drivers/base/power/Makefile b/drivers/base/power/Makefile
index 3647e114d0e7..2639ae79a372 100644
--- a/drivers/base/power/Makefile
+++ b/drivers/base/power/Makefile
@@ -3,6 +3,7 @@ obj-$(CONFIG_PM_SLEEP)	+= main.o wakeup.o
 obj-$(CONFIG_PM_RUNTIME)	+= runtime.o
 obj-$(CONFIG_PM_TRACE_RTC)	+= trace.o
 obj-$(CONFIG_PM_OPP)	+= opp.o
+obj-$(CONFIG_PM_GENERIC_DOMAINS)	+=  domain.o
 obj-$(CONFIG_HAVE_CLK)	+= clock_ops.o
 
 ccflags-$(CONFIG_DEBUG_DRIVER) := -DDEBUG
\ No newline at end of file
diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
new file mode 100644
index 000000000000..fd31be3be404
--- /dev/null
+++ b/drivers/base/power/domain.c
@@ -0,0 +1,494 @@
+/*
+ * drivers/base/power/domain.c - Common code related to device power domains.
+ *
+ * Copyright (C) 2011 Rafael J. Wysocki <rjw@sisk.pl>, Renesas Electronics Corp.
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/io.h>
+#include <linux/pm_runtime.h>
+#include <linux/pm_domain.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+
+#ifdef CONFIG_PM_RUNTIME
+
+static void genpd_sd_counter_dec(struct generic_pm_domain *genpd)
+{
+	if (!WARN_ON(genpd->sd_count == 0))
+			genpd->sd_count--;
+}
+
+/**
+ * __pm_genpd_save_device - Save the pre-suspend state of a device.
+ * @dle: Device list entry of the device to save the state of.
+ * @genpd: PM domain the device belongs to.
+ */
+static int __pm_genpd_save_device(struct dev_list_entry *dle,
+				  struct generic_pm_domain *genpd)
+{
+	struct device *dev = dle->dev;
+	struct device_driver *drv = dev->driver;
+	int ret = 0;
+
+	if (dle->need_restore)
+		return 0;
+
+	if (drv && drv->pm && drv->pm->runtime_suspend) {
+		if (genpd->start_device)
+			genpd->start_device(dev);
+
+		ret = drv->pm->runtime_suspend(dev);
+
+		if (genpd->stop_device)
+			genpd->stop_device(dev);
+	}
+
+	if (!ret)
+		dle->need_restore = true;
+
+	return ret;
+}
+
+/**
+ * __pm_genpd_restore_device - Restore the pre-suspend state of a device.
+ * @dle: Device list entry of the device to restore the state of.
+ * @genpd: PM domain the device belongs to.
+ */
+static void __pm_genpd_restore_device(struct dev_list_entry *dle,
+				      struct generic_pm_domain *genpd)
+{
+	struct device *dev = dle->dev;
+	struct device_driver *drv = dev->driver;
+
+	if (!dle->need_restore)
+		return;
+
+	if (drv && drv->pm && drv->pm->runtime_resume) {
+		if (genpd->start_device)
+			genpd->start_device(dev);
+
+		drv->pm->runtime_resume(dev);
+
+		if (genpd->stop_device)
+			genpd->stop_device(dev);
+	}
+
+	dle->need_restore = false;
+}
+
+/**
+ * pm_genpd_poweroff - Remove power from a given PM domain.
+ * @genpd: PM domain to power down.
+ *
+ * If all of the @genpd's devices have been suspended and all of its subdomains
+ * have been powered down, run the runtime suspend callbacks provided by all of
+ * the @genpd's devices' drivers and remove power from @genpd.
+ */
+static int pm_genpd_poweroff(struct generic_pm_domain *genpd)
+{
+	struct generic_pm_domain *parent;
+	struct dev_list_entry *dle;
+	unsigned int not_suspended;
+	int ret;
+
+	if (genpd->power_is_off)
+		return 0;
+
+	if (genpd->sd_count > 0)
+		return -EBUSY;
+
+	not_suspended = 0;
+	list_for_each_entry(dle, &genpd->dev_list, node)
+		if (dle->dev->driver && !pm_runtime_suspended(dle->dev))
+			not_suspended++;
+
+	if (not_suspended > genpd->in_progress)
+		return -EBUSY;
+
+	if (genpd->gov && genpd->gov->power_down_ok) {
+		if (!genpd->gov->power_down_ok(&genpd->domain))
+			return -EAGAIN;
+	}
+
+	list_for_each_entry_reverse(dle, &genpd->dev_list, node) {
+		ret = __pm_genpd_save_device(dle, genpd);
+		if (ret)
+			goto err_dev;
+	}
+
+	if (genpd->power_off)
+		genpd->power_off(genpd);
+
+	genpd->power_is_off = true;
+
+	parent = genpd->parent;
+	if (parent) {
+		genpd_sd_counter_dec(parent);
+		if (parent->sd_count == 0)
+			queue_work(pm_wq, &parent->power_off_work);
+	}
+
+	return 0;
+
+ err_dev:
+	list_for_each_entry_continue(dle, &genpd->dev_list, node)
+		__pm_genpd_restore_device(dle, genpd);
+
+	return ret;
+}
+
+/**
+ * genpd_power_off_work_fn - Power off PM domain whose subdomain count is 0.
+ * @work: Work structure used for scheduling the execution of this function.
+ */
+static void genpd_power_off_work_fn(struct work_struct *work)
+{
+	struct generic_pm_domain *genpd;
+
+	genpd = container_of(work, struct generic_pm_domain, power_off_work);
+
+	if (genpd->parent)
+		mutex_lock(&genpd->parent->lock);
+	mutex_lock_nested(&genpd->lock, SINGLE_DEPTH_NESTING);
+	pm_genpd_poweroff(genpd);
+	mutex_unlock(&genpd->lock);
+	if (genpd->parent)
+		mutex_unlock(&genpd->parent->lock);
+}
+
+/**
+ * pm_genpd_runtime_suspend - Suspend a device belonging to I/O PM domain.
+ * @dev: Device to suspend.
+ *
+ * Carry out a runtime suspend of a device under the assumption that its
+ * pm_domain field points to the domain member of an object of type
+ * struct generic_pm_domain representing a PM domain consisting of I/O devices.
+ */
+static int pm_genpd_runtime_suspend(struct device *dev)
+{
+	struct generic_pm_domain *genpd;
+
+	dev_dbg(dev, "%s()\n", __func__);
+
+	if (IS_ERR_OR_NULL(dev->pm_domain))
+		return -EINVAL;
+
+	genpd = container_of(dev->pm_domain, struct generic_pm_domain, domain);
+
+	if (genpd->parent)
+		mutex_lock(&genpd->parent->lock);
+	mutex_lock_nested(&genpd->lock, SINGLE_DEPTH_NESTING);
+
+	if (genpd->stop_device) {
+		int ret = genpd->stop_device(dev);
+		if (ret)
+			goto out;
+	}
+	genpd->in_progress++;
+	pm_genpd_poweroff(genpd);
+	genpd->in_progress--;
+
+ out:
+	mutex_unlock(&genpd->lock);
+	if (genpd->parent)
+		mutex_unlock(&genpd->parent->lock);
+
+	return 0;
+}
+
+/**
+ * pm_genpd_poweron - Restore power to a given PM domain and its parents.
+ * @genpd: PM domain to power up.
+ *
+ * Restore power to @genpd and all of its parents so that it is possible to
+ * resume a device belonging to it.
+ */
+static int pm_genpd_poweron(struct generic_pm_domain *genpd)
+{
+	int ret = 0;
+
+ start:
+	if (genpd->parent)
+		mutex_lock(&genpd->parent->lock);
+	mutex_lock_nested(&genpd->lock, SINGLE_DEPTH_NESTING);
+
+	if (!genpd->power_is_off)
+		goto out;
+
+	if (genpd->parent && genpd->parent->power_is_off) {
+		mutex_unlock(&genpd->lock);
+		mutex_unlock(&genpd->parent->lock);
+
+		ret = pm_genpd_poweron(genpd->parent);
+		if (ret)
+			return ret;
+
+		goto start;
+	}
+
+	if (genpd->power_on) {
+		int ret = genpd->power_on(genpd);
+		if (ret)
+			goto out;
+	}
+
+	genpd->power_is_off = false;
+	if (genpd->parent)
+		genpd->parent->sd_count++;
+
+ out:
+	mutex_unlock(&genpd->lock);
+	if (genpd->parent)
+		mutex_unlock(&genpd->parent->lock);
+
+	return ret;
+}
+
+/**
+ * pm_genpd_runtime_resume - Resume a device belonging to I/O PM domain.
+ * @dev: Device to resume.
+ *
+ * Carry out a runtime resume of a device under the assumption that its
+ * pm_domain field points to the domain member of an object of type
+ * struct generic_pm_domain representing a PM domain consisting of I/O devices.
+ */
+static int pm_genpd_runtime_resume(struct device *dev)
+{
+	struct generic_pm_domain *genpd;
+	struct dev_list_entry *dle;
+	int ret;
+
+	dev_dbg(dev, "%s()\n", __func__);
+
+	if (IS_ERR_OR_NULL(dev->pm_domain))
+		return -EINVAL;
+
+	genpd = container_of(dev->pm_domain, struct generic_pm_domain, domain);
+
+	ret = pm_genpd_poweron(genpd);
+	if (ret)
+		return ret;
+
+	mutex_lock(&genpd->lock);
+
+	list_for_each_entry(dle, &genpd->dev_list, node) {
+		if (dle->dev == dev) {
+			__pm_genpd_restore_device(dle, genpd);
+			break;
+		}
+	}
+
+	if (genpd->start_device)
+		genpd->start_device(dev);
+
+	mutex_unlock(&genpd->lock);
+
+	return 0;
+}
+
+#else
+
+static inline void genpd_power_off_work_fn(struct work_struct *work) {}
+
+#define pm_genpd_runtime_suspend	NULL
+#define pm_genpd_runtime_resume		NULL
+
+#endif /* CONFIG_PM_RUNTIME */
+
+/**
+ * pm_genpd_add_device - Add a device to an I/O PM domain.
+ * @genpd: PM domain to add the device to.
+ * @dev: Device to be added.
+ */
+int pm_genpd_add_device(struct generic_pm_domain *genpd, struct device *dev)
+{
+	struct dev_list_entry *dle;
+	int ret = 0;
+
+	dev_dbg(dev, "%s()\n", __func__);
+
+	if (IS_ERR_OR_NULL(genpd) || IS_ERR_OR_NULL(dev))
+		return -EINVAL;
+
+	mutex_lock(&genpd->lock);
+
+	if (genpd->power_is_off) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	list_for_each_entry(dle, &genpd->dev_list, node)
+		if (dle->dev == dev) {
+			ret = -EINVAL;
+			goto out;
+		}
+
+	dle = kzalloc(sizeof(*dle), GFP_KERNEL);
+	if (!dle) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	dle->dev = dev;
+	dle->need_restore = false;
+	list_add_tail(&dle->node, &genpd->dev_list);
+
+	spin_lock_irq(&dev->power.lock);
+	dev->pm_domain = &genpd->domain;
+	spin_unlock_irq(&dev->power.lock);
+
+ out:
+	mutex_unlock(&genpd->lock);
+
+	return ret;
+}
+
+/**
+ * pm_genpd_remove_device - Remove a device from an I/O PM domain.
+ * @genpd: PM domain to remove the device from.
+ * @dev: Device to be removed.
+ */
+int pm_genpd_remove_device(struct generic_pm_domain *genpd,
+			   struct device *dev)
+{
+	struct dev_list_entry *dle;
+	int ret = -EINVAL;
+
+	dev_dbg(dev, "%s()\n", __func__);
+
+	if (IS_ERR_OR_NULL(genpd) || IS_ERR_OR_NULL(dev))
+		return -EINVAL;
+
+	mutex_lock(&genpd->lock);
+
+	list_for_each_entry(dle, &genpd->dev_list, node) {
+		if (dle->dev != dev)
+			continue;
+
+		spin_lock_irq(&dev->power.lock);
+		dev->pm_domain = NULL;
+		spin_unlock_irq(&dev->power.lock);
+
+		list_del(&dle->node);
+		kfree(dle);
+
+		ret = 0;
+		break;
+	}
+
+	mutex_unlock(&genpd->lock);
+
+	return ret;
+}
+
+/**
+ * pm_genpd_add_subdomain - Add a subdomain to an I/O PM domain.
+ * @genpd: Master PM domain to add the subdomain to.
+ * @new_subdomain: Subdomain to be added.
+ */
+int pm_genpd_add_subdomain(struct generic_pm_domain *genpd,
+			   struct generic_pm_domain *new_subdomain)
+{
+	struct generic_pm_domain *subdomain;
+	int ret = 0;
+
+	if (IS_ERR_OR_NULL(genpd) || IS_ERR_OR_NULL(new_subdomain))
+		return -EINVAL;
+
+	mutex_lock(&genpd->lock);
+
+	if (genpd->power_is_off && !new_subdomain->power_is_off) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	list_for_each_entry(subdomain, &genpd->sd_list, sd_node) {
+		if (subdomain == new_subdomain) {
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
+	mutex_lock_nested(&new_subdomain->lock, SINGLE_DEPTH_NESTING);
+
+	list_add_tail(&new_subdomain->sd_node, &genpd->sd_list);
+	new_subdomain->parent = genpd;
+	if (!subdomain->power_is_off)
+		genpd->sd_count++;
+
+	mutex_unlock(&new_subdomain->lock);
+
+ out:
+	mutex_unlock(&genpd->lock);
+
+	return ret;
+}
+
+/**
+ * pm_genpd_remove_subdomain - Remove a subdomain from an I/O PM domain.
+ * @genpd: Master PM domain to remove the subdomain from.
+ * @target: Subdomain to be removed.
+ */
+int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd,
+			      struct generic_pm_domain *target)
+{
+	struct generic_pm_domain *subdomain;
+	int ret = -EINVAL;
+
+	if (IS_ERR_OR_NULL(genpd) || IS_ERR_OR_NULL(target))
+		return -EINVAL;
+
+	mutex_lock(&genpd->lock);
+
+	list_for_each_entry(subdomain, &genpd->sd_list, sd_node) {
+		if (subdomain != target)
+			continue;
+
+		mutex_lock_nested(&subdomain->lock, SINGLE_DEPTH_NESTING);
+
+		list_del(&subdomain->sd_node);
+		subdomain->parent = NULL;
+		if (!subdomain->power_is_off)
+			genpd_sd_counter_dec(genpd);
+
+		mutex_unlock(&subdomain->lock);
+
+		ret = 0;
+		break;
+	}
+
+	mutex_unlock(&genpd->lock);
+
+	return ret;
+}
+
+/**
+ * pm_genpd_init - Initialize a generic I/O PM domain object.
+ * @genpd: PM domain object to initialize.
+ * @gov: PM domain governor to associate with the domain (may be NULL).
+ * @is_off: Initial value of the domain's power_is_off field.
+ */
+void pm_genpd_init(struct generic_pm_domain *genpd,
+		   struct dev_power_governor *gov, bool is_off)
+{
+	if (IS_ERR_OR_NULL(genpd))
+		return;
+
+	INIT_LIST_HEAD(&genpd->sd_node);
+	genpd->parent = NULL;
+	INIT_LIST_HEAD(&genpd->dev_list);
+	INIT_LIST_HEAD(&genpd->sd_list);
+	mutex_init(&genpd->lock);
+	genpd->gov = gov;
+	INIT_WORK(&genpd->power_off_work, genpd_power_off_work_fn);
+	genpd->in_progress = 0;
+	genpd->sd_count = 0;
+	genpd->power_is_off = is_off;
+	genpd->domain.ops.runtime_suspend = pm_genpd_runtime_suspend;
+	genpd->domain.ops.runtime_resume = pm_genpd_runtime_resume;
+	genpd->domain.ops.runtime_idle = pm_generic_runtime_idle;
+}
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
new file mode 100644
index 000000000000..b1a22c65380b
--- /dev/null
+++ b/include/linux/pm_domain.h
@@ -0,0 +1,78 @@
+/*
+ * pm_domain.h - Definitions and headers related to device power domains.
+ *
+ * Copyright (C) 2011 Rafael J. Wysocki <rjw@sisk.pl>, Renesas Electronics Corp.
+ *
+ * This file is released under the GPLv2.
+ */
+
+#ifndef _LINUX_PM_DOMAIN_H
+#define _LINUX_PM_DOMAIN_H
+
+#include <linux/device.h>
+
+struct dev_power_governor {
+	bool (*power_down_ok)(struct dev_pm_domain *domain);
+};
+
+struct generic_pm_domain {
+	struct dev_pm_domain domain;	/* PM domain operations */
+	struct list_head sd_node;	/* Node in the parent's subdomain list */
+	struct generic_pm_domain *parent;	/* Parent PM domain */
+	struct list_head sd_list;	/* List of dubdomains */
+	struct list_head dev_list;	/* List of devices */
+	struct mutex lock;
+	struct dev_power_governor *gov;
+	struct work_struct power_off_work;
+	unsigned int in_progress;	/* Number of devices being suspended now */
+	unsigned int sd_count;	/* Number of subdomains with power "on" */
+	bool power_is_off;	/* Whether or not power has been removed */
+	int (*power_off)(struct generic_pm_domain *domain);
+	int (*power_on)(struct generic_pm_domain *domain);
+	int (*start_device)(struct device *dev);
+	int (*stop_device)(struct device *dev);
+};
+
+struct dev_list_entry {
+	struct list_head node;
+	struct device *dev;
+	bool need_restore;
+};
+
+#ifdef CONFIG_PM_GENERIC_DOMAINS
+extern int pm_genpd_add_device(struct generic_pm_domain *genpd,
+			       struct device *dev);
+extern int pm_genpd_remove_device(struct generic_pm_domain *genpd,
+				  struct device *dev);
+extern int pm_genpd_add_subdomain(struct generic_pm_domain *genpd,
+				  struct generic_pm_domain *new_subdomain);
+extern int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd,
+				     struct generic_pm_domain *target);
+extern void pm_genpd_init(struct generic_pm_domain *genpd,
+			  struct dev_power_governor *gov, bool is_off);
+#else
+static inline int pm_genpd_add_device(struct generic_pm_domain *genpd,
+				      struct device *dev)
+{
+	return -ENOSYS;
+}
+static inline int pm_genpd_remove_device(struct generic_pm_domain *genpd,
+					 struct device *dev)
+{
+	return -ENOSYS;
+}
+static inline int pm_genpd_add_subdomain(struct generic_pm_domain *genpd,
+					 struct generic_pm_domain *new_sd)
+{
+	return -ENOSYS;
+}
+static inline int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd,
+					    struct generic_pm_domain *target)
+{
+	return -ENOSYS;
+}
+static inline void pm_genpd_init(struct generic_pm_domain *genpd,
+				 struct dev_power_governor *gov, bool is_off) {}
+#endif
+
+#endif /* _LINUX_PM_DOMAIN_H */
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 87f4d24b55b0..e83ac2556c86 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -227,3 +227,7 @@ config PM_OPP
 config PM_RUNTIME_CLK
 	def_bool y
 	depends on PM_RUNTIME && HAVE_CLK
+
+config PM_GENERIC_DOMAINS
+	bool
+	depends on PM
-- 
cgit v1.2.3


From e5291928839877f8e73c2643ee1d3fe0bcdcaf5c Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Fri, 1 Jul 2011 22:12:59 +0200
Subject: PM: Introduce generic "noirq" callback routines for subsystems (v2)

Introduce generic "noirq" power management callback routines for
subsystems in addition to the "regular" generic PM callback routines.

The new routines will be used, among other things, for implementing
system-wide PM transitions support for generic PM domains.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 Documentation/power/runtime_pm.txt | 32 ++++++++++++-
 drivers/base/power/generic_ops.c   | 98 ++++++++++++++++++++++++++++++++------
 include/linux/pm.h                 |  6 +++
 3 files changed, 119 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/power/runtime_pm.txt b/Documentation/power/runtime_pm.txt
index b24875b1ced5..4b011b171be4 100644
--- a/Documentation/power/runtime_pm.txt
+++ b/Documentation/power/runtime_pm.txt
@@ -606,32 +606,60 @@ driver/base/power/generic_ops.c:
       callback provided by its driver and return its result, or return 0 if not
       defined
 
+  int pm_generic_suspend_noirq(struct device *dev);
+    - if pm_runtime_suspended(dev) returns "false", invoke the ->suspend_noirq()
+      callback provided by the device's driver and return its result, or return
+      0 if not defined
+
   int pm_generic_resume(struct device *dev);
     - invoke the ->resume() callback provided by the driver of this device and,
       if successful, change the device's runtime PM status to 'active'
 
+  int pm_generic_resume_noirq(struct device *dev);
+    - invoke the ->resume_noirq() callback provided by the driver of this device
+
   int pm_generic_freeze(struct device *dev);
     - if the device has not been suspended at run time, invoke the ->freeze()
       callback provided by its driver and return its result, or return 0 if not
       defined
 
+  int pm_generic_freeze_noirq(struct device *dev);
+    - if pm_runtime_suspended(dev) returns "false", invoke the ->freeze_noirq()
+      callback provided by the device's driver and return its result, or return
+      0 if not defined
+
   int pm_generic_thaw(struct device *dev);
     - if the device has not been suspended at run time, invoke the ->thaw()
       callback provided by its driver and return its result, or return 0 if not
       defined
 
+  int pm_generic_thaw_noirq(struct device *dev);
+    - if pm_runtime_suspended(dev) returns "false", invoke the ->thaw_noirq()
+      callback provided by the device's driver and return its result, or return
+      0 if not defined
+
   int pm_generic_poweroff(struct device *dev);
     - if the device has not been suspended at run time, invoke the ->poweroff()
       callback provided by its driver and return its result, or return 0 if not
       defined
 
+  int pm_generic_poweroff_noirq(struct device *dev);
+    - if pm_runtime_suspended(dev) returns "false", run the ->poweroff_noirq()
+      callback provided by the device's driver and return its result, or return
+      0 if not defined
+
   int pm_generic_restore(struct device *dev);
     - invoke the ->restore() callback provided by the driver of this device and,
       if successful, change the device's runtime PM status to 'active'
 
+  int pm_generic_restore_noirq(struct device *dev);
+    - invoke the ->restore_noirq() callback provided by the device's driver
+
 These functions can be assigned to the ->runtime_idle(), ->runtime_suspend(),
-->runtime_resume(), ->suspend(), ->resume(), ->freeze(), ->thaw(), ->poweroff(),
-or ->restore() callback pointers in the subsystem-level dev_pm_ops structures.
+->runtime_resume(), ->suspend(), ->suspend_noirq(), ->resume(),
+->resume_noirq(), ->freeze(), ->freeze_noirq(), ->thaw(), ->thaw_noirq(),
+->poweroff(), ->poweroff_noirq(), ->restore(), ->restore_noirq() callback
+pointers in the subsystem-level dev_pm_ops structures.
 
 If a subsystem wishes to use all of them at the same time, it can simply assign
 the GENERIC_SUBSYS_PM_OPS macro, defined in include/linux/pm.h, to its
diff --git a/drivers/base/power/generic_ops.c b/drivers/base/power/generic_ops.c
index cb3bb368681c..9508df71274b 100644
--- a/drivers/base/power/generic_ops.c
+++ b/drivers/base/power/generic_ops.c
@@ -94,12 +94,13 @@ int pm_generic_prepare(struct device *dev)
  * __pm_generic_call - Generic suspend/freeze/poweroff/thaw subsystem callback.
  * @dev: Device to handle.
  * @event: PM transition of the system under way.
+ * @bool: Whether or not this is the "noirq" stage.
  *
  * If the device has not been suspended at run time, execute the
  * suspend/freeze/poweroff/thaw callback provided by its driver, if defined, and
  * return its error code.  Otherwise, return zero.
  */
-static int __pm_generic_call(struct device *dev, int event)
+static int __pm_generic_call(struct device *dev, int event, bool noirq)
 {
 	const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
 	int (*callback)(struct device *);
@@ -109,16 +110,16 @@ static int __pm_generic_call(struct device *dev, int event)
 
 	switch (event) {
 	case PM_EVENT_SUSPEND:
-		callback = pm->suspend;
+		callback = noirq ? pm->suspend_noirq : pm->suspend;
 		break;
 	case PM_EVENT_FREEZE:
-		callback = pm->freeze;
+		callback = noirq ? pm->freeze_noirq : pm->freeze;
 		break;
 	case PM_EVENT_HIBERNATE:
-		callback = pm->poweroff;
+		callback = noirq ? pm->poweroff_noirq : pm->poweroff;
 		break;
 	case PM_EVENT_THAW:
-		callback = pm->thaw;
+		callback = noirq ? pm->thaw_noirq : pm->thaw;
 		break;
 	default:
 		callback = NULL;
@@ -128,43 +129,83 @@ static int __pm_generic_call(struct device *dev, int event)
 	return callback ? callback(dev) : 0;
 }
 
+/**
+ * pm_generic_suspend_noirq - Generic suspend_noirq callback for subsystems.
+ * @dev: Device to suspend.
+ */
+int pm_generic_suspend_noirq(struct device *dev)
+{
+	return __pm_generic_call(dev, PM_EVENT_SUSPEND, true);
+}
+EXPORT_SYMBOL_GPL(pm_generic_suspend_noirq);
+
 /**
  * pm_generic_suspend - Generic suspend callback for subsystems.
  * @dev: Device to suspend.
  */
 int pm_generic_suspend(struct device *dev)
 {
-	return __pm_generic_call(dev, PM_EVENT_SUSPEND);
+	return __pm_generic_call(dev, PM_EVENT_SUSPEND, false);
 }
 EXPORT_SYMBOL_GPL(pm_generic_suspend);
 
+/**
+ * pm_generic_freeze_noirq - Generic freeze_noirq callback for subsystems.
+ * @dev: Device to freeze.
+ */
+int pm_generic_freeze_noirq(struct device *dev)
+{
+	return __pm_generic_call(dev, PM_EVENT_FREEZE, true);
+}
+EXPORT_SYMBOL_GPL(pm_generic_freeze_noirq);
+
 /**
  * pm_generic_freeze - Generic freeze callback for subsystems.
  * @dev: Device to freeze.
  */
 int pm_generic_freeze(struct device *dev)
 {
-	return __pm_generic_call(dev, PM_EVENT_FREEZE);
+	return __pm_generic_call(dev, PM_EVENT_FREEZE, false);
 }
 EXPORT_SYMBOL_GPL(pm_generic_freeze);
 
+/**
+ * pm_generic_poweroff_noirq - Generic poweroff_noirq callback for subsystems.
+ * @dev: Device to handle.
+ */
+int pm_generic_poweroff_noirq(struct device *dev)
+{
+	return __pm_generic_call(dev, PM_EVENT_HIBERNATE, true);
+}
+EXPORT_SYMBOL_GPL(pm_generic_poweroff_noirq);
+
 /**
  * pm_generic_poweroff - Generic poweroff callback for subsystems.
  * @dev: Device to handle.
  */
 int pm_generic_poweroff(struct device *dev)
 {
-	return __pm_generic_call(dev, PM_EVENT_HIBERNATE);
+	return __pm_generic_call(dev, PM_EVENT_HIBERNATE, false);
 }
 EXPORT_SYMBOL_GPL(pm_generic_poweroff);
 
+/**
+ * pm_generic_thaw_noirq - Generic thaw_noirq callback for subsystems.
+ * @dev: Device to thaw.
+ */
+int pm_generic_thaw_noirq(struct device *dev)
+{
+	return __pm_generic_call(dev, PM_EVENT_THAW, true);
+}
+EXPORT_SYMBOL_GPL(pm_generic_thaw_noirq);
+
 /**
  * pm_generic_thaw - Generic thaw callback for subsystems.
  * @dev: Device to thaw.
  */
 int pm_generic_thaw(struct device *dev)
 {
-	return __pm_generic_call(dev, PM_EVENT_THAW);
+	return __pm_generic_call(dev, PM_EVENT_THAW, false);
 }
 EXPORT_SYMBOL_GPL(pm_generic_thaw);
 
@@ -172,12 +213,13 @@ EXPORT_SYMBOL_GPL(pm_generic_thaw);
  * __pm_generic_resume - Generic resume/restore callback for subsystems.
  * @dev: Device to handle.
  * @event: PM transition of the system under way.
+ * @bool: Whether or not this is the "noirq" stage.
  *
  * Execute the resume/resotre callback provided by the @dev's driver, if
  * defined.  If it returns 0, change the device's runtime PM status to 'active'.
  * Return the callback's error code.
  */
-static int __pm_generic_resume(struct device *dev, int event)
+static int __pm_generic_resume(struct device *dev, int event, bool noirq)
 {
 	const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
 	int (*callback)(struct device *);
@@ -188,10 +230,10 @@ static int __pm_generic_resume(struct device *dev, int event)
 
 	switch (event) {
 	case PM_EVENT_RESUME:
-		callback = pm->resume;
+		callback = noirq ? pm->resume_noirq : pm->resume;
 		break;
 	case PM_EVENT_RESTORE:
-		callback = pm->restore;
+		callback = noirq ? pm->restore_noirq : pm->restore;
 		break;
 	default:
 		callback = NULL;
@@ -202,7 +244,7 @@ static int __pm_generic_resume(struct device *dev, int event)
 		return 0;
 
 	ret = callback(dev);
-	if (!ret && pm_runtime_enabled(dev)) {
+	if (!ret && !noirq && pm_runtime_enabled(dev)) {
 		pm_runtime_disable(dev);
 		pm_runtime_set_active(dev);
 		pm_runtime_enable(dev);
@@ -211,23 +253,43 @@ static int __pm_generic_resume(struct device *dev, int event)
 	return ret;
 }
 
+/**
+ * pm_generic_resume_noirq - Generic resume_noirq callback for subsystems.
+ * @dev: Device to resume.
+ */
+int pm_generic_resume_noirq(struct device *dev)
+{
+	return __pm_generic_resume(dev, PM_EVENT_RESUME, true);
+}
+EXPORT_SYMBOL_GPL(pm_generic_resume_noirq);
+
 /**
  * pm_generic_resume - Generic resume callback for subsystems.
  * @dev: Device to resume.
  */
 int pm_generic_resume(struct device *dev)
 {
-	return __pm_generic_resume(dev, PM_EVENT_RESUME);
+	return __pm_generic_resume(dev, PM_EVENT_RESUME, false);
 }
 EXPORT_SYMBOL_GPL(pm_generic_resume);
 
+/**
+ * pm_generic_restore_noirq - Generic restore_noirq callback for subsystems.
+ * @dev: Device to restore.
+ */
+int pm_generic_restore_noirq(struct device *dev)
+{
+	return __pm_generic_resume(dev, PM_EVENT_RESTORE, true);
+}
+EXPORT_SYMBOL_GPL(pm_generic_restore_noirq);
+
 /**
  * pm_generic_restore - Generic restore callback for subsystems.
  * @dev: Device to restore.
  */
 int pm_generic_restore(struct device *dev)
 {
-	return __pm_generic_resume(dev, PM_EVENT_RESTORE);
+	return __pm_generic_resume(dev, PM_EVENT_RESTORE, false);
 }
 EXPORT_SYMBOL_GPL(pm_generic_restore);
 
@@ -256,11 +318,17 @@ struct dev_pm_ops generic_subsys_pm_ops = {
 #ifdef CONFIG_PM_SLEEP
 	.prepare = pm_generic_prepare,
 	.suspend = pm_generic_suspend,
+	.suspend_noirq = pm_generic_suspend_noirq,
 	.resume = pm_generic_resume,
+	.resume_noirq = pm_generic_resume_noirq,
 	.freeze = pm_generic_freeze,
+	.freeze_noirq = pm_generic_freeze_noirq,
 	.thaw = pm_generic_thaw,
+	.thaw_noirq = pm_generic_thaw_noirq,
 	.poweroff = pm_generic_poweroff,
+	.poweroff_noirq = pm_generic_poweroff_noirq,
 	.restore = pm_generic_restore,
+	.restore_noirq = pm_generic_restore_noirq,
 	.complete = pm_generic_complete,
 #endif
 #ifdef CONFIG_PM_RUNTIME
diff --git a/include/linux/pm.h b/include/linux/pm.h
index 7e8f0763d1ec..f7c84c9abd30 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -553,11 +553,17 @@ extern void __suspend_report_result(const char *function, void *fn, int ret);
 extern int device_pm_wait_for_dev(struct device *sub, struct device *dev);
 
 extern int pm_generic_prepare(struct device *dev);
+extern int pm_generic_suspend_noirq(struct device *dev);
 extern int pm_generic_suspend(struct device *dev);
+extern int pm_generic_resume_noirq(struct device *dev);
 extern int pm_generic_resume(struct device *dev);
+extern int pm_generic_freeze_noirq(struct device *dev);
 extern int pm_generic_freeze(struct device *dev);
+extern int pm_generic_thaw_noirq(struct device *dev);
 extern int pm_generic_thaw(struct device *dev);
+extern int pm_generic_restore_noirq(struct device *dev);
 extern int pm_generic_restore(struct device *dev);
+extern int pm_generic_poweroff_noirq(struct device *dev);
 extern int pm_generic_poweroff(struct device *dev);
 extern void pm_generic_complete(struct device *dev);
 
-- 
cgit v1.2.3


From 596ba34bcd2978ee9823cc1d84df230576f8ffb9 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Fri, 1 Jul 2011 22:13:19 +0200
Subject: PM / Domains: System-wide transitions support for generic domains
 (v5)

Make generic PM domains support system-wide power transitions
(system suspend and hibernation).  Add suspend, resume, freeze, thaw,
poweroff and restore callbacks to be associated with struct
generic_pm_domain objects and make pm_genpd_init() use them as
appropriate.

The new callbacks do nothing for devices belonging to power domains
that were powered down at run time (before the transition).  For the
other devices the action carried out depends on the type of the
transition.  During system suspend the power domain .suspend()
callback executes pm_generic_suspend() for the device, while the
PM domain .suspend_noirq() callback runs pm_generic_suspend_noirq()
for it, stops it and eventually removes power from the PM domain it
belongs to (after all devices in the domain have been stopped and its
subdomains have been powered off).

During system resume the PM domain .resume_noirq() callback
restores power to the PM domain (when executed for it first time),
starts the device and executes pm_generic_resume_noirq() for it,
while the .resume() callback executes pm_generic_resume() for the
device.  Finally, the .complete() callback executes pm_runtime_idle()
for the device which should put it back into the suspended state if
its runtime PM usage count is equal to zero at that time.

The actions carried out during hibernation and resume from it are
analogous to the ones described above.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Reviewed-by: Kevin Hilman <khilman@ti.com>
---
 drivers/base/power/domain.c | 551 ++++++++++++++++++++++++++++++++++++++++++--
 include/linux/pm_domain.h   |  12 +
 2 files changed, 548 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index f14ba32818dc..33086e9afaf6 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -21,7 +21,7 @@ static struct generic_pm_domain *dev_to_genpd(struct device *dev)
 	if (IS_ERR_OR_NULL(dev->pm_domain))
 		return ERR_PTR(-EINVAL);
 
-	return container_of(dev->pm_domain, struct generic_pm_domain, domain);
+	return pd_to_genpd(dev->pm_domain);
 }
 
 static void genpd_sd_counter_dec(struct generic_pm_domain *genpd)
@@ -46,7 +46,8 @@ static int pm_genpd_poweron(struct generic_pm_domain *genpd)
 		mutex_lock(&genpd->parent->lock);
 	mutex_lock_nested(&genpd->lock, SINGLE_DEPTH_NESTING);
 
-	if (!genpd->power_is_off)
+	if (!genpd->power_is_off
+	    || (genpd->prepared_count > 0 && genpd->suspend_power_off))
 		goto out;
 
 	if (genpd->parent && genpd->parent->power_is_off) {
@@ -155,7 +156,7 @@ static int pm_genpd_poweroff(struct generic_pm_domain *genpd)
 	unsigned int not_suspended;
 	int ret;
 
-	if (genpd->power_is_off)
+	if (genpd->power_is_off || genpd->prepared_count > 0)
 		return 0;
 
 	if (genpd->sd_count > 0)
@@ -259,6 +260,27 @@ static int pm_genpd_runtime_suspend(struct device *dev)
 	return 0;
 }
 
+/**
+ * __pm_genpd_runtime_resume - Resume a device belonging to I/O PM domain.
+ * @dev: Device to resume.
+ * @genpd: PM domain the device belongs to.
+ */
+static void __pm_genpd_runtime_resume(struct device *dev,
+				      struct generic_pm_domain *genpd)
+{
+	struct dev_list_entry *dle;
+
+	list_for_each_entry(dle, &genpd->dev_list, node) {
+		if (dle->dev == dev) {
+			__pm_genpd_restore_device(dle, genpd);
+			break;
+		}
+	}
+
+	if (genpd->start_device)
+		genpd->start_device(dev);
+}
+
 /**
  * pm_genpd_runtime_resume - Resume a device belonging to I/O PM domain.
  * @dev: Device to resume.
@@ -270,7 +292,6 @@ static int pm_genpd_runtime_suspend(struct device *dev)
 static int pm_genpd_runtime_resume(struct device *dev)
 {
 	struct generic_pm_domain *genpd;
-	struct dev_list_entry *dle;
 	int ret;
 
 	dev_dbg(dev, "%s()\n", __func__);
@@ -284,17 +305,7 @@ static int pm_genpd_runtime_resume(struct device *dev)
 		return ret;
 
 	mutex_lock(&genpd->lock);
-
-	list_for_each_entry(dle, &genpd->dev_list, node) {
-		if (dle->dev == dev) {
-			__pm_genpd_restore_device(dle, genpd);
-			break;
-		}
-	}
-
-	if (genpd->start_device)
-		genpd->start_device(dev);
-
+	__pm_genpd_runtime_resume(dev, genpd);
 	mutex_unlock(&genpd->lock);
 
 	return 0;
@@ -303,12 +314,493 @@ static int pm_genpd_runtime_resume(struct device *dev)
 #else
 
 static inline void genpd_power_off_work_fn(struct work_struct *work) {}
+static inline void __pm_genpd_runtime_resume(struct device *dev,
+					     struct generic_pm_domain *genpd) {}
 
 #define pm_genpd_runtime_suspend	NULL
 #define pm_genpd_runtime_resume		NULL
 
 #endif /* CONFIG_PM_RUNTIME */
 
+#ifdef CONFIG_PM_SLEEP
+
+/**
+ * pm_genpd_sync_poweroff - Synchronously power off a PM domain and its parents.
+ * @genpd: PM domain to power off, if possible.
+ *
+ * Check if the given PM domain can be powered off (during system suspend or
+ * hibernation) and do that if so.  Also, in that case propagate to its parent.
+ *
+ * This function is only called in "noirq" stages of system power transitions,
+ * so it need not acquire locks (all of the "noirq" callbacks are executed
+ * sequentially, so it is guaranteed that it will never run twice in parallel).
+ */
+static void pm_genpd_sync_poweroff(struct generic_pm_domain *genpd)
+{
+	struct generic_pm_domain *parent = genpd->parent;
+
+	if (genpd->power_is_off)
+		return;
+
+	if (genpd->suspended_count != genpd->device_count || genpd->sd_count > 0)
+		return;
+
+	if (genpd->power_off)
+		genpd->power_off(genpd);
+
+	genpd->power_is_off = true;
+	if (parent) {
+		genpd_sd_counter_dec(parent);
+		pm_genpd_sync_poweroff(parent);
+	}
+}
+
+/**
+ * pm_genpd_prepare - Start power transition of a device in a PM domain.
+ * @dev: Device to start the transition of.
+ *
+ * Start a power transition of a device (during a system-wide power transition)
+ * under the assumption that its pm_domain field points to the domain member of
+ * an object of type struct generic_pm_domain representing a PM domain
+ * consisting of I/O devices.
+ */
+static int pm_genpd_prepare(struct device *dev)
+{
+	struct generic_pm_domain *genpd;
+
+	dev_dbg(dev, "%s()\n", __func__);
+
+	genpd = dev_to_genpd(dev);
+	if (IS_ERR(genpd))
+		return -EINVAL;
+
+	mutex_lock(&genpd->lock);
+
+	if (genpd->prepared_count++ == 0)
+		genpd->suspend_power_off = genpd->power_is_off;
+
+	if (genpd->suspend_power_off) {
+		mutex_unlock(&genpd->lock);
+		return 0;
+	}
+
+	/*
+	 * If the device is in the (runtime) "suspended" state, call
+	 * .start_device() for it, if defined.
+	 */
+	if (pm_runtime_suspended(dev))
+		__pm_genpd_runtime_resume(dev, genpd);
+
+	/*
+	 * Do not check if runtime resume is pending at this point, because it
+	 * has been taken care of already and if pm_genpd_poweron() ran at this
+	 * point as a result of the check, it would deadlock.
+	 */
+	__pm_runtime_disable(dev, false);
+
+	mutex_unlock(&genpd->lock);
+
+	return pm_generic_prepare(dev);
+}
+
+/**
+ * pm_genpd_suspend - Suspend a device belonging to an I/O PM domain.
+ * @dev: Device to suspend.
+ *
+ * Suspend a device under the assumption that its pm_domain field points to the
+ * domain member of an object of type struct generic_pm_domain representing
+ * a PM domain consisting of I/O devices.
+ */
+static int pm_genpd_suspend(struct device *dev)
+{
+	struct generic_pm_domain *genpd;
+
+	dev_dbg(dev, "%s()\n", __func__);
+
+	genpd = dev_to_genpd(dev);
+	if (IS_ERR(genpd))
+		return -EINVAL;
+
+	return genpd->suspend_power_off ? 0 : pm_generic_suspend(dev);
+}
+
+/**
+ * pm_genpd_suspend_noirq - Late suspend of a device from an I/O PM domain.
+ * @dev: Device to suspend.
+ *
+ * Carry out a late suspend of a device under the assumption that its
+ * pm_domain field points to the domain member of an object of type
+ * struct generic_pm_domain representing a PM domain consisting of I/O devices.
+ */
+static int pm_genpd_suspend_noirq(struct device *dev)
+{
+	struct generic_pm_domain *genpd;
+	int ret;
+
+	dev_dbg(dev, "%s()\n", __func__);
+
+	genpd = dev_to_genpd(dev);
+	if (IS_ERR(genpd))
+		return -EINVAL;
+
+	if (genpd->suspend_power_off)
+		return 0;
+
+	ret = pm_generic_suspend_noirq(dev);
+	if (ret)
+		return ret;
+
+	if (genpd->stop_device)
+		genpd->stop_device(dev);
+
+	/*
+	 * Since all of the "noirq" callbacks are executed sequentially, it is
+	 * guaranteed that this function will never run twice in parallel for
+	 * the same PM domain, so it is not necessary to use locking here.
+	 */
+	genpd->suspended_count++;
+	pm_genpd_sync_poweroff(genpd);
+
+	return 0;
+}
+
+/**
+ * pm_genpd_resume_noirq - Early resume of a device from an I/O power domain.
+ * @dev: Device to resume.
+ *
+ * Carry out an early resume of a device under the assumption that its
+ * pm_domain field points to the domain member of an object of type
+ * struct generic_pm_domain representing a power domain consisting of I/O
+ * devices.
+ */
+static int pm_genpd_resume_noirq(struct device *dev)
+{
+	struct generic_pm_domain *genpd;
+
+	dev_dbg(dev, "%s()\n", __func__);
+
+	genpd = dev_to_genpd(dev);
+	if (IS_ERR(genpd))
+		return -EINVAL;
+
+	if (genpd->suspend_power_off)
+		return 0;
+
+	/*
+	 * Since all of the "noirq" callbacks are executed sequentially, it is
+	 * guaranteed that this function will never run twice in parallel for
+	 * the same PM domain, so it is not necessary to use locking here.
+	 */
+	pm_genpd_poweron(genpd);
+	genpd->suspended_count--;
+	if (genpd->start_device)
+		genpd->start_device(dev);
+
+	return pm_generic_resume_noirq(dev);
+}
+
+/**
+ * pm_genpd_resume - Resume a device belonging to an I/O power domain.
+ * @dev: Device to resume.
+ *
+ * Resume a device under the assumption that its pm_domain field points to the
+ * domain member of an object of type struct generic_pm_domain representing
+ * a power domain consisting of I/O devices.
+ */
+static int pm_genpd_resume(struct device *dev)
+{
+	struct generic_pm_domain *genpd;
+
+	dev_dbg(dev, "%s()\n", __func__);
+
+	genpd = dev_to_genpd(dev);
+	if (IS_ERR(genpd))
+		return -EINVAL;
+
+	return genpd->suspend_power_off ? 0 : pm_generic_resume(dev);
+}
+
+/**
+ * pm_genpd_freeze - Freeze a device belonging to an I/O power domain.
+ * @dev: Device to freeze.
+ *
+ * Freeze a device under the assumption that its pm_domain field points to the
+ * domain member of an object of type struct generic_pm_domain representing
+ * a power domain consisting of I/O devices.
+ */
+static int pm_genpd_freeze(struct device *dev)
+{
+	struct generic_pm_domain *genpd;
+
+	dev_dbg(dev, "%s()\n", __func__);
+
+	genpd = dev_to_genpd(dev);
+	if (IS_ERR(genpd))
+		return -EINVAL;
+
+	return genpd->suspend_power_off ? 0 : pm_generic_freeze(dev);
+}
+
+/**
+ * pm_genpd_freeze_noirq - Late freeze of a device from an I/O power domain.
+ * @dev: Device to freeze.
+ *
+ * Carry out a late freeze of a device under the assumption that its
+ * pm_domain field points to the domain member of an object of type
+ * struct generic_pm_domain representing a power domain consisting of I/O
+ * devices.
+ */
+static int pm_genpd_freeze_noirq(struct device *dev)
+{
+	struct generic_pm_domain *genpd;
+	int ret;
+
+	dev_dbg(dev, "%s()\n", __func__);
+
+	genpd = dev_to_genpd(dev);
+	if (IS_ERR(genpd))
+		return -EINVAL;
+
+	if (genpd->suspend_power_off)
+		return 0;
+
+	ret = pm_generic_freeze_noirq(dev);
+	if (ret)
+		return ret;
+
+	if (genpd->stop_device)
+		genpd->stop_device(dev);
+
+	return 0;
+}
+
+/**
+ * pm_genpd_thaw_noirq - Early thaw of a device from an I/O power domain.
+ * @dev: Device to thaw.
+ *
+ * Carry out an early thaw of a device under the assumption that its
+ * pm_domain field points to the domain member of an object of type
+ * struct generic_pm_domain representing a power domain consisting of I/O
+ * devices.
+ */
+static int pm_genpd_thaw_noirq(struct device *dev)
+{
+	struct generic_pm_domain *genpd;
+
+	dev_dbg(dev, "%s()\n", __func__);
+
+	genpd = dev_to_genpd(dev);
+	if (IS_ERR(genpd))
+		return -EINVAL;
+
+	if (genpd->suspend_power_off)
+		return 0;
+
+	if (genpd->start_device)
+		genpd->start_device(dev);
+
+	return pm_generic_thaw_noirq(dev);
+}
+
+/**
+ * pm_genpd_thaw - Thaw a device belonging to an I/O power domain.
+ * @dev: Device to thaw.
+ *
+ * Thaw a device under the assumption that its pm_domain field points to the
+ * domain member of an object of type struct generic_pm_domain representing
+ * a power domain consisting of I/O devices.
+ */
+static int pm_genpd_thaw(struct device *dev)
+{
+	struct generic_pm_domain *genpd;
+
+	dev_dbg(dev, "%s()\n", __func__);
+
+	genpd = dev_to_genpd(dev);
+	if (IS_ERR(genpd))
+		return -EINVAL;
+
+	return genpd->suspend_power_off ? 0 : pm_generic_thaw(dev);
+}
+
+/**
+ * pm_genpd_dev_poweroff - Power off a device belonging to an I/O PM domain.
+ * @dev: Device to suspend.
+ *
+ * Power off a device under the assumption that its pm_domain field points to
+ * the domain member of an object of type struct generic_pm_domain representing
+ * a PM domain consisting of I/O devices.
+ */
+static int pm_genpd_dev_poweroff(struct device *dev)
+{
+	struct generic_pm_domain *genpd;
+
+	dev_dbg(dev, "%s()\n", __func__);
+
+	genpd = dev_to_genpd(dev);
+	if (IS_ERR(genpd))
+		return -EINVAL;
+
+	return genpd->suspend_power_off ? 0 : pm_generic_poweroff(dev);
+}
+
+/**
+ * pm_genpd_dev_poweroff_noirq - Late power off of a device from a PM domain.
+ * @dev: Device to suspend.
+ *
+ * Carry out a late powering off of a device under the assumption that its
+ * pm_domain field points to the domain member of an object of type
+ * struct generic_pm_domain representing a PM domain consisting of I/O devices.
+ */
+static int pm_genpd_dev_poweroff_noirq(struct device *dev)
+{
+	struct generic_pm_domain *genpd;
+	int ret;
+
+	dev_dbg(dev, "%s()\n", __func__);
+
+	genpd = dev_to_genpd(dev);
+	if (IS_ERR(genpd))
+		return -EINVAL;
+
+	if (genpd->suspend_power_off)
+		return 0;
+
+	ret = pm_generic_poweroff_noirq(dev);
+	if (ret)
+		return ret;
+
+	if (genpd->stop_device)
+		genpd->stop_device(dev);
+
+	/*
+	 * Since all of the "noirq" callbacks are executed sequentially, it is
+	 * guaranteed that this function will never run twice in parallel for
+	 * the same PM domain, so it is not necessary to use locking here.
+	 */
+	genpd->suspended_count++;
+	pm_genpd_sync_poweroff(genpd);
+
+	return 0;
+}
+
+/**
+ * pm_genpd_restore_noirq - Early restore of a device from an I/O power domain.
+ * @dev: Device to resume.
+ *
+ * Carry out an early restore of a device under the assumption that its
+ * pm_domain field points to the domain member of an object of type
+ * struct generic_pm_domain representing a power domain consisting of I/O
+ * devices.
+ */
+static int pm_genpd_restore_noirq(struct device *dev)
+{
+	struct generic_pm_domain *genpd;
+
+	dev_dbg(dev, "%s()\n", __func__);
+
+	genpd = dev_to_genpd(dev);
+	if (IS_ERR(genpd))
+		return -EINVAL;
+
+	/*
+	 * Since all of the "noirq" callbacks are executed sequentially, it is
+	 * guaranteed that this function will never run twice in parallel for
+	 * the same PM domain, so it is not necessary to use locking here.
+	 */
+	genpd->power_is_off = true;
+	if (genpd->suspend_power_off) {
+		/*
+		 * The boot kernel might put the domain into the power on state,
+		 * so make sure it really is powered off.
+		 */
+		if (genpd->power_off)
+			genpd->power_off(genpd);
+		return 0;
+	}
+
+	pm_genpd_poweron(genpd);
+	genpd->suspended_count--;
+	if (genpd->start_device)
+		genpd->start_device(dev);
+
+	return pm_generic_restore_noirq(dev);
+}
+
+/**
+ * pm_genpd_restore - Restore a device belonging to an I/O power domain.
+ * @dev: Device to resume.
+ *
+ * Restore a device under the assumption that its pm_domain field points to the
+ * domain member of an object of type struct generic_pm_domain representing
+ * a power domain consisting of I/O devices.
+ */
+static int pm_genpd_restore(struct device *dev)
+{
+	struct generic_pm_domain *genpd;
+
+	dev_dbg(dev, "%s()\n", __func__);
+
+	genpd = dev_to_genpd(dev);
+	if (IS_ERR(genpd))
+		return -EINVAL;
+
+	return genpd->suspend_power_off ? 0 : pm_generic_restore(dev);
+}
+
+/**
+ * pm_genpd_complete - Complete power transition of a device in a power domain.
+ * @dev: Device to complete the transition of.
+ *
+ * Complete a power transition of a device (during a system-wide power
+ * transition) under the assumption that its pm_domain field points to the
+ * domain member of an object of type struct generic_pm_domain representing
+ * a power domain consisting of I/O devices.
+ */
+static void pm_genpd_complete(struct device *dev)
+{
+	struct generic_pm_domain *genpd;
+	bool run_complete;
+
+	dev_dbg(dev, "%s()\n", __func__);
+
+	genpd = dev_to_genpd(dev);
+	if (IS_ERR(genpd))
+		return;
+
+	mutex_lock(&genpd->lock);
+
+	run_complete = !genpd->suspend_power_off;
+	if (--genpd->prepared_count == 0)
+		genpd->suspend_power_off = false;
+
+	mutex_unlock(&genpd->lock);
+
+	if (run_complete) {
+		pm_generic_complete(dev);
+		pm_runtime_enable(dev);
+	}
+}
+
+#else
+
+#define pm_genpd_prepare		NULL
+#define pm_genpd_suspend		NULL
+#define pm_genpd_suspend_noirq		NULL
+#define pm_genpd_resume_noirq		NULL
+#define pm_genpd_resume			NULL
+#define pm_genpd_freeze			NULL
+#define pm_genpd_freeze_noirq		NULL
+#define pm_genpd_thaw_noirq		NULL
+#define pm_genpd_thaw			NULL
+#define pm_genpd_dev_poweroff_noirq	NULL
+#define pm_genpd_dev_poweroff		NULL
+#define pm_genpd_restore_noirq		NULL
+#define pm_genpd_restore		NULL
+#define pm_genpd_complete		NULL
+
+#endif /* CONFIG_PM_SLEEP */
+
 /**
  * pm_genpd_add_device - Add a device to an I/O PM domain.
  * @genpd: PM domain to add the device to.
@@ -331,6 +823,11 @@ int pm_genpd_add_device(struct generic_pm_domain *genpd, struct device *dev)
 		goto out;
 	}
 
+	if (genpd->prepared_count > 0) {
+		ret = -EAGAIN;
+		goto out;
+	}
+
 	list_for_each_entry(dle, &genpd->dev_list, node)
 		if (dle->dev == dev) {
 			ret = -EINVAL;
@@ -346,6 +843,7 @@ int pm_genpd_add_device(struct generic_pm_domain *genpd, struct device *dev)
 	dle->dev = dev;
 	dle->need_restore = false;
 	list_add_tail(&dle->node, &genpd->dev_list);
+	genpd->device_count++;
 
 	spin_lock_irq(&dev->power.lock);
 	dev->pm_domain = &genpd->domain;
@@ -375,6 +873,11 @@ int pm_genpd_remove_device(struct generic_pm_domain *genpd,
 
 	mutex_lock(&genpd->lock);
 
+	if (genpd->prepared_count > 0) {
+		ret = -EAGAIN;
+		goto out;
+	}
+
 	list_for_each_entry(dle, &genpd->dev_list, node) {
 		if (dle->dev != dev)
 			continue;
@@ -383,6 +886,7 @@ int pm_genpd_remove_device(struct generic_pm_domain *genpd,
 		dev->pm_domain = NULL;
 		spin_unlock_irq(&dev->power.lock);
 
+		genpd->device_count--;
 		list_del(&dle->node);
 		kfree(dle);
 
@@ -390,6 +894,7 @@ int pm_genpd_remove_device(struct generic_pm_domain *genpd,
 		break;
 	}
 
+ out:
 	mutex_unlock(&genpd->lock);
 
 	return ret;
@@ -498,7 +1003,23 @@ void pm_genpd_init(struct generic_pm_domain *genpd,
 	genpd->in_progress = 0;
 	genpd->sd_count = 0;
 	genpd->power_is_off = is_off;
+	genpd->device_count = 0;
+	genpd->suspended_count = 0;
 	genpd->domain.ops.runtime_suspend = pm_genpd_runtime_suspend;
 	genpd->domain.ops.runtime_resume = pm_genpd_runtime_resume;
 	genpd->domain.ops.runtime_idle = pm_generic_runtime_idle;
+	genpd->domain.ops.prepare = pm_genpd_prepare;
+	genpd->domain.ops.suspend = pm_genpd_suspend;
+	genpd->domain.ops.suspend_noirq = pm_genpd_suspend_noirq;
+	genpd->domain.ops.resume_noirq = pm_genpd_resume_noirq;
+	genpd->domain.ops.resume = pm_genpd_resume;
+	genpd->domain.ops.freeze = pm_genpd_freeze;
+	genpd->domain.ops.freeze_noirq = pm_genpd_freeze_noirq;
+	genpd->domain.ops.thaw_noirq = pm_genpd_thaw_noirq;
+	genpd->domain.ops.thaw = pm_genpd_thaw;
+	genpd->domain.ops.poweroff = pm_genpd_dev_poweroff;
+	genpd->domain.ops.poweroff_noirq = pm_genpd_dev_poweroff_noirq;
+	genpd->domain.ops.restore_noirq = pm_genpd_restore_noirq;
+	genpd->domain.ops.restore = pm_genpd_restore;
+	genpd->domain.ops.complete = pm_genpd_complete;
 }
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index b1a22c65380b..7961b0dac437 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -11,6 +11,9 @@
 
 #include <linux/device.h>
 
+#define GPD_IN_SUSPEND	1
+#define GPD_POWER_OFF	2
+
 struct dev_power_governor {
 	bool (*power_down_ok)(struct dev_pm_domain *domain);
 };
@@ -27,12 +30,21 @@ struct generic_pm_domain {
 	unsigned int in_progress;	/* Number of devices being suspended now */
 	unsigned int sd_count;	/* Number of subdomains with power "on" */
 	bool power_is_off;	/* Whether or not power has been removed */
+	unsigned int device_count;	/* Number of devices */
+	unsigned int suspended_count;	/* System suspend device counter */
+	unsigned int prepared_count;	/* Suspend counter of prepared devices */
+	bool suspend_power_off;	/* Power status before system suspend */
 	int (*power_off)(struct generic_pm_domain *domain);
 	int (*power_on)(struct generic_pm_domain *domain);
 	int (*start_device)(struct device *dev);
 	int (*stop_device)(struct device *dev);
 };
 
+static inline struct generic_pm_domain *pd_to_genpd(struct dev_pm_domain *pd)
+{
+	return container_of(pd, struct generic_pm_domain, domain);
+}
+
 struct dev_list_entry {
 	struct list_head node;
 	struct device *dev;
-- 
cgit v1.2.3


From d4f2d87a8b46c14c4307c690c92bd08229f66ecf Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Fri, 1 Jul 2011 22:13:29 +0200
Subject: PM / Domains: Wakeup devices support for system sleep transitions

There is the problem how to handle devices set up to wake up the
system from sleep states during system-wide power transitions.
In some cases, those devices can be turned off entirely, because the
wakeup signals will be generated on their behalf anyway.  In some
other cases, they will generate wakeup signals if their clocks are
stopped, but only if power is not removed from them.  Finally, in
some cases, they can only generate wakeup signals if power is not
removed from them and their clocks are enabled.

To allow platform-specific code to decide whether or not to put
wakeup devices (and their PM domains) into low-power state during
system-wide transitions, such as system suspend, introduce a new
generic PM domain callback, .active_wakeup(), that will be used
during the "noirq" phase of system suspend and hibernation (after
image creation) to decide what to do with wakeup devices.
Specifically, if this callback is present and returns "true", the
generic PM domain code will not execute .stop_device() for the
given wakeup device and its PM domain won't be powered off.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Kevin Hilman <khilman@ti.com>
---
 drivers/base/power/domain.c | 8 ++++++++
 include/linux/pm_domain.h   | 1 +
 2 files changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index 33086e9afaf6..1aed94c73cfc 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -450,6 +450,10 @@ static int pm_genpd_suspend_noirq(struct device *dev)
 	if (ret)
 		return ret;
 
+	if (device_may_wakeup(dev)
+	    && genpd->active_wakeup && genpd->active_wakeup(dev))
+		return 0;
+
 	if (genpd->stop_device)
 		genpd->stop_device(dev);
 
@@ -670,6 +674,10 @@ static int pm_genpd_dev_poweroff_noirq(struct device *dev)
 	if (ret)
 		return ret;
 
+	if (device_may_wakeup(dev)
+	    && genpd->active_wakeup && genpd->active_wakeup(dev))
+		return 0;
+
 	if (genpd->stop_device)
 		genpd->stop_device(dev);
 
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index 7961b0dac437..98491ee35102 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -38,6 +38,7 @@ struct generic_pm_domain {
 	int (*power_on)(struct generic_pm_domain *domain);
 	int (*start_device)(struct device *dev);
 	int (*stop_device)(struct device *dev);
+	bool (*active_wakeup)(struct device *dev);
 };
 
 static inline struct generic_pm_domain *pd_to_genpd(struct dev_pm_domain *pd)
-- 
cgit v1.2.3


From b7b95920aa2e89e655afe9913ee0e55855ceda90 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Fri, 1 Jul 2011 22:13:37 +0200
Subject: PM: Allow the clocks management code to be used during system suspend

The common clocks management code in drivers/base/power/clock_ops.c
is going to be used during system-wide power transitions as well as
for runtime PM, so it shouldn't depend on CONFIG_PM_RUNTIME.
However, the suspend/resume functions provided by it for
CONFIG_PM_RUNTIME unset, to be used during system-wide power
transitions, should not behave in the same way as their counterparts
defined for CONFIG_PM_RUNTIME set, because in that case the clocks
are managed differently at run time.

The names of the functions still contain the word "runtime" after
this change, but that is going to be modified by a separate patch
later.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Reviewed-by: Kevin Hilman <khilman@ti.com>
---
 drivers/base/power/clock_ops.c | 60 +++++++++++++++++++++++++++++++++++++++++-
 include/linux/pm_runtime.h     |  2 +-
 kernel/power/Kconfig           |  4 +--
 3 files changed, 62 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/power/clock_ops.c b/drivers/base/power/clock_ops.c
index c5624818259e..2fb9c121c64b 100644
--- a/drivers/base/power/clock_ops.c
+++ b/drivers/base/power/clock_ops.c
@@ -15,7 +15,7 @@
 #include <linux/slab.h>
 #include <linux/err.h>
 
-#ifdef CONFIG_PM_RUNTIME
+#ifdef CONFIG_PM
 
 struct pm_runtime_clk_data {
 	struct list_head clock_list;
@@ -191,6 +191,10 @@ void pm_runtime_clk_destroy(struct device *dev)
 	kfree(prd);
 }
 
+#endif /* CONFIG_PM */
+
+#ifdef CONFIG_PM_RUNTIME
+
 /**
  * pm_runtime_clk_acquire - Acquire a device clock.
  * @dev: Device whose clock is to be acquired.
@@ -330,6 +334,60 @@ static int pm_runtime_clk_notify(struct notifier_block *nb,
 
 #else /* !CONFIG_PM_RUNTIME */
 
+#ifdef CONFIG_PM
+
+/**
+ * pm_runtime_clk_suspend - Disable clocks in a device's PM clock list.
+ * @dev: Device to disable the clocks for.
+ */
+int pm_runtime_clk_suspend(struct device *dev)
+{
+	struct pm_runtime_clk_data *prd = __to_prd(dev);
+	struct pm_clock_entry *ce;
+
+	dev_dbg(dev, "%s()\n", __func__);
+
+	/* If there is no driver, the clocks are already disabled. */
+	if (!prd || !dev->driver)
+		return 0;
+
+	mutex_lock(&prd->lock);
+
+	list_for_each_entry_reverse(ce, &prd->clock_list, node)
+		clk_disable(ce->clk);
+
+	mutex_unlock(&prd->lock);
+
+	return 0;
+}
+
+/**
+ * pm_runtime_clk_resume - Enable clocks in a device's PM clock list.
+ * @dev: Device to enable the clocks for.
+ */
+int pm_runtime_clk_resume(struct device *dev)
+{
+	struct pm_runtime_clk_data *prd = __to_prd(dev);
+	struct pm_clock_entry *ce;
+
+	dev_dbg(dev, "%s()\n", __func__);
+
+	/* If there is no driver, the clocks should remain disabled. */
+	if (!prd || !dev->driver)
+		return 0;
+
+	mutex_lock(&prd->lock);
+
+	list_for_each_entry(ce, &prd->clock_list, node)
+		clk_enable(ce->clk);
+
+	mutex_unlock(&prd->lock);
+
+	return 0;
+}
+
+#endif /* CONFIG_PM */
+
 /**
  * enable_clock - Enable a device clock.
  * @dev: Device whose clock is to be enabled.
diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
index ef91904c7110..1bd5063a2cc8 100644
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -251,7 +251,7 @@ struct pm_clk_notifier_block {
 	char *con_ids[];
 };
 
-#ifdef CONFIG_PM_RUNTIME_CLK
+#ifdef CONFIG_PM_CLK
 extern int pm_runtime_clk_init(struct device *dev);
 extern void pm_runtime_clk_destroy(struct device *dev);
 extern int pm_runtime_clk_add(struct device *dev, const char *con_id);
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index e83ac2556c86..7b856b3458d2 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -224,9 +224,9 @@ config PM_OPP
 	  implementations a ready to use framework to manage OPPs.
 	  For more information, read <file:Documentation/power/opp.txt>
 
-config PM_RUNTIME_CLK
+config PM_CLK
 	def_bool y
-	depends on PM_RUNTIME && HAVE_CLK
+	depends on PM && HAVE_CLK
 
 config PM_GENERIC_DOMAINS
 	bool
-- 
cgit v1.2.3


From 3d5c30367cbc0c55c93bb158e824e00badc7ddc4 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Fri, 1 Jul 2011 22:13:44 +0200
Subject: PM: Rename clock management functions

The common PM clock management functions may be used for system
suspend/resume as well as for runtime PM, so rename them
accordingly.  Modify kerneldoc comments describing these functions
and kernel messages printed by them, so that they refer to power
management in general rather that to runtime PM.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Reviewed-by: Kevin Hilman <khilman@ti.com>
---
 arch/arm/mach-omap1/pm_bus.c        |   6 +-
 arch/arm/mach-shmobile/pm_runtime.c |   6 +-
 drivers/base/power/clock_ops.c      | 188 ++++++++++++++++++------------------
 include/linux/pm_runtime.h          |  28 +++---
 4 files changed, 114 insertions(+), 114 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-omap1/pm_bus.c b/arch/arm/mach-omap1/pm_bus.c
index 212f33103712..943072d5a1d5 100644
--- a/arch/arm/mach-omap1/pm_bus.c
+++ b/arch/arm/mach-omap1/pm_bus.c
@@ -32,7 +32,7 @@ static int omap1_pm_runtime_suspend(struct device *dev)
 	if (ret)
 		return ret;
 
-	ret = pm_runtime_clk_suspend(dev);
+	ret = pm_clk_suspend(dev);
 	if (ret) {
 		pm_generic_runtime_resume(dev);
 		return ret;
@@ -45,7 +45,7 @@ static int omap1_pm_runtime_resume(struct device *dev)
 {
 	dev_dbg(dev, "%s\n", __func__);
 
-	pm_runtime_clk_resume(dev);
+	pm_clk_resume(dev);
 	return pm_generic_runtime_resume(dev);
 }
 
@@ -71,7 +71,7 @@ static int __init omap1_pm_runtime_init(void)
 	if (!cpu_class_is_omap1())
 		return -ENODEV;
 
-	pm_runtime_clk_add_notifier(&platform_bus_type, &platform_bus_notifier);
+	pm_clk_add_notifier(&platform_bus_type, &platform_bus_notifier);
 
 	return 0;
 }
diff --git a/arch/arm/mach-shmobile/pm_runtime.c b/arch/arm/mach-shmobile/pm_runtime.c
index 99802d28e5d3..2bcde1c46a6b 100644
--- a/arch/arm/mach-shmobile/pm_runtime.c
+++ b/arch/arm/mach-shmobile/pm_runtime.c
@@ -30,8 +30,8 @@ static int default_platform_runtime_idle(struct device *dev)
 
 static struct dev_pm_domain default_pm_domain = {
 	.ops = {
-		.runtime_suspend = pm_runtime_clk_suspend,
-		.runtime_resume = pm_runtime_clk_resume,
+		.runtime_suspend = pm_clk_suspend,
+		.runtime_resume = pm_clk_resume,
 		.runtime_idle = default_platform_runtime_idle,
 		USE_PLATFORM_PM_SLEEP_OPS
 	},
@@ -52,7 +52,7 @@ static struct pm_clk_notifier_block platform_bus_notifier = {
 
 static int __init sh_pm_runtime_init(void)
 {
-	pm_runtime_clk_add_notifier(&platform_bus_type, &platform_bus_notifier);
+	pm_clk_add_notifier(&platform_bus_type, &platform_bus_notifier);
 	return 0;
 }
 core_initcall(sh_pm_runtime_init);
diff --git a/drivers/base/power/clock_ops.c b/drivers/base/power/clock_ops.c
index 2fb9c121c64b..a846b2f95cfb 100644
--- a/drivers/base/power/clock_ops.c
+++ b/drivers/base/power/clock_ops.c
@@ -17,7 +17,7 @@
 
 #ifdef CONFIG_PM
 
-struct pm_runtime_clk_data {
+struct pm_clk_data {
 	struct list_head clock_list;
 	struct mutex lock;
 };
@@ -36,25 +36,25 @@ struct pm_clock_entry {
 	enum pce_status status;
 };
 
-static struct pm_runtime_clk_data *__to_prd(struct device *dev)
+static struct pm_clk_data *__to_pcd(struct device *dev)
 {
 	return dev ? dev->power.subsys_data : NULL;
 }
 
 /**
- * pm_runtime_clk_add - Start using a device clock for runtime PM.
- * @dev: Device whose clock is going to be used for runtime PM.
+ * pm_clk_add - Start using a device clock for power management.
+ * @dev: Device whose clock is going to be used for power management.
  * @con_id: Connection ID of the clock.
  *
  * Add the clock represented by @con_id to the list of clocks used for
- * the runtime PM of @dev.
+ * the power management of @dev.
  */
-int pm_runtime_clk_add(struct device *dev, const char *con_id)
+int pm_clk_add(struct device *dev, const char *con_id)
 {
-	struct pm_runtime_clk_data *prd = __to_prd(dev);
+	struct pm_clk_data *pcd = __to_pcd(dev);
 	struct pm_clock_entry *ce;
 
-	if (!prd)
+	if (!pcd)
 		return -EINVAL;
 
 	ce = kzalloc(sizeof(*ce), GFP_KERNEL);
@@ -73,20 +73,20 @@ int pm_runtime_clk_add(struct device *dev, const char *con_id)
 		}
 	}
 
-	mutex_lock(&prd->lock);
-	list_add_tail(&ce->node, &prd->clock_list);
-	mutex_unlock(&prd->lock);
+	mutex_lock(&pcd->lock);
+	list_add_tail(&ce->node, &pcd->clock_list);
+	mutex_unlock(&pcd->lock);
 	return 0;
 }
 
 /**
- * __pm_runtime_clk_remove - Destroy runtime PM clock entry.
- * @ce: Runtime PM clock entry to destroy.
+ * __pm_clk_remove - Destroy PM clock entry.
+ * @ce: PM clock entry to destroy.
  *
- * This routine must be called under the mutex protecting the runtime PM list
- * of clocks corresponding the the @ce's device.
+ * This routine must be called under the mutex protecting the PM list of clocks
+ * corresponding the the @ce's device.
  */
-static void __pm_runtime_clk_remove(struct pm_clock_entry *ce)
+static void __pm_clk_remove(struct pm_clock_entry *ce)
 {
 	if (!ce)
 		return;
@@ -108,87 +108,87 @@ static void __pm_runtime_clk_remove(struct pm_clock_entry *ce)
 }
 
 /**
- * pm_runtime_clk_remove - Stop using a device clock for runtime PM.
- * @dev: Device whose clock should not be used for runtime PM any more.
+ * pm_clk_remove - Stop using a device clock for power management.
+ * @dev: Device whose clock should not be used for PM any more.
  * @con_id: Connection ID of the clock.
  *
  * Remove the clock represented by @con_id from the list of clocks used for
- * the runtime PM of @dev.
+ * the power management of @dev.
  */
-void pm_runtime_clk_remove(struct device *dev, const char *con_id)
+void pm_clk_remove(struct device *dev, const char *con_id)
 {
-	struct pm_runtime_clk_data *prd = __to_prd(dev);
+	struct pm_clk_data *pcd = __to_pcd(dev);
 	struct pm_clock_entry *ce;
 
-	if (!prd)
+	if (!pcd)
 		return;
 
-	mutex_lock(&prd->lock);
+	mutex_lock(&pcd->lock);
 
-	list_for_each_entry(ce, &prd->clock_list, node) {
+	list_for_each_entry(ce, &pcd->clock_list, node) {
 		if (!con_id && !ce->con_id) {
-			__pm_runtime_clk_remove(ce);
+			__pm_clk_remove(ce);
 			break;
 		} else if (!con_id || !ce->con_id) {
 			continue;
 		} else if (!strcmp(con_id, ce->con_id)) {
-			__pm_runtime_clk_remove(ce);
+			__pm_clk_remove(ce);
 			break;
 		}
 	}
 
-	mutex_unlock(&prd->lock);
+	mutex_unlock(&pcd->lock);
 }
 
 /**
- * pm_runtime_clk_init - Initialize a device's list of runtime PM clocks.
- * @dev: Device to initialize the list of runtime PM clocks for.
+ * pm_clk_init - Initialize a device's list of power management clocks.
+ * @dev: Device to initialize the list of PM clocks for.
  *
- * Allocate a struct pm_runtime_clk_data object, initialize its lock member and
+ * Allocate a struct pm_clk_data object, initialize its lock member and
  * make the @dev's power.subsys_data field point to it.
  */
-int pm_runtime_clk_init(struct device *dev)
+int pm_clk_init(struct device *dev)
 {
-	struct pm_runtime_clk_data *prd;
+	struct pm_clk_data *pcd;
 
-	prd = kzalloc(sizeof(*prd), GFP_KERNEL);
-	if (!prd) {
-		dev_err(dev, "Not enough memory fo runtime PM data.\n");
+	pcd = kzalloc(sizeof(*pcd), GFP_KERNEL);
+	if (!pcd) {
+		dev_err(dev, "Not enough memory for PM clock data.\n");
 		return -ENOMEM;
 	}
 
-	INIT_LIST_HEAD(&prd->clock_list);
-	mutex_init(&prd->lock);
-	dev->power.subsys_data = prd;
+	INIT_LIST_HEAD(&pcd->clock_list);
+	mutex_init(&pcd->lock);
+	dev->power.subsys_data = pcd;
 	return 0;
 }
 
 /**
- * pm_runtime_clk_destroy - Destroy a device's list of runtime PM clocks.
- * @dev: Device to destroy the list of runtime PM clocks for.
+ * pm_clk_destroy - Destroy a device's list of power management clocks.
+ * @dev: Device to destroy the list of PM clocks for.
  *
  * Clear the @dev's power.subsys_data field, remove the list of clock entries
- * from the struct pm_runtime_clk_data object pointed to by it before and free
+ * from the struct pm_clk_data object pointed to by it before and free
  * that object.
  */
-void pm_runtime_clk_destroy(struct device *dev)
+void pm_clk_destroy(struct device *dev)
 {
-	struct pm_runtime_clk_data *prd = __to_prd(dev);
+	struct pm_clk_data *pcd = __to_pcd(dev);
 	struct pm_clock_entry *ce, *c;
 
-	if (!prd)
+	if (!pcd)
 		return;
 
 	dev->power.subsys_data = NULL;
 
-	mutex_lock(&prd->lock);
+	mutex_lock(&pcd->lock);
 
-	list_for_each_entry_safe_reverse(ce, c, &prd->clock_list, node)
-		__pm_runtime_clk_remove(ce);
+	list_for_each_entry_safe_reverse(ce, c, &pcd->clock_list, node)
+		__pm_clk_remove(ce);
 
-	mutex_unlock(&prd->lock);
+	mutex_unlock(&pcd->lock);
 
-	kfree(prd);
+	kfree(pcd);
 }
 
 #endif /* CONFIG_PM */
@@ -196,11 +196,11 @@ void pm_runtime_clk_destroy(struct device *dev)
 #ifdef CONFIG_PM_RUNTIME
 
 /**
- * pm_runtime_clk_acquire - Acquire a device clock.
+ * pm_clk_acquire - Acquire a device clock.
  * @dev: Device whose clock is to be acquired.
  * @con_id: Connection ID of the clock.
  */
-static void pm_runtime_clk_acquire(struct device *dev,
+static void pm_clk_acquire(struct device *dev,
 				    struct pm_clock_entry *ce)
 {
 	ce->clk = clk_get(dev, ce->con_id);
@@ -213,24 +213,24 @@ static void pm_runtime_clk_acquire(struct device *dev,
 }
 
 /**
- * pm_runtime_clk_suspend - Disable clocks in a device's runtime PM clock list.
+ * pm_clk_suspend - Disable clocks in a device's PM clock list.
  * @dev: Device to disable the clocks for.
  */
-int pm_runtime_clk_suspend(struct device *dev)
+int pm_clk_suspend(struct device *dev)
 {
-	struct pm_runtime_clk_data *prd = __to_prd(dev);
+	struct pm_clk_data *pcd = __to_pcd(dev);
 	struct pm_clock_entry *ce;
 
 	dev_dbg(dev, "%s()\n", __func__);
 
-	if (!prd)
+	if (!pcd)
 		return 0;
 
-	mutex_lock(&prd->lock);
+	mutex_lock(&pcd->lock);
 
-	list_for_each_entry_reverse(ce, &prd->clock_list, node) {
+	list_for_each_entry_reverse(ce, &pcd->clock_list, node) {
 		if (ce->status == PCE_STATUS_NONE)
-			pm_runtime_clk_acquire(dev, ce);
+			pm_clk_acquire(dev, ce);
 
 		if (ce->status < PCE_STATUS_ERROR) {
 			clk_disable(ce->clk);
@@ -238,30 +238,30 @@ int pm_runtime_clk_suspend(struct device *dev)
 		}
 	}
 
-	mutex_unlock(&prd->lock);
+	mutex_unlock(&pcd->lock);
 
 	return 0;
 }
 
 /**
- * pm_runtime_clk_resume - Enable clocks in a device's runtime PM clock list.
+ * pm_clk_resume - Enable clocks in a device's PM clock list.
  * @dev: Device to enable the clocks for.
  */
-int pm_runtime_clk_resume(struct device *dev)
+int pm_clk_resume(struct device *dev)
 {
-	struct pm_runtime_clk_data *prd = __to_prd(dev);
+	struct pm_clk_data *pcd = __to_pcd(dev);
 	struct pm_clock_entry *ce;
 
 	dev_dbg(dev, "%s()\n", __func__);
 
-	if (!prd)
+	if (!pcd)
 		return 0;
 
-	mutex_lock(&prd->lock);
+	mutex_lock(&pcd->lock);
 
-	list_for_each_entry(ce, &prd->clock_list, node) {
+	list_for_each_entry(ce, &pcd->clock_list, node) {
 		if (ce->status == PCE_STATUS_NONE)
-			pm_runtime_clk_acquire(dev, ce);
+			pm_clk_acquire(dev, ce);
 
 		if (ce->status < PCE_STATUS_ERROR) {
 			clk_enable(ce->clk);
@@ -269,13 +269,13 @@ int pm_runtime_clk_resume(struct device *dev)
 		}
 	}
 
-	mutex_unlock(&prd->lock);
+	mutex_unlock(&pcd->lock);
 
 	return 0;
 }
 
 /**
- * pm_runtime_clk_notify - Notify routine for device addition and removal.
+ * pm_clk_notify - Notify routine for device addition and removal.
  * @nb: Notifier block object this function is a member of.
  * @action: Operation being carried out by the caller.
  * @data: Device the routine is being run for.
@@ -284,13 +284,13 @@ int pm_runtime_clk_resume(struct device *dev)
  * struct pm_clk_notifier_block containing all of the requisite data.
  * Specifically, the pm_domain member of that object is copied to the device's
  * pm_domain field and its con_ids member is used to populate the device's list
- * of runtime PM clocks, depending on @action.
+ * of PM clocks, depending on @action.
  *
  * If the device's pm_domain field is already populated with a value different
  * from the one stored in the struct pm_clk_notifier_block object, the function
  * does nothing.
  */
-static int pm_runtime_clk_notify(struct notifier_block *nb,
+static int pm_clk_notify(struct notifier_block *nb,
 				 unsigned long action, void *data)
 {
 	struct pm_clk_notifier_block *clknb;
@@ -307,16 +307,16 @@ static int pm_runtime_clk_notify(struct notifier_block *nb,
 		if (dev->pm_domain)
 			break;
 
-		error = pm_runtime_clk_init(dev);
+		error = pm_clk_init(dev);
 		if (error)
 			break;
 
 		dev->pm_domain = clknb->pm_domain;
 		if (clknb->con_ids[0]) {
 			for (con_id = clknb->con_ids; *con_id; con_id++)
-				pm_runtime_clk_add(dev, *con_id);
+				pm_clk_add(dev, *con_id);
 		} else {
-			pm_runtime_clk_add(dev, NULL);
+			pm_clk_add(dev, NULL);
 		}
 
 		break;
@@ -325,7 +325,7 @@ static int pm_runtime_clk_notify(struct notifier_block *nb,
 			break;
 
 		dev->pm_domain = NULL;
-		pm_runtime_clk_destroy(dev);
+		pm_clk_destroy(dev);
 		break;
 	}
 
@@ -337,51 +337,51 @@ static int pm_runtime_clk_notify(struct notifier_block *nb,
 #ifdef CONFIG_PM
 
 /**
- * pm_runtime_clk_suspend - Disable clocks in a device's PM clock list.
+ * pm_clk_suspend - Disable clocks in a device's PM clock list.
  * @dev: Device to disable the clocks for.
  */
-int pm_runtime_clk_suspend(struct device *dev)
+int pm_clk_suspend(struct device *dev)
 {
-	struct pm_runtime_clk_data *prd = __to_prd(dev);
+	struct pm_clk_data *pcd = __to_pcd(dev);
 	struct pm_clock_entry *ce;
 
 	dev_dbg(dev, "%s()\n", __func__);
 
 	/* If there is no driver, the clocks are already disabled. */
-	if (!prd || !dev->driver)
+	if (!pcd || !dev->driver)
 		return 0;
 
-	mutex_lock(&prd->lock);
+	mutex_lock(&pcd->lock);
 
-	list_for_each_entry_reverse(ce, &prd->clock_list, node)
+	list_for_each_entry_reverse(ce, &pcd->clock_list, node)
 		clk_disable(ce->clk);
 
-	mutex_unlock(&prd->lock);
+	mutex_unlock(&pcd->lock);
 
 	return 0;
 }
 
 /**
- * pm_runtime_clk_resume - Enable clocks in a device's PM clock list.
+ * pm_clk_resume - Enable clocks in a device's PM clock list.
  * @dev: Device to enable the clocks for.
  */
-int pm_runtime_clk_resume(struct device *dev)
+int pm_clk_resume(struct device *dev)
 {
-	struct pm_runtime_clk_data *prd = __to_prd(dev);
+	struct pm_clk_data *pcd = __to_pcd(dev);
 	struct pm_clock_entry *ce;
 
 	dev_dbg(dev, "%s()\n", __func__);
 
 	/* If there is no driver, the clocks should remain disabled. */
-	if (!prd || !dev->driver)
+	if (!pcd || !dev->driver)
 		return 0;
 
-	mutex_lock(&prd->lock);
+	mutex_lock(&pcd->lock);
 
-	list_for_each_entry(ce, &prd->clock_list, node)
+	list_for_each_entry(ce, &pcd->clock_list, node)
 		clk_enable(ce->clk);
 
-	mutex_unlock(&prd->lock);
+	mutex_unlock(&pcd->lock);
 
 	return 0;
 }
@@ -423,7 +423,7 @@ static void disable_clock(struct device *dev, const char *con_id)
 }
 
 /**
- * pm_runtime_clk_notify - Notify routine for device addition and removal.
+ * pm_clk_notify - Notify routine for device addition and removal.
  * @nb: Notifier block object this function is a member of.
  * @action: Operation being carried out by the caller.
  * @data: Device the routine is being run for.
@@ -433,7 +433,7 @@ static void disable_clock(struct device *dev, const char *con_id)
  * Specifically, the con_ids member of that object is used to enable or disable
  * the device's clocks, depending on @action.
  */
-static int pm_runtime_clk_notify(struct notifier_block *nb,
+static int pm_clk_notify(struct notifier_block *nb,
 				 unsigned long action, void *data)
 {
 	struct pm_clk_notifier_block *clknb;
@@ -469,21 +469,21 @@ static int pm_runtime_clk_notify(struct notifier_block *nb,
 #endif /* !CONFIG_PM_RUNTIME */
 
 /**
- * pm_runtime_clk_add_notifier - Add bus type notifier for runtime PM clocks.
+ * pm_clk_add_notifier - Add bus type notifier for power management clocks.
  * @bus: Bus type to add the notifier to.
  * @clknb: Notifier to be added to the given bus type.
  *
  * The nb member of @clknb is not expected to be initialized and its
- * notifier_call member will be replaced with pm_runtime_clk_notify().  However,
+ * notifier_call member will be replaced with pm_clk_notify().  However,
  * the remaining members of @clknb should be populated prior to calling this
  * routine.
  */
-void pm_runtime_clk_add_notifier(struct bus_type *bus,
+void pm_clk_add_notifier(struct bus_type *bus,
 				 struct pm_clk_notifier_block *clknb)
 {
 	if (!bus || !clknb)
 		return;
 
-	clknb->nb.notifier_call = pm_runtime_clk_notify;
+	clknb->nb.notifier_call = pm_clk_notify;
 	bus_register_notifier(bus, &clknb->nb);
 }
diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
index 1bd5063a2cc8..dfb8539ed686 100644
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -252,36 +252,36 @@ struct pm_clk_notifier_block {
 };
 
 #ifdef CONFIG_PM_CLK
-extern int pm_runtime_clk_init(struct device *dev);
-extern void pm_runtime_clk_destroy(struct device *dev);
-extern int pm_runtime_clk_add(struct device *dev, const char *con_id);
-extern void pm_runtime_clk_remove(struct device *dev, const char *con_id);
-extern int pm_runtime_clk_suspend(struct device *dev);
-extern int pm_runtime_clk_resume(struct device *dev);
+extern int pm_clk_init(struct device *dev);
+extern void pm_clk_destroy(struct device *dev);
+extern int pm_clk_add(struct device *dev, const char *con_id);
+extern void pm_clk_remove(struct device *dev, const char *con_id);
+extern int pm_clk_suspend(struct device *dev);
+extern int pm_clk_resume(struct device *dev);
 #else
-static inline int pm_runtime_clk_init(struct device *dev)
+static inline int pm_clk_init(struct device *dev)
 {
 	return -EINVAL;
 }
-static inline void pm_runtime_clk_destroy(struct device *dev)
+static inline void pm_clk_destroy(struct device *dev)
 {
 }
-static inline int pm_runtime_clk_add(struct device *dev, const char *con_id)
+static inline int pm_clk_add(struct device *dev, const char *con_id)
 {
 	return -EINVAL;
 }
-static inline void pm_runtime_clk_remove(struct device *dev, const char *con_id)
+static inline void pm_clk_remove(struct device *dev, const char *con_id)
 {
 }
-#define pm_runtime_clock_suspend	NULL
-#define pm_runtime_clock_resume		NULL
+#define pm_clk_suspend	NULL
+#define pm_clk_resume	NULL
 #endif
 
 #ifdef CONFIG_HAVE_CLK
-extern void pm_runtime_clk_add_notifier(struct bus_type *bus,
+extern void pm_clk_add_notifier(struct bus_type *bus,
 					struct pm_clk_notifier_block *clknb);
 #else
-static inline void pm_runtime_clk_add_notifier(struct bus_type *bus,
+static inline void pm_clk_add_notifier(struct bus_type *bus,
 					struct pm_clk_notifier_block *clknb)
 {
 }
-- 
cgit v1.2.3


From 234b921dbcf144826e2e2b3663cd8090892ee2b2 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Thu, 30 Jun 2011 15:08:57 +0000
Subject: netpoll: Remove unused EXPORT_SYMBOLs of netpoll_poll and
 netpoll_poll_dev

Unused symbols waste space.

Commit 0e34e93177fb
"(netpoll: add generic support for bridge and bonding devices)"
added the symbol more than a year ago with the promise of "future use".

Because it is so far unused, remove it for now.
It can be easily readded if or when it actually needs to be used.

cc: WANG Cong <amwang@redhat.com>
Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netpoll.h | 2 --
 net/core/netpoll.c      | 6 ++----
 2 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h
index 79358bb712c6..5dfa091c3347 100644
--- a/include/linux/netpoll.h
+++ b/include/linux/netpoll.h
@@ -40,8 +40,6 @@ struct netpoll_info {
 	struct netpoll *netpoll;
 };
 
-void netpoll_poll_dev(struct net_device *dev);
-void netpoll_poll(struct netpoll *np);
 void netpoll_send_udp(struct netpoll *np, const char *msg, int len);
 void netpoll_print_options(struct netpoll *np);
 int netpoll_parse_options(struct netpoll *np, char *opt);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 18d9cbda3a39..4ce595e45f91 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -177,7 +177,7 @@ static void service_arp_queue(struct netpoll_info *npi)
 	}
 }
 
-void netpoll_poll_dev(struct net_device *dev)
+static void netpoll_poll_dev(struct net_device *dev)
 {
 	const struct net_device_ops *ops;
 
@@ -208,13 +208,11 @@ void netpoll_poll_dev(struct net_device *dev)
 
 	zap_completion_queue();
 }
-EXPORT_SYMBOL(netpoll_poll_dev);
 
-void netpoll_poll(struct netpoll *np)
+static void netpoll_poll(struct netpoll *np)
 {
 	netpoll_poll_dev(np->dev);
 }
-EXPORT_SYMBOL(netpoll_poll);
 
 static void refill_skbs(void)
 {
-- 
cgit v1.2.3


From f09bc831b7693f93ecb95dea7180d55b45b88e76 Mon Sep 17 00:00:00 2001
From: Shawn Guo <shawn.guo@linaro.org>
Date: Mon, 4 Jul 2011 09:01:18 +0800
Subject: dt: add 'const' for of_property_read_string parameter **out_string

The existing dt codes usually call of_get_property to get a string
property and save it as a 'const char *'.  The patch adds'const' for
of_property_read_string parameter **out_string to make the converting
of existing code a little easier.

Signed-off-by: Shawn Guo <shawn.guo@linaro.org>
Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
---
 drivers/of/base.c  | 2 +-
 include/linux/of.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/of/base.c b/drivers/of/base.c
index b8b65fddbeb5..57ec27bed44f 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -639,7 +639,7 @@ EXPORT_SYMBOL_GPL(of_property_read_u32);
  * The out_string pointer is modified only if a valid string can be decoded.
  */
 int of_property_read_string(struct device_node *np, char *propname,
-				char **out_string)
+				const char **out_string)
 {
 	struct property *prop = of_find_property(np, propname, NULL);
 	if (!prop)
diff --git a/include/linux/of.h b/include/linux/of.h
index 4fc4c1b8d5d5..b23852002b30 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -198,7 +198,7 @@ extern struct property *of_find_property(const struct device_node *np,
 extern int of_property_read_u32(struct device_node *np, char *propname,
 					u32 *out_value);
 extern int of_property_read_string(struct device_node *np, char *propname,
-					char **out_string);
+					const char **out_string);
 extern int of_device_is_compatible(const struct device_node *device,
 				   const char *);
 extern int of_device_is_available(const struct device_node *device);
-- 
cgit v1.2.3


From e4c2fb0d5776b58049d2556b456144a4db3fe5a9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 5 Jul 2011 10:56:32 +0200
Subject: sched: Disable (revert) SCHED_LOAD_SCALE increase

Alex reported that commit c8b281161df ("sched: Increase
SCHED_LOAD_SCALE resolution") caused a power usage regression
under light load as it increases the number of load-balance
operations and keeps idle cpus from staying idle.

Time has run out to find the root cause for this release so
disable the feature for v3.0 until we can figure out what
causes the problem.

Reported-by: "Alex, Shi" <alex.shi@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Nikhil Rao <ncrao@google.com>
Cc: Ming Lei <tom.leiming@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/n/tip-m4onxn0sxnyn5iz9o88eskc3@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index a837b20ba190..496770a96487 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -808,7 +808,7 @@ enum cpu_idle_type {
  * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the
  * increased costs.
  */
-#if BITS_PER_LONG > 32
+#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load  */
 # define SCHED_LOAD_RESOLUTION	10
 # define scale_load(w)		((w) << SCHED_LOAD_RESOLUTION)
 # define scale_load_down(w)	((w) >> SCHED_LOAD_RESOLUTION)
-- 
cgit v1.2.3


From 304e21bbeab0d208dc7e6142fb75db8a466d5217 Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Date: Fri, 1 Jul 2011 22:35:28 +0400
Subject: ssb: PCI revision ID register is 8-bit wide

The SSB code reads PCI revision ID register as 16-bit entity while the register
is actually 8-bit only (the next 8 bits are the programming interface register).
Fix the read and make the 'rev' field of 'struct ssb_boardinfo' 8-bit as well,
to match the register size.

Signed-off-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/ssb/pci.c       | 2 +-
 include/linux/ssb/ssb.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ssb/pci.c b/drivers/ssb/pci.c
index 7ad48585c5e6..52b1ceb748c7 100644
--- a/drivers/ssb/pci.c
+++ b/drivers/ssb/pci.c
@@ -738,7 +738,7 @@ static void ssb_pci_get_boardinfo(struct ssb_bus *bus,
 			     &bi->vendor);
 	pci_read_config_word(bus->host_pci, PCI_SUBSYSTEM_ID,
 			     &bi->type);
-	pci_read_config_word(bus->host_pci, PCI_REVISION_ID,
+	pci_read_config_byte(bus->host_pci, PCI_REVISION_ID,
 			     &bi->rev);
 }
 
diff --git a/include/linux/ssb/ssb.h b/include/linux/ssb/ssb.h
index 252e44821787..b0928c10111b 100644
--- a/include/linux/ssb/ssb.h
+++ b/include/linux/ssb/ssb.h
@@ -99,7 +99,7 @@ struct ssb_sprom {
 struct ssb_boardinfo {
 	u16 vendor;
 	u16 type;
-	u16 rev;
+	u8  rev;
 };
 
 
-- 
cgit v1.2.3


From 4d12b8b129f170d0fc3188de1e51a2a1b0f87730 Mon Sep 17 00:00:00 2001
From: Lauro Ramos Venancio <lauro.venancio@openbossa.org>
Date: Fri, 1 Jul 2011 19:31:34 -0300
Subject: NFC: add nfc generic netlink interface

The NFC generic netlink interface exports the NFC control operations
to the user space.

Signed-off-by: Lauro Ramos Venancio <lauro.venancio@openbossa.org>
Signed-off-by: Aloisio Almeida Jr <aloisio.almeida@openbossa.org>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
Reviewed-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 include/linux/nfc.h | 112 +++++++++++
 include/net/nfc.h   |  21 ++
 net/nfc/Makefile    |   2 +-
 net/nfc/core.c      |  93 ++++++++-
 net/nfc/netlink.c   | 537 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 net/nfc/nfc.h       |  11 ++
 6 files changed, 773 insertions(+), 3 deletions(-)
 create mode 100644 include/linux/nfc.h
 create mode 100644 net/nfc/netlink.c

(limited to 'include/linux')

diff --git a/include/linux/nfc.h b/include/linux/nfc.h
new file mode 100644
index 000000000000..1170476d270a
--- /dev/null
+++ b/include/linux/nfc.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (C) 2011 Instituto Nokia de Tecnologia
+ *
+ * Authors:
+ *    Lauro Ramos Venancio <lauro.venancio@openbossa.org>
+ *    Aloisio Almeida Jr <aloisio.almeida@openbossa.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the
+ * Free Software Foundation, Inc.,
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+
+#ifndef __LINUX_NFC_H
+#define __LINUX_NFC_H
+
+#define NFC_GENL_NAME "nfc"
+#define NFC_GENL_VERSION 1
+
+#define NFC_GENL_MCAST_EVENT_NAME "events"
+
+/**
+ * enum nfc_commands - supported nfc commands
+ *
+ * @NFC_CMD_UNSPEC: unspecified command
+ *
+ * @NFC_CMD_GET_DEVICE: request information about a device (requires
+ *	%NFC_ATTR_DEVICE_INDEX) or dump request to get a list of all nfc devices
+ * @NFC_CMD_START_POLL: start polling for targets using the given protocols
+ *	(requires %NFC_ATTR_DEVICE_INDEX and %NFC_ATTR_PROTOCOLS)
+ * @NFC_CMD_STOP_POLL: stop polling for targets (requires
+ *	%NFC_ATTR_DEVICE_INDEX)
+ * @NFC_CMD_GET_TARGET: dump all targets found by the previous poll (requires
+ *	%NFC_ATTR_DEVICE_INDEX)
+ * @NFC_EVENT_TARGETS_FOUND: event emitted when a new target is found
+ *	(it sends %NFC_ATTR_DEVICE_INDEX)
+ * @NFC_EVENT_DEVICE_ADDED: event emitted when a new device is registred
+ *	(it sends %NFC_ATTR_DEVICE_NAME, %NFC_ATTR_DEVICE_INDEX and
+ *	%NFC_ATTR_PROTOCOLS)
+ * @NFC_EVENT_DEVICE_REMOVED: event emitted when a device is removed
+ *	(it sends %NFC_ATTR_DEVICE_INDEX)
+ */
+enum nfc_commands {
+	NFC_CMD_UNSPEC,
+	NFC_CMD_GET_DEVICE,
+	NFC_CMD_START_POLL,
+	NFC_CMD_STOP_POLL,
+	NFC_CMD_GET_TARGET,
+	NFC_EVENT_TARGETS_FOUND,
+	NFC_EVENT_DEVICE_ADDED,
+	NFC_EVENT_DEVICE_REMOVED,
+/* private: internal use only */
+	__NFC_CMD_AFTER_LAST
+};
+#define NFC_CMD_MAX (__NFC_CMD_AFTER_LAST - 1)
+
+/**
+ * enum nfc_attrs - supported nfc attributes
+ *
+ * @NFC_ATTR_UNSPEC: unspecified attribute
+ *
+ * @NFC_ATTR_DEVICE_INDEX: index of nfc device
+ * @NFC_ATTR_DEVICE_NAME: device name, max 8 chars
+ * @NFC_ATTR_PROTOCOLS: nfc protocols - bitwise or-ed combination from
+ *	NFC_PROTO_*_MASK constants
+ * @NFC_ATTR_TARGET_INDEX: index of the nfc target
+ * @NFC_ATTR_TARGET_SENS_RES: NFC-A targets extra information such as NFCID
+ * @NFC_ATTR_TARGET_SEL_RES: NFC-A targets extra information (useful if the
+ *	target is not NFC-Forum compliant)
+ */
+enum nfc_attrs {
+	NFC_ATTR_UNSPEC,
+	NFC_ATTR_DEVICE_INDEX,
+	NFC_ATTR_DEVICE_NAME,
+	NFC_ATTR_PROTOCOLS,
+	NFC_ATTR_TARGET_INDEX,
+	NFC_ATTR_TARGET_SENS_RES,
+	NFC_ATTR_TARGET_SEL_RES,
+/* private: internal use only */
+	__NFC_ATTR_AFTER_LAST
+};
+#define NFC_ATTR_MAX (__NFC_ATTR_AFTER_LAST - 1)
+
+#define NFC_DEVICE_NAME_MAXSIZE 8
+
+/* NFC protocols */
+#define NFC_PROTO_JEWEL		1
+#define NFC_PROTO_MIFARE	2
+#define NFC_PROTO_FELICA	3
+#define NFC_PROTO_ISO14443	4
+#define NFC_PROTO_NFC_DEP	5
+
+#define NFC_PROTO_MAX		6
+
+/* NFC protocols masks used in bitsets */
+#define NFC_PROTO_JEWEL_MASK	(1 << NFC_PROTO_JEWEL)
+#define NFC_PROTO_MIFARE_MASK	(1 << NFC_PROTO_MIFARE)
+#define NFC_PROTO_FELICA_MASK	(1 << NFC_PROTO_FELICA)
+#define NFC_PROTO_ISO14443_MASK	(1 << NFC_PROTO_ISO14443)
+#define NFC_PROTO_NFC_DEP_MASK	(1 << NFC_PROTO_NFC_DEP)
+
+#endif /*__LINUX_NFC_H */
diff --git a/include/net/nfc.h b/include/net/nfc.h
index c57b579c8301..cc0130312f70 100644
--- a/include/net/nfc.h
+++ b/include/net/nfc.h
@@ -58,10 +58,28 @@ struct nfc_ops {
 							void *cb_context);
 };
 
+struct nfc_target {
+	u32 idx;
+	u32 supported_protocols;
+	u16 sens_res;
+	u8 sel_res;
+};
+
+struct nfc_genl_data {
+	u32 poll_req_pid;
+	struct mutex genl_data_mutex;
+};
+
 struct nfc_dev {
 	unsigned idx;
+	unsigned target_idx;
+	struct nfc_target *targets;
+	int n_targets;
+	int targets_generation;
+	spinlock_t targets_lock;
 	struct device dev;
 	bool polling;
+	struct nfc_genl_data genl_data;
 	u32 supported_protocols;
 
 	struct nfc_ops *ops;
@@ -132,4 +150,7 @@ static inline const char *nfc_device_name(struct nfc_dev *dev)
 
 struct sk_buff *nfc_alloc_skb(unsigned int size, gfp_t gfp);
 
+int nfc_targets_found(struct nfc_dev *dev, struct nfc_target *targets,
+							int ntargets);
+
 #endif /* __NET_NFC_H */
diff --git a/net/nfc/Makefile b/net/nfc/Makefile
index 28bee5958687..fa6fc16246d6 100644
--- a/net/nfc/Makefile
+++ b/net/nfc/Makefile
@@ -4,4 +4,4 @@
 
 obj-$(CONFIG_NFC) += nfc.o
 
-nfc-objs := core.o
+nfc-objs := core.o netlink.o
diff --git a/net/nfc/core.c b/net/nfc/core.c
index 19f8035a1ba9..c70f607455c5 100644
--- a/net/nfc/core.c
+++ b/net/nfc/core.c
@@ -233,12 +233,60 @@ struct sk_buff *nfc_alloc_skb(unsigned int size, gfp_t gfp)
 }
 EXPORT_SYMBOL(nfc_alloc_skb);
 
+/**
+ * nfc_targets_found - inform that targets were found
+ *
+ * @dev: The nfc device that found the targets
+ * @targets: array of nfc targets found
+ * @ntargets: targets array size
+ *
+ * The device driver must call this function when one or many nfc targets
+ * are found. After calling this function, the device driver must stop
+ * polling for targets.
+ */
+int nfc_targets_found(struct nfc_dev *dev, struct nfc_target *targets,
+							int n_targets)
+{
+	int i;
+
+	nfc_dbg("dev_name=%s n_targets=%d", dev_name(&dev->dev), n_targets);
+
+	dev->polling = false;
+
+	for (i = 0; i < n_targets; i++)
+		targets[i].idx = dev->target_idx++;
+
+	spin_lock_bh(&dev->targets_lock);
+
+	dev->targets_generation++;
+
+	kfree(dev->targets);
+	dev->targets = kmemdup(targets, n_targets * sizeof(struct nfc_target),
+								GFP_ATOMIC);
+
+	if (!dev->targets) {
+		dev->n_targets = 0;
+		spin_unlock_bh(&dev->targets_lock);
+		return -ENOMEM;
+	}
+
+	dev->n_targets = n_targets;
+	spin_unlock_bh(&dev->targets_lock);
+
+	nfc_genl_targets_found(dev);
+
+	return 0;
+}
+EXPORT_SYMBOL(nfc_targets_found);
+
 static void nfc_release(struct device *d)
 {
 	struct nfc_dev *dev = to_nfc_dev(d);
 
 	nfc_dbg("dev_name=%s", dev_name(&dev->dev));
 
+	nfc_genl_data_exit(&dev->genl_data);
+	kfree(dev->targets);
 	kfree(dev);
 }
 
@@ -298,6 +346,12 @@ struct nfc_dev *nfc_allocate_device(struct nfc_ops *ops,
 	dev->ops = ops;
 	dev->supported_protocols = supported_protocols;
 
+	spin_lock_init(&dev->targets_lock);
+	nfc_genl_data_init(&dev->genl_data);
+
+	/* first generation must not be 0 */
+	dev->targets_generation = 1;
+
 	return dev;
 }
 EXPORT_SYMBOL(nfc_allocate_device);
@@ -318,7 +372,16 @@ int nfc_register_device(struct nfc_dev *dev)
 	rc = device_add(&dev->dev);
 	mutex_unlock(&nfc_devlist_mutex);
 
-	return rc;
+	if (rc < 0)
+		return rc;
+
+	rc = nfc_genl_device_added(dev);
+	if (rc)
+		nfc_dbg("The userspace won't be notified that the device %s was"
+						" added", dev_name(&dev->dev));
+
+
+	return 0;
 }
 EXPORT_SYMBOL(nfc_register_device);
 
@@ -329,6 +392,8 @@ EXPORT_SYMBOL(nfc_register_device);
  */
 void nfc_unregister_device(struct nfc_dev *dev)
 {
+	int rc;
+
 	nfc_dbg("dev_name=%s", dev_name(&dev->dev));
 
 	mutex_lock(&nfc_devlist_mutex);
@@ -341,18 +406,42 @@ void nfc_unregister_device(struct nfc_dev *dev)
 	device_unlock(&dev->dev);
 
 	mutex_unlock(&nfc_devlist_mutex);
+
+	rc = nfc_genl_device_removed(dev);
+	if (rc)
+		nfc_dbg("The userspace won't be notified that the device %s"
+					" was removed", dev_name(&dev->dev));
+
 }
 EXPORT_SYMBOL(nfc_unregister_device);
 
 static int __init nfc_init(void)
 {
+	int rc;
+
 	nfc_info("NFC Core ver %s", VERSION);
 
-	return class_register(&nfc_class);
+	rc = class_register(&nfc_class);
+	if (rc)
+		return rc;
+
+	rc = nfc_genl_init();
+	if (rc)
+		goto err_genl;
+
+	/* the first generation must not be 0 */
+	nfc_devlist_generation = 1;
+
+	return 0;
+
+err_genl:
+	class_unregister(&nfc_class);
+	return rc;
 }
 
 static void __exit nfc_exit(void)
 {
+	nfc_genl_exit();
 	class_unregister(&nfc_class);
 }
 
diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c
new file mode 100644
index 000000000000..ccdff7953f7d
--- /dev/null
+++ b/net/nfc/netlink.c
@@ -0,0 +1,537 @@
+/*
+ * Copyright (C) 2011 Instituto Nokia de Tecnologia
+ *
+ * Authors:
+ *    Lauro Ramos Venancio <lauro.venancio@openbossa.org>
+ *    Aloisio Almeida Jr <aloisio.almeida@openbossa.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the
+ * Free Software Foundation, Inc.,
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+
+#include <net/genetlink.h>
+#include <linux/nfc.h>
+#include <linux/slab.h>
+
+#include "nfc.h"
+
+static struct genl_multicast_group nfc_genl_event_mcgrp = {
+	.name = NFC_GENL_MCAST_EVENT_NAME,
+};
+
+struct genl_family nfc_genl_family = {
+	.id = GENL_ID_GENERATE,
+	.hdrsize = 0,
+	.name = NFC_GENL_NAME,
+	.version = NFC_GENL_VERSION,
+	.maxattr = NFC_ATTR_MAX,
+};
+
+static const struct nla_policy nfc_genl_policy[NFC_ATTR_MAX + 1] = {
+	[NFC_ATTR_DEVICE_INDEX] = { .type = NLA_U32 },
+	[NFC_ATTR_DEVICE_NAME] = { .type = NLA_STRING,
+				.len = NFC_DEVICE_NAME_MAXSIZE },
+	[NFC_ATTR_PROTOCOLS] = { .type = NLA_U32 },
+};
+
+static int nfc_genl_send_target(struct sk_buff *msg, struct nfc_target *target,
+					struct netlink_callback *cb, int flags)
+{
+	void *hdr;
+
+	nfc_dbg("entry");
+
+	hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
+				&nfc_genl_family, flags, NFC_CMD_GET_TARGET);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	genl_dump_check_consistent(cb, hdr, &nfc_genl_family);
+
+	NLA_PUT_U32(msg, NFC_ATTR_TARGET_INDEX, target->idx);
+	NLA_PUT_U32(msg, NFC_ATTR_PROTOCOLS,
+				target->supported_protocols);
+	NLA_PUT_U16(msg, NFC_ATTR_TARGET_SENS_RES, target->sens_res);
+	NLA_PUT_U8(msg, NFC_ATTR_TARGET_SEL_RES, target->sel_res);
+
+	return genlmsg_end(msg, hdr);
+
+nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+static struct nfc_dev *__get_device_from_cb(struct netlink_callback *cb)
+{
+	struct nfc_dev *dev;
+	int rc;
+	u32 idx;
+
+	rc = nlmsg_parse(cb->nlh, GENL_HDRLEN + nfc_genl_family.hdrsize,
+						nfc_genl_family.attrbuf,
+						nfc_genl_family.maxattr,
+						nfc_genl_policy);
+	if (rc < 0)
+		return ERR_PTR(rc);
+
+	if (!nfc_genl_family.attrbuf[NFC_ATTR_DEVICE_INDEX])
+		return ERR_PTR(-EINVAL);
+
+	idx = nla_get_u32(nfc_genl_family.attrbuf[NFC_ATTR_DEVICE_INDEX]);
+
+	dev = nfc_get_device(idx);
+	if (!dev)
+		return ERR_PTR(-ENODEV);
+
+	return dev;
+}
+
+static int nfc_genl_dump_targets(struct sk_buff *skb,
+				struct netlink_callback *cb)
+{
+	int i = cb->args[0];
+	struct nfc_dev *dev = (struct nfc_dev *) cb->args[1];
+	int rc;
+
+	nfc_dbg("entry");
+
+	if (!dev) {
+		dev = __get_device_from_cb(cb);
+		if (IS_ERR(dev))
+			return PTR_ERR(dev);
+
+		cb->args[1] = (long) dev;
+	}
+
+	spin_lock_bh(&dev->targets_lock);
+
+	cb->seq = dev->targets_generation;
+
+	while (i < dev->n_targets) {
+		rc = nfc_genl_send_target(skb, &dev->targets[i], cb,
+								NLM_F_MULTI);
+		if (rc < 0)
+			break;
+
+		i++;
+	}
+
+	spin_unlock_bh(&dev->targets_lock);
+
+	cb->args[0] = i;
+
+	return skb->len;
+}
+
+static int nfc_genl_dump_targets_done(struct netlink_callback *cb)
+{
+	struct nfc_dev *dev = (struct nfc_dev *) cb->args[1];
+
+	nfc_dbg("entry");
+
+	if (dev)
+		nfc_put_device(dev);
+
+	return 0;
+}
+
+int nfc_genl_targets_found(struct nfc_dev *dev)
+{
+	struct sk_buff *msg;
+	void *hdr;
+
+	nfc_dbg("entry");
+
+	dev->genl_data.poll_req_pid = 0;
+
+	msg = nlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC);
+	if (!msg)
+		return -ENOMEM;
+
+	hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0,
+				NFC_EVENT_TARGETS_FOUND);
+	if (!hdr)
+		goto free_msg;
+
+	NLA_PUT_U32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx);
+
+	genlmsg_end(msg, hdr);
+
+	return genlmsg_multicast(msg, 0, nfc_genl_event_mcgrp.id, GFP_ATOMIC);
+
+nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+free_msg:
+	nlmsg_free(msg);
+	return -EMSGSIZE;
+}
+
+int nfc_genl_device_added(struct nfc_dev *dev)
+{
+	struct sk_buff *msg;
+	void *hdr;
+
+	nfc_dbg("entry");
+
+	msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0,
+				NFC_EVENT_DEVICE_ADDED);
+	if (!hdr)
+		goto free_msg;
+
+	NLA_PUT_STRING(msg, NFC_ATTR_DEVICE_NAME, nfc_device_name(dev));
+	NLA_PUT_U32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx);
+	NLA_PUT_U32(msg, NFC_ATTR_PROTOCOLS, dev->supported_protocols);
+
+	genlmsg_end(msg, hdr);
+
+	genlmsg_multicast(msg, 0, nfc_genl_event_mcgrp.id, GFP_KERNEL);
+
+	return 0;
+
+nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+free_msg:
+	nlmsg_free(msg);
+	return -EMSGSIZE;
+}
+
+int nfc_genl_device_removed(struct nfc_dev *dev)
+{
+	struct sk_buff *msg;
+	void *hdr;
+
+	nfc_dbg("entry");
+
+	msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0,
+				NFC_EVENT_DEVICE_REMOVED);
+	if (!hdr)
+		goto free_msg;
+
+	NLA_PUT_U32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx);
+
+	genlmsg_end(msg, hdr);
+
+	genlmsg_multicast(msg, 0, nfc_genl_event_mcgrp.id, GFP_KERNEL);
+
+	return 0;
+
+nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+free_msg:
+	nlmsg_free(msg);
+	return -EMSGSIZE;
+}
+
+static int nfc_genl_send_device(struct sk_buff *msg, struct nfc_dev *dev,
+						u32 pid, u32 seq,
+						struct netlink_callback *cb,
+						int flags)
+{
+	void *hdr;
+
+	nfc_dbg("entry");
+
+	hdr = genlmsg_put(msg, pid, seq, &nfc_genl_family, flags,
+							NFC_CMD_GET_DEVICE);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (cb)
+		genl_dump_check_consistent(cb, hdr, &nfc_genl_family);
+
+	NLA_PUT_STRING(msg, NFC_ATTR_DEVICE_NAME, nfc_device_name(dev));
+	NLA_PUT_U32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx);
+	NLA_PUT_U32(msg, NFC_ATTR_PROTOCOLS, dev->supported_protocols);
+
+	return genlmsg_end(msg, hdr);
+
+nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+static int nfc_genl_dump_devices(struct sk_buff *skb,
+				struct netlink_callback *cb)
+{
+	struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0];
+	struct nfc_dev *dev = (struct nfc_dev *) cb->args[1];
+	bool first_call = false;
+
+	nfc_dbg("entry");
+
+	if (!iter) {
+		first_call = true;
+		iter = kmalloc(sizeof(struct class_dev_iter), GFP_KERNEL);
+		if (!iter)
+			return -ENOMEM;
+		cb->args[0] = (long) iter;
+	}
+
+	mutex_lock(&nfc_devlist_mutex);
+
+	cb->seq = nfc_devlist_generation;
+
+	if (first_call) {
+		nfc_device_iter_init(iter);
+		dev = nfc_device_iter_next(iter);
+	}
+
+	while (dev) {
+		int rc;
+
+		rc = nfc_genl_send_device(skb, dev, NETLINK_CB(cb->skb).pid,
+							cb->nlh->nlmsg_seq,
+							cb, NLM_F_MULTI);
+		if (rc < 0)
+			break;
+
+		dev = nfc_device_iter_next(iter);
+	}
+
+	mutex_unlock(&nfc_devlist_mutex);
+
+	cb->args[1] = (long) dev;
+
+	return skb->len;
+}
+
+static int nfc_genl_dump_devices_done(struct netlink_callback *cb)
+{
+	struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0];
+
+	nfc_dbg("entry");
+
+	nfc_device_iter_exit(iter);
+	kfree(iter);
+
+	return 0;
+}
+
+static int nfc_genl_get_device(struct sk_buff *skb, struct genl_info *info)
+{
+	struct sk_buff *msg;
+	struct nfc_dev *dev;
+	u32 idx;
+	int rc = -ENOBUFS;
+
+	nfc_dbg("entry");
+
+	if (!info->attrs[NFC_ATTR_DEVICE_INDEX])
+		return -EINVAL;
+
+	idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]);
+
+	dev = nfc_get_device(idx);
+	if (!dev)
+		return -ENODEV;
+
+	msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!msg) {
+		rc = -ENOMEM;
+		goto out_putdev;
+	}
+
+	rc = nfc_genl_send_device(msg, dev, info->snd_pid, info->snd_seq,
+								NULL, 0);
+	if (rc < 0)
+		goto out_free;
+
+	nfc_put_device(dev);
+
+	return genlmsg_reply(msg, info);
+
+out_free:
+	nlmsg_free(msg);
+out_putdev:
+	nfc_put_device(dev);
+	return rc;
+}
+
+static int nfc_genl_start_poll(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nfc_dev *dev;
+	int rc;
+	u32 idx;
+	u32 protocols;
+
+	nfc_dbg("entry");
+
+	if (!info->attrs[NFC_ATTR_DEVICE_INDEX] ||
+		!info->attrs[NFC_ATTR_PROTOCOLS])
+		return -EINVAL;
+
+	idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]);
+	protocols = nla_get_u32(info->attrs[NFC_ATTR_PROTOCOLS]);
+
+	dev = nfc_get_device(idx);
+	if (!dev)
+		return -ENODEV;
+
+	mutex_lock(&dev->genl_data.genl_data_mutex);
+
+	rc = nfc_start_poll(dev, protocols);
+	if (!rc)
+		dev->genl_data.poll_req_pid = info->snd_pid;
+
+	mutex_unlock(&dev->genl_data.genl_data_mutex);
+
+	nfc_put_device(dev);
+	return rc;
+}
+
+static int nfc_genl_stop_poll(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nfc_dev *dev;
+	int rc;
+	u32 idx;
+
+	nfc_dbg("entry");
+
+	if (!info->attrs[NFC_ATTR_DEVICE_INDEX])
+		return -EINVAL;
+
+	idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]);
+
+	dev = nfc_get_device(idx);
+	if (!dev)
+		return -ENODEV;
+
+	mutex_lock(&dev->genl_data.genl_data_mutex);
+
+	if (dev->genl_data.poll_req_pid != info->snd_pid) {
+		rc = -EBUSY;
+		goto out;
+	}
+
+	rc = nfc_stop_poll(dev);
+	dev->genl_data.poll_req_pid = 0;
+
+out:
+	mutex_unlock(&dev->genl_data.genl_data_mutex);
+	nfc_put_device(dev);
+	return rc;
+}
+
+static struct genl_ops nfc_genl_ops[] = {
+	{
+		.cmd = NFC_CMD_GET_DEVICE,
+		.doit = nfc_genl_get_device,
+		.dumpit = nfc_genl_dump_devices,
+		.done = nfc_genl_dump_devices_done,
+		.policy = nfc_genl_policy,
+	},
+	{
+		.cmd = NFC_CMD_START_POLL,
+		.doit = nfc_genl_start_poll,
+		.policy = nfc_genl_policy,
+	},
+	{
+		.cmd = NFC_CMD_STOP_POLL,
+		.doit = nfc_genl_stop_poll,
+		.policy = nfc_genl_policy,
+	},
+	{
+		.cmd = NFC_CMD_GET_TARGET,
+		.dumpit = nfc_genl_dump_targets,
+		.done = nfc_genl_dump_targets_done,
+		.policy = nfc_genl_policy,
+	},
+};
+
+static int nfc_genl_rcv_nl_event(struct notifier_block *this,
+						unsigned long event, void *ptr)
+{
+	struct netlink_notify *n = ptr;
+	struct class_dev_iter iter;
+	struct nfc_dev *dev;
+
+	if (event != NETLINK_URELEASE || n->protocol != NETLINK_GENERIC)
+		goto out;
+
+	nfc_dbg("NETLINK_URELEASE event from id %d", n->pid);
+
+	nfc_device_iter_init(&iter);
+	dev = nfc_device_iter_next(&iter);
+
+	while (dev) {
+		mutex_lock(&dev->genl_data.genl_data_mutex);
+		if (dev->genl_data.poll_req_pid == n->pid) {
+			nfc_stop_poll(dev);
+			dev->genl_data.poll_req_pid = 0;
+		}
+		mutex_unlock(&dev->genl_data.genl_data_mutex);
+		dev = nfc_device_iter_next(&iter);
+	}
+
+	nfc_device_iter_exit(&iter);
+
+out:
+	return NOTIFY_DONE;
+}
+
+void nfc_genl_data_init(struct nfc_genl_data *genl_data)
+{
+	genl_data->poll_req_pid = 0;
+	mutex_init(&genl_data->genl_data_mutex);
+}
+
+void nfc_genl_data_exit(struct nfc_genl_data *genl_data)
+{
+	mutex_destroy(&genl_data->genl_data_mutex);
+}
+
+static struct notifier_block nl_notifier = {
+	.notifier_call  = nfc_genl_rcv_nl_event,
+};
+
+/**
+ * nfc_genl_init() - Initialize netlink interface
+ *
+ * This initialization function registers the nfc netlink family.
+ */
+int __init nfc_genl_init(void)
+{
+	int rc;
+
+	rc = genl_register_family_with_ops(&nfc_genl_family, nfc_genl_ops,
+					ARRAY_SIZE(nfc_genl_ops));
+	if (rc)
+		return rc;
+
+	rc = genl_register_mc_group(&nfc_genl_family, &nfc_genl_event_mcgrp);
+
+	netlink_register_notifier(&nl_notifier);
+
+	return rc;
+}
+
+/**
+ * nfc_genl_exit() - Deinitialize netlink interface
+ *
+ * This exit function unregisters the nfc netlink family.
+ */
+void nfc_genl_exit(void)
+{
+	netlink_unregister_notifier(&nl_notifier);
+	genl_unregister_family(&nfc_genl_family);
+}
diff --git a/net/nfc/nfc.h b/net/nfc/nfc.h
index 55f83f86fe2c..2b31e808e6fb 100644
--- a/net/nfc/nfc.h
+++ b/net/nfc/nfc.h
@@ -36,6 +36,17 @@ int nfc_printk(const char *level, const char *fmt, ...);
 extern int nfc_devlist_generation;
 extern struct mutex nfc_devlist_mutex;
 
+int __init nfc_genl_init(void);
+void nfc_genl_exit(void);
+
+void nfc_genl_data_init(struct nfc_genl_data *genl_data);
+void nfc_genl_data_exit(struct nfc_genl_data *genl_data);
+
+int nfc_genl_targets_found(struct nfc_dev *dev);
+
+int nfc_genl_device_added(struct nfc_dev *dev);
+int nfc_genl_device_removed(struct nfc_dev *dev);
+
 struct nfc_dev *nfc_get_device(unsigned idx);
 
 static inline void nfc_put_device(struct nfc_dev *dev)
-- 
cgit v1.2.3


From c7fe3b52c1283b8ba810eb6ecddf1c8a0bcc13ab Mon Sep 17 00:00:00 2001
From: Aloisio Almeida Jr <aloisio.almeida@openbossa.org>
Date: Fri, 1 Jul 2011 19:31:35 -0300
Subject: NFC: add NFC socket family

Signed-off-by: Lauro Ramos Venancio <lauro.venancio@openbossa.org>
Signed-off-by: Aloisio Almeida Jr <aloisio.almeida@openbossa.org>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 include/linux/nfc.h    |  3 ++
 include/linux/socket.h |  4 ++-
 net/core/sock.c        |  6 ++--
 net/nfc/Makefile       |  2 +-
 net/nfc/af_nfc.c       | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++
 net/nfc/core.c         |  7 ++++
 net/nfc/nfc.h          | 14 ++++++++
 7 files changed, 129 insertions(+), 5 deletions(-)
 create mode 100644 net/nfc/af_nfc.c

(limited to 'include/linux')

diff --git a/include/linux/nfc.h b/include/linux/nfc.h
index 1170476d270a..15f8cb3edcc6 100644
--- a/include/linux/nfc.h
+++ b/include/linux/nfc.h
@@ -109,4 +109,7 @@ enum nfc_attrs {
 #define NFC_PROTO_ISO14443_MASK	(1 << NFC_PROTO_ISO14443)
 #define NFC_PROTO_NFC_DEP_MASK	(1 << NFC_PROTO_NFC_DEP)
 
+/* NFC socket protocols */
+#define NFC_SOCKPROTO_MAX	0
+
 #endif /*__LINUX_NFC_H */
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 4ef98e422fde..e17f82266639 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -192,7 +192,8 @@ struct ucred {
 #define AF_IEEE802154	36	/* IEEE802154 sockets		*/
 #define AF_CAIF		37	/* CAIF sockets			*/
 #define AF_ALG		38	/* Algorithm sockets		*/
-#define AF_MAX		39	/* For now.. */
+#define AF_NFC		39	/* NFC sockets			*/
+#define AF_MAX		40	/* For now.. */
 
 /* Protocol families, same as address families. */
 #define PF_UNSPEC	AF_UNSPEC
@@ -234,6 +235,7 @@ struct ucred {
 #define PF_IEEE802154	AF_IEEE802154
 #define PF_CAIF		AF_CAIF
 #define PF_ALG		AF_ALG
+#define PF_NFC		AF_NFC
 #define PF_MAX		AF_MAX
 
 /* Maximum queue length specifiable by listen.  */
diff --git a/net/core/sock.c b/net/core/sock.c
index 6e819780c232..84d6de809352 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -158,7 +158,7 @@ static const char *const af_family_key_strings[AF_MAX+1] = {
   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
   "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
-  "sk_lock-AF_MAX"
+  "sk_lock-AF_NFC"   , "sk_lock-AF_MAX"
 };
 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
@@ -174,7 +174,7 @@ static const char *const af_family_slock_key_strings[AF_MAX+1] = {
   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
   "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
-  "slock-AF_MAX"
+  "slock-AF_NFC"   , "slock-AF_MAX"
 };
 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
@@ -190,7 +190,7 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = {
   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
   "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
-  "clock-AF_MAX"
+  "clock-AF_NFC"   , "clock-AF_MAX"
 };
 
 /*
diff --git a/net/nfc/Makefile b/net/nfc/Makefile
index fa6fc16246d6..e081fdb86a59 100644
--- a/net/nfc/Makefile
+++ b/net/nfc/Makefile
@@ -4,4 +4,4 @@
 
 obj-$(CONFIG_NFC) += nfc.o
 
-nfc-objs := core.o netlink.o
+nfc-objs := core.o netlink.o af_nfc.o
diff --git a/net/nfc/af_nfc.c b/net/nfc/af_nfc.c
new file mode 100644
index 000000000000..e982cef8f49d
--- /dev/null
+++ b/net/nfc/af_nfc.c
@@ -0,0 +1,98 @@
+/*
+ * Copyright (C) 2011 Instituto Nokia de Tecnologia
+ *
+ * Authors:
+ *    Aloisio Almeida Jr <aloisio.almeida@openbossa.org>
+ *    Lauro Ramos Venancio <lauro.venancio@openbossa.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the
+ * Free Software Foundation, Inc.,
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/nfc.h>
+
+#include "nfc.h"
+
+static DEFINE_RWLOCK(proto_tab_lock);
+static const struct nfc_protocol *proto_tab[NFC_SOCKPROTO_MAX];
+
+static int nfc_sock_create(struct net *net, struct socket *sock, int proto,
+								int kern)
+{
+	int rc = -EPROTONOSUPPORT;
+
+	if (net != &init_net)
+		return -EAFNOSUPPORT;
+
+	if (proto < 0 || proto >= NFC_SOCKPROTO_MAX)
+		return -EINVAL;
+
+	read_lock(&proto_tab_lock);
+	if (proto_tab[proto] &&	try_module_get(proto_tab[proto]->owner)) {
+		rc = proto_tab[proto]->create(net, sock, proto_tab[proto]);
+		module_put(proto_tab[proto]->owner);
+	}
+	read_unlock(&proto_tab_lock);
+
+	return rc;
+}
+
+static struct net_proto_family nfc_sock_family_ops = {
+	.owner  = THIS_MODULE,
+	.family = PF_NFC,
+	.create = nfc_sock_create,
+};
+
+int nfc_proto_register(const struct nfc_protocol *nfc_proto)
+{
+	int rc;
+
+	if (nfc_proto->id < 0 || nfc_proto->id >= NFC_SOCKPROTO_MAX)
+		return -EINVAL;
+
+	rc = proto_register(nfc_proto->proto, 0);
+	if (rc)
+		return rc;
+
+	write_lock(&proto_tab_lock);
+	if (proto_tab[nfc_proto->id])
+		rc = -EBUSY;
+	else
+		proto_tab[nfc_proto->id] = nfc_proto;
+	write_unlock(&proto_tab_lock);
+
+	return rc;
+}
+EXPORT_SYMBOL(nfc_proto_register);
+
+void nfc_proto_unregister(const struct nfc_protocol *nfc_proto)
+{
+	write_lock(&proto_tab_lock);
+	proto_tab[nfc_proto->id] = NULL;
+	write_unlock(&proto_tab_lock);
+
+	proto_unregister(nfc_proto->proto);
+}
+EXPORT_SYMBOL(nfc_proto_unregister);
+
+int __init af_nfc_init(void)
+{
+	return sock_register(&nfc_sock_family_ops);
+}
+
+void af_nfc_exit(void)
+{
+	sock_unregister(PF_NFC);
+}
diff --git a/net/nfc/core.c b/net/nfc/core.c
index c70f607455c5..e804dc50f42f 100644
--- a/net/nfc/core.c
+++ b/net/nfc/core.c
@@ -432,8 +432,14 @@ static int __init nfc_init(void)
 	/* the first generation must not be 0 */
 	nfc_devlist_generation = 1;
 
+	rc = af_nfc_init();
+	if (rc)
+		goto err_af_nfc;
+
 	return 0;
 
+err_af_nfc:
+	nfc_genl_exit();
 err_genl:
 	class_unregister(&nfc_class);
 	return rc;
@@ -441,6 +447,7 @@ err_genl:
 
 static void __exit nfc_exit(void)
 {
+	af_nfc_exit();
 	nfc_genl_exit();
 	class_unregister(&nfc_class);
 }
diff --git a/net/nfc/nfc.h b/net/nfc/nfc.h
index 2b31e808e6fb..8335f4de8f4f 100644
--- a/net/nfc/nfc.h
+++ b/net/nfc/nfc.h
@@ -25,6 +25,7 @@
 #define __LOCAL_NFC_H
 
 #include <net/nfc.h>
+#include <net/sock.h>
 
 __attribute__((format (printf, 2, 3)))
 int nfc_printk(const char *level, const char *fmt, ...);
@@ -33,6 +34,19 @@ int nfc_printk(const char *level, const char *fmt, ...);
 #define nfc_err(fmt, arg...) nfc_printk(KERN_ERR, fmt, ##arg)
 #define nfc_dbg(fmt, arg...) pr_debug(fmt "\n", ##arg)
 
+struct nfc_protocol {
+	int id;
+	struct proto *proto;
+	struct module *owner;
+	int (*create)(struct net *net, struct socket *sock,
+			const struct nfc_protocol *nfc_proto);
+};
+
+int __init af_nfc_init(void);
+void af_nfc_exit(void);
+int nfc_proto_register(const struct nfc_protocol *nfc_proto);
+void nfc_proto_unregister(const struct nfc_protocol *nfc_proto);
+
 extern int nfc_devlist_generation;
 extern struct mutex nfc_devlist_mutex;
 
-- 
cgit v1.2.3


From 23b7869c0fd08d73c9f83a2db88a13312d6198bb Mon Sep 17 00:00:00 2001
From: Lauro Ramos Venancio <lauro.venancio@openbossa.org>
Date: Fri, 1 Jul 2011 19:31:36 -0300
Subject: NFC: add the NFC socket raw protocol

This socket protocol is used to perform data exchange with NFC
targets.

Signed-off-by: Lauro Ramos Venancio <lauro.venancio@openbossa.org>
Signed-off-by: Aloisio Almeida Jr <aloisio.almeida@openbossa.org>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 include/linux/nfc.h |  13 +-
 net/nfc/Makefile    |   2 +-
 net/nfc/core.c      |   7 ++
 net/nfc/nfc.h       |  14 +++
 net/nfc/rawsock.c   | 354 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 388 insertions(+), 2 deletions(-)
 create mode 100644 net/nfc/rawsock.c

(limited to 'include/linux')

diff --git a/include/linux/nfc.h b/include/linux/nfc.h
index 15f8cb3edcc6..330a4c5db588 100644
--- a/include/linux/nfc.h
+++ b/include/linux/nfc.h
@@ -24,6 +24,9 @@
 #ifndef __LINUX_NFC_H
 #define __LINUX_NFC_H
 
+#include <linux/types.h>
+#include <linux/socket.h>
+
 #define NFC_GENL_NAME "nfc"
 #define NFC_GENL_VERSION 1
 
@@ -109,7 +112,15 @@ enum nfc_attrs {
 #define NFC_PROTO_ISO14443_MASK	(1 << NFC_PROTO_ISO14443)
 #define NFC_PROTO_NFC_DEP_MASK	(1 << NFC_PROTO_NFC_DEP)
 
+struct sockaddr_nfc {
+	sa_family_t sa_family;
+	__u32 dev_idx;
+	__u32 target_idx;
+	__u32 nfc_protocol;
+};
+
 /* NFC socket protocols */
-#define NFC_SOCKPROTO_MAX	0
+#define NFC_SOCKPROTO_RAW	0
+#define NFC_SOCKPROTO_MAX	1
 
 #endif /*__LINUX_NFC_H */
diff --git a/net/nfc/Makefile b/net/nfc/Makefile
index e081fdb86a59..16250c353851 100644
--- a/net/nfc/Makefile
+++ b/net/nfc/Makefile
@@ -4,4 +4,4 @@
 
 obj-$(CONFIG_NFC) += nfc.o
 
-nfc-objs := core.o netlink.o af_nfc.o
+nfc-objs := core.o netlink.o af_nfc.o rawsock.o
diff --git a/net/nfc/core.c b/net/nfc/core.c
index e804dc50f42f..b6fd4e1f2057 100644
--- a/net/nfc/core.c
+++ b/net/nfc/core.c
@@ -432,6 +432,10 @@ static int __init nfc_init(void)
 	/* the first generation must not be 0 */
 	nfc_devlist_generation = 1;
 
+	rc = rawsock_init();
+	if (rc)
+		goto err_rawsock;
+
 	rc = af_nfc_init();
 	if (rc)
 		goto err_af_nfc;
@@ -439,6 +443,8 @@ static int __init nfc_init(void)
 	return 0;
 
 err_af_nfc:
+	rawsock_exit();
+err_rawsock:
 	nfc_genl_exit();
 err_genl:
 	class_unregister(&nfc_class);
@@ -448,6 +454,7 @@ err_genl:
 static void __exit nfc_exit(void)
 {
 	af_nfc_exit();
+	rawsock_exit();
 	nfc_genl_exit();
 	class_unregister(&nfc_class);
 }
diff --git a/net/nfc/nfc.h b/net/nfc/nfc.h
index 8335f4de8f4f..aaf9832298f3 100644
--- a/net/nfc/nfc.h
+++ b/net/nfc/nfc.h
@@ -42,6 +42,20 @@ struct nfc_protocol {
 			const struct nfc_protocol *nfc_proto);
 };
 
+struct nfc_rawsock {
+	struct sock sk;
+	struct nfc_dev *dev;
+	u32 target_idx;
+	struct work_struct tx_work;
+	bool tx_work_scheduled;
+};
+#define nfc_rawsock(sk) ((struct nfc_rawsock *) sk)
+#define to_rawsock_sk(_tx_work) \
+	((struct sock *) container_of(_tx_work, struct nfc_rawsock, tx_work))
+
+int __init rawsock_init(void);
+void rawsock_exit(void);
+
 int __init af_nfc_init(void);
 void af_nfc_exit(void);
 int nfc_proto_register(const struct nfc_protocol *nfc_proto);
diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c
new file mode 100644
index 000000000000..52de84a55115
--- /dev/null
+++ b/net/nfc/rawsock.c
@@ -0,0 +1,354 @@
+/*
+ * Copyright (C) 2011 Instituto Nokia de Tecnologia
+ *
+ * Authors:
+ *    Aloisio Almeida Jr <aloisio.almeida@openbossa.org>
+ *    Lauro Ramos Venancio <lauro.venancio@openbossa.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the
+ * Free Software Foundation, Inc.,
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+
+#include <net/tcp_states.h>
+#include <linux/nfc.h>
+
+#include "nfc.h"
+
+static void rawsock_write_queue_purge(struct sock *sk)
+{
+	nfc_dbg("sk=%p", sk);
+
+	spin_lock_bh(&sk->sk_write_queue.lock);
+	__skb_queue_purge(&sk->sk_write_queue);
+	nfc_rawsock(sk)->tx_work_scheduled = false;
+	spin_unlock_bh(&sk->sk_write_queue.lock);
+}
+
+static void rawsock_report_error(struct sock *sk, int err)
+{
+	nfc_dbg("sk=%p err=%d", sk, err);
+
+	sk->sk_shutdown = SHUTDOWN_MASK;
+	sk->sk_err = -err;
+	sk->sk_error_report(sk);
+
+	rawsock_write_queue_purge(sk);
+}
+
+static int rawsock_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+
+	nfc_dbg("sock=%p", sock);
+
+	sock_orphan(sk);
+	sock_put(sk);
+
+	return 0;
+}
+
+static int rawsock_connect(struct socket *sock, struct sockaddr *_addr,
+							int len, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct sockaddr_nfc *addr = (struct sockaddr_nfc *)_addr;
+	struct nfc_dev *dev;
+	int rc = 0;
+
+	nfc_dbg("sock=%p sk=%p flags=%d", sock, sk, flags);
+
+	if (!addr || len < sizeof(struct sockaddr_nfc) ||
+		addr->sa_family != AF_NFC)
+		return -EINVAL;
+
+	nfc_dbg("addr dev_idx=%u target_idx=%u protocol=%u", addr->dev_idx,
+					addr->target_idx, addr->nfc_protocol);
+
+	lock_sock(sk);
+
+	if (sock->state == SS_CONNECTED) {
+		rc = -EISCONN;
+		goto error;
+	}
+
+	dev = nfc_get_device(addr->dev_idx);
+	if (!dev) {
+		rc = -ENODEV;
+		goto error;
+	}
+
+	if (addr->target_idx > dev->target_idx - 1 ||
+		addr->target_idx < dev->target_idx - dev->n_targets) {
+		rc = -EINVAL;
+		goto error;
+	}
+
+	if (addr->target_idx > dev->target_idx - 1 ||
+		addr->target_idx < dev->target_idx - dev->n_targets) {
+		rc = -EINVAL;
+		goto error;
+	}
+
+	rc = nfc_activate_target(dev, addr->target_idx, addr->nfc_protocol);
+	if (rc)
+		goto put_dev;
+
+	nfc_rawsock(sk)->dev = dev;
+	nfc_rawsock(sk)->target_idx = addr->target_idx;
+	sock->state = SS_CONNECTED;
+	sk->sk_state = TCP_ESTABLISHED;
+	sk->sk_state_change(sk);
+
+	release_sock(sk);
+	return 0;
+
+put_dev:
+	nfc_put_device(dev);
+error:
+	release_sock(sk);
+	return rc;
+}
+
+static int rawsock_add_header(struct sk_buff *skb)
+{
+
+	if (skb_cow_head(skb, 1))
+		return -ENOMEM;
+
+	*skb_push(skb, 1) = 0;
+
+	return 0;
+}
+
+static void rawsock_data_exchange_complete(void *context, struct sk_buff *skb,
+								int err)
+{
+	struct sock *sk = (struct sock *) context;
+
+	BUG_ON(in_irq());
+
+	nfc_dbg("sk=%p err=%d", sk, err);
+
+	if (err)
+		goto error;
+
+	err = rawsock_add_header(skb);
+	if (err)
+		goto error;
+
+	err = sock_queue_rcv_skb(sk, skb);
+	if (err)
+		goto error;
+
+	spin_lock_bh(&sk->sk_write_queue.lock);
+	if (!skb_queue_empty(&sk->sk_write_queue))
+		schedule_work(&nfc_rawsock(sk)->tx_work);
+	else
+		nfc_rawsock(sk)->tx_work_scheduled = false;
+	spin_unlock_bh(&sk->sk_write_queue.lock);
+
+	sock_put(sk);
+	return;
+
+error:
+	rawsock_report_error(sk, err);
+	sock_put(sk);
+}
+
+static void rawsock_tx_work(struct work_struct *work)
+{
+	struct sock *sk = to_rawsock_sk(work);
+	struct nfc_dev *dev = nfc_rawsock(sk)->dev;
+	u32 target_idx = nfc_rawsock(sk)->target_idx;
+	struct sk_buff *skb;
+	int rc;
+
+	nfc_dbg("sk=%p target_idx=%u", sk, target_idx);
+
+	if (sk->sk_shutdown & SEND_SHUTDOWN) {
+		rawsock_write_queue_purge(sk);
+		return;
+	}
+
+	skb = skb_dequeue(&sk->sk_write_queue);
+
+	sock_hold(sk);
+	rc = nfc_data_exchange(dev, target_idx, skb,
+				rawsock_data_exchange_complete, sk);
+	if (rc) {
+		rawsock_report_error(sk, rc);
+		sock_put(sk);
+	}
+}
+
+static int rawsock_sendmsg(struct kiocb *iocb, struct socket *sock,
+					struct msghdr *msg, size_t len)
+{
+	struct sock *sk = sock->sk;
+	struct sk_buff *skb;
+	int rc;
+
+	nfc_dbg("sock=%p sk=%p len=%zu", sock, sk, len);
+
+	if (msg->msg_namelen)
+		return -EOPNOTSUPP;
+
+	if (sock->state != SS_CONNECTED)
+		return -ENOTCONN;
+
+	skb = sock_alloc_send_skb(sk, len, msg->msg_flags & MSG_DONTWAIT,
+									&rc);
+	if (!skb)
+		return rc;
+
+	rc = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
+	if (rc < 0) {
+		kfree_skb(skb);
+		return rc;
+	}
+
+	spin_lock_bh(&sk->sk_write_queue.lock);
+	__skb_queue_tail(&sk->sk_write_queue, skb);
+	if (!nfc_rawsock(sk)->tx_work_scheduled) {
+		schedule_work(&nfc_rawsock(sk)->tx_work);
+		nfc_rawsock(sk)->tx_work_scheduled = true;
+	}
+	spin_unlock_bh(&sk->sk_write_queue.lock);
+
+	return len;
+}
+
+static int rawsock_recvmsg(struct kiocb *iocb, struct socket *sock,
+				struct msghdr *msg, size_t len, int flags)
+{
+	int noblock = flags & MSG_DONTWAIT;
+	struct sock *sk = sock->sk;
+	struct sk_buff *skb;
+	int copied;
+	int rc;
+
+	nfc_dbg("sock=%p sk=%p len=%zu flags=%d", sock, sk, len, flags);
+
+	skb = skb_recv_datagram(sk, flags, noblock, &rc);
+	if (!skb)
+		return rc;
+
+	msg->msg_namelen = 0;
+
+	copied = skb->len;
+	if (len < copied) {
+		msg->msg_flags |= MSG_TRUNC;
+		copied = len;
+	}
+
+	rc = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+
+	skb_free_datagram(sk, skb);
+
+	return rc ? : copied;
+}
+
+
+static const struct proto_ops rawsock_ops = {
+	.family         = PF_NFC,
+	.owner          = THIS_MODULE,
+	.release        = rawsock_release,
+	.bind           = sock_no_bind,
+	.connect        = rawsock_connect,
+	.socketpair     = sock_no_socketpair,
+	.accept         = sock_no_accept,
+	.getname        = sock_no_getname,
+	.poll           = datagram_poll,
+	.ioctl          = sock_no_ioctl,
+	.listen         = sock_no_listen,
+	.shutdown       = sock_no_shutdown,
+	.setsockopt     = sock_no_setsockopt,
+	.getsockopt     = sock_no_getsockopt,
+	.sendmsg        = rawsock_sendmsg,
+	.recvmsg        = rawsock_recvmsg,
+	.mmap           = sock_no_mmap,
+};
+
+static void rawsock_destruct(struct sock *sk)
+{
+	nfc_dbg("sk=%p", sk);
+
+	if (sk->sk_state == TCP_ESTABLISHED) {
+		nfc_deactivate_target(nfc_rawsock(sk)->dev,
+					nfc_rawsock(sk)->target_idx);
+		nfc_put_device(nfc_rawsock(sk)->dev);
+	}
+
+	skb_queue_purge(&sk->sk_receive_queue);
+
+	if (!sock_flag(sk, SOCK_DEAD)) {
+		nfc_err("Freeing alive NFC raw socket %p", sk);
+		return;
+	}
+}
+
+static int rawsock_create(struct net *net, struct socket *sock,
+				const struct nfc_protocol *nfc_proto)
+{
+	struct sock *sk;
+
+	nfc_dbg("sock=%p", sock);
+
+	if (sock->type != SOCK_SEQPACKET)
+		return -ESOCKTNOSUPPORT;
+
+	sock->ops = &rawsock_ops;
+
+	sk = sk_alloc(net, PF_NFC, GFP_KERNEL, nfc_proto->proto);
+	if (!sk)
+		return -ENOMEM;
+
+	sock_init_data(sock, sk);
+	sk->sk_protocol = nfc_proto->id;
+	sk->sk_destruct = rawsock_destruct;
+	sock->state = SS_UNCONNECTED;
+
+	INIT_WORK(&nfc_rawsock(sk)->tx_work, rawsock_tx_work);
+	nfc_rawsock(sk)->tx_work_scheduled = false;
+
+	return 0;
+}
+
+static struct proto rawsock_proto = {
+	.name     = "NFC_RAW",
+	.owner    = THIS_MODULE,
+	.obj_size = sizeof(struct nfc_rawsock),
+};
+
+static const struct nfc_protocol rawsock_nfc_proto = {
+	.id	  = NFC_SOCKPROTO_RAW,
+	.proto    = &rawsock_proto,
+	.owner    = THIS_MODULE,
+	.create   = rawsock_create
+};
+
+int __init rawsock_init(void)
+{
+	int rc;
+
+	rc = nfc_proto_register(&rawsock_nfc_proto);
+
+	return rc;
+}
+
+void rawsock_exit(void)
+{
+	nfc_proto_unregister(&rawsock_nfc_proto);
+}
-- 
cgit v1.2.3


From dc99f600698dcac69b8f56dda9a8a00d645c5ffc Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 5 Jul 2011 01:45:05 -0700
Subject: packet: Add fanout support.

Fanouts allow packet capturing to be demuxed to a set of AF_PACKET
sockets.  Two fanout policies are implemented:

1) Hashing based upon skb->rxhash

2) Pure round-robin

An AF_PACKET socket must be fully bound before it tries to add itself
to a fanout.  All AF_PACKET sockets trying to join the same fanout
must all have the same bind settings.

Fanouts are identified (within a network namespace) by a 16-bit ID.
The first socket to try to add itself to a fanout with a particular
ID, creates that fanout.  When the last socket leaves the fanout
(which happens only when the socket is closed), that fanout is
destroyed.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_packet.h |   4 +
 net/packet/af_packet.c    | 256 +++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 255 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h
index 7b318630139f..1efa1cb827f5 100644
--- a/include/linux/if_packet.h
+++ b/include/linux/if_packet.h
@@ -49,6 +49,10 @@ struct sockaddr_ll {
 #define PACKET_VNET_HDR			15
 #define PACKET_TX_TIMESTAMP		16
 #define PACKET_TIMESTAMP		17
+#define PACKET_FANOUT			18
+
+#define PACKET_FANOUT_HASH		0
+#define PACKET_FANOUT_LB		1
 
 struct tpacket_stats {
 	unsigned int	tp_packets;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index bb281bf330aa..3350f1d3c9aa 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -187,9 +187,11 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
 
 static void packet_flush_mclist(struct sock *sk);
 
+struct packet_fanout;
 struct packet_sock {
 	/* struct sock has to be the first member of packet_sock */
 	struct sock		sk;
+	struct packet_fanout	*fanout;
 	struct tpacket_stats	stats;
 	struct packet_ring_buffer	rx_ring;
 	struct packet_ring_buffer	tx_ring;
@@ -212,6 +214,24 @@ struct packet_sock {
 	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
 };
 
+#define PACKET_FANOUT_MAX	256
+
+struct packet_fanout {
+#ifdef CONFIG_NET_NS
+	struct net		*net;
+#endif
+	unsigned int		num_members;
+	u16			id;
+	u8			type;
+	u8			pad;
+	atomic_t		rr_cur;
+	struct list_head	list;
+	struct sock		*arr[PACKET_FANOUT_MAX];
+	spinlock_t		lock;
+	atomic_t		sk_ref;
+	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
+};
+
 struct packet_skb_cb {
 	unsigned int origlen;
 	union {
@@ -227,6 +247,9 @@ static inline struct packet_sock *pkt_sk(struct sock *sk)
 	return (struct packet_sock *)sk;
 }
 
+static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
+static void __fanout_link(struct sock *sk, struct packet_sock *po);
+
 /* register_prot_hook must be invoked with the po->bind_lock held,
  * or from a context in which asynchronous accesses to the packet
  * socket is not possible (packet_create()).
@@ -235,7 +258,10 @@ static void register_prot_hook(struct sock *sk)
 {
 	struct packet_sock *po = pkt_sk(sk);
 	if (!po->running) {
-		dev_add_pack(&po->prot_hook);
+		if (po->fanout)
+			__fanout_link(sk, po);
+		else
+			dev_add_pack(&po->prot_hook);
 		sock_hold(sk);
 		po->running = 1;
 	}
@@ -253,7 +279,10 @@ static void __unregister_prot_hook(struct sock *sk, bool sync)
 	struct packet_sock *po = pkt_sk(sk);
 
 	po->running = 0;
-	__dev_remove_pack(&po->prot_hook);
+	if (po->fanout)
+		__fanout_unlink(sk, po);
+	else
+		__dev_remove_pack(&po->prot_hook);
 	__sock_put(sk);
 
 	if (sync) {
@@ -388,6 +417,201 @@ static void packet_sock_destruct(struct sock *sk)
 	sk_refcnt_debug_dec(sk);
 }
 
+static int fanout_rr_next(struct packet_fanout *f, unsigned int num)
+{
+	int x = atomic_read(&f->rr_cur) + 1;
+
+	if (x >= num)
+		x = 0;
+
+	return x;
+}
+
+static struct sock *fanout_demux_hash(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
+{
+	u32 idx, hash = skb->rxhash;
+
+	idx = ((u64)hash * num) >> 32;
+
+	return f->arr[idx];
+}
+
+static struct sock *fanout_demux_lb(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
+{
+	int cur, old;
+
+	cur = atomic_read(&f->rr_cur);
+	while ((old = atomic_cmpxchg(&f->rr_cur, cur,
+				     fanout_rr_next(f, num))) != cur)
+		cur = old;
+	return f->arr[cur];
+}
+
+static int packet_rcv_fanout_hash(struct sk_buff *skb, struct net_device *dev,
+				  struct packet_type *pt, struct net_device *orig_dev)
+{
+	struct packet_fanout *f = pt->af_packet_priv;
+	unsigned int num = f->num_members;
+	struct packet_sock *po;
+	struct sock *sk;
+
+	if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
+	    !num) {
+		kfree_skb(skb);
+		return 0;
+	}
+
+	skb_get_rxhash(skb);
+
+	sk = fanout_demux_hash(f, skb, num);
+	po = pkt_sk(sk);
+
+	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
+}
+
+static int packet_rcv_fanout_lb(struct sk_buff *skb, struct net_device *dev,
+				struct packet_type *pt, struct net_device *orig_dev)
+{
+	struct packet_fanout *f = pt->af_packet_priv;
+	unsigned int num = f->num_members;
+	struct packet_sock *po;
+	struct sock *sk;
+
+	if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
+	    !num) {
+		kfree_skb(skb);
+		return 0;
+	}
+
+	sk = fanout_demux_lb(f, skb, num);
+	po = pkt_sk(sk);
+
+	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
+}
+
+static DEFINE_MUTEX(fanout_mutex);
+static LIST_HEAD(fanout_list);
+
+static void __fanout_link(struct sock *sk, struct packet_sock *po)
+{
+	struct packet_fanout *f = po->fanout;
+
+	spin_lock(&f->lock);
+	f->arr[f->num_members] = sk;
+	smp_wmb();
+	f->num_members++;
+	spin_unlock(&f->lock);
+}
+
+static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
+{
+	struct packet_fanout *f = po->fanout;
+	int i;
+
+	spin_lock(&f->lock);
+	for (i = 0; i < f->num_members; i++) {
+		if (f->arr[i] == sk)
+			break;
+	}
+	BUG_ON(i >= f->num_members);
+	f->arr[i] = f->arr[f->num_members - 1];
+	f->num_members--;
+	spin_unlock(&f->lock);
+}
+
+static int fanout_add(struct sock *sk, u16 id, u8 type)
+{
+	struct packet_sock *po = pkt_sk(sk);
+	struct packet_fanout *f, *match;
+	int err;
+
+	switch (type) {
+	case PACKET_FANOUT_HASH:
+	case PACKET_FANOUT_LB:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (!po->running)
+		return -EINVAL;
+
+	if (po->fanout)
+		return -EALREADY;
+
+	mutex_lock(&fanout_mutex);
+	match = NULL;
+	list_for_each_entry(f, &fanout_list, list) {
+		if (f->id == id &&
+		    read_pnet(&f->net) == sock_net(sk)) {
+			match = f;
+			break;
+		}
+	}
+	if (!match) {
+		match = kzalloc(sizeof(*match), GFP_KERNEL);
+		if (match) {
+			write_pnet(&match->net, sock_net(sk));
+			match->id = id;
+			match->type = type;
+			atomic_set(&match->rr_cur, 0);
+			INIT_LIST_HEAD(&match->list);
+			spin_lock_init(&match->lock);
+			atomic_set(&match->sk_ref, 0);
+			match->prot_hook.type = po->prot_hook.type;
+			match->prot_hook.dev = po->prot_hook.dev;
+			switch (type) {
+			case PACKET_FANOUT_HASH:
+				match->prot_hook.func = packet_rcv_fanout_hash;
+				break;
+			case PACKET_FANOUT_LB:
+				match->prot_hook.func = packet_rcv_fanout_lb;
+				break;
+			}
+			match->prot_hook.af_packet_priv = match;
+			dev_add_pack(&match->prot_hook);
+			list_add(&match->list, &fanout_list);
+		}
+	}
+	err = -ENOMEM;
+	if (match) {
+		err = -EINVAL;
+		if (match->type == type &&
+		    match->prot_hook.type == po->prot_hook.type &&
+		    match->prot_hook.dev == po->prot_hook.dev) {
+			err = -ENOSPC;
+			if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
+				__dev_remove_pack(&po->prot_hook);
+				po->fanout = match;
+				atomic_inc(&match->sk_ref);
+				__fanout_link(sk, po);
+				err = 0;
+			}
+		}
+	}
+	mutex_unlock(&fanout_mutex);
+	return err;
+}
+
+static void fanout_release(struct sock *sk)
+{
+	struct packet_sock *po = pkt_sk(sk);
+	struct packet_fanout *f;
+
+	f = po->fanout;
+	if (!f)
+		return;
+
+	po->fanout = NULL;
+
+	mutex_lock(&fanout_mutex);
+	if (atomic_dec_and_test(&f->sk_ref)) {
+		list_del(&f->list);
+		dev_remove_pack(&f->prot_hook);
+		kfree(f);
+	}
+	mutex_unlock(&fanout_mutex);
+}
 
 static const struct proto_ops packet_ops;
 
@@ -1398,6 +1622,8 @@ static int packet_release(struct socket *sock)
 	if (po->tx_ring.pg_vec)
 		packet_set_ring(sk, &req, 1, 1);
 
+	fanout_release(sk);
+
 	synchronize_net();
 	/*
 	 *	Now the socket is dead. No more input will appear.
@@ -1421,9 +1647,9 @@ static int packet_release(struct socket *sock)
 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
 {
 	struct packet_sock *po = pkt_sk(sk);
-	/*
-	 *	Detach an existing hook if present.
-	 */
+
+	if (po->fanout)
+		return -EINVAL;
 
 	lock_sock(sk);
 
@@ -2133,6 +2359,17 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
 		po->tp_tstamp = val;
 		return 0;
 	}
+	case PACKET_FANOUT:
+	{
+		int val;
+
+		if (optlen != sizeof(val))
+			return -EINVAL;
+		if (copy_from_user(&val, optval, sizeof(val)))
+			return -EFAULT;
+
+		return fanout_add(sk, val & 0xffff, val >> 16);
+	}
 	default:
 		return -ENOPROTOOPT;
 	}
@@ -2231,6 +2468,15 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
 		val = po->tp_tstamp;
 		data = &val;
 		break;
+	case PACKET_FANOUT:
+		if (len > sizeof(int))
+			len = sizeof(int);
+		val = (po->fanout ?
+		       ((u32)po->fanout->id |
+			((u32)po->fanout->type << 16)) :
+		       0);
+		data = &val;
+		break;
 	default:
 		return -ENOPROTOOPT;
 	}
-- 
cgit v1.2.3


From 7736d33f4262d437c51ed7a28114eacbfca236ff Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 5 Jul 2011 01:43:20 -0700
Subject: packet: Add pre-defragmentation support for ipv4 fanouts.

The skb->rxhash cannot be properly computed if the
packet is a fragment.  To alleviate this, allow the
AF_PACKET client to ask for defragmentation to be
done at demux time.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_packet.h |  1 +
 net/packet/af_packet.c    | 50 +++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 49 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h
index 1efa1cb827f5..84e684e6935c 100644
--- a/include/linux/if_packet.h
+++ b/include/linux/if_packet.h
@@ -53,6 +53,7 @@ struct sockaddr_ll {
 
 #define PACKET_FANOUT_HASH		0
 #define PACKET_FANOUT_LB		1
+#define PACKET_FANOUT_FLAG_DEFRAG	0x8000
 
 struct tpacket_stats {
 	unsigned int	tp_packets;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 3350f1d3c9aa..7ba6871a1942 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -223,7 +223,7 @@ struct packet_fanout {
 	unsigned int		num_members;
 	u16			id;
 	u8			type;
-	u8			pad;
+	u8			defrag;
 	atomic_t		rr_cur;
 	struct list_head	list;
 	struct sock		*arr[PACKET_FANOUT_MAX];
@@ -447,6 +447,41 @@ static struct sock *fanout_demux_lb(struct packet_fanout *f, struct sk_buff *skb
 	return f->arr[cur];
 }
 
+static struct sk_buff *fanout_check_defrag(struct sk_buff *skb)
+{
+	const struct iphdr *iph;
+	u32 len;
+
+	if (skb->protocol != htons(ETH_P_IP))
+		return skb;
+
+	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+		return skb;
+
+	iph = ip_hdr(skb);
+	if (iph->ihl < 5 || iph->version != 4)
+		return skb;
+	if (!pskb_may_pull(skb, iph->ihl*4))
+		return skb;
+	iph = ip_hdr(skb);
+	len = ntohs(iph->tot_len);
+	if (skb->len < len || len < (iph->ihl * 4))
+		return skb;
+
+	if (ip_is_fragment(ip_hdr(skb))) {
+		skb = skb_clone(skb, GFP_ATOMIC);
+		if (skb) {
+			if (pskb_trim_rcsum(skb, len))
+				return skb;
+			memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+			if (ip_defrag(skb, IP_DEFRAG_AF_PACKET))
+				return NULL;
+			skb->rxhash = 0;
+		}
+	}
+	return skb;
+}
+
 static int packet_rcv_fanout_hash(struct sk_buff *skb, struct net_device *dev,
 				  struct packet_type *pt, struct net_device *orig_dev)
 {
@@ -461,6 +496,12 @@ static int packet_rcv_fanout_hash(struct sk_buff *skb, struct net_device *dev,
 		return 0;
 	}
 
+	if (f->defrag) {
+		skb = fanout_check_defrag(skb);
+		if (!skb)
+			return 0;
+	}
+
 	skb_get_rxhash(skb);
 
 	sk = fanout_demux_hash(f, skb, num);
@@ -519,10 +560,12 @@ static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
 	spin_unlock(&f->lock);
 }
 
-static int fanout_add(struct sock *sk, u16 id, u8 type)
+static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
 {
 	struct packet_sock *po = pkt_sk(sk);
 	struct packet_fanout *f, *match;
+	u8 type = type_flags & 0xff;
+	u8 defrag = (type_flags & PACKET_FANOUT_FLAG_DEFRAG) ? 1 : 0;
 	int err;
 
 	switch (type) {
@@ -548,12 +591,15 @@ static int fanout_add(struct sock *sk, u16 id, u8 type)
 			break;
 		}
 	}
+	if (match && match->defrag != defrag)
+		return -EINVAL;
 	if (!match) {
 		match = kzalloc(sizeof(*match), GFP_KERNEL);
 		if (match) {
 			write_pnet(&match->net, sock_net(sk));
 			match->id = id;
 			match->type = type;
+			match->defrag = defrag;
 			atomic_set(&match->rr_cur, 0);
 			INIT_LIST_HEAD(&match->list);
 			spin_lock_init(&match->lock);
-- 
cgit v1.2.3


From 37cf4d1a9b0903b874a638d0f8649873ddde8a12 Mon Sep 17 00:00:00 2001
From: Shmulik Ravid <shmulikr@broadcom.com>
Date: Tue, 5 Jul 2011 06:16:22 +0000
Subject: dcbnl: Aggregated CEE GET operation

The following couple of patches add dcbnl an unsolicited notification of
the the DCB configuration for the CEE flavor of the DCBX protocol. This
is useful when the user-mode DCB client is not responsible for
conducting and resolving the DCBX negotiation (either because the DCBX
stack is embedded in the HW or the negotiation is handled by another
agent in the host), but still needs to get the negotiated parameters.
This functionality already exists for the IEEE flavor of the DCBX
protocol and these patches add it to the older CEE flavor.

The first patch extends the CEE attribute GET operation to include not
only the peer information, but also all the pertinent local
configuration (negotiated parameters). The second patch adds and export
a CEE specific notification routine.

Signed-off-by: Shmulik Ravid <shmulikr@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dcbnl.h |  23 +++++++-
 net/dcb/dcbnl.c       | 159 +++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 173 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dcbnl.h b/include/linux/dcbnl.h
index 66a67235e729..65a2562f66b4 100644
--- a/include/linux/dcbnl.h
+++ b/include/linux/dcbnl.h
@@ -333,18 +333,30 @@ enum ieee_attrs_app {
 #define DCB_ATTR_IEEE_APP_MAX (__DCB_ATTR_IEEE_APP_MAX - 1)
 
 /**
- * enum cee_attrs - CEE DCBX get attributes
+ * enum cee_attrs - CEE DCBX get attributes.
  *
  * @DCB_ATTR_CEE_UNSPEC: unspecified
  * @DCB_ATTR_CEE_PEER_PG: peer PG configuration - get only
  * @DCB_ATTR_CEE_PEER_PFC: peer PFC configuration - get only
- * @DCB_ATTR_CEE_PEER_APP: peer APP tlv - get only
+ * @DCB_ATTR_CEE_PEER_APP_TABLE: peer APP tlv - get only
+ * @DCB_ATTR_CEE_TX_PG: TX PG configuration (DCB_CMD_PGTX_GCFG)
+ * @DCB_ATTR_CEE_RX_PG: RX PG configuration (DCB_CMD_PGRX_GCFG)
+ * @DCB_ATTR_CEE_PFC: PFC configuration (DCB_CMD_PFC_GCFG)
+ * @DCB_ATTR_CEE_APP_TABLE: APP configuration (multi DCB_CMD_GAPP)
+ * @DCB_ATTR_CEE_FEAT: DCBX features flags (DCB_CMD_GFEATCFG)
+ *
+ * An aggregated collection of the cee std negotiated parameters.
  */
 enum cee_attrs {
 	DCB_ATTR_CEE_UNSPEC,
 	DCB_ATTR_CEE_PEER_PG,
 	DCB_ATTR_CEE_PEER_PFC,
 	DCB_ATTR_CEE_PEER_APP_TABLE,
+	DCB_ATTR_CEE_TX_PG,
+	DCB_ATTR_CEE_RX_PG,
+	DCB_ATTR_CEE_PFC,
+	DCB_ATTR_CEE_APP_TABLE,
+	DCB_ATTR_CEE_FEAT,
 	__DCB_ATTR_CEE_MAX
 };
 #define DCB_ATTR_CEE_MAX (__DCB_ATTR_CEE_MAX - 1)
@@ -357,6 +369,13 @@ enum peer_app_attr {
 };
 #define DCB_ATTR_CEE_PEER_APP_MAX (__DCB_ATTR_CEE_PEER_APP_MAX - 1)
 
+enum cee_attrs_app {
+	DCB_ATTR_CEE_APP_UNSPEC,
+	DCB_ATTR_CEE_APP,
+	__DCB_ATTR_CEE_APP_MAX
+};
+#define DCB_ATTR_CEE_APP_MAX (__DCB_ATTR_CEE_APP_MAX - 1)
+
 /**
  * enum dcbnl_pfc_attrs - DCB Priority Flow Control user priority nested attrs
  *
diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c
index fc56e8546261..d5b45a201c1b 100644
--- a/net/dcb/dcbnl.c
+++ b/net/dcb/dcbnl.c
@@ -1642,6 +1642,60 @@ err:
 	return ret;
 }
 
+static int dcbnl_cee_pg_fill(struct sk_buff *skb, struct net_device *dev,
+			     int dir)
+{
+	u8 pgid, up_map, prio, tc_pct;
+	const struct dcbnl_rtnl_ops *ops = dev->dcbnl_ops;
+	int i = dir ? DCB_ATTR_CEE_TX_PG : DCB_ATTR_CEE_RX_PG;
+	struct nlattr *pg = nla_nest_start(skb, i);
+
+	if (!pg)
+		goto nla_put_failure;
+
+	for (i = DCB_PG_ATTR_TC_0; i <= DCB_PG_ATTR_TC_7; i++) {
+		struct nlattr *tc_nest = nla_nest_start(skb, i);
+
+		if (!tc_nest)
+			goto nla_put_failure;
+
+		pgid = DCB_ATTR_VALUE_UNDEFINED;
+		prio = DCB_ATTR_VALUE_UNDEFINED;
+		tc_pct = DCB_ATTR_VALUE_UNDEFINED;
+		up_map = DCB_ATTR_VALUE_UNDEFINED;
+
+		if (!dir)
+			ops->getpgtccfgrx(dev, i - DCB_PG_ATTR_TC_0,
+					  &prio, &pgid, &tc_pct, &up_map);
+		else
+			ops->getpgtccfgtx(dev, i - DCB_PG_ATTR_TC_0,
+					  &prio, &pgid, &tc_pct, &up_map);
+
+		NLA_PUT_U8(skb, DCB_TC_ATTR_PARAM_PGID, pgid);
+		NLA_PUT_U8(skb, DCB_TC_ATTR_PARAM_UP_MAPPING, up_map);
+		NLA_PUT_U8(skb, DCB_TC_ATTR_PARAM_STRICT_PRIO, prio);
+		NLA_PUT_U8(skb, DCB_TC_ATTR_PARAM_BW_PCT, tc_pct);
+		nla_nest_end(skb, tc_nest);
+	}
+
+	for (i = DCB_PG_ATTR_BW_ID_0; i <= DCB_PG_ATTR_BW_ID_7; i++) {
+		tc_pct = DCB_ATTR_VALUE_UNDEFINED;
+
+		if (!dir)
+			ops->getpgbwgcfgrx(dev, i - DCB_PG_ATTR_BW_ID_0,
+					   &tc_pct);
+		else
+			ops->getpgbwgcfgtx(dev, i - DCB_PG_ATTR_BW_ID_0,
+					   &tc_pct);
+		NLA_PUT_U8(skb, i, tc_pct);
+	}
+	nla_nest_end(skb, pg);
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
 /* Handle CEE DCBX GET commands. */
 static int dcbnl_cee_get(struct net_device *netdev, struct nlattr **tb,
 			 u32 pid, u32 seq, u16 flags)
@@ -1649,9 +1703,11 @@ static int dcbnl_cee_get(struct net_device *netdev, struct nlattr **tb,
 	struct sk_buff *skb;
 	struct nlmsghdr *nlh;
 	struct dcbmsg *dcb;
-	struct nlattr *cee;
+	struct nlattr *cee, *app;
+	struct dcb_app_type *itr;
 	const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
-	int err;
+	int dcbx, i, err = -EMSGSIZE;
+	u8 value;
 
 	if (!ops)
 		return -EOPNOTSUPP;
@@ -1672,7 +1728,88 @@ static int dcbnl_cee_get(struct net_device *netdev, struct nlattr **tb,
 	if (!cee)
 		goto nla_put_failure;
 
-	/* get peer info if available */
+	/* local pg */
+	if (ops->getpgtccfgtx && ops->getpgbwgcfgtx) {
+		err = dcbnl_cee_pg_fill(skb, netdev, 1);
+		if (err)
+			goto nla_put_failure;
+	}
+
+	if (ops->getpgtccfgrx && ops->getpgbwgcfgrx) {
+		err = dcbnl_cee_pg_fill(skb, netdev, 0);
+		if (err)
+			goto nla_put_failure;
+	}
+
+	/* local pfc */
+	if (ops->getpfccfg) {
+		struct nlattr *pfc_nest = nla_nest_start(skb, DCB_ATTR_CEE_PFC);
+
+		if (!pfc_nest)
+			goto nla_put_failure;
+
+		for (i = DCB_PFC_UP_ATTR_0; i <= DCB_PFC_UP_ATTR_7; i++) {
+			ops->getpfccfg(netdev, i - DCB_PFC_UP_ATTR_0, &value);
+			NLA_PUT_U8(skb, i, value);
+		}
+		nla_nest_end(skb, pfc_nest);
+	}
+
+	/* local app */
+	spin_lock(&dcb_lock);
+	app = nla_nest_start(skb, DCB_ATTR_CEE_APP_TABLE);
+	if (!app)
+		goto nla_put_failure;
+
+	list_for_each_entry(itr, &dcb_app_list, list) {
+		if (strncmp(itr->name, netdev->name, IFNAMSIZ) == 0) {
+			struct nlattr *app_nest = nla_nest_start(skb,
+								 DCB_ATTR_APP);
+			if (!app_nest)
+				goto dcb_unlock;
+
+			err = nla_put_u8(skb, DCB_APP_ATTR_IDTYPE,
+					 itr->app.selector);
+			if (err)
+				goto dcb_unlock;
+
+			err = nla_put_u16(skb, DCB_APP_ATTR_ID,
+					  itr->app.protocol);
+			if (err)
+				goto dcb_unlock;
+
+			err = nla_put_u8(skb, DCB_APP_ATTR_PRIORITY,
+					 itr->app.priority);
+			if (err)
+				goto dcb_unlock;
+
+			nla_nest_end(skb, app_nest);
+		}
+	}
+	nla_nest_end(skb, app);
+
+	if (netdev->dcbnl_ops->getdcbx)
+		dcbx = netdev->dcbnl_ops->getdcbx(netdev);
+	else
+		dcbx = -EOPNOTSUPP;
+
+	spin_unlock(&dcb_lock);
+
+	/* features flags */
+	if (ops->getfeatcfg) {
+		struct nlattr *feat = nla_nest_start(skb, DCB_ATTR_CEE_FEAT);
+		if (!feat)
+			goto nla_put_failure;
+
+		for (i = DCB_FEATCFG_ATTR_ALL + 1; i <= DCB_FEATCFG_ATTR_MAX;
+		     i++)
+			if (!ops->getfeatcfg(netdev, i, &value))
+				NLA_PUT_U8(skb, i, value);
+
+		nla_nest_end(skb, feat);
+	}
+
+	/* peer info if available */
 	if (ops->cee_peer_getpg) {
 		struct cee_pg pg;
 		err = ops->cee_peer_getpg(netdev, &pg);
@@ -1695,16 +1832,24 @@ static int dcbnl_cee_get(struct net_device *netdev, struct nlattr **tb,
 		if (err)
 			goto nla_put_failure;
 	}
-
 	nla_nest_end(skb, cee);
-	nlmsg_end(skb, nlh);
 
+	/* DCBX state */
+	if (dcbx >= 0) {
+		err = nla_put_u8(skb, DCB_ATTR_DCBX, dcbx);
+		if (err)
+			goto nla_put_failure;
+	}
+	nlmsg_end(skb, nlh);
 	return rtnl_unicast(skb, &init_net, pid);
+
+dcb_unlock:
+	spin_unlock(&dcb_lock);
 nla_put_failure:
 	nlmsg_cancel(skb, nlh);
 nlmsg_failure:
-	kfree_skb(skb);
-	return -1;
+	nlmsg_free(skb);
+	return err;
 }
 
 static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
-- 
cgit v1.2.3


From 95ec3eb417115fbb2c73b59e2825f6dd5d2f6cf6 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 6 Jul 2011 01:56:38 -0700
Subject: packet: Add 'cpu' fanout policy.

Unfortunately we have to use a real modulus here as
the multiply trick won't work as effectively with cpu
numbers as it does with rxhash values.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_packet.h |  1 +
 net/packet/af_packet.c    | 65 ++++++++++++++++++++---------------------------
 2 files changed, 29 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h
index 84e684e6935c..c1486060f5ed 100644
--- a/include/linux/if_packet.h
+++ b/include/linux/if_packet.h
@@ -53,6 +53,7 @@ struct sockaddr_ll {
 
 #define PACKET_FANOUT_HASH		0
 #define PACKET_FANOUT_LB		1
+#define PACKET_FANOUT_CPU		2
 #define PACKET_FANOUT_FLAG_DEFRAG	0x8000
 
 struct tpacket_stats {
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 7ba6871a1942..41f0489ff665 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -447,6 +447,13 @@ static struct sock *fanout_demux_lb(struct packet_fanout *f, struct sk_buff *skb
 	return f->arr[cur];
 }
 
+static struct sock *fanout_demux_cpu(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
+{
+	unsigned int cpu = smp_processor_id();
+
+	return f->arr[cpu % num];
+}
+
 static struct sk_buff *fanout_check_defrag(struct sk_buff *skb)
 {
 	const struct iphdr *iph;
@@ -482,8 +489,8 @@ static struct sk_buff *fanout_check_defrag(struct sk_buff *skb)
 	return skb;
 }
 
-static int packet_rcv_fanout_hash(struct sk_buff *skb, struct net_device *dev,
-				  struct packet_type *pt, struct net_device *orig_dev)
+static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
+			     struct packet_type *pt, struct net_device *orig_dev)
 {
 	struct packet_fanout *f = pt->af_packet_priv;
 	unsigned int num = f->num_members;
@@ -496,35 +503,25 @@ static int packet_rcv_fanout_hash(struct sk_buff *skb, struct net_device *dev,
 		return 0;
 	}
 
-	if (f->defrag) {
-		skb = fanout_check_defrag(skb);
-		if (!skb)
-			return 0;
-	}
-
-	skb_get_rxhash(skb);
-
-	sk = fanout_demux_hash(f, skb, num);
-	po = pkt_sk(sk);
-
-	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
-}
-
-static int packet_rcv_fanout_lb(struct sk_buff *skb, struct net_device *dev,
-				struct packet_type *pt, struct net_device *orig_dev)
-{
-	struct packet_fanout *f = pt->af_packet_priv;
-	unsigned int num = f->num_members;
-	struct packet_sock *po;
-	struct sock *sk;
-
-	if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
-	    !num) {
-		kfree_skb(skb);
-		return 0;
+	switch (f->type) {
+	case PACKET_FANOUT_HASH:
+	default:
+		if (f->defrag) {
+			skb = fanout_check_defrag(skb);
+			if (!skb)
+				return 0;
+		}
+		skb_get_rxhash(skb);
+		sk = fanout_demux_hash(f, skb, num);
+		break;
+	case PACKET_FANOUT_LB:
+		sk = fanout_demux_lb(f, skb, num);
+		break;
+	case PACKET_FANOUT_CPU:
+		sk = fanout_demux_cpu(f, skb, num);
+		break;
 	}
 
-	sk = fanout_demux_lb(f, skb, num);
 	po = pkt_sk(sk);
 
 	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
@@ -571,6 +568,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
 	switch (type) {
 	case PACKET_FANOUT_HASH:
 	case PACKET_FANOUT_LB:
+	case PACKET_FANOUT_CPU:
 		break;
 	default:
 		return -EINVAL;
@@ -606,14 +604,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
 			atomic_set(&match->sk_ref, 0);
 			match->prot_hook.type = po->prot_hook.type;
 			match->prot_hook.dev = po->prot_hook.dev;
-			switch (type) {
-			case PACKET_FANOUT_HASH:
-				match->prot_hook.func = packet_rcv_fanout_hash;
-				break;
-			case PACKET_FANOUT_LB:
-				match->prot_hook.func = packet_rcv_fanout_lb;
-				break;
-			}
+			match->prot_hook.func = packet_rcv_fanout;
 			match->prot_hook.af_packet_priv = match;
 			dev_add_pack(&match->prot_hook);
 			list_add(&match->list, &fanout_list);
-- 
cgit v1.2.3


From e5497d766adb92bcbd1fa4a147e188f84f34b20a Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 5 Jul 2011 16:35:40 +0200
Subject: cfg80211/nl80211: support GTK rekey offload

In certain circumstances, like WoWLAN scenarios,
devices may implement (partial) GTK rekeying on
the device to avoid waking up the host for it.

In order to successfully go through GTK rekeying,
the KEK, KCK and the replay counter are required.

Add API to let the supplicant hand the parameters
to the driver which may store it for future GTK
rekey operations.

Note that, of course, if GTK rekeying is done by
the device, the EAP frame must not be passed up
to userspace, instead a rekey event needs to be
sent to let userspace update its replay counter.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 include/linux/nl80211.h |  39 +++++++++++++++++
 include/net/cfg80211.h  |  26 +++++++++++
 net/wireless/mlme.c     |  11 +++++
 net/wireless/nl80211.c  | 113 ++++++++++++++++++++++++++++++++++++++++++++++++
 net/wireless/nl80211.h  |   4 ++
 5 files changed, 193 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h
index c7ccaae15af6..3ec2f949bf7a 100644
--- a/include/linux/nl80211.h
+++ b/include/linux/nl80211.h
@@ -483,6 +483,14 @@
  *	more background information, see
  *	http://wireless.kernel.org/en/users/Documentation/WoWLAN.
  *
+ * @NL80211_CMD_SET_REKEY_OFFLOAD: This command is used give the driver
+ *	the necessary information for supporting GTK rekey offload. This
+ *	feature is typically used during WoWLAN. The configuration data
+ *	is contained in %NL80211_ATTR_REKEY_DATA (which is nested and
+ *	contains the data in sub-attributes). After rekeying happened,
+ *	this command may also be sent by the driver as an MLME event to
+ *	inform userspace of the new replay counter.
+ *
  * @NL80211_CMD_MAX: highest used command number
  * @__NL80211_CMD_AFTER_LAST: internal use
  */
@@ -605,6 +613,8 @@ enum nl80211_commands {
 	NL80211_CMD_SCHED_SCAN_RESULTS,
 	NL80211_CMD_SCHED_SCAN_STOPPED,
 
+	NL80211_CMD_SET_REKEY_OFFLOAD,
+
 	/* add new commands above here */
 
 	/* used to define NL80211_CMD_MAX below */
@@ -996,6 +1006,9 @@ enum nl80211_commands {
  *	are managed in software: interfaces of these types aren't subject to
  *	any restrictions in their number or combinations.
  *
+ * @%NL80211_ATTR_REKEY_DATA: nested attribute containing the information
+ *	necessary for GTK rekeying in the device, see &enum nl80211_rekey_data.
+ *
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
  */
@@ -1194,6 +1207,8 @@ enum nl80211_attrs {
 	NL80211_ATTR_INTERFACE_COMBINATIONS,
 	NL80211_ATTR_SOFTWARE_IFTYPES,
 
+	NL80211_ATTR_REKEY_DATA,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
@@ -2361,4 +2376,28 @@ enum nl80211_plink_state {
 	MAX_NL80211_PLINK_STATES = NUM_NL80211_PLINK_STATES - 1
 };
 
+#define NL80211_KCK_LEN			16
+#define NL80211_KEK_LEN			16
+#define NL80211_REPLAY_CTR_LEN		8
+
+/**
+ * enum nl80211_rekey_data - attributes for GTK rekey offload
+ * @__NL80211_REKEY_DATA_INVALID: invalid number for nested attributes
+ * @NL80211_REKEY_DATA_KEK: key encryption key (binary)
+ * @NL80211_REKEY_DATA_KCK: key confirmation key (binary)
+ * @NL80211_REKEY_DATA_REPLAY_CTR: replay counter (binary)
+ * @NUM_NL80211_REKEY_DATA: number of rekey attributes (internal)
+ * @MAX_NL80211_REKEY_DATA: highest rekey attribute (internal)
+ */
+enum nl80211_rekey_data {
+	__NL80211_REKEY_DATA_INVALID,
+	NL80211_REKEY_DATA_KEK,
+	NL80211_REKEY_DATA_KCK,
+	NL80211_REKEY_DATA_REPLAY_CTR,
+
+	/* keep last */
+	NUM_NL80211_REKEY_DATA,
+	MAX_NL80211_REKEY_DATA = NUM_NL80211_REKEY_DATA - 1
+};
+
 #endif /* __LINUX_NL80211_H */
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 7202bce7bfeb..4bf101bada4e 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1153,6 +1153,18 @@ struct cfg80211_wowlan {
 	int n_patterns;
 };
 
+/**
+ * struct cfg80211_gtk_rekey_data - rekey data
+ * @kek: key encryption key
+ * @kck: key confirmation key
+ * @replay_ctr: replay counter
+ */
+struct cfg80211_gtk_rekey_data {
+	u8 kek[NL80211_KEK_LEN];
+	u8 kck[NL80211_KCK_LEN];
+	u8 replay_ctr[NL80211_REPLAY_CTR_LEN];
+};
+
 /**
  * struct cfg80211_ops - backend description for wireless configuration
  *
@@ -1197,6 +1209,8 @@ struct cfg80211_wowlan {
  *
  * @set_default_mgmt_key: set the default management frame key on an interface
  *
+ * @set_rekey_data: give the data necessary for GTK rekeying to the driver
+ *
  * @add_beacon: Add a beacon with given parameters, @head, @interval
  *	and @dtim_period will be valid, @tail is optional.
  * @set_beacon: Change the beacon parameters for an access point mode
@@ -1499,6 +1513,9 @@ struct cfg80211_ops {
 				struct net_device *dev,
 				struct cfg80211_sched_scan_request *request);
 	int	(*sched_scan_stop)(struct wiphy *wiphy, struct net_device *dev);
+
+	int	(*set_rekey_data)(struct wiphy *wiphy, struct net_device *dev,
+				  struct cfg80211_gtk_rekey_data *data);
 };
 
 /*
@@ -3033,6 +3050,15 @@ void cfg80211_cqm_rssi_notify(struct net_device *dev,
 void cfg80211_cqm_pktloss_notify(struct net_device *dev,
 				 const u8 *peer, u32 num_packets, gfp_t gfp);
 
+/**
+ * cfg80211_gtk_rekey_notify - notify userspace about driver rekeying
+ * @dev: network device
+ * @bssid: BSSID of AP (to avoid races)
+ * @replay_ctr: new replay counter
+ */
+void cfg80211_gtk_rekey_notify(struct net_device *dev, const u8 *bssid,
+			       const u8 *replay_ctr, gfp_t gfp);
+
 /* Logging, debugging and troubleshooting/diagnostic helpers. */
 
 /* wiphy_printk helpers, similar to dev_printk */
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index 3633ab6af184..832f6574e4ed 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -1084,3 +1084,14 @@ void cfg80211_cqm_pktloss_notify(struct net_device *dev,
 	nl80211_send_cqm_pktloss_notify(rdev, dev, peer, num_packets, gfp);
 }
 EXPORT_SYMBOL(cfg80211_cqm_pktloss_notify);
+
+void cfg80211_gtk_rekey_notify(struct net_device *dev, const u8 *bssid,
+			       const u8 *replay_ctr, gfp_t gfp)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct wiphy *wiphy = wdev->wiphy;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy);
+
+	nl80211_gtk_rekey_notify(rdev, dev, bssid, replay_ctr, gfp);
+}
+EXPORT_SYMBOL(cfg80211_gtk_rekey_notify);
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 50ff82aa4890..491b0ba40c43 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -176,6 +176,7 @@ static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = {
 	[NL80211_ATTR_WOWLAN_TRIGGERS] = { .type = NLA_NESTED },
 	[NL80211_ATTR_STA_PLINK_STATE] = { .type = NLA_U8 },
 	[NL80211_ATTR_SCHED_SCAN_INTERVAL] = { .type = NLA_U32 },
+	[NL80211_ATTR_REKEY_DATA] = { .type = NLA_NESTED },
 };
 
 /* policy for the key attributes */
@@ -206,6 +207,14 @@ nl80211_wowlan_policy[NUM_NL80211_WOWLAN_TRIG] = {
 	[NL80211_WOWLAN_TRIG_PKT_PATTERN] = { .type = NLA_NESTED },
 };
 
+/* policy for GTK rekey offload attributes */
+static const struct nla_policy
+nl80211_rekey_policy[NUM_NL80211_REKEY_DATA] = {
+	[NL80211_REKEY_DATA_KEK] = { .len = NL80211_KEK_LEN },
+	[NL80211_REKEY_DATA_KCK] = { .len = NL80211_KCK_LEN },
+	[NL80211_REKEY_DATA_REPLAY_CTR] = { .len = NL80211_REPLAY_CTR_LEN },
+};
+
 /* ifidx get helper */
 static int nl80211_get_ifidx(struct netlink_callback *cb)
 {
@@ -5408,6 +5417,57 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info)
 	return err;
 }
 
+static int nl80211_set_rekey_data(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct nlattr *tb[NUM_NL80211_REKEY_DATA];
+	struct cfg80211_gtk_rekey_data rekey_data;
+	int err;
+
+	if (!info->attrs[NL80211_ATTR_REKEY_DATA])
+		return -EINVAL;
+
+	err = nla_parse(tb, MAX_NL80211_REKEY_DATA,
+			nla_data(info->attrs[NL80211_ATTR_REKEY_DATA]),
+			nla_len(info->attrs[NL80211_ATTR_REKEY_DATA]),
+			nl80211_rekey_policy);
+	if (err)
+		return err;
+
+	if (nla_len(tb[NL80211_REKEY_DATA_REPLAY_CTR]) != NL80211_REPLAY_CTR_LEN)
+		return -ERANGE;
+	if (nla_len(tb[NL80211_REKEY_DATA_KEK]) != NL80211_KEK_LEN)
+		return -ERANGE;
+	if (nla_len(tb[NL80211_REKEY_DATA_KCK]) != NL80211_KCK_LEN)
+		return -ERANGE;
+
+	memcpy(rekey_data.kek, nla_data(tb[NL80211_REKEY_DATA_KEK]),
+	       NL80211_KEK_LEN);
+	memcpy(rekey_data.kck, nla_data(tb[NL80211_REKEY_DATA_KCK]),
+	       NL80211_KCK_LEN);
+	memcpy(rekey_data.replay_ctr,
+	       nla_data(tb[NL80211_REKEY_DATA_REPLAY_CTR]),
+	       NL80211_REPLAY_CTR_LEN);
+
+	wdev_lock(wdev);
+	if (!wdev->current_bss) {
+		err = -ENOTCONN;
+		goto out;
+	}
+
+	if (!rdev->ops->set_rekey_data) {
+		err = -EOPNOTSUPP;
+		goto out;
+	}
+
+	err = rdev->ops->set_rekey_data(&rdev->wiphy, dev, &rekey_data);
+ out:
+	wdev_unlock(wdev);
+	return err;
+}
+
 #define NL80211_FLAG_NEED_WIPHY		0x01
 #define NL80211_FLAG_NEED_NETDEV	0x02
 #define NL80211_FLAG_NEED_RTNL		0x04
@@ -5939,6 +5999,14 @@ static struct genl_ops nl80211_ops[] = {
 		.internal_flags = NL80211_FLAG_NEED_WIPHY |
 				  NL80211_FLAG_NEED_RTNL,
 	},
+	{
+		.cmd = NL80211_CMD_SET_REKEY_OFFLOAD,
+		.doit = nl80211_set_rekey_data,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
 };
 
 static struct genl_multicast_group nl80211_mlme_mcgrp = {
@@ -6883,6 +6951,51 @@ nl80211_send_cqm_rssi_notify(struct cfg80211_registered_device *rdev,
 	nlmsg_free(msg);
 }
 
+void nl80211_gtk_rekey_notify(struct cfg80211_registered_device *rdev,
+			      struct net_device *netdev, const u8 *bssid,
+			      const u8 *replay_ctr, gfp_t gfp)
+{
+	struct sk_buff *msg;
+	struct nlattr *rekey_attr;
+	void *hdr;
+
+	msg = nlmsg_new(NLMSG_GOODSIZE, gfp);
+	if (!msg)
+		return;
+
+	hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_SET_REKEY_OFFLOAD);
+	if (!hdr) {
+		nlmsg_free(msg);
+		return;
+	}
+
+	NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx);
+	NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex);
+	NLA_PUT(msg, NL80211_ATTR_MAC, ETH_ALEN, bssid);
+
+	rekey_attr = nla_nest_start(msg, NL80211_ATTR_REKEY_DATA);
+	if (!rekey_attr)
+		goto nla_put_failure;
+
+	NLA_PUT(msg, NL80211_REKEY_DATA_REPLAY_CTR,
+		NL80211_REPLAY_CTR_LEN, replay_ctr);
+
+	nla_nest_end(msg, rekey_attr);
+
+	if (genlmsg_end(msg, hdr) < 0) {
+		nlmsg_free(msg);
+		return;
+	}
+
+	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0,
+				nl80211_mlme_mcgrp.id, gfp);
+	return;
+
+ nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	nlmsg_free(msg);
+}
+
 void
 nl80211_send_cqm_pktloss_notify(struct cfg80211_registered_device *rdev,
 				struct net_device *netdev, const u8 *peer,
diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h
index 2f1bfb87a651..5d69c56400ae 100644
--- a/net/wireless/nl80211.h
+++ b/net/wireless/nl80211.h
@@ -109,4 +109,8 @@ nl80211_send_cqm_pktloss_notify(struct cfg80211_registered_device *rdev,
 				struct net_device *netdev, const u8 *peer,
 				u32 num_packets, gfp_t gfp);
 
+void nl80211_gtk_rekey_notify(struct cfg80211_registered_device *rdev,
+			      struct net_device *netdev, const u8 *bssid,
+			      const u8 *replay_ctr, gfp_t gfp);
+
 #endif /* __NET_WIRELESS_NL80211_H */
-- 
cgit v1.2.3


From 0e373639ad7c7ef2b0c9cf907574b266791b9778 Mon Sep 17 00:00:00 2001
From: Rob Herring <rob.herring@calxeda.com>
Date: Wed, 6 Jul 2011 15:42:58 -0500
Subject: dt: add helper function to read u32 arrays

Rework of_property_read_u32 to read an array of values. Then
of_property_read_u32 becomes an inline with array size of 1.

Also make struct device_node ptr const.

Signed-off-by: Rob Herring <rob.herring@calxeda.com>
Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
---
 drivers/of/base.c  | 19 +++++++++++++------
 include/linux/of.h | 14 ++++++++++++--
 2 files changed, 25 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/of/base.c b/drivers/of/base.c
index 57ec27bed44f..02ed36719def 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -596,32 +596,39 @@ struct device_node *of_find_node_by_phandle(phandle handle)
 EXPORT_SYMBOL(of_find_node_by_phandle);
 
 /**
- * of_property_read_u32 - Find and read a 32 bit integer from a property
+ * of_property_read_u32_array - Find and read an array of 32 bit integers
+ * from a property.
+ *
  * @np:		device node from which the property value is to be read.
  * @propname:	name of the property to be searched.
  * @out_value:	pointer to return value, modified only if return value is 0.
  *
- * Search for a property in a device node and read a 32-bit value from
+ * Search for a property in a device node and read 32-bit value(s) from
  * it. Returns 0 on success, -EINVAL if the property does not exist,
  * -ENODATA if property does not have a value, and -EOVERFLOW if the
  * property data isn't large enough.
  *
  * The out_value is modified only if a valid u32 value can be decoded.
  */
-int of_property_read_u32(struct device_node *np, char *propname, u32 *out_value)
+int of_property_read_u32_array(const struct device_node *np, char *propname,
+			       u32 *out_values, size_t sz)
 {
 	struct property *prop = of_find_property(np, propname, NULL);
+	const __be32 *val;
 
 	if (!prop)
 		return -EINVAL;
 	if (!prop->value)
 		return -ENODATA;
-	if (sizeof(*out_value) > prop->length)
+	if ((sz * sizeof(*out_values)) > prop->length)
 		return -EOVERFLOW;
-	*out_value = be32_to_cpup(prop->value);
+
+	val = prop->value;
+	while (sz--)
+		*out_values++ = be32_to_cpup(val++);
 	return 0;
 }
-EXPORT_SYMBOL_GPL(of_property_read_u32);
+EXPORT_SYMBOL_GPL(of_property_read_u32_array);
 
 /**
  * of_property_read_string - Find and read a string from a property
diff --git a/include/linux/of.h b/include/linux/of.h
index b23852002b30..b5f1c88e40a7 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -195,8 +195,18 @@ extern struct device_node *of_find_node_with_property(
 extern struct property *of_find_property(const struct device_node *np,
 					 const char *name,
 					 int *lenp);
-extern int of_property_read_u32(struct device_node *np, char *propname,
-					u32 *out_value);
+extern int of_property_read_u32_array(const struct device_node *np,
+				      char *propname,
+				      u32 *out_values,
+				      size_t sz);
+
+static inline int of_property_read_u32(const struct device_node *np,
+				       char *propname,
+				       u32 *out_value)
+{
+	return of_property_read_u32_array(np, propname, out_value, 1);
+}
+
 extern int of_property_read_string(struct device_node *np, char *propname,
 					const char **out_string);
 extern int of_device_is_compatible(const struct device_node *device,
-- 
cgit v1.2.3


From a6686f2f382b13f8a7253401a66690c3633b6a74 Mon Sep 17 00:00:00 2001
From: Shirley Ma <mashirle@us.ibm.com>
Date: Wed, 6 Jul 2011 12:22:12 +0000
Subject: skbuff: skb supports zero-copy buffers

This patch adds userspace buffers support in skb shared info. A new
struct skb_ubuf_info is needed to maintain the userspace buffers
argument and index, a callback is used to notify userspace to release
the buffers once lower device has done DMA (Last reference to that skb
has gone).

If there is any userspace apps to reference these userspace buffers,
then these userspaces buffers will be copied into kernel. This way we
can prevent userspace apps from holding these userspace buffers too long.

Use destructor_arg to point to the userspace buffer info; a new tx flags
SKBTX_DEV_ZEROCOPY is added for zero-copy buffer check.

Signed-off-by: Shirley Ma <xma@...ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 16 ++++++++++
 net/core/skbuff.c      | 80 +++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 95 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 3e543371254e..08d4507715f5 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -187,6 +187,20 @@ enum {
 
 	/* ensure the originating sk reference is available on driver level */
 	SKBTX_DRV_NEEDS_SK_REF = 1 << 3,
+
+	/* device driver supports TX zero-copy buffers */
+	SKBTX_DEV_ZEROCOPY = 1 << 4,
+};
+
+/*
+ * The callback notifies userspace to release buffers when skb DMA is done in
+ * lower device, the skb last reference should be 0 when calling this.
+ * The desc is used to track userspace buffer index.
+ */
+struct ubuf_info {
+	void (*callback)(void *);
+	void *arg;
+	unsigned long desc;
 };
 
 /* This data is invariant across clones and lives at
@@ -211,6 +225,7 @@ struct skb_shared_info {
 	/* Intermediate layers must ensure that destructor_arg
 	 * remains valid until skb destructor */
 	void *		destructor_arg;
+
 	/* must be last field, see pskb_expand_head() */
 	skb_frag_t	frags[MAX_SKB_FRAGS];
 };
@@ -2265,5 +2280,6 @@ static inline void skb_checksum_none_assert(struct sk_buff *skb)
 }
 
 bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off);
+
 #endif	/* __KERNEL__ */
 #endif	/* _LINUX_SKBUFF_H */
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 46cbd28f40f9..a9577a2f3a43 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -329,6 +329,18 @@ static void skb_release_data(struct sk_buff *skb)
 				put_page(skb_shinfo(skb)->frags[i].page);
 		}
 
+		/*
+		 * If skb buf is from userspace, we need to notify the caller
+		 * the lower device DMA has done;
+		 */
+		if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
+			struct ubuf_info *uarg;
+
+			uarg = skb_shinfo(skb)->destructor_arg;
+			if (uarg->callback)
+				uarg->callback(uarg);
+		}
+
 		if (skb_has_frag_list(skb))
 			skb_drop_fraglist(skb);
 
@@ -481,6 +493,9 @@ bool skb_recycle_check(struct sk_buff *skb, int skb_size)
 	if (irqs_disabled())
 		return false;
 
+	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY)
+		return false;
+
 	if (skb_is_nonlinear(skb) || skb->fclone != SKB_FCLONE_UNAVAILABLE)
 		return false;
 
@@ -596,6 +611,51 @@ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
 }
 EXPORT_SYMBOL_GPL(skb_morph);
 
+/* skb frags copy userspace buffers to kernel */
+static int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
+{
+	int i;
+	int num_frags = skb_shinfo(skb)->nr_frags;
+	struct page *page, *head = NULL;
+	struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg;
+
+	for (i = 0; i < num_frags; i++) {
+		u8 *vaddr;
+		skb_frag_t *f = &skb_shinfo(skb)->frags[i];
+
+		page = alloc_page(GFP_ATOMIC);
+		if (!page) {
+			while (head) {
+				struct page *next = (struct page *)head->private;
+				put_page(head);
+				head = next;
+			}
+			return -ENOMEM;
+		}
+		vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]);
+		memcpy(page_address(page),
+		       vaddr + f->page_offset, f->size);
+		kunmap_skb_frag(vaddr);
+		page->private = (unsigned long)head;
+		head = page;
+	}
+
+	/* skb frags release userspace buffers */
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+		put_page(skb_shinfo(skb)->frags[i].page);
+
+	uarg->callback(uarg);
+
+	/* skb frags point to kernel buffers */
+	for (i = skb_shinfo(skb)->nr_frags; i > 0; i--) {
+		skb_shinfo(skb)->frags[i - 1].page_offset = 0;
+		skb_shinfo(skb)->frags[i - 1].page = head;
+		head = (struct page *)head->private;
+	}
+	return 0;
+}
+
+
 /**
  *	skb_clone	-	duplicate an sk_buff
  *	@skb: buffer to clone
@@ -614,6 +674,11 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
 {
 	struct sk_buff *n;
 
+	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
+		if (skb_copy_ubufs(skb, gfp_mask))
+			return NULL;
+	}
+
 	n = skb + 1;
 	if (skb->fclone == SKB_FCLONE_ORIG &&
 	    n->fclone == SKB_FCLONE_UNAVAILABLE) {
@@ -731,6 +796,12 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
 	if (skb_shinfo(skb)->nr_frags) {
 		int i;
 
+		if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
+			if (skb_copy_ubufs(skb, gfp_mask)) {
+				kfree(n);
+				goto out;
+			}
+		}
 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 			skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
 			get_page(skb_shinfo(n)->frags[i].page);
@@ -788,7 +859,6 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 		fastpath = true;
 	else {
 		int delta = skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1;
-
 		fastpath = atomic_read(&skb_shinfo(skb)->dataref) == delta;
 	}
 
@@ -819,6 +889,11 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 	if (fastpath) {
 		kfree(skb->head);
 	} else {
+		/* copy this zero copy skb frags */
+		if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
+			if (skb_copy_ubufs(skb, gfp_mask))
+				goto nofrags;
+		}
 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
 			get_page(skb_shinfo(skb)->frags[i].page);
 
@@ -853,6 +928,8 @@ adjust_others:
 	atomic_set(&skb_shinfo(skb)->dataref, 1);
 	return 0;
 
+nofrags:
+	kfree(data);
 nodata:
 	return -ENOMEM;
 }
@@ -1354,6 +1431,7 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
 		}
 		start = end;
 	}
+
 	if (!len)
 		return 0;
 
-- 
cgit v1.2.3


From 43dd61c9a09bd413e837df829e6bfb42159be52a Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 7 Jul 2011 11:09:22 -0400
Subject: ftrace: Fix regression of :mod:module function enabling

The new code that allows different utilities to pick and choose
what functions they trace broke the :mod: hook that allows users
to trace only functions of a particular module.

The reason is that the :mod: hook bypasses the hash that is setup
to allow individual users to trace their own functions and uses
the global hash directly. But if the global hash has not been
set up, it will cause a bug:

echo '*:mod:radeon' > /sys/kernel/debug/set_ftrace_filter

produces:

 [drm:drm_mode_getfb] *ERROR* invalid framebuffer id
 [drm:radeon_crtc_page_flip] *ERROR* failed to reserve new rbo buffer before flip
 BUG: unable to handle kernel paging request at ffffffff8160ec90
 IP: [<ffffffff810d9136>] add_hash_entry+0x66/0xd0
 PGD 1a05067 PUD 1a09063 PMD 80000000016001e1
 Oops: 0003 [#1] SMP Jul  7 04:02:28 phyllis kernel: [55303.858604] CPU 1
 Modules linked in: cryptd aes_x86_64 aes_generic binfmt_misc rfcomm bnep ip6table_filter hid radeon r8169 ahci libahci mii ttm drm_kms_helper drm video i2c_algo_bit intel_agp intel_gtt

 Pid: 10344, comm: bash Tainted: G        WC  3.0.0-rc5 #1 Dell Inc. Inspiron N5010/0YXXJJ
 RIP: 0010:[<ffffffff810d9136>]  [<ffffffff810d9136>] add_hash_entry+0x66/0xd0
 RSP: 0018:ffff88003a96bda8  EFLAGS: 00010246
 RAX: ffff8801301735c0 RBX: ffffffff8160ec80 RCX: 0000000000306ee0
 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff880137c92940
 RBP: ffff88003a96bdb8 R08: ffff880137c95680 R09: 0000000000000000
 R10: 0000000000000001 R11: 0000000000000000 R12: ffffffff81c9df78
 R13: ffff8801153d1000 R14: 0000000000000000 R15: 0000000000000000
 FS: 00007f329c18a700(0000) GS:ffff880137c80000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: ffffffff8160ec90 CR3: 000000003002b000 CR4: 00000000000006e0
 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
 Process bash (pid: 10344, threadinfo ffff88003a96a000, task ffff88012fcfc470)
 Stack:
  0000000000000fd0 00000000000000fc ffff88003a96be38 ffffffff810d92f5
  ffff88011c4c4e00 ffff880000000000 000000000b69f4d0 ffffffff8160ec80
  ffff8800300e6f06 0000000081130295 0000000000000282 ffff8800300e6f00
 Call Trace:
  [<ffffffff810d92f5>] match_records+0x155/0x1b0
  [<ffffffff810d940c>] ftrace_mod_callback+0xbc/0x100
  [<ffffffff810dafdf>] ftrace_regex_write+0x16f/0x210
  [<ffffffff810db09f>] ftrace_filter_write+0xf/0x20
  [<ffffffff81166e48>] vfs_write+0xc8/0x190
  [<ffffffff81167001>] sys_write+0x51/0x90
  [<ffffffff815c7e02>] system_call_fastpath+0x16/0x1b
 Code: 48 8b 33 31 d2 48 85 f6 75 33 49 89 d4 4c 03 63 08 49 8b 14 24 48 85 d2 48 89 10 74 04 48 89 42 08 49 89 04 24 4c 89 60 08 31 d2
 RIP [<ffffffff810d9136>] add_hash_entry+0x66/0xd0
  RSP <ffff88003a96bda8>
 CR2: ffffffff8160ec90
 ---[ end trace a5d031828efdd88e ]---

Reported-by: Brian Marete <marete@toshnix.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace.h         |  3 ++-
 kernel/trace/ftrace.c          | 12 +++---------
 kernel/trace/trace_functions.c |  3 ++-
 3 files changed, 7 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 9d88e1cb5dbb..ed0eb5254d1c 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -123,7 +123,8 @@ stack_trace_sysctl(struct ctl_table *table, int write,
 struct ftrace_func_command {
 	struct list_head	list;
 	char			*name;
-	int			(*func)(char *func, char *cmd,
+	int			(*func)(struct ftrace_hash *hash,
+					char *func, char *cmd,
 					char *params, int enable);
 };
 
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 908038f57440..1c4c0b087e1d 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2407,10 +2407,9 @@ ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod)
  */
 
 static int
-ftrace_mod_callback(char *func, char *cmd, char *param, int enable)
+ftrace_mod_callback(struct ftrace_hash *hash,
+		    char *func, char *cmd, char *param, int enable)
 {
-	struct ftrace_ops *ops = &global_ops;
-	struct ftrace_hash *hash;
 	char *mod;
 	int ret = -EINVAL;
 
@@ -2430,11 +2429,6 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable)
 	if (!strlen(mod))
 		return ret;
 
-	if (enable)
-		hash = ops->filter_hash;
-	else
-		hash = ops->notrace_hash;
-
 	ret = ftrace_match_module_records(hash, func, mod);
 	if (!ret)
 		ret = -EINVAL;
@@ -2760,7 +2754,7 @@ static int ftrace_process_regex(struct ftrace_hash *hash,
 	mutex_lock(&ftrace_cmd_mutex);
 	list_for_each_entry(p, &ftrace_commands, list) {
 		if (strcmp(p->name, command) == 0) {
-			ret = p->func(func, command, next, enable);
+			ret = p->func(hash, func, command, next, enable);
 			goto out_unlock;
 		}
 	}
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 8d0e1cc4e974..c7b0c6a7db09 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -324,7 +324,8 @@ ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param)
 }
 
 static int
-ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable)
+ftrace_trace_onoff_callback(struct ftrace_hash *hash,
+			    char *glob, char *cmd, char *param, int enable)
 {
 	struct ftrace_probe_ops *ops;
 	void *count = (void *)-1;
-- 
cgit v1.2.3


From 659fb32d1b67476f4ade25e9ea0e2642a5b9c4b5 Mon Sep 17 00:00:00 2001
From: Simon Guinot <sguinot@lacie.com>
Date: Wed, 6 Jul 2011 12:41:31 -0400
Subject: genirq: replace irq_gc_ack() with {set,clr}_bit variants (fwd)

This fixes a regression introduced by e59347a "arm: orion:
Use generic irq chip".

Depending on the device, interrupts acknowledgement is done by setting
or by clearing a dedicated register. Replace irq_gc_ack() with some
{set,clr}_bit variants allows to handle both cases.

Note that this patch affects the following SoCs: Davinci, Samsung and
Orion. Except for this last, the change is minor: irq_gc_ack() is just
renamed into irq_gc_ack_set_bit().

For the Orion SoCs, the edge GPIO interrupts support is currently
broken. irq_gc_ack() try to acknowledge a such interrupt by setting
the corresponding cause register bit. The Orion GPIO device expect the
opposite. To fix this issue, the irq_gc_ack_clr_bit() variant is used.

Tested on Network Space v2.

Reported-by: Joey Oravec <joravec@drewtech.com>
Signed-off-by: Simon Guinot <sguinot@lacie.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/arm/mach-davinci/irq.c      |  2 +-
 arch/arm/plat-orion/gpio.c       |  2 +-
 arch/arm/plat-s5p/irq-gpioint.c  |  2 +-
 arch/arm/plat-samsung/irq-uart.c |  2 +-
 include/linux/irq.h              |  3 ++-
 kernel/irq/generic-chip.c        | 18 ++++++++++++++++--
 6 files changed, 22 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-davinci/irq.c b/arch/arm/mach-davinci/irq.c
index bfe68ec4e1a6..d8c1af025931 100644
--- a/arch/arm/mach-davinci/irq.c
+++ b/arch/arm/mach-davinci/irq.c
@@ -53,7 +53,7 @@ davinci_alloc_gc(void __iomem *base, unsigned int irq_start, unsigned int num)
 
 	gc = irq_alloc_generic_chip("AINTC", 1, irq_start, base, handle_edge_irq);
 	ct = gc->chip_types;
-	ct->chip.irq_ack = irq_gc_ack;
+	ct->chip.irq_ack = irq_gc_ack_set_bit;
 	ct->chip.irq_mask = irq_gc_mask_clr_bit;
 	ct->chip.irq_unmask = irq_gc_mask_set_bit;
 
diff --git a/arch/arm/plat-orion/gpio.c b/arch/arm/plat-orion/gpio.c
index 5b4fffab1eb4..41ab97ebe4cf 100644
--- a/arch/arm/plat-orion/gpio.c
+++ b/arch/arm/plat-orion/gpio.c
@@ -432,7 +432,7 @@ void __init orion_gpio_init(int gpio_base, int ngpio,
 	ct->regs.mask = ochip->mask_offset + GPIO_EDGE_MASK_OFF;
 	ct->regs.ack = GPIO_EDGE_CAUSE_OFF;
 	ct->type = IRQ_TYPE_EDGE_RISING | IRQ_TYPE_EDGE_FALLING;
-	ct->chip.irq_ack = irq_gc_ack;
+	ct->chip.irq_ack = irq_gc_ack_clr_bit;
 	ct->chip.irq_mask = irq_gc_mask_clr_bit;
 	ct->chip.irq_unmask = irq_gc_mask_set_bit;
 	ct->chip.irq_set_type = gpio_irq_set_type;
diff --git a/arch/arm/plat-s5p/irq-gpioint.c b/arch/arm/plat-s5p/irq-gpioint.c
index 135abda31c9a..327ab9f662e8 100644
--- a/arch/arm/plat-s5p/irq-gpioint.c
+++ b/arch/arm/plat-s5p/irq-gpioint.c
@@ -152,7 +152,7 @@ static __init int s5p_gpioint_add(struct s3c_gpio_chip *chip)
 	if (!gc)
 		return -ENOMEM;
 	ct = gc->chip_types;
-	ct->chip.irq_ack = irq_gc_ack;
+	ct->chip.irq_ack = irq_gc_ack_set_bit;
 	ct->chip.irq_mask = irq_gc_mask_set_bit;
 	ct->chip.irq_unmask = irq_gc_mask_clr_bit;
 	ct->chip.irq_set_type = s5p_gpioint_set_type,
diff --git a/arch/arm/plat-samsung/irq-uart.c b/arch/arm/plat-samsung/irq-uart.c
index 32582c0958e3..0e46588d847b 100644
--- a/arch/arm/plat-samsung/irq-uart.c
+++ b/arch/arm/plat-samsung/irq-uart.c
@@ -55,7 +55,7 @@ static void __init s3c_init_uart_irq(struct s3c_uart_irq *uirq)
 	gc = irq_alloc_generic_chip("s3c-uart", 1, uirq->base_irq, reg_base,
 				    handle_level_irq);
 	ct = gc->chip_types;
-	ct->chip.irq_ack = irq_gc_ack;
+	ct->chip.irq_ack = irq_gc_ack_set_bit;
 	ct->chip.irq_mask = irq_gc_mask_set_bit;
 	ct->chip.irq_unmask = irq_gc_mask_clr_bit;
 	ct->regs.ack = S3C64XX_UINTP;
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 8b4538446636..baa397eb9c33 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -676,7 +676,8 @@ void irq_gc_mask_disable_reg(struct irq_data *d);
 void irq_gc_mask_set_bit(struct irq_data *d);
 void irq_gc_mask_clr_bit(struct irq_data *d);
 void irq_gc_unmask_enable_reg(struct irq_data *d);
-void irq_gc_ack(struct irq_data *d);
+void irq_gc_ack_set_bit(struct irq_data *d);
+void irq_gc_ack_clr_bit(struct irq_data *d);
 void irq_gc_mask_disable_reg_and_ack(struct irq_data *d);
 void irq_gc_eoi(struct irq_data *d);
 int irq_gc_set_wake(struct irq_data *d, unsigned int on);
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 31a9db711906..3a2cab407b93 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -101,10 +101,10 @@ void irq_gc_unmask_enable_reg(struct irq_data *d)
 }
 
 /**
- * irq_gc_ack - Ack pending interrupt
+ * irq_gc_ack_set_bit - Ack pending interrupt via setting bit
  * @d: irq_data
  */
-void irq_gc_ack(struct irq_data *d)
+void irq_gc_ack_set_bit(struct irq_data *d)
 {
 	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
 	u32 mask = 1 << (d->irq - gc->irq_base);
@@ -114,6 +114,20 @@ void irq_gc_ack(struct irq_data *d)
 	irq_gc_unlock(gc);
 }
 
+/**
+ * irq_gc_ack_clr_bit - Ack pending interrupt via clearing bit
+ * @d: irq_data
+ */
+void irq_gc_ack_clr_bit(struct irq_data *d)
+{
+	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
+	u32 mask = ~(1 << (d->irq - gc->irq_base));
+
+	irq_gc_lock(gc);
+	irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack);
+	irq_gc_unlock(gc);
+}
+
 /**
  * irq_gc_mask_disable_reg_and_ack- Mask and ack pending interrupt
  * @d: irq_data
-- 
cgit v1.2.3


From 90810645f78f894acfb04b3768e8a7d45f2b303a Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux.com>
Date: Thu, 23 Jun 2011 09:36:12 -0500
Subject: slab allocators: Provide generic description of alignment defines

Provide description for alignment defines.

Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 include/linux/slab.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 646a639a4aae..573c809c33d9 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -133,12 +133,22 @@ unsigned int kmem_cache_size(struct kmem_cache *);
 #define KMALLOC_MAX_SIZE	(1UL << KMALLOC_SHIFT_HIGH)
 #define KMALLOC_MAX_ORDER	(KMALLOC_SHIFT_HIGH - PAGE_SHIFT)
 
+/*
+ * Some archs want to perform DMA into kmalloc caches and need a guaranteed
+ * alignment larger than the alignment of a 64-bit integer.
+ * Setting ARCH_KMALLOC_MINALIGN in arch headers allows that.
+ */
 #ifdef ARCH_DMA_MINALIGN
 #define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN
 #else
 #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
 #endif
 
+/*
+ * Setting ARCH_SLAB_MINALIGN in arch headers allows a different alignment.
+ * Intended for arches that get misalignment faults even for 64 bit integer
+ * aligned buffers.
+ */
 #ifndef ARCH_SLAB_MINALIGN
 #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
 #endif
-- 
cgit v1.2.3


From d18a90dd85f8243ed20cdadb6d8a37d595df456d Mon Sep 17 00:00:00 2001
From: Ben Greear <greearb@candelatech.com>
Date: Thu, 7 Jul 2011 11:36:37 -0700
Subject: slub: Add method to verify memory is not freed

This is for tracking down suspect memory usage.

Acked-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Ben Greear <greearb@candelatech.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 include/linux/slub_def.h | 13 +++++++++++++
 mm/slub.c                | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 49 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index fd4fdc72bc8c..4b35c06dfbc5 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -218,6 +218,19 @@ kmalloc_order(size_t size, gfp_t flags, unsigned int order)
 	return ret;
 }
 
+/**
+ * Calling this on allocated memory will check that the memory
+ * is expected to be in use, and print warnings if not.
+ */
+#ifdef CONFIG_SLUB_DEBUG
+extern bool verify_mem_not_deleted(const void *x);
+#else
+static inline bool verify_mem_not_deleted(const void *x)
+{
+	return true;
+}
+#endif
+
 #ifdef CONFIG_TRACING
 extern void *
 kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size);
diff --git a/mm/slub.c b/mm/slub.c
index c9050995bc87..0e4f4f8245bc 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2964,6 +2964,42 @@ size_t ksize(const void *object)
 }
 EXPORT_SYMBOL(ksize);
 
+#ifdef CONFIG_SLUB_DEBUG
+bool verify_mem_not_deleted(const void *x)
+{
+	struct page *page;
+	void *object = (void *)x;
+	unsigned long flags;
+	bool rv;
+
+	if (unlikely(ZERO_OR_NULL_PTR(x)))
+		return false;
+
+	local_irq_save(flags);
+
+	page = virt_to_head_page(x);
+	if (unlikely(!PageSlab(page))) {
+		/* maybe it was from stack? */
+		rv = true;
+		goto out_unlock;
+	}
+
+	slab_lock(page);
+	if (on_freelist(page->slab, page, object)) {
+		object_err(page->slab, page, object, "Object is on free-list");
+		rv = false;
+	} else {
+		rv = true;
+	}
+	slab_unlock(page);
+
+out_unlock:
+	local_irq_restore(flags);
+	return rv;
+}
+EXPORT_SYMBOL(verify_mem_not_deleted);
+#endif
+
 void kfree(const void *x)
 {
 	struct page *page;
-- 
cgit v1.2.3


From c902ce1bfb40d8b049bd2319b388b4b68b04bc27 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 7 Jul 2011 12:19:48 +0100
Subject: FS-Cache: Add a helper to bulk uncache pages on an inode

Add an FS-Cache helper to bulk uncache pages on an inode.  This will
only work for the circumstance where the pages in the cache correspond
1:1 with the pages attached to an inode's page cache.

This is required for CIFS and NFS: When disabling inode cookie, we were
returning the cookie and setting cifsi->fscache to NULL but failed to
invalidate any previously mapped pages.  This resulted in "Bad page
state" errors and manifested in other kind of errors when running
fsstress.  Fix it by uncaching mapped pages when we disable the inode
cookie.

This patch should fix the following oops and "Bad page state" errors
seen during fsstress testing.

  ------------[ cut here ]------------
  kernel BUG at fs/cachefiles/namei.c:201!
  invalid opcode: 0000 [#1] SMP
  Pid: 5, comm: kworker/u:0 Not tainted 2.6.38.7-30.fc15.x86_64 #1 Bochs Bochs
  RIP: 0010: cachefiles_walk_to_object+0x436/0x745 [cachefiles]
  RSP: 0018:ffff88002ce6dd00  EFLAGS: 00010282
  RAX: ffff88002ef165f0 RBX: ffff88001811f500 RCX: 0000000000000000
  RDX: 0000000000000000 RSI: 0000000000000100 RDI: 0000000000000282
  RBP: ffff88002ce6dda0 R08: 0000000000000100 R09: ffffffff81b3a300
  R10: 0000ffff00066c0a R11: 0000000000000003 R12: ffff88002ae54840
  R13: ffff88002ae54840 R14: ffff880029c29c00 R15: ffff88001811f4b0
  FS:  00007f394dd32720(0000) GS:ffff88002ef00000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
  CR2: 00007fffcb62ddf8 CR3: 000000001825f000 CR4: 00000000000006e0
  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
  Process kworker/u:0 (pid: 5, threadinfo ffff88002ce6c000, task ffff88002ce55cc0)
  Stack:
   0000000000000246 ffff88002ce55cc0 ffff88002ce6dd58 ffff88001815dc00
   ffff8800185246c0 ffff88001811f618 ffff880029c29d18 ffff88001811f380
   ffff88002ce6dd50 ffffffff814757e4 ffff88002ce6dda0 ffffffff8106ac56
  Call Trace:
   cachefiles_lookup_object+0x78/0xd4 [cachefiles]
   fscache_lookup_object+0x131/0x16d [fscache]
   fscache_object_work_func+0x1bc/0x669 [fscache]
   process_one_work+0x186/0x298
   worker_thread+0xda/0x15d
   kthread+0x84/0x8c
   kernel_thread_helper+0x4/0x10
  RIP  cachefiles_walk_to_object+0x436/0x745 [cachefiles]
  ---[ end trace 1d481c9af1804caa ]---

I tested the uncaching by the following means:

 (1) Create a big file on my NFS server (104857600 bytes).

 (2) Read the file into the cache with md5sum on the NFS client.  Look in
     /proc/fs/fscache/stats:

	Pages  : mrk=25601 unc=0

 (3) Open the file for read/write ("bash 5<>/warthog/bigfile").  Look in proc
     again:

	Pages  : mrk=25601 unc=25601

Reported-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-and-Tested-by: Suresh Jayaraman <sjayaraman@suse.de>
cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/caching/netfs-api.txt | 16 +++++++++
 fs/cifs/fscache.c                               |  1 +
 fs/fscache/page.c                               | 44 +++++++++++++++++++++++++
 fs/nfs/fscache.c                                |  8 ++---
 include/linux/fscache.h                         | 21 ++++++++++++
 5 files changed, 85 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/caching/netfs-api.txt b/Documentation/filesystems/caching/netfs-api.txt
index a167ab876c35..7cc6bf2871eb 100644
--- a/Documentation/filesystems/caching/netfs-api.txt
+++ b/Documentation/filesystems/caching/netfs-api.txt
@@ -673,6 +673,22 @@ storage request to complete, or it may attempt to cancel the storage request -
 in which case the page will not be stored in the cache this time.
 
 
+BULK INODE PAGE UNCACHE
+-----------------------
+
+A convenience routine is provided to perform an uncache on all the pages
+attached to an inode.  This assumes that the pages on the inode correspond on a
+1:1 basis with the pages in the cache.
+
+	void fscache_uncache_all_inode_pages(struct fscache_cookie *cookie,
+					     struct inode *inode);
+
+This takes the netfs cookie that the pages were cached with and the inode that
+the pages are attached to.  This function will wait for pages to finish being
+written to the cache and for the cache to finish with the page generally.  No
+error is returned.
+
+
 ==========================
 INDEX AND DATA FILE UPDATE
 ==========================
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index 816696621ec9..42e5363b4102 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -92,6 +92,7 @@ static void cifs_fscache_disable_inode_cookie(struct inode *inode)
 
 	if (cifsi->fscache) {
 		cFYI(1, "%s: (0x%p)", __func__, cifsi->fscache);
+		fscache_uncache_all_inode_pages(cifsi->fscache, inode);
 		fscache_relinquish_cookie(cifsi->fscache, 1);
 		cifsi->fscache = NULL;
 	}
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index a2a5d19ece6a..2f343b4d7a7d 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -954,3 +954,47 @@ void fscache_mark_pages_cached(struct fscache_retrieval *op,
 	pagevec_reinit(pagevec);
 }
 EXPORT_SYMBOL(fscache_mark_pages_cached);
+
+/*
+ * Uncache all the pages in an inode that are marked PG_fscache, assuming them
+ * to be associated with the given cookie.
+ */
+void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie,
+				       struct inode *inode)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct pagevec pvec;
+	pgoff_t next;
+	int i;
+
+	_enter("%p,%p", cookie, inode);
+
+	if (!mapping || mapping->nrpages == 0) {
+		_leave(" [no pages]");
+		return;
+	}
+
+	pagevec_init(&pvec, 0);
+	next = 0;
+	while (next <= (loff_t)-1 &&
+	       pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)
+	       ) {
+		for (i = 0; i < pagevec_count(&pvec); i++) {
+			struct page *page = pvec.pages[i];
+			pgoff_t page_index = page->index;
+
+			ASSERTCMP(page_index, >=, next);
+			next = page_index + 1;
+
+			if (PageFsCache(page)) {
+				__fscache_wait_on_page_write(cookie, page);
+				__fscache_uncache_page(cookie, page);
+			}
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+
+	_leave("");
+}
+EXPORT_SYMBOL(__fscache_uncache_all_inode_pages);
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index ce153a6b3aec..419119c371bf 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -259,12 +259,10 @@ static void nfs_fscache_disable_inode_cookie(struct inode *inode)
 		dfprintk(FSCACHE,
 			 "NFS: nfsi 0x%p turning cache off\n", NFS_I(inode));
 
-		/* Need to invalidate any mapped pages that were read in before
-		 * turning off the cache.
+		/* Need to uncache any pages attached to this inode that
+		 * fscache knows about before turning off the cache.
 		 */
-		if (inode->i_mapping && inode->i_mapping->nrpages)
-			invalidate_inode_pages2(inode->i_mapping);
-
+		fscache_uncache_all_inode_pages(NFS_I(inode)->fscache, inode);
 		nfs_fscache_zap_inode_cookie(inode);
 	}
 }
diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index 7c4d72f5581f..9ec20dec3353 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -204,6 +204,8 @@ extern bool __fscache_check_page_write(struct fscache_cookie *, struct page *);
 extern void __fscache_wait_on_page_write(struct fscache_cookie *, struct page *);
 extern bool __fscache_maybe_release_page(struct fscache_cookie *, struct page *,
 					 gfp_t);
+extern void __fscache_uncache_all_inode_pages(struct fscache_cookie *,
+					      struct inode *);
 
 /**
  * fscache_register_netfs - Register a filesystem as desiring caching services
@@ -643,4 +645,23 @@ bool fscache_maybe_release_page(struct fscache_cookie *cookie,
 	return false;
 }
 
+/**
+ * fscache_uncache_all_inode_pages - Uncache all an inode's pages
+ * @cookie: The cookie representing the inode's cache object.
+ * @inode: The inode to uncache pages from.
+ *
+ * Uncache all the pages in an inode that are marked PG_fscache, assuming them
+ * to be associated with the given cookie.
+ *
+ * This function may sleep.  It will wait for pages that are being written out
+ * and will wait whilst the PG_fscache mark is removed by the cache.
+ */
+static inline
+void fscache_uncache_all_inode_pages(struct fscache_cookie *cookie,
+				     struct inode *inode)
+{
+	if (fscache_cookie_valid(cookie))
+		__fscache_uncache_all_inode_pages(cookie, inode);
+}
+
 #endif /* _LINUX_FSCACHE_H */
-- 
cgit v1.2.3


From 732375c6a5a4cc825b676c922d547aba96b8ce15 Mon Sep 17 00:00:00 2001
From: Dima Zavin <dima@android.com>
Date: Thu, 7 Jul 2011 17:27:59 -0700
Subject: plist: Remove the need to supply locks to plist heads

This was legacy code brought over from the RT tree and
is no longer necessary.

Signed-off-by: Dima Zavin <dima@android.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Daniel Walker <dwalker@codeaurora.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Link: http://lkml.kernel.org/r/1310084879-10351-2-git-send-email-dima@android.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/plist.h   | 55 +++----------------------------------------------
 include/linux/rtmutex.h |  4 ++--
 kernel/fork.c           |  2 +-
 kernel/futex.c          |  2 +-
 kernel/pm_qos_params.c  |  6 +++---
 kernel/rtmutex.c        |  2 +-
 kernel/sched.c          |  4 ++--
 lib/plist.c             |  7 +------
 8 files changed, 14 insertions(+), 68 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/plist.h b/include/linux/plist.h
index c9b9f322c8d8..aa0fb390bd29 100644
--- a/include/linux/plist.h
+++ b/include/linux/plist.h
@@ -77,14 +77,9 @@
 
 #include <linux/kernel.h>
 #include <linux/list.h>
-#include <linux/spinlock_types.h>
 
 struct plist_head {
 	struct list_head node_list;
-#ifdef CONFIG_DEBUG_PI_LIST
-	raw_spinlock_t *rawlock;
-	spinlock_t *spinlock;
-#endif
 };
 
 struct plist_node {
@@ -93,37 +88,13 @@ struct plist_node {
 	struct list_head	node_list;
 };
 
-#ifdef CONFIG_DEBUG_PI_LIST
-# define PLIST_HEAD_LOCK_INIT(_lock)		.spinlock = _lock
-# define PLIST_HEAD_LOCK_INIT_RAW(_lock)	.rawlock = _lock
-#else
-# define PLIST_HEAD_LOCK_INIT(_lock)
-# define PLIST_HEAD_LOCK_INIT_RAW(_lock)
-#endif
-
-#define _PLIST_HEAD_INIT(head)				\
-	.node_list = LIST_HEAD_INIT((head).node_list)
-
 /**
  * PLIST_HEAD_INIT - static struct plist_head initializer
  * @head:	struct plist_head variable name
- * @_lock:	lock to initialize for this list
- */
-#define PLIST_HEAD_INIT(head, _lock)			\
-{							\
-	_PLIST_HEAD_INIT(head),				\
-	PLIST_HEAD_LOCK_INIT(&(_lock))			\
-}
-
-/**
- * PLIST_HEAD_INIT_RAW - static struct plist_head initializer
- * @head:	struct plist_head variable name
- * @_lock:	lock to initialize for this list
  */
-#define PLIST_HEAD_INIT_RAW(head, _lock)		\
+#define PLIST_HEAD_INIT(head)				\
 {							\
-	_PLIST_HEAD_INIT(head),				\
-	PLIST_HEAD_LOCK_INIT_RAW(&(_lock))		\
+	.node_list = LIST_HEAD_INIT((head).node_list)	\
 }
 
 /**
@@ -141,31 +112,11 @@ struct plist_node {
 /**
  * plist_head_init - dynamic struct plist_head initializer
  * @head:	&struct plist_head pointer
- * @lock:	spinlock protecting the list (debugging)
  */
 static inline void
-plist_head_init(struct plist_head *head, spinlock_t *lock)
+plist_head_init(struct plist_head *head)
 {
 	INIT_LIST_HEAD(&head->node_list);
-#ifdef CONFIG_DEBUG_PI_LIST
-	head->spinlock = lock;
-	head->rawlock = NULL;
-#endif
-}
-
-/**
- * plist_head_init_raw - dynamic struct plist_head initializer
- * @head:	&struct plist_head pointer
- * @lock:	raw_spinlock protecting the list (debugging)
- */
-static inline void
-plist_head_init_raw(struct plist_head *head, raw_spinlock_t *lock)
-{
-	INIT_LIST_HEAD(&head->node_list);
-#ifdef CONFIG_DEBUG_PI_LIST
-	head->rawlock = lock;
-	head->spinlock = NULL;
-#endif
 }
 
 /**
diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
index 8d522ffeda33..de17134244f3 100644
--- a/include/linux/rtmutex.h
+++ b/include/linux/rtmutex.h
@@ -66,7 +66,7 @@ struct hrtimer_sleeper;
 
 #define __RT_MUTEX_INITIALIZER(mutexname) \
 	{ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
-	, .wait_list = PLIST_HEAD_INIT_RAW(mutexname.wait_list, mutexname.wait_lock) \
+	, .wait_list = PLIST_HEAD_INIT(mutexname.wait_list) \
 	, .owner = NULL \
 	__DEBUG_RT_MUTEX_INITIALIZER(mutexname)}
 
@@ -100,7 +100,7 @@ extern void rt_mutex_unlock(struct rt_mutex *lock);
 
 #ifdef CONFIG_RT_MUTEXES
 # define INIT_RT_MUTEXES(tsk)						\
-	.pi_waiters	= PLIST_HEAD_INIT(tsk.pi_waiters, tsk.pi_lock),	\
+	.pi_waiters	= PLIST_HEAD_INIT(tsk.pi_waiters),	\
 	INIT_RT_MUTEX_DEBUG(tsk)
 #else
 # define INIT_RT_MUTEXES(tsk)
diff --git a/kernel/fork.c b/kernel/fork.c
index 0276c30401a0..7517a53d50e3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1013,7 +1013,7 @@ static void rt_mutex_init_task(struct task_struct *p)
 {
 	raw_spin_lock_init(&p->pi_lock);
 #ifdef CONFIG_RT_MUTEXES
-	plist_head_init_raw(&p->pi_waiters, &p->pi_lock);
+	plist_head_init(&p->pi_waiters);
 	p->pi_blocked_on = NULL;
 #endif
 }
diff --git a/kernel/futex.c b/kernel/futex.c
index fe28dc282eae..3fbc76cbb9aa 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2697,7 +2697,7 @@ static int __init futex_init(void)
 		futex_cmpxchg_enabled = 1;
 
 	for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
-		plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock);
+		plist_head_init(&futex_queues[i].chain);
 		spin_lock_init(&futex_queues[i].lock);
 	}
 
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index 6824ca7d4d0c..37f05d0f0793 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -74,7 +74,7 @@ static DEFINE_SPINLOCK(pm_qos_lock);
 static struct pm_qos_object null_pm_qos;
 static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
 static struct pm_qos_object cpu_dma_pm_qos = {
-	.requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock),
+	.requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests),
 	.notifiers = &cpu_dma_lat_notifier,
 	.name = "cpu_dma_latency",
 	.target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
@@ -84,7 +84,7 @@ static struct pm_qos_object cpu_dma_pm_qos = {
 
 static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
 static struct pm_qos_object network_lat_pm_qos = {
-	.requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock),
+	.requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests),
 	.notifiers = &network_lat_notifier,
 	.name = "network_latency",
 	.target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
@@ -95,7 +95,7 @@ static struct pm_qos_object network_lat_pm_qos = {
 
 static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
 static struct pm_qos_object network_throughput_pm_qos = {
-	.requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock),
+	.requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests),
 	.notifiers = &network_throughput_notifier,
 	.name = "network_throughput",
 	.target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index ab449117aaf2..255e1662acdb 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -890,7 +890,7 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name)
 {
 	lock->owner = NULL;
 	raw_spin_lock_init(&lock->wait_lock);
-	plist_head_init_raw(&lock->wait_list, &lock->wait_lock);
+	plist_head_init(&lock->wait_list);
 
 	debug_rt_mutex_init(lock, name);
 }
diff --git a/kernel/sched.c b/kernel/sched.c
index 3f2e502d609b..71bc127e96ba 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7781,7 +7781,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
 #ifdef CONFIG_SMP
 	rt_rq->rt_nr_migratory = 0;
 	rt_rq->overloaded = 0;
-	plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock);
+	plist_head_init(&rt_rq->pushable_tasks);
 #endif
 
 	rt_rq->rt_time = 0;
@@ -7986,7 +7986,7 @@ void __init sched_init(void)
 #endif
 
 #ifdef CONFIG_RT_MUTEXES
-	plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock);
+	plist_head_init(&init_task.pi_waiters);
 #endif
 
 	/*
diff --git a/lib/plist.c b/lib/plist.c
index 0ae7e6431726..a0a4da489c22 100644
--- a/lib/plist.c
+++ b/lib/plist.c
@@ -56,11 +56,6 @@ static void plist_check_list(struct list_head *top)
 
 static void plist_check_head(struct plist_head *head)
 {
-	WARN_ON(head != &test_head && !head->rawlock && !head->spinlock);
-	if (head->rawlock)
-		WARN_ON_SMP(!raw_spin_is_locked(head->rawlock));
-	if (head->spinlock)
-		WARN_ON_SMP(!spin_is_locked(head->spinlock));
 	if (!plist_head_empty(head))
 		plist_check_list(&plist_first(head)->prio_list);
 	plist_check_list(&head->node_list);
@@ -180,7 +175,7 @@ static int  __init plist_test(void)
 	unsigned int r = local_clock();
 
 	printk(KERN_INFO "start plist test\n");
-	plist_head_init(&test_head, NULL);
+	plist_head_init(&test_head);
 	for (i = 0; i < ARRAY_SIZE(test_node); i++)
 		plist_node_init(test_node + i, 0);
 
-- 
cgit v1.2.3


From 1a84ff7564ae43dd1ea20e17f867de2700ca5b5b Mon Sep 17 00:00:00 2001
From: Luciano Coelho <coelho@ti.com>
Date: Fri, 8 Jul 2011 11:16:16 +0300
Subject: cfg80211: return -ENOENT when stopping sched_scan while not running

If we try to stop a scheduled scan while it is not running, we should
return -ENOENT instead of simply ignoring the command and returning
success.  This is more consistent with other parts of the code.

Reported-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: Luciano Coelho <coelho@ti.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 include/linux/nl80211.h | 3 ++-
 net/wireless/scan.c     | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h
index 3ec2f949bf7a..8cb025a00094 100644
--- a/include/linux/nl80211.h
+++ b/include/linux/nl80211.h
@@ -247,7 +247,8 @@
  *	passed, all channels allowed for the current regulatory domain
  *	are used.  Extra IEs can also be passed from the userspace by
  *	using the %NL80211_ATTR_IE attribute.
- * @NL80211_CMD_STOP_SCHED_SCAN: stop a scheduled scan
+ * @NL80211_CMD_STOP_SCHED_SCAN: stop a scheduled scan.  Returns -ENOENT
+ *	if scheduled scan is not running.
  * @NL80211_CMD_SCHED_SCAN_RESULTS: indicates that there are scheduled scan
  *	results available.
  * @NL80211_CMD_SCHED_SCAN_STOPPED: indicates that the scheduled scan has
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index 5d23503dd5e0..ce04566a2ecc 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -137,7 +137,7 @@ int __cfg80211_stop_sched_scan(struct cfg80211_registered_device *rdev,
 	ASSERT_RDEV_LOCK(rdev);
 
 	if (!rdev->sched_scan_req)
-		return 0;
+		return -ENOENT;
 
 	dev = rdev->sched_scan_req->dev;
 
-- 
cgit v1.2.3


From b98c0239204d6603b3a33bcc2b3916adaa8d4160 Mon Sep 17 00:00:00 2001
From: Shawn Guo <shawn.guo@linaro.org>
Date: Fri, 8 Jul 2011 16:27:33 +0800
Subject: dt: add empty of_property_read_u32[_array] for non-dt

The patch adds empty functions of_property_read_u32 and
of_property_read_u32_array for non-dt build, so that drivers
migrating to dt can save some '#ifdef CONFIG_OF'.

Signed-off-by: Shawn Guo <shawn.guo@linaro.org>
[grant.likely: Moved things around so only one new static inline is needed]
[grant.likely: Added _string variant]
Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
---
 include/linux/of.h | 29 +++++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/of.h b/include/linux/of.h
index b5f1c88e40a7..bd716f8908de 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -200,13 +200,6 @@ extern int of_property_read_u32_array(const struct device_node *np,
 				      u32 *out_values,
 				      size_t sz);
 
-static inline int of_property_read_u32(const struct device_node *np,
-				       char *propname,
-				       u32 *out_value)
-{
-	return of_property_read_u32_array(np, propname, out_value, 1);
-}
-
 extern int of_property_read_string(struct device_node *np, char *propname,
 					const char **out_string);
 extern int of_device_is_compatible(const struct device_node *device,
@@ -241,12 +234,32 @@ extern void of_attach_node(struct device_node *);
 extern void of_detach_node(struct device_node *);
 #endif
 
-#else
+#else /* CONFIG_OF */
 
 static inline bool of_have_populated_dt(void)
 {
 	return false;
 }
 
+static inline int of_property_read_u32_array(const struct device_node *np,
+				char *propname, u32 *out_values, size_t sz)
+{
+	return -ENOSYS;
+}
+
+static inline int of_property_read_string(struct device_node *np,
+				char *propname, const char **out_string)
+{
+	return -ENOSYS;
+}
+
 #endif /* CONFIG_OF */
+
+static inline int of_property_read_u32(const struct device_node *np,
+				       char *propname,
+				       u32 *out_value)
+{
+	return of_property_read_u32_array(np, propname, out_value, 1);
+}
+
 #endif /* _LINUX_OF_H */
-- 
cgit v1.2.3


From f607e7fc5fb94d92030c4527287e9c149ddf9e65 Mon Sep 17 00:00:00 2001
From: Jean-François Dagenais <dagenaisj@sonatest.com>
Date: Fri, 8 Jul 2011 15:39:44 -0700
Subject: w1: ds1wm: add a reset recovery parameter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This fixes a regression in 3.0 reported by Paul Parsons regarding the
removal of the msleep(1) in the ds1wm_reset() function:

: The linux-3.0-rc4 DS1WM 1-wire driver is logging "bus error, retrying"
: error messages on an HP iPAQ hx4700 PDA (XScale-PXA270):
:
: <snip>
: Driver for 1-wire Dallas network protocol.
: DS1WM w1 busmaster driver - (c) 2004 Szabolcs Gyurko
: 1-Wire driver for the DS2760 battery monitor  chip  - (c) 2004-2005, Szabolcs Gyurko
: ds1wm ds1wm: pass: 1 bus error, retrying
: ds1wm ds1wm: pass: 2 bus error, retrying
: ds1wm ds1wm: pass: 3 bus error, retrying
: ds1wm ds1wm: pass: 4 bus error, retrying
: ds1wm ds1wm: pass: 5 bus error, retrying
: ...
:
: The visible result is that the battery charging LED is erratic; sometimes
: it works, mostly it doesn't.
:
: The linux-2.6.39 DS1WM 1-wire driver worked OK.  I haven't tried 3.0-rc1,
: 3.0-rc2, or 3.0-rc3.

This sleep should not be required on normal circuitry provided the
pull-ups on the bus are correctly adapted to the slaves.  Unfortunately,
this is not always the case.  The sleep is restored but as a parameter to
the probe function in the pdata.

[akpm@linux-foundation.org: coding-style fixes]
Reported-by: Paul Parsons <lost.distance@yahoo.com>
Tested-by: Paul Parsons <lost.distance@yahoo.com>
Signed-off-by: Jean-François Dagenais <dagenaisj@sonatest.com>
Cc: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/mfd/asic3.c        | 1 +
 drivers/mfd/htc-pasic3.c   | 1 +
 drivers/w1/masters/ds1wm.c | 5 +++++
 include/linux/mfd/ds1wm.h  | 7 +++++++
 4 files changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mfd/asic3.c b/drivers/mfd/asic3.c
index c27fd1fc3b86..c71ae09430c5 100644
--- a/drivers/mfd/asic3.c
+++ b/drivers/mfd/asic3.c
@@ -619,6 +619,7 @@ static void asic3_clk_disable(struct asic3 *asic, struct asic3_clk *clk)
 /* MFD cells (SPI, PWM, LED, DS1WM, MMC) */
 static struct ds1wm_driver_data ds1wm_pdata = {
 	.active_high = 1,
+	.reset_recover_delay = 1,
 };
 
 static struct resource ds1wm_resources[] = {
diff --git a/drivers/mfd/htc-pasic3.c b/drivers/mfd/htc-pasic3.c
index 2808bd125d13..04c7093d6499 100644
--- a/drivers/mfd/htc-pasic3.c
+++ b/drivers/mfd/htc-pasic3.c
@@ -99,6 +99,7 @@ static int ds1wm_disable(struct platform_device *pdev)
 
 static struct ds1wm_driver_data ds1wm_pdata = {
 	.active_high = 0,
+	.reset_recover_delay = 1,
 };
 
 static struct resource ds1wm_resources[] __initdata = {
diff --git a/drivers/w1/masters/ds1wm.c b/drivers/w1/masters/ds1wm.c
index ad57593d224a..a0c8965c1a79 100644
--- a/drivers/w1/masters/ds1wm.c
+++ b/drivers/w1/masters/ds1wm.c
@@ -109,6 +109,7 @@ struct ds1wm_data {
 	/* byte to write that makes all intr disabled, */
 	/* considering active_state (IAS) (optimization) */
 	u8       int_en_reg_none;
+	unsigned int reset_recover_delay; /* see ds1wm.h */
 };
 
 static inline void ds1wm_write_register(struct ds1wm_data *ds1wm_data, u32 reg,
@@ -187,6 +188,9 @@ static int ds1wm_reset(struct ds1wm_data *ds1wm_data)
 		return 1;
 	}
 
+	if (ds1wm_data->reset_recover_delay)
+		msleep(ds1wm_data->reset_recover_delay);
+
 	return 0;
 }
 
@@ -490,6 +494,7 @@ static int ds1wm_probe(struct platform_device *pdev)
 	}
 	ds1wm_data->irq = res->start;
 	ds1wm_data->int_en_reg_none = (plat->active_high ? DS1WM_INTEN_IAS : 0);
+	ds1wm_data->reset_recover_delay = plat->reset_recover_delay;
 
 	if (res->flags & IORESOURCE_IRQ_HIGHEDGE)
 		irq_set_irq_type(ds1wm_data->irq, IRQ_TYPE_EDGE_RISING);
diff --git a/include/linux/mfd/ds1wm.h b/include/linux/mfd/ds1wm.h
index be469a357cbb..38a372a0e285 100644
--- a/include/linux/mfd/ds1wm.h
+++ b/include/linux/mfd/ds1wm.h
@@ -3,4 +3,11 @@
 struct ds1wm_driver_data {
 	int active_high;
 	int clock_rate;
+	/* in milliseconds, the amount of time to */
+	/* sleep following a reset pulse. Zero    */
+	/* should work if your bus devices recover*/
+	/* time respects the 1-wire spec since the*/
+	/* ds1wm implements the precise timings of*/
+	/* a reset pulse/presence detect sequence.*/
+	unsigned int reset_recover_delay;
 };
-- 
cgit v1.2.3


From 18b4f3f5d058b590e7189027eeb5d897742ade0a Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@opensource.se>
Date: Sun, 10 Jul 2011 10:39:14 +0200
Subject: PM / Domains: Export pm_genpd_poweron() in header

Allow SoC-specific code to call pm_genpd_poweron().

Signed-off-by: Magnus Damm <damm@opensource.se>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 drivers/base/power/domain.c | 2 +-
 include/linux/pm_domain.h   | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index 1aed94c73cfc..1f1a7d85f29d 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -37,7 +37,7 @@ static void genpd_sd_counter_dec(struct generic_pm_domain *genpd)
  * Restore power to @genpd and all of its parents so that it is possible to
  * resume a device belonging to it.
  */
-static int pm_genpd_poweron(struct generic_pm_domain *genpd)
+int pm_genpd_poweron(struct generic_pm_domain *genpd)
 {
 	int ret = 0;
 
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index 98491ee35102..14fb0953fa47 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -63,6 +63,7 @@ extern int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd,
 				     struct generic_pm_domain *target);
 extern void pm_genpd_init(struct generic_pm_domain *genpd,
 			  struct dev_power_governor *gov, bool is_off);
+extern int pm_genpd_poweron(struct generic_pm_domain *genpd);
 #else
 static inline int pm_genpd_add_device(struct generic_pm_domain *genpd,
 				      struct device *dev)
@@ -86,6 +87,10 @@ static inline int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd,
 }
 static inline void pm_genpd_init(struct generic_pm_domain *genpd,
 				 struct dev_power_governor *gov, bool is_off) {}
+static inline int pm_genpd_poweron(struct generic_pm_domain *genpd)
+{
+	return -ENOSYS;
+}
 #endif
 
 #endif /* _LINUX_PM_DOMAIN_H */
-- 
cgit v1.2.3


From d84e0bd7971eb8357c700151ee4e8e4101ee65fa Mon Sep 17 00:00:00 2001
From: Daniel Baluta <dbaluta@ixiacom.com>
Date: Sun, 10 Jul 2011 07:04:04 -0700
Subject: skbuff: update struct sk_buff members comments

Rearrange struct sk_buff members comments to follow their
definition order. Also, add missing comments for ooo_okay
and dropcount members.

Signed-off-by: Daniel Baluta <dbaluta@ixiacom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 42 ++++++++++++++++++++++--------------------
 1 file changed, 22 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 08d4507715f5..32ada5351ab4 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -285,15 +285,12 @@ typedef unsigned char *sk_buff_data_t;
  *	struct sk_buff - socket buffer
  *	@next: Next buffer in list
  *	@prev: Previous buffer in list
- *	@sk: Socket we are owned by
  *	@tstamp: Time we arrived
+ *	@sk: Socket we are owned by
  *	@dev: Device we arrived on/are leaving by
- *	@transport_header: Transport layer header
- *	@network_header: Network layer header
- *	@mac_header: Link layer header
+ *	@cb: Control buffer. Free for use by every layer. Put private vars here
  *	@_skb_refdst: destination entry (with norefcount bit)
  *	@sp: the security path, used for xfrm
- *	@cb: Control buffer. Free for use by every layer. Put private vars here
  *	@len: Length of actual data
  *	@data_len: Data length
  *	@mac_len: Length of link layer header
@@ -301,40 +298,45 @@ typedef unsigned char *sk_buff_data_t;
  *	@csum: Checksum (must include start/offset pair)
  *	@csum_start: Offset from skb->head where checksumming should start
  *	@csum_offset: Offset from csum_start where checksum should be stored
+ *	@priority: Packet queueing priority
  *	@local_df: allow local fragmentation
  *	@cloned: Head may be cloned (check refcnt to be sure)
+ *	@ip_summed: Driver fed us an IP checksum
  *	@nohdr: Payload reference only, must not modify header
+ *	@nfctinfo: Relationship of this skb to the connection
  *	@pkt_type: Packet class
  *	@fclone: skbuff clone status
- *	@ip_summed: Driver fed us an IP checksum
- *	@priority: Packet queueing priority
- *	@users: User count - see {datagram,tcp}.c
- *	@protocol: Packet protocol from driver
- *	@truesize: Buffer size 
- *	@head: Head of buffer
- *	@data: Data head pointer
- *	@tail: Tail pointer
- *	@end: End pointer
- *	@destructor: Destruct function
- *	@mark: Generic packet mark
- *	@nfct: Associated connection, if any
  *	@ipvs_property: skbuff is owned by ipvs
  *	@peeked: this packet has been seen already, so stats have been
  *		done for it, don't do them again
  *	@nf_trace: netfilter packet trace flag
- *	@nfctinfo: Relationship of this skb to the connection
+ *	@protocol: Packet protocol from driver
+ *	@destructor: Destruct function
+ *	@nfct: Associated connection, if any
  *	@nfct_reasm: netfilter conntrack re-assembly pointer
  *	@nf_bridge: Saved data about a bridged frame - see br_netfilter.c
  *	@skb_iif: ifindex of device we arrived on
- *	@rxhash: the packet hash computed on receive
- *	@queue_mapping: Queue mapping for multiqueue devices
  *	@tc_index: Traffic control index
  *	@tc_verd: traffic control verdict
+ *	@rxhash: the packet hash computed on receive
+ *	@queue_mapping: Queue mapping for multiqueue devices
  *	@ndisc_nodetype: router type (from link layer)
+ *	@ooo_okay: allow the mapping of a socket to a queue to be changed
  *	@dma_cookie: a cookie to one of several possible DMA operations
  *		done by skb DMA functions
  *	@secmark: security marking
+ *	@mark: Generic packet mark
+ *	@dropcount: total number of sk_receive_queue overflows
  *	@vlan_tci: vlan tag control information
+ *	@transport_header: Transport layer header
+ *	@network_header: Network layer header
+ *	@mac_header: Link layer header
+ *	@tail: Tail pointer
+ *	@end: End pointer
+ *	@head: Head of buffer
+ *	@data: Data head pointer
+ *	@truesize: Buffer size
+ *	@users: User count - see {datagram,tcp}.c
  */
 
 struct sk_buff {
-- 
cgit v1.2.3


From 04da85b86188f224cc9b391b5bdd92a3ba20ffcf Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 11 Jul 2011 10:12:59 -0400
Subject: ftrace: Fix warning when CONFIG_FUNCTION_TRACER is not defined

The struct ftrace_hash was declared within CONFIG_FUNCTION_TRACER
but was referenced outside of it.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index ed0eb5254d1c..f0c0e8a47ae6 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -19,6 +19,8 @@
 
 #include <asm/ftrace.h>
 
+struct ftrace_hash;
+
 #ifdef CONFIG_FUNCTION_TRACER
 
 extern int ftrace_enabled;
@@ -29,8 +31,6 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
 
 typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip);
 
-struct ftrace_hash;
-
 enum {
 	FTRACE_OPS_FL_ENABLED		= 1 << 0,
 	FTRACE_OPS_FL_GLOBAL		= 1 << 1,
-- 
cgit v1.2.3


From 17b75eca7683d4942f4d8d00563fd15f37c39589 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Tue, 12 Jul 2011 00:39:29 +0200
Subject: PM / Domains: Do not execute device callbacks under locks

Currently, the .start_device() and .stop_device() callbacks from
struct generic_pm_domain() as well as the device drivers' runtime PM
callbacks used by the generic PM domains code are executed under
the generic PM domain lock.  This, unfortunately, is prone to
deadlocks, for example if a device and its parent are boths members
of the same PM domain.  For this reason, it would be better if the
PM domains code didn't execute device callbacks under the lock.

Rework the locking in the generic PM domains code so that the lock
is dropped for the execution of device callbacks.  To this end,
introduce PM domains states reflecting the current status of a PM
domain and such that the PM domain lock cannot be acquired if the
status is GPD_STATE_BUSY.  Make threads attempting to acquire a PM
domain's lock wait until the status changes to either
GPD_STATE_ACTIVE or GPD_STATE_POWER_OFF.

This change by itself doesn't fix the deadlock problem mentioned
above, but the mechanism introduced by it will be used for for this
purpose by a subsequent patch.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 drivers/base/power/domain.c | 249 +++++++++++++++++++++++++++++++-------------
 include/linux/pm_domain.h   |  10 +-
 2 files changed, 185 insertions(+), 74 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index 9a20d9302fcd..d06f3bb80b2e 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -13,6 +13,8 @@
 #include <linux/pm_domain.h>
 #include <linux/slab.h>
 #include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/suspend.h>
 
 #ifdef CONFIG_PM
 
@@ -30,6 +32,34 @@ static void genpd_sd_counter_dec(struct generic_pm_domain *genpd)
 			genpd->sd_count--;
 }
 
+static void genpd_acquire_lock(struct generic_pm_domain *genpd)
+{
+	DEFINE_WAIT(wait);
+
+	mutex_lock(&genpd->lock);
+	/*
+	 * Wait for the domain to transition into either the active,
+	 * or the power off state.
+	 */
+	for (;;) {
+		prepare_to_wait(&genpd->status_wait_queue, &wait,
+				TASK_UNINTERRUPTIBLE);
+		if (genpd->status != GPD_STATE_BUSY)
+			break;
+		mutex_unlock(&genpd->lock);
+
+		schedule();
+
+		mutex_lock(&genpd->lock);
+	}
+	finish_wait(&genpd->status_wait_queue, &wait);
+}
+
+static void genpd_release_lock(struct generic_pm_domain *genpd)
+{
+	mutex_unlock(&genpd->lock);
+}
+
 /**
  * pm_genpd_poweron - Restore power to a given PM domain and its parents.
  * @genpd: PM domain to power up.
@@ -39,22 +69,50 @@ static void genpd_sd_counter_dec(struct generic_pm_domain *genpd)
  */
 int pm_genpd_poweron(struct generic_pm_domain *genpd)
 {
+	struct generic_pm_domain *parent = genpd->parent;
+	DEFINE_WAIT(wait);
 	int ret = 0;
 
  start:
-	if (genpd->parent)
-		mutex_lock(&genpd->parent->lock);
-	mutex_lock_nested(&genpd->lock, SINGLE_DEPTH_NESTING);
+	if (parent) {
+		mutex_lock(&parent->lock);
+		mutex_lock_nested(&genpd->lock, SINGLE_DEPTH_NESTING);
+	} else {
+		mutex_lock(&genpd->lock);
+	}
+	/*
+	 * Wait for the domain to transition into either the active,
+	 * or the power off state.
+	 */
+	for (;;) {
+		prepare_to_wait(&genpd->status_wait_queue, &wait,
+				TASK_UNINTERRUPTIBLE);
+		if (genpd->status != GPD_STATE_BUSY)
+			break;
+		mutex_unlock(&genpd->lock);
+		if (parent)
+			mutex_unlock(&parent->lock);
+
+		schedule();
 
-	if (!genpd->power_is_off
+		if (parent) {
+			mutex_lock(&parent->lock);
+			mutex_lock_nested(&genpd->lock, SINGLE_DEPTH_NESTING);
+		} else {
+			mutex_lock(&genpd->lock);
+		}
+	}
+	finish_wait(&genpd->status_wait_queue, &wait);
+
+	if (genpd->status == GPD_STATE_ACTIVE
 	    || (genpd->prepared_count > 0 && genpd->suspend_power_off))
 		goto out;
 
-	if (genpd->parent && genpd->parent->power_is_off) {
+	if (parent && parent->status != GPD_STATE_ACTIVE) {
 		mutex_unlock(&genpd->lock);
-		mutex_unlock(&genpd->parent->lock);
+		mutex_unlock(&parent->lock);
 
-		ret = pm_genpd_poweron(genpd->parent);
+		ret = pm_genpd_poweron(parent);
 		if (ret)
 			return ret;
 
@@ -67,14 +125,14 @@ int pm_genpd_poweron(struct generic_pm_domain *genpd)
 			goto out;
 	}
 
-	genpd->power_is_off = false;
-	if (genpd->parent)
-		genpd->parent->sd_count++;
+	genpd->status = GPD_STATE_ACTIVE;
+	if (parent)
+		parent->sd_count++;
 
  out:
 	mutex_unlock(&genpd->lock);
-	if (genpd->parent)
-		mutex_unlock(&genpd->parent->lock);
+	if (parent)
+		mutex_unlock(&parent->lock);
 
 	return ret;
 }
@@ -90,6 +148,7 @@ int pm_genpd_poweron(struct generic_pm_domain *genpd)
  */
 static int __pm_genpd_save_device(struct dev_list_entry *dle,
 				  struct generic_pm_domain *genpd)
+	__releases(&genpd->lock) __acquires(&genpd->lock)
 {
 	struct device *dev = dle->dev;
 	struct device_driver *drv = dev->driver;
@@ -98,6 +157,8 @@ static int __pm_genpd_save_device(struct dev_list_entry *dle,
 	if (dle->need_restore)
 		return 0;
 
+	mutex_unlock(&genpd->lock);
+
 	if (drv && drv->pm && drv->pm->runtime_suspend) {
 		if (genpd->start_device)
 			genpd->start_device(dev);
@@ -108,6 +169,8 @@ static int __pm_genpd_save_device(struct dev_list_entry *dle,
 			genpd->stop_device(dev);
 	}
 
+	mutex_lock(&genpd->lock);
+
 	if (!ret)
 		dle->need_restore = true;
 
@@ -121,6 +184,7 @@ static int __pm_genpd_save_device(struct dev_list_entry *dle,
  */
 static void __pm_genpd_restore_device(struct dev_list_entry *dle,
 				      struct generic_pm_domain *genpd)
+	__releases(&genpd->lock) __acquires(&genpd->lock)
 {
 	struct device *dev = dle->dev;
 	struct device_driver *drv = dev->driver;
@@ -128,6 +192,8 @@ static void __pm_genpd_restore_device(struct dev_list_entry *dle,
 	if (!dle->need_restore)
 		return;
 
+	mutex_unlock(&genpd->lock);
+
 	if (drv && drv->pm && drv->pm->runtime_resume) {
 		if (genpd->start_device)
 			genpd->start_device(dev);
@@ -138,6 +204,8 @@ static void __pm_genpd_restore_device(struct dev_list_entry *dle,
 			genpd->stop_device(dev);
 	}
 
+	mutex_lock(&genpd->lock);
+
 	dle->need_restore = false;
 }
 
@@ -150,13 +218,14 @@ static void __pm_genpd_restore_device(struct dev_list_entry *dle,
  * the @genpd's devices' drivers and remove power from @genpd.
  */
 static int pm_genpd_poweroff(struct generic_pm_domain *genpd)
+	__releases(&genpd->lock) __acquires(&genpd->lock)
 {
 	struct generic_pm_domain *parent;
 	struct dev_list_entry *dle;
 	unsigned int not_suspended;
 	int ret;
 
-	if (genpd->power_is_off || genpd->prepared_count > 0)
+	if (genpd->status == GPD_STATE_POWER_OFF || genpd->prepared_count > 0)
 		return 0;
 
 	if (genpd->sd_count > 0)
@@ -175,22 +244,36 @@ static int pm_genpd_poweroff(struct generic_pm_domain *genpd)
 			return -EAGAIN;
 	}
 
+	genpd->status = GPD_STATE_BUSY;
+
 	list_for_each_entry_reverse(dle, &genpd->dev_list, node) {
 		ret = __pm_genpd_save_device(dle, genpd);
 		if (ret)
 			goto err_dev;
 	}
 
+	mutex_unlock(&genpd->lock);
+
+	parent = genpd->parent;
+	if (parent) {
+		genpd_acquire_lock(parent);
+		mutex_lock_nested(&genpd->lock, SINGLE_DEPTH_NESTING);
+	} else {
+		mutex_lock(&genpd->lock);
+	}
+
 	if (genpd->power_off)
 		genpd->power_off(genpd);
 
-	genpd->power_is_off = true;
+	genpd->status = GPD_STATE_POWER_OFF;
+	wake_up_all(&genpd->status_wait_queue);
 
-	parent = genpd->parent;
 	if (parent) {
 		genpd_sd_counter_dec(parent);
 		if (parent->sd_count == 0)
 			queue_work(pm_wq, &parent->power_off_work);
+
+		genpd_release_lock(parent);
 	}
 
 	return 0;
@@ -199,6 +282,9 @@ static int pm_genpd_poweroff(struct generic_pm_domain *genpd)
 	list_for_each_entry_continue(dle, &genpd->dev_list, node)
 		__pm_genpd_restore_device(dle, genpd);
 
+	genpd->status = GPD_STATE_ACTIVE;
+	wake_up_all(&genpd->status_wait_queue);
+
 	return ret;
 }
 
@@ -212,13 +298,9 @@ static void genpd_power_off_work_fn(struct work_struct *work)
 
 	genpd = container_of(work, struct generic_pm_domain, power_off_work);
 
-	if (genpd->parent)
-		mutex_lock(&genpd->parent->lock);
-	mutex_lock_nested(&genpd->lock, SINGLE_DEPTH_NESTING);
+	genpd_acquire_lock(genpd);
 	pm_genpd_poweroff(genpd);
-	mutex_unlock(&genpd->lock);
-	if (genpd->parent)
-		mutex_unlock(&genpd->parent->lock);
+	genpd_release_lock(genpd);
 }
 
 /**
@@ -239,23 +321,17 @@ static int pm_genpd_runtime_suspend(struct device *dev)
 	if (IS_ERR(genpd))
 		return -EINVAL;
 
-	if (genpd->parent)
-		mutex_lock(&genpd->parent->lock);
-	mutex_lock_nested(&genpd->lock, SINGLE_DEPTH_NESTING);
-
 	if (genpd->stop_device) {
 		int ret = genpd->stop_device(dev);
 		if (ret)
-			goto out;
+			return ret;
 	}
+
+	genpd_acquire_lock(genpd);
 	genpd->in_progress++;
 	pm_genpd_poweroff(genpd);
 	genpd->in_progress--;
-
- out:
-	mutex_unlock(&genpd->lock);
-	if (genpd->parent)
-		mutex_unlock(&genpd->parent->lock);
+	genpd_release_lock(genpd);
 
 	return 0;
 }
@@ -276,9 +352,6 @@ static void __pm_genpd_runtime_resume(struct device *dev,
 			break;
 		}
 	}
-
-	if (genpd->start_device)
-		genpd->start_device(dev);
 }
 
 /**
@@ -304,9 +377,15 @@ static int pm_genpd_runtime_resume(struct device *dev)
 	if (ret)
 		return ret;
 
-	mutex_lock(&genpd->lock);
+	genpd_acquire_lock(genpd);
+	genpd->status = GPD_STATE_BUSY;
 	__pm_genpd_runtime_resume(dev, genpd);
-	mutex_unlock(&genpd->lock);
+	genpd->status = GPD_STATE_ACTIVE;
+	wake_up_all(&genpd->status_wait_queue);
+	genpd_release_lock(genpd);
+
+	if (genpd->start_device)
+		genpd->start_device(dev);
 
 	return 0;
 }
@@ -339,7 +418,7 @@ static void pm_genpd_sync_poweroff(struct generic_pm_domain *genpd)
 {
 	struct generic_pm_domain *parent = genpd->parent;
 
-	if (genpd->power_is_off)
+	if (genpd->status == GPD_STATE_POWER_OFF)
 		return;
 
 	if (genpd->suspended_count != genpd->device_count || genpd->sd_count > 0)
@@ -348,7 +427,7 @@ static void pm_genpd_sync_poweroff(struct generic_pm_domain *genpd)
 	if (genpd->power_off)
 		genpd->power_off(genpd);
 
-	genpd->power_is_off = true;
+	genpd->status = GPD_STATE_POWER_OFF;
 	if (parent) {
 		genpd_sd_counter_dec(parent);
 		pm_genpd_sync_poweroff(parent);
@@ -375,32 +454,41 @@ static int pm_genpd_prepare(struct device *dev)
 	if (IS_ERR(genpd))
 		return -EINVAL;
 
-	mutex_lock(&genpd->lock);
+	/*
+	 * If a wakeup request is pending for the device, it should be woken up
+	 * at this point and a system wakeup event should be reported if it's
+	 * set up to wake up the system from sleep states.
+	 */
+	pm_runtime_get_noresume(dev);
+	if (pm_runtime_barrier(dev) && device_may_wakeup(dev))
+		pm_wakeup_event(dev, 0);
+
+	if (pm_wakeup_pending()) {
+		pm_runtime_put_sync(dev);
+		return -EBUSY;
+	}
+
+	genpd_acquire_lock(genpd);
 
 	if (genpd->prepared_count++ == 0)
-		genpd->suspend_power_off = genpd->power_is_off;
+		genpd->suspend_power_off = genpd->status == GPD_STATE_POWER_OFF;
+
+	genpd_release_lock(genpd);
 
 	if (genpd->suspend_power_off) {
-		mutex_unlock(&genpd->lock);
+		pm_runtime_put_noidle(dev);
 		return 0;
 	}
 
 	/*
-	 * If the device is in the (runtime) "suspended" state, call
-	 * .start_device() for it, if defined.
-	 */
-	if (pm_runtime_suspended(dev))
-		__pm_genpd_runtime_resume(dev, genpd);
-
-	/*
-	 * Do not check if runtime resume is pending at this point, because it
-	 * has been taken care of already and if pm_genpd_poweron() ran at this
-	 * point as a result of the check, it would deadlock.
+	 * The PM domain must be in the GPD_STATE_ACTIVE state at this point,
+	 * so pm_genpd_poweron() will return immediately, but if the device
+	 * is suspended (e.g. it's been stopped by .stop_device()), we need
+	 * to make it operational.
 	 */
+	pm_runtime_resume(dev);
 	__pm_runtime_disable(dev, false);
 
-	mutex_unlock(&genpd->lock);
-
 	ret = pm_generic_prepare(dev);
 	if (ret) {
 		mutex_lock(&genpd->lock);
@@ -409,7 +497,10 @@ static int pm_genpd_prepare(struct device *dev)
 			genpd->suspend_power_off = false;
 
 		mutex_unlock(&genpd->lock);
+		pm_runtime_enable(dev);
 	}
+
+	pm_runtime_put_sync(dev);
 	return ret;
 }
 
@@ -726,7 +817,7 @@ static int pm_genpd_restore_noirq(struct device *dev)
 	 * guaranteed that this function will never run twice in parallel for
 	 * the same PM domain, so it is not necessary to use locking here.
 	 */
-	genpd->power_is_off = true;
+	genpd->status = GPD_STATE_POWER_OFF;
 	if (genpd->suspend_power_off) {
 		/*
 		 * The boot kernel might put the domain into the power on state,
@@ -836,9 +927,9 @@ int pm_genpd_add_device(struct generic_pm_domain *genpd, struct device *dev)
 	if (IS_ERR_OR_NULL(genpd) || IS_ERR_OR_NULL(dev))
 		return -EINVAL;
 
-	mutex_lock(&genpd->lock);
+	genpd_acquire_lock(genpd);
 
-	if (genpd->power_is_off) {
+	if (genpd->status == GPD_STATE_POWER_OFF) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -870,7 +961,7 @@ int pm_genpd_add_device(struct generic_pm_domain *genpd, struct device *dev)
 	spin_unlock_irq(&dev->power.lock);
 
  out:
-	mutex_unlock(&genpd->lock);
+	genpd_release_lock(genpd);
 
 	return ret;
 }
@@ -891,7 +982,7 @@ int pm_genpd_remove_device(struct generic_pm_domain *genpd,
 	if (IS_ERR_OR_NULL(genpd) || IS_ERR_OR_NULL(dev))
 		return -EINVAL;
 
-	mutex_lock(&genpd->lock);
+	genpd_acquire_lock(genpd);
 
 	if (genpd->prepared_count > 0) {
 		ret = -EAGAIN;
@@ -915,7 +1006,7 @@ int pm_genpd_remove_device(struct generic_pm_domain *genpd,
 	}
 
  out:
-	mutex_unlock(&genpd->lock);
+	genpd_release_lock(genpd);
 
 	return ret;
 }
@@ -934,9 +1025,19 @@ int pm_genpd_add_subdomain(struct generic_pm_domain *genpd,
 	if (IS_ERR_OR_NULL(genpd) || IS_ERR_OR_NULL(new_subdomain))
 		return -EINVAL;
 
-	mutex_lock(&genpd->lock);
+ start:
+	genpd_acquire_lock(genpd);
+	mutex_lock_nested(&new_subdomain->lock, SINGLE_DEPTH_NESTING);
 
-	if (genpd->power_is_off && !new_subdomain->power_is_off) {
+	if (new_subdomain->status != GPD_STATE_POWER_OFF
+	    && new_subdomain->status != GPD_STATE_ACTIVE) {
+		mutex_unlock(&new_subdomain->lock);
+		genpd_release_lock(genpd);
+		goto start;
+	}
+
+	if (genpd->status == GPD_STATE_POWER_OFF
+	    &&  new_subdomain->status != GPD_STATE_POWER_OFF) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -948,17 +1049,14 @@ int pm_genpd_add_subdomain(struct generic_pm_domain *genpd,
 		}
 	}
 
-	mutex_lock_nested(&new_subdomain->lock, SINGLE_DEPTH_NESTING);
-
 	list_add_tail(&new_subdomain->sd_node, &genpd->sd_list);
 	new_subdomain->parent = genpd;
-	if (!subdomain->power_is_off)
+	if (subdomain->status != GPD_STATE_POWER_OFF)
 		genpd->sd_count++;
 
-	mutex_unlock(&new_subdomain->lock);
-
  out:
-	mutex_unlock(&genpd->lock);
+	mutex_unlock(&new_subdomain->lock);
+	genpd_release_lock(genpd);
 
 	return ret;
 }
@@ -977,7 +1075,8 @@ int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd,
 	if (IS_ERR_OR_NULL(genpd) || IS_ERR_OR_NULL(target))
 		return -EINVAL;
 
-	mutex_lock(&genpd->lock);
+ start:
+	genpd_acquire_lock(genpd);
 
 	list_for_each_entry(subdomain, &genpd->sd_list, sd_node) {
 		if (subdomain != target)
@@ -985,9 +1084,16 @@ int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd,
 
 		mutex_lock_nested(&subdomain->lock, SINGLE_DEPTH_NESTING);
 
+		if (subdomain->status != GPD_STATE_POWER_OFF
+		    && subdomain->status != GPD_STATE_ACTIVE) {
+			mutex_unlock(&subdomain->lock);
+			genpd_release_lock(genpd);
+			goto start;
+		}
+
 		list_del(&subdomain->sd_node);
 		subdomain->parent = NULL;
-		if (!subdomain->power_is_off)
+		if (subdomain->status != GPD_STATE_POWER_OFF)
 			genpd_sd_counter_dec(genpd);
 
 		mutex_unlock(&subdomain->lock);
@@ -996,7 +1102,7 @@ int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd,
 		break;
 	}
 
-	mutex_unlock(&genpd->lock);
+	genpd_release_lock(genpd);
 
 	return ret;
 }
@@ -1022,7 +1128,8 @@ void pm_genpd_init(struct generic_pm_domain *genpd,
 	INIT_WORK(&genpd->power_off_work, genpd_power_off_work_fn);
 	genpd->in_progress = 0;
 	genpd->sd_count = 0;
-	genpd->power_is_off = is_off;
+	genpd->status = is_off ? GPD_STATE_POWER_OFF : GPD_STATE_ACTIVE;
+	init_waitqueue_head(&genpd->status_wait_queue);
 	genpd->device_count = 0;
 	genpd->suspended_count = 0;
 	genpd->domain.ops.runtime_suspend = pm_genpd_runtime_suspend;
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index 14fb0953fa47..c71457cb8a79 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -11,8 +11,11 @@
 
 #include <linux/device.h>
 
-#define GPD_IN_SUSPEND	1
-#define GPD_POWER_OFF	2
+enum gpd_status {
+	GPD_STATE_ACTIVE = 0,	/* PM domain is active */
+	GPD_STATE_BUSY,		/* Something is happening to the PM domain */
+	GPD_STATE_POWER_OFF,	/* PM domain is off */
+};
 
 struct dev_power_governor {
 	bool (*power_down_ok)(struct dev_pm_domain *domain);
@@ -29,7 +32,8 @@ struct generic_pm_domain {
 	struct work_struct power_off_work;
 	unsigned int in_progress;	/* Number of devices being suspended now */
 	unsigned int sd_count;	/* Number of subdomains with power "on" */
-	bool power_is_off;	/* Whether or not power has been removed */
+	enum gpd_status status;	/* Current state of the domain */
+	wait_queue_head_t status_wait_queue;
 	unsigned int device_count;	/* Number of devices */
 	unsigned int suspended_count;	/* System suspend device counter */
 	unsigned int prepared_count;	/* Suspend counter of prepared devices */
-- 
cgit v1.2.3


From c6d22b37263607ba5aeeb2e11169fa65caa29bee Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Tue, 12 Jul 2011 00:39:36 +0200
Subject: PM / Domains: Allow callbacks to execute all runtime PM helpers

A deadlock may occur if one of the PM domains' .start_device() or
.stop_device() callbacks or a device driver's .runtime_suspend() or
.runtime_resume() callback executed by the core generic PM domain
code uses a "wrong" runtime PM helper function.  This happens, for
example, if .runtime_resume() from one device's driver calls
pm_runtime_resume() for another device in the same PM domain.
A similar situation may take place if a device's parent is in the
same PM domain, in which case the runtime PM framework may execute
pm_genpd_runtime_resume() automatically for the parent (if it is
suspended at the moment).  This, of course, is undesirable, so
the generic PM domains code should be modified to prevent it from
happening.

The runtime PM framework guarantees that pm_genpd_runtime_suspend()
and pm_genpd_runtime_resume() won't be executed in parallel for
the same device, so the generic PM domains code need not worry
about those cases.  Still, it needs to prevent the other possible
race conditions between pm_genpd_runtime_suspend(),
pm_genpd_runtime_resume(), pm_genpd_poweron() and pm_genpd_poweroff()
from happening and it needs to avoid deadlocks at the same time.
To this end, modify the generic PM domains code to relax
synchronization rules so that:

* pm_genpd_poweron() doesn't wait for the PM domain status to
  change from GPD_STATE_BUSY.  If it finds that the status is
  not GPD_STATE_POWER_OFF, it returns without powering the domain on
  (it may modify the status depending on the circumstances).

* pm_genpd_poweroff() returns as soon as it finds that the PM
  domain's status changed from GPD_STATE_BUSY after it's released
  the PM domain's lock.

* pm_genpd_runtime_suspend() doesn't wait for the PM domain status
  to change from GPD_STATE_BUSY after executing the domain's
  .stop_device() callback and executes pm_genpd_poweroff() only
  if pm_genpd_runtime_resume() is not executed in parallel.

* pm_genpd_runtime_resume() doesn't wait for the PM domain status
  to change from GPD_STATE_BUSY after executing pm_genpd_poweron()
  and sets the domain's status to GPD_STATE_BUSY and increments its
  counter of resuming devices (introduced by this change) immediately
  after acquiring the lock.  The counter of resuming devices is then
  decremented after executing __pm_genpd_runtime_resume() for the
  device and the domain's status is reset to GPD_STATE_ACTIVE (unless
  there are more resuming devices in the domain, in which case the
  status remains GPD_STATE_BUSY).

This way, for example, if a device driver's .runtime_resume()
callback executes pm_runtime_resume() for another device in the same
PM domain, pm_genpd_poweron() called by pm_genpd_runtime_resume()
invoked by the runtime PM framework will not block and it will see
that there's nothing to do for it.  Next, the PM domain's lock will
be acquired without waiting for its status to change from
GPD_STATE_BUSY and the device driver's .runtime_resume() callback
will be executed.  In turn, if pm_runtime_suspend() is executed by
one device driver's .runtime_resume() callback for another device in
the same PM domain, pm_genpd_poweroff() executed by
pm_genpd_runtime_suspend() invoked by the runtime PM framework as a
result will notice that one of the devices in the domain is being
resumed, so it will return immediately.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 drivers/base/power/domain.c | 144 ++++++++++++++++++++++++++++++--------------
 include/linux/pm_domain.h   |   3 +
 2 files changed, 102 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index d06f3bb80b2e..7e6cc8a5ce5b 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -44,7 +44,8 @@ static void genpd_acquire_lock(struct generic_pm_domain *genpd)
 	for (;;) {
 		prepare_to_wait(&genpd->status_wait_queue, &wait,
 				TASK_UNINTERRUPTIBLE);
-		if (genpd->status != GPD_STATE_BUSY)
+		if (genpd->status == GPD_STATE_ACTIVE
+		    || genpd->status == GPD_STATE_POWER_OFF)
 			break;
 		mutex_unlock(&genpd->lock);
 
@@ -60,6 +61,12 @@ static void genpd_release_lock(struct generic_pm_domain *genpd)
 	mutex_unlock(&genpd->lock);
 }
 
+static void genpd_set_active(struct generic_pm_domain *genpd)
+{
+	if (genpd->resume_count == 0)
+		genpd->status = GPD_STATE_ACTIVE;
+}
+
 /**
  * pm_genpd_poweron - Restore power to a given PM domain and its parents.
  * @genpd: PM domain to power up.
@@ -75,42 +82,24 @@ int pm_genpd_poweron(struct generic_pm_domain *genpd)
 
  start:
 	if (parent) {
-		mutex_lock(&parent->lock);
+		genpd_acquire_lock(parent);
 		mutex_lock_nested(&genpd->lock, SINGLE_DEPTH_NESTING);
 	} else {
 		mutex_lock(&genpd->lock);
 	}
-	/*
-	 * Wait for the domain to transition into either the active,
-	 * or the power off state.
-	 */
-	for (;;) {
-		prepare_to_wait(&genpd->status_wait_queue, &wait,
-				TASK_UNINTERRUPTIBLE);
-		if (genpd->status != GPD_STATE_BUSY)
-			break;
-		mutex_unlock(&genpd->lock);
-		if (parent)
-			mutex_unlock(&parent->lock);
-
-		schedule();
-
-		if (parent) {
-			mutex_lock(&parent->lock);
-			mutex_lock_nested(&genpd->lock, SINGLE_DEPTH_NESTING);
-		} else {
-			mutex_lock(&genpd->lock);
-		}
-	}
-	finish_wait(&genpd->status_wait_queue, &wait);
 
 	if (genpd->status == GPD_STATE_ACTIVE
 	    || (genpd->prepared_count > 0 && genpd->suspend_power_off))
 		goto out;
 
+	if (genpd->status != GPD_STATE_POWER_OFF) {
+		genpd_set_active(genpd);
+		goto out;
+	}
+
 	if (parent && parent->status != GPD_STATE_ACTIVE) {
 		mutex_unlock(&genpd->lock);
-		mutex_unlock(&parent->lock);
+		genpd_release_lock(parent);
 
 		ret = pm_genpd_poweron(parent);
 		if (ret)
@@ -125,14 +114,14 @@ int pm_genpd_poweron(struct generic_pm_domain *genpd)
 			goto out;
 	}
 
-	genpd->status = GPD_STATE_ACTIVE;
+	genpd_set_active(genpd);
 	if (parent)
 		parent->sd_count++;
 
  out:
 	mutex_unlock(&genpd->lock);
 	if (parent)
-		mutex_unlock(&parent->lock);
+		genpd_release_lock(parent);
 
 	return ret;
 }
@@ -209,6 +198,20 @@ static void __pm_genpd_restore_device(struct dev_list_entry *dle,
 	dle->need_restore = false;
 }
 
+/**
+ * genpd_abort_poweroff - Check if a PM domain power off should be aborted.
+ * @genpd: PM domain to check.
+ *
+ * Return true if a PM domain's status changed to GPD_STATE_ACTIVE during
+ * a "power off" operation, which means that a "power on" has occured in the
+ * meantime, or if its resume_count field is different from zero, which means
+ * that one of its devices has been resumed in the meantime.
+ */
+static bool genpd_abort_poweroff(struct generic_pm_domain *genpd)
+{
+	return genpd->status == GPD_STATE_ACTIVE || genpd->resume_count > 0;
+}
+
 /**
  * pm_genpd_poweroff - Remove power from a given PM domain.
  * @genpd: PM domain to power down.
@@ -223,9 +226,17 @@ static int pm_genpd_poweroff(struct generic_pm_domain *genpd)
 	struct generic_pm_domain *parent;
 	struct dev_list_entry *dle;
 	unsigned int not_suspended;
-	int ret;
+	int ret = 0;
 
-	if (genpd->status == GPD_STATE_POWER_OFF || genpd->prepared_count > 0)
+ start:
+	/*
+	 * Do not try to power off the domain in the following situations:
+	 * (1) The domain is already in the "power off" state.
+	 * (2) System suspend is in progress.
+	 * (3) One of the domain's devices is being resumed right now.
+	 */
+	if (genpd->status == GPD_STATE_POWER_OFF || genpd->prepared_count > 0
+	    || genpd->resume_count > 0)
 		return 0;
 
 	if (genpd->sd_count > 0)
@@ -239,34 +250,54 @@ static int pm_genpd_poweroff(struct generic_pm_domain *genpd)
 	if (not_suspended > genpd->in_progress)
 		return -EBUSY;
 
+	if (genpd->poweroff_task) {
+		/*
+		 * Another instance of pm_genpd_poweroff() is executing
+		 * callbacks, so tell it to start over and return.
+		 */
+		genpd->status = GPD_STATE_REPEAT;
+		return 0;
+	}
+
 	if (genpd->gov && genpd->gov->power_down_ok) {
 		if (!genpd->gov->power_down_ok(&genpd->domain))
 			return -EAGAIN;
 	}
 
 	genpd->status = GPD_STATE_BUSY;
+	genpd->poweroff_task = current;
 
 	list_for_each_entry_reverse(dle, &genpd->dev_list, node) {
 		ret = __pm_genpd_save_device(dle, genpd);
 		if (ret)
 			goto err_dev;
-	}
 
-	mutex_unlock(&genpd->lock);
+		if (genpd_abort_poweroff(genpd))
+			goto out;
+
+		if (genpd->status == GPD_STATE_REPEAT) {
+			genpd->poweroff_task = NULL;
+			goto start;
+		}
+	}
 
 	parent = genpd->parent;
 	if (parent) {
+		mutex_unlock(&genpd->lock);
+
 		genpd_acquire_lock(parent);
 		mutex_lock_nested(&genpd->lock, SINGLE_DEPTH_NESTING);
-	} else {
-		mutex_lock(&genpd->lock);
+
+		if (genpd_abort_poweroff(genpd)) {
+			genpd_release_lock(parent);
+			goto out;
+		}
 	}
 
 	if (genpd->power_off)
 		genpd->power_off(genpd);
 
 	genpd->status = GPD_STATE_POWER_OFF;
-	wake_up_all(&genpd->status_wait_queue);
 
 	if (parent) {
 		genpd_sd_counter_dec(parent);
@@ -276,16 +307,17 @@ static int pm_genpd_poweroff(struct generic_pm_domain *genpd)
 		genpd_release_lock(parent);
 	}
 
-	return 0;
+ out:
+	genpd->poweroff_task = NULL;
+	wake_up_all(&genpd->status_wait_queue);
+	return ret;
 
  err_dev:
 	list_for_each_entry_continue(dle, &genpd->dev_list, node)
 		__pm_genpd_restore_device(dle, genpd);
 
-	genpd->status = GPD_STATE_ACTIVE;
-	wake_up_all(&genpd->status_wait_queue);
-
-	return ret;
+	genpd_set_active(genpd);
+	goto out;
 }
 
 /**
@@ -327,11 +359,11 @@ static int pm_genpd_runtime_suspend(struct device *dev)
 			return ret;
 	}
 
-	genpd_acquire_lock(genpd);
+	mutex_lock(&genpd->lock);
 	genpd->in_progress++;
 	pm_genpd_poweroff(genpd);
 	genpd->in_progress--;
-	genpd_release_lock(genpd);
+	mutex_unlock(&genpd->lock);
 
 	return 0;
 }
@@ -365,6 +397,7 @@ static void __pm_genpd_runtime_resume(struct device *dev,
 static int pm_genpd_runtime_resume(struct device *dev)
 {
 	struct generic_pm_domain *genpd;
+	DEFINE_WAIT(wait);
 	int ret;
 
 	dev_dbg(dev, "%s()\n", __func__);
@@ -377,12 +410,31 @@ static int pm_genpd_runtime_resume(struct device *dev)
 	if (ret)
 		return ret;
 
-	genpd_acquire_lock(genpd);
+	mutex_lock(&genpd->lock);
 	genpd->status = GPD_STATE_BUSY;
+	genpd->resume_count++;
+	for (;;) {
+		prepare_to_wait(&genpd->status_wait_queue, &wait,
+				TASK_UNINTERRUPTIBLE);
+		/*
+		 * If current is the powering off task, we have been called
+		 * reentrantly from one of the device callbacks, so we should
+		 * not wait.
+		 */
+		if (!genpd->poweroff_task || genpd->poweroff_task == current)
+			break;
+		mutex_unlock(&genpd->lock);
+
+		schedule();
+
+		mutex_lock(&genpd->lock);
+	}
+	finish_wait(&genpd->status_wait_queue, &wait);
 	__pm_genpd_runtime_resume(dev, genpd);
-	genpd->status = GPD_STATE_ACTIVE;
+	genpd->resume_count--;
+	genpd_set_active(genpd);
 	wake_up_all(&genpd->status_wait_queue);
-	genpd_release_lock(genpd);
+	mutex_unlock(&genpd->lock);
 
 	if (genpd->start_device)
 		genpd->start_device(dev);
@@ -1130,6 +1182,8 @@ void pm_genpd_init(struct generic_pm_domain *genpd,
 	genpd->sd_count = 0;
 	genpd->status = is_off ? GPD_STATE_POWER_OFF : GPD_STATE_ACTIVE;
 	init_waitqueue_head(&genpd->status_wait_queue);
+	genpd->poweroff_task = NULL;
+	genpd->resume_count = 0;
 	genpd->device_count = 0;
 	genpd->suspended_count = 0;
 	genpd->domain.ops.runtime_suspend = pm_genpd_runtime_suspend;
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index c71457cb8a79..feb80af4cc1b 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -14,6 +14,7 @@
 enum gpd_status {
 	GPD_STATE_ACTIVE = 0,	/* PM domain is active */
 	GPD_STATE_BUSY,		/* Something is happening to the PM domain */
+	GPD_STATE_REPEAT,	/* Power off in progress, to be repeated */
 	GPD_STATE_POWER_OFF,	/* PM domain is off */
 };
 
@@ -34,6 +35,8 @@ struct generic_pm_domain {
 	unsigned int sd_count;	/* Number of subdomains with power "on" */
 	enum gpd_status status;	/* Current state of the domain */
 	wait_queue_head_t status_wait_queue;
+	struct task_struct *poweroff_task;	/* Powering off task */
+	unsigned int resume_count;	/* Number of devices being resumed */
 	unsigned int device_count;	/* Number of devices */
 	unsigned int suspended_count;	/* System suspend device counter */
 	unsigned int prepared_count;	/* Suspend counter of prepared devices */
-- 
cgit v1.2.3


From a63fdc5156f2ef5690b6cf03d72b0c4917efbba7 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Tue, 14 Jun 2011 10:57:50 +1000
Subject: mm: Move definition of MIN_MEMORY_BLOCK_SIZE to a header

The macro MIN_MEMORY_BLOCK_SIZE is currently defined twice in two .c
files, and I need it in a third one to fix a powerpc bug, so let's
first move it into a header

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/init_64.c  | 3 +--
 drivers/base/memory.c  | 1 -
 include/linux/memory.h | 2 ++
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index d865c4aeec55..bbaaa005bf0e 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -28,6 +28,7 @@
 #include <linux/poison.h>
 #include <linux/dma-mapping.h>
 #include <linux/module.h>
+#include <linux/memory.h>
 #include <linux/memory_hotplug.h>
 #include <linux/nmi.h>
 #include <linux/gfp.h>
@@ -895,8 +896,6 @@ const char *arch_vma_name(struct vm_area_struct *vma)
 }
 
 #ifdef CONFIG_X86_UV
-#define MIN_MEMORY_BLOCK_SIZE   (1 << SECTION_SIZE_BITS)
-
 unsigned long memory_block_size_bytes(void)
 {
 	if (is_uv_system()) {
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 9f9b2359f718..45d7c8fc73bd 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -30,7 +30,6 @@
 static DEFINE_MUTEX(mem_sysfs_mutex);
 
 #define MEMORY_CLASS_NAME	"memory"
-#define MIN_MEMORY_BLOCK_SIZE	(1 << SECTION_SIZE_BITS)
 
 static int sections_per_block;
 
diff --git a/include/linux/memory.h b/include/linux/memory.h
index e1e3b2b84f85..935699b30b7c 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -20,6 +20,8 @@
 #include <linux/compiler.h>
 #include <linux/mutex.h>
 
+#define MIN_MEMORY_BLOCK_SIZE     (1 << SECTION_SIZE_BITS)
+
 struct memory_block {
 	unsigned long start_section_nr;
 	unsigned long end_section_nr;
-- 
cgit v1.2.3


From 4915a0de43c3e9aef92005c1f94a8ff3a6cfced5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 11 Jul 2011 20:08:34 -0700
Subject: net: introduce __netdev_alloc_skb_ip_align

RX rings should use GFP_KERNEL allocations if possible, add
__netdev_alloc_skb_ip_align() helper to ease this.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 32ada5351ab4..a24218c9c84b 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1579,16 +1579,22 @@ static inline struct sk_buff *netdev_alloc_skb(struct net_device *dev,
 	return __netdev_alloc_skb(dev, length, GFP_ATOMIC);
 }
 
-static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev,
-		unsigned int length)
+static inline struct sk_buff *__netdev_alloc_skb_ip_align(struct net_device *dev,
+		unsigned int length, gfp_t gfp)
 {
-	struct sk_buff *skb = netdev_alloc_skb(dev, length + NET_IP_ALIGN);
+	struct sk_buff *skb = __netdev_alloc_skb(dev, length + NET_IP_ALIGN, gfp);
 
 	if (NET_IP_ALIGN && skb)
 		skb_reserve(skb, NET_IP_ALIGN);
 	return skb;
 }
 
+static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev,
+		unsigned int length)
+{
+	return __netdev_alloc_skb_ip_align(dev, length, GFP_ATOMIC);
+}
+
 /**
  *	__netdev_alloc_page - allocate a page for ps-rx on a specific device
  *	@dev: network device to receive on
-- 
cgit v1.2.3


From f3393b62f157cc87f8d78247e97b87778dc077b8 Mon Sep 17 00:00:00 2001
From: Kevin Hilman <khilman@ti.com>
Date: Tue, 12 Jul 2011 11:17:09 +0200
Subject: PM / Runtime: Add new helper function: pm_runtime_status_suspended()

This boolean function simply returns whether or not the runtime status
of the device is 'suspended'.  Unlike pm_runtime_suspended(), this
function returns the runtime status whether or not runtime PM for the
device has been disabled or not.

Also add entry to Documentation/power/runtime.txt

Signed-off-by: Kevin Hilman <khilman@ti.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 Documentation/power/runtime_pm.txt | 3 +++
 include/linux/pm_runtime.h         | 6 ++++++
 2 files changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/power/runtime_pm.txt b/Documentation/power/runtime_pm.txt
index 40e47c75f064..14dd3c6ad97e 100644
--- a/Documentation/power/runtime_pm.txt
+++ b/Documentation/power/runtime_pm.txt
@@ -412,6 +412,9 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h:
     - return true if the device's runtime PM status is 'suspended' and its
       'power.disable_depth' field is equal to zero, or false otherwise
 
+  bool pm_runtime_status_suspended(struct device *dev);
+    - return true if the device's runtime PM status is 'suspended'
+
   void pm_runtime_allow(struct device *dev);
     - set the power.runtime_auto flag for the device and decrease its usage
       counter (used by the /sys/devices/.../power/control interface to
diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
index dfb8539ed686..daac05d751b2 100644
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -82,6 +82,11 @@ static inline bool pm_runtime_suspended(struct device *dev)
 		&& !dev->power.disable_depth;
 }
 
+static inline bool pm_runtime_status_suspended(struct device *dev)
+{
+	return dev->power.runtime_status == RPM_SUSPENDED;
+}
+
 static inline bool pm_runtime_enabled(struct device *dev)
 {
 	return !dev->power.disable_depth;
@@ -130,6 +135,7 @@ static inline void pm_runtime_put_noidle(struct device *dev) {}
 static inline bool device_run_wake(struct device *dev) { return false; }
 static inline void device_set_run_wake(struct device *dev, bool enable) {}
 static inline bool pm_runtime_suspended(struct device *dev) { return false; }
+static inline bool pm_runtime_status_suspended(struct device *dev) { return false; }
 static inline bool pm_runtime_enabled(struct device *dev) { return false; }
 
 static inline int pm_generic_runtime_idle(struct device *dev) { return 0; }
-- 
cgit v1.2.3


From 91e3d71db29d9b3b67b64e2a08e724a7ff538b4c Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Fri, 3 Jun 2011 08:51:05 +0200
Subject: KVM: Clarify KVM_ASSIGN_PCI_DEVICE documentation

Neither host_irq nor the guest_msi struct are used anymore today.
Tag the former, drop the latter to avoid confusion.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 Documentation/virtual/kvm/api.txt | 7 +------
 include/linux/kvm.h               | 8 +-------
 2 files changed, 2 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 2bd06b018b32..0ebe9225b171 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1143,15 +1143,10 @@ Assigns an IRQ to a passed-through device.
 
 struct kvm_assigned_irq {
 	__u32 assigned_dev_id;
-	__u32 host_irq;
+	__u32 host_irq; /* ignored (legacy field) */
 	__u32 guest_irq;
 	__u32 flags;
 	union {
-		struct {
-			__u32 addr_lo;
-			__u32 addr_hi;
-			__u32 data;
-		} guest_msi;
 		__u32 reserved[12];
 	};
 };
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 55ef181521ff..9c9ca7c7b491 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -773,20 +773,14 @@ struct kvm_assigned_pci_dev {
 
 struct kvm_assigned_irq {
 	__u32 assigned_dev_id;
-	__u32 host_irq;
+	__u32 host_irq; /* ignored (legacy field) */
 	__u32 guest_irq;
 	__u32 flags;
 	union {
-		struct {
-			__u32 addr_lo;
-			__u32 addr_hi;
-			__u32 data;
-		} guest_msi;
 		__u32 reserved[12];
 	};
 };
 
-
 struct kvm_assigned_msix_nr {
 	__u32 assigned_dev_id;
 	__u16 entry_nr;
-- 
cgit v1.2.3


From de56a948b9182fbcf92cb8212f114de096c2d574 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 29 Jun 2011 00:21:34 +0000
Subject: KVM: PPC: Add support for Book3S processors in hypervisor mode

This adds support for KVM running on 64-bit Book 3S processors,
specifically POWER7, in hypervisor mode.  Using hypervisor mode means
that the guest can use the processor's supervisor mode.  That means
that the guest can execute privileged instructions and access privileged
registers itself without trapping to the host.  This gives excellent
performance, but does mean that KVM cannot emulate a processor
architecture other than the one that the hardware implements.

This code assumes that the guest is running paravirtualized using the
PAPR (Power Architecture Platform Requirements) interface, which is the
interface that IBM's PowerVM hypervisor uses.  That means that existing
Linux distributions that run on IBM pSeries machines will also run
under KVM without modification.  In order to communicate the PAPR
hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
to include/linux/kvm.h.

Currently the choice between book3s_hv support and book3s_pr support
(i.e. the existing code, which runs the guest in user mode) has to be
made at kernel configuration time, so a given kernel binary can only
do one or the other.

This new book3s_hv code doesn't support MMIO emulation at present.
Since we are running paravirtualized guests, this isn't a serious
restriction.

With the guest running in supervisor mode, most exceptions go straight
to the guest.  We will never get data or instruction storage or segment
interrupts, alignment interrupts, decrementer interrupts, program
interrupts, single-step interrupts, etc., coming to the hypervisor from
the guest.  Therefore this introduces a new KVMTEST_NONHV macro for the
exception entry path so that we don't have to do the KVM test on entry
to those exception handlers.

We do however get hypervisor decrementer, hypervisor data storage,
hypervisor instruction storage, and hypervisor emulation assist
interrupts, so we have to handle those.

In hypervisor mode, real-mode accesses can access all of RAM, not just
a limited amount.  Therefore we put all the guest state in the vcpu.arch
and use the shadow_vcpu in the PACA only for temporary scratch space.
We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
We don't have a shared page with the guest, but we still need a
kvm_vcpu_arch_shared struct to store the values of various registers,
so we include one in the vcpu_arch struct.

The POWER7 processor has a restriction that all threads in a core have
to be in the same partition.  MMU-on kernel code counts as a partition
(partition 0), so we have to do a partition switch on every entry to and
exit from the guest.  At present we require the host and guest to run
in single-thread mode because of this hardware restriction.

This code allocates a hashed page table for the guest and initializes
it with HPTEs for the guest's Virtual Real Memory Area (VRMA).  We
require that the guest memory is allocated using 16MB huge pages, in
order to simplify the low-level memory management.  This also means that
we can get away without tracking paging activity in the host for now,
since huge pages can't be paged or swapped.

This also adds a few new exports needed by the book3s_hv code.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 Documentation/virtual/kvm/api.txt         |  17 +
 arch/powerpc/include/asm/exception-64s.h  |  19 +-
 arch/powerpc/include/asm/kvm_asm.h        |   4 +
 arch/powerpc/include/asm/kvm_book3s.h     | 137 ++++-
 arch/powerpc/include/asm/kvm_book3s_64.h  |   2 +
 arch/powerpc/include/asm/kvm_book3s_asm.h |  12 +
 arch/powerpc/include/asm/kvm_booke.h      |   4 +
 arch/powerpc/include/asm/kvm_host.h       |  53 +-
 arch/powerpc/include/asm/kvm_ppc.h        |   6 +
 arch/powerpc/include/asm/mmu-hash64.h     |  10 +-
 arch/powerpc/include/asm/paca.h           |   2 +
 arch/powerpc/include/asm/reg.h            |   4 +
 arch/powerpc/kernel/asm-offsets.c         |  79 +++
 arch/powerpc/kernel/exceptions-64s.S      |  60 +--
 arch/powerpc/kernel/process.c             |   3 +
 arch/powerpc/kernel/setup-common.c        |   3 +
 arch/powerpc/kernel/smp.c                 |   1 +
 arch/powerpc/kvm/Kconfig                  |  37 +-
 arch/powerpc/kvm/Makefile                 |  17 +-
 arch/powerpc/kvm/book3s_64_mmu_hv.c       | 258 ++++++++++
 arch/powerpc/kvm/book3s_exports.c         |   5 +
 arch/powerpc/kvm/book3s_hv.c              | 445 +++++++++++++++++
 arch/powerpc/kvm/book3s_hv_interrupts.S   | 136 +++++
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   | 806 ++++++++++++++++++++++++++++++
 arch/powerpc/kvm/book3s_segment.S         |  34 +-
 arch/powerpc/kvm/powerpc.c                |  22 +-
 arch/powerpc/kvm/trace.h                  |   2 +-
 include/linux/kvm.h                       |   6 +
 28 files changed, 2103 insertions(+), 81 deletions(-)
 create mode 100644 arch/powerpc/kvm/book3s_64_mmu_hv.c
 create mode 100644 arch/powerpc/kvm/book3s_hv.c
 create mode 100644 arch/powerpc/kvm/book3s_hv_interrupts.S
 create mode 100644 arch/powerpc/kvm/book3s_hv_rmhandlers.S

(limited to 'include/linux')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index b2511360b8f5..e8875fef3eb8 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1532,6 +1532,23 @@ Userspace can now handle the hypercall and when it's done modify the gprs as
 necessary. Upon guest entry all guest GPRs will then be replaced by the values
 in this struct.
 
+		/* KVM_EXIT_PAPR_HCALL */
+		struct {
+			__u64 nr;
+			__u64 ret;
+			__u64 args[9];
+		} papr_hcall;
+
+This is used on 64-bit PowerPC when emulating a pSeries partition,
+e.g. with the 'pseries' machine type in qemu.  It occurs when the
+guest does a hypercall using the 'sc 1' instruction.  The 'nr' field
+contains the hypercall number (from the guest R3), and 'args' contains
+the arguments (from the guest R4 - R12).  Userspace should put the
+return code in 'ret' and any extra returned values in args[].
+The possible hypercalls are defined in the Power Architecture Platform
+Requirements (PAPR) document available from www.power.org (free
+developer registration required to access it).
+
 		/* Fix the size of the union. */
 		char padding[256];
 	};
diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index 296c9b66c04a..69435da8f2ba 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -134,6 +134,17 @@ do_kvm_##n:								\
 #define KVM_HANDLER_SKIP(area, h, n)
 #endif
 
+#ifdef CONFIG_KVM_BOOK3S_PR
+#define KVMTEST_PR(n)			__KVMTEST(n)
+#define KVM_HANDLER_PR(area, h, n)	__KVM_HANDLER(area, h, n)
+#define KVM_HANDLER_PR_SKIP(area, h, n)	__KVM_HANDLER_SKIP(area, h, n)
+
+#else
+#define KVMTEST_PR(n)
+#define KVM_HANDLER_PR(area, h, n)
+#define KVM_HANDLER_PR_SKIP(area, h, n)
+#endif
+
 #define NOTEST(n)
 
 /*
@@ -210,7 +221,7 @@ label##_pSeries:					\
 	HMT_MEDIUM;					\
 	SET_SCRATCH0(r13);		/* save r13 */		\
 	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common,	\
-				 EXC_STD, KVMTEST, vec)
+				 EXC_STD, KVMTEST_PR, vec)
 
 #define STD_EXCEPTION_HV(loc, vec, label)		\
 	. = loc;					\
@@ -227,8 +238,8 @@ label##_hv:						\
 	beq	masked_##h##interrupt
 #define _SOFTEN_TEST(h)	__SOFTEN_TEST(h)
 
-#define SOFTEN_TEST(vec)						\
-	KVMTEST(vec);							\
+#define SOFTEN_TEST_PR(vec)						\
+	KVMTEST_PR(vec);						\
 	_SOFTEN_TEST(EXC_STD)
 
 #define SOFTEN_TEST_HV(vec)						\
@@ -248,7 +259,7 @@ label##_hv:						\
 	.globl label##_pSeries;						\
 label##_pSeries:							\
 	_MASKABLE_EXCEPTION_PSERIES(vec, label,				\
-				    EXC_STD, SOFTEN_TEST)
+				    EXC_STD, SOFTEN_TEST_PR)
 
 #define MASKABLE_EXCEPTION_HV(loc, vec, label)				\
 	. = loc;							\
diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h
index 0951b17f4eb5..7b1f0e0fc653 100644
--- a/arch/powerpc/include/asm/kvm_asm.h
+++ b/arch/powerpc/include/asm/kvm_asm.h
@@ -64,8 +64,12 @@
 #define BOOK3S_INTERRUPT_PROGRAM	0x700
 #define BOOK3S_INTERRUPT_FP_UNAVAIL	0x800
 #define BOOK3S_INTERRUPT_DECREMENTER	0x900
+#define BOOK3S_INTERRUPT_HV_DECREMENTER	0x980
 #define BOOK3S_INTERRUPT_SYSCALL	0xc00
 #define BOOK3S_INTERRUPT_TRACE		0xd00
+#define BOOK3S_INTERRUPT_H_DATA_STORAGE	0xe00
+#define BOOK3S_INTERRUPT_H_INST_STORAGE	0xe20
+#define BOOK3S_INTERRUPT_H_EMUL_ASSIST	0xe40
 #define BOOK3S_INTERRUPT_PERFMON	0xf00
 #define BOOK3S_INTERRUPT_ALTIVEC	0xf20
 #define BOOK3S_INTERRUPT_VSX		0xf40
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 480fff6090db..5537c45d626c 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -116,6 +116,7 @@ extern void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 new_msr);
 extern void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr);
 extern void kvmppc_mmu_book3s_64_init(struct kvm_vcpu *vcpu);
 extern void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu);
+extern void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu);
 extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte);
 extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr);
 extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu);
@@ -127,10 +128,12 @@ extern int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu);
 extern void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte);
 extern int kvmppc_mmu_hpte_sysinit(void);
 extern void kvmppc_mmu_hpte_sysexit(void);
+extern int kvmppc_mmu_hv_init(void);
 
 extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
 extern int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
 extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec);
+extern void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags);
 extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat,
 			   bool upper, u32 val);
 extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr);
@@ -140,6 +143,7 @@ extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
 extern void kvmppc_handler_lowmem_trampoline(void);
 extern void kvmppc_handler_trampoline_enter(void);
 extern void kvmppc_rmcall(ulong srr0, ulong srr1);
+extern void kvmppc_hv_entry_trampoline(void);
 extern void kvmppc_load_up_fpu(void);
 extern void kvmppc_load_up_altivec(void);
 extern void kvmppc_load_up_vsx(void);
@@ -151,6 +155,19 @@ static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu)
 	return container_of(vcpu, struct kvmppc_vcpu_book3s, vcpu);
 }
 
+extern void kvm_return_point(void);
+
+/* Also add subarch specific defines */
+
+#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
+#include <asm/kvm_book3s_32.h>
+#endif
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+#include <asm/kvm_book3s_64.h>
+#endif
+
+#ifdef CONFIG_KVM_BOOK3S_PR
+
 static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu)
 {
 	return to_book3s(vcpu)->hior;
@@ -165,16 +182,6 @@ static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
 		vcpu->arch.shared->int_pending = 0;
 }
 
-static inline ulong dsisr(void)
-{
-	ulong r;
-	asm ( "mfdsisr %0 " : "=r" (r) );
-	return r;
-}
-
-extern void kvm_return_point(void);
-static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu);
-
 static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
 {
 	if ( num < 14 ) {
@@ -281,6 +288,108 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
 
 	return crit;
 }
+#else /* CONFIG_KVM_BOOK3S_PR */
+
+static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu)
+{
+	return 0;
+}
+
+static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
+			unsigned long pending_now, unsigned long old_pending)
+{
+	/* Recalculate LPCR:MER based on the presence of
+	 * a pending external interrupt
+	 */
+	if (test_bit(BOOK3S_IRQPRIO_EXTERNAL, &pending_now) ||
+	    test_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, &pending_now))
+		vcpu->arch.lpcr |= LPCR_MER;
+	else
+		vcpu->arch.lpcr &= ~((u64)LPCR_MER);
+}
+
+static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
+{
+	vcpu->arch.gpr[num] = val;
+}
+
+static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num)
+{
+	return vcpu->arch.gpr[num];
+}
+
+static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
+{
+	vcpu->arch.cr = val;
+}
+
+static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.cr;
+}
+
+static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val)
+{
+	vcpu->arch.xer = val;
+}
+
+static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.xer;
+}
+
+static inline void kvmppc_set_ctr(struct kvm_vcpu *vcpu, ulong val)
+{
+	vcpu->arch.ctr = val;
+}
+
+static inline ulong kvmppc_get_ctr(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.ctr;
+}
+
+static inline void kvmppc_set_lr(struct kvm_vcpu *vcpu, ulong val)
+{
+	vcpu->arch.lr = val;
+}
+
+static inline ulong kvmppc_get_lr(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.lr;
+}
+
+static inline void kvmppc_set_pc(struct kvm_vcpu *vcpu, ulong val)
+{
+	vcpu->arch.pc = val;
+}
+
+static inline ulong kvmppc_get_pc(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.pc;
+}
+
+static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu)
+{
+	ulong pc = kvmppc_get_pc(vcpu);
+
+	/* Load the instruction manually if it failed to do so in the
+	 * exit path */
+	if (vcpu->arch.last_inst == KVM_INST_FETCH_FAILED)
+		kvmppc_ld(vcpu, &pc, sizeof(u32), &vcpu->arch.last_inst, false);
+
+	return vcpu->arch.last_inst;
+}
+
+static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.fault_dar;
+}
+
+static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
+{
+	return false;
+}
+#endif
 
 /* Magic register values loaded into r3 and r4 before the 'sc' assembly
  * instruction for the OSI hypercalls */
@@ -289,12 +398,4 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
 
 #define INS_DCBZ			0x7c0007ec
 
-/* Also add subarch specific defines */
-
-#ifdef CONFIG_PPC_BOOK3S_32
-#include <asm/kvm_book3s_32.h>
-#else
-#include <asm/kvm_book3s_64.h>
-#endif
-
 #endif /* __ASM_KVM_BOOK3S_H__ */
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 4cadd612d575..5f73388ea0af 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -20,9 +20,11 @@
 #ifndef __ASM_KVM_BOOK3S_64_H__
 #define __ASM_KVM_BOOK3S_64_H__
 
+#ifdef CONFIG_KVM_BOOK3S_PR
 static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu)
 {
 	return &get_paca()->shadow_vcpu;
 }
+#endif
 
 #endif /* __ASM_KVM_BOOK3S_64_H__ */
diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index 312617529864..b7b039532fbc 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -70,10 +70,22 @@ kvmppc_resume_\intno:
 struct kvmppc_host_state {
 	ulong host_r1;
 	ulong host_r2;
+	ulong host_msr;
 	ulong vmhandler;
 	ulong scratch0;
 	ulong scratch1;
 	u8 in_guest;
+
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+	struct kvm_vcpu *kvm_vcpu;
+	u64 dabr;
+	u64 host_mmcr[3];
+	u32 host_pmc[6];
+	u64 host_purr;
+	u64 host_spurr;
+	u64 host_dscr;
+	u64 dec_expires;
+#endif
 };
 
 struct kvmppc_book3s_shadow_vcpu {
diff --git a/arch/powerpc/include/asm/kvm_booke.h b/arch/powerpc/include/asm/kvm_booke.h
index 9c9ba3d59b1b..a90e09188777 100644
--- a/arch/powerpc/include/asm/kvm_booke.h
+++ b/arch/powerpc/include/asm/kvm_booke.h
@@ -93,4 +93,8 @@ static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
 	return vcpu->arch.fault_dear;
 }
 
+static inline ulong kvmppc_get_msr(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.shared->msr;
+}
 #endif /* __ASM_KVM_BOOKE_H__ */
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 069eb9fc6c41..4a3f790d5fc4 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -33,7 +33,9 @@
 /* memory slots that does not exposed to userspace */
 #define KVM_PRIVATE_MEM_SLOTS 4
 
+#ifdef CONFIG_KVM_MMIO
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
+#endif
 
 /* We don't currently support large pages. */
 #define KVM_HPAGE_GFN_SHIFT(x)	0
@@ -133,7 +135,26 @@ struct kvmppc_exit_timing {
 	};
 };
 
+struct kvmppc_pginfo {
+	unsigned long pfn;
+	atomic_t refcnt;
+};
+
 struct kvm_arch {
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+	unsigned long hpt_virt;
+	unsigned long ram_npages;
+	unsigned long ram_psize;
+	unsigned long ram_porder;
+	struct kvmppc_pginfo *ram_pginfo;
+	unsigned int lpid;
+	unsigned int host_lpid;
+	unsigned long host_lpcr;
+	unsigned long sdr1;
+	unsigned long host_sdr1;
+	int tlbie_lock;
+	unsigned short last_vcpu[NR_CPUS];
+#endif /* CONFIG_KVM_BOOK3S_64_HV */
 };
 
 struct kvmppc_pte {
@@ -190,7 +211,7 @@ struct kvm_vcpu_arch {
 	ulong rmcall;
 	ulong host_paca_phys;
 	struct kvmppc_slb slb[64];
-	int slb_max;		/* # valid entries in slb[] */
+	int slb_max;		/* 1 + index of last valid entry in slb[] */
 	int slb_nr;		/* total number of entries in SLB */
 	struct kvmppc_mmu mmu;
 #endif
@@ -212,7 +233,7 @@ struct kvm_vcpu_arch {
 #endif
 
 #ifdef CONFIG_VSX
-	u64 vsr[32];
+	u64 vsr[64];
 #endif
 
 #ifdef CONFIG_PPC_BOOK3S
@@ -220,18 +241,24 @@ struct kvm_vcpu_arch {
 	u32 qpr[32];
 #endif
 
-#ifdef CONFIG_BOOKE
 	ulong pc;
 	ulong ctr;
 	ulong lr;
 
 	ulong xer;
 	u32 cr;
-#endif
 
 #ifdef CONFIG_PPC_BOOK3S
 	ulong hflags;
 	ulong guest_owned_ext;
+	ulong purr;
+	ulong spurr;
+	ulong lpcr;
+	ulong dscr;
+	ulong amr;
+	ulong uamor;
+	u32 ctrl;
+	ulong dabr;
 #endif
 	u32 vrsave; /* also USPRG0 */
 	u32 mmucr;
@@ -270,6 +297,9 @@ struct kvm_vcpu_arch {
 	u32 dbcr1;
 	u32 dbsr;
 
+	u64 mmcr[3];
+	u32 pmc[6];
+
 #ifdef CONFIG_KVM_EXIT_TIMING
 	struct mutex exit_timing_lock;
 	struct kvmppc_exit_timing timing_exit;
@@ -284,8 +314,12 @@ struct kvm_vcpu_arch {
 	struct dentry *debugfs_exit_timing;
 #endif
 
+#ifdef CONFIG_PPC_BOOK3S
+	ulong fault_dar;
+	u32 fault_dsisr;
+#endif
+
 #ifdef CONFIG_BOOKE
-	u32 last_inst;
 	ulong fault_dear;
 	ulong fault_esr;
 	ulong queued_dear;
@@ -300,16 +334,25 @@ struct kvm_vcpu_arch {
 	u8 dcr_is_write;
 	u8 osi_needed;
 	u8 osi_enabled;
+	u8 hcall_needed;
 
 	u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */
 
 	struct hrtimer dec_timer;
 	struct tasklet_struct tasklet;
 	u64 dec_jiffies;
+	u64 dec_expires;
 	unsigned long pending_exceptions;
+	u16 last_cpu;
+	u32 last_inst;
+	int trap;
 	struct kvm_vcpu_arch_shared *shared;
 	unsigned long magic_page_pa; /* phys addr to map the magic page to */
 	unsigned long magic_page_ea; /* effect. addr to map the magic page to */
+
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+	struct kvm_vcpu_arch_shared shregs;
+#endif
 };
 
 #endif /* __POWERPC_KVM_HOST_H__ */
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 48b7ab76de2d..0dafd53c30ed 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -112,6 +112,12 @@ extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu);
 extern int kvmppc_kvm_pv(struct kvm_vcpu *vcpu);
 extern void kvmppc_map_magic(struct kvm_vcpu *vcpu);
 
+extern long kvmppc_alloc_hpt(struct kvm *kvm);
+extern void kvmppc_free_hpt(struct kvm *kvm);
+extern long kvmppc_prepare_vrma(struct kvm *kvm,
+				struct kvm_userspace_memory_region *mem);
+extern void kvmppc_map_vrma(struct kvm *kvm,
+			    struct kvm_userspace_memory_region *mem);
 extern int kvmppc_core_init_vm(struct kvm *kvm);
 extern void kvmppc_core_destroy_vm(struct kvm *kvm);
 extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
index d865bd909c7d..b445e0af4c2b 100644
--- a/arch/powerpc/include/asm/mmu-hash64.h
+++ b/arch/powerpc/include/asm/mmu-hash64.h
@@ -90,13 +90,19 @@ extern char initial_stab[];
 
 #define HPTE_R_PP0		ASM_CONST(0x8000000000000000)
 #define HPTE_R_TS		ASM_CONST(0x4000000000000000)
+#define HPTE_R_KEY_HI		ASM_CONST(0x3000000000000000)
 #define HPTE_R_RPN_SHIFT	12
-#define HPTE_R_RPN		ASM_CONST(0x3ffffffffffff000)
-#define HPTE_R_FLAGS		ASM_CONST(0x00000000000003ff)
+#define HPTE_R_RPN		ASM_CONST(0x0ffffffffffff000)
 #define HPTE_R_PP		ASM_CONST(0x0000000000000003)
 #define HPTE_R_N		ASM_CONST(0x0000000000000004)
+#define HPTE_R_G		ASM_CONST(0x0000000000000008)
+#define HPTE_R_M		ASM_CONST(0x0000000000000010)
+#define HPTE_R_I		ASM_CONST(0x0000000000000020)
+#define HPTE_R_W		ASM_CONST(0x0000000000000040)
+#define HPTE_R_WIMG		ASM_CONST(0x0000000000000078)
 #define HPTE_R_C		ASM_CONST(0x0000000000000080)
 #define HPTE_R_R		ASM_CONST(0x0000000000000100)
+#define HPTE_R_KEY_LO		ASM_CONST(0x0000000000000e00)
 
 #define HPTE_V_1TB_SEG		ASM_CONST(0x4000000000000000)
 #define HPTE_V_VRMA_MASK	ASM_CONST(0x4001ffffff000000)
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 58f4a18ef60c..a6da12859959 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -147,8 +147,10 @@ struct paca_struct {
 	struct dtl_entry *dtl_curr;	/* pointer corresponding to dtl_ridx */
 
 #ifdef CONFIG_KVM_BOOK3S_HANDLER
+#ifdef CONFIG_KVM_BOOK3S_PR
 	/* We use this to store guest state in */
 	struct kvmppc_book3s_shadow_vcpu shadow_vcpu;
+#endif
 	struct kvmppc_host_state kvm_hstate;
 #endif
 };
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index d879a6b91635..36a611b398c5 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -189,6 +189,9 @@
 #define SPRN_CTR	0x009	/* Count Register */
 #define SPRN_DSCR	0x11
 #define SPRN_CFAR	0x1c	/* Come From Address Register */
+#define SPRN_AMR	0x1d	/* Authority Mask Register */
+#define SPRN_UAMOR	0x9d	/* User Authority Mask Override Register */
+#define SPRN_AMOR	0x15d	/* Authority Mask Override Register */
 #define SPRN_ACOP	0x1F	/* Available Coprocessor Register */
 #define SPRN_CTRLF	0x088
 #define SPRN_CTRLT	0x098
@@ -252,6 +255,7 @@
 #define   LPCR_RMI     0x00000002      /* real mode is cache inhibit */
 #define   LPCR_HDICE   0x00000001      /* Hyp Decr enable (HV,PR,EE) */
 #define SPRN_LPID	0x13F	/* Logical Partition Identifier */
+#define   LPID_RSVD	0x3ff		/* Reserved LPID for partn switching */
 #define	SPRN_HMER	0x150	/* Hardware m? error recovery */
 #define	SPRN_HMEER	0x151	/* Hardware m? enable error recovery */
 #define	SPRN_HEIR	0x153	/* Hypervisor Emulated Instruction Register */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index dabfb7346f36..936267462cae 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -187,6 +187,7 @@ int main(void)
 	DEFINE(LPPACASRR1, offsetof(struct lppaca, saved_srr1));
 	DEFINE(LPPACAANYINT, offsetof(struct lppaca, int_dword.any_int));
 	DEFINE(LPPACADECRINT, offsetof(struct lppaca, int_dword.fields.decr_int));
+	DEFINE(LPPACA_PMCINUSE, offsetof(struct lppaca, pmcregs_in_use));
 	DEFINE(LPPACA_DTLIDX, offsetof(struct lppaca, dtl_idx));
 	DEFINE(PACA_DTL_RIDX, offsetof(struct paca_struct, dtl_ridx));
 #endif /* CONFIG_PPC_STD_MMU_64 */
@@ -392,6 +393,29 @@ int main(void)
 	DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid));
 	DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr));
 	DEFINE(VCPU_VRSAVE, offsetof(struct kvm_vcpu, arch.vrsave));
+	DEFINE(VCPU_FPRS, offsetof(struct kvm_vcpu, arch.fpr));
+	DEFINE(VCPU_FPSCR, offsetof(struct kvm_vcpu, arch.fpscr));
+#ifdef CONFIG_ALTIVEC
+	DEFINE(VCPU_VRS, offsetof(struct kvm_vcpu, arch.vr));
+	DEFINE(VCPU_VSCR, offsetof(struct kvm_vcpu, arch.vscr));
+#endif
+#ifdef CONFIG_VSX
+	DEFINE(VCPU_VSRS, offsetof(struct kvm_vcpu, arch.vsr));
+#endif
+	DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer));
+	DEFINE(VCPU_CTR, offsetof(struct kvm_vcpu, arch.ctr));
+	DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr));
+	DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
+	DEFINE(VCPU_PC, offsetof(struct kvm_vcpu, arch.pc));
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+	DEFINE(VCPU_MSR, offsetof(struct kvm_vcpu, arch.shregs.msr));
+	DEFINE(VCPU_SRR0, offsetof(struct kvm_vcpu, arch.shregs.srr0));
+	DEFINE(VCPU_SRR1, offsetof(struct kvm_vcpu, arch.shregs.srr1));
+	DEFINE(VCPU_SPRG0, offsetof(struct kvm_vcpu, arch.shregs.sprg0));
+	DEFINE(VCPU_SPRG1, offsetof(struct kvm_vcpu, arch.shregs.sprg1));
+	DEFINE(VCPU_SPRG2, offsetof(struct kvm_vcpu, arch.shregs.sprg2));
+	DEFINE(VCPU_SPRG3, offsetof(struct kvm_vcpu, arch.shregs.sprg3));
+#endif
 	DEFINE(VCPU_SPRG4, offsetof(struct kvm_vcpu, arch.sprg4));
 	DEFINE(VCPU_SPRG5, offsetof(struct kvm_vcpu, arch.sprg5));
 	DEFINE(VCPU_SPRG6, offsetof(struct kvm_vcpu, arch.sprg6));
@@ -403,17 +427,60 @@ int main(void)
 	DEFINE(VCPU_SHADOW_MSR, offsetof(struct kvm_vcpu, arch.shadow_msr));
 
 	/* book3s */
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+	DEFINE(KVM_LPID, offsetof(struct kvm, arch.lpid));
+	DEFINE(KVM_SDR1, offsetof(struct kvm, arch.sdr1));
+	DEFINE(KVM_HOST_LPID, offsetof(struct kvm, arch.host_lpid));
+	DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr));
+	DEFINE(KVM_HOST_SDR1, offsetof(struct kvm, arch.host_sdr1));
+	DEFINE(KVM_TLBIE_LOCK, offsetof(struct kvm, arch.tlbie_lock));
+	DEFINE(KVM_ONLINE_CPUS, offsetof(struct kvm, online_vcpus.counter));
+	DEFINE(KVM_LAST_VCPU, offsetof(struct kvm, arch.last_vcpu));
+	DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr));
+	DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar));
+#endif
 #ifdef CONFIG_PPC_BOOK3S
+	DEFINE(VCPU_KVM, offsetof(struct kvm_vcpu, kvm));
+	DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id));
 	DEFINE(VCPU_HOST_RETIP, offsetof(struct kvm_vcpu, arch.host_retip));
 	DEFINE(VCPU_HOST_MSR, offsetof(struct kvm_vcpu, arch.host_msr));
+	DEFINE(VCPU_PURR, offsetof(struct kvm_vcpu, arch.purr));
+	DEFINE(VCPU_SPURR, offsetof(struct kvm_vcpu, arch.spurr));
+	DEFINE(VCPU_DSCR, offsetof(struct kvm_vcpu, arch.dscr));
+	DEFINE(VCPU_AMR, offsetof(struct kvm_vcpu, arch.amr));
+	DEFINE(VCPU_UAMOR, offsetof(struct kvm_vcpu, arch.uamor));
+	DEFINE(VCPU_CTRL, offsetof(struct kvm_vcpu, arch.ctrl));
+	DEFINE(VCPU_DABR, offsetof(struct kvm_vcpu, arch.dabr));
 	DEFINE(VCPU_TRAMPOLINE_LOWMEM, offsetof(struct kvm_vcpu, arch.trampoline_lowmem));
 	DEFINE(VCPU_TRAMPOLINE_ENTER, offsetof(struct kvm_vcpu, arch.trampoline_enter));
 	DEFINE(VCPU_HIGHMEM_HANDLER, offsetof(struct kvm_vcpu, arch.highmem_handler));
 	DEFINE(VCPU_RMCALL, offsetof(struct kvm_vcpu, arch.rmcall));
 	DEFINE(VCPU_HFLAGS, offsetof(struct kvm_vcpu, arch.hflags));
+	DEFINE(VCPU_DEC, offsetof(struct kvm_vcpu, arch.dec));
+	DEFINE(VCPU_DEC_EXPIRES, offsetof(struct kvm_vcpu, arch.dec_expires));
+	DEFINE(VCPU_LPCR, offsetof(struct kvm_vcpu, arch.lpcr));
+	DEFINE(VCPU_MMCR, offsetof(struct kvm_vcpu, arch.mmcr));
+	DEFINE(VCPU_PMC, offsetof(struct kvm_vcpu, arch.pmc));
+	DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb));
+	DEFINE(VCPU_SLB_MAX, offsetof(struct kvm_vcpu, arch.slb_max));
+	DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr));
+	DEFINE(VCPU_LAST_CPU, offsetof(struct kvm_vcpu, arch.last_cpu));
+	DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr));
+	DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar));
+	DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
+	DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap));
+	DEFINE(VCPU_SVCPU, offsetof(struct kvmppc_vcpu_book3s, shadow_vcpu) -
+			   offsetof(struct kvmppc_vcpu_book3s, vcpu));
+	DEFINE(VCPU_SLB_E, offsetof(struct kvmppc_slb, orige));
+	DEFINE(VCPU_SLB_V, offsetof(struct kvmppc_slb, origv));
+	DEFINE(VCPU_SLB_SIZE, sizeof(struct kvmppc_slb));
 
 #ifdef CONFIG_PPC_BOOK3S_64
+#ifdef CONFIG_KVM_BOOK3S_PR
 # define SVCPU_FIELD(x, f)	DEFINE(x, offsetof(struct paca_struct, shadow_vcpu.f))
+#else
+# define SVCPU_FIELD(x, f)
+#endif
 # define HSTATE_FIELD(x, f)	DEFINE(x, offsetof(struct paca_struct, kvm_hstate.f))
 #else	/* 32-bit */
 # define SVCPU_FIELD(x, f)	DEFINE(x, offsetof(struct kvmppc_book3s_shadow_vcpu, f))
@@ -453,11 +520,23 @@ int main(void)
 
 	HSTATE_FIELD(HSTATE_HOST_R1, host_r1);
 	HSTATE_FIELD(HSTATE_HOST_R2, host_r2);
+	HSTATE_FIELD(HSTATE_HOST_MSR, host_msr);
 	HSTATE_FIELD(HSTATE_VMHANDLER, vmhandler);
 	HSTATE_FIELD(HSTATE_SCRATCH0, scratch0);
 	HSTATE_FIELD(HSTATE_SCRATCH1, scratch1);
 	HSTATE_FIELD(HSTATE_IN_GUEST, in_guest);
 
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+	HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu);
+	HSTATE_FIELD(HSTATE_MMCR, host_mmcr);
+	HSTATE_FIELD(HSTATE_PMC, host_pmc);
+	HSTATE_FIELD(HSTATE_PURR, host_purr);
+	HSTATE_FIELD(HSTATE_SPURR, host_spurr);
+	HSTATE_FIELD(HSTATE_DSCR, host_dscr);
+	HSTATE_FIELD(HSTATE_DABR, dabr);
+	HSTATE_FIELD(HSTATE_DECEXP, dec_expires);
+#endif /* CONFIG_KVM_BOOK3S_64_HV */
+
 #else /* CONFIG_PPC_BOOK3S */
 	DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
 	DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer));
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 6da00550afea..163c041cec24 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -87,14 +87,14 @@ data_access_not_stab:
 END_MMU_FTR_SECTION_IFCLR(MMU_FTR_SLB)
 #endif
 	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, data_access_common, EXC_STD,
-				 KVMTEST, 0x300)
+				 KVMTEST_PR, 0x300)
 
 	. = 0x380
 	.globl data_access_slb_pSeries
 data_access_slb_pSeries:
 	HMT_MEDIUM
 	SET_SCRATCH0(r13)
-	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x380)
+	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x380)
 	std	r3,PACA_EXSLB+EX_R3(r13)
 	mfspr	r3,SPRN_DAR
 #ifdef __DISABLED__
@@ -125,7 +125,7 @@ data_access_slb_pSeries:
 instruction_access_slb_pSeries:
 	HMT_MEDIUM
 	SET_SCRATCH0(r13)
-	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x480)
+	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x480)
 	std	r3,PACA_EXSLB+EX_R3(r13)
 	mfspr	r3,SPRN_SRR0		/* SRR0 is faulting address */
 #ifdef __DISABLED__
@@ -153,32 +153,32 @@ instruction_access_slb_pSeries:
 hardware_interrupt_pSeries:
 hardware_interrupt_hv:
 	BEGIN_FTR_SECTION
-		_MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt,
-					    EXC_STD, SOFTEN_TEST)
-		KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x500)
-	FTR_SECTION_ELSE
 		_MASKABLE_EXCEPTION_PSERIES(0x502, hardware_interrupt,
 					    EXC_HV, SOFTEN_TEST_HV)
 		KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x502)
-	ALT_FTR_SECTION_END_IFCLR(CPU_FTR_HVMODE_206)
+	FTR_SECTION_ELSE
+		_MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt,
+					    EXC_STD, SOFTEN_TEST_PR)
+		KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x500)
+	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE_206)
 
 	STD_EXCEPTION_PSERIES(0x600, 0x600, alignment)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x600)
+	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x600)
 
 	STD_EXCEPTION_PSERIES(0x700, 0x700, program_check)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x700)
+	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x700)
 
 	STD_EXCEPTION_PSERIES(0x800, 0x800, fp_unavailable)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x800)
+	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x800)
 
 	MASKABLE_EXCEPTION_PSERIES(0x900, 0x900, decrementer)
 	MASKABLE_EXCEPTION_HV(0x980, 0x982, decrementer)
 
 	STD_EXCEPTION_PSERIES(0xa00, 0xa00, trap_0a)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xa00)
+	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xa00)
 
 	STD_EXCEPTION_PSERIES(0xb00, 0xb00, trap_0b)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xb00)
+	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xb00)
 
 	. = 0xc00
 	.globl	system_call_pSeries
@@ -219,7 +219,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)
 	b	.
 
 	STD_EXCEPTION_PSERIES(0xd00, 0xd00, single_step)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xd00)
+	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xd00)
 
 	/* At 0xe??? we have a bunch of hypervisor exceptions, we branch
 	 * out of line to handle them
@@ -254,23 +254,23 @@ vsx_unavailable_pSeries_1:
 
 #ifdef CONFIG_CBE_RAS
 	STD_EXCEPTION_HV(0x1200, 0x1202, cbe_system_error)
-	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1202)
+	KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_HV, 0x1202)
 #endif /* CONFIG_CBE_RAS */
 
 	STD_EXCEPTION_PSERIES(0x1300, 0x1300, instruction_breakpoint)
-	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_STD, 0x1300)
+	KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_STD, 0x1300)
 
 #ifdef CONFIG_CBE_RAS
 	STD_EXCEPTION_HV(0x1600, 0x1602, cbe_maintenance)
-	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1602)
+	KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_HV, 0x1602)
 #endif /* CONFIG_CBE_RAS */
 
 	STD_EXCEPTION_PSERIES(0x1700, 0x1700, altivec_assist)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x1700)
+	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x1700)
 
 #ifdef CONFIG_CBE_RAS
 	STD_EXCEPTION_HV(0x1800, 0x1802, cbe_thermal)
-	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1802)
+	KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_HV, 0x1802)
 #endif /* CONFIG_CBE_RAS */
 
 	. = 0x3000
@@ -297,7 +297,7 @@ data_access_check_stab:
 	mfspr	r9,SPRN_DSISR
 	srdi	r10,r10,60
 	rlwimi	r10,r9,16,0x20
-#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+#ifdef CONFIG_KVM_BOOK3S_PR
 	lbz	r9,HSTATE_IN_GUEST(r13)
 	rlwimi	r10,r9,8,0x300
 #endif
@@ -316,11 +316,11 @@ do_stab_bolted_pSeries:
 	EXCEPTION_PROLOG_PSERIES_1(.do_stab_bolted, EXC_STD)
 #endif /* CONFIG_POWER4_ONLY */
 
-	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_STD, 0x300)
-	KVM_HANDLER_SKIP(PACA_EXSLB, EXC_STD, 0x380)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x400)
-	KVM_HANDLER(PACA_EXSLB, EXC_STD, 0x480)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x900)
+	KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_STD, 0x300)
+	KVM_HANDLER_PR_SKIP(PACA_EXSLB, EXC_STD, 0x380)
+	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x400)
+	KVM_HANDLER_PR(PACA_EXSLB, EXC_STD, 0x480)
+	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x900)
 	KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x982)
 
 	.align	7
@@ -336,11 +336,11 @@ do_stab_bolted_pSeries:
 
 	/* moved from 0xf00 */
 	STD_EXCEPTION_PSERIES(., 0xf00, performance_monitor)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xf00)
+	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf00)
 	STD_EXCEPTION_PSERIES(., 0xf20, altivec_unavailable)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xf20)
+	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf20)
 	STD_EXCEPTION_PSERIES(., 0xf40, vsx_unavailable)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xf40)
+	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf40)
 
 /*
  * An interrupt came in while soft-disabled; clear EE in SRR1,
@@ -417,7 +417,11 @@ slb_miss_user_pseries:
 /* KVM's trampoline code needs to be close to the interrupt handlers */
 
 #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+#ifdef CONFIG_KVM_BOOK3S_PR
 #include "../kvm/book3s_rmhandlers.S"
+#else
+#include "../kvm/book3s_hv_rmhandlers.S"
+#endif
 #endif
 
 	.align	7
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 60ac2a9251db..ec2d0edeb134 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -96,6 +96,7 @@ void flush_fp_to_thread(struct task_struct *tsk)
 		preempt_enable();
 	}
 }
+EXPORT_SYMBOL_GPL(flush_fp_to_thread);
 
 void enable_kernel_fp(void)
 {
@@ -145,6 +146,7 @@ void flush_altivec_to_thread(struct task_struct *tsk)
 		preempt_enable();
 	}
 }
+EXPORT_SYMBOL_GPL(flush_altivec_to_thread);
 #endif /* CONFIG_ALTIVEC */
 
 #ifdef CONFIG_VSX
@@ -186,6 +188,7 @@ void flush_vsx_to_thread(struct task_struct *tsk)
 		preempt_enable();
 	}
 }
+EXPORT_SYMBOL_GPL(flush_vsx_to_thread);
 #endif /* CONFIG_VSX */
 
 #ifdef CONFIG_SPE
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index 79fca2651b65..22051ef04bd9 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -375,6 +375,9 @@ void __init check_for_initrd(void)
 
 int threads_per_core, threads_shift;
 cpumask_t threads_core_mask;
+EXPORT_SYMBOL_GPL(threads_per_core);
+EXPORT_SYMBOL_GPL(threads_shift);
+EXPORT_SYMBOL_GPL(threads_core_mask);
 
 static void __init cpu_init_thread_core_maps(int tpc)
 {
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 8ebc6700b98d..09a85a9045d6 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -243,6 +243,7 @@ void smp_send_reschedule(int cpu)
 	if (likely(smp_ops))
 		smp_ops->message_pass(cpu, PPC_MSG_RESCHEDULE);
 }
+EXPORT_SYMBOL_GPL(smp_send_reschedule);
 
 void arch_send_call_function_single_ipi(int cpu)
 {
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index b7baff78f90c..5d9b78ebbaa6 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -20,7 +20,6 @@ config KVM
 	bool
 	select PREEMPT_NOTIFIERS
 	select ANON_INODES
-	select KVM_MMIO
 
 config KVM_BOOK3S_HANDLER
 	bool
@@ -28,16 +27,22 @@ config KVM_BOOK3S_HANDLER
 config KVM_BOOK3S_32_HANDLER
 	bool
 	select KVM_BOOK3S_HANDLER
+	select KVM_MMIO
 
 config KVM_BOOK3S_64_HANDLER
 	bool
 	select KVM_BOOK3S_HANDLER
 
+config KVM_BOOK3S_PR
+	bool
+	select KVM_MMIO
+
 config KVM_BOOK3S_32
 	tristate "KVM support for PowerPC book3s_32 processors"
 	depends on EXPERIMENTAL && PPC_BOOK3S_32 && !SMP && !PTE_64BIT
 	select KVM
 	select KVM_BOOK3S_32_HANDLER
+	select KVM_BOOK3S_PR
 	---help---
 	  Support running unmodified book3s_32 guest kernels
 	  in virtual machines on book3s_32 host processors.
@@ -50,8 +55,8 @@ config KVM_BOOK3S_32
 config KVM_BOOK3S_64
 	tristate "KVM support for PowerPC book3s_64 processors"
 	depends on EXPERIMENTAL && PPC_BOOK3S_64
-	select KVM
 	select KVM_BOOK3S_64_HANDLER
+	select KVM
 	---help---
 	  Support running unmodified book3s_64 and book3s_32 guest kernels
 	  in virtual machines on book3s_64 host processors.
@@ -61,10 +66,37 @@ config KVM_BOOK3S_64
 
 	  If unsure, say N.
 
+config KVM_BOOK3S_64_HV
+	bool "KVM support for POWER7 using hypervisor mode in host"
+	depends on KVM_BOOK3S_64
+	---help---
+	  Support running unmodified book3s_64 guest kernels in
+	  virtual machines on POWER7 processors that have hypervisor
+	  mode available to the host.
+
+	  If you say Y here, KVM will use the hardware virtualization
+	  facilities of POWER7 (and later) processors, meaning that
+	  guest operating systems will run at full hardware speed
+	  using supervisor and user modes.  However, this also means
+	  that KVM is not usable under PowerVM (pHyp), is only usable
+	  on POWER7 (or later) processors, and can only emulate
+	  POWER5+, POWER6 and POWER7 processors.
+
+	  This module provides access to the hardware capabilities through
+	  a character device node named /dev/kvm.
+
+	  If unsure, say N.
+
+config KVM_BOOK3S_64_PR
+	def_bool y
+	depends on KVM_BOOK3S_64 && !KVM_BOOK3S_64_HV
+	select KVM_BOOK3S_PR
+
 config KVM_440
 	bool "KVM support for PowerPC 440 processors"
 	depends on EXPERIMENTAL && 44x
 	select KVM
+	select KVM_MMIO
 	---help---
 	  Support running unmodified 440 guest kernels in virtual machines on
 	  440 host processors.
@@ -89,6 +121,7 @@ config KVM_E500
 	bool "KVM support for PowerPC E500 processors"
 	depends on EXPERIMENTAL && E500
 	select KVM
+	select KVM_MMIO
 	---help---
 	  Support running unmodified E500 guest kernels in virtual machines on
 	  E500 host processors.
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index bf9854fa7f63..8a435a6da665 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -38,11 +38,10 @@ kvm-e500-objs := \
 	e500_emulate.o
 kvm-objs-$(CONFIG_KVM_E500) := $(kvm-e500-objs)
 
-kvm-book3s_64-objs := \
-	$(common-objs-y) \
+kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \
+	../../../virt/kvm/coalesced_mmio.o \
 	fpu.o \
 	book3s_paired_singles.o \
-	book3s.o \
 	book3s_pr.o \
 	book3s_emulate.o \
 	book3s_interrupts.o \
@@ -50,6 +49,18 @@ kvm-book3s_64-objs := \
 	book3s_64_mmu_host.o \
 	book3s_64_mmu.o \
 	book3s_32_mmu.o
+
+kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
+	book3s_hv.o \
+	book3s_hv_interrupts.o \
+	book3s_64_mmu_hv.o
+
+kvm-book3s_64-objs := \
+	../../../virt/kvm/kvm_main.o \
+	powerpc.o \
+	emulate.o \
+	book3s.o \
+	$(kvm-book3s_64-objs-y)
 kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-objs)
 
 kvm-book3s_32-objs := \
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
new file mode 100644
index 000000000000..4a4fbec61a17
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -0,0 +1,258 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/highmem.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/hugetlb.h>
+
+#include <asm/tlbflush.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu-hash64.h>
+#include <asm/hvcall.h>
+#include <asm/synch.h>
+#include <asm/ppc-opcode.h>
+#include <asm/cputable.h>
+
+/* For now use fixed-size 16MB page table */
+#define HPT_ORDER	24
+#define HPT_NPTEG	(1ul << (HPT_ORDER - 7))	/* 128B per pteg */
+#define HPT_HASH_MASK	(HPT_NPTEG - 1)
+
+/* Pages in the VRMA are 16MB pages */
+#define VRMA_PAGE_ORDER	24
+#define VRMA_VSID	0x1ffffffUL	/* 1TB VSID reserved for VRMA */
+
+#define NR_LPIDS	(LPID_RSVD + 1)
+unsigned long lpid_inuse[BITS_TO_LONGS(NR_LPIDS)];
+
+long kvmppc_alloc_hpt(struct kvm *kvm)
+{
+	unsigned long hpt;
+	unsigned long lpid;
+
+	hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|__GFP_NOWARN,
+			       HPT_ORDER - PAGE_SHIFT);
+	if (!hpt) {
+		pr_err("kvm_alloc_hpt: Couldn't alloc HPT\n");
+		return -ENOMEM;
+	}
+	kvm->arch.hpt_virt = hpt;
+
+	do {
+		lpid = find_first_zero_bit(lpid_inuse, NR_LPIDS);
+		if (lpid >= NR_LPIDS) {
+			pr_err("kvm_alloc_hpt: No LPIDs free\n");
+			free_pages(hpt, HPT_ORDER - PAGE_SHIFT);
+			return -ENOMEM;
+		}
+	} while (test_and_set_bit(lpid, lpid_inuse));
+
+	kvm->arch.sdr1 = __pa(hpt) | (HPT_ORDER - 18);
+	kvm->arch.lpid = lpid;
+	kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
+	kvm->arch.host_lpid = mfspr(SPRN_LPID);
+	kvm->arch.host_lpcr = mfspr(SPRN_LPCR);
+
+	pr_info("KVM guest htab at %lx, LPID %lx\n", hpt, lpid);
+	return 0;
+}
+
+void kvmppc_free_hpt(struct kvm *kvm)
+{
+	unsigned long i;
+	struct kvmppc_pginfo *pginfo;
+
+	clear_bit(kvm->arch.lpid, lpid_inuse);
+	free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT);
+
+	if (kvm->arch.ram_pginfo) {
+		pginfo = kvm->arch.ram_pginfo;
+		kvm->arch.ram_pginfo = NULL;
+		for (i = 0; i < kvm->arch.ram_npages; ++i)
+			put_page(pfn_to_page(pginfo[i].pfn));
+		kfree(pginfo);
+	}
+}
+
+static unsigned long user_page_size(unsigned long addr)
+{
+	struct vm_area_struct *vma;
+	unsigned long size = PAGE_SIZE;
+
+	down_read(&current->mm->mmap_sem);
+	vma = find_vma(current->mm, addr);
+	if (vma)
+		size = vma_kernel_pagesize(vma);
+	up_read(&current->mm->mmap_sem);
+	return size;
+}
+
+static pfn_t hva_to_pfn(unsigned long addr)
+{
+	struct page *page[1];
+	int npages;
+
+	might_sleep();
+
+	npages = get_user_pages_fast(addr, 1, 1, page);
+
+	if (unlikely(npages != 1))
+		return 0;
+
+	return page_to_pfn(page[0]);
+}
+
+long kvmppc_prepare_vrma(struct kvm *kvm,
+			 struct kvm_userspace_memory_region *mem)
+{
+	unsigned long psize, porder;
+	unsigned long i, npages;
+	struct kvmppc_pginfo *pginfo;
+	pfn_t pfn;
+	unsigned long hva;
+
+	/* First see what page size we have */
+	psize = user_page_size(mem->userspace_addr);
+	/* For now, only allow 16MB pages */
+	if (psize != 1ul << VRMA_PAGE_ORDER || (mem->memory_size & (psize - 1))) {
+		pr_err("bad psize=%lx memory_size=%llx @ %llx\n",
+		       psize, mem->memory_size, mem->userspace_addr);
+		return -EINVAL;
+	}
+	porder = __ilog2(psize);
+
+	npages = mem->memory_size >> porder;
+	pginfo = kzalloc(npages * sizeof(struct kvmppc_pginfo), GFP_KERNEL);
+	if (!pginfo) {
+		pr_err("kvmppc_prepare_vrma: couldn't alloc %lu bytes\n",
+		       npages * sizeof(struct kvmppc_pginfo));
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < npages; ++i) {
+		hva = mem->userspace_addr + (i << porder);
+		if (user_page_size(hva) != psize)
+			goto err;
+		pfn = hva_to_pfn(hva);
+		if (pfn == 0) {
+			pr_err("oops, no pfn for hva %lx\n", hva);
+			goto err;
+		}
+		if (pfn & ((1ul << (porder - PAGE_SHIFT)) - 1)) {
+			pr_err("oops, unaligned pfn %llx\n", pfn);
+			put_page(pfn_to_page(pfn));
+			goto err;
+		}
+		pginfo[i].pfn = pfn;
+	}
+
+	kvm->arch.ram_npages = npages;
+	kvm->arch.ram_psize = psize;
+	kvm->arch.ram_porder = porder;
+	kvm->arch.ram_pginfo = pginfo;
+
+	return 0;
+
+ err:
+	kfree(pginfo);
+	return -EINVAL;
+}
+
+void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
+{
+	unsigned long i;
+	unsigned long npages = kvm->arch.ram_npages;
+	unsigned long pfn;
+	unsigned long *hpte;
+	unsigned long hash;
+	struct kvmppc_pginfo *pginfo = kvm->arch.ram_pginfo;
+
+	if (!pginfo)
+		return;
+
+	/* VRMA can't be > 1TB */
+	if (npages > 1ul << (40 - kvm->arch.ram_porder))
+		npages = 1ul << (40 - kvm->arch.ram_porder);
+	/* Can't use more than 1 HPTE per HPTEG */
+	if (npages > HPT_NPTEG)
+		npages = HPT_NPTEG;
+
+	for (i = 0; i < npages; ++i) {
+		pfn = pginfo[i].pfn;
+		/* can't use hpt_hash since va > 64 bits */
+		hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK;
+		/*
+		 * We assume that the hash table is empty and no
+		 * vcpus are using it at this stage.  Since we create
+		 * at most one HPTE per HPTEG, we just assume entry 7
+		 * is available and use it.
+		 */
+		hpte = (unsigned long *) (kvm->arch.hpt_virt + (hash << 7));
+		hpte += 7 * 2;
+		/* HPTE low word - RPN, protection, etc. */
+		hpte[1] = (pfn << PAGE_SHIFT) | HPTE_R_R | HPTE_R_C |
+			HPTE_R_M | PP_RWXX;
+		wmb();
+		hpte[0] = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
+			(i << (VRMA_PAGE_ORDER - 16)) | HPTE_V_BOLTED |
+			HPTE_V_LARGE | HPTE_V_VALID;
+	}
+}
+
+int kvmppc_mmu_hv_init(void)
+{
+	if (!cpu_has_feature(CPU_FTR_HVMODE_206))
+		return -EINVAL;
+	memset(lpid_inuse, 0, sizeof(lpid_inuse));
+	set_bit(mfspr(SPRN_LPID), lpid_inuse);
+	set_bit(LPID_RSVD, lpid_inuse);
+
+	return 0;
+}
+
+void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
+{
+}
+
+static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
+{
+	kvmppc_set_msr(vcpu, MSR_SF | MSR_ME);
+}
+
+static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
+				struct kvmppc_pte *gpte, bool data)
+{
+	return -ENOENT;
+}
+
+void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_mmu *mmu = &vcpu->arch.mmu;
+
+	vcpu->arch.slb_nr = 32;		/* Assume POWER7 for now */
+
+	mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
+	mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr;
+
+	vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
+}
diff --git a/arch/powerpc/kvm/book3s_exports.c b/arch/powerpc/kvm/book3s_exports.c
index f94fd9a4c69a..88c8f26add02 100644
--- a/arch/powerpc/kvm/book3s_exports.c
+++ b/arch/powerpc/kvm/book3s_exports.c
@@ -20,6 +20,9 @@
 #include <linux/module.h>
 #include <asm/kvm_book3s.h>
 
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+EXPORT_SYMBOL_GPL(kvmppc_hv_entry_trampoline);
+#else
 EXPORT_SYMBOL_GPL(kvmppc_handler_trampoline_enter);
 EXPORT_SYMBOL_GPL(kvmppc_handler_lowmem_trampoline);
 EXPORT_SYMBOL_GPL(kvmppc_rmcall);
@@ -30,3 +33,5 @@ EXPORT_SYMBOL_GPL(kvmppc_load_up_altivec);
 #ifdef CONFIG_VSX
 EXPORT_SYMBOL_GPL(kvmppc_load_up_vsx);
 #endif
+#endif
+
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
new file mode 100644
index 000000000000..60b7300568c8
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -0,0 +1,445 @@
+/*
+ * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved.
+ *
+ * Authors:
+ *    Paul Mackerras <paulus@au1.ibm.com>
+ *    Alexander Graf <agraf@suse.de>
+ *    Kevin Wolf <mail@kevin-wolf.de>
+ *
+ * Description: KVM functions specific to running on Book 3S
+ * processors in hypervisor mode (specifically POWER7 and later).
+ *
+ * This file is derived from arch/powerpc/kvm/book3s.c,
+ * by Alexander Graf <agraf@suse.de>.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/preempt.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/fs.h>
+#include <linux/anon_inodes.h>
+#include <linux/cpumask.h>
+
+#include <asm/reg.h>
+#include <asm/cputable.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu_context.h>
+#include <asm/lppaca.h>
+#include <asm/processor.h>
+#include <linux/gfp.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
+
+/* #define EXIT_DEBUG */
+/* #define EXIT_DEBUG_SIMPLE */
+/* #define EXIT_DEBUG_INT */
+
+void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+	local_paca->kvm_hstate.kvm_vcpu = vcpu;
+}
+
+void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
+{
+}
+
+void kvmppc_vcpu_block(struct kvm_vcpu *vcpu)
+{
+	u64 now;
+	unsigned long dec_nsec;
+
+	now = get_tb();
+	if (now >= vcpu->arch.dec_expires && !kvmppc_core_pending_dec(vcpu))
+		kvmppc_core_queue_dec(vcpu);
+	if (vcpu->arch.pending_exceptions)
+		return;
+	if (vcpu->arch.dec_expires != ~(u64)0) {
+		dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC /
+			tb_ticks_per_sec;
+		hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec),
+			      HRTIMER_MODE_REL);
+	}
+
+	kvm_vcpu_block(vcpu);
+	vcpu->stat.halt_wakeup++;
+
+	if (vcpu->arch.dec_expires != ~(u64)0)
+		hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
+}
+
+void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
+{
+	vcpu->arch.shregs.msr = msr;
+}
+
+void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
+{
+	vcpu->arch.pvr = pvr;
+}
+
+void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
+{
+	int r;
+
+	pr_err("vcpu %p (%d):\n", vcpu, vcpu->vcpu_id);
+	pr_err("pc  = %.16lx  msr = %.16llx  trap = %x\n",
+	       vcpu->arch.pc, vcpu->arch.shregs.msr, vcpu->arch.trap);
+	for (r = 0; r < 16; ++r)
+		pr_err("r%2d = %.16lx  r%d = %.16lx\n",
+		       r, kvmppc_get_gpr(vcpu, r),
+		       r+16, kvmppc_get_gpr(vcpu, r+16));
+	pr_err("ctr = %.16lx  lr  = %.16lx\n",
+	       vcpu->arch.ctr, vcpu->arch.lr);
+	pr_err("srr0 = %.16llx srr1 = %.16llx\n",
+	       vcpu->arch.shregs.srr0, vcpu->arch.shregs.srr1);
+	pr_err("sprg0 = %.16llx sprg1 = %.16llx\n",
+	       vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1);
+	pr_err("sprg2 = %.16llx sprg3 = %.16llx\n",
+	       vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3);
+	pr_err("cr = %.8x  xer = %.16lx  dsisr = %.8x\n",
+	       vcpu->arch.cr, vcpu->arch.xer, vcpu->arch.shregs.dsisr);
+	pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar);
+	pr_err("fault dar = %.16lx dsisr = %.8x\n",
+	       vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
+	pr_err("SLB (%d entries):\n", vcpu->arch.slb_max);
+	for (r = 0; r < vcpu->arch.slb_max; ++r)
+		pr_err("  ESID = %.16llx VSID = %.16llx\n",
+		       vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv);
+	pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.8x\n",
+	       vcpu->arch.lpcr, vcpu->kvm->arch.sdr1,
+	       vcpu->arch.last_inst);
+}
+
+static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
+			      struct task_struct *tsk)
+{
+	int r = RESUME_HOST;
+
+	vcpu->stat.sum_exits++;
+
+	run->exit_reason = KVM_EXIT_UNKNOWN;
+	run->ready_for_interrupt_injection = 1;
+	switch (vcpu->arch.trap) {
+	/* We're good on these - the host merely wanted to get our attention */
+	case BOOK3S_INTERRUPT_HV_DECREMENTER:
+		vcpu->stat.dec_exits++;
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_EXTERNAL:
+		vcpu->stat.ext_intr_exits++;
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_PERFMON:
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_PROGRAM:
+	{
+		ulong flags;
+		/*
+		 * Normally program interrupts are delivered directly
+		 * to the guest by the hardware, but we can get here
+		 * as a result of a hypervisor emulation interrupt
+		 * (e40) getting turned into a 700 by BML RTAS.
+		 */
+		flags = vcpu->arch.shregs.msr & 0x1f0000ull;
+		kvmppc_core_queue_program(vcpu, flags);
+		r = RESUME_GUEST;
+		break;
+	}
+	case BOOK3S_INTERRUPT_SYSCALL:
+	{
+		/* hcall - punt to userspace */
+		int i;
+
+		if (vcpu->arch.shregs.msr & MSR_PR) {
+			/* sc 1 from userspace - reflect to guest syscall */
+			kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_SYSCALL);
+			r = RESUME_GUEST;
+			break;
+		}
+		run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3);
+		for (i = 0; i < 9; ++i)
+			run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i);
+		run->exit_reason = KVM_EXIT_PAPR_HCALL;
+		vcpu->arch.hcall_needed = 1;
+		r = RESUME_HOST;
+		break;
+	}
+	/*
+	 * We get these next two if the guest does a bad real-mode access,
+	 * as we have enabled VRMA (virtualized real mode area) mode in the
+	 * LPCR.  We just generate an appropriate DSI/ISI to the guest.
+	 */
+	case BOOK3S_INTERRUPT_H_DATA_STORAGE:
+		vcpu->arch.shregs.dsisr = vcpu->arch.fault_dsisr;
+		vcpu->arch.shregs.dar = vcpu->arch.fault_dar;
+		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE, 0);
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_H_INST_STORAGE:
+		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_INST_STORAGE,
+					0x08000000);
+		r = RESUME_GUEST;
+		break;
+	/*
+	 * This occurs if the guest executes an illegal instruction.
+	 * We just generate a program interrupt to the guest, since
+	 * we don't emulate any guest instructions at this stage.
+	 */
+	case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
+		kvmppc_core_queue_program(vcpu, 0x80000);
+		r = RESUME_GUEST;
+		break;
+	default:
+		kvmppc_dump_regs(vcpu);
+		printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
+			vcpu->arch.trap, kvmppc_get_pc(vcpu),
+			vcpu->arch.shregs.msr);
+		r = RESUME_HOST;
+		BUG();
+		break;
+	}
+
+
+	if (!(r & RESUME_HOST)) {
+		/* To avoid clobbering exit_reason, only check for signals if
+		 * we aren't already exiting to userspace for some other
+		 * reason. */
+		if (signal_pending(tsk)) {
+			vcpu->stat.signal_exits++;
+			run->exit_reason = KVM_EXIT_INTR;
+			r = -EINTR;
+		} else {
+			kvmppc_core_deliver_interrupts(vcpu);
+		}
+	}
+
+	return r;
+}
+
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+                                  struct kvm_sregs *sregs)
+{
+	int i;
+
+	sregs->pvr = vcpu->arch.pvr;
+
+	memset(sregs, 0, sizeof(struct kvm_sregs));
+	for (i = 0; i < vcpu->arch.slb_max; i++) {
+		sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige;
+		sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv;
+	}
+
+	return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+                                  struct kvm_sregs *sregs)
+{
+	int i, j;
+
+	kvmppc_set_pvr(vcpu, sregs->pvr);
+
+	j = 0;
+	for (i = 0; i < vcpu->arch.slb_nr; i++) {
+		if (sregs->u.s.ppc64.slb[i].slbe & SLB_ESID_V) {
+			vcpu->arch.slb[j].orige = sregs->u.s.ppc64.slb[i].slbe;
+			vcpu->arch.slb[j].origv = sregs->u.s.ppc64.slb[i].slbv;
+			++j;
+		}
+	}
+	vcpu->arch.slb_max = j;
+
+	return 0;
+}
+
+int kvmppc_core_check_processor_compat(void)
+{
+	if (cpu_has_feature(CPU_FTR_HVMODE_206))
+		return 0;
+	return -EIO;
+}
+
+struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
+{
+	struct kvm_vcpu *vcpu;
+	int err = -ENOMEM;
+	unsigned long lpcr;
+
+	vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
+	if (!vcpu)
+		goto out;
+
+	err = kvm_vcpu_init(vcpu, kvm, id);
+	if (err)
+		goto free_vcpu;
+
+	vcpu->arch.shared = &vcpu->arch.shregs;
+	vcpu->arch.last_cpu = -1;
+	vcpu->arch.mmcr[0] = MMCR0_FC;
+	vcpu->arch.ctrl = CTRL_RUNLATCH;
+	/* default to host PVR, since we can't spoof it */
+	vcpu->arch.pvr = mfspr(SPRN_PVR);
+	kvmppc_set_pvr(vcpu, vcpu->arch.pvr);
+
+	lpcr = kvm->arch.host_lpcr & (LPCR_PECE | LPCR_LPES);
+	lpcr |= LPCR_VPM0 | LPCR_VRMA_L | (4UL << LPCR_DPFD_SH) | LPCR_HDICE;
+	vcpu->arch.lpcr = lpcr;
+
+	kvmppc_mmu_book3s_hv_init(vcpu);
+
+	return vcpu;
+
+free_vcpu:
+	kfree(vcpu);
+out:
+	return ERR_PTR(err);
+}
+
+void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
+{
+	kvm_vcpu_uninit(vcpu);
+	kfree(vcpu);
+}
+
+extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
+
+int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
+{
+	u64 now;
+
+	if (signal_pending(current)) {
+		run->exit_reason = KVM_EXIT_INTR;
+		return -EINTR;
+	}
+
+	flush_fp_to_thread(current);
+	flush_altivec_to_thread(current);
+	flush_vsx_to_thread(current);
+	preempt_disable();
+
+	/*
+	 * Make sure we are running on thread 0, and that
+	 * secondary threads are offline.
+	 * XXX we should also block attempts to bring any
+	 * secondary threads online.
+	 */
+	if (threads_per_core > 1) {
+		int cpu = smp_processor_id();
+		int thr = cpu_thread_in_core(cpu);
+
+		if (thr)
+			goto out;
+		while (++thr < threads_per_core)
+			if (cpu_online(cpu + thr))
+				goto out;
+	}
+
+	kvm_guest_enter();
+
+	__kvmppc_vcore_entry(NULL, vcpu);
+
+	kvm_guest_exit();
+
+	preempt_enable();
+	kvm_resched(vcpu);
+
+	now = get_tb();
+	/* cancel pending dec exception if dec is positive */
+	if (now < vcpu->arch.dec_expires && kvmppc_core_pending_dec(vcpu))
+		kvmppc_core_dequeue_dec(vcpu);
+
+	return kvmppc_handle_exit(run, vcpu, current);
+
+ out:
+	preempt_enable();
+	return -EBUSY;
+}
+
+int kvmppc_core_prepare_memory_region(struct kvm *kvm,
+				struct kvm_userspace_memory_region *mem)
+{
+	if (mem->guest_phys_addr == 0 && mem->memory_size != 0)
+		return kvmppc_prepare_vrma(kvm, mem);
+	return 0;
+}
+
+void kvmppc_core_commit_memory_region(struct kvm *kvm,
+				struct kvm_userspace_memory_region *mem)
+{
+	if (mem->guest_phys_addr == 0 && mem->memory_size != 0)
+		kvmppc_map_vrma(kvm, mem);
+}
+
+int kvmppc_core_init_vm(struct kvm *kvm)
+{
+	long r;
+
+	/* Allocate hashed page table */
+	r = kvmppc_alloc_hpt(kvm);
+
+	return r;
+}
+
+void kvmppc_core_destroy_vm(struct kvm *kvm)
+{
+	kvmppc_free_hpt(kvm);
+}
+
+/* These are stubs for now */
+void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
+{
+}
+
+/* We don't need to emulate any privileged instructions or dcbz */
+int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                           unsigned int inst, int *advance)
+{
+	return EMULATE_FAIL;
+}
+
+int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
+{
+	return EMULATE_FAIL;
+}
+
+int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
+{
+	return EMULATE_FAIL;
+}
+
+static int kvmppc_book3s_hv_init(void)
+{
+	int r;
+
+	r = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
+
+	if (r)
+		return r;
+
+	r = kvmppc_mmu_hv_init();
+
+	return r;
+}
+
+static void kvmppc_book3s_hv_exit(void)
+{
+	kvm_exit();
+}
+
+module_init(kvmppc_book3s_hv_init);
+module_exit(kvmppc_book3s_hv_exit);
diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S
new file mode 100644
index 000000000000..532afaf19841
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_interrupts.S
@@ -0,0 +1,136 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ * Derived from book3s_interrupts.S, which is:
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/kvm_asm.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/asm-offsets.h>
+#include <asm/exception-64s.h>
+#include <asm/ppc-opcode.h>
+
+/*****************************************************************************
+ *                                                                           *
+ *     Guest entry / exit code that is in kernel module memory (vmalloc)     *
+ *                                                                           *
+ ****************************************************************************/
+
+/* Registers:
+ *  r4: vcpu pointer
+ */
+_GLOBAL(__kvmppc_vcore_entry)
+
+	/* Write correct stack frame */
+	mflr	r0
+	std	r0,PPC_LR_STKOFF(r1)
+
+	/* Save host state to the stack */
+	stdu	r1, -SWITCH_FRAME_SIZE(r1)
+
+	/* Save non-volatile registers (r14 - r31) */
+	SAVE_NVGPRS(r1)
+
+	/* Save host DSCR */
+	mfspr	r3, SPRN_DSCR
+	std	r3, HSTATE_DSCR(r13)
+
+	/* Save host DABR */
+	mfspr	r3, SPRN_DABR
+	std	r3, HSTATE_DABR(r13)
+
+	/* Hard-disable interrupts */
+	mfmsr   r10
+	std	r10, HSTATE_HOST_MSR(r13)
+	rldicl  r10,r10,48,1
+	rotldi  r10,r10,16
+	mtmsrd  r10,1
+
+	/* Save host PMU registers and load guest PMU registers */
+	/* R4 is live here (vcpu pointer) but not r3 or r5 */
+	li	r3, 1
+	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
+	mfspr	r7, SPRN_MMCR0		/* save MMCR0 */
+	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable interrupts */
+	isync
+	ld	r3, PACALPPACAPTR(r13)	/* is the host using the PMU? */
+	lbz	r5, LPPACA_PMCINUSE(r3)
+	cmpwi	r5, 0
+	beq	31f			/* skip if not */
+	mfspr	r5, SPRN_MMCR1
+	mfspr	r6, SPRN_MMCRA
+	std	r7, HSTATE_MMCR(r13)
+	std	r5, HSTATE_MMCR + 8(r13)
+	std	r6, HSTATE_MMCR + 16(r13)
+	mfspr	r3, SPRN_PMC1
+	mfspr	r5, SPRN_PMC2
+	mfspr	r6, SPRN_PMC3
+	mfspr	r7, SPRN_PMC4
+	mfspr	r8, SPRN_PMC5
+	mfspr	r9, SPRN_PMC6
+	stw	r3, HSTATE_PMC(r13)
+	stw	r5, HSTATE_PMC + 4(r13)
+	stw	r6, HSTATE_PMC + 8(r13)
+	stw	r7, HSTATE_PMC + 12(r13)
+	stw	r8, HSTATE_PMC + 16(r13)
+	stw	r9, HSTATE_PMC + 20(r13)
+31:
+
+	/*
+	 * Put whatever is in the decrementer into the
+	 * hypervisor decrementer.
+	 */
+	mfspr	r8,SPRN_DEC
+	mftb	r7
+	mtspr	SPRN_HDEC,r8
+	extsw	r8,r8
+	add	r8,r8,r7
+	std	r8,HSTATE_DECEXP(r13)
+
+	/* Jump to partition switch code */
+	bl	.kvmppc_hv_entry_trampoline
+	nop
+
+/*
+ * We return here in virtual mode after the guest exits
+ * with something that we can't handle in real mode.
+ * Interrupts are enabled again at this point.
+ */
+
+.global kvmppc_handler_highmem
+kvmppc_handler_highmem:
+
+	/*
+	 * Register usage at this point:
+	 *
+	 * R1       = host R1
+	 * R2       = host R2
+	 * R12      = exit handler id
+	 * R13      = PACA
+	 */
+
+	/* Restore non-volatile host registers (r14 - r31) */
+	REST_NVGPRS(r1)
+
+	addi    r1, r1, SWITCH_FRAME_SIZE
+	ld	r0, PPC_LR_STKOFF(r1)
+	mtlr	r0
+	blr
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
new file mode 100644
index 000000000000..9af264840b98
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -0,0 +1,806 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ * Derived from book3s_rmhandlers.S and other files, which are:
+ *
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/kvm_asm.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/asm-offsets.h>
+#include <asm/exception-64s.h>
+
+/*****************************************************************************
+ *                                                                           *
+ *        Real Mode handlers that need to be in the linear mapping           *
+ *                                                                           *
+ ****************************************************************************/
+
+#define SHADOW_VCPU_OFF		PACA_KVM_SVCPU
+
+	.globl	kvmppc_skip_interrupt
+kvmppc_skip_interrupt:
+	mfspr	r13,SPRN_SRR0
+	addi	r13,r13,4
+	mtspr	SPRN_SRR0,r13
+	GET_SCRATCH0(r13)
+	rfid
+	b	.
+
+	.globl	kvmppc_skip_Hinterrupt
+kvmppc_skip_Hinterrupt:
+	mfspr	r13,SPRN_HSRR0
+	addi	r13,r13,4
+	mtspr	SPRN_HSRR0,r13
+	GET_SCRATCH0(r13)
+	hrfid
+	b	.
+
+/*
+ * Call kvmppc_handler_trampoline_enter in real mode.
+ * Must be called with interrupts hard-disabled.
+ *
+ * Input Registers:
+ *
+ * LR = return address to continue at after eventually re-enabling MMU
+ */
+_GLOBAL(kvmppc_hv_entry_trampoline)
+	mfmsr	r10
+	LOAD_REG_ADDR(r5, kvmppc_hv_entry)
+	li	r0,MSR_RI
+	andc	r0,r10,r0
+	li	r6,MSR_IR | MSR_DR
+	andc	r6,r10,r6
+	mtmsrd	r0,1		/* clear RI in MSR */
+	mtsrr0	r5
+	mtsrr1	r6
+	RFI
+
+#define ULONG_SIZE 		8
+#define VCPU_GPR(n)		(VCPU_GPRS + (n * ULONG_SIZE))
+
+/******************************************************************************
+ *                                                                            *
+ *                               Entry code                                   *
+ *                                                                            *
+ *****************************************************************************/
+
+.global kvmppc_hv_entry
+kvmppc_hv_entry:
+
+	/* Required state:
+	 *
+	 * R4 = vcpu pointer
+	 * MSR = ~IR|DR
+	 * R13 = PACA
+	 * R1 = host R1
+	 * all other volatile GPRS = free
+	 */
+	mflr	r0
+	std	r0, HSTATE_VMHANDLER(r13)
+
+	ld	r14, VCPU_GPR(r14)(r4)
+	ld	r15, VCPU_GPR(r15)(r4)
+	ld	r16, VCPU_GPR(r16)(r4)
+	ld	r17, VCPU_GPR(r17)(r4)
+	ld	r18, VCPU_GPR(r18)(r4)
+	ld	r19, VCPU_GPR(r19)(r4)
+	ld	r20, VCPU_GPR(r20)(r4)
+	ld	r21, VCPU_GPR(r21)(r4)
+	ld	r22, VCPU_GPR(r22)(r4)
+	ld	r23, VCPU_GPR(r23)(r4)
+	ld	r24, VCPU_GPR(r24)(r4)
+	ld	r25, VCPU_GPR(r25)(r4)
+	ld	r26, VCPU_GPR(r26)(r4)
+	ld	r27, VCPU_GPR(r27)(r4)
+	ld	r28, VCPU_GPR(r28)(r4)
+	ld	r29, VCPU_GPR(r29)(r4)
+	ld	r30, VCPU_GPR(r30)(r4)
+	ld	r31, VCPU_GPR(r31)(r4)
+
+	/* Load guest PMU registers */
+	/* R4 is live here (vcpu pointer) */
+	li	r3, 1
+	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
+	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable ints */
+	isync
+	lwz	r3, VCPU_PMC(r4)	/* always load up guest PMU registers */
+	lwz	r5, VCPU_PMC + 4(r4)	/* to prevent information leak */
+	lwz	r6, VCPU_PMC + 8(r4)
+	lwz	r7, VCPU_PMC + 12(r4)
+	lwz	r8, VCPU_PMC + 16(r4)
+	lwz	r9, VCPU_PMC + 20(r4)
+	mtspr	SPRN_PMC1, r3
+	mtspr	SPRN_PMC2, r5
+	mtspr	SPRN_PMC3, r6
+	mtspr	SPRN_PMC4, r7
+	mtspr	SPRN_PMC5, r8
+	mtspr	SPRN_PMC6, r9
+	ld	r3, VCPU_MMCR(r4)
+	ld	r5, VCPU_MMCR + 8(r4)
+	ld	r6, VCPU_MMCR + 16(r4)
+	mtspr	SPRN_MMCR1, r5
+	mtspr	SPRN_MMCRA, r6
+	mtspr	SPRN_MMCR0, r3
+	isync
+
+	/* Load up FP, VMX and VSX registers */
+	bl	kvmppc_load_fp
+
+	/* Switch DSCR to guest value */
+	ld	r5, VCPU_DSCR(r4)
+	mtspr	SPRN_DSCR, r5
+
+	/*
+	 * Set the decrementer to the guest decrementer.
+	 */
+	ld	r8,VCPU_DEC_EXPIRES(r4)
+	mftb	r7
+	subf	r3,r7,r8
+	mtspr	SPRN_DEC,r3
+	stw	r3,VCPU_DEC(r4)
+
+	ld	r5, VCPU_SPRG0(r4)
+	ld	r6, VCPU_SPRG1(r4)
+	ld	r7, VCPU_SPRG2(r4)
+	ld	r8, VCPU_SPRG3(r4)
+	mtspr	SPRN_SPRG0, r5
+	mtspr	SPRN_SPRG1, r6
+	mtspr	SPRN_SPRG2, r7
+	mtspr	SPRN_SPRG3, r8
+
+	/* Save R1 in the PACA */
+	std	r1, HSTATE_HOST_R1(r13)
+
+	/* Load up DAR and DSISR */
+	ld	r5, VCPU_DAR(r4)
+	lwz	r6, VCPU_DSISR(r4)
+	mtspr	SPRN_DAR, r5
+	mtspr	SPRN_DSISR, r6
+
+	/* Set partition DABR */
+	li	r5,3
+	ld	r6,VCPU_DABR(r4)
+	mtspr	SPRN_DABRX,r5
+	mtspr	SPRN_DABR,r6
+
+	/* Restore AMR and UAMOR, set AMOR to all 1s */
+	ld	r5,VCPU_AMR(r4)
+	ld	r6,VCPU_UAMOR(r4)
+	li	r7,-1
+	mtspr	SPRN_AMR,r5
+	mtspr	SPRN_UAMOR,r6
+	mtspr	SPRN_AMOR,r7
+
+	/* Clear out SLB */
+	li	r6,0
+	slbmte	r6,r6
+	slbia
+	ptesync
+
+	/* Switch to guest partition. */
+	ld	r9,VCPU_KVM(r4)		/* pointer to struct kvm */
+	ld	r6,KVM_SDR1(r9)
+	lwz	r7,KVM_LPID(r9)
+	li	r0,LPID_RSVD		/* switch to reserved LPID */
+	mtspr	SPRN_LPID,r0
+	ptesync
+	mtspr	SPRN_SDR1,r6		/* switch to partition page table */
+	mtspr	SPRN_LPID,r7
+	isync
+	ld	r8,VCPU_LPCR(r4)
+	mtspr	SPRN_LPCR,r8
+	isync
+
+	/* Check if HDEC expires soon */
+	mfspr	r3,SPRN_HDEC
+	cmpwi	r3,10
+	li	r12,BOOK3S_INTERRUPT_HV_DECREMENTER
+	mr	r9,r4
+	blt	hdec_soon
+
+	/*
+	 * Invalidate the TLB if we could possibly have stale TLB
+	 * entries for this partition on this core due to the use
+	 * of tlbiel.
+	 */
+	ld	r9,VCPU_KVM(r4)		/* pointer to struct kvm */
+	lwz	r5,VCPU_VCPUID(r4)
+	lhz	r6,PACAPACAINDEX(r13)
+	lhz	r8,VCPU_LAST_CPU(r4)
+	sldi	r7,r6,1			/* see if this is the same vcpu */
+	add	r7,r7,r9		/* as last ran on this pcpu */
+	lhz	r0,KVM_LAST_VCPU(r7)
+	cmpw	r6,r8			/* on the same cpu core as last time? */
+	bne	3f
+	cmpw	r0,r5			/* same vcpu as this core last ran? */
+	beq	1f
+3:	sth	r6,VCPU_LAST_CPU(r4)	/* if not, invalidate partition TLB */
+	sth	r5,KVM_LAST_VCPU(r7)
+	li	r6,128
+	mtctr	r6
+	li	r7,0x800		/* IS field = 0b10 */
+	ptesync
+2:	tlbiel	r7
+	addi	r7,r7,0x1000
+	bdnz	2b
+	ptesync
+1:
+
+	/* Save purr/spurr */
+	mfspr	r5,SPRN_PURR
+	mfspr	r6,SPRN_SPURR
+	std	r5,HSTATE_PURR(r13)
+	std	r6,HSTATE_SPURR(r13)
+	ld	r7,VCPU_PURR(r4)
+	ld	r8,VCPU_SPURR(r4)
+	mtspr	SPRN_PURR,r7
+	mtspr	SPRN_SPURR,r8
+
+	/* Load up guest SLB entries */
+	lwz	r5,VCPU_SLB_MAX(r4)
+	cmpwi	r5,0
+	beq	9f
+	mtctr	r5
+	addi	r6,r4,VCPU_SLB
+1:	ld	r8,VCPU_SLB_E(r6)
+	ld	r9,VCPU_SLB_V(r6)
+	slbmte	r9,r8
+	addi	r6,r6,VCPU_SLB_SIZE
+	bdnz	1b
+9:
+
+	/* Restore state of CTRL run bit; assume 1 on entry */
+	lwz	r5,VCPU_CTRL(r4)
+	andi.	r5,r5,1
+	bne	4f
+	mfspr	r6,SPRN_CTRLF
+	clrrdi	r6,r6,1
+	mtspr	SPRN_CTRLT,r6
+4:
+	ld	r6, VCPU_CTR(r4)
+	lwz	r7, VCPU_XER(r4)
+
+	mtctr	r6
+	mtxer	r7
+
+	/* Move SRR0 and SRR1 into the respective regs */
+	ld	r6, VCPU_SRR0(r4)
+	ld	r7, VCPU_SRR1(r4)
+	mtspr	SPRN_SRR0, r6
+	mtspr	SPRN_SRR1, r7
+
+	ld	r10, VCPU_PC(r4)
+
+	ld	r11, VCPU_MSR(r4)	/* r10 = vcpu->arch.msr & ~MSR_HV */
+	rldicl	r11, r11, 63 - MSR_HV_LG, 1
+	rotldi	r11, r11, 1 + MSR_HV_LG
+	ori	r11, r11, MSR_ME
+
+fast_guest_return:
+	mtspr	SPRN_HSRR0,r10
+	mtspr	SPRN_HSRR1,r11
+
+	/* Activate guest mode, so faults get handled by KVM */
+	li	r9, KVM_GUEST_MODE_GUEST
+	stb	r9, HSTATE_IN_GUEST(r13)
+
+	/* Enter guest */
+
+	ld	r5, VCPU_LR(r4)
+	lwz	r6, VCPU_CR(r4)
+	mtlr	r5
+	mtcr	r6
+
+	ld	r0, VCPU_GPR(r0)(r4)
+	ld	r1, VCPU_GPR(r1)(r4)
+	ld	r2, VCPU_GPR(r2)(r4)
+	ld	r3, VCPU_GPR(r3)(r4)
+	ld	r5, VCPU_GPR(r5)(r4)
+	ld	r6, VCPU_GPR(r6)(r4)
+	ld	r7, VCPU_GPR(r7)(r4)
+	ld	r8, VCPU_GPR(r8)(r4)
+	ld	r9, VCPU_GPR(r9)(r4)
+	ld	r10, VCPU_GPR(r10)(r4)
+	ld	r11, VCPU_GPR(r11)(r4)
+	ld	r12, VCPU_GPR(r12)(r4)
+	ld	r13, VCPU_GPR(r13)(r4)
+
+	ld	r4, VCPU_GPR(r4)(r4)
+
+	hrfid
+	b	.
+
+/******************************************************************************
+ *                                                                            *
+ *                               Exit code                                    *
+ *                                                                            *
+ *****************************************************************************/
+
+/*
+ * We come here from the first-level interrupt handlers.
+ */
+	.globl	kvmppc_interrupt
+kvmppc_interrupt:
+	/*
+	 * Register contents:
+	 * R12		= interrupt vector
+	 * R13		= PACA
+	 * guest CR, R12 saved in shadow VCPU SCRATCH1/0
+	 * guest R13 saved in SPRN_SCRATCH0
+	 */
+	/* abuse host_r2 as third scratch area; we get r2 from PACATOC(r13) */
+	std	r9, HSTATE_HOST_R2(r13)
+	ld	r9, HSTATE_KVM_VCPU(r13)
+
+	/* Save registers */
+
+	std	r0, VCPU_GPR(r0)(r9)
+	std	r1, VCPU_GPR(r1)(r9)
+	std	r2, VCPU_GPR(r2)(r9)
+	std	r3, VCPU_GPR(r3)(r9)
+	std	r4, VCPU_GPR(r4)(r9)
+	std	r5, VCPU_GPR(r5)(r9)
+	std	r6, VCPU_GPR(r6)(r9)
+	std	r7, VCPU_GPR(r7)(r9)
+	std	r8, VCPU_GPR(r8)(r9)
+	ld	r0, HSTATE_HOST_R2(r13)
+	std	r0, VCPU_GPR(r9)(r9)
+	std	r10, VCPU_GPR(r10)(r9)
+	std	r11, VCPU_GPR(r11)(r9)
+	ld	r3, HSTATE_SCRATCH0(r13)
+	lwz	r4, HSTATE_SCRATCH1(r13)
+	std	r3, VCPU_GPR(r12)(r9)
+	stw	r4, VCPU_CR(r9)
+
+	/* Restore R1/R2 so we can handle faults */
+	ld	r1, HSTATE_HOST_R1(r13)
+	ld	r2, PACATOC(r13)
+
+	mfspr	r10, SPRN_SRR0
+	mfspr	r11, SPRN_SRR1
+	std	r10, VCPU_SRR0(r9)
+	std	r11, VCPU_SRR1(r9)
+	andi.	r0, r12, 2		/* need to read HSRR0/1? */
+	beq	1f
+	mfspr	r10, SPRN_HSRR0
+	mfspr	r11, SPRN_HSRR1
+	clrrdi	r12, r12, 2
+1:	std	r10, VCPU_PC(r9)
+	std	r11, VCPU_MSR(r9)
+
+	GET_SCRATCH0(r3)
+	mflr	r4
+	std	r3, VCPU_GPR(r13)(r9)
+	std	r4, VCPU_LR(r9)
+
+	/* Unset guest mode */
+	li	r0, KVM_GUEST_MODE_NONE
+	stb	r0, HSTATE_IN_GUEST(r13)
+
+	stw	r12,VCPU_TRAP(r9)
+
+	/* See if this is a leftover HDEC interrupt */
+	cmpwi	r12,BOOK3S_INTERRUPT_HV_DECREMENTER
+	bne	2f
+	mfspr	r3,SPRN_HDEC
+	cmpwi	r3,0
+	bge	ignore_hdec
+2:
+
+	/* Check for mediated interrupts (could be done earlier really ...) */
+	cmpwi	r12,BOOK3S_INTERRUPT_EXTERNAL
+	bne+	1f
+	ld	r5,VCPU_LPCR(r9)
+	andi.	r0,r11,MSR_EE
+	beq	1f
+	andi.	r0,r5,LPCR_MER
+	bne	bounce_ext_interrupt
+1:
+
+	/* Save DEC */
+	mfspr	r5,SPRN_DEC
+	mftb	r6
+	extsw	r5,r5
+	add	r5,r5,r6
+	std	r5,VCPU_DEC_EXPIRES(r9)
+
+	/* Save HEIR (HV emulation assist reg) in last_inst
+	   if this is an HEI (HV emulation interrupt, e40) */
+	li	r3,-1
+	cmpwi	r12,BOOK3S_INTERRUPT_H_EMUL_ASSIST
+	bne	11f
+	mfspr	r3,SPRN_HEIR
+11:	stw	r3,VCPU_LAST_INST(r9)
+
+	/* Save more register state  */
+	mfxer	r5
+	mfdar	r6
+	mfdsisr	r7
+	mfctr	r8
+
+	stw	r5, VCPU_XER(r9)
+	std	r6, VCPU_DAR(r9)
+	stw	r7, VCPU_DSISR(r9)
+	std	r8, VCPU_CTR(r9)
+	/* grab HDAR & HDSISR if HV data storage interrupt (HDSI) */
+	cmpwi	r12,BOOK3S_INTERRUPT_H_DATA_STORAGE
+	beq	6f
+7:	std	r6, VCPU_FAULT_DAR(r9)
+	stw	r7, VCPU_FAULT_DSISR(r9)
+
+	/* Save guest CTRL register, set runlatch to 1 */
+	mfspr	r6,SPRN_CTRLF
+	stw	r6,VCPU_CTRL(r9)
+	andi.	r0,r6,1
+	bne	4f
+	ori	r6,r6,1
+	mtspr	SPRN_CTRLT,r6
+4:
+	/* Read the guest SLB and save it away */
+	lwz	r0,VCPU_SLB_NR(r9)	/* number of entries in SLB */
+	mtctr	r0
+	li	r6,0
+	addi	r7,r9,VCPU_SLB
+	li	r5,0
+1:	slbmfee	r8,r6
+	andis.	r0,r8,SLB_ESID_V@h
+	beq	2f
+	add	r8,r8,r6		/* put index in */
+	slbmfev	r3,r6
+	std	r8,VCPU_SLB_E(r7)
+	std	r3,VCPU_SLB_V(r7)
+	addi	r7,r7,VCPU_SLB_SIZE
+	addi	r5,r5,1
+2:	addi	r6,r6,1
+	bdnz	1b
+	stw	r5,VCPU_SLB_MAX(r9)
+
+	/*
+	 * Save the guest PURR/SPURR
+	 */
+	mfspr	r5,SPRN_PURR
+	mfspr	r6,SPRN_SPURR
+	ld	r7,VCPU_PURR(r9)
+	ld	r8,VCPU_SPURR(r9)
+	std	r5,VCPU_PURR(r9)
+	std	r6,VCPU_SPURR(r9)
+	subf	r5,r7,r5
+	subf	r6,r8,r6
+
+	/*
+	 * Restore host PURR/SPURR and add guest times
+	 * so that the time in the guest gets accounted.
+	 */
+	ld	r3,HSTATE_PURR(r13)
+	ld	r4,HSTATE_SPURR(r13)
+	add	r3,r3,r5
+	add	r4,r4,r6
+	mtspr	SPRN_PURR,r3
+	mtspr	SPRN_SPURR,r4
+
+	/* Clear out SLB */
+	li	r5,0
+	slbmte	r5,r5
+	slbia
+	ptesync
+
+hdec_soon:
+	/* Switch back to host partition */
+	ld	r4,VCPU_KVM(r9)		/* pointer to struct kvm */
+	ld	r6,KVM_HOST_SDR1(r4)
+	lwz	r7,KVM_HOST_LPID(r4)
+	li	r8,LPID_RSVD		/* switch to reserved LPID */
+	mtspr	SPRN_LPID,r8
+	ptesync
+	mtspr	SPRN_SDR1,r6		/* switch to partition page table */
+	mtspr	SPRN_LPID,r7
+	isync
+	lis	r8,0x7fff		/* MAX_INT@h */
+	mtspr	SPRN_HDEC,r8
+
+	ld	r8,KVM_HOST_LPCR(r4)
+	mtspr	SPRN_LPCR,r8
+	isync
+
+	/* load host SLB entries */
+	ld	r8,PACA_SLBSHADOWPTR(r13)
+
+	.rept	SLB_NUM_BOLTED
+	ld	r5,SLBSHADOW_SAVEAREA(r8)
+	ld	r6,SLBSHADOW_SAVEAREA+8(r8)
+	andis.	r7,r5,SLB_ESID_V@h
+	beq	1f
+	slbmte	r6,r5
+1:	addi	r8,r8,16
+	.endr
+
+	/* Save and reset AMR and UAMOR before turning on the MMU */
+	mfspr	r5,SPRN_AMR
+	mfspr	r6,SPRN_UAMOR
+	std	r5,VCPU_AMR(r9)
+	std	r6,VCPU_UAMOR(r9)
+	li	r6,0
+	mtspr	SPRN_AMR,r6
+
+	/* Restore host DABR and DABRX */
+	ld	r5,HSTATE_DABR(r13)
+	li	r6,7
+	mtspr	SPRN_DABR,r5
+	mtspr	SPRN_DABRX,r6
+
+	/* Switch DSCR back to host value */
+	mfspr	r8, SPRN_DSCR
+	ld	r7, HSTATE_DSCR(r13)
+	std	r8, VCPU_DSCR(r7)
+	mtspr	SPRN_DSCR, r7
+
+	/* Save non-volatile GPRs */
+	std	r14, VCPU_GPR(r14)(r9)
+	std	r15, VCPU_GPR(r15)(r9)
+	std	r16, VCPU_GPR(r16)(r9)
+	std	r17, VCPU_GPR(r17)(r9)
+	std	r18, VCPU_GPR(r18)(r9)
+	std	r19, VCPU_GPR(r19)(r9)
+	std	r20, VCPU_GPR(r20)(r9)
+	std	r21, VCPU_GPR(r21)(r9)
+	std	r22, VCPU_GPR(r22)(r9)
+	std	r23, VCPU_GPR(r23)(r9)
+	std	r24, VCPU_GPR(r24)(r9)
+	std	r25, VCPU_GPR(r25)(r9)
+	std	r26, VCPU_GPR(r26)(r9)
+	std	r27, VCPU_GPR(r27)(r9)
+	std	r28, VCPU_GPR(r28)(r9)
+	std	r29, VCPU_GPR(r29)(r9)
+	std	r30, VCPU_GPR(r30)(r9)
+	std	r31, VCPU_GPR(r31)(r9)
+
+	/* Save SPRGs */
+	mfspr	r3, SPRN_SPRG0
+	mfspr	r4, SPRN_SPRG1
+	mfspr	r5, SPRN_SPRG2
+	mfspr	r6, SPRN_SPRG3
+	std	r3, VCPU_SPRG0(r9)
+	std	r4, VCPU_SPRG1(r9)
+	std	r5, VCPU_SPRG2(r9)
+	std	r6, VCPU_SPRG3(r9)
+
+	/* Save PMU registers */
+	li	r3, 1
+	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
+	mfspr	r4, SPRN_MMCR0		/* save MMCR0 */
+	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable ints */
+	isync
+	mfspr	r5, SPRN_MMCR1
+	mfspr	r6, SPRN_MMCRA
+	std	r4, VCPU_MMCR(r9)
+	std	r5, VCPU_MMCR + 8(r9)
+	std	r6, VCPU_MMCR + 16(r9)
+	mfspr	r3, SPRN_PMC1
+	mfspr	r4, SPRN_PMC2
+	mfspr	r5, SPRN_PMC3
+	mfspr	r6, SPRN_PMC4
+	mfspr	r7, SPRN_PMC5
+	mfspr	r8, SPRN_PMC6
+	stw	r3, VCPU_PMC(r9)
+	stw	r4, VCPU_PMC + 4(r9)
+	stw	r5, VCPU_PMC + 8(r9)
+	stw	r6, VCPU_PMC + 12(r9)
+	stw	r7, VCPU_PMC + 16(r9)
+	stw	r8, VCPU_PMC + 20(r9)
+22:
+	/* save FP state */
+	mr	r3, r9
+	bl	.kvmppc_save_fp
+
+	/*
+	 * Reload DEC.  HDEC interrupts were disabled when
+	 * we reloaded the host's LPCR value.
+	 */
+	ld	r3, HSTATE_DECEXP(r13)
+	mftb	r4
+	subf	r4, r4, r3
+	mtspr	SPRN_DEC, r4
+
+	/* Reload the host's PMU registers */
+	ld	r3, PACALPPACAPTR(r13)	/* is the host using the PMU? */
+	lbz	r4, LPPACA_PMCINUSE(r3)
+	cmpwi	r4, 0
+	beq	23f			/* skip if not */
+	lwz	r3, HSTATE_PMC(r13)
+	lwz	r4, HSTATE_PMC + 4(r13)
+	lwz	r5, HSTATE_PMC + 8(r13)
+	lwz	r6, HSTATE_PMC + 12(r13)
+	lwz	r8, HSTATE_PMC + 16(r13)
+	lwz	r9, HSTATE_PMC + 20(r13)
+	mtspr	SPRN_PMC1, r3
+	mtspr	SPRN_PMC2, r4
+	mtspr	SPRN_PMC3, r5
+	mtspr	SPRN_PMC4, r6
+	mtspr	SPRN_PMC5, r8
+	mtspr	SPRN_PMC6, r9
+	ld	r3, HSTATE_MMCR(r13)
+	ld	r4, HSTATE_MMCR + 8(r13)
+	ld	r5, HSTATE_MMCR + 16(r13)
+	mtspr	SPRN_MMCR1, r4
+	mtspr	SPRN_MMCRA, r5
+	mtspr	SPRN_MMCR0, r3
+	isync
+23:
+	/*
+	 * For external and machine check interrupts, we need
+	 * to call the Linux handler to process the interrupt.
+	 * We do that by jumping to the interrupt vector address
+	 * which we have in r12.  The [h]rfid at the end of the
+	 * handler will return to the book3s_hv_interrupts.S code.
+	 * For other interrupts we do the rfid to get back
+	 * to the book3s_interrupts.S code here.
+	 */
+	ld	r8, HSTATE_VMHANDLER(r13)
+	ld	r7, HSTATE_HOST_MSR(r13)
+
+	cmpwi	r12, BOOK3S_INTERRUPT_EXTERNAL
+	beq	11f
+	cmpwi	r12, BOOK3S_INTERRUPT_MACHINE_CHECK
+
+	/* RFI into the highmem handler, or branch to interrupt handler */
+	mfmsr	r6
+	mtctr	r12
+	li	r0, MSR_RI
+	andc	r6, r6, r0
+	mtmsrd	r6, 1			/* Clear RI in MSR */
+	mtsrr0	r8
+	mtsrr1	r7
+	beqctr
+	RFI
+
+11:	mtspr	SPRN_HSRR0, r8
+	mtspr	SPRN_HSRR1, r7
+	ba	0x500
+
+6:	mfspr	r6,SPRN_HDAR
+	mfspr	r7,SPRN_HDSISR
+	b	7b
+
+ignore_hdec:
+	mr	r4,r9
+	b	fast_guest_return
+
+bounce_ext_interrupt:
+	mr	r4,r9
+	mtspr	SPRN_SRR0,r10
+	mtspr	SPRN_SRR1,r11
+	li	r10,BOOK3S_INTERRUPT_EXTERNAL
+	LOAD_REG_IMMEDIATE(r11,MSR_SF | MSR_ME);
+	b	fast_guest_return
+
+/*
+ * Save away FP, VMX and VSX registers.
+ * r3 = vcpu pointer
+*/
+_GLOBAL(kvmppc_save_fp)
+	mfmsr	r9
+	ori	r8,r9,MSR_FP
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+	oris	r8,r8,MSR_VEC@h
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+#endif
+#ifdef CONFIG_VSX
+BEGIN_FTR_SECTION
+	oris	r8,r8,MSR_VSX@h
+END_FTR_SECTION_IFSET(CPU_FTR_VSX)
+#endif
+	mtmsrd	r8
+	isync
+#ifdef CONFIG_VSX
+BEGIN_FTR_SECTION
+	reg = 0
+	.rept	32
+	li	r6,reg*16+VCPU_VSRS
+	stxvd2x	reg,r6,r3
+	reg = reg + 1
+	.endr
+FTR_SECTION_ELSE
+#endif
+	reg = 0
+	.rept	32
+	stfd	reg,reg*8+VCPU_FPRS(r3)
+	reg = reg + 1
+	.endr
+#ifdef CONFIG_VSX
+ALT_FTR_SECTION_END_IFSET(CPU_FTR_VSX)
+#endif
+	mffs	fr0
+	stfd	fr0,VCPU_FPSCR(r3)
+
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+	reg = 0
+	.rept	32
+	li	r6,reg*16+VCPU_VRS
+	stvx	reg,r6,r3
+	reg = reg + 1
+	.endr
+	mfvscr	vr0
+	li	r6,VCPU_VSCR
+	stvx	vr0,r6,r3
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+#endif
+	mfspr	r6,SPRN_VRSAVE
+	stw	r6,VCPU_VRSAVE(r3)
+	mtmsrd	r9
+	isync
+	blr
+
+/*
+ * Load up FP, VMX and VSX registers
+ * r4 = vcpu pointer
+ */
+	.globl	kvmppc_load_fp
+kvmppc_load_fp:
+	mfmsr	r9
+	ori	r8,r9,MSR_FP
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+	oris	r8,r8,MSR_VEC@h
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+#endif
+#ifdef CONFIG_VSX
+BEGIN_FTR_SECTION
+	oris	r8,r8,MSR_VSX@h
+END_FTR_SECTION_IFSET(CPU_FTR_VSX)
+#endif
+	mtmsrd	r8
+	isync
+	lfd	fr0,VCPU_FPSCR(r4)
+	MTFSF_L(fr0)
+#ifdef CONFIG_VSX
+BEGIN_FTR_SECTION
+	reg = 0
+	.rept	32
+	li	r7,reg*16+VCPU_VSRS
+	lxvd2x	reg,r7,r4
+	reg = reg + 1
+	.endr
+FTR_SECTION_ELSE
+#endif
+	reg = 0
+	.rept	32
+	lfd	reg,reg*8+VCPU_FPRS(r4)
+	reg = reg + 1
+	.endr
+#ifdef CONFIG_VSX
+ALT_FTR_SECTION_END_IFSET(CPU_FTR_VSX)
+#endif
+
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+	li	r7,VCPU_VSCR
+	lvx	vr0,r7,r4
+	mtvscr	vr0
+	reg = 0
+	.rept	32
+	li	r7,reg*16+VCPU_VRS
+	lvx	reg,r7,r4
+	reg = reg + 1
+	.endr
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+#endif
+	lwz	r7,VCPU_VRSAVE(r4)
+	mtspr	SPRN_VRSAVE,r7
+	blr
diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S
index 1cc25e8c0cf1..134501691ad0 100644
--- a/arch/powerpc/kvm/book3s_segment.S
+++ b/arch/powerpc/kvm/book3s_segment.S
@@ -89,29 +89,29 @@ kvmppc_handler_trampoline_enter:
 
 	/* Enter guest */
 
-	PPC_LL	r4, (SVCPU_CTR)(r3)
-	PPC_LL	r5, (SVCPU_LR)(r3)
-	lwz	r6, (SVCPU_CR)(r3)
-	lwz	r7, (SVCPU_XER)(r3)
+	PPC_LL	r4, SVCPU_CTR(r3)
+	PPC_LL	r5, SVCPU_LR(r3)
+	lwz	r6, SVCPU_CR(r3)
+	lwz	r7, SVCPU_XER(r3)
 
 	mtctr	r4
 	mtlr	r5
 	mtcr	r6
 	mtxer	r7
 
-	PPC_LL	r0, (SVCPU_R0)(r3)
-	PPC_LL	r1, (SVCPU_R1)(r3)
-	PPC_LL	r2, (SVCPU_R2)(r3)
-	PPC_LL	r4, (SVCPU_R4)(r3)
-	PPC_LL	r5, (SVCPU_R5)(r3)
-	PPC_LL	r6, (SVCPU_R6)(r3)
-	PPC_LL	r7, (SVCPU_R7)(r3)
-	PPC_LL	r8, (SVCPU_R8)(r3)
-	PPC_LL	r9, (SVCPU_R9)(r3)
-	PPC_LL	r10, (SVCPU_R10)(r3)
-	PPC_LL	r11, (SVCPU_R11)(r3)
-	PPC_LL	r12, (SVCPU_R12)(r3)
-	PPC_LL	r13, (SVCPU_R13)(r3)
+	PPC_LL	r0, SVCPU_R0(r3)
+	PPC_LL	r1, SVCPU_R1(r3)
+	PPC_LL	r2, SVCPU_R2(r3)
+	PPC_LL	r4, SVCPU_R4(r3)
+	PPC_LL	r5, SVCPU_R5(r3)
+	PPC_LL	r6, SVCPU_R6(r3)
+	PPC_LL	r7, SVCPU_R7(r3)
+	PPC_LL	r8, SVCPU_R8(r3)
+	PPC_LL	r9, SVCPU_R9(r3)
+	PPC_LL	r10, SVCPU_R10(r3)
+	PPC_LL	r11, SVCPU_R11(r3)
+	PPC_LL	r12, SVCPU_R12(r3)
+	PPC_LL	r13, SVCPU_R13(r3)
 
 	PPC_LL	r3, (SVCPU_R3)(r3)
 
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 026036efcde0..3a4f379ee70f 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -38,8 +38,12 @@
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 {
+#ifndef CONFIG_KVM_BOOK3S_64_HV
 	return !(v->arch.shared->msr & MSR_WE) ||
 	       !!(v->arch.pending_exceptions);
+#else
+	return 1;
+#endif
 }
 
 int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
@@ -184,10 +188,13 @@ int kvm_dev_ioctl_check_extension(long ext)
 #else
 	case KVM_CAP_PPC_SEGSTATE:
 #endif
-	case KVM_CAP_PPC_PAIRED_SINGLES:
 	case KVM_CAP_PPC_UNSET_IRQ:
 	case KVM_CAP_PPC_IRQ_LEVEL:
 	case KVM_CAP_ENABLE_CAP:
+		r = 1;
+		break;
+#ifndef CONFIG_KVM_BOOK3S_64_HV
+	case KVM_CAP_PPC_PAIRED_SINGLES:
 	case KVM_CAP_PPC_OSI:
 	case KVM_CAP_PPC_GET_PVINFO:
 		r = 1;
@@ -195,6 +202,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_COALESCED_MMIO:
 		r = KVM_COALESCED_MMIO_PAGE_OFFSET;
 		break;
+#endif
 	default:
 		r = 0;
 		break;
@@ -291,6 +299,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
 	tasklet_init(&vcpu->arch.tasklet, kvmppc_decrementer_func, (ulong)vcpu);
 	vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup;
+	vcpu->arch.dec_expires = ~(u64)0;
 
 #ifdef CONFIG_KVM_EXIT_TIMING
 	mutex_init(&vcpu->arch.exit_timing_lock);
@@ -317,6 +326,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	mtspr(SPRN_VRSAVE, vcpu->arch.vrsave);
 #endif
 	kvmppc_core_vcpu_load(vcpu, cpu);
+	vcpu->cpu = smp_processor_id();
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -325,6 +335,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 #ifdef CONFIG_BOOKE
 	vcpu->arch.vrsave = mfspr(SPRN_VRSAVE);
 #endif
+	vcpu->cpu = -1;
 }
 
 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
@@ -496,6 +507,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 		for (i = 0; i < 32; i++)
 			kvmppc_set_gpr(vcpu, i, gprs[i]);
 		vcpu->arch.osi_needed = 0;
+	} else if (vcpu->arch.hcall_needed) {
+		int i;
+
+		kvmppc_set_gpr(vcpu, 3, run->papr_hcall.ret);
+		for (i = 0; i < 9; ++i)
+			kvmppc_set_gpr(vcpu, 4 + i, run->papr_hcall.args[i]);
+		vcpu->arch.hcall_needed = 0;
 	}
 
 	kvmppc_core_deliver_interrupts(vcpu);
@@ -518,6 +536,8 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
 	if (waitqueue_active(&vcpu->wq)) {
 		wake_up_interruptible(&vcpu->wq);
 		vcpu->stat.halt_wakeup++;
+	} else if (vcpu->cpu != -1) {
+		smp_send_reschedule(vcpu->cpu);
 	}
 
 	return 0;
diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h
index d62a14b2cd0f..b135d3d397db 100644
--- a/arch/powerpc/kvm/trace.h
+++ b/arch/powerpc/kvm/trace.h
@@ -103,7 +103,7 @@ TRACE_EVENT(kvm_gtlb_write,
  *                         Book3S trace points                           *
  *************************************************************************/
 
-#ifdef CONFIG_PPC_BOOK3S
+#ifdef CONFIG_KVM_BOOK3S_PR
 
 TRACE_EVENT(kvm_book3s_exit,
 	TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu),
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 9c9ca7c7b491..a156294fc22a 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -161,6 +161,7 @@ struct kvm_pit_config {
 #define KVM_EXIT_NMI              16
 #define KVM_EXIT_INTERNAL_ERROR   17
 #define KVM_EXIT_OSI              18
+#define KVM_EXIT_PAPR_HCALL	  19
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 #define KVM_INTERNAL_ERROR_EMULATION 1
@@ -264,6 +265,11 @@ struct kvm_run {
 		struct {
 			__u64 gprs[32];
 		} osi;
+		struct {
+			__u64 nr;
+			__u64 ret;
+			__u64 args[9];
+		} papr_hcall;
 		/* Fix the size of the union. */
 		char padding[256];
 	};
-- 
cgit v1.2.3


From 54738c097163c3f01e67ccc85462b78d4d4f495f Mon Sep 17 00:00:00 2001
From: David Gibson <dwg@au1.ibm.com>
Date: Wed, 29 Jun 2011 00:22:41 +0000
Subject: KVM: PPC: Accelerate H_PUT_TCE by implementing it in real mode

This improves I/O performance for guests using the PAPR
paravirtualization interface by making the H_PUT_TCE hcall faster, by
implementing it in real mode.  H_PUT_TCE is used for updating virtual
IOMMU tables, and is used both for virtual I/O and for real I/O in the
PAPR interface.

Since this moves the IOMMU tables into the kernel, we define a new
KVM_CREATE_SPAPR_TCE ioctl to allow qemu to create the tables.  The
ioctl returns a file descriptor which can be used to mmap the newly
created table.  The qemu driver models use them in the same way as
userspace managed tables, but they can be updated directly by the
guest with a real-mode H_PUT_TCE implementation, reducing the number
of host/guest context switches during guest IO.

There are certain circumstances where it is useful for userland qemu
to write to the TCE table even if the kernel H_PUT_TCE path is used
most of the time.  Specifically, allowing this will avoid awkwardness
when we need to reset the table.  More importantly, we will in the
future need to write the table in order to restore its state after a
checkpoint resume or migration.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 Documentation/virtual/kvm/api.txt        |  35 ++++++++++
 arch/powerpc/include/asm/kvm.h           |   9 +++
 arch/powerpc/include/asm/kvm_book3s_64.h |   2 +
 arch/powerpc/include/asm/kvm_host.h      |   9 +++
 arch/powerpc/include/asm/kvm_ppc.h       |   2 +
 arch/powerpc/kvm/Makefile                |   3 +-
 arch/powerpc/kvm/book3s_64_vio_hv.c      |  73 +++++++++++++++++++
 arch/powerpc/kvm/book3s_hv.c             | 116 ++++++++++++++++++++++++++++++-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S  |   2 +-
 arch/powerpc/kvm/powerpc.c               |  18 +++++
 include/linux/kvm.h                      |   2 +
 11 files changed, 268 insertions(+), 3 deletions(-)
 create mode 100644 arch/powerpc/kvm/book3s_64_vio_hv.c

(limited to 'include/linux')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index e8875fef3eb8..a1d344d5ff4c 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1350,6 +1350,41 @@ The following flags are defined:
 If datamatch flag is set, the event will be signaled only if the written value
 to the registered address is equal to datamatch in struct kvm_ioeventfd.
 
+4.62 KVM_CREATE_SPAPR_TCE
+
+Capability: KVM_CAP_SPAPR_TCE
+Architectures: powerpc
+Type: vm ioctl
+Parameters: struct kvm_create_spapr_tce (in)
+Returns: file descriptor for manipulating the created TCE table
+
+This creates a virtual TCE (translation control entry) table, which
+is an IOMMU for PAPR-style virtual I/O.  It is used to translate
+logical addresses used in virtual I/O into guest physical addresses,
+and provides a scatter/gather capability for PAPR virtual I/O.
+
+/* for KVM_CAP_SPAPR_TCE */
+struct kvm_create_spapr_tce {
+	__u64 liobn;
+	__u32 window_size;
+};
+
+The liobn field gives the logical IO bus number for which to create a
+TCE table.  The window_size field specifies the size of the DMA window
+which this TCE table will translate - the table will contain one 64
+bit TCE entry for every 4kiB of the DMA window.
+
+When the guest issues an H_PUT_TCE hcall on a liobn for which a TCE
+table has been created using this ioctl(), the kernel will handle it
+in real mode, updating the TCE table.  H_PUT_TCE calls for other
+liobns will cause a vm exit and must be handled by userspace.
+
+The return value is a file descriptor which can be passed to mmap(2)
+to map the created TCE table into userspace.  This lets userspace read
+the entries written by kernel-handled H_PUT_TCE calls, and also lets
+userspace update the TCE table directly which is useful in some
+circumstances.
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h
index d2ca5ed3877b..c3ec990daf45 100644
--- a/arch/powerpc/include/asm/kvm.h
+++ b/arch/powerpc/include/asm/kvm.h
@@ -22,6 +22,9 @@
 
 #include <linux/types.h>
 
+/* Select powerpc specific features in <linux/kvm.h> */
+#define __KVM_HAVE_SPAPR_TCE
+
 struct kvm_regs {
 	__u64 pc;
 	__u64 cr;
@@ -272,4 +275,10 @@ struct kvm_guest_debug_arch {
 #define KVM_INTERRUPT_UNSET	-2U
 #define KVM_INTERRUPT_SET_LEVEL	-3U
 
+/* for KVM_CAP_SPAPR_TCE */
+struct kvm_create_spapr_tce {
+	__u64 liobn;
+	__u32 window_size;
+};
+
 #endif /* __LINUX_KVM_POWERPC_H */
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 5f73388ea0af..e43fe42b9875 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -27,4 +27,6 @@ static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu)
 }
 #endif
 
+#define SPAPR_TCE_SHIFT		12
+
 #endif /* __ASM_KVM_BOOK3S_64_H__ */
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 6ebf1721680a..5616e39a7fa4 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -144,6 +144,14 @@ struct kvmppc_pginfo {
 	atomic_t refcnt;
 };
 
+struct kvmppc_spapr_tce_table {
+	struct list_head list;
+	struct kvm *kvm;
+	u64 liobn;
+	u32 window_size;
+	struct page *pages[0];
+};
+
 struct kvm_arch {
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 	unsigned long hpt_virt;
@@ -157,6 +165,7 @@ struct kvm_arch {
 	unsigned long sdr1;
 	unsigned long host_sdr1;
 	int tlbie_lock;
+	struct list_head spapr_tce_tables;
 	unsigned short last_vcpu[NR_CPUS];
 #endif /* CONFIG_KVM_BOOK3S_64_HV */
 };
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 2afe92e6f625..99f6fcf4cf88 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -119,6 +119,8 @@ extern long kvmppc_prepare_vrma(struct kvm *kvm,
 extern void kvmppc_map_vrma(struct kvm *kvm,
 			    struct kvm_userspace_memory_region *mem);
 extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
+extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
+				struct kvm_create_spapr_tce *args);
 extern int kvmppc_core_init_vm(struct kvm *kvm);
 extern void kvmppc_core_destroy_vm(struct kvm *kvm);
 extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 2ecffc0dc1bb..1de3d54901d4 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -55,7 +55,8 @@ kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
 	book3s_hv_interrupts.o \
 	book3s_64_mmu_hv.o
 kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
-	book3s_hv_rm_mmu.o
+	book3s_hv_rm_mmu.o \
+	book3s_64_vio_hv.o
 
 kvm-book3s_64-module-objs := \
 	../../../virt/kvm/kvm_main.o \
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
new file mode 100644
index 000000000000..ea0f8c537c28
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -0,0 +1,73 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/highmem.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/hugetlb.h>
+#include <linux/list.h>
+
+#include <asm/tlbflush.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu-hash64.h>
+#include <asm/hvcall.h>
+#include <asm/synch.h>
+#include <asm/ppc-opcode.h>
+#include <asm/kvm_host.h>
+#include <asm/udbg.h>
+
+#define TCES_PER_PAGE	(PAGE_SIZE / sizeof(u64))
+
+long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
+		      unsigned long ioba, unsigned long tce)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvmppc_spapr_tce_table *stt;
+
+	/* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */
+	/* 	    liobn, ioba, tce); */
+
+	list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
+		if (stt->liobn == liobn) {
+			unsigned long idx = ioba >> SPAPR_TCE_SHIFT;
+			struct page *page;
+			u64 *tbl;
+
+			/* udbg_printf("H_PUT_TCE: liobn 0x%lx => stt=%p  window_size=0x%x\n", */
+			/* 	    liobn, stt, stt->window_size); */
+			if (ioba >= stt->window_size)
+				return H_PARAMETER;
+
+			page = stt->pages[idx / TCES_PER_PAGE];
+			tbl = (u64 *)page_address(page);
+
+			/* FIXME: Need to validate the TCE itself */
+			/* udbg_printf("tce @ %p\n", &tbl[idx % TCES_PER_PAGE]); */
+			tbl[idx % TCES_PER_PAGE] = tce;
+			return H_SUCCESS;
+		}
+	}
+
+	/* Didn't find the liobn, punt it to userspace */
+	return H_TOO_HARD;
+}
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index af862c30b70e..6fe469eabce8 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -538,6 +538,116 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	return r;
 }
 
+static long kvmppc_stt_npages(unsigned long window_size)
+{
+	return ALIGN((window_size >> SPAPR_TCE_SHIFT)
+		     * sizeof(u64), PAGE_SIZE) / PAGE_SIZE;
+}
+
+static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt)
+{
+	struct kvm *kvm = stt->kvm;
+	int i;
+
+	mutex_lock(&kvm->lock);
+	list_del(&stt->list);
+	for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++)
+		__free_page(stt->pages[i]);
+	kfree(stt);
+	mutex_unlock(&kvm->lock);
+
+	kvm_put_kvm(kvm);
+}
+
+static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data;
+	struct page *page;
+
+	if (vmf->pgoff >= kvmppc_stt_npages(stt->window_size))
+		return VM_FAULT_SIGBUS;
+
+	page = stt->pages[vmf->pgoff];
+	get_page(page);
+	vmf->page = page;
+	return 0;
+}
+
+static const struct vm_operations_struct kvm_spapr_tce_vm_ops = {
+	.fault = kvm_spapr_tce_fault,
+};
+
+static int kvm_spapr_tce_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	vma->vm_ops = &kvm_spapr_tce_vm_ops;
+	return 0;
+}
+
+static int kvm_spapr_tce_release(struct inode *inode, struct file *filp)
+{
+	struct kvmppc_spapr_tce_table *stt = filp->private_data;
+
+	release_spapr_tce_table(stt);
+	return 0;
+}
+
+static struct file_operations kvm_spapr_tce_fops = {
+	.mmap           = kvm_spapr_tce_mmap,
+	.release	= kvm_spapr_tce_release,
+};
+
+long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
+				   struct kvm_create_spapr_tce *args)
+{
+	struct kvmppc_spapr_tce_table *stt = NULL;
+	long npages;
+	int ret = -ENOMEM;
+	int i;
+
+	/* Check this LIOBN hasn't been previously allocated */
+	list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
+		if (stt->liobn == args->liobn)
+			return -EBUSY;
+	}
+
+	npages = kvmppc_stt_npages(args->window_size);
+
+	stt = kzalloc(sizeof(*stt) + npages* sizeof(struct page *),
+		      GFP_KERNEL);
+	if (!stt)
+		goto fail;
+
+	stt->liobn = args->liobn;
+	stt->window_size = args->window_size;
+	stt->kvm = kvm;
+
+	for (i = 0; i < npages; i++) {
+		stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
+		if (!stt->pages[i])
+			goto fail;
+	}
+
+	kvm_get_kvm(kvm);
+
+	mutex_lock(&kvm->lock);
+	list_add(&stt->list, &kvm->arch.spapr_tce_tables);
+
+	mutex_unlock(&kvm->lock);
+
+	return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
+				stt, O_RDWR);
+
+fail:
+	if (stt) {
+		for (i = 0; i < npages; i++)
+			if (stt->pages[i])
+				__free_page(stt->pages[i]);
+
+		kfree(stt);
+	}
+	return ret;
+}
+
 int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem)
 {
@@ -559,13 +669,17 @@ int kvmppc_core_init_vm(struct kvm *kvm)
 
 	/* Allocate hashed page table */
 	r = kvmppc_alloc_hpt(kvm);
+	if (r)
+		return r;
 
-	return r;
+	INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
+	return 0;
 }
 
 void kvmppc_core_destroy_vm(struct kvm *kvm)
 {
 	kvmppc_free_hpt(kvm);
+	WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));
 }
 
 /* These are stubs for now */
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 319ff63b1f31..e6adaadcdff2 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -754,7 +754,7 @@ hcall_real_table:
 	.long	0		/* 0x14 - H_CLEAR_REF */
 	.long	.kvmppc_h_protect - hcall_real_table
 	.long	0		/* 0x1c - H_GET_TCE */
-	.long	0		/* 0x20 - H_SET_TCE */
+	.long	.kvmppc_h_put_tce - hcall_real_table
 	.long	0		/* 0x24 - H_SET_SPRG0 */
 	.long	.kvmppc_h_set_dabr - hcall_real_table
 	.long	0		/* 0x2c */
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 6fc9ee499b61..c78ceb9d5605 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -202,6 +202,11 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_COALESCED_MMIO:
 		r = KVM_COALESCED_MMIO_PAGE_OFFSET;
 		break;
+#endif
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+	case KVM_CAP_SPAPR_TCE:
+		r = 1;
+		break;
 #endif
 	default:
 		r = 0;
@@ -653,6 +658,19 @@ long kvm_arch_vm_ioctl(struct file *filp,
 
 		break;
 	}
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+	case KVM_CREATE_SPAPR_TCE: {
+		struct kvm_create_spapr_tce create_tce;
+		struct kvm *kvm = filp->private_data;
+
+		r = -EFAULT;
+		if (copy_from_user(&create_tce, argp, sizeof(create_tce)))
+			goto out;
+		r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce);
+		goto out;
+	}
+#endif /* CONFIG_KVM_BOOK3S_64_HV */
+
 	default:
 		r = -ENOTTY;
 	}
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index a156294fc22a..61f56502732e 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -550,6 +550,7 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_TSC_CONTROL 60
 #define KVM_CAP_GET_TSC_KHZ 61
 #define KVM_CAP_PPC_BOOKE_SREGS 62
+#define KVM_CAP_SPAPR_TCE 63
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -752,6 +753,7 @@ struct kvm_clock_data {
 /* Available with KVM_CAP_XCRS */
 #define KVM_GET_XCRS		  _IOR(KVMIO,  0xa6, struct kvm_xcrs)
 #define KVM_SET_XCRS		  _IOW(KVMIO,  0xa7, struct kvm_xcrs)
+#define KVM_CREATE_SPAPR_TCE	  _IOW(KVMIO,  0xa8, struct kvm_create_spapr_tce)
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 
-- 
cgit v1.2.3


From 371fefd6f2dc46668e00871930dde613b88d4bde Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 29 Jun 2011 00:23:08 +0000
Subject: KVM: PPC: Allow book3s_hv guests to use SMT processor modes

This lifts the restriction that book3s_hv guests can only run one
hardware thread per core, and allows them to use up to 4 threads
per core on POWER7.  The host still has to run single-threaded.

This capability is advertised to qemu through a new KVM_CAP_PPC_SMT
capability.  The return value of the ioctl querying this capability
is the number of vcpus per virtual CPU core (vcore), currently 4.

To use this, the host kernel should be booted with all threads
active, and then all the secondary threads should be offlined.
This will put the secondary threads into nap mode.  KVM will then
wake them from nap mode and use them for running guest code (while
they are still offline).  To wake the secondary threads, we send
them an IPI using a new xics_wake_cpu() function, implemented in
arch/powerpc/sysdev/xics/icp-native.c.  In other words, at this stage
we assume that the platform has a XICS interrupt controller and
we are using icp-native.c to drive it.  Since the woken thread will
need to acknowledge and clear the IPI, we also export the base
physical address of the XICS registers using kvmppc_set_xics_phys()
for use in the low-level KVM book3s code.

When a vcpu is created, it is assigned to a virtual CPU core.
The vcore number is obtained by dividing the vcpu number by the
number of threads per core in the host.  This number is exported
to userspace via the KVM_CAP_PPC_SMT capability.  If qemu wishes
to run the guest in single-threaded mode, it should make all vcpu
numbers be multiples of the number of threads per core.

We distinguish three states of a vcpu: runnable (i.e., ready to execute
the guest), blocked (that is, idle), and busy in host.  We currently
implement a policy that the vcore can run only when all its threads
are runnable or blocked.  This way, if a vcpu needs to execute elsewhere
in the kernel or in qemu, it can do so without being starved of CPU
by the other vcpus.

When a vcore starts to run, it executes in the context of one of the
vcpu threads.  The other vcpu threads all go to sleep and stay asleep
until something happens requiring the vcpu thread to return to qemu,
or to wake up to run the vcore (this can happen when another vcpu
thread goes from busy in host state to blocked).

It can happen that a vcpu goes from blocked to runnable state (e.g.
because of an interrupt), and the vcore it belongs to is already
running.  In that case it can start to run immediately as long as
the none of the vcpus in the vcore have started to exit the guest.
We send the next free thread in the vcore an IPI to get it to start
to execute the guest.  It synchronizes with the other threads via
the vcore->entry_exit_count field to make sure that it doesn't go
into the guest if the other vcpus are exiting by the time that it
is ready to actually enter the guest.

Note that there is no fixed relationship between the hardware thread
number and the vcpu number.  Hardware threads are assigned to vcpus
as they become runnable, so we will always use the lower-numbered
hardware threads in preference to higher-numbered threads if not all
the vcpus in the vcore are runnable, regardless of which vcpus are
runnable.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 Documentation/virtual/kvm/api.txt         |  13 ++
 arch/powerpc/include/asm/kvm.h            |   1 +
 arch/powerpc/include/asm/kvm_book3s_asm.h |   2 +
 arch/powerpc/include/asm/kvm_host.h       |  46 ++++-
 arch/powerpc/include/asm/kvm_ppc.h        |  13 ++
 arch/powerpc/kernel/asm-offsets.c         |   6 +
 arch/powerpc/kernel/exceptions-64s.S      |  31 ++-
 arch/powerpc/kernel/idle_power7.S         |   2 -
 arch/powerpc/kvm/book3s_hv.c              | 316 +++++++++++++++++++++++++++---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   | 168 +++++++++++++++-
 arch/powerpc/kvm/powerpc.c                |   4 +
 arch/powerpc/sysdev/xics/icp-native.c     |   9 +
 include/linux/kvm.h                       |   1 +
 13 files changed, 567 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index a1d344d5ff4c..681871311d3e 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -180,6 +180,19 @@ KVM_CHECK_EXTENSION ioctl() to determine the value for max_vcpus at run-time.
 If the KVM_CAP_NR_VCPUS does not exist, you should assume that max_vcpus is 4
 cpus max.
 
+On powerpc using book3s_hv mode, the vcpus are mapped onto virtual
+threads in one or more virtual CPU cores.  (This is because the
+hardware requires all the hardware threads in a CPU core to be in the
+same partition.)  The KVM_CAP_PPC_SMT capability indicates the number
+of vcpus per virtual core (vcore).  The vcore id is obtained by
+dividing the vcpu id by the number of vcpus per vcore.  The vcpus in a
+given vcore will always be in the same physical core as each other
+(though that might be a different physical core from time to time).
+Userspace can control the threading (SMT) mode of the guest by its
+allocation of vcpu ids.  For example, if userspace wants
+single-threaded guest vcpus, it should make all vcpu ids be a multiple
+of the number of vcpus per vcore.
+
 4.8 KVM_GET_DIRTY_LOG (vm ioctl)
 
 Capability: basic
diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h
index c3ec990daf45..471bb3d85e0b 100644
--- a/arch/powerpc/include/asm/kvm.h
+++ b/arch/powerpc/include/asm/kvm.h
@@ -24,6 +24,7 @@
 
 /* Select powerpc specific features in <linux/kvm.h> */
 #define __KVM_HAVE_SPAPR_TCE
+#define __KVM_HAVE_PPC_SMT
 
 struct kvm_regs {
 	__u64 pc;
diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index b7b039532fbc..9cfd5436782d 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -78,6 +78,8 @@ struct kvmppc_host_state {
 
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 	struct kvm_vcpu *kvm_vcpu;
+	struct kvmppc_vcore *kvm_vcore;
+	unsigned long xics_phys;
 	u64 dabr;
 	u64 host_mmcr[3];
 	u32 host_pmc[6];
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 5616e39a7fa4..0d6d569e19c7 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -25,10 +25,14 @@
 #include <linux/interrupt.h>
 #include <linux/types.h>
 #include <linux/kvm_types.h>
+#include <linux/threads.h>
+#include <linux/spinlock.h>
 #include <linux/kvm_para.h>
 #include <asm/kvm_asm.h>
+#include <asm/processor.h>
 
-#define KVM_MAX_VCPUS 1
+#define KVM_MAX_VCPUS		NR_CPUS
+#define KVM_MAX_VCORES		NR_CPUS
 #define KVM_MEMORY_SLOTS 32
 /* memory slots that does not exposed to userspace */
 #define KVM_PRIVATE_MEM_SLOTS 4
@@ -167,9 +171,34 @@ struct kvm_arch {
 	int tlbie_lock;
 	struct list_head spapr_tce_tables;
 	unsigned short last_vcpu[NR_CPUS];
+	struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
 #endif /* CONFIG_KVM_BOOK3S_64_HV */
 };
 
+/*
+ * Struct for a virtual core.
+ * Note: entry_exit_count combines an entry count in the bottom 8 bits
+ * and an exit count in the next 8 bits.  This is so that we can
+ * atomically increment the entry count iff the exit count is 0
+ * without taking the lock.
+ */
+struct kvmppc_vcore {
+	int n_runnable;
+	int n_blocked;
+	int num_threads;
+	int entry_exit_count;
+	int n_woken;
+	int nap_count;
+	u16 pcpu;
+	u8 vcore_running;
+	u8 in_guest;
+	struct list_head runnable_threads;
+	spinlock_t lock;
+};
+
+#define VCORE_ENTRY_COUNT(vc)	((vc)->entry_exit_count & 0xff)
+#define VCORE_EXIT_COUNT(vc)	((vc)->entry_exit_count >> 8)
+
 struct kvmppc_pte {
 	ulong eaddr;
 	u64 vpage;
@@ -365,14 +394,29 @@ struct kvm_vcpu_arch {
 	struct slb_shadow *slb_shadow;
 	struct dtl *dtl;
 	struct dtl *dtl_end;
+
+	struct kvmppc_vcore *vcore;
+	int ret;
 	int trap;
+	int state;
+	int ptid;
+	wait_queue_head_t cpu_run;
+
 	struct kvm_vcpu_arch_shared *shared;
 	unsigned long magic_page_pa; /* phys addr to map the magic page to */
 	unsigned long magic_page_ea; /* effect. addr to map the magic page to */
 
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 	struct kvm_vcpu_arch_shared shregs;
+
+	struct list_head run_list;
+	struct task_struct *run_task;
+	struct kvm_run *kvm_run;
 #endif
 };
 
+#define KVMPPC_VCPU_BUSY_IN_HOST	0
+#define KVMPPC_VCPU_BLOCKED		1
+#define KVMPPC_VCPU_RUNNABLE		2
+
 #endif /* __POWERPC_KVM_HOST_H__ */
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 99f6fcf4cf88..6ef734428634 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -33,6 +33,9 @@
 #else
 #include <asm/kvm_booke.h>
 #endif
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+#include <asm/paca.h>
+#endif
 
 enum emulation_result {
 	EMULATE_DONE,         /* no further processing */
@@ -169,4 +172,14 @@ int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
 
 void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid);
 
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
+{
+	paca[cpu].kvm_hstate.xics_phys = addr;
+}
+#else
+static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
+{}
+#endif
+
 #endif /* __POWERPC_KVM_PPC_H__ */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index c70d106bf1a4..d0f2387fd792 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -471,6 +471,10 @@ int main(void)
 	DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar));
 	DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
 	DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap));
+	DEFINE(VCPU_PTID, offsetof(struct kvm_vcpu, arch.ptid));
+	DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_count));
+	DEFINE(VCORE_NAP_COUNT, offsetof(struct kvmppc_vcore, nap_count));
+	DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest));
 	DEFINE(VCPU_SVCPU, offsetof(struct kvmppc_vcpu_book3s, shadow_vcpu) -
 			   offsetof(struct kvmppc_vcpu_book3s, vcpu));
 	DEFINE(VCPU_SLB_E, offsetof(struct kvmppc_slb, orige));
@@ -530,6 +534,8 @@ int main(void)
 
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 	HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu);
+	HSTATE_FIELD(HSTATE_KVM_VCORE, kvm_vcore);
+	HSTATE_FIELD(HSTATE_XICS_PHYS, xics_phys);
 	HSTATE_FIELD(HSTATE_MMCR, host_mmcr);
 	HSTATE_FIELD(HSTATE_PMC, host_pmc);
 	HSTATE_FIELD(HSTATE_PURR, host_purr);
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 163c041cec24..5bc06fdfa6c0 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -49,19 +49,32 @@ BEGIN_FTR_SECTION
 	 * state loss at this time.
 	 */
 	mfspr	r13,SPRN_SRR1
-	rlwinm	r13,r13,47-31,30,31
-	cmpwi	cr0,r13,1
-	bne	1f
-	b	.power7_wakeup_noloss
-1:	cmpwi	cr0,r13,2
-	bne	1f
-	b	.power7_wakeup_loss
+	rlwinm.	r13,r13,47-31,30,31
+	beq	9f
+
+	/* waking up from powersave (nap) state */
+	cmpwi	cr1,r13,2
 	/* Total loss of HV state is fatal, we could try to use the
 	 * PIR to locate a PACA, then use an emergency stack etc...
 	 * but for now, let's just stay stuck here
 	 */
-1:	cmpwi	cr0,r13,3
-	beq	.
+	bgt	cr1,.
+	GET_PACA(r13)
+
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+	lbz	r0,PACAPROCSTART(r13)
+	cmpwi	r0,0x80
+	bne	1f
+	li	r0,0
+	stb	r0,PACAPROCSTART(r13)
+	b	kvm_start_guest
+1:
+#endif
+
+	beq	cr1,2f
+	b	.power7_wakeup_noloss
+2:	b	.power7_wakeup_loss
+9:
 END_FTR_SECTION_IFSET(CPU_FTR_HVMODE_206)
 #endif /* CONFIG_PPC_P7_NAP */
 	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD,
diff --git a/arch/powerpc/kernel/idle_power7.S b/arch/powerpc/kernel/idle_power7.S
index f8f0bc7f1d4f..3a70845a51c7 100644
--- a/arch/powerpc/kernel/idle_power7.S
+++ b/arch/powerpc/kernel/idle_power7.S
@@ -73,7 +73,6 @@ _GLOBAL(power7_idle)
 	b	.
 
 _GLOBAL(power7_wakeup_loss)
-	GET_PACA(r13)
 	ld	r1,PACAR1(r13)
 	REST_NVGPRS(r1)
 	REST_GPR(2, r1)
@@ -87,7 +86,6 @@ _GLOBAL(power7_wakeup_loss)
 	rfid
 
 _GLOBAL(power7_wakeup_noloss)
-	GET_PACA(r13)
 	ld	r1,PACAR1(r13)
 	ld	r4,_MSR(r1)
 	ld	r5,_NIP(r1)
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 6fe469eabce8..36b6d98f1197 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -39,6 +39,7 @@
 #include <asm/mmu_context.h>
 #include <asm/lppaca.h>
 #include <asm/processor.h>
+#include <asm/cputhreads.h>
 #include <linux/gfp.h>
 #include <linux/sched.h>
 #include <linux/vmalloc.h>
@@ -51,12 +52,16 @@
 void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
 	local_paca->kvm_hstate.kvm_vcpu = vcpu;
+	local_paca->kvm_hstate.kvm_vcore = vcpu->arch.vcore;
 }
 
 void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
 {
 }
 
+static void kvmppc_vcpu_blocked(struct kvm_vcpu *vcpu);
+static void kvmppc_vcpu_unblocked(struct kvm_vcpu *vcpu);
+
 void kvmppc_vcpu_block(struct kvm_vcpu *vcpu)
 {
 	u64 now;
@@ -74,11 +79,15 @@ void kvmppc_vcpu_block(struct kvm_vcpu *vcpu)
 			      HRTIMER_MODE_REL);
 	}
 
+	kvmppc_vcpu_blocked(vcpu);
+
 	kvm_vcpu_block(vcpu);
 	vcpu->stat.halt_wakeup++;
 
 	if (vcpu->arch.dec_expires != ~(u64)0)
 		hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
+
+	kvmppc_vcpu_unblocked(vcpu);
 }
 
 void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
@@ -429,9 +438,16 @@ int kvmppc_core_check_processor_compat(void)
 struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 {
 	struct kvm_vcpu *vcpu;
-	int err = -ENOMEM;
+	int err = -EINVAL;
+	int core;
+	struct kvmppc_vcore *vcore;
 	unsigned long lpcr;
 
+	core = id / threads_per_core;
+	if (core >= KVM_MAX_VCORES)
+		goto out;
+
+	err = -ENOMEM;
 	vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
 	if (!vcpu)
 		goto out;
@@ -454,6 +470,38 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 
 	kvmppc_mmu_book3s_hv_init(vcpu);
 
+	/*
+	 * Some vcpus may start out in stopped state.  If we initialize
+	 * them to busy-in-host state they will stop other vcpus in the
+	 * vcore from running.  Instead we initialize them to blocked
+	 * state, effectively considering them to be stopped until we
+	 * see the first run ioctl for them.
+	 */
+	vcpu->arch.state = KVMPPC_VCPU_BLOCKED;
+
+	init_waitqueue_head(&vcpu->arch.cpu_run);
+
+	mutex_lock(&kvm->lock);
+	vcore = kvm->arch.vcores[core];
+	if (!vcore) {
+		vcore = kzalloc(sizeof(struct kvmppc_vcore), GFP_KERNEL);
+		if (vcore) {
+			INIT_LIST_HEAD(&vcore->runnable_threads);
+			spin_lock_init(&vcore->lock);
+		}
+		kvm->arch.vcores[core] = vcore;
+	}
+	mutex_unlock(&kvm->lock);
+
+	if (!vcore)
+		goto free_vcpu;
+
+	spin_lock(&vcore->lock);
+	++vcore->num_threads;
+	++vcore->n_blocked;
+	spin_unlock(&vcore->lock);
+	vcpu->arch.vcore = vcore;
+
 	return vcpu;
 
 free_vcpu:
@@ -468,21 +516,121 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
 	kfree(vcpu);
 }
 
+static void kvmppc_vcpu_blocked(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+	spin_lock(&vc->lock);
+	vcpu->arch.state = KVMPPC_VCPU_BLOCKED;
+	++vc->n_blocked;
+	if (vc->n_runnable > 0 &&
+	    vc->n_runnable + vc->n_blocked == vc->num_threads) {
+		vcpu = list_first_entry(&vc->runnable_threads, struct kvm_vcpu,
+					arch.run_list);
+		wake_up(&vcpu->arch.cpu_run);
+	}
+	spin_unlock(&vc->lock);
+}
+
+static void kvmppc_vcpu_unblocked(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+	spin_lock(&vc->lock);
+	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
+	--vc->n_blocked;
+	spin_unlock(&vc->lock);
+}
+
 extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
+extern void xics_wake_cpu(int cpu);
 
-static int kvmppc_run_vcpu(struct kvm_run *run, struct kvm_vcpu *vcpu)
+static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
+				   struct kvm_vcpu *vcpu)
 {
-	u64 now;
+	struct kvm_vcpu *v;
 
-	if (signal_pending(current)) {
-		run->exit_reason = KVM_EXIT_INTR;
-		return -EINTR;
+	if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
+		return;
+	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
+	--vc->n_runnable;
+	/* decrement the physical thread id of each following vcpu */
+	v = vcpu;
+	list_for_each_entry_continue(v, &vc->runnable_threads, arch.run_list)
+		--v->arch.ptid;
+	list_del(&vcpu->arch.run_list);
+}
+
+static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
+{
+	int cpu;
+	struct paca_struct *tpaca;
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+	cpu = vc->pcpu + vcpu->arch.ptid;
+	tpaca = &paca[cpu];
+	tpaca->kvm_hstate.kvm_vcpu = vcpu;
+	tpaca->kvm_hstate.kvm_vcore = vc;
+	smp_wmb();
+#ifdef CONFIG_PPC_ICP_NATIVE
+	if (vcpu->arch.ptid) {
+		tpaca->cpu_start = 0x80;
+		tpaca->kvm_hstate.in_guest = KVM_GUEST_MODE_GUEST;
+		wmb();
+		xics_wake_cpu(cpu);
+		++vc->n_woken;
 	}
+#endif
+}
 
-	flush_fp_to_thread(current);
-	flush_altivec_to_thread(current);
-	flush_vsx_to_thread(current);
-	preempt_disable();
+static void kvmppc_wait_for_nap(struct kvmppc_vcore *vc)
+{
+	int i;
+
+	HMT_low();
+	i = 0;
+	while (vc->nap_count < vc->n_woken) {
+		if (++i >= 1000000) {
+			pr_err("kvmppc_wait_for_nap timeout %d %d\n",
+			       vc->nap_count, vc->n_woken);
+			break;
+		}
+		cpu_relax();
+	}
+	HMT_medium();
+}
+
+/*
+ * Check that we are on thread 0 and that any other threads in
+ * this core are off-line.
+ */
+static int on_primary_thread(void)
+{
+	int cpu = smp_processor_id();
+	int thr = cpu_thread_in_core(cpu);
+
+	if (thr)
+		return 0;
+	while (++thr < threads_per_core)
+		if (cpu_online(cpu + thr))
+			return 0;
+	return 1;
+}
+
+/*
+ * Run a set of guest threads on a physical core.
+ * Called with vc->lock held.
+ */
+static int kvmppc_run_core(struct kvmppc_vcore *vc)
+{
+	struct kvm_vcpu *vcpu, *vnext;
+	long ret;
+	u64 now;
+
+	/* don't start if any threads have a signal pending */
+	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
+		if (signal_pending(vcpu->arch.run_task))
+			return 0;
 
 	/*
 	 * Make sure we are running on thread 0, and that
@@ -490,36 +638,150 @@ static int kvmppc_run_vcpu(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	 * XXX we should also block attempts to bring any
 	 * secondary threads online.
 	 */
-	if (threads_per_core > 1) {
-		int cpu = smp_processor_id();
-		int thr = cpu_thread_in_core(cpu);
-
-		if (thr)
-			goto out;
-		while (++thr < threads_per_core)
-			if (cpu_online(cpu + thr))
-				goto out;
+	if (threads_per_core > 1 && !on_primary_thread()) {
+		list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
+			vcpu->arch.ret = -EBUSY;
+		goto out;
 	}
 
-	kvm_guest_enter();
+	vc->n_woken = 0;
+	vc->nap_count = 0;
+	vc->entry_exit_count = 0;
+	vc->vcore_running = 1;
+	vc->in_guest = 0;
+	vc->pcpu = smp_processor_id();
+	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
+		kvmppc_start_thread(vcpu);
+	vcpu = list_first_entry(&vc->runnable_threads, struct kvm_vcpu,
+				arch.run_list);
+
+	spin_unlock(&vc->lock);
 
+	preempt_disable();
+	kvm_guest_enter();
 	__kvmppc_vcore_entry(NULL, vcpu);
 
+	/* wait for secondary threads to finish writing their state to memory */
+	spin_lock(&vc->lock);
+	if (vc->nap_count < vc->n_woken)
+		kvmppc_wait_for_nap(vc);
+	/* prevent other vcpu threads from doing kvmppc_start_thread() now */
+	vc->vcore_running = 2;
+	spin_unlock(&vc->lock);
+
+	/* make sure updates to secondary vcpu structs are visible now */
+	smp_mb();
 	kvm_guest_exit();
 
 	preempt_enable();
 	kvm_resched(vcpu);
 
 	now = get_tb();
-	/* cancel pending dec exception if dec is positive */
-	if (now < vcpu->arch.dec_expires && kvmppc_core_pending_dec(vcpu))
-		kvmppc_core_dequeue_dec(vcpu);
-
-	return kvmppc_handle_exit(run, vcpu, current);
+	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
+		/* cancel pending dec exception if dec is positive */
+		if (now < vcpu->arch.dec_expires &&
+		    kvmppc_core_pending_dec(vcpu))
+			kvmppc_core_dequeue_dec(vcpu);
+		if (!vcpu->arch.trap) {
+			if (signal_pending(vcpu->arch.run_task)) {
+				vcpu->arch.kvm_run->exit_reason = KVM_EXIT_INTR;
+				vcpu->arch.ret = -EINTR;
+			}
+			continue;		/* didn't get to run */
+		}
+		ret = kvmppc_handle_exit(vcpu->arch.kvm_run, vcpu,
+					 vcpu->arch.run_task);
+		vcpu->arch.ret = ret;
+		vcpu->arch.trap = 0;
+	}
 
+	spin_lock(&vc->lock);
  out:
-	preempt_enable();
-	return -EBUSY;
+	vc->vcore_running = 0;
+	list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
+				 arch.run_list) {
+		if (vcpu->arch.ret != RESUME_GUEST) {
+			kvmppc_remove_runnable(vc, vcpu);
+			wake_up(&vcpu->arch.cpu_run);
+		}
+	}
+
+	return 1;
+}
+
+static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+{
+	int ptid;
+	int wait_state;
+	struct kvmppc_vcore *vc;
+	DEFINE_WAIT(wait);
+
+	/* No need to go into the guest when all we do is going out */
+	if (signal_pending(current)) {
+		kvm_run->exit_reason = KVM_EXIT_INTR;
+		return -EINTR;
+	}
+
+	kvm_run->exit_reason = 0;
+	vcpu->arch.ret = RESUME_GUEST;
+	vcpu->arch.trap = 0;
+
+	flush_fp_to_thread(current);
+	flush_altivec_to_thread(current);
+	flush_vsx_to_thread(current);
+
+	/*
+	 * Synchronize with other threads in this virtual core
+	 */
+	vc = vcpu->arch.vcore;
+	spin_lock(&vc->lock);
+	/* This happens the first time this is called for a vcpu */
+	if (vcpu->arch.state == KVMPPC_VCPU_BLOCKED)
+		--vc->n_blocked;
+	vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
+	ptid = vc->n_runnable;
+	vcpu->arch.run_task = current;
+	vcpu->arch.kvm_run = kvm_run;
+	vcpu->arch.ptid = ptid;
+	list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads);
+	++vc->n_runnable;
+
+	wait_state = TASK_INTERRUPTIBLE;
+	while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
+		if (signal_pending(current)) {
+			if (!vc->vcore_running) {
+				kvm_run->exit_reason = KVM_EXIT_INTR;
+				vcpu->arch.ret = -EINTR;
+				break;
+			}
+			/* have to wait for vcore to stop executing guest */
+			wait_state = TASK_UNINTERRUPTIBLE;
+			smp_send_reschedule(vc->pcpu);
+		}
+
+		if (!vc->vcore_running &&
+		    vc->n_runnable + vc->n_blocked == vc->num_threads) {
+			/* we can run now */
+			if (kvmppc_run_core(vc))
+				continue;
+		}
+
+		if (vc->vcore_running == 1 && VCORE_EXIT_COUNT(vc) == 0)
+			kvmppc_start_thread(vcpu);
+
+		/* wait for other threads to come in, or wait for vcore */
+		prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state);
+		spin_unlock(&vc->lock);
+		schedule();
+		finish_wait(&vcpu->arch.cpu_run, &wait);
+		spin_lock(&vc->lock);
+	}
+
+	if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE)
+		kvmppc_remove_runnable(vc, vcpu);
+	spin_unlock(&vc->lock);
+
+	return vcpu->arch.ret;
 }
 
 int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index e6adaadcdff2..c9bf177b7cf2 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -30,8 +30,6 @@
  *                                                                           *
  ****************************************************************************/
 
-#define SHADOW_VCPU_OFF		PACA_KVM_SVCPU
-
 	.globl	kvmppc_skip_interrupt
 kvmppc_skip_interrupt:
 	mfspr	r13,SPRN_SRR0
@@ -79,6 +77,32 @@ _GLOBAL(kvmppc_hv_entry_trampoline)
  *                                                                            *
  *****************************************************************************/
 
+#define XICS_XIRR		4
+#define XICS_QIRR		0xc
+
+/*
+ * We come in here when wakened from nap mode on a secondary hw thread.
+ * Relocation is off and most register values are lost.
+ * r13 points to the PACA.
+ */
+	.globl	kvm_start_guest
+kvm_start_guest:
+	ld	r1,PACAEMERGSP(r13)
+	subi	r1,r1,STACK_FRAME_OVERHEAD
+
+	/* get vcpu pointer */
+	ld	r4, HSTATE_KVM_VCPU(r13)
+
+	/* We got here with an IPI; clear it */
+	ld	r5, HSTATE_XICS_PHYS(r13)
+	li	r0, 0xff
+	li	r6, XICS_QIRR
+	li	r7, XICS_XIRR
+	lwzcix	r8, r5, r7		/* ack the interrupt */
+	sync
+	stbcix	r0, r5, r6		/* clear it */
+	stwcix	r8, r5, r7		/* EOI it */
+
 .global kvmppc_hv_entry
 kvmppc_hv_entry:
 
@@ -200,7 +224,20 @@ kvmppc_hv_entry:
 	slbia
 	ptesync
 
-	/* Switch to guest partition. */
+	/* Increment entry count iff exit count is zero. */
+	ld	r5,HSTATE_KVM_VCORE(r13)
+	addi	r9,r5,VCORE_ENTRY_EXIT
+21:	lwarx	r3,0,r9
+	cmpwi	r3,0x100		/* any threads starting to exit? */
+	bge	secondary_too_late	/* if so we're too late to the party */
+	addi	r3,r3,1
+	stwcx.	r3,0,r9
+	bne	21b
+
+	/* Primary thread switches to guest partition. */
+	lwz	r6,VCPU_PTID(r4)
+	cmpwi	r6,0
+	bne	20f
 	ld	r9,VCPU_KVM(r4)		/* pointer to struct kvm */
 	ld	r6,KVM_SDR1(r9)
 	lwz	r7,KVM_LPID(r9)
@@ -210,7 +247,15 @@ kvmppc_hv_entry:
 	mtspr	SPRN_SDR1,r6		/* switch to partition page table */
 	mtspr	SPRN_LPID,r7
 	isync
-	ld	r8,VCPU_LPCR(r4)
+	li	r0,1
+	stb	r0,VCORE_IN_GUEST(r5)	/* signal secondaries to continue */
+	b	10f
+
+	/* Secondary threads wait for primary to have done partition switch */
+20:	lbz	r0,VCORE_IN_GUEST(r5)
+	cmpwi	r0,0
+	beq	20b
+10:	ld	r8,VCPU_LPCR(r4)
 	mtspr	SPRN_LPCR,r8
 	isync
 
@@ -225,10 +270,12 @@ kvmppc_hv_entry:
 	 * Invalidate the TLB if we could possibly have stale TLB
 	 * entries for this partition on this core due to the use
 	 * of tlbiel.
+	 * XXX maybe only need this on primary thread?
 	 */
 	ld	r9,VCPU_KVM(r4)		/* pointer to struct kvm */
 	lwz	r5,VCPU_VCPUID(r4)
 	lhz	r6,PACAPACAINDEX(r13)
+	rldimi	r6,r5,0,62		/* XXX map as if threads 1:1 p:v */
 	lhz	r8,VCPU_LAST_CPU(r4)
 	sldi	r7,r6,1			/* see if this is the same vcpu */
 	add	r7,r7,r9		/* as last ran on this pcpu */
@@ -512,8 +559,60 @@ hcall_real_cont:
 	ptesync
 
 hdec_soon:
-	/* Switch back to host partition */
+	/* Increment the threads-exiting-guest count in the 0xff00
+	   bits of vcore->entry_exit_count */
+	lwsync
+	ld	r5,HSTATE_KVM_VCORE(r13)
+	addi	r6,r5,VCORE_ENTRY_EXIT
+41:	lwarx	r3,0,r6
+	addi	r0,r3,0x100
+	stwcx.	r0,0,r6
+	bne	41b
+
+	/*
+	 * At this point we have an interrupt that we have to pass
+	 * up to the kernel or qemu; we can't handle it in real mode.
+	 * Thus we have to do a partition switch, so we have to
+	 * collect the other threads, if we are the first thread
+	 * to take an interrupt.  To do this, we set the HDEC to 0,
+	 * which causes an HDEC interrupt in all threads within 2ns
+	 * because the HDEC register is shared between all 4 threads.
+	 * However, we don't need to bother if this is an HDEC
+	 * interrupt, since the other threads will already be on their
+	 * way here in that case.
+	 */
+	cmpwi	r12,BOOK3S_INTERRUPT_HV_DECREMENTER
+	beq	40f
+	cmpwi	r3,0x100	/* Are we the first here? */
+	bge	40f
+	cmpwi	r3,1
+	ble	40f
+	li	r0,0
+	mtspr	SPRN_HDEC,r0
+40:
+
+	/* Secondary threads wait for primary to do partition switch */
 	ld	r4,VCPU_KVM(r9)		/* pointer to struct kvm */
+	ld	r5,HSTATE_KVM_VCORE(r13)
+	lwz	r3,VCPU_PTID(r9)
+	cmpwi	r3,0
+	beq	15f
+	HMT_LOW
+13:	lbz	r3,VCORE_IN_GUEST(r5)
+	cmpwi	r3,0
+	bne	13b
+	HMT_MEDIUM
+	b	16f
+
+	/* Primary thread waits for all the secondaries to exit guest */
+15:	lwz	r3,VCORE_ENTRY_EXIT(r5)
+	srwi	r0,r3,8
+	clrldi	r3,r3,56
+	cmpw	r3,r0
+	bne	15b
+	isync
+
+	/* Primary thread switches back to host partition */
 	ld	r6,KVM_HOST_SDR1(r4)
 	lwz	r7,KVM_HOST_LPID(r4)
 	li	r8,LPID_RSVD		/* switch to reserved LPID */
@@ -522,10 +621,12 @@ hdec_soon:
 	mtspr	SPRN_SDR1,r6		/* switch to partition page table */
 	mtspr	SPRN_LPID,r7
 	isync
+	li	r0,0
+	stb	r0,VCORE_IN_GUEST(r5)
 	lis	r8,0x7fff		/* MAX_INT@h */
 	mtspr	SPRN_HDEC,r8
 
-	ld	r8,KVM_HOST_LPCR(r4)
+16:	ld	r8,KVM_HOST_LPCR(r4)
 	mtspr	SPRN_LPCR,r8
 	isync
 
@@ -634,6 +735,11 @@ hdec_soon:
 	mr	r3, r9
 	bl	.kvmppc_save_fp
 
+	/* Secondary threads go off to take a nap */
+	lwz	r0,VCPU_PTID(r3)
+	cmpwi	r0,0
+	bne	secondary_nap
+
 	/*
 	 * Reload DEC.  HDEC interrupts were disabled when
 	 * we reloaded the host's LPCR value.
@@ -840,6 +946,56 @@ _GLOBAL(kvmppc_h_set_dabr)
 	li	r3,0
 	blr
 
+secondary_too_late:
+	ld	r5,HSTATE_KVM_VCORE(r13)
+	HMT_LOW
+13:	lbz	r3,VCORE_IN_GUEST(r5)
+	cmpwi	r3,0
+	bne	13b
+	HMT_MEDIUM
+	ld	r11,PACA_SLBSHADOWPTR(r13)
+
+	.rept	SLB_NUM_BOLTED
+	ld	r5,SLBSHADOW_SAVEAREA(r11)
+	ld	r6,SLBSHADOW_SAVEAREA+8(r11)
+	andis.	r7,r5,SLB_ESID_V@h
+	beq	1f
+	slbmte	r6,r5
+1:	addi	r11,r11,16
+	.endr
+	b	50f
+
+secondary_nap:
+	/* Clear any pending IPI */
+50:	ld	r5, HSTATE_XICS_PHYS(r13)
+	li	r0, 0xff
+	li	r6, XICS_QIRR
+	stbcix	r0, r5, r6
+
+	/* increment the nap count and then go to nap mode */
+	ld	r4, HSTATE_KVM_VCORE(r13)
+	addi	r4, r4, VCORE_NAP_COUNT
+	lwsync				/* make previous updates visible */
+51:	lwarx	r3, 0, r4
+	addi	r3, r3, 1
+	stwcx.	r3, 0, r4
+	bne	51b
+	isync
+
+	mfspr	r4, SPRN_LPCR
+	li	r0, LPCR_PECE
+	andc	r4, r4, r0
+	ori	r4, r4, LPCR_PECE0	/* exit nap on interrupt */
+	mtspr	SPRN_LPCR, r4
+	li	r0, 0
+	std	r0, HSTATE_SCRATCH0(r13)
+	ptesync
+	ld	r0, HSTATE_SCRATCH0(r13)
+1:	cmpd	r0, r0
+	bne	1b
+	nap
+	b	.
+
 /*
  * Save away FP, VMX and VSX registers.
  * r3 = vcpu pointer
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index c78ceb9d5605..4c549664c987 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -30,6 +30,7 @@
 #include <asm/uaccess.h>
 #include <asm/kvm_ppc.h>
 #include <asm/tlbflush.h>
+#include <asm/cputhreads.h>
 #include "timing.h"
 #include "../mm/mmu_decl.h"
 
@@ -207,6 +208,9 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_SPAPR_TCE:
 		r = 1;
 		break;
+	case KVM_CAP_PPC_SMT:
+		r = threads_per_core;
+		break;
 #endif
 	default:
 		r = 0;
diff --git a/arch/powerpc/sysdev/xics/icp-native.c b/arch/powerpc/sysdev/xics/icp-native.c
index 1f15ad436140..ba382b59b926 100644
--- a/arch/powerpc/sysdev/xics/icp-native.c
+++ b/arch/powerpc/sysdev/xics/icp-native.c
@@ -17,6 +17,7 @@
 #include <linux/cpu.h>
 #include <linux/of.h>
 #include <linux/spinlock.h>
+#include <linux/module.h>
 
 #include <asm/prom.h>
 #include <asm/io.h>
@@ -24,6 +25,7 @@
 #include <asm/irq.h>
 #include <asm/errno.h>
 #include <asm/xics.h>
+#include <asm/kvm_ppc.h>
 
 struct icp_ipl {
 	union {
@@ -139,6 +141,12 @@ static void icp_native_cause_ipi(int cpu, unsigned long data)
 	icp_native_set_qirr(cpu, IPI_PRIORITY);
 }
 
+void xics_wake_cpu(int cpu)
+{
+	icp_native_set_qirr(cpu, IPI_PRIORITY);
+}
+EXPORT_SYMBOL_GPL(xics_wake_cpu);
+
 static irqreturn_t icp_native_ipi_action(int irq, void *dev_id)
 {
 	int cpu = smp_processor_id();
@@ -185,6 +193,7 @@ static int __init icp_native_map_one_cpu(int hw_id, unsigned long addr,
 	}
 
 	icp_native_regs[cpu] = ioremap(addr, size);
+	kvmppc_set_xics_phys(cpu, addr);
 	if (!icp_native_regs[cpu]) {
 		pr_warning("icp_native: Failed ioremap for CPU %d, "
 			   "interrupt server #0x%x, addr %#lx\n",
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 61f56502732e..e2a378d97160 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -551,6 +551,7 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_GET_TSC_KHZ 61
 #define KVM_CAP_PPC_BOOKE_SREGS 62
 #define KVM_CAP_SPAPR_TCE 63
+#define KVM_CAP_PPC_SMT 64
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
cgit v1.2.3


From aa04b4cc5be64b4fb9ef4e0fdf2418e2f4737fb2 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 29 Jun 2011 00:25:44 +0000
Subject: KVM: PPC: Allocate RMAs (Real Mode Areas) at boot for use by guests

This adds infrastructure which will be needed to allow book3s_hv KVM to
run on older POWER processors, including PPC970, which don't support
the Virtual Real Mode Area (VRMA) facility, but only the Real Mode
Offset (RMO) facility.  These processors require a physically
contiguous, aligned area of memory for each guest.  When the guest does
an access in real mode (MMU off), the address is compared against a
limit value, and if it is lower, the address is ORed with an offset
value (from the Real Mode Offset Register (RMOR)) and the result becomes
the real address for the access.  The size of the RMA has to be one of
a set of supported values, which usually includes 64MB, 128MB, 256MB
and some larger powers of 2.

Since we are unlikely to be able to allocate 64MB or more of physically
contiguous memory after the kernel has been running for a while, we
allocate a pool of RMAs at boot time using the bootmem allocator.  The
size and number of the RMAs can be set using the kvm_rma_size=xx and
kvm_rma_count=xx kernel command line options.

KVM exports a new capability, KVM_CAP_PPC_RMA, to signal the availability
of the pool of preallocated RMAs.  The capability value is 1 if the
processor can use an RMA but doesn't require one (because it supports
the VRMA facility), or 2 if the processor requires an RMA for each guest.

This adds a new ioctl, KVM_ALLOCATE_RMA, which allocates an RMA from the
pool and returns a file descriptor which can be used to map the RMA.  It
also returns the size of the RMA in the argument structure.

Having an RMA means we will get multiple KMV_SET_USER_MEMORY_REGION
ioctl calls from userspace.  To cope with this, we now preallocate the
kvm->arch.ram_pginfo array when the VM is created with a size sufficient
for up to 64GB of guest memory.  Subsequently we will get rid of this
array and use memory associated with each memslot instead.

This moves most of the code that translates the user addresses into
host pfns (page frame numbers) out of kvmppc_prepare_vrma up one level
to kvmppc_core_prepare_memory_region.  Also, instead of having to look
up the VMA for each page in order to check the page size, we now check
that the pages we get are compound pages of 16MB.  However, if we are
adding memory that is mapped to an RMA, we don't bother with calling
get_user_pages_fast and instead just offset from the base pfn for the
RMA.

Typically the RMA gets added after vcpus are created, which makes it
inconvenient to have the LPCR (logical partition control register) value
in the vcpu->arch struct, since the LPCR controls whether the processor
uses RMA or VRMA for the guest.  This moves the LPCR value into the
kvm->arch struct and arranges for the MER (mediated external request)
bit, which is the only bit that varies between vcpus, to be set in
assembly code when going into the guest if there is a pending external
interrupt request.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 Documentation/virtual/kvm/api.txt       |  32 ++++
 arch/powerpc/include/asm/kvm.h          |   5 +
 arch/powerpc/include/asm/kvm_book3s.h   |   8 -
 arch/powerpc/include/asm/kvm_host.h     |  15 +-
 arch/powerpc/include/asm/kvm_ppc.h      |  10 ++
 arch/powerpc/include/asm/reg.h          |   1 +
 arch/powerpc/kernel/asm-offsets.c       |   4 +-
 arch/powerpc/kernel/setup_64.c          |   3 +
 arch/powerpc/kvm/Makefile               |   3 +-
 arch/powerpc/kvm/book3s_64_mmu_hv.c     |  97 +-----------
 arch/powerpc/kvm/book3s_hv.c            | 259 ++++++++++++++++++++++++++++++--
 arch/powerpc/kvm/book3s_hv_builtin.c    | 152 +++++++++++++++++++
 arch/powerpc/kvm/book3s_hv_rmhandlers.S |  19 ++-
 arch/powerpc/kvm/powerpc.c              |  13 ++
 include/linux/kvm.h                     |   3 +
 15 files changed, 505 insertions(+), 119 deletions(-)
 create mode 100644 arch/powerpc/kvm/book3s_hv_builtin.c

(limited to 'include/linux')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 681871311d3e..b0e4b9cd6a66 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1398,6 +1398,38 @@ the entries written by kernel-handled H_PUT_TCE calls, and also lets
 userspace update the TCE table directly which is useful in some
 circumstances.
 
+4.63 KVM_ALLOCATE_RMA
+
+Capability: KVM_CAP_PPC_RMA
+Architectures: powerpc
+Type: vm ioctl
+Parameters: struct kvm_allocate_rma (out)
+Returns: file descriptor for mapping the allocated RMA
+
+This allocates a Real Mode Area (RMA) from the pool allocated at boot
+time by the kernel.  An RMA is a physically-contiguous, aligned region
+of memory used on older POWER processors to provide the memory which
+will be accessed by real-mode (MMU off) accesses in a KVM guest.
+POWER processors support a set of sizes for the RMA that usually
+includes 64MB, 128MB, 256MB and some larger powers of two.
+
+/* for KVM_ALLOCATE_RMA */
+struct kvm_allocate_rma {
+	__u64 rma_size;
+};
+
+The return value is a file descriptor which can be passed to mmap(2)
+to map the allocated RMA into userspace.  The mapped area can then be
+passed to the KVM_SET_USER_MEMORY_REGION ioctl to establish it as the
+RMA for a virtual machine.  The size of the RMA in bytes (which is
+fixed at host kernel boot time) is returned in the rma_size field of
+the argument structure.
+
+The KVM_CAP_PPC_RMA capability is 1 or 2 if the KVM_ALLOCATE_RMA ioctl
+is supported; 2 if the processor requires all virtual machines to have
+an RMA, or 1 if the processor can use an RMA but doesn't require it,
+because it supports the Virtual RMA (VRMA) facility.
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h
index 471bb3d85e0b..a4f6c85431f8 100644
--- a/arch/powerpc/include/asm/kvm.h
+++ b/arch/powerpc/include/asm/kvm.h
@@ -282,4 +282,9 @@ struct kvm_create_spapr_tce {
 	__u32 window_size;
 };
 
+/* for KVM_ALLOCATE_RMA */
+struct kvm_allocate_rma {
+	__u64 rma_size;
+};
+
 #endif /* __LINUX_KVM_POWERPC_H */
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 5537c45d626c..3f91ebd4ae43 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -298,14 +298,6 @@ static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu)
 static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
 			unsigned long pending_now, unsigned long old_pending)
 {
-	/* Recalculate LPCR:MER based on the presence of
-	 * a pending external interrupt
-	 */
-	if (test_bit(BOOK3S_IRQPRIO_EXTERNAL, &pending_now) ||
-	    test_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, &pending_now))
-		vcpu->arch.lpcr |= LPCR_MER;
-	else
-		vcpu->arch.lpcr &= ~((u64)LPCR_MER);
 }
 
 static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 0d6d569e19c7..f572d9cc31bd 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -28,6 +28,8 @@
 #include <linux/threads.h>
 #include <linux/spinlock.h>
 #include <linux/kvm_para.h>
+#include <linux/list.h>
+#include <linux/atomic.h>
 #include <asm/kvm_asm.h>
 #include <asm/processor.h>
 
@@ -156,6 +158,14 @@ struct kvmppc_spapr_tce_table {
 	struct page *pages[0];
 };
 
+struct kvmppc_rma_info {
+	void		*base_virt;
+	unsigned long	 base_pfn;
+	unsigned long	 npages;
+	struct list_head list;
+	atomic_t 	 use_count;
+};
+
 struct kvm_arch {
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 	unsigned long hpt_virt;
@@ -169,6 +179,10 @@ struct kvm_arch {
 	unsigned long sdr1;
 	unsigned long host_sdr1;
 	int tlbie_lock;
+	int n_rma_pages;
+	unsigned long lpcr;
+	unsigned long rmor;
+	struct kvmppc_rma_info *rma;
 	struct list_head spapr_tce_tables;
 	unsigned short last_vcpu[NR_CPUS];
 	struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
@@ -295,7 +309,6 @@ struct kvm_vcpu_arch {
 	ulong guest_owned_ext;
 	ulong purr;
 	ulong spurr;
-	ulong lpcr;
 	ulong dscr;
 	ulong amr;
 	ulong uamor;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 6ef734428634..d121f49d62b8 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -124,6 +124,10 @@ extern void kvmppc_map_vrma(struct kvm *kvm,
 extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
 extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 				struct kvm_create_spapr_tce *args);
+extern long kvm_vm_ioctl_allocate_rma(struct kvm *kvm,
+				struct kvm_allocate_rma *rma);
+extern struct kvmppc_rma_info *kvm_alloc_rma(void);
+extern void kvm_release_rma(struct kvmppc_rma_info *ri);
 extern int kvmppc_core_init_vm(struct kvm *kvm);
 extern void kvmppc_core_destroy_vm(struct kvm *kvm);
 extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
@@ -177,9 +181,15 @@ static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
 {
 	paca[cpu].kvm_hstate.xics_phys = addr;
 }
+
+extern void kvm_rma_init(void);
+
 #else
 static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
 {}
+
+static inline void kvm_rma_init(void)
+{}
 #endif
 
 #endif /* __POWERPC_KVM_PPC_H__ */
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 36a611b398c5..20a053c14270 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -242,6 +242,7 @@
 #define   LPCR_VRMA_LP1	(1ul << (63-16))
 #define   LPCR_VRMASD_SH (63-16)
 #define   LPCR_RMLS    0x1C000000      /* impl dependent rmo limit sel */
+#define	  LPCR_RMLS_SH	(63-37)
 #define   LPCR_ILE     0x02000000      /* !HV irqs set MSR:LE */
 #define   LPCR_PECE	0x00007000	/* powersave exit cause enable */
 #define     LPCR_PECE0	0x00004000	/* ext. exceptions can cause exit */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index d0f2387fd792..f4aba938166b 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -437,6 +437,8 @@ int main(void)
 	DEFINE(KVM_TLBIE_LOCK, offsetof(struct kvm, arch.tlbie_lock));
 	DEFINE(KVM_ONLINE_CPUS, offsetof(struct kvm, online_vcpus.counter));
 	DEFINE(KVM_LAST_VCPU, offsetof(struct kvm, arch.last_vcpu));
+	DEFINE(KVM_LPCR, offsetof(struct kvm, arch.lpcr));
+	DEFINE(KVM_RMOR, offsetof(struct kvm, arch.rmor));
 	DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr));
 	DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar));
 #endif
@@ -459,7 +461,7 @@ int main(void)
 	DEFINE(VCPU_HFLAGS, offsetof(struct kvm_vcpu, arch.hflags));
 	DEFINE(VCPU_DEC, offsetof(struct kvm_vcpu, arch.dec));
 	DEFINE(VCPU_DEC_EXPIRES, offsetof(struct kvm_vcpu, arch.dec_expires));
-	DEFINE(VCPU_LPCR, offsetof(struct kvm_vcpu, arch.lpcr));
+	DEFINE(VCPU_PENDING_EXC, offsetof(struct kvm_vcpu, arch.pending_exceptions));
 	DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa));
 	DEFINE(VCPU_MMCR, offsetof(struct kvm_vcpu, arch.mmcr));
 	DEFINE(VCPU_PMC, offsetof(struct kvm_vcpu, arch.pmc));
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index a88bf2713d41..532054f24ecb 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -63,6 +63,7 @@
 #include <asm/kexec.h>
 #include <asm/mmu_context.h>
 #include <asm/code-patching.h>
+#include <asm/kvm_ppc.h>
 
 #include "setup.h"
 
@@ -580,6 +581,8 @@ void __init setup_arch(char **cmdline_p)
 	/* Initialize the MMU context management stuff */
 	mmu_context_init();
 
+	kvm_rma_init();
+
 	ppc64_boot_msg(0x15, "Setup Done");
 }
 
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 1de3d54901d4..08428e2c188d 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -56,7 +56,8 @@ kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
 	book3s_64_mmu_hv.o
 kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
 	book3s_hv_rm_mmu.o \
-	book3s_64_vio_hv.o
+	book3s_64_vio_hv.o \
+	book3s_hv_builtin.o
 
 kvm-book3s_64-module-objs := \
 	../../../virt/kvm/kvm_main.o \
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 4a4fbec61a17..96ba96a16abf 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -79,103 +79,8 @@ long kvmppc_alloc_hpt(struct kvm *kvm)
 
 void kvmppc_free_hpt(struct kvm *kvm)
 {
-	unsigned long i;
-	struct kvmppc_pginfo *pginfo;
-
 	clear_bit(kvm->arch.lpid, lpid_inuse);
 	free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT);
-
-	if (kvm->arch.ram_pginfo) {
-		pginfo = kvm->arch.ram_pginfo;
-		kvm->arch.ram_pginfo = NULL;
-		for (i = 0; i < kvm->arch.ram_npages; ++i)
-			put_page(pfn_to_page(pginfo[i].pfn));
-		kfree(pginfo);
-	}
-}
-
-static unsigned long user_page_size(unsigned long addr)
-{
-	struct vm_area_struct *vma;
-	unsigned long size = PAGE_SIZE;
-
-	down_read(&current->mm->mmap_sem);
-	vma = find_vma(current->mm, addr);
-	if (vma)
-		size = vma_kernel_pagesize(vma);
-	up_read(&current->mm->mmap_sem);
-	return size;
-}
-
-static pfn_t hva_to_pfn(unsigned long addr)
-{
-	struct page *page[1];
-	int npages;
-
-	might_sleep();
-
-	npages = get_user_pages_fast(addr, 1, 1, page);
-
-	if (unlikely(npages != 1))
-		return 0;
-
-	return page_to_pfn(page[0]);
-}
-
-long kvmppc_prepare_vrma(struct kvm *kvm,
-			 struct kvm_userspace_memory_region *mem)
-{
-	unsigned long psize, porder;
-	unsigned long i, npages;
-	struct kvmppc_pginfo *pginfo;
-	pfn_t pfn;
-	unsigned long hva;
-
-	/* First see what page size we have */
-	psize = user_page_size(mem->userspace_addr);
-	/* For now, only allow 16MB pages */
-	if (psize != 1ul << VRMA_PAGE_ORDER || (mem->memory_size & (psize - 1))) {
-		pr_err("bad psize=%lx memory_size=%llx @ %llx\n",
-		       psize, mem->memory_size, mem->userspace_addr);
-		return -EINVAL;
-	}
-	porder = __ilog2(psize);
-
-	npages = mem->memory_size >> porder;
-	pginfo = kzalloc(npages * sizeof(struct kvmppc_pginfo), GFP_KERNEL);
-	if (!pginfo) {
-		pr_err("kvmppc_prepare_vrma: couldn't alloc %lu bytes\n",
-		       npages * sizeof(struct kvmppc_pginfo));
-		return -ENOMEM;
-	}
-
-	for (i = 0; i < npages; ++i) {
-		hva = mem->userspace_addr + (i << porder);
-		if (user_page_size(hva) != psize)
-			goto err;
-		pfn = hva_to_pfn(hva);
-		if (pfn == 0) {
-			pr_err("oops, no pfn for hva %lx\n", hva);
-			goto err;
-		}
-		if (pfn & ((1ul << (porder - PAGE_SHIFT)) - 1)) {
-			pr_err("oops, unaligned pfn %llx\n", pfn);
-			put_page(pfn_to_page(pfn));
-			goto err;
-		}
-		pginfo[i].pfn = pfn;
-	}
-
-	kvm->arch.ram_npages = npages;
-	kvm->arch.ram_psize = psize;
-	kvm->arch.ram_porder = porder;
-	kvm->arch.ram_pginfo = pginfo;
-
-	return 0;
-
- err:
-	kfree(pginfo);
-	return -EINVAL;
 }
 
 void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
@@ -199,6 +104,8 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
 
 	for (i = 0; i < npages; ++i) {
 		pfn = pginfo[i].pfn;
+		if (!pfn)
+			break;
 		/* can't use hpt_hash since va > 64 bits */
 		hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK;
 		/*
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 36b6d98f1197..04da135cae61 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -27,6 +27,8 @@
 #include <linux/fs.h>
 #include <linux/anon_inodes.h>
 #include <linux/cpumask.h>
+#include <linux/spinlock.h>
+#include <linux/page-flags.h>
 
 #include <asm/reg.h>
 #include <asm/cputable.h>
@@ -40,11 +42,22 @@
 #include <asm/lppaca.h>
 #include <asm/processor.h>
 #include <asm/cputhreads.h>
+#include <asm/page.h>
 #include <linux/gfp.h>
 #include <linux/sched.h>
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
 
+/*
+ * For now, limit memory to 64GB and require it to be large pages.
+ * This value is chosen because it makes the ram_pginfo array be
+ * 64kB in size, which is about as large as we want to be trying
+ * to allocate with kmalloc.
+ */
+#define MAX_MEM_ORDER		36
+
+#define LARGE_PAGE_ORDER	24	/* 16MB pages */
+
 /* #define EXIT_DEBUG */
 /* #define EXIT_DEBUG_SIMPLE */
 /* #define EXIT_DEBUG_INT */
@@ -129,7 +142,7 @@ void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
 		pr_err("  ESID = %.16llx VSID = %.16llx\n",
 		       vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv);
 	pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.8x\n",
-	       vcpu->arch.lpcr, vcpu->kvm->arch.sdr1,
+	       vcpu->kvm->arch.lpcr, vcpu->kvm->arch.sdr1,
 	       vcpu->arch.last_inst);
 }
 
@@ -441,7 +454,6 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 	int err = -EINVAL;
 	int core;
 	struct kvmppc_vcore *vcore;
-	unsigned long lpcr;
 
 	core = id / threads_per_core;
 	if (core >= KVM_MAX_VCORES)
@@ -464,10 +476,6 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 	vcpu->arch.pvr = mfspr(SPRN_PVR);
 	kvmppc_set_pvr(vcpu, vcpu->arch.pvr);
 
-	lpcr = kvm->arch.host_lpcr & (LPCR_PECE | LPCR_LPES);
-	lpcr |= LPCR_VPM0 | LPCR_VRMA_L | (4UL << LPCR_DPFD_SH) | LPCR_HDICE;
-	vcpu->arch.lpcr = lpcr;
-
 	kvmppc_mmu_book3s_hv_init(vcpu);
 
 	/*
@@ -910,24 +918,216 @@ fail:
 	return ret;
 }
 
+/* Work out RMLS (real mode limit selector) field value for a given RMA size.
+   Assumes POWER7. */
+static inline int lpcr_rmls(unsigned long rma_size)
+{
+	switch (rma_size) {
+	case 32ul << 20:	/* 32 MB */
+		return 8;
+	case 64ul << 20:	/* 64 MB */
+		return 3;
+	case 128ul << 20:	/* 128 MB */
+		return 7;
+	case 256ul << 20:	/* 256 MB */
+		return 4;
+	case 1ul << 30:		/* 1 GB */
+		return 2;
+	case 16ul << 30:	/* 16 GB */
+		return 1;
+	case 256ul << 30:	/* 256 GB */
+		return 0;
+	default:
+		return -1;
+	}
+}
+
+static int kvm_rma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct kvmppc_rma_info *ri = vma->vm_file->private_data;
+	struct page *page;
+
+	if (vmf->pgoff >= ri->npages)
+		return VM_FAULT_SIGBUS;
+
+	page = pfn_to_page(ri->base_pfn + vmf->pgoff);
+	get_page(page);
+	vmf->page = page;
+	return 0;
+}
+
+static const struct vm_operations_struct kvm_rma_vm_ops = {
+	.fault = kvm_rma_fault,
+};
+
+static int kvm_rma_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	vma->vm_flags |= VM_RESERVED;
+	vma->vm_ops = &kvm_rma_vm_ops;
+	return 0;
+}
+
+static int kvm_rma_release(struct inode *inode, struct file *filp)
+{
+	struct kvmppc_rma_info *ri = filp->private_data;
+
+	kvm_release_rma(ri);
+	return 0;
+}
+
+static struct file_operations kvm_rma_fops = {
+	.mmap           = kvm_rma_mmap,
+	.release	= kvm_rma_release,
+};
+
+long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret)
+{
+	struct kvmppc_rma_info *ri;
+	long fd;
+
+	ri = kvm_alloc_rma();
+	if (!ri)
+		return -ENOMEM;
+
+	fd = anon_inode_getfd("kvm-rma", &kvm_rma_fops, ri, O_RDWR);
+	if (fd < 0)
+		kvm_release_rma(ri);
+
+	ret->rma_size = ri->npages << PAGE_SHIFT;
+	return fd;
+}
+
+static struct page *hva_to_page(unsigned long addr)
+{
+	struct page *page[1];
+	int npages;
+
+	might_sleep();
+
+	npages = get_user_pages_fast(addr, 1, 1, page);
+
+	if (unlikely(npages != 1))
+		return 0;
+
+	return page[0];
+}
+
 int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem)
 {
-	if (mem->guest_phys_addr == 0 && mem->memory_size != 0)
-		return kvmppc_prepare_vrma(kvm, mem);
+	unsigned long psize, porder;
+	unsigned long i, npages, totalpages;
+	unsigned long pg_ix;
+	struct kvmppc_pginfo *pginfo;
+	unsigned long hva;
+	struct kvmppc_rma_info *ri = NULL;
+	struct page *page;
+
+	/* For now, only allow 16MB pages */
+	porder = LARGE_PAGE_ORDER;
+	psize = 1ul << porder;
+	if ((mem->memory_size & (psize - 1)) ||
+	    (mem->guest_phys_addr & (psize - 1))) {
+		pr_err("bad memory_size=%llx @ %llx\n",
+		       mem->memory_size, mem->guest_phys_addr);
+		return -EINVAL;
+	}
+
+	npages = mem->memory_size >> porder;
+	totalpages = (mem->guest_phys_addr + mem->memory_size) >> porder;
+
+	/* More memory than we have space to track? */
+	if (totalpages > (1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER)))
+		return -EINVAL;
+
+	/* Do we already have an RMA registered? */
+	if (mem->guest_phys_addr == 0 && kvm->arch.rma)
+		return -EINVAL;
+
+	if (totalpages > kvm->arch.ram_npages)
+		kvm->arch.ram_npages = totalpages;
+
+	/* Is this one of our preallocated RMAs? */
+	if (mem->guest_phys_addr == 0) {
+		struct vm_area_struct *vma;
+
+		down_read(&current->mm->mmap_sem);
+		vma = find_vma(current->mm, mem->userspace_addr);
+		if (vma && vma->vm_file &&
+		    vma->vm_file->f_op == &kvm_rma_fops &&
+		    mem->userspace_addr == vma->vm_start)
+			ri = vma->vm_file->private_data;
+		up_read(&current->mm->mmap_sem);
+	}
+
+	if (ri) {
+		unsigned long rma_size;
+		unsigned long lpcr;
+		long rmls;
+
+		rma_size = ri->npages << PAGE_SHIFT;
+		if (rma_size > mem->memory_size)
+			rma_size = mem->memory_size;
+		rmls = lpcr_rmls(rma_size);
+		if (rmls < 0) {
+			pr_err("Can't use RMA of 0x%lx bytes\n", rma_size);
+			return -EINVAL;
+		}
+		atomic_inc(&ri->use_count);
+		kvm->arch.rma = ri;
+		kvm->arch.n_rma_pages = rma_size >> porder;
+		lpcr = kvm->arch.lpcr & ~(LPCR_VPM0 | LPCR_VRMA_L);
+		lpcr |= rmls << LPCR_RMLS_SH;
+		kvm->arch.lpcr = lpcr;
+		kvm->arch.rmor = kvm->arch.rma->base_pfn << PAGE_SHIFT;
+		pr_info("Using RMO at %lx size %lx (LPCR = %lx)\n",
+			ri->base_pfn << PAGE_SHIFT, rma_size, lpcr);
+	}
+
+	pg_ix = mem->guest_phys_addr >> porder;
+	pginfo = kvm->arch.ram_pginfo + pg_ix;
+	for (i = 0; i < npages; ++i, ++pg_ix) {
+		if (ri && pg_ix < kvm->arch.n_rma_pages) {
+			pginfo[i].pfn = ri->base_pfn +
+				(pg_ix << (porder - PAGE_SHIFT));
+			continue;
+		}
+		hva = mem->userspace_addr + (i << porder);
+		page = hva_to_page(hva);
+		if (!page) {
+			pr_err("oops, no pfn for hva %lx\n", hva);
+			goto err;
+		}
+		/* Check it's a 16MB page */
+		if (!PageHead(page) ||
+		    compound_order(page) != (LARGE_PAGE_ORDER - PAGE_SHIFT)) {
+			pr_err("page at %lx isn't 16MB (o=%d)\n",
+			       hva, compound_order(page));
+			goto err;
+		}
+		pginfo[i].pfn = page_to_pfn(page);
+	}
+
 	return 0;
+
+ err:
+	return -EINVAL;
 }
 
 void kvmppc_core_commit_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem)
 {
-	if (mem->guest_phys_addr == 0 && mem->memory_size != 0)
+	if (mem->guest_phys_addr == 0 && mem->memory_size != 0 &&
+	    !kvm->arch.rma)
 		kvmppc_map_vrma(kvm, mem);
 }
 
 int kvmppc_core_init_vm(struct kvm *kvm)
 {
 	long r;
+	unsigned long npages = 1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER);
+	long err = -ENOMEM;
+	unsigned long lpcr;
 
 	/* Allocate hashed page table */
 	r = kvmppc_alloc_hpt(kvm);
@@ -935,11 +1135,52 @@ int kvmppc_core_init_vm(struct kvm *kvm)
 		return r;
 
 	INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
+
+	kvm->arch.ram_pginfo = kzalloc(npages * sizeof(struct kvmppc_pginfo),
+				       GFP_KERNEL);
+	if (!kvm->arch.ram_pginfo) {
+		pr_err("kvmppc_core_init_vm: couldn't alloc %lu bytes\n",
+		       npages * sizeof(struct kvmppc_pginfo));
+		goto out_free;
+	}
+
+	kvm->arch.ram_npages = 0;
+	kvm->arch.ram_psize = 1ul << LARGE_PAGE_ORDER;
+	kvm->arch.ram_porder = LARGE_PAGE_ORDER;
+	kvm->arch.rma = NULL;
+	kvm->arch.n_rma_pages = 0;
+
+	lpcr = kvm->arch.host_lpcr & (LPCR_PECE | LPCR_LPES);
+	lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
+		LPCR_VPM0 | LPCR_VRMA_L;
+	kvm->arch.lpcr = lpcr;
+
+
 	return 0;
+
+ out_free:
+	kvmppc_free_hpt(kvm);
+	return err;
 }
 
 void kvmppc_core_destroy_vm(struct kvm *kvm)
 {
+	struct kvmppc_pginfo *pginfo;
+	unsigned long i;
+
+	if (kvm->arch.ram_pginfo) {
+		pginfo = kvm->arch.ram_pginfo;
+		kvm->arch.ram_pginfo = NULL;
+		for (i = kvm->arch.n_rma_pages; i < kvm->arch.ram_npages; ++i)
+			if (pginfo[i].pfn)
+				put_page(pfn_to_page(pginfo[i].pfn));
+		kfree(pginfo);
+	}
+	if (kvm->arch.rma) {
+		kvm_release_rma(kvm->arch.rma);
+		kvm->arch.rma = NULL;
+	}
+
 	kvmppc_free_hpt(kvm);
 	WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));
 }
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
new file mode 100644
index 000000000000..736df3cbbc55
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -0,0 +1,152 @@
+/*
+ * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/preempt.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/bootmem.h>
+#include <linux/init.h>
+
+#include <asm/cputable.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+
+/*
+ * This maintains a list of RMAs (real mode areas) for KVM guests to use.
+ * Each RMA has to be physically contiguous and of a size that the
+ * hardware supports.  PPC970 and POWER7 support 64MB, 128MB and 256MB,
+ * and other larger sizes.  Since we are unlikely to be allocate that
+ * much physically contiguous memory after the system is up and running,
+ * we preallocate a set of RMAs in early boot for KVM to use.
+ */
+static unsigned long kvm_rma_size = 64 << 20;	/* 64MB */
+static unsigned long kvm_rma_count;
+
+static int __init early_parse_rma_size(char *p)
+{
+	if (!p)
+		return 1;
+
+	kvm_rma_size = memparse(p, &p);
+
+	return 0;
+}
+early_param("kvm_rma_size", early_parse_rma_size);
+
+static int __init early_parse_rma_count(char *p)
+{
+	if (!p)
+		return 1;
+
+	kvm_rma_count = simple_strtoul(p, NULL, 0);
+
+	return 0;
+}
+early_param("kvm_rma_count", early_parse_rma_count);
+
+static struct kvmppc_rma_info *rma_info;
+static LIST_HEAD(free_rmas);
+static DEFINE_SPINLOCK(rma_lock);
+
+/* Work out RMLS (real mode limit selector) field value for a given RMA size.
+   Assumes POWER7. */
+static inline int lpcr_rmls(unsigned long rma_size)
+{
+	switch (rma_size) {
+	case 32ul << 20:	/* 32 MB */
+		return 8;
+	case 64ul << 20:	/* 64 MB */
+		return 3;
+	case 128ul << 20:	/* 128 MB */
+		return 7;
+	case 256ul << 20:	/* 256 MB */
+		return 4;
+	case 1ul << 30:		/* 1 GB */
+		return 2;
+	case 16ul << 30:	/* 16 GB */
+		return 1;
+	case 256ul << 30:	/* 256 GB */
+		return 0;
+	default:
+		return -1;
+	}
+}
+
+/*
+ * Called at boot time while the bootmem allocator is active,
+ * to allocate contiguous physical memory for the real memory
+ * areas for guests.
+ */
+void kvm_rma_init(void)
+{
+	unsigned long i;
+	unsigned long j, npages;
+	void *rma;
+	struct page *pg;
+
+	/* Only do this on POWER7 in HV mode */
+	if (!cpu_has_feature(CPU_FTR_HVMODE_206))
+		return;
+
+	if (!kvm_rma_size || !kvm_rma_count)
+		return;
+
+	/* Check that the requested size is one supported in hardware */
+	if (lpcr_rmls(kvm_rma_size) < 0) {
+		pr_err("RMA size of 0x%lx not supported\n", kvm_rma_size);
+		return;
+	}
+
+	npages = kvm_rma_size >> PAGE_SHIFT;
+	rma_info = alloc_bootmem(kvm_rma_count * sizeof(struct kvmppc_rma_info));
+	for (i = 0; i < kvm_rma_count; ++i) {
+		rma = alloc_bootmem_align(kvm_rma_size, kvm_rma_size);
+		pr_info("Allocated KVM RMA at %p (%ld MB)\n", rma,
+			kvm_rma_size >> 20);
+		rma_info[i].base_virt = rma;
+		rma_info[i].base_pfn = __pa(rma) >> PAGE_SHIFT;
+		rma_info[i].npages = npages;
+		list_add_tail(&rma_info[i].list, &free_rmas);
+		atomic_set(&rma_info[i].use_count, 0);
+
+		pg = pfn_to_page(rma_info[i].base_pfn);
+		for (j = 0; j < npages; ++j) {
+			atomic_inc(&pg->_count);
+			++pg;
+		}
+	}
+}
+
+struct kvmppc_rma_info *kvm_alloc_rma(void)
+{
+	struct kvmppc_rma_info *ri;
+
+	ri = NULL;
+	spin_lock(&rma_lock);
+	if (!list_empty(&free_rmas)) {
+		ri = list_first_entry(&free_rmas, struct kvmppc_rma_info, list);
+		list_del(&ri->list);
+		atomic_inc(&ri->use_count);
+	}
+	spin_unlock(&rma_lock);
+	return ri;
+}
+EXPORT_SYMBOL_GPL(kvm_alloc_rma);
+
+void kvm_release_rma(struct kvmppc_rma_info *ri)
+{
+	if (atomic_dec_and_test(&ri->use_count)) {
+		spin_lock(&rma_lock);
+		list_add_tail(&ri->list, &free_rmas);
+		spin_unlock(&rma_lock);
+
+	}
+}
+EXPORT_SYMBOL_GPL(kvm_release_rma);
+
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index c9bf177b7cf2..9ee223c35285 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -235,10 +235,10 @@ kvmppc_hv_entry:
 	bne	21b
 
 	/* Primary thread switches to guest partition. */
+	ld	r9,VCPU_KVM(r4)		/* pointer to struct kvm */
 	lwz	r6,VCPU_PTID(r4)
 	cmpwi	r6,0
 	bne	20f
-	ld	r9,VCPU_KVM(r4)		/* pointer to struct kvm */
 	ld	r6,KVM_SDR1(r9)
 	lwz	r7,KVM_LPID(r9)
 	li	r0,LPID_RSVD		/* switch to reserved LPID */
@@ -255,8 +255,18 @@ kvmppc_hv_entry:
 20:	lbz	r0,VCORE_IN_GUEST(r5)
 	cmpwi	r0,0
 	beq	20b
-10:	ld	r8,VCPU_LPCR(r4)
-	mtspr	SPRN_LPCR,r8
+
+	/* Set LPCR.  Set the MER bit if there is a pending external irq. */
+10:	ld	r8,KVM_LPCR(r9)
+	ld	r0,VCPU_PENDING_EXC(r4)
+	li	r7,(1 << BOOK3S_IRQPRIO_EXTERNAL)
+	oris	r7,r7,(1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h
+	and.	r0,r0,r7
+	beq	11f
+	ori	r8,r8,LPCR_MER
+11:	mtspr	SPRN_LPCR,r8
+	ld	r8,KVM_RMOR(r9)
+	mtspr	SPRN_RMOR,r8
 	isync
 
 	/* Check if HDEC expires soon */
@@ -464,7 +474,8 @@ hcall_real_cont:
 	/* Check for mediated interrupts (could be done earlier really ...) */
 	cmpwi	r12,BOOK3S_INTERRUPT_EXTERNAL
 	bne+	1f
-	ld	r5,VCPU_LPCR(r9)
+	ld	r5,VCPU_KVM(r9)
+	ld	r5,KVM_LPCR(r5)
 	andi.	r0,r11,MSR_EE
 	beq	1f
 	andi.	r0,r5,LPCR_MER
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 4c549664c987..72c506505fa4 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -211,6 +211,9 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_PPC_SMT:
 		r = threads_per_core;
 		break;
+	case KVM_CAP_PPC_RMA:
+		r = 1;
+		break;
 #endif
 	default:
 		r = 0;
@@ -673,6 +676,16 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce);
 		goto out;
 	}
+
+	case KVM_ALLOCATE_RMA: {
+		struct kvm *kvm = filp->private_data;
+		struct kvm_allocate_rma rma;
+
+		r = kvm_vm_ioctl_allocate_rma(kvm, &rma);
+		if (r >= 0 && copy_to_user(argp, &rma, sizeof(rma)))
+			r = -EFAULT;
+		break;
+	}
 #endif /* CONFIG_KVM_BOOK3S_64_HV */
 
 	default:
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index e2a378d97160..2c366b52f505 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -552,6 +552,7 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_PPC_BOOKE_SREGS 62
 #define KVM_CAP_SPAPR_TCE 63
 #define KVM_CAP_PPC_SMT 64
+#define KVM_CAP_PPC_RMA	65
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -755,6 +756,8 @@ struct kvm_clock_data {
 #define KVM_GET_XCRS		  _IOR(KVMIO,  0xa6, struct kvm_xcrs)
 #define KVM_SET_XCRS		  _IOW(KVMIO,  0xa7, struct kvm_xcrs)
 #define KVM_CREATE_SPAPR_TCE	  _IOW(KVMIO,  0xa8, struct kvm_create_spapr_tce)
+/* Available with KVM_CAP_RMA */
+#define KVM_ALLOCATE_RMA	  _IOR(KVMIO,  0xa9, struct kvm_allocate_rma)
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 
-- 
cgit v1.2.3


From e03b644fe68b1c6401465b02724d261538dba10f Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 11 Jul 2011 15:28:11 -0400
Subject: KVM: introduce kvm_read_guest_cached

Introduce kvm_read_guest_cached() function in addition to write one we
already have.

[ by glauber: export function signature in kvm header ]

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Glauber Costa <glommer@redhat.com>
Acked-by: Rik van Riel <riel@redhat.com>
Tested-by: Eric Munson <emunson@mgebm.net>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 include/linux/kvm_host.h |  2 ++
 virt/kvm/kvm_main.c      | 20 ++++++++++++++++++++
 2 files changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 31ebb59cbd2f..f7df0a3b031d 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -381,6 +381,8 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
 int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
 			  unsigned long len);
 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len);
+int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
+			   void *data, unsigned long len);
 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
 			 int offset, int len);
 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 11d2783eb9df..d5ef9ebcaff7 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1418,6 +1418,26 @@ int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
 }
 EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
 
+int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
+			   void *data, unsigned long len)
+{
+	struct kvm_memslots *slots = kvm_memslots(kvm);
+	int r;
+
+	if (slots->generation != ghc->generation)
+		kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa);
+
+	if (kvm_is_error_hva(ghc->hva))
+		return -EFAULT;
+
+	r = __copy_from_user(data, (void __user *)ghc->hva, len);
+	if (r)
+		return -EFAULT;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
+
 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
 {
 	return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page,
-- 
cgit v1.2.3


From e2270ea62ae4d7a47d6d72942cdb8c669be6357a Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 12 Jul 2011 10:37:21 -0700
Subject: netdevice: Kill 'feature' test macros.

Almost all of these have long outstayed their welcome.

And for every one of these macros, there are 10 features for which we
didn't add macros.

Let's just delete them all, and get out of habit of doing things this
way.

Signed-off-by: David S. Miller <davem@davemloft.net>
Acked-by: Stephen Hemminger <shemminger@vyatta.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
---
 drivers/net/cxgb4vf/cxgb4vf_main.c            | 17 -----------------
 drivers/staging/ft1000/ft1000-usb/ft1000_hw.c |  9 ---------
 drivers/staging/wlags49_h2/wl_internal.h      |  7 -------
 include/linux/netdevice.h                     | 12 ------------
 net/batman-adv/soft-interface.c               | 16 ----------------
 5 files changed, 61 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/cxgb4vf/cxgb4vf_main.c b/drivers/net/cxgb4vf/cxgb4vf_main.c
index 8a6f8911bc55..3942a825b3dd 100644
--- a/drivers/net/cxgb4vf/cxgb4vf_main.c
+++ b/drivers/net/cxgb4vf/cxgb4vf_main.c
@@ -2421,7 +2421,6 @@ static int __devinit enable_msix(struct adapter *adapter)
 	return err;
 }
 
-#ifdef HAVE_NET_DEVICE_OPS
 static const struct net_device_ops cxgb4vf_netdev_ops	= {
 	.ndo_open		= cxgb4vf_open,
 	.ndo_stop		= cxgb4vf_stop,
@@ -2437,7 +2436,6 @@ static const struct net_device_ops cxgb4vf_netdev_ops	= {
 	.ndo_poll_controller	= cxgb4vf_poll_controller,
 #endif
 };
-#endif
 
 /*
  * "Probe" a device: initialize a device and construct all kernel and driver
@@ -2611,22 +2609,7 @@ static int __devinit cxgb4vf_pci_probe(struct pci_dev *pdev,
 		if (pci_using_dac)
 			netdev->features |= NETIF_F_HIGHDMA;
 
-#ifdef HAVE_NET_DEVICE_OPS
 		netdev->netdev_ops = &cxgb4vf_netdev_ops;
-#else
-		netdev->vlan_rx_register = cxgb4vf_vlan_rx_register;
-		netdev->open = cxgb4vf_open;
-		netdev->stop = cxgb4vf_stop;
-		netdev->hard_start_xmit = t4vf_eth_xmit;
-		netdev->get_stats = cxgb4vf_get_stats;
-		netdev->set_rx_mode = cxgb4vf_set_rxmode;
-		netdev->do_ioctl = cxgb4vf_do_ioctl;
-		netdev->change_mtu = cxgb4vf_change_mtu;
-		netdev->set_mac_address = cxgb4vf_set_mac_addr;
-#ifdef CONFIG_NET_POLL_CONTROLLER
-		netdev->poll_controller = cxgb4vf_poll_controller;
-#endif
-#endif
 		SET_ETHTOOL_OPS(netdev, &cxgb4vf_ethtool_ops);
 
 		/*
diff --git a/drivers/staging/ft1000/ft1000-usb/ft1000_hw.c b/drivers/staging/ft1000/ft1000-usb/ft1000_hw.c
index b0a4211f43a1..3f303ea14337 100644
--- a/drivers/staging/ft1000/ft1000-usb/ft1000_hw.c
+++ b/drivers/staging/ft1000/ft1000-usb/ft1000_hw.c
@@ -671,7 +671,6 @@ static int ft1000_reset_card(struct net_device *dev)
 	return TRUE;
 }
 
-#ifdef HAVE_NET_DEVICE_OPS
 static const struct net_device_ops ftnet_ops =
 {
 	.ndo_open = &ft1000_open,
@@ -679,7 +678,6 @@ static const struct net_device_ops ftnet_ops =
 	.ndo_start_xmit = &ft1000_start_xmit,
 	.ndo_get_stats = &ft1000_netdev_stats,
 };
-#endif
 
 
 //---------------------------------------------------------------------------
@@ -764,14 +762,7 @@ int init_ft1000_netdev(struct ft1000_device *ft1000dev)
 
 	INIT_LIST_HEAD(&pInfo->nodes.list);
 
-#ifdef HAVE_NET_DEVICE_OPS
 	netdev->netdev_ops = &ftnet_ops;
-#else
-	netdev->hard_start_xmit = &ft1000_start_xmit;
-	netdev->get_stats = &ft1000_netdev_stats;
-	netdev->open = &ft1000_open;
-	netdev->stop = &ft1000_close;
-#endif
 
 	ft1000dev->net = netdev;
 
diff --git a/drivers/staging/wlags49_h2/wl_internal.h b/drivers/staging/wlags49_h2/wl_internal.h
index cd129b3ee6c0..e86aad53b53d 100644
--- a/drivers/staging/wlags49_h2/wl_internal.h
+++ b/drivers/staging/wlags49_h2/wl_internal.h
@@ -990,14 +990,7 @@ struct wl_private
 #endif // USE_WDS
 }; // wl_private
 
-#ifdef HAVE_NETDEV_PRIV
 #define wl_priv(dev) ((struct wl_private *) netdev_priv(dev))
-#else
-extern inline struct wl_private *wl_priv(struct net_device *dev)
-{
-    return dev->priv;
-}
-#endif
 
 /********************************************************************/
 /* Locking and synchronization functions                            */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 011eb89e9582..30f17e42c304 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -60,11 +60,6 @@ struct wireless_dev;
 #define SET_ETHTOOL_OPS(netdev,ops) \
 	( (netdev)->ethtool_ops = (ops) )
 
-#define HAVE_ALLOC_NETDEV		/* feature macro: alloc_xxxdev
-					   functions are available. */
-#define HAVE_FREE_NETDEV		/* free_netdev() */
-#define HAVE_NETDEV_PRIV		/* netdev_priv() */
-
 /* hardware address assignment types */
 #define NET_ADDR_PERM		0	/* address is permanent (default) */
 #define NET_ADDR_RANDOM		1	/* address is generated randomly */
@@ -313,7 +308,6 @@ struct header_ops {
 			   const void *saddr, unsigned len);
 	int	(*parse)(const struct sk_buff *skb, unsigned char *haddr);
 	int	(*rebuild)(struct sk_buff *skb);
-#define HAVE_HEADER_CACHE
 	int	(*cache)(const struct neighbour *neigh, struct hh_cache *hh);
 	void	(*cache_update)(struct hh_cache *hh,
 				const struct net_device *dev,
@@ -887,7 +881,6 @@ struct netdev_tc_txq {
  *	Must return >0 or -errno if it changed dev->features itself.
  *
  */
-#define HAVE_NET_DEVICE_OPS
 struct net_device_ops {
 	int			(*ndo_init)(struct net_device *dev);
 	void			(*ndo_uninit)(struct net_device *dev);
@@ -1781,8 +1774,6 @@ static inline void input_queue_tail_incr_save(struct softnet_data *sd,
 
 DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 
-#define HAVE_NETIF_QUEUE
-
 extern void __netif_schedule(struct Qdisc *q);
 
 static inline void netif_schedule_queue(struct netdev_queue *txq)
@@ -2058,10 +2049,8 @@ extern void dev_kfree_skb_irq(struct sk_buff *skb);
  */
 extern void dev_kfree_skb_any(struct sk_buff *skb);
 
-#define HAVE_NETIF_RX 1
 extern int		netif_rx(struct sk_buff *skb);
 extern int		netif_rx_ni(struct sk_buff *skb);
-#define HAVE_NETIF_RECEIVE_SKB 1
 extern int		netif_receive_skb(struct sk_buff *skb);
 extern gro_result_t	dev_gro_receive(struct napi_struct *napi,
 					struct sk_buff *skb);
@@ -2241,7 +2230,6 @@ extern void netif_device_attach(struct net_device *dev);
 /*
  * Network interface message level settings
  */
-#define HAVE_NETIF_MSG 1
 
 enum {
 	NETIF_MSG_DRV		= 0x0001,
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 3f20332e1d38..3e2f91ffa4e2 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -750,7 +750,6 @@ out:
 	return;
 }
 
-#ifdef HAVE_NET_DEVICE_OPS
 static const struct net_device_ops bat_netdev_ops = {
 	.ndo_open = interface_open,
 	.ndo_stop = interface_release,
@@ -760,7 +759,6 @@ static const struct net_device_ops bat_netdev_ops = {
 	.ndo_start_xmit = interface_tx,
 	.ndo_validate_addr = eth_validate_addr
 };
-#endif
 
 static void interface_setup(struct net_device *dev)
 {
@@ -769,16 +767,7 @@ static void interface_setup(struct net_device *dev)
 
 	ether_setup(dev);
 
-#ifdef HAVE_NET_DEVICE_OPS
 	dev->netdev_ops = &bat_netdev_ops;
-#else
-	dev->open = interface_open;
-	dev->stop = interface_release;
-	dev->get_stats = interface_stats;
-	dev->set_mac_address = interface_set_mac_addr;
-	dev->change_mtu = interface_change_mtu;
-	dev->hard_start_xmit = interface_tx;
-#endif
 	dev->destructor = free_netdev;
 	dev->tx_queue_len = 0;
 
@@ -885,13 +874,8 @@ void softif_destroy(struct net_device *soft_iface)
 
 int softif_is_valid(const struct net_device *net_dev)
 {
-#ifdef HAVE_NET_DEVICE_OPS
 	if (net_dev->netdev_ops->ndo_start_xmit == interface_tx)
 		return 1;
-#else
-	if (net_dev->hard_start_xmit == interface_tx)
-		return 1;
-#endif
 
 	return 0;
 }
-- 
cgit v1.2.3


From 1e01979c8f502ac13e3cdece4f38712c5944e6e8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 12 Jul 2011 09:45:34 +0200
Subject: x86, numa: Implement pfn -> nid mapping granularity check

SPARSEMEM w/o VMEMMAP and DISCONTIGMEM, both used only on 32bit, use
sections array to map pfn to nid which is limited in granularity.  If
NUMA nodes are laid out such that the mapping cannot be accurate, boot
will fail triggering BUG_ON() in mminit_verify_page_links().

On 32bit, it's 512MiB w/ PAE and SPARSEMEM.  This seems to have been
granular enough until commit 2706a0bf7b (x86, NUMA: Enable
CONFIG_AMD_NUMA on 32bit too).  Apparently, there is a machine which
aligns NUMA nodes to 128MiB and has only AMD NUMA but not SRAT.  This
led to the following BUG_ON().

 On node 0 totalpages: 2096615
   DMA zone: 32 pages used for memmap
   DMA zone: 0 pages reserved
   DMA zone: 3927 pages, LIFO batch:0
   Normal zone: 1740 pages used for memmap
   Normal zone: 220978 pages, LIFO batch:31
   HighMem zone: 16405 pages used for memmap
   HighMem zone: 1853533 pages, LIFO batch:31
 BUG: Int 6: CR2   (null)
      EDI   (null)  ESI 00000002  EBP 00000002  ESP c1543ecc
      EBX f2400000  EDX 00000006  ECX   (null)  EAX 00000001
      err   (null)  EIP c16209aa   CS 00000060  flg 00010002
 Stack: f2400000 00220000 f7200800 c1620613 00220000 01000000 04400000 00238000
          (null) f7200000 00000002 f7200b58 f7200800 c1620929 000375fe   (null)
        f7200b80 c16395f0 00200a02 f7200a80   (null) 000375fe 00000002   (null)
 Pid: 0, comm: swapper Not tainted 2.6.39-rc5-00181-g2706a0b #17
 Call Trace:
  [<c136b1e5>] ? early_fault+0x2e/0x2e
  [<c16209aa>] ? mminit_verify_page_links+0x12/0x42
  [<c1620613>] ? memmap_init_zone+0xaf/0x10c
  [<c1620929>] ? free_area_init_node+0x2b9/0x2e3
  [<c1607e99>] ? free_area_init_nodes+0x3f2/0x451
  [<c1601d80>] ? paging_init+0x112/0x118
  [<c15f578d>] ? setup_arch+0x791/0x82f
  [<c15f43d9>] ? start_kernel+0x6a/0x257

This patch implements node_map_pfn_alignment() which determines
maximum internode alignment and update numa_register_memblks() to
reject NUMA configuration if alignment exceeds the pfn -> nid mapping
granularity of the memory model as determined by PAGES_PER_SECTION.

This makes the problematic machine boot w/ flatmem by rejecting the
NUMA config and provides protection against crazy NUMA configurations.

Signed-off-by: Tejun Heo <tj@kernel.org>
Link: http://lkml.kernel.org/r/20110712074534.GB2872@htj.dyndns.org
LKML-Reference: <20110628174613.GP478@escobedo.osrc.amd.com>
Reported-and-Tested-by: Hans Rosenfeld <hans.rosenfeld@amd.com>
Cc: Conny Seidel <conny.seidel@amd.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/mm/numa.c | 15 +++++++++++++++
 include/linux/mm.h |  1 +
 mm/page_alloc.c    | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 70 insertions(+)

(limited to 'include/linux')

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index f5510d889a22..fbeaaf416610 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -496,6 +496,7 @@ static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
 
 static int __init numa_register_memblks(struct numa_meminfo *mi)
 {
+	unsigned long uninitialized_var(pfn_align);
 	int i, nid;
 
 	/* Account for nodes with cpus and no memory */
@@ -511,6 +512,20 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
 
 	/* for out of order entries */
 	sort_node_map();
+
+	/*
+	 * If sections array is gonna be used for pfn -> nid mapping, check
+	 * whether its granularity is fine enough.
+	 */
+#ifdef NODE_NOT_IN_PAGE_FLAGS
+	pfn_align = node_map_pfn_alignment();
+	if (pfn_align && pfn_align < PAGES_PER_SECTION) {
+		printk(KERN_WARNING "Node alignment %LuMB < min %LuMB, rejecting NUMA config\n",
+		       PFN_PHYS(pfn_align) >> 20,
+		       PFN_PHYS(PAGES_PER_SECTION) >> 20);
+		return -EINVAL;
+	}
+#endif
 	if (!numa_meminfo_cover_memory(mi))
 		return -EINVAL;
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9670f71d7be9..c70a326b8f26 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1313,6 +1313,7 @@ extern void remove_active_range(unsigned int nid, unsigned long start_pfn,
 					unsigned long end_pfn);
 extern void remove_all_active_ranges(void);
 void sort_node_map(void);
+unsigned long node_map_pfn_alignment(void);
 unsigned long __absent_pages_in_range(int nid, unsigned long start_pfn,
 						unsigned long end_pfn);
 extern unsigned long absent_pages_in_range(unsigned long start_pfn,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4e8985acdab8..9119faae6e6a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4585,6 +4585,60 @@ void __init sort_node_map(void)
 			cmp_node_active_region, NULL);
 }
 
+/**
+ * node_map_pfn_alignment - determine the maximum internode alignment
+ *
+ * This function should be called after node map is populated and sorted.
+ * It calculates the maximum power of two alignment which can distinguish
+ * all the nodes.
+ *
+ * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
+ * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)).  If the
+ * nodes are shifted by 256MiB, 256MiB.  Note that if only the last node is
+ * shifted, 1GiB is enough and this function will indicate so.
+ *
+ * This is used to test whether pfn -> nid mapping of the chosen memory
+ * model has fine enough granularity to avoid incorrect mapping for the
+ * populated node map.
+ *
+ * Returns the determined alignment in pfn's.  0 if there is no alignment
+ * requirement (single node).
+ */
+unsigned long __init node_map_pfn_alignment(void)
+{
+	unsigned long accl_mask = 0, last_end = 0;
+	int last_nid = -1;
+	int i;
+
+	for_each_active_range_index_in_nid(i, MAX_NUMNODES) {
+		int nid = early_node_map[i].nid;
+		unsigned long start = early_node_map[i].start_pfn;
+		unsigned long end = early_node_map[i].end_pfn;
+		unsigned long mask;
+
+		if (!start || last_nid < 0 || last_nid == nid) {
+			last_nid = nid;
+			last_end = end;
+			continue;
+		}
+
+		/*
+		 * Start with a mask granular enough to pin-point to the
+		 * start pfn and tick off bits one-by-one until it becomes
+		 * too coarse to separate the current node from the last.
+		 */
+		mask = ~((1 << __ffs(start)) - 1);
+		while (mask && last_end <= (start & (mask << 1)))
+			mask <<= 1;
+
+		/* accumulate all internode masks */
+		accl_mask |= mask;
+	}
+
+	/* convert mask to number of pages */
+	return ~accl_mask + 1;
+}
+
 /* Find the lowest pfn for a node */
 static unsigned long __init find_min_pfn_for_node(int nid)
 {
-- 
cgit v1.2.3


From e69dd336ee3a05a589629b505b18ba5e7a5b4c54 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 12 Jul 2011 23:28:12 -0700
Subject: net: Push protocol type directly down to header_ops->cache()

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/firewire/net.c      | 6 +++---
 drivers/isdn/i4l/isdn_net.c | 5 +++--
 drivers/net/plip.c          | 6 +++---
 include/linux/etherdevice.h | 2 +-
 include/linux/netdevice.h   | 2 +-
 net/core/neighbour.c        | 2 +-
 net/ethernet/eth.c          | 3 +--
 7 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firewire/net.c b/drivers/firewire/net.c
index b9762d07198d..eced1c25bf58 100644
--- a/drivers/firewire/net.c
+++ b/drivers/firewire/net.c
@@ -261,16 +261,16 @@ static int fwnet_header_rebuild(struct sk_buff *skb)
 }
 
 static int fwnet_header_cache(const struct neighbour *neigh,
-			      struct hh_cache *hh)
+			      struct hh_cache *hh, __be16 type)
 {
 	struct net_device *net;
 	struct fwnet_header *h;
 
-	if (hh->hh_type == cpu_to_be16(ETH_P_802_3))
+	if (type == cpu_to_be16(ETH_P_802_3))
 		return -1;
 	net = neigh->dev;
 	h = (struct fwnet_header *)((u8 *)hh->hh_data + 16 - sizeof(*h));
-	h->h_proto = hh->hh_type;
+	h->h_proto = type;
 	memcpy(h->h_dest, neigh->ha, net->addr_len);
 	hh->hh_len = FWNET_HLEN;
 
diff --git a/drivers/isdn/i4l/isdn_net.c b/drivers/isdn/i4l/isdn_net.c
index 97988111e45a..48e9cc0369b1 100644
--- a/drivers/isdn/i4l/isdn_net.c
+++ b/drivers/isdn/i4l/isdn_net.c
@@ -1983,13 +1983,14 @@ isdn_net_rebuild_header(struct sk_buff *skb)
 	return ret;
 }
 
-static int isdn_header_cache(const struct neighbour *neigh, struct hh_cache *hh)
+static int isdn_header_cache(const struct neighbour *neigh, struct hh_cache *hh,
+			     __be16 type)
 {
 	const struct net_device *dev = neigh->dev;
 	isdn_net_local *lp = netdev_priv(dev);
 
 	if (lp->p_encap == ISDN_NET_ENCAP_ETHER)
-		return eth_header_cache(neigh, hh);
+		return eth_header_cache(neigh, hh, type);
 	return -1;
 }
 
diff --git a/drivers/net/plip.c b/drivers/net/plip.c
index ca4df7f4cf21..a9e9ca8a86ed 100644
--- a/drivers/net/plip.c
+++ b/drivers/net/plip.c
@@ -152,7 +152,7 @@ static int plip_hard_header(struct sk_buff *skb, struct net_device *dev,
                             unsigned short type, const void *daddr,
 			    const void *saddr, unsigned len);
 static int plip_hard_header_cache(const struct neighbour *neigh,
-                                  struct hh_cache *hh);
+                                  struct hh_cache *hh, __be16 type);
 static int plip_open(struct net_device *dev);
 static int plip_close(struct net_device *dev);
 static int plip_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd);
@@ -1026,11 +1026,11 @@ plip_hard_header(struct sk_buff *skb, struct net_device *dev,
 }
 
 static int plip_hard_header_cache(const struct neighbour *neigh,
-				  struct hh_cache *hh)
+				  struct hh_cache *hh, __be16 type)
 {
 	int ret;
 
-	ret = eth_header_cache(neigh, hh);
+	ret = eth_header_cache(neigh, hh, type);
 	if (ret == 0) {
 		struct ethhdr *eth;
 
diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index ab68f785fd19..05955cf09937 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -38,7 +38,7 @@ extern int eth_header(struct sk_buff *skb, struct net_device *dev,
 		      const void *daddr, const void *saddr, unsigned len);
 extern int eth_rebuild_header(struct sk_buff *skb);
 extern int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr);
-extern int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh);
+extern int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh, __be16 type);
 extern void eth_header_cache_update(struct hh_cache *hh,
 				    const struct net_device *dev,
 				    const unsigned char *haddr);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 30f17e42c304..564d89fdddda 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -308,7 +308,7 @@ struct header_ops {
 			   const void *saddr, unsigned len);
 	int	(*parse)(const struct sk_buff *skb, unsigned char *haddr);
 	int	(*rebuild)(struct sk_buff *skb);
-	int	(*cache)(const struct neighbour *neigh, struct hh_cache *hh);
+	int	(*cache)(const struct neighbour *neigh, struct hh_cache *hh, __be16 type);
 	void	(*cache_update)(struct hh_cache *hh,
 				const struct net_device *dev,
 				const unsigned char *haddr);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 50bd960983e0..8f7e1d8d92a0 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1247,7 +1247,7 @@ static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst,
 	hh->hh_type = protocol;
 	atomic_set(&hh->hh_refcnt, 2);
 
-	if (dev->header_ops->cache(n, hh)) {
+	if (dev->header_ops->cache(n, hh, protocol)) {
 		kfree(hh);
 		return;
 	}
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 44d2b42fda56..5cffb63f481a 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -233,9 +233,8 @@ EXPORT_SYMBOL(eth_header_parse);
  * @hh: destination cache entry
  * Create an Ethernet header template from the neighbour.
  */
-int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh)
+int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh, __be16 type)
 {
-	__be16 type = hh->hh_type;
 	struct ethhdr *eth;
 	const struct net_device *dev = neigh->dev;
 
-- 
cgit v1.2.3


From 5c25f686db352082eef8daa21b760192351a023a Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 13 Jul 2011 00:51:10 -0700
Subject: net: Kill support for multiple hh_cache entries per neighbour

This never, ever, happens.

Neighbour entries are always tied to one address family, and therefore
one set of dst_ops, and therefore one dst_ops->protocol "hh_type"
value.

This capability was blindly imported by Alexey Kuznetsov when he wrote
the neighbour layer.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  9 ++-------
 net/core/neighbour.c      | 37 ++++++++++++++++++-------------------
 2 files changed, 20 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 564d89fdddda..75382378a1ba 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -252,7 +252,6 @@ struct netdev_hw_addr_list {
 	netdev_hw_addr_list_for_each(ha, &(dev)->mc)
 
 struct hh_cache {
-	struct hh_cache *hh_next;	/* Next entry			     */
 	atomic_t	hh_refcnt;	/* number of users                   */
 /*
  * We want hh_output, hh_len, hh_lock and hh_data be a in a separate
@@ -260,12 +259,8 @@ struct hh_cache {
  * They are mostly read, but hh_refcnt may be changed quite frequently,
  * incurring cache line ping pongs.
  */
-	__be16		hh_type ____cacheline_aligned_in_smp;
-					/* protocol identifier, f.e ETH_P_IP
-                                         *  NOTE:  For VLANs, this will be the
-                                         *  encapuslated type. --BLG
-                                         */
-	u16		hh_len;		/* length of header */
+	u16		hh_len ____cacheline_aligned_in_smp;
+	u16		__pad;
 	int		(*hh_output)(struct sk_buff *skb);
 	seqlock_t	hh_lock;
 
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 8f7e1d8d92a0..f879bb552994 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -702,9 +702,9 @@ void neigh_destroy(struct neighbour *neigh)
 	if (neigh_del_timer(neigh))
 		printk(KERN_WARNING "Impossible event.\n");
 
-	while ((hh = neigh->hh) != NULL) {
-		neigh->hh = hh->hh_next;
-		hh->hh_next = NULL;
+	hh = neigh->hh;
+	if (hh) {
+		neigh->hh = NULL;
 
 		write_seqlock_bh(&hh->hh_lock);
 		hh->hh_output = neigh_blackhole;
@@ -737,7 +737,8 @@ static void neigh_suspect(struct neighbour *neigh)
 
 	neigh->output = neigh->ops->output;
 
-	for (hh = neigh->hh; hh; hh = hh->hh_next)
+	hh = neigh->hh;
+	if (hh)
 		hh->hh_output = neigh->ops->output;
 }
 
@@ -754,7 +755,8 @@ static void neigh_connect(struct neighbour *neigh)
 
 	neigh->output = neigh->ops->connected_output;
 
-	for (hh = neigh->hh; hh; hh = hh->hh_next)
+	hh = neigh->hh;
+	if (hh)
 		hh->hh_output = neigh->ops->hh_output;
 }
 
@@ -1025,7 +1027,8 @@ static void neigh_update_hhs(const struct neighbour *neigh)
 		update = neigh->dev->header_ops->cache_update;
 
 	if (update) {
-		for (hh = neigh->hh; hh; hh = hh->hh_next) {
+		hh = neigh->hh;
+		if (hh) {
 			write_seqlock_bh(&hh->hh_lock);
 			update(hh, neigh->dev, neigh->ha);
 			write_sequnlock_bh(&hh->hh_lock);
@@ -1211,19 +1214,17 @@ struct neighbour *neigh_event_ns(struct neigh_table *tbl,
 }
 EXPORT_SYMBOL(neigh_event_ns);
 
-static inline bool neigh_hh_lookup(struct neighbour *n, struct dst_entry *dst,
-				   __be16 protocol)
+static inline bool neigh_hh_lookup(struct neighbour *n, struct dst_entry *dst)
 {
 	struct hh_cache *hh;
 
 	smp_rmb(); /* paired with smp_wmb() in neigh_hh_init() */
-	for (hh = n->hh; hh; hh = hh->hh_next) {
-		if (hh->hh_type == protocol) {
-			atomic_inc(&hh->hh_refcnt);
-			if (unlikely(cmpxchg(&dst->hh, NULL, hh) != NULL))
-				hh_cache_put(hh);
-			return true;
-		}
+	hh = n->hh;
+	if (hh) {
+		atomic_inc(&hh->hh_refcnt);
+		if (unlikely(cmpxchg(&dst->hh, NULL, hh) != NULL))
+			hh_cache_put(hh);
+		return true;
 	}
 	return false;
 }
@@ -1235,7 +1236,7 @@ static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst,
 	struct hh_cache	*hh;
 	struct net_device *dev = dst->dev;
 
-	if (likely(neigh_hh_lookup(n, dst, protocol)))
+	if (likely(neigh_hh_lookup(n, dst)))
 		return;
 
 	/* slow path */
@@ -1244,7 +1245,6 @@ static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst,
 		return;
 
 	seqlock_init(&hh->hh_lock);
-	hh->hh_type = protocol;
 	atomic_set(&hh->hh_refcnt, 2);
 
 	if (dev->header_ops->cache(n, hh, protocol)) {
@@ -1255,7 +1255,7 @@ static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst,
 	write_lock_bh(&n->lock);
 
 	/* must check if another thread already did the insert */
-	if (neigh_hh_lookup(n, dst, protocol)) {
+	if (neigh_hh_lookup(n, dst)) {
 		kfree(hh);
 		goto end;
 	}
@@ -1265,7 +1265,6 @@ static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst,
 	else
 		hh->hh_output = n->ops->output;
 
-	hh->hh_next = n->hh;
 	smp_wmb(); /* paired with smp_rmb() in neigh_hh_lookup() */
 	n->hh	    = hh;
 
-- 
cgit v1.2.3


From 5125bbf3880755419eff68672623cde49c4f31e8 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 13 Jul 2011 12:31:52 +0200
Subject: PM / Domains: Introduce function to power off all unused PM domains

Add a new function pm_genpd_poweroff_unused() queuing up the
execution of pm_genpd_poweroff() for every initialized generic PM
domain.  Calling it will cause every generic PM domain without
devices in use to be powered off.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Magnus Damm <damm@opensource.se>
---
 drivers/base/power/domain.c | 21 +++++++++++++++++++++
 include/linux/pm_domain.h   |  3 +++
 2 files changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index c3e4e2934e16..c2c537de22b6 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -16,6 +16,9 @@
 #include <linux/sched.h>
 #include <linux/suspend.h>
 
+static LIST_HEAD(gpd_list);
+static DEFINE_MUTEX(gpd_list_lock);
+
 #ifdef CONFIG_PM
 
 static struct generic_pm_domain *dev_to_genpd(struct device *dev)
@@ -1241,4 +1244,22 @@ void pm_genpd_init(struct generic_pm_domain *genpd,
 	genpd->domain.ops.restore_noirq = pm_genpd_restore_noirq;
 	genpd->domain.ops.restore = pm_genpd_restore;
 	genpd->domain.ops.complete = pm_genpd_complete;
+	mutex_lock(&gpd_list_lock);
+	list_add(&genpd->gpd_list_node, &gpd_list);
+	mutex_unlock(&gpd_list_lock);
+}
+
+/**
+ * pm_genpd_poweroff_unused - Power off all PM domains with no devices in use.
+ */
+void pm_genpd_poweroff_unused(void)
+{
+	struct generic_pm_domain *genpd;
+
+	mutex_lock(&gpd_list_lock);
+
+	list_for_each_entry(genpd, &gpd_list, gpd_list_node)
+		genpd_queue_power_off_work(genpd);
+
+	mutex_unlock(&gpd_list_lock);
 }
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index feb80af4cc1b..3e4f3d308f5e 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -24,6 +24,7 @@ struct dev_power_governor {
 
 struct generic_pm_domain {
 	struct dev_pm_domain domain;	/* PM domain operations */
+	struct list_head gpd_list_node;	/* Node in the global PM domains list */
 	struct list_head sd_node;	/* Node in the parent's subdomain list */
 	struct generic_pm_domain *parent;	/* Parent PM domain */
 	struct list_head sd_list;	/* List of dubdomains */
@@ -71,6 +72,7 @@ extern int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd,
 extern void pm_genpd_init(struct generic_pm_domain *genpd,
 			  struct dev_power_governor *gov, bool is_off);
 extern int pm_genpd_poweron(struct generic_pm_domain *genpd);
+extern void pm_genpd_poweroff_unused(void);
 #else
 static inline int pm_genpd_add_device(struct generic_pm_domain *genpd,
 				      struct device *dev)
@@ -98,6 +100,7 @@ static inline int pm_genpd_poweron(struct generic_pm_domain *genpd)
 {
 	return -ENOSYS;
 }
+static inline void pm_genpd_poweroff_unused(void) {}
 #endif
 
 #endif /* _LINUX_PM_DOMAIN_H */
-- 
cgit v1.2.3


From 433bd805e5fd2c731b3a9025b034f066272d336e Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@mit.edu>
Date: Wed, 13 Jul 2011 09:24:13 -0400
Subject: clocksource: Replace vread with generic arch data

The vread field was bloating struct clocksource everywhere except
x86_64, and I want to change the way this works on x86_64, so let's
split it out into per-arch data.

Cc: x86@kernel.org
Cc: Clemens Ladisch <clemens@ladisch.de>
Cc: linux-ia64@vger.kernel.org
Cc: Tony Luck <tony.luck@intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: John Stultz <johnstul@us.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andy Lutomirski <luto@mit.edu>
Link: http://lkml.kernel.org/r/3ae5ec76a168eaaae63f08a2a1060b91aa0b7759.1310563276.git.luto@mit.edu
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/clocksource.h | 16 ++++++++++++++++
 arch/x86/kernel/hpet.c             |  2 +-
 arch/x86/kernel/tsc.c              |  2 +-
 arch/x86/kernel/vsyscall_64.c      |  2 +-
 include/asm-generic/clocksource.h  |  4 ++++
 include/linux/clocksource.h        | 10 ++++++++--
 6 files changed, 31 insertions(+), 5 deletions(-)
 create mode 100644 arch/x86/include/asm/clocksource.h
 create mode 100644 include/asm-generic/clocksource.h

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h
new file mode 100644
index 000000000000..a5df33f614c9
--- /dev/null
+++ b/arch/x86/include/asm/clocksource.h
@@ -0,0 +1,16 @@
+/* x86-specific clocksource additions */
+
+#ifndef _ASM_X86_CLOCKSOURCE_H
+#define _ASM_X86_CLOCKSOURCE_H
+
+#ifdef CONFIG_X86_64
+
+#define __ARCH_HAS_CLOCKSOURCE_DATA
+
+struct arch_clocksource_data {
+	cycle_t (*vread)(void);
+};
+
+#endif /* CONFIG_X86_64 */
+
+#endif /* _ASM_X86_CLOCKSOURCE_H */
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index e9f5605e4748..0e07257bb389 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -753,7 +753,7 @@ static struct clocksource clocksource_hpet = {
 	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
 	.resume		= hpet_resume_counter,
 #ifdef CONFIG_X86_64
-	.vread		= vread_hpet,
+	.archdata	= { .vread = vread_hpet },
 #endif
 };
 
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 6cc6922262af..e7a74b889ab3 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -777,7 +777,7 @@ static struct clocksource clocksource_tsc = {
 	.flags                  = CLOCK_SOURCE_IS_CONTINUOUS |
 				  CLOCK_SOURCE_MUST_VERIFY,
 #ifdef CONFIG_X86_64
-	.vread                  = vread_tsc,
+	.archdata               = { .vread = vread_tsc },
 #endif
 };
 
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index a262400c3479..12d488fd95d9 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -74,7 +74,7 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
 	write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
 
 	/* copy vsyscall data */
-	vsyscall_gtod_data.clock.vread		= clock->vread;
+	vsyscall_gtod_data.clock.vread		= clock->archdata.vread;
 	vsyscall_gtod_data.clock.cycle_last	= clock->cycle_last;
 	vsyscall_gtod_data.clock.mask		= clock->mask;
 	vsyscall_gtod_data.clock.mult		= mult;
diff --git a/include/asm-generic/clocksource.h b/include/asm-generic/clocksource.h
new file mode 100644
index 000000000000..0a462d3fb05e
--- /dev/null
+++ b/include/asm-generic/clocksource.h
@@ -0,0 +1,4 @@
+/*
+ * Architectures should override this file to add private userspace
+ * clock magic if needed.
+ */
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index d4646b48dc4a..0fb83c224471 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -22,6 +22,8 @@
 typedef u64 cycle_t;
 struct clocksource;
 
+#include <asm/clocksource.h>
+
 /**
  * struct cyclecounter - hardware abstraction for a free running counter
  *	Provides completely state-free accessors to the underlying hardware.
@@ -153,7 +155,7 @@ extern u64 timecounter_cyc2time(struct timecounter *tc,
  * @shift:		cycle to nanosecond divisor (power of two)
  * @max_idle_ns:	max idle time permitted by the clocksource (nsecs)
  * @flags:		flags describing special properties
- * @vread:		vsyscall based read
+ * @archdata:		arch-specific data
  * @suspend:		suspend function for the clocksource, if necessary
  * @resume:		resume function for the clocksource, if necessary
  */
@@ -175,10 +177,14 @@ struct clocksource {
 #else
 #define CLKSRC_FSYS_MMIO_SET(mmio, addr)      do { } while (0)
 #endif
+
+#ifdef __ARCH_HAS_CLOCKSOURCE_DATA
+	struct arch_clocksource_data archdata;
+#endif
+
 	const char *name;
 	struct list_head list;
 	int rating;
-	cycle_t (*vread)(void);
 	int (*enable)(struct clocksource *cs);
 	void (*disable)(struct clocksource *cs);
 	unsigned long flags;
-- 
cgit v1.2.3


From f39b2dd9d065151a04f5996656d1f27a7eb32d45 Mon Sep 17 00:00:00 2001
From: Philip Rakity <prakity@marvell.com>
Date: Thu, 7 Jul 2011 09:04:55 -0700
Subject: mmc: core: Bus width testing needs to handle suspend/resume

On reading the ext_csd for the first time (in 1 bit mode), save the
ext_csd information needed for bus width compare.

On every pass we make re-reading the ext_csd, compare the data
against the saved ext_csd data.

This fixes a regression introduced in 3.0-rc1 by 08ee80cc397ac1a3
("mmc: core: eMMC bus width may not work on all platforms"), which
incorrectly assumed we would be re-reading the ext_csd at resume-
time.

Signed-off-by: Philip Rakity <prakity@marvell.com>
Tested-by: Jaehoon Chung <jh80.chung@samsung.com>
Signed-off-by: Chris Ball <cjb@laptop.org>
---
 drivers/mmc/core/mmc.c   | 77 +++++++++++++++++++++++++++++++-----------------
 include/linux/mmc/card.h | 13 ++++++++
 2 files changed, 63 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c
index 2a7e43bc796d..aa7d1d79b8c5 100644
--- a/drivers/mmc/core/mmc.c
+++ b/drivers/mmc/core/mmc.c
@@ -247,12 +247,12 @@ static int mmc_read_ext_csd(struct mmc_card *card, u8 *ext_csd)
 		return 0;
 
 	/* Version is coded in the CSD_STRUCTURE byte in the EXT_CSD register */
+	card->ext_csd.raw_ext_csd_structure = ext_csd[EXT_CSD_STRUCTURE];
 	if (card->csd.structure == 3) {
-		int ext_csd_struct = ext_csd[EXT_CSD_STRUCTURE];
-		if (ext_csd_struct > 2) {
+		if (card->ext_csd.raw_ext_csd_structure > 2) {
 			printk(KERN_ERR "%s: unrecognised EXT_CSD structure "
 				"version %d\n", mmc_hostname(card->host),
-					ext_csd_struct);
+					card->ext_csd.raw_ext_csd_structure);
 			err = -EINVAL;
 			goto out;
 		}
@@ -266,6 +266,10 @@ static int mmc_read_ext_csd(struct mmc_card *card, u8 *ext_csd)
 		goto out;
 	}
 
+	card->ext_csd.raw_sectors[0] = ext_csd[EXT_CSD_SEC_CNT + 0];
+	card->ext_csd.raw_sectors[1] = ext_csd[EXT_CSD_SEC_CNT + 1];
+	card->ext_csd.raw_sectors[2] = ext_csd[EXT_CSD_SEC_CNT + 2];
+	card->ext_csd.raw_sectors[3] = ext_csd[EXT_CSD_SEC_CNT + 3];
 	if (card->ext_csd.rev >= 2) {
 		card->ext_csd.sectors =
 			ext_csd[EXT_CSD_SEC_CNT + 0] << 0 |
@@ -277,7 +281,7 @@ static int mmc_read_ext_csd(struct mmc_card *card, u8 *ext_csd)
 		if (card->ext_csd.sectors > (2u * 1024 * 1024 * 1024) / 512)
 			mmc_card_set_blockaddr(card);
 	}
-
+	card->ext_csd.raw_card_type = ext_csd[EXT_CSD_CARD_TYPE];
 	switch (ext_csd[EXT_CSD_CARD_TYPE] & EXT_CSD_CARD_TYPE_MASK) {
 	case EXT_CSD_CARD_TYPE_DDR_52 | EXT_CSD_CARD_TYPE_52 |
 	     EXT_CSD_CARD_TYPE_26:
@@ -307,6 +311,11 @@ static int mmc_read_ext_csd(struct mmc_card *card, u8 *ext_csd)
 			mmc_hostname(card->host));
 	}
 
+	card->ext_csd.raw_s_a_timeout = ext_csd[EXT_CSD_S_A_TIMEOUT];
+	card->ext_csd.raw_erase_timeout_mult =
+		ext_csd[EXT_CSD_ERASE_TIMEOUT_MULT];
+	card->ext_csd.raw_hc_erase_grp_size =
+		ext_csd[EXT_CSD_HC_ERASE_GRP_SIZE];
 	if (card->ext_csd.rev >= 3) {
 		u8 sa_shift = ext_csd[EXT_CSD_S_A_TIMEOUT];
 		card->ext_csd.part_config = ext_csd[EXT_CSD_PART_CONFIG];
@@ -334,6 +343,16 @@ static int mmc_read_ext_csd(struct mmc_card *card, u8 *ext_csd)
 		card->ext_csd.boot_size = ext_csd[EXT_CSD_BOOT_MULT] << 17;
 	}
 
+	card->ext_csd.raw_hc_erase_gap_size =
+		ext_csd[EXT_CSD_PARTITION_ATTRIBUTE];
+	card->ext_csd.raw_sec_trim_mult =
+		ext_csd[EXT_CSD_SEC_TRIM_MULT];
+	card->ext_csd.raw_sec_erase_mult =
+		ext_csd[EXT_CSD_SEC_ERASE_MULT];
+	card->ext_csd.raw_sec_feature_support =
+		ext_csd[EXT_CSD_SEC_FEATURE_SUPPORT];
+	card->ext_csd.raw_trim_mult =
+		ext_csd[EXT_CSD_TRIM_MULT];
 	if (card->ext_csd.rev >= 4) {
 		/*
 		 * Enhanced area feature support -- check whether the eMMC
@@ -341,7 +360,7 @@ static int mmc_read_ext_csd(struct mmc_card *card, u8 *ext_csd)
 		 * area offset and size to user by adding sysfs interface.
 		 */
 		if ((ext_csd[EXT_CSD_PARTITION_SUPPORT] & 0x2) &&
-				(ext_csd[EXT_CSD_PARTITION_ATTRIBUTE] & 0x1)) {
+		    (ext_csd[EXT_CSD_PARTITION_ATTRIBUTE] & 0x1)) {
 			u8 hc_erase_grp_sz =
 				ext_csd[EXT_CSD_HC_ERASE_GRP_SIZE];
 			u8 hc_wp_grp_sz =
@@ -401,17 +420,17 @@ static inline void mmc_free_ext_csd(u8 *ext_csd)
 }
 
 
-static int mmc_compare_ext_csds(struct mmc_card *card, u8 *ext_csd,
-			unsigned bus_width)
+static int mmc_compare_ext_csds(struct mmc_card *card, unsigned bus_width)
 {
 	u8 *bw_ext_csd;
 	int err;
 
+	if (bus_width == MMC_BUS_WIDTH_1)
+		return 0;
+
 	err = mmc_get_ext_csd(card, &bw_ext_csd);
-	if (err)
-		return err;
 
-	if ((ext_csd == NULL || bw_ext_csd == NULL)) {
+	if (err || bw_ext_csd == NULL) {
 		if (bus_width != MMC_BUS_WIDTH_1)
 			err = -EINVAL;
 		goto out;
@@ -421,35 +440,40 @@ static int mmc_compare_ext_csds(struct mmc_card *card, u8 *ext_csd,
 		goto out;
 
 	/* only compare read only fields */
-	err = (!(ext_csd[EXT_CSD_PARTITION_SUPPORT] ==
+	err = (!(card->ext_csd.raw_partition_support ==
 			bw_ext_csd[EXT_CSD_PARTITION_SUPPORT]) &&
-		(ext_csd[EXT_CSD_ERASED_MEM_CONT] ==
+		(card->ext_csd.raw_erased_mem_count ==
 			bw_ext_csd[EXT_CSD_ERASED_MEM_CONT]) &&
-		(ext_csd[EXT_CSD_REV] ==
+		(card->ext_csd.rev ==
 			bw_ext_csd[EXT_CSD_REV]) &&
-		(ext_csd[EXT_CSD_STRUCTURE] ==
+		(card->ext_csd.raw_ext_csd_structure ==
 			bw_ext_csd[EXT_CSD_STRUCTURE]) &&
-		(ext_csd[EXT_CSD_CARD_TYPE] ==
+		(card->ext_csd.raw_card_type ==
 			bw_ext_csd[EXT_CSD_CARD_TYPE]) &&
-		(ext_csd[EXT_CSD_S_A_TIMEOUT] ==
+		(card->ext_csd.raw_s_a_timeout ==
 			bw_ext_csd[EXT_CSD_S_A_TIMEOUT]) &&
-		(ext_csd[EXT_CSD_HC_WP_GRP_SIZE] ==
+		(card->ext_csd.raw_hc_erase_gap_size ==
 			bw_ext_csd[EXT_CSD_HC_WP_GRP_SIZE]) &&
-		(ext_csd[EXT_CSD_ERASE_TIMEOUT_MULT] ==
+		(card->ext_csd.raw_erase_timeout_mult ==
 			bw_ext_csd[EXT_CSD_ERASE_TIMEOUT_MULT]) &&
-		(ext_csd[EXT_CSD_HC_ERASE_GRP_SIZE] ==
+		(card->ext_csd.raw_hc_erase_grp_size ==
 			bw_ext_csd[EXT_CSD_HC_ERASE_GRP_SIZE]) &&
-		(ext_csd[EXT_CSD_SEC_TRIM_MULT] ==
+		(card->ext_csd.raw_sec_trim_mult ==
 			bw_ext_csd[EXT_CSD_SEC_TRIM_MULT]) &&
-		(ext_csd[EXT_CSD_SEC_ERASE_MULT] ==
+		(card->ext_csd.raw_sec_erase_mult ==
 			bw_ext_csd[EXT_CSD_SEC_ERASE_MULT]) &&
-		(ext_csd[EXT_CSD_SEC_FEATURE_SUPPORT] ==
+		(card->ext_csd.raw_sec_feature_support ==
 			bw_ext_csd[EXT_CSD_SEC_FEATURE_SUPPORT]) &&
-		(ext_csd[EXT_CSD_TRIM_MULT] ==
+		(card->ext_csd.raw_trim_mult ==
 			bw_ext_csd[EXT_CSD_TRIM_MULT]) &&
-		memcmp(&ext_csd[EXT_CSD_SEC_CNT],
-		       &bw_ext_csd[EXT_CSD_SEC_CNT],
-		       4) != 0);
+		(card->ext_csd.raw_sectors[0] ==
+			bw_ext_csd[EXT_CSD_SEC_CNT + 0]) &&
+		(card->ext_csd.raw_sectors[1] ==
+			bw_ext_csd[EXT_CSD_SEC_CNT + 1]) &&
+		(card->ext_csd.raw_sectors[2] ==
+			bw_ext_csd[EXT_CSD_SEC_CNT + 2]) &&
+		(card->ext_csd.raw_sectors[3] ==
+			bw_ext_csd[EXT_CSD_SEC_CNT + 3]));
 	if (err)
 		err = -EINVAL;
 
@@ -770,7 +794,6 @@ static int mmc_init_card(struct mmc_host *host, u32 ocr,
 				 */
 				if (!(host->caps & MMC_CAP_BUS_WIDTH_TEST))
 					err = mmc_compare_ext_csds(card,
-						ext_csd,
 						bus_width);
 				else
 					err = mmc_bus_test(card, bus_width);
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index c6927a4d157f..6ad43554ac05 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -64,6 +64,19 @@ struct mmc_ext_csd {
 	unsigned long long	enhanced_area_offset;	/* Units: Byte */
 	unsigned int		enhanced_area_size;	/* Units: KB */
 	unsigned int		boot_size;		/* in bytes */
+	u8			raw_partition_support;	/* 160 */
+	u8			raw_erased_mem_count;	/* 181 */
+	u8			raw_ext_csd_structure;	/* 194 */
+	u8			raw_card_type;		/* 196 */
+	u8			raw_s_a_timeout;		/* 217 */
+	u8			raw_hc_erase_gap_size;	/* 221 */
+	u8			raw_erase_timeout_mult;	/* 223 */
+	u8			raw_hc_erase_grp_size;	/* 224 */
+	u8			raw_sec_trim_mult;	/* 229 */
+	u8			raw_sec_erase_mult;	/* 230 */
+	u8			raw_sec_feature_support;/* 231 */
+	u8			raw_trim_mult;		/* 232 */
+	u8			raw_sectors[4];		/* 212 - 4 bytes */
 };
 
 struct sd_scr {
-- 
cgit v1.2.3


From c9aaa8957f203bd6df83b002fb40b98390bed078 Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@redhat.com>
Date: Mon, 11 Jul 2011 15:28:14 -0400
Subject: KVM: Steal time implementation

To implement steal time, we need the hypervisor to pass the guest
information about how much time was spent running other processes
outside the VM, while the vcpu had meaningful work to do - halt
time does not count.

This information is acquired through the run_delay field of
delayacct/schedstats infrastructure, that counts time spent in a
runqueue but not running.

Steal time is a per-cpu information, so the traditional MSR-based
infrastructure is used. A new msr, KVM_MSR_STEAL_TIME, holds the
memory area address containing information about steal time

This patch contains the hypervisor part of the steal time infrasructure,
and can be backported independently of the guest portion.

[avi, yongjie: export delayacct_on, to avoid build failures in some configs]

Signed-off-by: Glauber Costa <glommer@redhat.com>
Tested-by: Eric B Munson <emunson@mgebm.net>
CC: Rik van Riel <riel@redhat.com>
CC: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Anthony Liguori <aliguori@us.ibm.com>
Signed-off-by: Yongjie Ren <yongjie.ren@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  9 +++++
 arch/x86/include/asm/kvm_para.h |  4 +++
 arch/x86/kvm/Kconfig            |  1 +
 arch/x86/kvm/x86.c              | 74 +++++++++++++++++++++++++++++++++++++++--
 include/linux/kvm_host.h        |  1 +
 kernel/delayacct.c              |  2 ++
 6 files changed, 89 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index da6bbee878ca..59086a77ff13 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -389,6 +389,15 @@ struct kvm_vcpu_arch {
 	unsigned int hw_tsc_khz;
 	unsigned int time_offset;
 	struct page *time_page;
+
+	struct {
+		u64 msr_val;
+		u64 last_steal;
+		u64 accum_steal;
+		struct gfn_to_hva_cache stime;
+		struct kvm_steal_time steal;
+	} st;
+
 	u64 last_guest_tsc;
 	u64 last_kernel_ns;
 	u64 last_tsc_nsec;
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 65f8bb9279e0..c484ba8e05ea 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -45,6 +45,10 @@ struct kvm_steal_time {
 	__u32 pad[12];
 };
 
+#define KVM_STEAL_ALIGNMENT_BITS 5
+#define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1)))
+#define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1)
+
 #define KVM_MAX_MMU_OP_BATCH           32
 
 #define KVM_ASYNC_PF_ENABLED			(1 << 0)
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 50f63648ce1b..99c3f0589faa 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -31,6 +31,7 @@ config KVM
 	select KVM_ASYNC_PF
 	select USER_RETURN_NOTIFIER
 	select KVM_MMIO
+	select TASK_DELAY_ACCT
 	---help---
 	  Support hosting fully virtualized guest machines using hardware
 	  virtualization extensions.  You will need a fairly recent
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0b803f04bde7..c96cdc092484 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -808,12 +808,12 @@ EXPORT_SYMBOL_GPL(kvm_get_dr);
  * kvm-specific. Those are put in the beginning of the list.
  */
 
-#define KVM_SAVE_MSRS_BEGIN	8
+#define KVM_SAVE_MSRS_BEGIN	9
 static u32 msrs_to_save[] = {
 	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
 	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
 	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
-	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN,
+	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
 	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
 	MSR_STAR,
 #ifdef CONFIG_X86_64
@@ -1488,6 +1488,35 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu)
 	}
 }
 
+static void accumulate_steal_time(struct kvm_vcpu *vcpu)
+{
+	u64 delta;
+
+	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+		return;
+
+	delta = current->sched_info.run_delay - vcpu->arch.st.last_steal;
+	vcpu->arch.st.last_steal = current->sched_info.run_delay;
+	vcpu->arch.st.accum_steal = delta;
+}
+
+static void record_steal_time(struct kvm_vcpu *vcpu)
+{
+	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+		return;
+
+	if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+		&vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
+		return;
+
+	vcpu->arch.st.steal.steal += vcpu->arch.st.accum_steal;
+	vcpu->arch.st.steal.version += 2;
+	vcpu->arch.st.accum_steal = 0;
+
+	kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+		&vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
+}
+
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
 	switch (msr) {
@@ -1570,6 +1599,33 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		if (kvm_pv_enable_async_pf(vcpu, data))
 			return 1;
 		break;
+	case MSR_KVM_STEAL_TIME:
+
+		if (unlikely(!sched_info_on()))
+			return 1;
+
+		if (data & KVM_STEAL_RESERVED_MASK)
+			return 1;
+
+		if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
+							data & KVM_STEAL_VALID_BITS))
+			return 1;
+
+		vcpu->arch.st.msr_val = data;
+
+		if (!(data & KVM_MSR_ENABLED))
+			break;
+
+		vcpu->arch.st.last_steal = current->sched_info.run_delay;
+
+		preempt_disable();
+		accumulate_steal_time(vcpu);
+		preempt_enable();
+
+		kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
+
+		break;
+
 	case MSR_IA32_MCG_CTL:
 	case MSR_IA32_MCG_STATUS:
 	case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
@@ -1855,6 +1911,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case MSR_KVM_ASYNC_PF_EN:
 		data = vcpu->arch.apf.msr_val;
 		break;
+	case MSR_KVM_STEAL_TIME:
+		data = vcpu->arch.st.msr_val;
+		break;
 	case MSR_IA32_P5_MC_ADDR:
 	case MSR_IA32_P5_MC_TYPE:
 	case MSR_IA32_MCG_CAP:
@@ -2166,6 +2225,9 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 			kvm_migrate_timers(vcpu);
 		vcpu->cpu = cpu;
 	}
+
+	accumulate_steal_time(vcpu);
+	kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -2487,6 +2549,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 			     (1 << KVM_FEATURE_CLOCKSOURCE2) |
 			     (1 << KVM_FEATURE_ASYNC_PF) |
 			     (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
+
+		if (sched_info_on())
+			entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
+
 		entry->ebx = 0;
 		entry->ecx = 0;
 		entry->edx = 0;
@@ -5470,6 +5536,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			r = 1;
 			goto out;
 		}
+		if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
+			record_steal_time(vcpu);
+
 	}
 
 	r = kvm_mmu_reload(vcpu);
@@ -6206,6 +6275,7 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
 
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 	vcpu->arch.apf.msr_val = 0;
+	vcpu->arch.st.msr_val = 0;
 
 	kvmclock_reset(vcpu);
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index f7df0a3b031d..c8e023902f79 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -47,6 +47,7 @@
 #define KVM_REQ_DEACTIVATE_FPU    10
 #define KVM_REQ_EVENT             11
 #define KVM_REQ_APF_HALT          12
+#define KVM_REQ_STEAL_UPDATE      13
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID	0
 
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index ead9b610aa71..418b3f7053aa 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -19,8 +19,10 @@
 #include <linux/time.h>
 #include <linux/sysctl.h>
 #include <linux/delayacct.h>
+#include <linux/module.h>
 
 int delayacct_on __read_mostly = 1;	/* Delay accounting turned on/off */
+EXPORT_SYMBOL_GPL(delayacct_on);
 struct kmem_cache *delayacct_cache;
 
 static int __init delayacct_setup_disable(char *str)
-- 
cgit v1.2.3


From f6b72b6217f8c24f2a54988e58af858b4e66024d Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 14 Jul 2011 07:53:20 -0700
Subject: net: Embed hh_cache inside of struct neighbour.

Now that there is a one-to-one correspondance between neighbour
and hh_cache entries, we no longer need:

1) dynamic allocation
2) attachment to dst->hh
3) refcounting

Initialization of the hh_cache entry is indicated by hh_len
being non-zero, and such initialization is always done with
the neighbour's lock held as a writer.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 15 +--------
 include/net/dst.h         | 18 +++++------
 include/net/neighbour.h   |  2 +-
 net/bridge/br_netfilter.c |  6 ++--
 net/core/dst.c            |  7 ----
 net/core/neighbour.c      | 81 +++++++++++++----------------------------------
 net/ipv4/ip_output.c      | 14 +++++---
 net/ipv4/route.c          |  7 ++--
 net/ipv6/ip6_output.c     | 14 +++++---
 9 files changed, 59 insertions(+), 105 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 75382378a1ba..5ccc0cb8352b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -252,14 +252,7 @@ struct netdev_hw_addr_list {
 	netdev_hw_addr_list_for_each(ha, &(dev)->mc)
 
 struct hh_cache {
-	atomic_t	hh_refcnt;	/* number of users                   */
-/*
- * We want hh_output, hh_len, hh_lock and hh_data be a in a separate
- * cache line on SMP.
- * They are mostly read, but hh_refcnt may be changed quite frequently,
- * incurring cache line ping pongs.
- */
-	u16		hh_len ____cacheline_aligned_in_smp;
+	u16		hh_len;
 	u16		__pad;
 	int		(*hh_output)(struct sk_buff *skb);
 	seqlock_t	hh_lock;
@@ -273,12 +266,6 @@ struct hh_cache {
 	unsigned long	hh_data[HH_DATA_ALIGN(LL_MAX_HEADER) / sizeof(long)];
 };
 
-static inline void hh_cache_put(struct hh_cache *hh)
-{
-	if (atomic_dec_and_test(&hh->hh_refcnt))
-		kfree(hh);
-}
-
 /* Reserve HH_DATA_MOD byte aligned hard_header_len, but at least that much.
  * Alternative is:
  *   dev->hard_header_len ? (dev->hard_header_len +
diff --git a/include/net/dst.h b/include/net/dst.h
index e12ddfb9eb16..0dd7ccbc0dd5 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -38,7 +38,6 @@ struct dst_entry {
 	unsigned long		expires;
 	struct dst_entry	*path;
 	struct neighbour	*neighbour;
-	struct hh_cache		*hh;
 #ifdef CONFIG_XFRM
 	struct xfrm_state	*xfrm;
 #else
@@ -47,6 +46,14 @@ struct dst_entry {
 	int			(*input)(struct sk_buff*);
 	int			(*output)(struct sk_buff*);
 
+	int			flags;
+#define DST_HOST		0x0001
+#define DST_NOXFRM		0x0002
+#define DST_NOPOLICY		0x0004
+#define DST_NOHASH		0x0008
+#define DST_NOCACHE		0x0010
+#define DST_NOCOUNT		0x0020
+
 	short			error;
 	short			obsolete;
 	unsigned short		header_len;	/* more space at head required */
@@ -62,7 +69,7 @@ struct dst_entry {
 	 * (L1_CACHE_SIZE would be too much)
 	 */
 #ifdef CONFIG_64BIT
-	long			__pad_to_align_refcnt[1];
+	long			__pad_to_align_refcnt[2];
 #endif
 	/*
 	 * __refcnt wants to be on a different cache line from
@@ -71,13 +78,6 @@ struct dst_entry {
 	atomic_t		__refcnt;	/* client references	*/
 	int			__use;
 	unsigned long		lastuse;
-	int			flags;
-#define DST_HOST		0x0001
-#define DST_NOXFRM		0x0002
-#define DST_NOPOLICY		0x0004
-#define DST_NOHASH		0x0008
-#define DST_NOCACHE		0x0010
-#define DST_NOCOUNT		0x0020
 	union {
 		struct dst_entry	*next;
 		struct rtable __rcu	*rt_next;
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 6fe8c2cd5acb..bd8f9f09ab5c 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -108,7 +108,7 @@ struct neighbour {
 	__u8			dead;
 	seqlock_t		ha_lock;
 	unsigned char		ha[ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))];
-	struct hh_cache		*hh;
+	struct hh_cache		hh;
 	int			(*output)(struct sk_buff *skb);
 	const struct neigh_ops	*ops;
 	struct rcu_head		rcu;
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 56149ec36d7f..75ee421917c7 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -343,14 +343,16 @@ static int br_nf_pre_routing_finish_ipv6(struct sk_buff *skb)
 static int br_nf_pre_routing_finish_bridge(struct sk_buff *skb)
 {
 	struct nf_bridge_info *nf_bridge = skb->nf_bridge;
+	struct neighbour *neigh;
 	struct dst_entry *dst;
 
 	skb->dev = bridge_parent(skb->dev);
 	if (!skb->dev)
 		goto free_skb;
 	dst = skb_dst(skb);
-	if (dst->hh) {
-		neigh_hh_bridge(dst->hh, skb);
+	neigh = dst->neighbour;
+	if (neigh->hh.hh_len) {
+		neigh_hh_bridge(&neigh->hh, skb);
 		skb->dev = nf_bridge->physindev;
 		return br_handle_frame_finish(skb);
 	} else if (dst->neighbour) {
diff --git a/net/core/dst.c b/net/core/dst.c
index 6135f3671692..4aacc14936a0 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -172,7 +172,6 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
 	dst->expires = 0UL;
 	dst->path = dst;
 	dst->neighbour = NULL;
-	dst->hh = NULL;
 #ifdef CONFIG_XFRM
 	dst->xfrm = NULL;
 #endif
@@ -226,19 +225,13 @@ struct dst_entry *dst_destroy(struct dst_entry * dst)
 {
 	struct dst_entry *child;
 	struct neighbour *neigh;
-	struct hh_cache *hh;
 
 	smp_rmb();
 
 again:
 	neigh = dst->neighbour;
-	hh = dst->hh;
 	child = dst->child;
 
-	dst->hh = NULL;
-	if (hh)
-		hh_cache_put(hh);
-
 	if (neigh) {
 		dst->neighbour = NULL;
 		neigh_release(neigh);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index f879bb552994..77a399f2ad03 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -297,6 +297,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl)
 	n->updated	  = n->used = now;
 	n->nud_state	  = NUD_NONE;
 	n->output	  = neigh_blackhole;
+	seqlock_init(&n->hh.hh_lock);
 	n->parms	  = neigh_parms_clone(&tbl->parms);
 	setup_timer(&n->timer, neigh_timer_handler, (unsigned long)n);
 
@@ -702,14 +703,11 @@ void neigh_destroy(struct neighbour *neigh)
 	if (neigh_del_timer(neigh))
 		printk(KERN_WARNING "Impossible event.\n");
 
-	hh = neigh->hh;
-	if (hh) {
-		neigh->hh = NULL;
-
+	hh = &neigh->hh;
+	if (hh->hh_len) {
 		write_seqlock_bh(&hh->hh_lock);
 		hh->hh_output = neigh_blackhole;
 		write_sequnlock_bh(&hh->hh_lock);
-		hh_cache_put(hh);
 	}
 
 	skb_queue_purge(&neigh->arp_queue);
@@ -737,8 +735,8 @@ static void neigh_suspect(struct neighbour *neigh)
 
 	neigh->output = neigh->ops->output;
 
-	hh = neigh->hh;
-	if (hh)
+	hh = &neigh->hh;
+	if (hh->hh_len)
 		hh->hh_output = neigh->ops->output;
 }
 
@@ -755,8 +753,8 @@ static void neigh_connect(struct neighbour *neigh)
 
 	neigh->output = neigh->ops->connected_output;
 
-	hh = neigh->hh;
-	if (hh)
+	hh = &neigh->hh;
+	if (hh->hh_len)
 		hh->hh_output = neigh->ops->hh_output;
 }
 
@@ -1017,7 +1015,7 @@ out_unlock_bh:
 }
 EXPORT_SYMBOL(__neigh_event_send);
 
-static void neigh_update_hhs(const struct neighbour *neigh)
+static void neigh_update_hhs(struct neighbour *neigh)
 {
 	struct hh_cache *hh;
 	void (*update)(struct hh_cache*, const struct net_device*, const unsigned char *)
@@ -1027,8 +1025,8 @@ static void neigh_update_hhs(const struct neighbour *neigh)
 		update = neigh->dev->header_ops->cache_update;
 
 	if (update) {
-		hh = neigh->hh;
-		if (hh) {
+		hh = &neigh->hh;
+		if (hh->hh_len) {
 			write_seqlock_bh(&hh->hh_lock);
 			update(hh, neigh->dev, neigh->ha);
 			write_sequnlock_bh(&hh->hh_lock);
@@ -1214,62 +1212,29 @@ struct neighbour *neigh_event_ns(struct neigh_table *tbl,
 }
 EXPORT_SYMBOL(neigh_event_ns);
 
-static inline bool neigh_hh_lookup(struct neighbour *n, struct dst_entry *dst)
-{
-	struct hh_cache *hh;
-
-	smp_rmb(); /* paired with smp_wmb() in neigh_hh_init() */
-	hh = n->hh;
-	if (hh) {
-		atomic_inc(&hh->hh_refcnt);
-		if (unlikely(cmpxchg(&dst->hh, NULL, hh) != NULL))
-			hh_cache_put(hh);
-		return true;
-	}
-	return false;
-}
-
 /* called with read_lock_bh(&n->lock); */
-static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst,
-			  __be16 protocol)
+static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst)
 {
-	struct hh_cache	*hh;
 	struct net_device *dev = dst->dev;
-
-	if (likely(neigh_hh_lookup(n, dst)))
-		return;
-
-	/* slow path */
-	hh = kzalloc(sizeof(*hh), GFP_ATOMIC);
-	if (!hh)
-		return;
-
-	seqlock_init(&hh->hh_lock);
-	atomic_set(&hh->hh_refcnt, 2);
-
-	if (dev->header_ops->cache(n, hh, protocol)) {
-		kfree(hh);
-		return;
-	}
+	__be16 prot = dst->ops->protocol;
+	struct hh_cache	*hh = &n->hh;
 
 	write_lock_bh(&n->lock);
 
-	/* must check if another thread already did the insert */
-	if (neigh_hh_lookup(n, dst)) {
-		kfree(hh);
+	/* Only one thread can come in here and initialize the
+	 * hh_cache entry.
+	 */
+	if (hh->hh_len)
+		goto end;
+
+	if (dev->header_ops->cache(n, hh, prot))
 		goto end;
-	}
 
 	if (n->nud_state & NUD_CONNECTED)
 		hh->hh_output = n->ops->hh_output;
 	else
 		hh->hh_output = n->ops->output;
 
-	smp_wmb(); /* paired with smp_rmb() in neigh_hh_lookup() */
-	n->hh	    = hh;
-
-	if (unlikely(cmpxchg(&dst->hh, NULL, hh) != NULL))
-		hh_cache_put(hh);
 end:
 	write_unlock_bh(&n->lock);
 }
@@ -1312,10 +1277,8 @@ int neigh_resolve_output(struct sk_buff *skb)
 		struct net_device *dev = neigh->dev;
 		unsigned int seq;
 
-		if (dev->header_ops->cache &&
-		    !dst->hh &&
-		    !(dst->flags & DST_NOCACHE))
-			neigh_hh_init(neigh, dst, dst->ops->protocol);
+		if (dev->header_ops->cache && !neigh->hh.hh_len)
+			neigh_hh_init(neigh, dst);
 
 		do {
 			seq = read_seqbegin(&neigh->ha_lock);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 54119d5aae8f..a621b96aed15 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -182,6 +182,7 @@ static inline int ip_finish_output2(struct sk_buff *skb)
 	struct rtable *rt = (struct rtable *)dst;
 	struct net_device *dev = dst->dev;
 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
+	struct neighbour *neigh;
 
 	if (rt->rt_type == RTN_MULTICAST) {
 		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
@@ -203,11 +204,14 @@ static inline int ip_finish_output2(struct sk_buff *skb)
 		skb = skb2;
 	}
 
-	if (dst->hh)
-		return neigh_hh_output(dst->hh, skb);
-	else if (dst->neighbour)
-		return dst->neighbour->output(skb);
-
+	neigh = dst->neighbour;
+	if (neigh) {
+		struct hh_cache *hh = &neigh->hh;
+		if (hh->hh_len)
+			return neigh_hh_output(hh, skb);
+		else
+			return dst->neighbour->output(skb);
+	}
 	if (net_ratelimit())
 		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
 	kfree_skb(skb);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index c6388e825ed3..a52bb74d2612 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -426,9 +426,10 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
 			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
 			      dst_metric(&r->dst, RTAX_RTTVAR)),
 			r->rt_key_tos,
-			r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
-			r->dst.hh ? (r->dst.hh->hh_output ==
-				       dev_queue_xmit) : 0,
+			-1,
+			(r->dst.neighbour ?
+			 (r->dst.neighbour->hh.hh_output ==
+			  dev_queue_xmit) : 0),
 			r->rt_spec_dst, &len);
 
 		seq_printf(seq, "%*s\n", 127 - len, "");
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 9d4b165837d6..f0f144cac0bd 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -100,6 +100,7 @@ static int ip6_finish_output2(struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
 	struct net_device *dev = dst->dev;
+	struct neighbour *neigh;
 
 	skb->protocol = htons(ETH_P_IPV6);
 	skb->dev = dev;
@@ -134,11 +135,14 @@ static int ip6_finish_output2(struct sk_buff *skb)
 				skb->len);
 	}
 
-	if (dst->hh)
-		return neigh_hh_output(dst->hh, skb);
-	else if (dst->neighbour)
-		return dst->neighbour->output(skb);
-
+	neigh = dst->neighbour;
+	if (neigh) {
+		struct hh_cache *hh = &neigh->hh;
+		if (hh->hh_len)
+			return neigh_hh_output(hh, skb);
+		else
+			return dst->neighbour->output(skb);
+	}
 	IP6_INC_STATS_BH(dev_net(dst->dev),
 			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 	kfree_skb(skb);
-- 
cgit v1.2.3


From 0bc5b2debb832191a42baea7ff59d2ca6ce9f7d5 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Thu, 14 Jul 2011 20:59:07 +0200
Subject: ARM / shmobile: Use genpd_queue_power_off_work()

Make pd_power_down_a3rv() use genpd_queue_power_off_work() to queue
up the powering off of the A4LC domain to avoid queuing it up when
it is pending.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Magnus Damm <damm@opensource.se>
---
 arch/arm/mach-shmobile/pm-sh7372.c | 2 +-
 drivers/base/power/domain.c        | 2 +-
 include/linux/pm_domain.h          | 2 ++
 3 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-shmobile/pm-sh7372.c b/arch/arm/mach-shmobile/pm-sh7372.c
index f47281a57d43..0b07138908b7 100644
--- a/arch/arm/mach-shmobile/pm-sh7372.c
+++ b/arch/arm/mach-shmobile/pm-sh7372.c
@@ -107,7 +107,7 @@ static int pd_power_down_a3rv(struct generic_pm_domain *genpd)
 
 	/* try to power down A4LC after A3RV is requested off */
 	pm_genpd_poweron(&sh7372_a4lc.genpd);
-	queue_work(pm_wq, &sh7372_a4lc.genpd.power_off_work);
+	genpd_queue_power_off_work(&sh7372_a4lc.genpd);
 
 	return ret;
 }
diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index c2c537de22b6..00ed4f32a4de 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -222,7 +222,7 @@ static bool genpd_abort_poweroff(struct generic_pm_domain *genpd)
  * Queue up the execution of pm_genpd_poweroff() unless it's already been done
  * before.
  */
-static void genpd_queue_power_off_work(struct generic_pm_domain *genpd)
+void genpd_queue_power_off_work(struct generic_pm_domain *genpd)
 {
 	if (!work_pending(&genpd->power_off_work))
 		queue_work(pm_wq, &genpd->power_off_work);
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index 3e4f3d308f5e..21097cb086fe 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -73,6 +73,7 @@ extern void pm_genpd_init(struct generic_pm_domain *genpd,
 			  struct dev_power_governor *gov, bool is_off);
 extern int pm_genpd_poweron(struct generic_pm_domain *genpd);
 extern void pm_genpd_poweroff_unused(void);
+extern void genpd_queue_power_off_work(struct generic_pm_domain *genpd);
 #else
 static inline int pm_genpd_add_device(struct generic_pm_domain *genpd,
 				      struct device *dev)
@@ -101,6 +102,7 @@ static inline int pm_genpd_poweron(struct generic_pm_domain *genpd)
 	return -ENOSYS;
 }
 static inline void pm_genpd_poweroff_unused(void) {}
+static inline void genpd_queue_power_off_work(struct generic_pm_domain *gpd) {}
 #endif
 
 #endif /* _LINUX_PM_DOMAIN_H */
-- 
cgit v1.2.3


From 4a9bd3f134decd6d16ead8d288342d57aad486be Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 14 Jul 2011 16:36:53 -0400
Subject: tracing: Have dynamic size event stack traces

Currently the stack trace per event in ftace is only 8 frames.
This can be quite limiting and sometimes useless. Especially when
the "ignore frames" is wrong and we also use up stack frames for
the event processing itself.

Change this to be dynamic by adding a percpu buffer that we can
write a large stack frame into and then copy into the ring buffer.

For interrupts and NMIs that come in while another event is being
process, will only get to use the 8 frame stack. That should be enough
as the task that it interrupted will have the full stack frame anyway.

Requested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace_event.h |  1 +
 kernel/trace/trace.c         | 92 +++++++++++++++++++++++++++++++++++++-------
 kernel/trace/trace_entries.h |  3 +-
 kernel/trace/trace_output.c  | 11 +++---
 4 files changed, 88 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index b1e69eefc203..96efa6794ea5 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -76,6 +76,7 @@ struct trace_iterator {
 	struct trace_entry	*ent;
 	unsigned long		lost_events;
 	int			leftover;
+	int			ent_size;
 	int			cpu;
 	u64			ts;
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d9c16123f6e2..e5df02c69b1d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1248,6 +1248,15 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data,
 }
 
 #ifdef CONFIG_STACKTRACE
+
+#define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long))
+struct ftrace_stack {
+	unsigned long		calls[FTRACE_STACK_MAX_ENTRIES];
+};
+
+static DEFINE_PER_CPU(struct ftrace_stack, ftrace_stack);
+static DEFINE_PER_CPU(int, ftrace_stack_reserve);
+
 static void __ftrace_trace_stack(struct ring_buffer *buffer,
 				 unsigned long flags,
 				 int skip, int pc, struct pt_regs *regs)
@@ -1256,25 +1265,77 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
 	struct ring_buffer_event *event;
 	struct stack_entry *entry;
 	struct stack_trace trace;
+	int use_stack;
+	int size = FTRACE_STACK_ENTRIES;
+
+	trace.nr_entries	= 0;
+	trace.skip		= skip;
+
+	/*
+	 * Since events can happen in NMIs there's no safe way to
+	 * use the per cpu ftrace_stacks. We reserve it and if an interrupt
+	 * or NMI comes in, it will just have to use the default
+	 * FTRACE_STACK_SIZE.
+	 */
+	preempt_disable_notrace();
+
+	use_stack = ++__get_cpu_var(ftrace_stack_reserve);
+	/*
+	 * We don't need any atomic variables, just a barrier.
+	 * If an interrupt comes in, we don't care, because it would
+	 * have exited and put the counter back to what we want.
+	 * We just need a barrier to keep gcc from moving things
+	 * around.
+	 */
+	barrier();
+	if (use_stack == 1) {
+		trace.entries		= &__get_cpu_var(ftrace_stack).calls[0];
+		trace.max_entries	= FTRACE_STACK_MAX_ENTRIES;
+
+		if (regs)
+			save_stack_trace_regs(regs, &trace);
+		else
+			save_stack_trace(&trace);
+
+		if (trace.nr_entries > size)
+			size = trace.nr_entries;
+	} else
+		/* From now on, use_stack is a boolean */
+		use_stack = 0;
+
+	size *= sizeof(unsigned long);
 
 	event = trace_buffer_lock_reserve(buffer, TRACE_STACK,
-					  sizeof(*entry), flags, pc);
+					  sizeof(*entry) + size, flags, pc);
 	if (!event)
-		return;
-	entry	= ring_buffer_event_data(event);
-	memset(&entry->caller, 0, sizeof(entry->caller));
+		goto out;
+	entry = ring_buffer_event_data(event);
 
-	trace.nr_entries	= 0;
-	trace.max_entries	= FTRACE_STACK_ENTRIES;
-	trace.skip		= skip;
-	trace.entries		= entry->caller;
+	memset(&entry->caller, 0, size);
+
+	if (use_stack)
+		memcpy(&entry->caller, trace.entries,
+		       trace.nr_entries * sizeof(unsigned long));
+	else {
+		trace.max_entries	= FTRACE_STACK_ENTRIES;
+		trace.entries		= entry->caller;
+		if (regs)
+			save_stack_trace_regs(regs, &trace);
+		else
+			save_stack_trace(&trace);
+	}
+
+	entry->size = trace.nr_entries;
 
-	if (regs)
-		save_stack_trace_regs(regs, &trace);
-	else
-		save_stack_trace(&trace);
 	if (!filter_check_discard(call, entry, buffer, event))
 		ring_buffer_unlock_commit(buffer, event);
+
+ out:
+	/* Again, don't let gcc optimize things here */
+	barrier();
+	__get_cpu_var(ftrace_stack_reserve)--;
+	preempt_enable_notrace();
+
 }
 
 void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags,
@@ -1562,7 +1623,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
 
 	ftrace_enable_cpu();
 
-	return event ? ring_buffer_event_data(event) : NULL;
+	if (event) {
+		iter->ent_size = ring_buffer_event_length(event);
+		return ring_buffer_event_data(event);
+	}
+	iter->ent_size = 0;
+	return NULL;
 }
 
 static struct trace_entry *
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index e32744c84d94..93365907f219 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -161,7 +161,8 @@ FTRACE_ENTRY(kernel_stack, stack_entry,
 	TRACE_STACK,
 
 	F_STRUCT(
-		__array(	unsigned long,	caller, FTRACE_STACK_ENTRIES	)
+		__field(	int,		size	)
+		__dynamic_array(unsigned long,	caller	)
 	),
 
 	F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index e37de492a9e1..51999309a6cf 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1107,19 +1107,20 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
 {
 	struct stack_entry *field;
 	struct trace_seq *s = &iter->seq;
-	int i;
+	unsigned long *p;
+	unsigned long *end;
 
 	trace_assign_type(field, iter->ent);
+	end = (unsigned long *)((long)iter->ent + iter->ent_size);
 
 	if (!trace_seq_puts(s, "<stack trace>\n"))
 		goto partial;
-	for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
-		if (!field->caller[i] || (field->caller[i] == ULONG_MAX))
-			break;
+
+	for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) {
 		if (!trace_seq_puts(s, " => "))
 			goto partial;
 
-		if (!seq_print_ip_sym(s, field->caller[i], flags))
+		if (!seq_print_ip_sym(s, *p, flags))
 			goto partial;
 		if (!trace_seq_puts(s, "\n"))
 			goto partial;
-- 
cgit v1.2.3


From fec30c33819b442fd618b10f405248ee7cbb51d8 Mon Sep 17 00:00:00 2001
From: Michał Mirosław <mirq-linux@rere.qmqm.pl>
Date: Wed, 13 Jul 2011 14:10:30 +0000
Subject: net: unexport netdev_fix_features()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It is not used anywhere except net/core/dev.c now.

Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 1 -
 net/core/dev.c            | 3 +--
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5ccc0cb8352b..f84dfd25c431 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2529,7 +2529,6 @@ static inline u32 netdev_get_wanted_features(struct net_device *dev)
 	return (dev->features & ~dev->hw_features) | dev->wanted_features;
 }
 u32 netdev_increment_features(u32 all, u32 one, u32 mask);
-u32 netdev_fix_features(struct net_device *dev, u32 features);
 int __netdev_update_features(struct net_device *dev);
 void netdev_update_features(struct net_device *dev);
 void netdev_change_features(struct net_device *dev);
diff --git a/net/core/dev.c b/net/core/dev.c
index e57be0262051..9444c5cb4137 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5209,7 +5209,7 @@ static void rollback_registered(struct net_device *dev)
 	list_del(&single);
 }
 
-u32 netdev_fix_features(struct net_device *dev, u32 features)
+static u32 netdev_fix_features(struct net_device *dev, u32 features)
 {
 	/* Fix illegal checksum combinations */
 	if ((features & NETIF_F_HW_CSUM) &&
@@ -5268,7 +5268,6 @@ u32 netdev_fix_features(struct net_device *dev, u32 features)
 
 	return features;
 }
-EXPORT_SYMBOL(netdev_fix_features);
 
 int __netdev_update_features(struct net_device *dev)
 {
-- 
cgit v1.2.3


From e20e6940736fb7b4dd024933f7456b9da4c44118 Mon Sep 17 00:00:00 2001
From: Michał Mirosław <mirq-linux@rere.qmqm.pl>
Date: Thu, 14 Jul 2011 14:45:59 -0700
Subject: net: remove SK_ROUTE_CAPS from meta ematch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove it, as it indirectly exposes netdev features. It's not used in
iproute2 (2.6.38) - is anything else using its interface?

Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tc_ematch/tc_em_meta.h | 2 +-
 net/sched/em_meta.c                  | 7 -------
 2 files changed, 1 insertion(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tc_ematch/tc_em_meta.h b/include/linux/tc_ematch/tc_em_meta.h
index 7138962664f8..b11f8ce2d3c0 100644
--- a/include/linux/tc_ematch/tc_em_meta.h
+++ b/include/linux/tc_ematch/tc_em_meta.h
@@ -67,7 +67,7 @@ enum {
 	TCF_META_ID_SK_FORWARD_ALLOCS,
 	TCF_META_ID_SK_SNDBUF,
  	TCF_META_ID_SK_ALLOCS,
- 	TCF_META_ID_SK_ROUTE_CAPS,
+	__TCF_META_ID_SK_ROUTE_CAPS,	/* unimplemented but in ABI already */
  	TCF_META_ID_SK_HASH,
  	TCF_META_ID_SK_LINGERTIME,
  	TCF_META_ID_SK_ACK_BACKLOG,
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index 49130e8abff0..1363bf14e61b 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -404,12 +404,6 @@ META_COLLECTOR(int_sk_alloc)
 	dst->value = (__force int) skb->sk->sk_allocation;
 }
 
-META_COLLECTOR(int_sk_route_caps)
-{
-	SKIP_NONLOCAL(skb);
-	dst->value = skb->sk->sk_route_caps;
-}
-
 META_COLLECTOR(int_sk_hash)
 {
 	SKIP_NONLOCAL(skb);
@@ -530,7 +524,6 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX + 1][TCF_META_ID_MAX + 1] =
 		[META_ID(SK_ERR_QLEN)]		= META_FUNC(int_sk_err_qlen),
 		[META_ID(SK_FORWARD_ALLOCS)]	= META_FUNC(int_sk_fwd_alloc),
 		[META_ID(SK_ALLOCS)]		= META_FUNC(int_sk_alloc),
-		[META_ID(SK_ROUTE_CAPS)]	= META_FUNC(int_sk_route_caps),
 		[META_ID(SK_HASH)]		= META_FUNC(int_sk_hash),
 		[META_ID(SK_LINGERTIME)]	= META_FUNC(int_sk_lingertime),
 		[META_ID(SK_ACK_BACKLOG)]	= META_FUNC(int_sk_ack_bl),
-- 
cgit v1.2.3


From 62f2a3a48bdc99822a24356e667e52c30df287c9 Mon Sep 17 00:00:00 2001
From: Michał Mirosław <mirq-linux@rere.qmqm.pl>
Date: Wed, 13 Jul 2011 14:10:29 +0000
Subject: net: remove NETIF_F_ALL_TX_OFFLOADS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is no software fallback implemented for SCTP or FCoE checksumming,
and so it should not be passed on by software devices like bridge or bonding.

For VLAN devices, this is different. First, the driver for underlying device
should be prepared to get offloaded packets even when the feature is disabled
(especially if it advertises it in vlan_features). Second, devices under
VLANs do not get replaced without tearing down the VLAN first.

This fixes a mess I accidentally introduced while converting bonding to
ndo_fix_features.

NETIF_F_SOFT_FEATURES are removed from BOND_VLAN_FEATURES because they
are unused as of commit 712ae51afd.

Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c | 6 +++---
 include/linux/netdevice.h       | 6 ------
 net/8021q/vlan_dev.c            | 6 +++++-
 3 files changed, 8 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index eafe44a528ac..63c22b0bb5ad 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1428,9 +1428,9 @@ out:
 	return features;
 }
 
-#define BOND_VLAN_FEATURES	(NETIF_F_ALL_TX_OFFLOADS | \
-				 NETIF_F_SOFT_FEATURES | \
-				 NETIF_F_LRO)
+#define BOND_VLAN_FEATURES	(NETIF_F_ALL_CSUM | NETIF_F_SG | \
+				 NETIF_F_FRAGLIST | NETIF_F_ALL_TSO | \
+				 NETIF_F_HIGHDMA | NETIF_F_LRO)
 
 static void bond_compute_features(struct bonding *bond)
 {
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 54b8b4d7b68f..9e19477991ad 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1097,12 +1097,6 @@ struct net_device {
 #define NETIF_F_ALL_FCOE	(NETIF_F_FCOE_CRC | NETIF_F_FCOE_MTU | \
 				 NETIF_F_FSO)
 
-#define NETIF_F_ALL_TX_OFFLOADS	(NETIF_F_ALL_CSUM | NETIF_F_SG | \
-				 NETIF_F_FRAGLIST | NETIF_F_ALL_TSO | \
-				 NETIF_F_HIGHDMA | \
-				 NETIF_F_SCTP_CSUM | \
-				 NETIF_F_ALL_FCOE)
-
 	/*
 	 * If one device supports one of these features, then enable them
 	 * for all in netdev_increment_features.
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 86bff9b1ac47..6e82148edfc8 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -528,7 +528,11 @@ static int vlan_dev_init(struct net_device *dev)
 					  (1<<__LINK_STATE_DORMANT))) |
 		      (1<<__LINK_STATE_PRESENT);
 
-	dev->hw_features = NETIF_F_ALL_TX_OFFLOADS;
+	dev->hw_features = NETIF_F_ALL_CSUM | NETIF_F_SG |
+			   NETIF_F_FRAGLIST | NETIF_F_ALL_TSO |
+			   NETIF_F_HIGHDMA | NETIF_F_SCTP_CSUM |
+			   NETIF_F_ALL_FCOE;
+
 	dev->features |= real_dev->vlan_features | NETIF_F_LLTX;
 	dev->gso_max_size = real_dev->gso_max_size;
 
-- 
cgit v1.2.3


From 574c44fa8fa6262ffd5939789ef51a6e98ed62d7 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@mit.edu>
Date: Wed, 13 Jul 2011 09:24:15 -0400
Subject: ia64: Replace clocksource.fsys_mmio with generic arch data

Now that clocksource.archdata is available, use it for ia64-specific
code.

Cc: Clemens Ladisch <clemens@ladisch.de>
Cc: linux-ia64@vger.kernel.org
Cc: Tony Luck <tony.luck@intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: John Stultz <johnstul@us.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andy Lutomirski <luto@mit.edu>
Link: http://lkml.kernel.org/r/d31de0ee0842a0e322fb6441571c2b0adb323fa2.1310563276.git.luto@mit.edu
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/ia64/include/asm/clocksource.h | 12 ++++++++++++
 arch/ia64/kernel/cyclone.c          |  2 +-
 arch/ia64/kernel/time.c             |  2 +-
 arch/ia64/sn/kernel/sn2/timer.c     |  2 +-
 drivers/char/hpet.c                 |  2 +-
 include/linux/clocksource.h         |  7 -------
 6 files changed, 16 insertions(+), 11 deletions(-)
 create mode 100644 arch/ia64/include/asm/clocksource.h

(limited to 'include/linux')

diff --git a/arch/ia64/include/asm/clocksource.h b/arch/ia64/include/asm/clocksource.h
new file mode 100644
index 000000000000..00eb549a59b0
--- /dev/null
+++ b/arch/ia64/include/asm/clocksource.h
@@ -0,0 +1,12 @@
+/* IA64-specific clocksource additions */
+
+#ifndef _ASM_IA64_CLOCKSOURCE_H
+#define _ASM_IA64_CLOCKSOURCE_H
+
+#define __ARCH_HAS_CLOCKSOURCE_DATA
+
+struct arch_clocksource_data {
+	void *fsys_mmio;        /* used by fsyscall asm code */
+};
+
+#endif /* _ASM_IA64_CLOCKSOURCE_H */
diff --git a/arch/ia64/kernel/cyclone.c b/arch/ia64/kernel/cyclone.c
index f64097b5118a..4826ff957a3d 100644
--- a/arch/ia64/kernel/cyclone.c
+++ b/arch/ia64/kernel/cyclone.c
@@ -115,7 +115,7 @@ int __init init_cyclone_clock(void)
 	}
 	/* initialize last tick */
 	cyclone_mc = cyclone_timer;
-	clocksource_cyclone.fsys_mmio = cyclone_timer;
+	clocksource_cyclone.archdata.fsys_mmio = cyclone_timer;
 	clocksource_register_hz(&clocksource_cyclone, CYCLONE_TIMER_FREQ);
 
 	return 0;
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index 85118dfe9bb5..43920de425f1 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -468,7 +468,7 @@ void update_vsyscall(struct timespec *wall, struct timespec *wtm,
         fsyscall_gtod_data.clk_mask = c->mask;
         fsyscall_gtod_data.clk_mult = mult;
         fsyscall_gtod_data.clk_shift = c->shift;
-        fsyscall_gtod_data.clk_fsys_mmio = c->fsys_mmio;
+        fsyscall_gtod_data.clk_fsys_mmio = c->archdata.fsys_mmio;
         fsyscall_gtod_data.clk_cycle_last = c->cycle_last;
 
 	/* copy kernel time structures */
diff --git a/arch/ia64/sn/kernel/sn2/timer.c b/arch/ia64/sn/kernel/sn2/timer.c
index c34efda122e1..0f8844e49363 100644
--- a/arch/ia64/sn/kernel/sn2/timer.c
+++ b/arch/ia64/sn/kernel/sn2/timer.c
@@ -54,7 +54,7 @@ ia64_sn_udelay (unsigned long usecs)
 
 void __init sn_timer_init(void)
 {
-	clocksource_sn2.fsys_mmio = RTC_COUNTER_ADDR;
+	clocksource_sn2.archdata.fsys_mmio = RTC_COUNTER_ADDR;
 	clocksource_register_hz(&clocksource_sn2, sn_rtc_cycles_per_second);
 
 	ia64_udelay = &ia64_sn_udelay;
diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c
index 051474c65b78..055765147dc2 100644
--- a/drivers/char/hpet.c
+++ b/drivers/char/hpet.c
@@ -931,7 +931,7 @@ int hpet_alloc(struct hpet_data *hdp)
 #ifdef CONFIG_IA64
 	if (!hpet_clocksource) {
 		hpet_mctr = (void __iomem *)&hpetp->hp_hpet->hpet_mc;
-		CLKSRC_FSYS_MMIO_SET(clocksource_hpet.fsys_mmio, hpet_mctr);
+		clocksource_hpet.archdata.fsys_mmio = hpet_mctr;
 		clocksource_register_hz(&clocksource_hpet, hpetp->hp_tick_freq);
 		hpetp->hp_clocksource = &clocksource_hpet;
 		hpet_clocksource = &clocksource_hpet;
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index 0fb83c224471..6bb69702c4fa 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -171,13 +171,6 @@ struct clocksource {
 	u32 shift;
 	u64 max_idle_ns;
 
-#ifdef CONFIG_IA64
-	void *fsys_mmio;        /* used by fsyscall asm code */
-#define CLKSRC_FSYS_MMIO_SET(mmio, addr)      ((mmio) = (addr))
-#else
-#define CLKSRC_FSYS_MMIO_SET(mmio, addr)      do { } while (0)
-#endif
-
 #ifdef __ARCH_HAS_CLOCKSOURCE_DATA
 	struct arch_clocksource_data archdata;
 #endif
-- 
cgit v1.2.3


From 1bb6f9b042eb1428ba927d2e851a00df6308877a Mon Sep 17 00:00:00 2001
From: Peter Korsgaard <jacmet@sunsite.dk>
Date: Fri, 15 Jul 2011 10:25:30 +0200
Subject: mcp23s08: get rid of setup/teardown callbacks

There's no in-tree users, and bus notifiers are more generic anyway.

Signed-off-by: Peter Korsgaard <jacmet@sunsite.dk>
Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
---
 drivers/gpio/gpio-mcp23s08.c | 19 -------------------
 include/linux/spi/mcp23s08.h |  9 ---------
 2 files changed, 28 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpio-mcp23s08.c b/drivers/gpio/gpio-mcp23s08.c
index da3e04f0909b..ef61234c0e03 100644
--- a/drivers/gpio/gpio-mcp23s08.c
+++ b/drivers/gpio/gpio-mcp23s08.c
@@ -432,14 +432,6 @@ static int mcp23s08_probe(struct spi_device *spi)
 	 * handled here...
 	 */
 
-	if (pdata->setup) {
-		status = pdata->setup(spi,
-				pdata->base, data->ngpio,
-				pdata->context);
-		if (status < 0)
-			dev_dbg(&spi->dev, "setup --> %d\n", status);
-	}
-
 	return 0;
 
 fail:
@@ -459,20 +451,9 @@ fail:
 static int mcp23s08_remove(struct spi_device *spi)
 {
 	struct mcp23s08_driver_data	*data = spi_get_drvdata(spi);
-	struct mcp23s08_platform_data	*pdata = spi->dev.platform_data;
 	unsigned			addr;
 	int				status = 0;
 
-	if (pdata->teardown) {
-		status = pdata->teardown(spi,
-				pdata->base, data->ngpio,
-				pdata->context);
-		if (status < 0) {
-			dev_err(&spi->dev, "%s --> %d\n", "teardown", status);
-			return status;
-		}
-	}
-
 	for (addr = 0; addr < ARRAY_SIZE(data->mcp); addr++) {
 		int tmp;
 
diff --git a/include/linux/spi/mcp23s08.h b/include/linux/spi/mcp23s08.h
index c42cff8ca191..2d676d5aaa89 100644
--- a/include/linux/spi/mcp23s08.h
+++ b/include/linux/spi/mcp23s08.h
@@ -22,13 +22,4 @@ struct mcp23s08_platform_data {
 	 * base to base+15 (or base+31 for s17 variant).
 	 */
 	unsigned	base;
-
-	void		*context;	/* param to setup/teardown */
-
-	int		(*setup)(struct spi_device *spi,
-					int gpio, unsigned ngpio,
-					void *context);
-	int		(*teardown)(struct spi_device *spi,
-					int gpio, unsigned ngpio,
-					void *context);
 };
-- 
cgit v1.2.3


From 99f381d3549432a250fe846a2a82d61a032804b0 Mon Sep 17 00:00:00 2001
From: Nishanth Menon <nm@ti.com>
Date: Fri, 10 Jun 2011 20:24:57 +0200
Subject: PM / OPP: Introduce function to free cpufreq table

cpufreq table allocated by opp_init_cpufreq_table is better
freed by OPP layer itself. This allows future modifications to
the table handling to be transparent to the users.

Signed-off-by: Nishanth Menon <nm@ti.com>
Acked-by: Kevin Hilman <khilman@ti.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 Documentation/power/opp.txt |  2 ++
 drivers/base/power/opp.c    | 17 +++++++++++++++++
 include/linux/opp.h         |  8 ++++++++
 3 files changed, 27 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/power/opp.txt b/Documentation/power/opp.txt
index 5ae70a12c1e2..3035d00757ad 100644
--- a/Documentation/power/opp.txt
+++ b/Documentation/power/opp.txt
@@ -321,6 +321,8 @@ opp_init_cpufreq_table - cpufreq framework typically is initialized with
 	addition to CONFIG_PM as power management feature is required to
 	dynamically scale voltage and frequency in a system.
 
+opp_free_cpufreq_table - Free up the table allocated by opp_init_cpufreq_table
+
 7. Data Structures
 ==================
 Typically an SoC contains multiple voltage domains which are variable. Each
diff --git a/drivers/base/power/opp.c b/drivers/base/power/opp.c
index 56a6899f5e9e..5cc12322ef32 100644
--- a/drivers/base/power/opp.c
+++ b/drivers/base/power/opp.c
@@ -625,4 +625,21 @@ int opp_init_cpufreq_table(struct device *dev,
 
 	return 0;
 }
+
+/**
+ * opp_free_cpufreq_table() - free the cpufreq table
+ * @dev:	device for which we do this operation
+ * @table:	table to free
+ *
+ * Free up the table allocated by opp_init_cpufreq_table
+ */
+void opp_free_cpufreq_table(struct device *dev,
+				struct cpufreq_frequency_table **table)
+{
+	if (!table)
+		return;
+
+	kfree(*table);
+	*table = NULL;
+}
 #endif		/* CONFIG_CPU_FREQ */
diff --git a/include/linux/opp.h b/include/linux/opp.h
index 5449945d589f..7020e9736fc5 100644
--- a/include/linux/opp.h
+++ b/include/linux/opp.h
@@ -94,12 +94,20 @@ static inline int opp_disable(struct device *dev, unsigned long freq)
 #if defined(CONFIG_CPU_FREQ) && defined(CONFIG_PM_OPP)
 int opp_init_cpufreq_table(struct device *dev,
 			    struct cpufreq_frequency_table **table);
+void opp_free_cpufreq_table(struct device *dev,
+				struct cpufreq_frequency_table **table);
 #else
 static inline int opp_init_cpufreq_table(struct device *dev,
 			    struct cpufreq_frequency_table **table)
 {
 	return -EINVAL;
 }
+
+static inline
+void opp_free_cpufreq_table(struct device *dev,
+				struct cpufreq_frequency_table **table)
+{
+}
 #endif		/* CONFIG_CPU_FREQ */
 
 #endif		/* __LINUX_OPP_H__ */
-- 
cgit v1.2.3


From 3b5fe85252326217cd96f24a7bda4460d8f71bee Mon Sep 17 00:00:00 2001
From: MyungJoo Ham <myungjoo.ham@samsung.com>
Date: Sun, 12 Jun 2011 15:57:05 +0200
Subject: PM / Suspend: Add .suspend_again() callback to suspend_ops

A system or a device may need to control suspend/wakeup events. It may
want to wakeup the system after a predefined amount of time or at a
predefined event decided while entering suspend for polling or delayed
work. Then, it may want to enter suspend again if its predefined wakeup
condition is the only wakeup reason and there is no outstanding events;
thus, it does not wakeup the userspace unnecessary or unnecessary
devices and keeps suspended as long as possible (saving the power).

Enabling a system to wakeup after a specified time can be easily
achieved by using RTC. However, to enter suspend again immediately
without invoking userland and unrelated devices, we need additional
features in the suspend framework.

Such need comes from:

 1. Monitoring a critical device status without interrupts that can
wakeup the system. (in-suspend polling)
 An example is ambient temperature monitoring that needs to shut down
the system or a specific device function if it is too hot or cold. The
temperature of a specific device may be needed to be monitored as well;
e.g., a charger monitors battery temperature in order to stop charging
if overheated.

 2. Execute critical "delayed work" at suspend.
 A driver or a system/board may have a delayed work (or any similar
things) that it wants to execute at the requested time.
 For example, some chargers want to check the battery voltage some
time (e.g., 30 seconds) after the battery is fully charged and the
charger has stopped. Then, the charger restarts charging if the voltage
has dropped more than a threshold, which is smaller than "restart-charger"
voltage, which is a threshold to restart charging regardless of the
time passed.

This patch allows to add "suspend_again" callback at struct
platform_suspend_ops and let the "suspend_again" callback return true if
the system is required to enter suspend again after the current instance
of wakeup. Device-wise suspend_again implemented at dev_pm_ops or
syscore is not done because: a) suspend_again feature is usually under
platform-wise decision and controls the behavior of the whole platform
and b) There are very limited devices related to the usage cases of
suspend_again; chargers and temperature sensors are mentioned so far.

With suspend_again callback registered at struct platform_suspend_ops
suspend_ops in kernel/power/suspend.c with suspend_set_ops by the
platform, the suspend framework tries to enter suspend again by
looping suspend_enter() if suspend_again has returned true and there has
been no errors in the suspending sequence or pending wakeups (by
pm_wakeup_pending).

Tested at Exynos4-NURI.

[rjw: Fixed up kerneldoc comment for suspend_enter().]

Signed-off-by: MyungJoo Ham <myungjoo.ham@samsung.com>
Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 include/linux/suspend.h |  8 ++++++++
 kernel/power/suspend.c  | 18 ++++++++++++------
 2 files changed, 20 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 083ffea7ba18..e1e3742733be 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -92,6 +92,13 @@ typedef int __bitwise suspend_state_t;
  *	@enter() and @wake(), even if any of them fails.  It is executed after
  *	a failing @prepare.
  *
+ * @suspend_again: Returns whether the system should suspend again (true) or
+ *	not (false). If the platform wants to poll sensors or execute some
+ *	code during suspended without invoking userspace and most of devices,
+ *	suspend_again callback is the place assuming that periodic-wakeup or
+ *	alarm-wakeup is already setup. This allows to execute some codes while
+ *	being kept suspended in the view of userland and devices.
+ *
  * @end: Called by the PM core right after resuming devices, to indicate to
  *	the platform that the system has returned to the working state or
  *	the transition to the sleep state has been aborted.
@@ -113,6 +120,7 @@ struct platform_suspend_ops {
 	int (*enter)(suspend_state_t state);
 	void (*wake)(void);
 	void (*finish)(void);
+	bool (*suspend_again)(void);
 	void (*end)(void);
 	void (*recover)(void);
 };
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 1c41ba215419..b6762f43365d 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -126,12 +126,13 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
 }
 
 /**
- *	suspend_enter - enter the desired system sleep state.
- *	@state:		state to enter
+ * suspend_enter - enter the desired system sleep state.
+ * @state: State to enter
+ * @wakeup: Returns information that suspend should not be entered again.
  *
- *	This function should be called after devices have been suspended.
+ * This function should be called after devices have been suspended.
  */
-static int suspend_enter(suspend_state_t state)
+static int suspend_enter(suspend_state_t state, bool *wakeup)
 {
 	int error;
 
@@ -165,7 +166,8 @@ static int suspend_enter(suspend_state_t state)
 
 	error = syscore_suspend();
 	if (!error) {
-		if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) {
+		*wakeup = pm_wakeup_pending();
+		if (!(suspend_test(TEST_CORE) || *wakeup)) {
 			error = suspend_ops->enter(state);
 			events_check_enabled = false;
 		}
@@ -199,6 +201,7 @@ static int suspend_enter(suspend_state_t state)
 int suspend_devices_and_enter(suspend_state_t state)
 {
 	int error;
+	bool wakeup = false;
 
 	if (!suspend_ops)
 		return -ENOSYS;
@@ -220,7 +223,10 @@ int suspend_devices_and_enter(suspend_state_t state)
 	if (suspend_test(TEST_DEVICES))
 		goto Recover_platform;
 
-	error = suspend_enter(state);
+	do {
+		error = suspend_enter(state, &wakeup);
+	} while (!error && !wakeup
+		&& suspend_ops->suspend_again && suspend_ops->suspend_again());
 
  Resume_devices:
 	suspend_test_start();
-- 
cgit v1.2.3


From 93b37905f70083d6143f5f4dba0a45cc64379a62 Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Sat, 9 Jul 2011 16:43:22 +0200
Subject: firewire: cdev: prevent race between first get_info ioctl and bus
 reset event queuing

Between open(2) of a /dev/fw* and the first FW_CDEV_IOC_GET_INFO
ioctl(2) on it, the kernel already queues FW_CDEV_EVENT_BUS_RESET events
to be read(2) by the client.  The get_info ioctl is practically always
issued right away after open, hence this condition only occurs if the
client opens during a bus reset, especially during a rapid series of bus
resets.

The problem with this condition is twofold:

  - These bus reset events carry the (as yet undocumented) @closure
    value of 0.  But it is not the kernel's place to choose closures;
    they are privat to the client.  E.g., this 0 value forced from the
    kernel makes it unsafe for clients to dereference it as a pointer to
    a closure object without NULL pointer check.

  - It is impossible for clients to determine the relative order of bus
    reset events from get_info ioctl(2) versus those from read(2),
    except in one way:  By comparison of closure values.  Again, such a
    procedure imposes complexity on clients and reduces freedom in use
    of the bus reset closure.

So, change the ABI to suppress queuing of bus reset events before the
first FW_CDEV_IOC_GET_INFO ioctl was issued by the client.

Note, this ABI change cannot be version-controlled.  The kernel cannot
distinguish old from new clients before the first FW_CDEV_IOC_GET_INFO
ioctl.

We will try to back-merge this change into currently maintained stable/
longterm series, and we only document the new behaviour.  The old
behavior is now considered a kernel bug, which it basically is.

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
Cc: <stable@kernel.org>
---
 drivers/firewire/core-cdev.c  | 18 ++++++++++--------
 include/linux/firewire-cdev.h |  3 +++
 2 files changed, 13 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c
index 9b5915ebeb35..e6ad3bb6c1a6 100644
--- a/drivers/firewire/core-cdev.c
+++ b/drivers/firewire/core-cdev.c
@@ -253,14 +253,11 @@ static int fw_device_op_open(struct inode *inode, struct file *file)
 	init_waitqueue_head(&client->wait);
 	init_waitqueue_head(&client->tx_flush_wait);
 	INIT_LIST_HEAD(&client->phy_receiver_link);
+	INIT_LIST_HEAD(&client->link);
 	kref_init(&client->kref);
 
 	file->private_data = client;
 
-	mutex_lock(&device->client_list_mutex);
-	list_add_tail(&client->link, &device->client_list);
-	mutex_unlock(&device->client_list_mutex);
-
 	return nonseekable_open(inode, file);
 }
 
@@ -451,15 +448,20 @@ static int ioctl_get_info(struct client *client, union ioctl_arg *arg)
 	if (ret != 0)
 		return -EFAULT;
 
+	mutex_lock(&client->device->client_list_mutex);
+
 	client->bus_reset_closure = a->bus_reset_closure;
 	if (a->bus_reset != 0) {
 		fill_bus_reset_event(&bus_reset, client);
-		if (copy_to_user(u64_to_uptr(a->bus_reset),
-				 &bus_reset, sizeof(bus_reset)))
-			return -EFAULT;
+		ret = copy_to_user(u64_to_uptr(a->bus_reset),
+				   &bus_reset, sizeof(bus_reset));
 	}
+	if (ret == 0 && list_empty(&client->link))
+		list_add_tail(&client->link, &client->device->client_list);
 
-	return 0;
+	mutex_unlock(&client->device->client_list_mutex);
+
+	return ret ? -EFAULT : 0;
 }
 
 static int add_client_resource(struct client *client,
diff --git a/include/linux/firewire-cdev.h b/include/linux/firewire-cdev.h
index 4ff09889c5c0..55814aa33be2 100644
--- a/include/linux/firewire-cdev.h
+++ b/include/linux/firewire-cdev.h
@@ -475,6 +475,9 @@ union fw_cdev_event {
  *		of the bus.  This does not cause a bus reset to happen.
  * @bus_reset_closure: Value of &closure in this and subsequent bus reset events
  * @card:	The index of the card this device belongs to
+ *
+ * As a side effect, reception of %FW_CDEV_EVENT_BUS_RESET events to be read(2)
+ * is started by this ioctl.
  */
 struct fw_cdev_get_info {
 	__u32 version;
-- 
cgit v1.2.3


From f6a7cd0212c359f7b55414aeee364ee7cac363cc Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Thu, 14 Jul 2011 21:08:39 +0200
Subject: firewire: cdev: ABI documentation enhancements

Add overview documentation in Documentation/ABI/stable/firewire-cdev.

Improve the inline reference documentation in firewire-cdev.h:

  - Add /* available since kernel... */ comments to event numbers
    consistent with the comments on ioctl numbers.

  - Shorten some documentation on an event and an ioctl that are
    less interesting to current programming because there are newer
    preferable variants.

  - Spell Configuration ROM (name of an IEEE 1212 register) in
    upper case.

  - Move the dummy FW_CDEV_VERSION out of the reader's field of
    vision.  We should remove it from the header next year or so.

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
---
 Documentation/ABI/stable/firewire-cdev | 103 +++++++++++++++++++++++++++++++++
 include/linux/firewire-cdev.h          |  75 +++++++++++-------------
 2 files changed, 137 insertions(+), 41 deletions(-)
 create mode 100644 Documentation/ABI/stable/firewire-cdev

(limited to 'include/linux')

diff --git a/Documentation/ABI/stable/firewire-cdev b/Documentation/ABI/stable/firewire-cdev
new file mode 100644
index 000000000000..16d030827368
--- /dev/null
+++ b/Documentation/ABI/stable/firewire-cdev
@@ -0,0 +1,103 @@
+What:		/dev/fw[0-9]+
+Date:		May 2007
+KernelVersion:	2.6.22
+Contact:	linux1394-devel@lists.sourceforge.net
+Description:
+		The character device files /dev/fw* are the interface between
+		firewire-core and IEEE 1394 device drivers implemented in
+		userspace.  The ioctl(2)- and read(2)-based ABI is defined and
+		documented in <linux/firewire-cdev.h>.
+
+		This ABI offers most of the features which firewire-core also
+		exposes to kernelspace IEEE 1394 drivers.
+
+		Each /dev/fw* is associated with one IEEE 1394 node, which can
+		be remote or local nodes.  Operations on a /dev/fw* file have
+		different scope:
+		  - The 1394 node which is associated with the file:
+			  - Asynchronous request transmission
+			  - Get the Configuration ROM
+			  - Query node ID
+			  - Query maximum speed of the path between this node
+			    and local node
+		  - The 1394 bus (i.e. "card") to which the node is attached to:
+			  - Isochronous stream transmission and reception
+			  - Asynchronous stream transmission and reception
+			  - Asynchronous broadcast request transmission
+			  - PHY packet transmission and reception
+			  - Allocate, reallocate, deallocate isochronous
+			    resources (channels, bandwidth) at the bus's IRM
+			  - Query node IDs of local node, root node, IRM, bus
+			    manager
+			  - Query cycle time
+			  - Bus reset initiation, bus reset event reception
+		  - All 1394 buses:
+			  - Allocation of IEEE 1212 address ranges on the local
+			    link layers, reception of inbound requests to such
+			    an address range, asynchronous response transmission
+			    to inbound requests
+			  - Addition of descriptors or directories to the local
+			    nodes' Configuration ROM
+
+		Due to the different scope of operations and in order to let
+		userland implement different access permission models, some
+		operations are restricted to /dev/fw* files that are associated
+		with a local node:
+			  - Addition of descriptors or directories to the local
+			    nodes' Configuration ROM
+			  - PHY packet transmission and reception
+
+		A /dev/fw* file remains associated with one particular node
+		during its entire life time.  Bus topology changes, and hence
+		node ID changes, are tracked by firewire-core.  ABI users do not
+		need to be aware of topology.
+
+		The following file operations are supported:
+
+		open(2)
+		Currently the only useful flags are O_RDWR.
+
+		ioctl(2)
+		Initiate various actions.  Some take immediate effect, others
+		are performed asynchronously while or after the ioctl returns.
+		See the inline documentation in <linux/firewire-cdev.h> for
+		descriptions of all ioctls.
+
+		poll(2), select(2), epoll_wait(2) etc.
+		Watch for events to become available to be read.
+
+		read(2)
+		Receive various events.  There are solicited events like
+		outbound asynchronous transaction completion or isochronous
+		buffer completion, and unsolicited events such as bus resets,
+		request reception, or PHY packet reception.  Always use a read
+		buffer which is large enough to receive the largest event that
+		could ever arrive.  See <linux/firewire-cdev.h> for descriptions
+		of all event types and for which ioctls affect reception of
+		events.
+
+		mmap(2)
+		Allocate a DMA buffer for isochronous reception or transmission
+		and map it into the process address space.  The arguments should
+		be used as follows:  addr = NULL, length = the desired buffer
+		size, i.e. number of packets times size of largest packet,
+		prot = at least PROT_READ for reception and at least PROT_WRITE
+		for transmission, flags = MAP_SHARED, fd = the handle to the
+		/dev/fw*, offset = 0.
+
+		Isochronous reception works in packet-per-buffer fashion except
+		for multichannel reception which works in buffer-fill mode.
+
+		munmap(2)
+		Unmap the isochronous I/O buffer from the process address space.
+
+		close(2)
+		Besides stopping and freeing I/O contexts that were associated
+		with the file descriptor, back out any changes to the local
+		nodes' Configuration ROM.  Deallocate isochronous channels and
+		bandwidth at the IRM that were marked for kernel-assisted
+		re- and deallocation.
+
+Users:		libraw1394
+		libdc1394
+		tools like jujuutils, fwhack, ...
diff --git a/include/linux/firewire-cdev.h b/include/linux/firewire-cdev.h
index 55814aa33be2..357dbfc2829e 100644
--- a/include/linux/firewire-cdev.h
+++ b/include/linux/firewire-cdev.h
@@ -30,10 +30,13 @@
 #include <linux/types.h>
 #include <linux/firewire-constants.h>
 
+/* available since kernel version 2.6.22 */
 #define FW_CDEV_EVENT_BUS_RESET				0x00
 #define FW_CDEV_EVENT_RESPONSE				0x01
 #define FW_CDEV_EVENT_REQUEST				0x02
 #define FW_CDEV_EVENT_ISO_INTERRUPT			0x03
+
+/* available since kernel version 2.6.30 */
 #define FW_CDEV_EVENT_ISO_RESOURCE_ALLOCATED		0x04
 #define FW_CDEV_EVENT_ISO_RESOURCE_DEALLOCATED		0x05
 
@@ -120,24 +123,11 @@ struct fw_cdev_event_response {
 
 /**
  * struct fw_cdev_event_request - Old version of &fw_cdev_event_request2
- * @closure:	See &fw_cdev_event_common; set by %FW_CDEV_IOC_ALLOCATE ioctl
  * @type:	See &fw_cdev_event_common; always %FW_CDEV_EVENT_REQUEST
- * @tcode:	See &fw_cdev_event_request2
- * @offset:	See &fw_cdev_event_request2
- * @handle:	See &fw_cdev_event_request2
- * @length:	See &fw_cdev_event_request2
- * @data:	See &fw_cdev_event_request2
  *
  * This event is sent instead of &fw_cdev_event_request2 if the kernel or
- * the client implements ABI version <= 3.
- *
- * Unlike &fw_cdev_event_request2, the sender identity cannot be established,
- * broadcast write requests cannot be distinguished from unicast writes, and
- * @tcode of lock requests is %TCODE_LOCK_REQUEST.
- *
- * Requests to the FCP_REQUEST or FCP_RESPONSE register are responded to as
- * with &fw_cdev_event_request2, except in kernel 2.6.32 and older which send
- * the response packet of the client's %FW_CDEV_IOC_SEND_RESPONSE ioctl.
+ * the client implements ABI version <= 3.  &fw_cdev_event_request lacks
+ * essential information; use &fw_cdev_event_request2 instead.
  */
 struct fw_cdev_event_request {
 	__u64 closure;
@@ -452,30 +442,29 @@ union fw_cdev_event {
  *                 %FW_CDEV_ISO_CONTEXT_RECEIVE_MULTICHANNEL, and
  *                 %FW_CDEV_IOC_SET_ISO_CHANNELS
  */
-#define FW_CDEV_VERSION 3 /* Meaningless; don't use this macro. */
 
 /**
  * struct fw_cdev_get_info - General purpose information ioctl
  * @version:	The version field is just a running serial number.  Both an
  *		input parameter (ABI version implemented by the client) and
  *		output parameter (ABI version implemented by the kernel).
- *		A client must not fill in an %FW_CDEV_VERSION defined from an
- *		included kernel header file but the actual version for which
- *		the client was implemented.  This is necessary for forward
- *		compatibility.  We never break backwards compatibility, but
- *		may add more structs, events, and ioctls in later revisions.
- * @rom_length:	If @rom is non-zero, at most rom_length bytes of configuration
+ *		A client shall fill in the ABI @version for which the client
+ *		was implemented.  This is necessary for forward compatibility.
+ * @rom_length:	If @rom is non-zero, up to @rom_length bytes of Configuration
  *		ROM will be copied into that user space address.  In either
  *		case, @rom_length is updated with the actual length of the
- *		configuration ROM.
+ *		Configuration ROM.
  * @rom:	If non-zero, address of a buffer to be filled by a copy of the
- *		device's configuration ROM
+ *		device's Configuration ROM
  * @bus_reset:	If non-zero, address of a buffer to be filled by a
  *		&struct fw_cdev_event_bus_reset with the current state
  *		of the bus.  This does not cause a bus reset to happen.
  * @bus_reset_closure: Value of &closure in this and subsequent bus reset events
  * @card:	The index of the card this device belongs to
  *
+ * The %FW_CDEV_IOC_GET_INFO ioctl is usually the very first one which a client
+ * performs right after it opened a /dev/fw* file.
+ *
  * As a side effect, reception of %FW_CDEV_EVENT_BUS_RESET events to be read(2)
  * is started by this ioctl.
  */
@@ -615,7 +604,7 @@ struct fw_cdev_initiate_bus_reset {
  * @handle:	Handle to the descriptor, written by the kernel
  *
  * Add a descriptor block and optionally a preceding immediate key to the local
- * node's configuration ROM.
+ * node's Configuration ROM.
  *
  * The @key field specifies the upper 8 bits of the descriptor root directory
  * pointer and the @data and @length fields specify the contents. The @key
@@ -630,9 +619,9 @@ struct fw_cdev_initiate_bus_reset {
  * If successful, the kernel adds the descriptor and writes back a @handle to
  * the kernel-side object to be used for later removal of the descriptor block
  * and immediate key.  The kernel will also generate a bus reset to signal the
- * change of the configuration ROM to other nodes.
+ * change of the Configuration ROM to other nodes.
  *
- * This ioctl affects the configuration ROMs of all local nodes.
+ * This ioctl affects the Configuration ROMs of all local nodes.
  * The ioctl only succeeds on device files which represent a local node.
  */
 struct fw_cdev_add_descriptor {
@@ -644,13 +633,13 @@ struct fw_cdev_add_descriptor {
 };
 
 /**
- * struct fw_cdev_remove_descriptor - Remove contents from the configuration ROM
+ * struct fw_cdev_remove_descriptor - Remove contents from the Configuration ROM
  * @handle:	Handle to the descriptor, as returned by the kernel when the
  *		descriptor was added
  *
  * Remove a descriptor block and accompanying immediate key from the local
- * nodes' configuration ROMs.  The kernel will also generate a bus reset to
- * signal the change of the configuration ROM to other nodes.
+ * nodes' Configuration ROMs.  The kernel will also generate a bus reset to
+ * signal the change of the Configuration ROM to other nodes.
  */
 struct fw_cdev_remove_descriptor {
 	__u32 handle;
@@ -866,13 +855,8 @@ struct fw_cdev_stop_iso {
  * @local_time:   system time, in microseconds since the Epoch
  * @cycle_timer:  Cycle Time register contents
  *
- * The %FW_CDEV_IOC_GET_CYCLE_TIMER ioctl reads the isochronous cycle timer
- * and also the system clock (%CLOCK_REALTIME).  This allows to express the
- * receive time of an isochronous packet as a system time.
- *
- * @cycle_timer consists of 7 bits cycleSeconds, 13 bits cycleCount, and
- * 12 bits cycleOffset, in host byte order.  Cf. the Cycle Time register
- * per IEEE 1394 or Isochronous Cycle Timer register per OHCI-1394.
+ * Same as %FW_CDEV_IOC_GET_CYCLE_TIMER2, but fixed to use %CLOCK_REALTIME
+ * and only with microseconds resolution.
  *
  * In version 1 and 2 of the ABI, this ioctl returned unreliable (non-
  * monotonic) @cycle_timer values on certain controllers.
@@ -889,10 +873,17 @@ struct fw_cdev_get_cycle_timer {
  * @clk_id:       input parameter, clock from which to get the system time
  * @cycle_timer:  Cycle Time register contents
  *
- * The %FW_CDEV_IOC_GET_CYCLE_TIMER2 works like
- * %FW_CDEV_IOC_GET_CYCLE_TIMER but lets you choose a clock like with POSIX'
- * clock_gettime function.  Supported @clk_id values are POSIX' %CLOCK_REALTIME
- * and %CLOCK_MONOTONIC and Linux' %CLOCK_MONOTONIC_RAW.
+ * The %FW_CDEV_IOC_GET_CYCLE_TIMER2 ioctl reads the isochronous cycle timer
+ * and also the system clock.  This allows to correlate reception time of
+ * isochronous packets with system time.
+ *
+ * @clk_id lets you choose a clock like with POSIX' clock_gettime function.
+ * Supported @clk_id values are POSIX' %CLOCK_REALTIME and %CLOCK_MONOTONIC
+ * and Linux' %CLOCK_MONOTONIC_RAW.
+ *
+ * @cycle_timer consists of 7 bits cycleSeconds, 13 bits cycleCount, and
+ * 12 bits cycleOffset, in host byte order.  Cf. the Cycle Time register
+ * per IEEE 1394 or Isochronous Cycle Timer register per OHCI-1394.
  */
 struct fw_cdev_get_cycle_timer2 {
 	__s64 tv_sec;
@@ -1014,4 +1005,6 @@ struct fw_cdev_receive_phy_packets {
 	__u64 closure;
 };
 
+#define FW_CDEV_VERSION 3 /* Meaningless legacy macro; don't use it. */
+
 #endif /* _LINUX_FIREWIRE_CDEV_H */
-- 
cgit v1.2.3


From b23b5455b6458920179a1f27513ce42e70d11f37 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sat, 16 Jul 2011 17:45:02 -0700
Subject: neigh: Kill hh_cache->hh_output

It's just taking on one of two possible values, either
neigh_ops->output or dev_queue_xmit().  And this is purely depending
upon whether nud_state has NUD_CONNECTED set or not.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  1 -
 include/net/neighbour.h   |  4 ++--
 net/core/neighbour.c      | 25 ++-----------------------
 net/ipv4/route.c          |  6 +++---
 4 files changed, 7 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f84dfd25c431..52c4e3835aa6 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -254,7 +254,6 @@ struct netdev_hw_addr_list {
 struct hh_cache {
 	u16		hh_len;
 	u16		__pad;
-	int		(*hh_output)(struct sk_buff *skb);
 	seqlock_t	hh_lock;
 
 	/* cached hardware header; allow for machine alignment needs.        */
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 97990ddca66c..60bac8112d86 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -340,13 +340,13 @@ static inline int neigh_hh_output(struct hh_cache *hh, struct sk_buff *skb)
 	} while (read_seqretry(&hh->hh_lock, seq));
 
 	skb_push(skb, hh_len);
-	return hh->hh_output(skb);
+	return dev_queue_xmit(skb);
 }
 
 static inline int neigh_output(struct neighbour *n, struct sk_buff *skb)
 {
 	struct hh_cache *hh = &n->hh;
-	if (hh->hh_len)
+	if ((n->nud_state & NUD_CONNECTED) && hh->hh_len)
 		return neigh_hh_output(hh, skb);
 	else
 		return n->output(skb);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index c22def5ae486..2feda6e7a31d 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -720,15 +720,9 @@ EXPORT_SYMBOL(neigh_destroy);
  */
 static void neigh_suspect(struct neighbour *neigh)
 {
-	struct hh_cache *hh;
-
 	NEIGH_PRINTK2("neigh %p is suspected.\n", neigh);
 
 	neigh->output = neigh->ops->output;
-
-	hh = &neigh->hh;
-	if (hh->hh_len)
-		hh->hh_output = neigh->ops->output;
 }
 
 /* Neighbour state is OK;
@@ -738,15 +732,9 @@ static void neigh_suspect(struct neighbour *neigh)
  */
 static void neigh_connect(struct neighbour *neigh)
 {
-	struct hh_cache *hh;
-
 	NEIGH_PRINTK2("neigh %p is connected.\n", neigh);
 
 	neigh->output = neigh->ops->connected_output;
-
-	hh = &neigh->hh;
-	if (hh->hh_len)
-		hh->hh_output = dev_queue_xmit;
 }
 
 static void neigh_periodic_work(struct work_struct *work)
@@ -1215,18 +1203,9 @@ static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst)
 	/* Only one thread can come in here and initialize the
 	 * hh_cache entry.
 	 */
-	if (hh->hh_len)
-		goto end;
-
-	if (dev->header_ops->cache(n, hh, prot))
-		goto end;
-
-	if (n->nud_state & NUD_CONNECTED)
-		hh->hh_output = dev_queue_xmit;
-	else
-		hh->hh_output = n->ops->output;
+	if (!hh->hh_len)
+		dev->header_ops->cache(n, hh, prot);
 
-end:
 	write_unlock_bh(&n->lock);
 }
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a52bb74d2612..bcf9bb508200 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -427,9 +427,9 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
 			      dst_metric(&r->dst, RTAX_RTTVAR)),
 			r->rt_key_tos,
 			-1,
-			(r->dst.neighbour ?
-			 (r->dst.neighbour->hh.hh_output ==
-			  dev_queue_xmit) : 0),
+			(r->dst.neighbour &&
+			 (r->dst.neighbour->nud_state & NUD_CONNECTED)) ?
+			   1 : 0,
 			r->rt_spec_dst, &len);
 
 		seq_printf(seq, "%*s\n", 127 - len, "");
-- 
cgit v1.2.3


From 6634ae1033ceaeca5877dd75723210f8c2648c17 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 8 Jul 2011 19:13:39 +0200
Subject: ptrace_init_task: initialize child->jobctl explicitly

new_child->jobctl is not initialized during the fork, it is copied
from parent->jobctl. Currently this is harmless, the forking task
is running and copy_process() can't succeed if signal_pending() is
true, so only JOBCTL_STOP_DEQUEUED can be copied. Still this is a
bit fragile, it would be more clean to set ->jobctl = 0 explicitly.

Also, check ->ptrace != 0 instead of PT_PTRACED, move the
CONFIG_HAVE_HW_BREAKPOINT code up.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
---
 include/linux/ptrace.h | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index eae381d584f9..fd8669fc339f 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -217,16 +217,17 @@ static inline void ptrace_init_task(struct task_struct *child, bool ptrace)
 {
 	INIT_LIST_HEAD(&child->ptrace_entry);
 	INIT_LIST_HEAD(&child->ptraced);
-	child->parent = child->real_parent;
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+	atomic_set(&child->ptrace_bp_refcnt, 1);
+#endif
+	child->jobctl = 0;
 	child->ptrace = 0;
-	if (unlikely(ptrace) && (current->ptrace & PT_PTRACED)) {
+	child->parent = child->real_parent;
+
+	if (unlikely(ptrace) && current->ptrace) {
 		child->ptrace = current->ptrace;
 		__ptrace_link(child, current->parent);
 	}
-
-#ifdef CONFIG_HAVE_HW_BREAKPOINT
-	atomic_set(&child->ptrace_bp_refcnt, 1);
-#endif
 }
 
 /**
-- 
cgit v1.2.3


From dcace06cc29df927a74a6bc0e57b9bef87704377 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 8 Jul 2011 19:13:54 +0200
Subject: ptrace: mv send-SIGSTOP from do_fork() to ptrace_init_task()

If the new child is traced, do_fork() adds the pending SIGSTOP.
It assumes that either it is traced because of auto-attach or the
tracer attached later, in both cases sigaddset/set_thread_flag is
correct even if SIGSTOP is already pending.

Now that we have PTRACE_SEIZE this is no longer right in the latter
case. If the tracer does PTRACE_SEIZE after copy_process() makes the
child visible the queued SIGSTOP is wrong.

We could check PT_SEIZED bit and change ptrace_attach() to set both
PT_PTRACED and PT_SEIZED bits simultaneously but see the next patch,
we need to know whether this child was auto-attached or not anyway.

So this patch simply moves this code to ptrace_init_task(), this
way we can never race with ptrace_attach().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
---
 include/linux/ptrace.h |  3 +++
 kernel/fork.c          | 12 ------------
 2 files changed, 3 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index fd8669fc339f..9b5d2c901d06 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -227,6 +227,9 @@ static inline void ptrace_init_task(struct task_struct *child, bool ptrace)
 	if (unlikely(ptrace) && current->ptrace) {
 		child->ptrace = current->ptrace;
 		__ptrace_link(child, current->parent);
+
+		sigaddset(&child->pending.signal, SIGSTOP);
+		set_tsk_thread_flag(child, TIF_SIGPENDING);
 	}
 }
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 3c72a5b321a7..4d4117e01504 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -37,7 +37,6 @@
 #include <linux/swap.h>
 #include <linux/syscalls.h>
 #include <linux/jiffies.h>
-#include <linux/tracehook.h>
 #include <linux/futex.h>
 #include <linux/compat.h>
 #include <linux/kthread.h>
@@ -1521,17 +1520,6 @@ long do_fork(unsigned long clone_flags,
 
 		audit_finish_fork(p);
 
-		/*
-		 * Child is ready but hasn't started running yet.  Queue
-		 * SIGSTOP if it's gonna be ptraced - it doesn't matter who
-		 * attached/attaching to this task, the pending SIGSTOP is
-		 * right in any case.
-		 */
-		if (unlikely(p->ptrace)) {
-			sigaddset(&p->pending.signal, SIGSTOP);
-			set_tsk_thread_flag(p, TIF_SIGPENDING);
-		}
-
 		/*
 		 * We set PF_STARTING at creation in case tracing wants to
 		 * use this to distinguish a fully live task from one that
-- 
cgit v1.2.3


From d184d6eb1dc3c9869e25a8e422be5c55ab0db4ac Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 8 Jul 2011 19:14:17 +0200
Subject: ptrace: dont send SIGSTOP on auto-attach if PT_SEIZED

The fake SIGSTOP during attach has numerous problems. PTRACE_SEIZE
is already fine, but we have basically the same problems is SIGSTOP
is sent on auto-attach, the tracer can't know if this signal signal
should be cancelled or not.

Change ptrace_event() to set JOBCTL_TRAP_STOP if the new child is
PT_SEIZED, this triggers the PTRACE_EVENT_STOP report.

Thereafter a PT_SEIZED task can never report the bogus SIGSTOP.

Test-case:

	#define PTRACE_SEIZE		0x4206
	#define PTRACE_SEIZE_DEVEL	0x80000000
	#define PTRACE_EVENT_STOP	7
	#define WEVENT(s)		((s & 0xFF0000) >> 16)

	int main(void)
	{
		int child, grand_child, status;
		long message;

		child = fork();
		if (!child) {
			kill(getpid(), SIGSTOP);
			fork();
			assert(0);
			return 0x23;
		}

		assert(ptrace(PTRACE_SEIZE, child, 0,PTRACE_SEIZE_DEVEL) == 0);
		assert(wait(&status) == child);
		assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP);

		assert(ptrace(PTRACE_SETOPTIONS, child, 0, PTRACE_O_TRACEFORK) == 0);

		assert(ptrace(PTRACE_CONT, child, 0,0) == 0);
		assert(waitpid(child, &status, 0) == child);
		assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP);
		assert(WEVENT(status) == PTRACE_EVENT_FORK);

		assert(ptrace(PTRACE_GETEVENTMSG, child, 0, &message) == 0);
		grand_child = message;

		assert(waitpid(grand_child, &status, 0) == grand_child);
		assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP);
		assert(WEVENT(status) == PTRACE_EVENT_STOP);

		kill(child, SIGKILL);
		kill(grand_child, SIGKILL);
		return 0;
	}

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
---
 include/linux/ptrace.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 9b5d2c901d06..800f113bea66 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -228,7 +228,11 @@ static inline void ptrace_init_task(struct task_struct *child, bool ptrace)
 		child->ptrace = current->ptrace;
 		__ptrace_link(child, current->parent);
 
-		sigaddset(&child->pending.signal, SIGSTOP);
+		if (child->ptrace & PT_SEIZED)
+			task_set_jobctl_pending(child, JOBCTL_TRAP_STOP);
+		else
+			sigaddset(&child->pending.signal, SIGSTOP);
+
 		set_tsk_thread_flag(child, TIF_SIGPENDING);
 	}
 }
-- 
cgit v1.2.3


From 178edcbc6b539dfc21062fed9bf31701a8952e1c Mon Sep 17 00:00:00 2001
From: David Lamparter <equinox@diac24.net>
Date: Sun, 17 Jul 2011 08:32:12 +0000
Subject: net: add 802.1ad / 802.1ah / QinQ ethertypes

define ETH_P_8021AD to 88a8 (assigned by IEEE) and add ETH_P_QINQ{1,2,3}
for the pre-standard 9{1,2,3}00 types. all of them use 802.1q frame
format, with 1 bit used differently in some cases.

also define ETH_P_8021AH to 88e7 (assigned by IEEE). this is Mac-in-Mac
and uses a different, 16-byte header.

Signed-off-by: David Lamparter <equinox@diac24.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_ether.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/if_ether.h b/include/linux/if_ether.h
index 0065ffd3226b..a3d99ff6e3b5 100644
--- a/include/linux/if_ether.h
+++ b/include/linux/if_ether.h
@@ -78,10 +78,15 @@
 					 */
 #define ETH_P_PAE	0x888E		/* Port Access Entity (IEEE 802.1X) */
 #define ETH_P_AOE	0x88A2		/* ATA over Ethernet		*/
+#define ETH_P_8021AD	0x88A8          /* 802.1ad Service VLAN		*/
 #define ETH_P_TIPC	0x88CA		/* TIPC 			*/
+#define ETH_P_8021AH	0x88E7          /* 802.1ah Backbone Service Tag */
 #define ETH_P_1588	0x88F7		/* IEEE 1588 Timesync */
 #define ETH_P_FCOE	0x8906		/* Fibre Channel over Ethernet  */
 #define ETH_P_FIP	0x8914		/* FCoE Initialization Protocol */
+#define ETH_P_QINQ1	0x9100		/* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */
+#define ETH_P_QINQ2	0x9200		/* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */
+#define ETH_P_QINQ3	0x9300		/* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */
 #define ETH_P_EDSA	0xDADA		/* Ethertype DSA [ NOT AN OFFICIALLY REGISTERED ID ] */
 
 /*
-- 
cgit v1.2.3


From 69ecca86da4890c13a5e29c51b4ac76a1a8a62c9 Mon Sep 17 00:00:00 2001
From: David Lamparter <equinox@diac24.net>
Date: Sun, 17 Jul 2011 08:53:12 +0000
Subject: net: vlan, qlcnic: make vlan_find_dev private

there is only one user of vlan_find_dev outside of the actual vlan code:
qlcnic uses it to iterate over some VLANs it knows.

let's just make vlan_find_dev private to the VLAN code and have the
iteration in qlcnic be a bit more direct. (a few rcu dereferences less
too)

Signed-off-by: David Lamparter <equinox@diac24.net>
Cc: Patrick McHardy <kaber@trash.net>
Cc: Amit Kumar Salecha <amit.salecha@qlogic.com>
Cc: Anirban Chakraborty <anirban.chakraborty@qlogic.com>
Cc: linux-driver@qlogic.com
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/qlcnic/qlcnic_main.c |  7 ++++++-
 include/linux/if_vlan.h          | 11 -----------
 net/8021q/vlan.h                 | 12 ++++++++++++
 3 files changed, 18 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/qlcnic/qlcnic_main.c b/drivers/net/qlcnic/qlcnic_main.c
index 006a693d49a7..3579229db4a9 100644
--- a/drivers/net/qlcnic/qlcnic_main.c
+++ b/drivers/net/qlcnic/qlcnic_main.c
@@ -4198,13 +4198,18 @@ static void
 qlcnic_restore_indev_addr(struct net_device *netdev, unsigned long event)
 {
 	struct qlcnic_adapter *adapter = netdev_priv(netdev);
+	struct vlan_group *grp;
 	struct net_device *dev;
 	u16 vid;
 
 	qlcnic_config_indev_addr(adapter, netdev, event);
 
+	grp = rcu_dereference_rtnl(netdev->vlgrp);
+	if (!grp)
+		return;
+
 	for_each_set_bit(vid, adapter->vlans, VLAN_N_VID) {
-		dev = vlan_find_dev(netdev, vid);
+		dev = vlan_group_get_device(grp, vid);
 		if (!dev)
 			continue;
 		qlcnic_config_indev_addr(adapter, dev, event);
diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index affa27380b72..bc03e40fd7fd 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -119,17 +119,6 @@ static inline int is_vlan_dev(struct net_device *dev)
 #define vlan_tx_tag_get(__skb)		((__skb)->vlan_tci & ~VLAN_TAG_PRESENT)
 
 #if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)
-/* Must be invoked with rcu_read_lock or with RTNL. */
-static inline struct net_device *vlan_find_dev(struct net_device *real_dev,
-					       u16 vlan_id)
-{
-	struct vlan_group *grp = rcu_dereference_rtnl(real_dev->vlgrp);
-
-	if (grp)
-		return vlan_group_get_device(grp, vlan_id);
-
-	return NULL;
-}
 
 extern struct net_device *vlan_dev_real_dev(const struct net_device *dev);
 extern u16 vlan_dev_vlan_id(const struct net_device *dev);
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index 9da07e30d1a2..b132f542b44b 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -74,6 +74,18 @@ static inline struct vlan_dev_info *vlan_dev_info(const struct net_device *dev)
 	return netdev_priv(dev);
 }
 
+/* Must be invoked with rcu_read_lock or with RTNL. */
+static inline struct net_device *vlan_find_dev(struct net_device *real_dev,
+					       u16 vlan_id)
+{
+	struct vlan_group *grp = rcu_dereference_rtnl(real_dev->vlgrp);
+
+	if (grp)
+		return vlan_group_get_device(grp, vlan_id);
+
+	return NULL;
+}
+
 /* found in vlan_dev.c */
 void vlan_dev_set_ingress_priority(const struct net_device *dev,
 				   u32 skb_prio, u16 vlan_prio);
-- 
cgit v1.2.3


From 6b75e3e8d664a9a1b99d31a7f4976ae70d1d090a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 18 Jul 2011 16:08:07 +0200
Subject: netfilter: nfnetlink: add RCU in nfnetlink_rcv_msg()

Goal of this patch is to permit nfnetlink providers not mandate
nfnl_mutex being held while nfnetlink_rcv_msg() calls them.

If struct nfnl_callback contains a non NULL call_rcu(), then
nfnetlink_rcv_msg() will use it instead of call() field, holding
rcu_read_lock instead of nfnl_mutex

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Florian Westphal <fw@strlen.de>
CC: Eric Leblond <eric@regit.org>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/nfnetlink.h |  3 +++
 net/netfilter/nfnetlink.c           | 40 +++++++++++++++++++++++++++----------
 2 files changed, 33 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h
index 2b11fc1a86be..74d33861473c 100644
--- a/include/linux/netfilter/nfnetlink.h
+++ b/include/linux/netfilter/nfnetlink.h
@@ -60,6 +60,9 @@ struct nfnl_callback {
 	int (*call)(struct sock *nl, struct sk_buff *skb, 
 		    const struct nlmsghdr *nlh,
 		    const struct nlattr * const cda[]);
+	int (*call_rcu)(struct sock *nl, struct sk_buff *skb, 
+		    const struct nlmsghdr *nlh,
+		    const struct nlattr * const cda[]);
 	const struct nla_policy *policy;	/* netlink attribute policy */
 	const u_int16_t attr_count;		/* number of nlattr's */
 };
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index b4a4532823e8..1905976b5135 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -37,7 +37,7 @@ MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER);
 
 static char __initdata nfversion[] = "0.30";
 
-static const struct nfnetlink_subsystem *subsys_table[NFNL_SUBSYS_COUNT];
+static const struct nfnetlink_subsystem __rcu *subsys_table[NFNL_SUBSYS_COUNT];
 static DEFINE_MUTEX(nfnl_mutex);
 
 void nfnl_lock(void)
@@ -59,7 +59,7 @@ int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n)
 		nfnl_unlock();
 		return -EBUSY;
 	}
-	subsys_table[n->subsys_id] = n;
+	rcu_assign_pointer(subsys_table[n->subsys_id], n);
 	nfnl_unlock();
 
 	return 0;
@@ -71,7 +71,7 @@ int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n)
 	nfnl_lock();
 	subsys_table[n->subsys_id] = NULL;
 	nfnl_unlock();
-
+	synchronize_rcu();
 	return 0;
 }
 EXPORT_SYMBOL_GPL(nfnetlink_subsys_unregister);
@@ -83,7 +83,7 @@ static inline const struct nfnetlink_subsystem *nfnetlink_get_subsys(u_int16_t t
 	if (subsys_id >= NFNL_SUBSYS_COUNT)
 		return NULL;
 
-	return subsys_table[subsys_id];
+	return rcu_dereference(subsys_table[subsys_id]);
 }
 
 static inline const struct nfnl_callback *
@@ -139,21 +139,27 @@ static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 
 	type = nlh->nlmsg_type;
 replay:
+	rcu_read_lock();
 	ss = nfnetlink_get_subsys(type);
 	if (!ss) {
 #ifdef CONFIG_MODULES
-		nfnl_unlock();
+		rcu_read_unlock();
 		request_module("nfnetlink-subsys-%d", NFNL_SUBSYS_ID(type));
-		nfnl_lock();
+		rcu_read_lock();
 		ss = nfnetlink_get_subsys(type);
 		if (!ss)
 #endif
+		{
+			rcu_read_unlock();
 			return -EINVAL;
+		}
 	}
 
 	nc = nfnetlink_find_client(type, ss);
-	if (!nc)
+	if (!nc) {
+		rcu_read_unlock();
 		return -EINVAL;
+	}
 
 	{
 		int min_len = NLMSG_SPACE(sizeof(struct nfgenmsg));
@@ -167,7 +173,23 @@ replay:
 		if (err < 0)
 			return err;
 
-		err = nc->call(net->nfnl, skb, nlh, (const struct nlattr **)cda);
+		if (nc->call_rcu) {
+			err = nc->call_rcu(net->nfnl, skb, nlh,
+					   (const struct nlattr **)cda);
+			rcu_read_unlock();
+		} else {
+			rcu_read_unlock();
+			nfnl_lock();
+			if (rcu_dereference_protected(
+					subsys_table[NFNL_SUBSYS_ID(type)],
+					lockdep_is_held(&nfnl_mutex)) != ss ||
+			    nfnetlink_find_client(type, ss) != nc)
+				err = -EAGAIN;
+			else
+				err = nc->call(net->nfnl, skb, nlh,
+						   (const struct nlattr **)cda);
+			nfnl_unlock();
+		}
 		if (err == -EAGAIN)
 			goto replay;
 		return err;
@@ -176,9 +198,7 @@ replay:
 
 static void nfnetlink_rcv(struct sk_buff *skb)
 {
-	nfnl_lock();
 	netlink_rcv_skb(skb, &nfnetlink_rcv_msg);
-	nfnl_unlock();
 }
 
 static int __net_init nfnetlink_net_init(struct net *net)
-- 
cgit v1.2.3


From 61b8013a114cb041db2c56f747953cac69637f26 Mon Sep 17 00:00:00 2001
From: Srinivas Kandagatla <srinivas.kandagatla@st.com>
Date: Sun, 17 Jul 2011 20:54:09 +0000
Subject: stmmac: Allow SOCs to use Store forward mode eventhough tx_coe is 0.
 (V2)

This patch adds new field 'force_sf_dma_mode' to plat_stmmacenet_data
struct to allow users to specify if they want to use force store forward
eventhough tx_coe is not available in hw.
without this flag stmmac driver will use cut-thru mode not use
store-forward mode.

Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@st.com>
Signed-off-by: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/stmmac/stmmac_main.c | 8 +++++---
 include/linux/stmmac.h           | 1 +
 2 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/stmmac/stmmac_main.c b/drivers/net/stmmac/stmmac_main.c
index c8c9e5bc6608..d37ebc80623a 100644
--- a/drivers/net/stmmac/stmmac_main.c
+++ b/drivers/net/stmmac/stmmac_main.c
@@ -557,9 +557,11 @@ static void free_dma_desc_resources(struct stmmac_priv *priv)
  */
 static void stmmac_dma_operation_mode(struct stmmac_priv *priv)
 {
-	if (likely((priv->plat->tx_coe) && (!priv->no_csum_insertion))) {
-		/* In case of GMAC, SF mode has to be enabled
-		 * to perform the TX COE. This depends on:
+	if (likely(priv->plat->force_sf_dma_mode ||
+		((priv->plat->tx_coe) && (!priv->no_csum_insertion)))) {
+		/*
+		 * In case of GMAC, SF mode can be enabled
+		 * to perform the TX COE in HW. This depends on:
 		 * 1) TX COE if actually supported
 		 * 2) There is no bugged Jumbo frame support
 		 *    that needs to not insert csum in the TDES.
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 9529e49b0385..05d775690b72 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -40,6 +40,7 @@ struct plat_stmmacenet_data {
 	int tx_coe;
 	int bugged_jumbo;
 	int pmt;
+	int force_sf_dma_mode;
 	void (*fix_mac_speed)(void *priv, unsigned int speed);
 	void (*bus_setup)(void __iomem *ioaddr);
 	int (*init)(struct platform_device *pdev);
-- 
cgit v1.2.3


From a07c7964a29b6dc515b120f1e1c223ac2f8666f5 Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Sat, 16 Jul 2011 22:22:20 +0000
Subject: include/linux/sdla.h: remove the prototype of sdla()

`make headers_check` complains that

linux-2.6/usr/include/linux/sdla.h:116: userspace cannot reference
function or variable defined in the kernel

this is due to that there is no such a kernel function,

void sdla(void *cfg_info, char *dev, struct frad_conf *conf, int quiet);

I don't know why we have it in a kernel header, so remove it.

Signed-off-by: WANG Cong <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sdla.h | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sdla.h b/include/linux/sdla.h
index 564acd3a71c1..9995c7fc3f60 100644
--- a/include/linux/sdla.h
+++ b/include/linux/sdla.h
@@ -112,11 +112,7 @@ struct sdla_dlci_conf {
    short Tb_max;
 };
 
-#ifndef __KERNEL__
-
-void sdla(void *cfg_info, char *dev, struct frad_conf *conf, int quiet);
-
-#else
+#ifdef __KERNEL__
 
 /* important Z80 window addresses */
 #define SDLA_CONTROL_WND		0xE000
-- 
cgit v1.2.3


From f701e5b73a1a79ea62ffd45d9e2bed4c7d5c1fd2 Mon Sep 17 00:00:00 2001
From: Vladimir Zapolskiy <vzapolskiy@gmail.com>
Date: Fri, 15 Jul 2011 20:45:18 +0300
Subject: connector: add an event for monitoring process tracers

This change adds a procfs connector event, which is emitted on every
successful process tracer attach or detach.

If some process connects to other one, kernelspace connector reports
process id and thread group id of both these involved processes. On
disconnection null process id is returned.

Such an event allows to create a simple automated userspace mechanism
to be aware about processes connecting to others, therefore predefined
process policies can be applied to them if needed.

Note, a detach signal is emitted only in case, if a tracer process
explicitly executes PTRACE_DETACH request. In other cases like tracee
or tracer exit detach event from proc connector is not reported.

Signed-off-by: Vladimir Zapolskiy <vzapolskiy@gmail.com>
Acked-by: Evgeniy Polyakov <zbr@ioremap.net>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 drivers/connector/cn_proc.c | 35 +++++++++++++++++++++++++++++++++++
 include/linux/cn_proc.h     | 13 +++++++++++++
 kernel/ptrace.c             |  7 ++++++-
 3 files changed, 54 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c
index 2b46a7efa0ac..281902d3f7ec 100644
--- a/drivers/connector/cn_proc.c
+++ b/drivers/connector/cn_proc.c
@@ -28,6 +28,7 @@
 #include <linux/init.h>
 #include <linux/connector.h>
 #include <linux/gfp.h>
+#include <linux/ptrace.h>
 #include <asm/atomic.h>
 #include <asm/unaligned.h>
 
@@ -166,6 +167,40 @@ void proc_sid_connector(struct task_struct *task)
 	cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL);
 }
 
+void proc_ptrace_connector(struct task_struct *task, int ptrace_id)
+{
+	struct cn_msg *msg;
+	struct proc_event *ev;
+	struct timespec ts;
+	__u8 buffer[CN_PROC_MSG_SIZE];
+	struct task_struct *tracer;
+
+	if (atomic_read(&proc_event_num_listeners) < 1)
+		return;
+
+	msg = (struct cn_msg *)buffer;
+	ev = (struct proc_event *)msg->data;
+	get_seq(&msg->seq, &ev->cpu);
+	ktime_get_ts(&ts); /* get high res monotonic timestamp */
+	put_unaligned(timespec_to_ns(&ts), (__u64 *)&ev->timestamp_ns);
+	ev->what = PROC_EVENT_PTRACE;
+	ev->event_data.ptrace.process_pid  = task->pid;
+	ev->event_data.ptrace.process_tgid = task->tgid;
+	if (ptrace_id == PTRACE_ATTACH) {
+		ev->event_data.ptrace.tracer_pid  = current->pid;
+		ev->event_data.ptrace.tracer_tgid = current->tgid;
+	} else if (ptrace_id == PTRACE_DETACH) {
+		ev->event_data.ptrace.tracer_pid  = 0;
+		ev->event_data.ptrace.tracer_tgid = 0;
+	} else
+		return;
+
+	memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
+	msg->ack = 0; /* not used */
+	msg->len = sizeof(*ev);
+	cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL);
+}
+
 void proc_exit_connector(struct task_struct *task)
 {
 	struct cn_msg *msg;
diff --git a/include/linux/cn_proc.h b/include/linux/cn_proc.h
index 47dac5ea8d3a..12c517b51ca2 100644
--- a/include/linux/cn_proc.h
+++ b/include/linux/cn_proc.h
@@ -53,6 +53,7 @@ struct proc_event {
 		PROC_EVENT_UID  = 0x00000004,
 		PROC_EVENT_GID  = 0x00000040,
 		PROC_EVENT_SID  = 0x00000080,
+		PROC_EVENT_PTRACE = 0x00000100,
 		/* "next" should be 0x00000400 */
 		/* "last" is the last process event: exit */
 		PROC_EVENT_EXIT = 0x80000000
@@ -95,6 +96,13 @@ struct proc_event {
 			__kernel_pid_t process_tgid;
 		} sid;
 
+		struct ptrace_proc_event {
+			__kernel_pid_t process_pid;
+			__kernel_pid_t process_tgid;
+			__kernel_pid_t tracer_pid;
+			__kernel_pid_t tracer_tgid;
+		} ptrace;
+
 		struct exit_proc_event {
 			__kernel_pid_t process_pid;
 			__kernel_pid_t process_tgid;
@@ -109,6 +117,7 @@ void proc_fork_connector(struct task_struct *task);
 void proc_exec_connector(struct task_struct *task);
 void proc_id_connector(struct task_struct *task, int which_id);
 void proc_sid_connector(struct task_struct *task);
+void proc_ptrace_connector(struct task_struct *task, int which_id);
 void proc_exit_connector(struct task_struct *task);
 #else
 static inline void proc_fork_connector(struct task_struct *task)
@@ -124,6 +133,10 @@ static inline void proc_id_connector(struct task_struct *task,
 static inline void proc_sid_connector(struct task_struct *task)
 {}
 
+static inline void proc_ptrace_connector(struct task_struct *task,
+					 int ptrace_id)
+{}
+
 static inline void proc_exit_connector(struct task_struct *task)
 {}
 #endif	/* CONFIG_PROC_EVENTS */
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index d7ccc79454f5..9de3ecfd20f9 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -23,6 +23,7 @@
 #include <linux/uaccess.h>
 #include <linux/regset.h>
 #include <linux/hw_breakpoint.h>
+#include <linux/cn_proc.h>
 
 
 static int ptrace_trapping_sleep_fn(void *flags)
@@ -305,9 +306,12 @@ unlock_tasklist:
 unlock_creds:
 	mutex_unlock(&task->signal->cred_guard_mutex);
 out:
-	if (!retval)
+	if (!retval) {
 		wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT,
 			    ptrace_trapping_sleep_fn, TASK_UNINTERRUPTIBLE);
+		proc_ptrace_connector(task, PTRACE_ATTACH);
+	}
+
 	return retval;
 }
 
@@ -415,6 +419,7 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
 	}
 	write_unlock_irq(&tasklist_lock);
 
+	proc_ptrace_connector(child, PTRACE_DETACH);
 	if (unlikely(dead))
 		release_task(child);
 
-- 
cgit v1.2.3


From 90e33f62e027d330485d03598e1b2d8db3ff031c Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Tue, 5 Jul 2011 23:42:28 -0600
Subject: of/address: Add of_find_matching_node_by_address helper

of_find_matching_node_by_address() can be used to find a device tree
node for a device at a specific address.

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
---
 drivers/of/address.c       | 18 ++++++++++++++++++
 include/linux/of_address.h |  4 ++++
 2 files changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/of/address.c b/drivers/of/address.c
index b4559c58c095..da1f4b9605df 100644
--- a/drivers/of/address.c
+++ b/drivers/of/address.c
@@ -577,6 +577,24 @@ int of_address_to_resource(struct device_node *dev, int index,
 }
 EXPORT_SYMBOL_GPL(of_address_to_resource);
 
+struct device_node *of_find_matching_node_by_address(struct device_node *from,
+					const struct of_device_id *matches,
+					u64 base_address)
+{
+	struct device_node *dn = of_find_matching_node(from, matches);
+	struct resource res;
+
+	while (dn) {
+		if (of_address_to_resource(dn, 0, &res))
+			continue;
+		if (res.start == base_address)
+			return dn;
+		dn = of_find_matching_node(dn, matches);
+	}
+
+	return NULL;
+}
+
 
 /**
  * of_iomap - Maps the memory mapped IO for a given device_node
diff --git a/include/linux/of_address.h b/include/linux/of_address.h
index 2feda6ee6140..bdfc20d8ff84 100644
--- a/include/linux/of_address.h
+++ b/include/linux/of_address.h
@@ -6,6 +6,10 @@
 extern u64 of_translate_address(struct device_node *np, const __be32 *addr);
 extern int of_address_to_resource(struct device_node *dev, int index,
 				  struct resource *r);
+extern struct device_node *of_find_matching_node_by_address(
+					struct device_node *from,
+					const struct of_device_id *matches,
+					u64 base_address);
 extern void __iomem *of_iomap(struct device_node *device, int index);
 
 /* Extract an address from a device, returns the region size and
-- 
cgit v1.2.3


From 99ce39e359fa29e4b609a6a13485e7573eda5dfb Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Tue, 5 Jul 2011 23:42:37 -0600
Subject: dt: include linux/errno.h in linux/of_address.h

of_address.h makes reference to some of the error code #defines, so it
needs to include errno.h.  If CONFIG_PCI is not selected, then some files
will fail to compile.

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
---
 include/linux/of_address.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/of_address.h b/include/linux/of_address.h
index bdfc20d8ff84..3118623c2c1f 100644
--- a/include/linux/of_address.h
+++ b/include/linux/of_address.h
@@ -1,6 +1,7 @@
 #ifndef __OF_ADDRESS_H
 #define __OF_ADDRESS_H
 #include <linux/ioport.h>
+#include <linux/errno.h>
 #include <linux/of.h>
 
 extern u64 of_translate_address(struct device_node *np, const __be32 *addr);
-- 
cgit v1.2.3


From 52eafc68d601afd699b023201b0c6be5209f39ce Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Wed, 15 Jun 2011 14:41:42 +0000
Subject: mlx4_core: Extend capability flags to 64 bits

The latest firmware adds a second dword containing more device flags,
so extend the device capabilities flags field from 32 to 64 bits.
Derived from patch by Eli Cohen <eli@mellanox.co.il>

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.co.il>
Signed-off-by: Roland Dreier <roland@purestorage.com>
---
 drivers/net/mlx4/fw.c       |  9 +++++----
 drivers/net/mlx4/fw.h       |  2 +-
 include/linux/mlx4/device.h | 34 +++++++++++++++++-----------------
 3 files changed, 23 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c
index 67a209ba939d..346d8b3d03df 100644
--- a/drivers/net/mlx4/fw.c
+++ b/drivers/net/mlx4/fw.c
@@ -75,7 +75,7 @@ MODULE_PARM_DESC(enable_qos, "Enable Quality of Service support in the HCA (defa
 		}						      \
 	} while (0)
 
-static void dump_dev_cap_flags(struct mlx4_dev *dev, u32 flags)
+static void dump_dev_cap_flags(struct mlx4_dev *dev, u64 flags)
 {
 	static const char *fname[] = {
 		[ 0] = "RC transport",
@@ -105,7 +105,7 @@ static void dump_dev_cap_flags(struct mlx4_dev *dev, u32 flags)
 
 	mlx4_dbg(dev, "DEV_CAP flags:\n");
 	for (i = 0; i < ARRAY_SIZE(fname); ++i)
-		if (fname[i] && (flags & (1 << i)))
+		if (fname[i] && (flags & (1LL << i)))
 			mlx4_dbg(dev, "    %s\n", fname[i]);
 }
 
@@ -142,7 +142,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	struct mlx4_cmd_mailbox *mailbox;
 	u32 *outbox;
 	u8 field;
-	u32 field32;
+	u32 field32, flags;
 	u16 size;
 	u16 stat_rate;
 	int err;
@@ -279,7 +279,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	MLX4_GET(field, outbox, QUERY_DEV_CAP_ETH_UC_LOOPBACK_OFFSET);
 	dev_cap->loopback_support = field & 0x1;
 	dev_cap->wol = field & 0x40;
-	MLX4_GET(dev_cap->flags, outbox, QUERY_DEV_CAP_FLAGS_OFFSET);
+	MLX4_GET(flags, outbox, QUERY_DEV_CAP_FLAGS_OFFSET);
+	dev_cap->flags = flags;
 	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_UAR_OFFSET);
 	dev_cap->reserved_uars = field >> 4;
 	MLX4_GET(field, outbox, QUERY_DEV_CAP_UAR_SZ_OFFSET);
diff --git a/drivers/net/mlx4/fw.h b/drivers/net/mlx4/fw.h
index 88003ebc6185..2a8d11008c51 100644
--- a/drivers/net/mlx4/fw.h
+++ b/drivers/net/mlx4/fw.h
@@ -83,7 +83,7 @@ struct mlx4_dev_cap {
 	int vep_uc_steering;
 	int vep_mc_steering;
 	int wol;
-	u32 flags;
+	u64 flags;
 	int reserved_uars;
 	int uar_size;
 	int min_page_sz;
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 8985768e2c0d..f337e989b70f 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -58,22 +58,22 @@ enum {
 };
 
 enum {
-	MLX4_DEV_CAP_FLAG_RC		= 1 <<  0,
-	MLX4_DEV_CAP_FLAG_UC		= 1 <<  1,
-	MLX4_DEV_CAP_FLAG_UD		= 1 <<  2,
-	MLX4_DEV_CAP_FLAG_SRQ		= 1 <<  6,
-	MLX4_DEV_CAP_FLAG_IPOIB_CSUM	= 1 <<  7,
-	MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR	= 1 <<  8,
-	MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR	= 1 <<  9,
-	MLX4_DEV_CAP_FLAG_DPDP		= 1 << 12,
-	MLX4_DEV_CAP_FLAG_BLH		= 1 << 15,
-	MLX4_DEV_CAP_FLAG_MEM_WINDOW	= 1 << 16,
-	MLX4_DEV_CAP_FLAG_APM		= 1 << 17,
-	MLX4_DEV_CAP_FLAG_ATOMIC	= 1 << 18,
-	MLX4_DEV_CAP_FLAG_RAW_MCAST	= 1 << 19,
-	MLX4_DEV_CAP_FLAG_UD_AV_PORT	= 1 << 20,
-	MLX4_DEV_CAP_FLAG_UD_MCAST	= 1 << 21,
-	MLX4_DEV_CAP_FLAG_IBOE		= 1 << 30
+	MLX4_DEV_CAP_FLAG_RC		= 1LL <<  0,
+	MLX4_DEV_CAP_FLAG_UC		= 1LL <<  1,
+	MLX4_DEV_CAP_FLAG_UD		= 1LL <<  2,
+	MLX4_DEV_CAP_FLAG_SRQ		= 1LL <<  6,
+	MLX4_DEV_CAP_FLAG_IPOIB_CSUM	= 1LL <<  7,
+	MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR	= 1LL <<  8,
+	MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR	= 1LL <<  9,
+	MLX4_DEV_CAP_FLAG_DPDP		= 1LL << 12,
+	MLX4_DEV_CAP_FLAG_BLH		= 1LL << 15,
+	MLX4_DEV_CAP_FLAG_MEM_WINDOW	= 1LL << 16,
+	MLX4_DEV_CAP_FLAG_APM		= 1LL << 17,
+	MLX4_DEV_CAP_FLAG_ATOMIC	= 1LL << 18,
+	MLX4_DEV_CAP_FLAG_RAW_MCAST	= 1LL << 19,
+	MLX4_DEV_CAP_FLAG_UD_AV_PORT	= 1LL << 20,
+	MLX4_DEV_CAP_FLAG_UD_MCAST	= 1LL << 21,
+	MLX4_DEV_CAP_FLAG_IBOE		= 1LL << 30
 };
 
 enum {
@@ -253,7 +253,7 @@ struct mlx4_caps {
 	int			mtt_entry_sz;
 	u32			max_msg_sz;
 	u32			page_size_cap;
-	u32			flags;
+	u64			flags;
 	u32			bmme_flags;
 	u32			reserved_lkey;
 	u16			stat_rate_support;
-- 
cgit v1.2.3


From ccf863219675aa86bebdd6a2806acb8176478e37 Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Thu, 7 Jul 2011 19:19:29 +0000
Subject: mlx4_core: Read extended capabilities into the flags field

Query another dword containing up to 32 extended device capabilities
and merge it into struct mlx4_caps.flags.  Update the code that
handles the current extended device capabilities (e.g UDP RSS, WoL,
vep steering, etc) to use the extended device cap flags field instead
of a field per extended capability.  Initial patch done by Eli Cohen
<eli@mellanox.co.il>.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.co.il>
Signed-off-by: Roland Dreier <roland@purestorage.com>
---
 drivers/net/mlx4/en_ethtool.c  |  9 +++++----
 drivers/net/mlx4/en_main.c     |  3 ++-
 drivers/net/mlx4/en_netdev.c   |  5 +++--
 drivers/net/mlx4/en_port.c     |  6 ++++--
 drivers/net/mlx4/en_selftest.c |  3 ++-
 drivers/net/mlx4/fw.c          | 24 +++++++++++-------------
 drivers/net/mlx4/fw.h          |  5 -----
 drivers/net/mlx4/main.c        |  5 -----
 drivers/net/mlx4/mcg.c         | 17 ++++++++++-------
 drivers/net/mlx4/port.c        |  8 ++++----
 include/linux/mlx4/device.h    | 12 ++++++------
 11 files changed, 47 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/mlx4/en_ethtool.c b/drivers/net/mlx4/en_ethtool.c
index 2e858e4dcf4d..eb096253d781 100644
--- a/drivers/net/mlx4/en_ethtool.c
+++ b/drivers/net/mlx4/en_ethtool.c
@@ -104,7 +104,7 @@ static void mlx4_en_get_wol(struct net_device *netdev,
 	int err = 0;
 	u64 config = 0;
 
-	if (!priv->mdev->dev->caps.wol) {
+	if (!(priv->mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_WOL)) {
 		wol->supported = 0;
 		wol->wolopts = 0;
 		return;
@@ -134,7 +134,7 @@ static int mlx4_en_set_wol(struct net_device *netdev,
 	u64 config = 0;
 	int err = 0;
 
-	if (!priv->mdev->dev->caps.wol)
+	if (!(priv->mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_WOL))
 		return -EOPNOTSUPP;
 
 	if (wol->supported & ~WAKE_MAGIC)
@@ -170,7 +170,8 @@ static int mlx4_en_get_sset_count(struct net_device *dev, int sset)
 		return NUM_ALL_STATS +
 			(priv->tx_ring_num + priv->rx_ring_num) * 2;
 	case ETH_SS_TEST:
-		return MLX4_EN_NUM_SELF_TEST - !(priv->mdev->dev->caps.loopback_support) * 2;
+		return MLX4_EN_NUM_SELF_TEST - !(priv->mdev->dev->caps.flags
+					& MLX4_DEV_CAP_FLAG_UC_LOOPBACK) * 2;
 	default:
 		return -EOPNOTSUPP;
 	}
@@ -220,7 +221,7 @@ static void mlx4_en_get_strings(struct net_device *dev,
 	case ETH_SS_TEST:
 		for (i = 0; i < MLX4_EN_NUM_SELF_TEST - 2; i++)
 			strcpy(data + i * ETH_GSTRING_LEN, mlx4_en_test_names[i]);
-		if (priv->mdev->dev->caps.loopback_support)
+		if (priv->mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UC_LOOPBACK)
 			for (; i < MLX4_EN_NUM_SELF_TEST; i++)
 				strcpy(data + i * ETH_GSTRING_LEN, mlx4_en_test_names[i]);
 		break;
diff --git a/drivers/net/mlx4/en_main.c b/drivers/net/mlx4/en_main.c
index 9276b1b25586..6bfea233a9f2 100644
--- a/drivers/net/mlx4/en_main.c
+++ b/drivers/net/mlx4/en_main.c
@@ -106,7 +106,8 @@ static int mlx4_en_get_profile(struct mlx4_en_dev *mdev)
 
 	params->tcp_rss = tcp_rss;
 	params->udp_rss = udp_rss;
-	if (params->udp_rss && !mdev->dev->caps.udp_rss) {
+	if (params->udp_rss && !(mdev->dev->caps.flags
+					& MLX4_DEV_CAP_FLAG_UDP_RSS)) {
 		mlx4_warn(mdev, "UDP RSS is not supported on this device.\n");
 		params->udp_rss = 0;
 	}
diff --git a/drivers/net/mlx4/en_netdev.c b/drivers/net/mlx4/en_netdev.c
index 61850adae6f7..aa6e73e7b6bb 100644
--- a/drivers/net/mlx4/en_netdev.c
+++ b/drivers/net/mlx4/en_netdev.c
@@ -239,7 +239,8 @@ static void mlx4_en_do_set_multicast(struct work_struct *work)
 			priv->flags |= MLX4_EN_FLAG_PROMISC;
 
 			/* Enable promiscouos mode */
-			if (!mdev->dev->caps.vep_uc_steering)
+			if (!(mdev->dev->caps.flags &
+						MLX4_DEV_CAP_FLAG_VEP_UC_STEER))
 				err = mlx4_SET_PORT_qpn_calc(mdev->dev, priv->port,
 							     priv->base_qpn, 1);
 			else
@@ -285,7 +286,7 @@ static void mlx4_en_do_set_multicast(struct work_struct *work)
 		priv->flags &= ~MLX4_EN_FLAG_PROMISC;
 
 		/* Disable promiscouos mode */
-		if (!mdev->dev->caps.vep_uc_steering)
+		if (!(mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER))
 			err = mlx4_SET_PORT_qpn_calc(mdev->dev, priv->port,
 						     priv->base_qpn, 0);
 		else
diff --git a/drivers/net/mlx4/en_port.c b/drivers/net/mlx4/en_port.c
index f2a4f5dd313d..be861fbc1a4d 100644
--- a/drivers/net/mlx4/en_port.c
+++ b/drivers/net/mlx4/en_port.c
@@ -119,9 +119,11 @@ int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn,
 	struct mlx4_set_port_rqp_calc_context *context;
 	int err;
 	u32 in_mod;
-	u32 m_promisc = (dev->caps.vep_mc_steering) ? MCAST_DIRECT : MCAST_DEFAULT;
+	u32 m_promisc = (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER) ?
+						MCAST_DIRECT : MCAST_DEFAULT;
 
-	if (dev->caps.vep_mc_steering && dev->caps.vep_uc_steering)
+	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER  &&
+			dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER)
 		return 0;
 
 	mailbox = mlx4_alloc_cmd_mailbox(dev);
diff --git a/drivers/net/mlx4/en_selftest.c b/drivers/net/mlx4/en_selftest.c
index 191a8dcd8a93..9fdbcecd499d 100644
--- a/drivers/net/mlx4/en_selftest.c
+++ b/drivers/net/mlx4/en_selftest.c
@@ -159,7 +159,8 @@ retry_tx:
 				goto retry_tx;
 		}
 
-		if (priv->mdev->dev->caps.loopback_support){
+		if (priv->mdev->dev->caps.flags &
+					MLX4_DEV_CAP_FLAG_UC_LOOPBACK) {
 			buf[3] = mlx4_en_test_registers(priv);
 			buf[4] = mlx4_en_test_loopback(priv);
 		}
diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c
index 346d8b3d03df..1d3fc6d7689b 100644
--- a/drivers/net/mlx4/fw.c
+++ b/drivers/net/mlx4/fw.c
@@ -99,7 +99,12 @@ static void dump_dev_cap_flags(struct mlx4_dev *dev, u64 flags)
 		[21] = "UD multicast support",
 		[24] = "Demand paging support",
 		[25] = "Router support",
-		[30] = "IBoE support"
+		[30] = "IBoE support",
+		[32] = "Unicast loopback support",
+		[38] = "Wake On LAN support",
+		[40] = "UDP RSS support",
+		[41] = "Unicast VEP steering support",
+		[42] = "Multicast VEP steering support"
 	};
 	int i;
 
@@ -142,7 +147,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	struct mlx4_cmd_mailbox *mailbox;
 	u32 *outbox;
 	u8 field;
-	u32 field32, flags;
+	u32 field32, flags, ext_flags;
 	u16 size;
 	u16 stat_rate;
 	int err;
@@ -180,8 +185,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 #define QUERY_DEV_CAP_MAX_GID_OFFSET		0x3b
 #define QUERY_DEV_CAP_RATE_SUPPORT_OFFSET	0x3c
 #define QUERY_DEV_CAP_MAX_PKEY_OFFSET		0x3f
-#define QUERY_DEV_CAP_UDP_RSS_OFFSET		0x42
-#define QUERY_DEV_CAP_ETH_UC_LOOPBACK_OFFSET	0x43
+#define QUERY_DEV_CAP_EXT_FLAGS_OFFSET		0x40
 #define QUERY_DEV_CAP_FLAGS_OFFSET		0x44
 #define QUERY_DEV_CAP_RSVD_UAR_OFFSET		0x48
 #define QUERY_DEV_CAP_UAR_SZ_OFFSET		0x49
@@ -272,15 +276,9 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	dev_cap->max_msg_sz = 1 << (field & 0x1f);
 	MLX4_GET(stat_rate, outbox, QUERY_DEV_CAP_RATE_SUPPORT_OFFSET);
 	dev_cap->stat_rate_support = stat_rate;
-	MLX4_GET(field, outbox, QUERY_DEV_CAP_UDP_RSS_OFFSET);
-	dev_cap->udp_rss = field & 0x1;
-	dev_cap->vep_uc_steering = field & 0x2;
-	dev_cap->vep_mc_steering = field & 0x4;
-	MLX4_GET(field, outbox, QUERY_DEV_CAP_ETH_UC_LOOPBACK_OFFSET);
-	dev_cap->loopback_support = field & 0x1;
-	dev_cap->wol = field & 0x40;
+	MLX4_GET(ext_flags, outbox, QUERY_DEV_CAP_EXT_FLAGS_OFFSET);
 	MLX4_GET(flags, outbox, QUERY_DEV_CAP_FLAGS_OFFSET);
-	dev_cap->flags = flags;
+	dev_cap->flags = flags | (u64)ext_flags << 32;
 	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_UAR_OFFSET);
 	dev_cap->reserved_uars = field >> 4;
 	MLX4_GET(field, outbox, QUERY_DEV_CAP_UAR_SZ_OFFSET);
@@ -802,7 +800,7 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param)
 	MLX4_PUT(inbox, param->mc_base,		INIT_HCA_MC_BASE_OFFSET);
 	MLX4_PUT(inbox, param->log_mc_entry_sz, INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET);
 	MLX4_PUT(inbox, param->log_mc_hash_sz,  INIT_HCA_LOG_MC_HASH_SZ_OFFSET);
-	if (dev->caps.vep_mc_steering)
+	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER)
 		MLX4_PUT(inbox, (u8) (1 << 3),	INIT_HCA_UC_STEERING_OFFSET);
 	MLX4_PUT(inbox, param->log_mc_table_sz, INIT_HCA_LOG_MC_TABLE_SZ_OFFSET);
 
diff --git a/drivers/net/mlx4/fw.h b/drivers/net/mlx4/fw.h
index 2a8d11008c51..56ed1646cced 100644
--- a/drivers/net/mlx4/fw.h
+++ b/drivers/net/mlx4/fw.h
@@ -78,11 +78,6 @@ struct mlx4_dev_cap {
 	u16 wavelength[MLX4_MAX_PORTS + 1];
 	u64 trans_code[MLX4_MAX_PORTS + 1];
 	u16 stat_rate_support;
-	int udp_rss;
-	int loopback_support;
-	int vep_uc_steering;
-	int vep_mc_steering;
-	int wol;
 	u64 flags;
 	int reserved_uars;
 	int uar_size;
diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c
index 3814fc9b1145..650f4ca8606e 100644
--- a/drivers/net/mlx4/main.c
+++ b/drivers/net/mlx4/main.c
@@ -226,11 +226,6 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	dev->caps.bmme_flags	     = dev_cap->bmme_flags;
 	dev->caps.reserved_lkey	     = dev_cap->reserved_lkey;
 	dev->caps.stat_rate_support  = dev_cap->stat_rate_support;
-	dev->caps.udp_rss	     = dev_cap->udp_rss;
-	dev->caps.loopback_support   = dev_cap->loopback_support;
-	dev->caps.vep_uc_steering    = dev_cap->vep_uc_steering;
-	dev->caps.vep_mc_steering    = dev_cap->vep_mc_steering;
-	dev->caps.wol		     = dev_cap->wol;
 	dev->caps.max_gso_sz	     = dev_cap->max_gso_sz;
 
 	dev->caps.log_num_macs  = log_num_mac;
diff --git a/drivers/net/mlx4/mcg.c b/drivers/net/mlx4/mcg.c
index e63c37d6a115..cd1784593a3c 100644
--- a/drivers/net/mlx4/mcg.c
+++ b/drivers/net/mlx4/mcg.c
@@ -559,7 +559,8 @@ static int find_entry(struct mlx4_dev *dev, u8 port,
 	struct mlx4_mgm *mgm = mgm_mailbox->buf;
 	u8 *mgid;
 	int err;
-	u8 op_mod = (prot == MLX4_PROT_ETH) ? !!(dev->caps.vep_mc_steering) : 0;
+	u8 op_mod = (prot == MLX4_PROT_ETH) ?
+		!!(dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER) : 0;
 
 	mailbox = mlx4_alloc_cmd_mailbox(dev);
 	if (IS_ERR(mailbox))
@@ -834,7 +835,8 @@ int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
 
 	steer = (is_valid_ether_addr(&gid[10])) ? MLX4_UC_STEER : MLX4_MC_STEER;
 
-	if (prot == MLX4_PROT_ETH && !dev->caps.vep_mc_steering)
+	if (prot == MLX4_PROT_ETH &&
+			!(dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER))
 		return 0;
 
 	if (prot == MLX4_PROT_ETH)
@@ -853,7 +855,8 @@ int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
 
 	steer = (is_valid_ether_addr(&gid[10])) ? MLX4_UC_STEER : MLX4_MC_STEER;
 
-	if (prot == MLX4_PROT_ETH && !dev->caps.vep_mc_steering)
+	if (prot == MLX4_PROT_ETH &&
+			!(dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER))
 		return 0;
 
 	if (prot == MLX4_PROT_ETH) {
@@ -867,7 +870,7 @@ EXPORT_SYMBOL_GPL(mlx4_multicast_detach);
 
 int mlx4_multicast_promisc_add(struct mlx4_dev *dev, u32 qpn, u8 port)
 {
-	if (!dev->caps.vep_mc_steering)
+	if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER))
 		return 0;
 
 
@@ -877,7 +880,7 @@ EXPORT_SYMBOL_GPL(mlx4_multicast_promisc_add);
 
 int mlx4_multicast_promisc_remove(struct mlx4_dev *dev, u32 qpn, u8 port)
 {
-	if (!dev->caps.vep_mc_steering)
+	if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER))
 		return 0;
 
 
@@ -887,7 +890,7 @@ EXPORT_SYMBOL_GPL(mlx4_multicast_promisc_remove);
 
 int mlx4_unicast_promisc_add(struct mlx4_dev *dev, u32 qpn, u8 port)
 {
-	if (!dev->caps.vep_mc_steering)
+	if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER))
 		return 0;
 
 
@@ -897,7 +900,7 @@ EXPORT_SYMBOL_GPL(mlx4_unicast_promisc_add);
 
 int mlx4_unicast_promisc_remove(struct mlx4_dev *dev, u32 qpn, u8 port)
 {
-	if (!dev->caps.vep_mc_steering)
+	if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER))
 		return 0;
 
 	return remove_promisc_qp(dev, 0, port, MLX4_UC_STEER, qpn);
diff --git a/drivers/net/mlx4/port.c b/drivers/net/mlx4/port.c
index 8856659fb43c..1f95afda6841 100644
--- a/drivers/net/mlx4/port.c
+++ b/drivers/net/mlx4/port.c
@@ -146,7 +146,7 @@ int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *qpn, u8 wrap)
 	int i, err = 0;
 	int free = -1;
 
-	if (dev->caps.vep_uc_steering) {
+	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER) {
 		err = mlx4_uc_steer_add(dev, port, mac, qpn, 1);
 		if (!err) {
 			entry = kmalloc(sizeof *entry, GFP_KERNEL);
@@ -203,7 +203,7 @@ int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *qpn, u8 wrap)
 		goto out;
 	}
 
-	if (!dev->caps.vep_uc_steering)
+	if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER))
 		*qpn = info->base_qpn + free;
 	++table->total;
 out:
@@ -243,7 +243,7 @@ void mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, int qpn)
 	int index = qpn - info->base_qpn;
 	struct mlx4_mac_entry *entry;
 
-	if (dev->caps.vep_uc_steering) {
+	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER) {
 		entry = radix_tree_lookup(&info->mac_tree, qpn);
 		if (entry) {
 			mlx4_uc_steer_release(dev, port, entry->mac, qpn, 1);
@@ -274,7 +274,7 @@ int mlx4_replace_mac(struct mlx4_dev *dev, u8 port, int qpn, u64 new_mac, u8 wra
 	struct mlx4_mac_entry *entry;
 	int err;
 
-	if (dev->caps.vep_uc_steering) {
+	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER) {
 		entry = radix_tree_lookup(&info->mac_tree, qpn);
 		if (!entry)
 			return -EINVAL;
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index f337e989b70f..fa861134928a 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -73,7 +73,12 @@ enum {
 	MLX4_DEV_CAP_FLAG_RAW_MCAST	= 1LL << 19,
 	MLX4_DEV_CAP_FLAG_UD_AV_PORT	= 1LL << 20,
 	MLX4_DEV_CAP_FLAG_UD_MCAST	= 1LL << 21,
-	MLX4_DEV_CAP_FLAG_IBOE		= 1LL << 30
+	MLX4_DEV_CAP_FLAG_IBOE		= 1LL << 30,
+	MLX4_DEV_CAP_FLAG_UC_LOOPBACK	= 1LL << 32,
+	MLX4_DEV_CAP_FLAG_WOL		= 1LL << 38,
+	MLX4_DEV_CAP_FLAG_UDP_RSS	= 1LL << 40,
+	MLX4_DEV_CAP_FLAG_VEP_UC_STEER	= 1LL << 41,
+	MLX4_DEV_CAP_FLAG_VEP_MC_STEER	= 1LL << 42
 };
 
 enum {
@@ -257,11 +262,6 @@ struct mlx4_caps {
 	u32			bmme_flags;
 	u32			reserved_lkey;
 	u16			stat_rate_support;
-	int			udp_rss;
-	int			loopback_support;
-	int			vep_uc_steering;
-	int			vep_mc_steering;
-	int			wol;
 	u8			port_width_cap[MLX4_MAX_PORTS + 1];
 	int			max_gso_sz;
 	int                     reserved_qps_cnt[MLX4_NUM_QP_REGION];
-- 
cgit v1.2.3


From 98a13e487a3bdac8508e4dcb98d63385fabe6767 Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Wed, 15 Jun 2011 14:43:43 +0000
Subject: mlx4_core: Fix location of counter index in QP context struct

Fix the address handle portion of the QP context structure to have the
correct bit location for the counter index field.

Signed-off-by: Eli Cohen <eli@mellanox.co.il>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.co.il>
Signed-off-by: Roland Dreier <roland@purestorage.com>
---
 include/linux/mlx4/qp.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h
index 9e9eb21056ca..6486d880bdbf 100644
--- a/include/linux/mlx4/qp.h
+++ b/include/linux/mlx4/qp.h
@@ -99,7 +99,7 @@ struct mlx4_qp_path {
 	u8			fl;
 	u8			reserved1[2];
 	u8			pkey_index;
-	u8			reserved2;
+	u8			counter_index;
 	u8			grh_mylmc;
 	__be16			rlid;
 	u8			ackto;
@@ -111,8 +111,7 @@ struct mlx4_qp_path {
 	u8			sched_queue;
 	u8			vlan_index;
 	u8			reserved3[2];
-	u8			counter_index;
-	u8			reserved4;
+	u8			reserved4[2];
 	u8			dmac[6];
 };
 
-- 
cgit v1.2.3


From f2a3f6a32cf64db1495b5ced8625b9a80bde44e5 Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Wed, 15 Jun 2011 14:47:14 +0000
Subject: mlx4_core: Add network flow counters

ConnectX devices support a set of flow counters that can be attached
to a set containing one or more QPs.  Each such counter tracks receive
and transmit packets and bytes of these QPs.  This patch queries the
device to check support for counters, handles initialization of the
HCA to enable counters, and initializes a bitmap allocator to control
counter allocations.  Derived from patch by Eli Cohen <eli@mellanox.co.il>.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.co.il>
Signed-off-by: Roland Dreier <roland@purestorage.com>
---
 drivers/net/mlx4/fw.c       | 12 +++++++++-
 drivers/net/mlx4/fw.h       |  1 +
 drivers/net/mlx4/main.c     | 53 +++++++++++++++++++++++++++++++++++++++++++++
 drivers/net/mlx4/mlx4.h     |  1 +
 include/linux/mlx4/cmd.h    |  3 +++
 include/linux/mlx4/device.h | 18 ++++++++++++++-
 6 files changed, 86 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c
index 1d3fc6d7689b..7eb8ba822e97 100644
--- a/drivers/net/mlx4/fw.c
+++ b/drivers/net/mlx4/fw.c
@@ -104,7 +104,8 @@ static void dump_dev_cap_flags(struct mlx4_dev *dev, u64 flags)
 		[38] = "Wake On LAN support",
 		[40] = "UDP RSS support",
 		[41] = "Unicast VEP steering support",
-		[42] = "Multicast VEP steering support"
+		[42] = "Multicast VEP steering support",
+		[48] = "Counters support",
 	};
 	int i;
 
@@ -203,6 +204,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 #define QUERY_DEV_CAP_MAX_MCG_OFFSET		0x63
 #define QUERY_DEV_CAP_RSVD_PD_OFFSET		0x64
 #define QUERY_DEV_CAP_MAX_PD_OFFSET		0x65
+#define QUERY_DEV_CAP_MAX_COUNTERS_OFFSET	0x68
 #define QUERY_DEV_CAP_RDMARC_ENTRY_SZ_OFFSET	0x80
 #define QUERY_DEV_CAP_QPC_ENTRY_SZ_OFFSET	0x82
 #define QUERY_DEV_CAP_AUX_ENTRY_SZ_OFFSET	0x84
@@ -355,6 +357,9 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 		 QUERY_DEV_CAP_RSVD_LKEY_OFFSET);
 	MLX4_GET(dev_cap->max_icm_sz, outbox,
 		 QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET);
+	if (dev_cap->flags & MLX4_DEV_CAP_FLAG_COUNTERS)
+		MLX4_GET(dev_cap->max_counters, outbox,
+			 QUERY_DEV_CAP_MAX_COUNTERS_OFFSET);
 
 	if (dev->flags & MLX4_FLAG_OLD_PORT_CMDS) {
 		for (i = 1; i <= dev_cap->num_ports; ++i) {
@@ -448,6 +453,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	mlx4_dbg(dev, "Max RQ desc size: %d, max RQ S/G: %d\n",
 		 dev_cap->max_rq_desc_sz, dev_cap->max_rq_sg);
 	mlx4_dbg(dev, "Max GSO size: %d\n", dev_cap->max_gso_sz);
+	mlx4_dbg(dev, "Max counters: %d\n", dev_cap->max_counters);
 
 	dump_dev_cap_flags(dev, dev_cap->flags);
 
@@ -780,6 +786,10 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param)
 	if (enable_qos)
 		*(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 2);
 
+	/* enable counters */
+	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS)
+		*(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 4);
+
 	/* QPC/EEC/CQC/EQC/RDMARC attributes */
 
 	MLX4_PUT(inbox, param->qpc_base,      INIT_HCA_QPC_BASE_OFFSET);
diff --git a/drivers/net/mlx4/fw.h b/drivers/net/mlx4/fw.h
index 56ed1646cced..1e8ecc3708e2 100644
--- a/drivers/net/mlx4/fw.h
+++ b/drivers/net/mlx4/fw.h
@@ -111,6 +111,7 @@ struct mlx4_dev_cap {
 	u8  supported_port_types[MLX4_MAX_PORTS + 1];
 	u8  log_max_macs[MLX4_MAX_PORTS + 1];
 	u8  log_max_vlans[MLX4_MAX_PORTS + 1];
+	u32 max_counters;
 };
 
 struct mlx4_adapter {
diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c
index 650f4ca8606e..932dac5e4db7 100644
--- a/drivers/net/mlx4/main.c
+++ b/drivers/net/mlx4/main.c
@@ -143,6 +143,7 @@ static void mlx4_set_port_mask(struct mlx4_dev *dev)
 		if (dev->caps.port_type[i] == MLX4_PORT_TYPE_IB)
 			dev->caps.port_mask |= 1 << (i - 1);
 }
+
 static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 {
 	int err;
@@ -257,6 +258,8 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 
 	mlx4_set_port_mask(dev);
 
+	dev->caps.max_counters = 1 << ilog2(dev_cap->max_counters);
+
 	dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] = dev_cap->reserved_qps;
 	dev->caps.reserved_qps_cnt[MLX4_QP_REGION_ETH_ADDR] =
 		dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_ADDR] =
@@ -834,6 +837,45 @@ err_stop_fw:
 	return err;
 }
 
+static int mlx4_init_counters_table(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int nent;
+
+	if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS))
+		return -ENOENT;
+
+	nent = dev->caps.max_counters;
+	return mlx4_bitmap_init(&priv->counters_bitmap, nent, nent - 1, 0, 0);
+}
+
+static void mlx4_cleanup_counters_table(struct mlx4_dev *dev)
+{
+	mlx4_bitmap_cleanup(&mlx4_priv(dev)->counters_bitmap);
+}
+
+int mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS))
+		return -ENOENT;
+
+	*idx = mlx4_bitmap_alloc(&priv->counters_bitmap);
+	if (*idx == -1)
+		return -ENOMEM;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_counter_alloc);
+
+void mlx4_counter_free(struct mlx4_dev *dev, u32 idx)
+{
+	mlx4_bitmap_free(&mlx4_priv(dev)->counters_bitmap, idx);
+	return;
+}
+EXPORT_SYMBOL_GPL(mlx4_counter_free);
+
 static int mlx4_setup_hca(struct mlx4_dev *dev)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
@@ -938,6 +980,12 @@ static int mlx4_setup_hca(struct mlx4_dev *dev)
 		goto err_qp_table_free;
 	}
 
+	err = mlx4_init_counters_table(dev);
+	if (err && err != -ENOENT) {
+		mlx4_err(dev, "Failed to initialize counters table, aborting.\n");
+		goto err_counters_table_free;
+	}
+
 	for (port = 1; port <= dev->caps.num_ports; port++) {
 		enum mlx4_port_type port_type = 0;
 		mlx4_SENSE_PORT(dev, port, &port_type);
@@ -964,6 +1012,9 @@ static int mlx4_setup_hca(struct mlx4_dev *dev)
 err_mcg_table_free:
 	mlx4_cleanup_mcg_table(dev);
 
+err_counters_table_free:
+	mlx4_cleanup_counters_table(dev);
+
 err_qp_table_free:
 	mlx4_cleanup_qp_table(dev);
 
@@ -1294,6 +1345,7 @@ err_port:
 	for (--port; port >= 1; --port)
 		mlx4_cleanup_port_info(&priv->port[port]);
 
+	mlx4_cleanup_counters_table(dev);
 	mlx4_cleanup_mcg_table(dev);
 	mlx4_cleanup_qp_table(dev);
 	mlx4_cleanup_srq_table(dev);
@@ -1354,6 +1406,7 @@ static void mlx4_remove_one(struct pci_dev *pdev)
 			mlx4_CLOSE_PORT(dev, p);
 		}
 
+		mlx4_cleanup_counters_table(dev);
 		mlx4_cleanup_mcg_table(dev);
 		mlx4_cleanup_qp_table(dev);
 		mlx4_cleanup_srq_table(dev);
diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
index dd7d745fbab4..5e405bbc8d1b 100644
--- a/drivers/net/mlx4/mlx4.h
+++ b/drivers/net/mlx4/mlx4.h
@@ -342,6 +342,7 @@ struct mlx4_priv {
 	struct mlx4_srq_table	srq_table;
 	struct mlx4_qp_table	qp_table;
 	struct mlx4_mcg_table	mcg_table;
+	struct mlx4_bitmap	counters_bitmap;
 
 	struct mlx4_catas_err	catas_err;
 
diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h
index 9a18667c13cc..b56e4587208d 100644
--- a/include/linux/mlx4/cmd.h
+++ b/include/linux/mlx4/cmd.h
@@ -123,6 +123,9 @@ enum {
 	/* debug commands */
 	MLX4_CMD_QUERY_DEBUG_MSG = 0x2a,
 	MLX4_CMD_SET_DEBUG_MSG	 = 0x2b,
+
+	/* statistics commands */
+	MLX4_CMD_QUERY_IF_STAT	 = 0X54,
 };
 
 enum {
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index fa861134928a..387329e02303 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -78,7 +78,8 @@ enum {
 	MLX4_DEV_CAP_FLAG_WOL		= 1LL << 38,
 	MLX4_DEV_CAP_FLAG_UDP_RSS	= 1LL << 40,
 	MLX4_DEV_CAP_FLAG_VEP_UC_STEER	= 1LL << 41,
-	MLX4_DEV_CAP_FLAG_VEP_MC_STEER	= 1LL << 42
+	MLX4_DEV_CAP_FLAG_VEP_MC_STEER	= 1LL << 42,
+	MLX4_DEV_CAP_FLAG_COUNTERS	= 1LL << 48
 };
 
 enum {
@@ -274,6 +275,7 @@ struct mlx4_caps {
 	u8			supported_type[MLX4_MAX_PORTS + 1];
 	u32			port_mask;
 	enum mlx4_port_type	possible_type[MLX4_MAX_PORTS + 1];
+	u32			max_counters;
 };
 
 struct mlx4_buf_list {
@@ -438,6 +440,17 @@ union mlx4_ext_av {
 	struct mlx4_eth_av	eth;
 };
 
+struct mlx4_counter {
+	u8	reserved1[3];
+	u8	counter_mode;
+	__be32	num_ifc;
+	u32	reserved2[2];
+	__be64	rx_frames;
+	__be64	rx_bytes;
+	__be64	tx_frames;
+	__be64	tx_bytes;
+};
+
 struct mlx4_dev {
 	struct pci_dev	       *pdev;
 	unsigned long		flags;
@@ -568,4 +581,7 @@ void mlx4_release_eq(struct mlx4_dev *dev, int vec);
 int mlx4_wol_read(struct mlx4_dev *dev, u64 *config, int port);
 int mlx4_wol_write(struct mlx4_dev *dev, u64 config, int port);
 
+int mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx);
+void mlx4_counter_free(struct mlx4_dev *dev, u32 idx);
+
 #endif /* MLX4_DEVICE_H */
-- 
cgit v1.2.3


From cfcde11c3d7ae175f49280bb6f913478c2f1bd8c Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Wed, 15 Jun 2011 14:49:57 +0000
Subject: IB/mlx4: Use flow counters on IBoE ports

Allocate flow counter per Ethernet/IBoE port, and attach this counter
to all the QPs created on that port.  Based on patch by Eli Cohen
<eli@mellanox.co.il>.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.co.il>
Signed-off-by: Roland Dreier <roland@purestorage.com>
---
 drivers/infiniband/hw/mlx4/main.c    | 19 +++++++++++++++++--
 drivers/infiniband/hw/mlx4/mlx4_ib.h |  1 +
 drivers/infiniband/hw/mlx4/qp.c      | 10 +++++++++-
 include/linux/mlx4/qp.h              |  3 ++-
 4 files changed, 29 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 621c409433e2..fa643f4f4e28 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -1098,11 +1098,21 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 	if (init_node_data(ibdev))
 		goto err_map;
 
+	for (i = 0; i < ibdev->num_ports; ++i) {
+		if (mlx4_ib_port_link_layer(&ibdev->ib_dev, i + 1) ==
+						IB_LINK_LAYER_ETHERNET) {
+			err = mlx4_counter_alloc(ibdev->dev, &ibdev->counters[i]);
+			if (err)
+				ibdev->counters[i] = -1;
+		} else
+				ibdev->counters[i] = -1;
+	}
+
 	spin_lock_init(&ibdev->sm_lock);
 	mutex_init(&ibdev->cap_mask_mutex);
 
 	if (ib_register_device(&ibdev->ib_dev, NULL))
-		goto err_map;
+		goto err_counter;
 
 	if (mlx4_ib_mad_init(ibdev))
 		goto err_reg;
@@ -1132,6 +1142,10 @@ err_notif:
 err_reg:
 	ib_unregister_device(&ibdev->ib_dev);
 
+err_counter:
+	for (; i; --i)
+		mlx4_counter_free(ibdev->dev, ibdev->counters[i - 1]);
+
 err_map:
 	iounmap(ibdev->uar_map);
 
@@ -1160,7 +1174,8 @@ static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr)
 		ibdev->iboe.nb.notifier_call = NULL;
 	}
 	iounmap(ibdev->uar_map);
-
+	for (p = 0; p < ibdev->num_ports; ++p)
+		mlx4_counter_free(ibdev->dev, ibdev->counters[p]);
 	mlx4_foreach_port(p, dev, MLX4_PORT_TYPE_IB)
 		mlx4_CLOSE_PORT(dev, p);
 
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index 2a322f21049f..e4bf2cff8662 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -193,6 +193,7 @@ struct mlx4_ib_dev {
 	struct mutex		cap_mask_mutex;
 	bool			ib_active;
 	struct mlx4_ib_iboe	iboe;
+	int			counters[MLX4_MAX_PORTS];
 };
 
 static inline struct mlx4_ib_dev *to_mdev(struct ib_device *ibdev)
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 2001f20a4361..3a91d9d8dc51 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -893,7 +893,6 @@ static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah,
 			--path->static_rate;
 	} else
 		path->static_rate = 0;
-	path->counter_index = 0xff;
 
 	if (ah->ah_flags & IB_AH_GRH) {
 		if (ah->grh.sgid_index >= dev->dev->caps.gid_table_len[port]) {
@@ -1034,6 +1033,15 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
 		}
 	}
 
+	if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) {
+		if (dev->counters[qp->port - 1] != -1) {
+			context->pri_path.counter_index =
+						dev->counters[qp->port - 1];
+			optpar |= MLX4_QP_OPTPAR_COUNTER_INDEX;
+		} else
+			context->pri_path.counter_index = 0xff;
+	}
+
 	if (attr_mask & IB_QP_PKEY_INDEX) {
 		context->pri_path.pkey_index = attr->pkey_index;
 		optpar |= MLX4_QP_OPTPAR_PKEY_INDEX;
diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h
index 6486d880bdbf..4001c8249dbb 100644
--- a/include/linux/mlx4/qp.h
+++ b/include/linux/mlx4/qp.h
@@ -54,7 +54,8 @@ enum mlx4_qp_optpar {
 	MLX4_QP_OPTPAR_RETRY_COUNT		= 1 << 12,
 	MLX4_QP_OPTPAR_RNR_RETRY		= 1 << 13,
 	MLX4_QP_OPTPAR_ACK_TIMEOUT		= 1 << 14,
-	MLX4_QP_OPTPAR_SCHED_QUEUE		= 1 << 16
+	MLX4_QP_OPTPAR_SCHED_QUEUE		= 1 << 16,
+	MLX4_QP_OPTPAR_COUNTER_INDEX		= 1 << 20
 };
 
 enum mlx4_qp_state {
-- 
cgit v1.2.3


From 97d32cf9440d2111a12471740446d4d63231b79a Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 19 Jul 2011 11:46:33 +0200
Subject: netfilter: nfnetlink_queue: batch verdict support

Introduces a new nfnetlink type that applies a given
verdict to all queued packets with an id <= the id in the verdict
message.

If a mark is provided it is applied to all matched packets.

This reduces the number of verdicts that have to be sent.
Applications that make use of this feature need to maintain
a timeout to send a batchverdict periodically to avoid starvation.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/nfnetlink_queue.h |   1 +
 net/netfilter/nfnetlink_queue.c           | 115 ++++++++++++++++++++++++++----
 2 files changed, 104 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/nfnetlink_queue.h b/include/linux/netfilter/nfnetlink_queue.h
index af94e0014ebd..24b32e6c009e 100644
--- a/include/linux/netfilter/nfnetlink_queue.h
+++ b/include/linux/netfilter/nfnetlink_queue.h
@@ -8,6 +8,7 @@ enum nfqnl_msg_types {
 	NFQNL_MSG_PACKET,		/* packet from kernel to userspace */
 	NFQNL_MSG_VERDICT,		/* verdict from userspace to kernel */
 	NFQNL_MSG_CONFIG,		/* connect to a particular queue */
+	NFQNL_MSG_VERDICT_BATCH,	/* batchv from userspace to kernel */
 
 	NFQNL_MSG_MAX
 };
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 3b2af8cb7de9..fbfcd834140b 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -171,6 +171,13 @@ __enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry)
        queue->queue_total++;
 }
 
+static void
+__dequeue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry)
+{
+	list_del(&entry->list);
+	queue->queue_total--;
+}
+
 static struct nf_queue_entry *
 find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id)
 {
@@ -185,10 +192,8 @@ find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id)
 		}
 	}
 
-	if (entry) {
-		list_del(&entry->list);
-		queue->queue_total--;
-	}
+	if (entry)
+		__dequeue_entry(queue, entry);
 
 	spin_unlock_bh(&queue->lock);
 
@@ -611,6 +616,92 @@ static const struct nla_policy nfqa_verdict_policy[NFQA_MAX+1] = {
 	[NFQA_PAYLOAD]		= { .type = NLA_UNSPEC },
 };
 
+static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = {
+	[NFQA_VERDICT_HDR]	= { .len = sizeof(struct nfqnl_msg_verdict_hdr) },
+	[NFQA_MARK]		= { .type = NLA_U32 },
+};
+
+static struct nfqnl_instance *verdict_instance_lookup(u16 queue_num, int nlpid)
+{
+	struct nfqnl_instance *queue;
+
+	queue = instance_lookup(queue_num);
+	if (!queue)
+		return ERR_PTR(-ENODEV);
+
+	if (queue->peer_pid != nlpid)
+		return ERR_PTR(-EPERM);
+
+	return queue;
+}
+
+static struct nfqnl_msg_verdict_hdr*
+verdicthdr_get(const struct nlattr * const nfqa[])
+{
+	struct nfqnl_msg_verdict_hdr *vhdr;
+	unsigned int verdict;
+
+	if (!nfqa[NFQA_VERDICT_HDR])
+		return NULL;
+
+	vhdr = nla_data(nfqa[NFQA_VERDICT_HDR]);
+	verdict = ntohl(vhdr->verdict);
+	if ((verdict & NF_VERDICT_MASK) > NF_MAX_VERDICT)
+		return NULL;
+	return vhdr;
+}
+
+static int nfq_id_after(unsigned int id, unsigned int max)
+{
+	return (int)(id - max) > 0;
+}
+
+static int
+nfqnl_recv_verdict_batch(struct sock *ctnl, struct sk_buff *skb,
+		   const struct nlmsghdr *nlh,
+		   const struct nlattr * const nfqa[])
+{
+	struct nfgenmsg *nfmsg = NLMSG_DATA(nlh);
+	struct nf_queue_entry *entry, *tmp;
+	unsigned int verdict, maxid;
+	struct nfqnl_msg_verdict_hdr *vhdr;
+	struct nfqnl_instance *queue;
+	LIST_HEAD(batch_list);
+	u16 queue_num = ntohs(nfmsg->res_id);
+
+	queue = verdict_instance_lookup(queue_num, NETLINK_CB(skb).pid);
+	if (IS_ERR(queue))
+		return PTR_ERR(queue);
+
+	vhdr = verdicthdr_get(nfqa);
+	if (!vhdr)
+		return -EINVAL;
+
+	verdict = ntohl(vhdr->verdict);
+	maxid = ntohl(vhdr->id);
+
+	spin_lock_bh(&queue->lock);
+
+	list_for_each_entry_safe(entry, tmp, &queue->queue_list, list) {
+		if (nfq_id_after(entry->id, maxid))
+			break;
+		__dequeue_entry(queue, entry);
+		list_add_tail(&entry->list, &batch_list);
+	}
+
+	spin_unlock_bh(&queue->lock);
+
+	if (list_empty(&batch_list))
+		return -ENOENT;
+
+	list_for_each_entry_safe(entry, tmp, &batch_list, list) {
+		if (nfqa[NFQA_MARK])
+			entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK]));
+		nf_reinject(entry, verdict);
+	}
+	return 0;
+}
+
 static int
 nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb,
 		   const struct nlmsghdr *nlh,
@@ -626,20 +717,17 @@ nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb,
 
 	queue = instance_lookup(queue_num);
 	if (!queue)
-		return -ENODEV;
 
-	if (queue->peer_pid != NETLINK_CB(skb).pid)
-		return -EPERM;
+	queue = verdict_instance_lookup(queue_num, NETLINK_CB(skb).pid);
+	if (IS_ERR(queue))
+		return PTR_ERR(queue);
 
-	if (!nfqa[NFQA_VERDICT_HDR])
+	vhdr = verdicthdr_get(nfqa);
+	if (!vhdr)
 		return -EINVAL;
 
-	vhdr = nla_data(nfqa[NFQA_VERDICT_HDR]);
 	verdict = ntohl(vhdr->verdict);
 
-	if ((verdict & NF_VERDICT_MASK) > NF_MAX_VERDICT)
-		return -EINVAL;
-
 	entry = find_dequeue_entry(queue, ntohl(vhdr->id));
 	if (entry == NULL)
 		return -ENOENT;
@@ -775,6 +863,9 @@ static const struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = {
 	[NFQNL_MSG_CONFIG]	= { .call = nfqnl_recv_config,
 				    .attr_count = NFQA_CFG_MAX,
 				    .policy = nfqa_cfg_policy },
+	[NFQNL_MSG_VERDICT_BATCH]={ .call_rcu = nfqnl_recv_verdict_batch,
+				    .attr_count = NFQA_MAX,
+				    .policy = nfqa_verdict_batch_policy },
 };
 
 static const struct nfnetlink_subsystem nfqnl_subsys = {
-- 
cgit v1.2.3


From 7765be2fec0f476fcd61812d5f9406b04c765020 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paul.mckenney@linaro.org>
Date: Thu, 14 Jul 2011 12:24:11 -0700
Subject: rcu: Fix RCU_BOOST race handling current->rcu_read_unlock_special

The RCU_BOOST commits for TREE_PREEMPT_RCU introduced an other-task
write to a new RCU_READ_UNLOCK_BOOSTED bit in the task_struct structure's
->rcu_read_unlock_special field, but, as noted by Steven Rostedt, without
correctly synchronizing all accesses to ->rcu_read_unlock_special.
This could result in bits in ->rcu_read_unlock_special being spuriously
set and cleared due to conflicting accesses, which in turn could result
in deadlocks between the rcu_node structure's ->lock and the scheduler's
rq and pi locks.  These deadlocks would result from RCU incorrectly
believing that the just-ended RCU read-side critical section had been
preempted and/or boosted.  If that RCU read-side critical section was
executed with either rq or pi locks held, RCU's ensuing (incorrect)
calls to the scheduler would cause the scheduler to attempt to once
again acquire the rq and pi locks, resulting in deadlock.  More complex
deadlock cycles are also possible, involving multiple rq and pi locks
as well as locks from multiple rcu_node structures.

This commit fixes synchronization by creating ->rcu_boosted field in
task_struct that is accessed and modified only when holding the ->lock
in the rcu_node structure on which the task is queued (on that rcu_node
structure's ->blkd_tasks list).  This results in tasks accessing only
their own current->rcu_read_unlock_special fields, making unsynchronized
access once again legal, and keeping the rcu_read_unlock() fastpath free
of atomic instructions and memory barriers.

The reason that the rcu_read_unlock() fastpath does not need to access
the new current->rcu_boosted field is that this new field cannot
be non-zero unless the RCU_READ_UNLOCK_BLOCKED bit is set in the
current->rcu_read_unlock_special field.  Therefore, rcu_read_unlock()
need only test current->rcu_read_unlock_special: if that is zero, then
current->rcu_boosted must also be zero.

This bug does not affect TINY_PREEMPT_RCU because this implementation
of RCU accesses current->rcu_read_unlock_special with irqs disabled,
thus preventing races on the !SMP systems that TINY_PREEMPT_RCU runs on.

Maybe-reported-by: Dave Jones <davej@redhat.com>
Maybe-reported-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Reported-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/sched.h   | 3 +++
 kernel/rcutree_plugin.h | 8 ++++++--
 2 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 496770a96487..76676a407e4a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1254,6 +1254,9 @@ struct task_struct {
 #ifdef CONFIG_PREEMPT_RCU
 	int rcu_read_lock_nesting;
 	char rcu_read_unlock_special;
+#if defined(CONFIG_RCU_BOOST) && defined(CONFIG_TREE_PREEMPT_RCU)
+	int rcu_boosted;
+#endif /* #if defined(CONFIG_RCU_BOOST) && defined(CONFIG_TREE_PREEMPT_RCU) */
 	struct list_head rcu_node_entry;
 #endif /* #ifdef CONFIG_PREEMPT_RCU */
 #ifdef CONFIG_TREE_PREEMPT_RCU
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 6abef3cfcbc1..3a0ae0355222 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -342,6 +342,11 @@ static void rcu_read_unlock_special(struct task_struct *t)
 #ifdef CONFIG_RCU_BOOST
 		if (&t->rcu_node_entry == rnp->boost_tasks)
 			rnp->boost_tasks = np;
+		/* Snapshot and clear ->rcu_boosted with rcu_node lock held. */
+		if (t->rcu_boosted) {
+			special |= RCU_READ_UNLOCK_BOOSTED;
+			t->rcu_boosted = 0;
+		}
 #endif /* #ifdef CONFIG_RCU_BOOST */
 		t->rcu_blocked_node = NULL;
 
@@ -358,7 +363,6 @@ static void rcu_read_unlock_special(struct task_struct *t)
 #ifdef CONFIG_RCU_BOOST
 		/* Unboost if we were boosted. */
 		if (special & RCU_READ_UNLOCK_BOOSTED) {
-			t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED;
 			rt_mutex_unlock(t->rcu_boost_mutex);
 			t->rcu_boost_mutex = NULL;
 		}
@@ -1176,7 +1180,7 @@ static int rcu_boost(struct rcu_node *rnp)
 	t = container_of(tb, struct task_struct, rcu_node_entry);
 	rt_mutex_init_proxy_locked(&mtx, t);
 	t->rcu_boost_mutex = &mtx;
-	t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
+	t->rcu_boosted = 1;
 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
 	rt_mutex_lock(&mtx);  /* Side effect: boosts task t's priority. */
 	rt_mutex_unlock(&mtx);  /* Keep lockdep happy. */
-- 
cgit v1.2.3


From 44396f4b5cb8566f7118aec55eeac99be7ad94cb Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Tue, 31 May 2011 11:58:49 -0400
Subject: fs: add a DCACHE_NEED_LOOKUP flag for d_flags

Btrfs (and I'd venture most other fs's) stores its indexes in nice disk order
for readdir, but unfortunately in the case of anything that stats the files in
order that readdir spits back (like oh say ls) that means we still have to do
the normal lookup of the file, which means looking up our other index and then
looking up the inode.  What I want is a way to create dummy dentries when we
find them in readdir so that when ls or anything else subsequently does a
stat(), we already have the location information in the dentry and can go
straight to the inode itself.  The lookup stuff just assumes that if it finds a
dentry it is done, it doesn't perform a lookup.  So add a DCACHE_NEED_LOOKUP
flag so that the lookup code knows it still needs to run i_op->lookup() on the
parent to get the inode for the dentry.  I have tested this with btrfs and I
went from something that looks like this

http://people.redhat.com/jwhiter/ls-noreada.png

To this

http://people.redhat.com/jwhiter/ls-good.png

Thats a savings of 1300 seconds, or 22 minutes.  That is a significant savings.
Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c            | 34 ++++++++++++++++++++++++++++++++--
 fs/namei.c             | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/dcache.h |  7 +++++++
 3 files changed, 88 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dcache.c b/fs/dcache.c
index 6e4ea6d87774..d3902139b533 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -343,6 +343,24 @@ void d_drop(struct dentry *dentry)
 }
 EXPORT_SYMBOL(d_drop);
 
+/*
+ * d_clear_need_lookup - drop a dentry from cache and clear the need lookup flag
+ * @dentry: dentry to drop
+ *
+ * This is called when we do a lookup on a placeholder dentry that needed to be
+ * looked up.  The dentry should have been hashed in order for it to be found by
+ * the lookup code, but now needs to be unhashed while we do the actual lookup
+ * and clear the DCACHE_NEED_LOOKUP flag.
+ */
+void d_clear_need_lookup(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	__d_drop(dentry);
+	dentry->d_flags &= ~DCACHE_NEED_LOOKUP;
+	spin_unlock(&dentry->d_lock);
+}
+EXPORT_SYMBOL(d_clear_need_lookup);
+
 /*
  * Finish off a dentry we've decided to kill.
  * dentry->d_lock must be held, returns with it unlocked.
@@ -432,8 +450,13 @@ repeat:
  	if (d_unhashed(dentry))
 		goto kill_it;
 
-	/* Otherwise leave it cached and ensure it's on the LRU */
-	dentry->d_flags |= DCACHE_REFERENCED;
+	/*
+	 * If this dentry needs lookup, don't set the referenced flag so that it
+	 * is more likely to be cleaned up by the dcache shrinker in case of
+	 * memory pressure.
+	 */
+	if (!d_need_lookup(dentry))
+		dentry->d_flags |= DCACHE_REFERENCED;
 	dentry_lru_add(dentry);
 
 	dentry->d_count--;
@@ -1707,6 +1730,13 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
 		return found;
 	}
 
+	/*
+	 * We are going to instantiate this dentry, unhash it and clear the
+	 * lookup flag so we can do that.
+	 */
+	if (unlikely(d_need_lookup(found)))
+		d_clear_need_lookup(found);
+
 	/*
 	 * Negative dentry: instantiate it unless the inode is a directory and
 	 * already has a dentry.
diff --git a/fs/namei.c b/fs/namei.c
index 14ab8d3f2f0c..5ba42c453e31 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1133,6 +1133,30 @@ static struct dentry *d_alloc_and_lookup(struct dentry *parent,
 	return dentry;
 }
 
+/*
+ * We already have a dentry, but require a lookup to be performed on the parent
+ * directory to fill in d_inode. Returns the new dentry, or ERR_PTR on error.
+ * parent->d_inode->i_mutex must be held. d_lookup must have verified that no
+ * child exists while under i_mutex.
+ */
+static struct dentry *d_inode_lookup(struct dentry *parent, struct dentry *dentry,
+				     struct nameidata *nd)
+{
+	struct inode *inode = parent->d_inode;
+	struct dentry *old;
+
+	/* Don't create child dentry for a dead directory. */
+	if (unlikely(IS_DEADDIR(inode)))
+		return ERR_PTR(-ENOENT);
+
+	old = inode->i_op->lookup(inode, dentry, nd);
+	if (unlikely(old)) {
+		dput(dentry);
+		dentry = old;
+	}
+	return dentry;
+}
+
 /*
  *  It's more convoluted than I'd like it to be, but... it's still fairly
  *  small and for now I'd prefer to have fast path as straight as possible.
@@ -1172,6 +1196,8 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
 				goto unlazy;
 			}
 		}
+		if (unlikely(d_need_lookup(dentry)))
+			goto unlazy;
 		path->mnt = mnt;
 		path->dentry = dentry;
 		if (unlikely(!__follow_mount_rcu(nd, path, inode)))
@@ -1186,6 +1212,10 @@ unlazy:
 		dentry = __d_lookup(parent, name);
 	}
 
+	if (dentry && unlikely(d_need_lookup(dentry))) {
+		dput(dentry);
+		dentry = NULL;
+	}
 retry:
 	if (unlikely(!dentry)) {
 		struct inode *dir = parent->d_inode;
@@ -1202,6 +1232,15 @@ retry:
 			/* known good */
 			need_reval = 0;
 			status = 1;
+		} else if (unlikely(d_need_lookup(dentry))) {
+			dentry = d_inode_lookup(parent, dentry, nd);
+			if (IS_ERR(dentry)) {
+				mutex_unlock(&dir->i_mutex);
+				return PTR_ERR(dentry);
+			}
+			/* known good */
+			need_reval = 0;
+			status = 1;
 		}
 		mutex_unlock(&dir->i_mutex);
 	}
@@ -1683,6 +1722,16 @@ static struct dentry *__lookup_hash(struct qstr *name,
 	 */
 	dentry = d_lookup(base, name);
 
+	if (dentry && d_need_lookup(dentry)) {
+		/*
+		 * __lookup_hash is called with the parent dir's i_mutex already
+		 * held, so we are good to go here.
+		 */
+		dentry = d_inode_lookup(base, dentry, nd);
+		if (IS_ERR(dentry))
+			return dentry;
+	}
+
 	if (dentry && (dentry->d_flags & DCACHE_OP_REVALIDATE))
 		dentry = do_revalidate(dentry, nd);
 
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 19d90a55541d..5fa5bd33b979 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -216,6 +216,7 @@ struct dentry_operations {
 #define DCACHE_MOUNTED		0x10000	/* is a mountpoint */
 #define DCACHE_NEED_AUTOMOUNT	0x20000	/* handle automount on this dir */
 #define DCACHE_MANAGE_TRANSIT	0x40000	/* manage transit from this dirent */
+#define DCACHE_NEED_LOOKUP	0x80000 /* dentry requires i_op->lookup */
 #define DCACHE_MANAGED_DENTRY \
 	(DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT)
 
@@ -416,6 +417,12 @@ static inline bool d_mountpoint(struct dentry *dentry)
 	return dentry->d_flags & DCACHE_MOUNTED;
 }
 
+static inline bool d_need_lookup(struct dentry *dentry)
+{
+	return dentry->d_flags & DCACHE_NEED_LOOKUP;
+}
+
+extern void d_clear_need_lookup(struct dentry *dentry);
 extern struct dentry *lookup_create(struct nameidata *nd, int is_dir);
 
 extern int sysctl_vfs_cache_pressure;
-- 
cgit v1.2.3


From 43e15cdbefea4ce6d68113de98d4f61c0cf45687 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 3 Jun 2011 20:16:57 -0400
Subject: new helper: iterate_supers_type()

Call the given function for all superblocks of given type.  Function
gets a superblock (with s_umount locked shared) and (void *) argument
supplied by caller of iterator.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/super.c         | 36 ++++++++++++++++++++++++++++++++++++
 include/linux/fs.h |  2 ++
 2 files changed, 38 insertions(+)

(limited to 'include/linux')

diff --git a/fs/super.c b/fs/super.c
index ab3d672db0de..444da9579068 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -451,6 +451,42 @@ void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
 	spin_unlock(&sb_lock);
 }
 
+/**
+ *	iterate_supers_type - call function for superblocks of given type
+ *	@type: fs type
+ *	@f: function to call
+ *	@arg: argument to pass to it
+ *
+ *	Scans the superblock list and calls given function, passing it
+ *	locked superblock and given argument.
+ */
+void iterate_supers_type(struct file_system_type *type,
+	void (*f)(struct super_block *, void *), void *arg)
+{
+	struct super_block *sb, *p = NULL;
+
+	spin_lock(&sb_lock);
+	list_for_each_entry(sb, &type->fs_supers, s_instances) {
+		sb->s_count++;
+		spin_unlock(&sb_lock);
+
+		down_read(&sb->s_umount);
+		if (sb->s_root)
+			f(sb, arg);
+		up_read(&sb->s_umount);
+
+		spin_lock(&sb_lock);
+		if (p)
+			__put_super(p);
+		p = sb;
+	}
+	if (p)
+		__put_super(p);
+	spin_unlock(&sb_lock);
+}
+
+EXPORT_SYMBOL(iterate_supers_type);
+
 /**
  *	get_super - get the superblock of a device
  *	@bdev: device to get the superblock for
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b5b979247863..a8735e7e1b35 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2432,6 +2432,8 @@ extern struct super_block *get_active_super(struct block_device *bdev);
 extern struct super_block *user_get_super(dev_t);
 extern void drop_super(struct super_block *sb);
 extern void iterate_supers(void (*)(struct super_block *, void *), void *);
+extern void iterate_supers_type(struct file_system_type *,
+			        void (*)(struct super_block *, void *), void *);
 
 extern int dcache_dir_open(struct inode *, struct file *);
 extern int dcache_dir_close(struct inode *, struct file *);
-- 
cgit v1.2.3


From 1b5d783c94c328d406e801566f161adcfb018dda Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 19 Jun 2011 12:49:47 -0400
Subject: consolidate BINPRM_FLAGS_ENFORCE_NONDUMP handling

new helper: would_dump(bprm, file).  Checks if we are allowed to
read the file and if we are not - sets ENFORCE_NODUMP.  Exported,
used in places that previously open-coded the same logics.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/binfmt_elf.c         |  3 +--
 fs/binfmt_elf_fdpic.c   |  3 +--
 fs/binfmt_misc.c        |  3 +--
 fs/exec.c               | 14 +++++++++++---
 include/linux/binfmts.h |  1 +
 5 files changed, 15 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 303983fabfd6..dd0fdfc56d38 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -668,8 +668,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 			 * mm->dumpable = 0 regardless of the interpreter's
 			 * permissions.
 			 */
-			if (file_permission(interpreter, MAY_READ) < 0)
-				bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
+			would_dump(bprm, interpreter);
 
 			retval = kernel_read(interpreter, 0, bprm->buf,
 					     BINPRM_BUF_SIZE);
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 2bc5dc644b4c..30745f459faf 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -245,8 +245,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
 			 * mm->dumpable = 0 regardless of the interpreter's
 			 * permissions.
 			 */
-			if (file_permission(interpreter, MAY_READ) < 0)
-				bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
+			would_dump(bprm, interpreter);
 
 			retval = kernel_read(interpreter, 0, bprm->buf,
 					     BINPRM_BUF_SIZE);
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 1befe2ec8186..ba1a1ae4a18a 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -149,8 +149,7 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 
 		/* if the binary is not readable than enforce mm->dumpable=0
 		   regardless of the interpreter's permissions */
-		if (file_permission(bprm->file, MAY_READ))
-			bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
+		would_dump(bprm, bprm->file);
 
 		allow_write_access(bprm->file);
 		bprm->file = NULL;
diff --git a/fs/exec.c b/fs/exec.c
index 6075a1e727ae..f9f12ad299af 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1105,6 +1105,13 @@ out:
 }
 EXPORT_SYMBOL(flush_old_exec);
 
+void would_dump(struct linux_binprm *bprm, struct file *file)
+{
+	if (inode_permission(file->f_path.dentry->d_inode, MAY_READ) < 0)
+		bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
+}
+EXPORT_SYMBOL(would_dump);
+
 void setup_new_exec(struct linux_binprm * bprm)
 {
 	int i, ch;
@@ -1144,9 +1151,10 @@ void setup_new_exec(struct linux_binprm * bprm)
 	if (bprm->cred->uid != current_euid() ||
 	    bprm->cred->gid != current_egid()) {
 		current->pdeath_signal = 0;
-	} else if (file_permission(bprm->file, MAY_READ) ||
-		   bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP) {
-		set_dumpable(current->mm, suid_dumpable);
+	} else {
+		would_dump(bprm, bprm->file);
+		if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)
+			set_dumpable(current->mm, suid_dumpable);
 	}
 
 	/*
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 8845613fd7e3..fd88a3945aa1 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -111,6 +111,7 @@ extern int __must_check remove_arg_zero(struct linux_binprm *);
 extern int search_binary_handler(struct linux_binprm *, struct pt_regs *);
 extern int flush_old_exec(struct linux_binprm * bprm);
 extern void setup_new_exec(struct linux_binprm * bprm);
+extern void would_dump(struct linux_binprm *, struct file *);
 
 extern int suid_dumpable;
 #define SUID_DUMP_DISABLE	0	/* No setuid dumping */
-- 
cgit v1.2.3


From 3bfa784a6539f91a27d7ffdd408efdb638e3bebd Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 19 Jun 2011 12:55:10 -0400
Subject: kill file_permission() completely

convert the last remaining caller to inode_permission()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c         | 18 ------------------
 include/linux/fs.h |  1 -
 kernel/cgroup.c    |  3 ++-
 3 files changed, 2 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index cf2554635a1c..4ad2b781a65c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -303,23 +303,6 @@ int inode_permission(struct inode *inode, int mask)
 	return security_inode_permission(inode, mask);
 }
 
-/**
- * file_permission  -  check for additional access rights to a given file
- * @file:	file to check access rights for
- * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
- *
- * Used to check for read/write/execute permissions on an already opened
- * file.
- *
- * Note:
- *	Do not use this function in new code.  All access checks should
- *	be done using inode_permission().
- */
-int file_permission(struct file *file, int mask)
-{
-	return inode_permission(file->f_path.dentry->d_inode, mask);
-}
-
 /*
  * get_write_access() gets write permission for a file.
  * put_write_access() releases this write permission.
@@ -3405,7 +3388,6 @@ EXPORT_SYMBOL(kern_path_parent);
 EXPORT_SYMBOL(kern_path);
 EXPORT_SYMBOL(vfs_path_lookup);
 EXPORT_SYMBOL(inode_permission);
-EXPORT_SYMBOL(file_permission);
 EXPORT_SYMBOL(unlock_rename);
 EXPORT_SYMBOL(vfs_create);
 EXPORT_SYMBOL(vfs_follow_link);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a8735e7e1b35..d04e55586a17 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1490,7 +1490,6 @@ extern void dentry_unhash(struct dentry *dentry);
 /*
  * VFS file helper functions.
  */
-extern int file_permission(struct file *, int);
 extern void inode_init_owner(struct inode *inode, const struct inode *dir,
 			mode_t mode);
 /*
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2731d115d725..e1c72c0f512b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3542,7 +3542,8 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
 	}
 
 	/* the process need read permission on control file */
-	ret = file_permission(cfile, MAY_READ);
+	/* AV: shouldn't we check that it's been opened for read instead? */
+	ret = inode_permission(cfile->f_path.dentry->d_inode, MAY_READ);
 	if (ret < 0)
 		goto fail;
 
-- 
cgit v1.2.3


From 07b8ce1ee87d291ff564c02cf878fae973317a52 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 20 Jun 2011 10:52:57 -0400
Subject: lockless get_write_access/deny_write_access

new helpers: atomic_inc_unless_negative()/atomic_dec_unless_positive()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c             | 46 ----------------------------------------------
 include/linux/atomic.h | 26 ++++++++++++++++++++++++++
 include/linux/fs.h     | 29 ++++++++++++++++++++++++++---
 3 files changed, 52 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index e04f15ae4b07..d286cbc3f3e5 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -341,52 +341,6 @@ ok:
 	return security_inode_exec_permission(inode, flags);
 }
 
-/*
- * get_write_access() gets write permission for a file.
- * put_write_access() releases this write permission.
- * This is used for regular files.
- * We cannot support write (and maybe mmap read-write shared) accesses and
- * MAP_DENYWRITE mmappings simultaneously. The i_writecount field of an inode
- * can have the following values:
- * 0: no writers, no VM_DENYWRITE mappings
- * < 0: (-i_writecount) vm_area_structs with VM_DENYWRITE set exist
- * > 0: (i_writecount) users are writing to the file.
- *
- * Normally we operate on that counter with atomic_{inc,dec} and it's safe
- * except for the cases where we don't hold i_writecount yet. Then we need to
- * use {get,deny}_write_access() - these functions check the sign and refuse
- * to do the change if sign is wrong. Exclusion between them is provided by
- * the inode->i_lock spinlock.
- */
-
-int get_write_access(struct inode * inode)
-{
-	spin_lock(&inode->i_lock);
-	if (atomic_read(&inode->i_writecount) < 0) {
-		spin_unlock(&inode->i_lock);
-		return -ETXTBSY;
-	}
-	atomic_inc(&inode->i_writecount);
-	spin_unlock(&inode->i_lock);
-
-	return 0;
-}
-
-int deny_write_access(struct file * file)
-{
-	struct inode *inode = file->f_path.dentry->d_inode;
-
-	spin_lock(&inode->i_lock);
-	if (atomic_read(&inode->i_writecount) > 0) {
-		spin_unlock(&inode->i_lock);
-		return -ETXTBSY;
-	}
-	atomic_dec(&inode->i_writecount);
-	spin_unlock(&inode->i_lock);
-
-	return 0;
-}
-
 /**
  * path_get - get a reference to a path
  * @path: path to get the reference to
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index ee456c79b0e6..bc6615d4132b 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -34,6 +34,32 @@ static inline int atomic_inc_not_zero_hint(atomic_t *v, int hint)
 }
 #endif
 
+#ifndef atomic_inc_unless_negative
+static inline int atomic_inc_unless_negative(atomic_t *p)
+{
+	int v, v1;
+	for (v = 0; v >= 0; v = v1) {
+		v1 = atomic_cmpxchg(p, v, v + 1);
+		if (likely(v1 == v))
+			return 1;
+	}
+	return 0;
+}
+#endif
+
+#ifndef atomic_dec_unless_positive
+static inline int atomic_dec_unless_positive(atomic_t *p)
+{
+	int v, v1;
+	for (v = 0; v <= 0; v = v1) {
+		v1 = atomic_cmpxchg(p, v, v - 1);
+		if (likely(v1 == v))
+			return 1;
+	}
+	return 0;
+}
+#endif
+
 #ifndef CONFIG_ARCH_HAS_ATOMIC_OR
 static inline void atomic_or(int i, atomic_t *v)
 {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d04e55586a17..8c84ed930389 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -392,8 +392,8 @@ struct inodes_stat_t {
 #include <linux/semaphore.h>
 #include <linux/fiemap.h>
 #include <linux/rculist_bl.h>
+#include <linux/atomic.h>
 
-#include <asm/atomic.h>
 #include <asm/byteorder.h>
 
 struct export_operations;
@@ -2195,8 +2195,31 @@ static inline bool execute_ok(struct inode *inode)
 	return (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode);
 }
 
-extern int get_write_access(struct inode *);
-extern int deny_write_access(struct file *);
+/*
+ * get_write_access() gets write permission for a file.
+ * put_write_access() releases this write permission.
+ * This is used for regular files.
+ * We cannot support write (and maybe mmap read-write shared) accesses and
+ * MAP_DENYWRITE mmappings simultaneously. The i_writecount field of an inode
+ * can have the following values:
+ * 0: no writers, no VM_DENYWRITE mappings
+ * < 0: (-i_writecount) vm_area_structs with VM_DENYWRITE set exist
+ * > 0: (i_writecount) users are writing to the file.
+ *
+ * Normally we operate on that counter with atomic_{inc,dec} and it's safe
+ * except for the cases where we don't hold i_writecount yet. Then we need to
+ * use {get,deny}_write_access() - these functions check the sign and refuse
+ * to do the change if sign is wrong.
+ */
+static inline int get_write_access(struct inode *inode)
+{
+	return atomic_inc_unless_negative(&inode->i_writecount) ? 0 : -ETXTBSY;
+}
+static inline int deny_write_access(struct file *file)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	return atomic_dec_unless_positive(&inode->i_writecount) ? 0 : -ETXTBSY;
+}
 static inline void put_write_access(struct inode * inode)
 {
 	atomic_dec(&inode->i_writecount);
-- 
cgit v1.2.3


From 178ea73521d64ba41d7aa5488fb9f549c6d4507d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 20 Jun 2011 11:31:30 -0400
Subject: kill check_acl callback of generic_permission()

its value depends only on inode and does not change; we might as
well store it in ->i_op->check_acl and be done with that.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/afs/security.c              |  2 +-
 fs/btrfs/inode.c               |  7 ++++++-
 fs/ceph/inode.c                |  2 +-
 fs/cifs/cifsfs.c               |  2 +-
 fs/fuse/dir.c                  |  4 ++--
 fs/gfs2/inode.c                |  5 ++++-
 fs/hostfs/hostfs_kern.c        |  2 +-
 fs/hpfs/namei.c                |  2 +-
 fs/namei.c                     | 17 +++++++----------
 fs/nfs/dir.c                   |  2 +-
 fs/nilfs2/inode.c              |  2 +-
 fs/ocfs2/file.c                |  4 +++-
 fs/ocfs2/namei.c               |  1 +
 fs/proc/base.c                 |  2 +-
 fs/reiserfs/file.c             |  1 +
 fs/reiserfs/namei.c            |  4 +++-
 fs/reiserfs/xattr.c            | 18 ++++++++----------
 fs/sysfs/inode.c               |  2 +-
 include/linux/fs.h             |  3 +--
 include/linux/reiserfs_xattr.h |  2 ++
 20 files changed, 47 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/fs/afs/security.c b/fs/afs/security.c
index f44b9d355377..745ee654165f 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -350,7 +350,7 @@ int afs_permission(struct inode *inode, int mask, unsigned int flags)
 	}
 
 	key_put(key);
-	ret = generic_permission(inode, mask, flags, NULL);
+	ret = generic_permission(inode, mask, flags);
 	_leave(" = %d", ret);
 	return ret;
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3601f0aebddf..f0bd87371566 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7339,7 +7339,7 @@ static int btrfs_permission(struct inode *inode, int mask, unsigned int flags)
 		return -EROFS;
 	if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE))
 		return -EACCES;
-	return generic_permission(inode, mask, flags, btrfs_check_acl);
+	return generic_permission(inode, mask, flags);
 }
 
 static const struct inode_operations btrfs_dir_inode_operations = {
@@ -7359,10 +7359,12 @@ static const struct inode_operations btrfs_dir_inode_operations = {
 	.listxattr	= btrfs_listxattr,
 	.removexattr	= btrfs_removexattr,
 	.permission	= btrfs_permission,
+	.check_acl	= btrfs_check_acl,
 };
 static const struct inode_operations btrfs_dir_ro_inode_operations = {
 	.lookup		= btrfs_lookup,
 	.permission	= btrfs_permission,
+	.check_acl	= btrfs_check_acl,
 };
 
 static const struct file_operations btrfs_dir_file_operations = {
@@ -7431,6 +7433,7 @@ static const struct inode_operations btrfs_file_inode_operations = {
 	.removexattr	= btrfs_removexattr,
 	.permission	= btrfs_permission,
 	.fiemap		= btrfs_fiemap,
+	.check_acl	= btrfs_check_acl,
 };
 static const struct inode_operations btrfs_special_inode_operations = {
 	.getattr	= btrfs_getattr,
@@ -7440,6 +7443,7 @@ static const struct inode_operations btrfs_special_inode_operations = {
 	.getxattr	= btrfs_getxattr,
 	.listxattr	= btrfs_listxattr,
 	.removexattr	= btrfs_removexattr,
+	.check_acl	= btrfs_check_acl,
 };
 static const struct inode_operations btrfs_symlink_inode_operations = {
 	.readlink	= generic_readlink,
@@ -7451,6 +7455,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
 	.getxattr	= btrfs_getxattr,
 	.listxattr	= btrfs_listxattr,
 	.removexattr	= btrfs_removexattr,
+	.check_acl	= btrfs_check_acl,
 };
 
 const struct dentry_operations btrfs_dentry_operations = {
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index d8858e96ab18..beb5d55d6fd2 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1805,7 +1805,7 @@ int ceph_permission(struct inode *inode, int mask, unsigned int flags)
 	err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED);
 
 	if (!err)
-		err = generic_permission(inode, mask, flags, NULL);
+		err = generic_permission(inode, mask, flags);
 	return err;
 }
 
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index bc4b12ca537b..b79804fa410f 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -239,7 +239,7 @@ static int cifs_permission(struct inode *inode, int mask, unsigned int flags)
 		on the client (above and beyond ACL on servers) for
 		servers which do not support setting and viewing mode bits,
 		so allowing client to check permissions is useful */
-		return generic_permission(inode, mask, flags, NULL);
+		return generic_permission(inode, mask, flags);
 }
 
 static struct kmem_cache *cifs_inode_cachep;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index d50160714595..0a2fcd860ad6 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1018,7 +1018,7 @@ static int fuse_permission(struct inode *inode, int mask, unsigned int flags)
 	}
 
 	if (fc->flags & FUSE_DEFAULT_PERMISSIONS) {
-		err = generic_permission(inode, mask, flags, NULL);
+		err = generic_permission(inode, mask, flags);
 
 		/* If permission is denied, try to refresh file
 		   attributes.  This is also needed, because the root
@@ -1027,7 +1027,7 @@ static int fuse_permission(struct inode *inode, int mask, unsigned int flags)
 			err = fuse_perm_getattr(inode, flags);
 			if (!err)
 				err = generic_permission(inode, mask,
-							flags, NULL);
+							flags);
 		}
 
 		/* Note: the opposite of the above test does not
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 03e0c529063e..d5f0f4ea25dc 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1564,7 +1564,7 @@ int gfs2_permission(struct inode *inode, int mask, unsigned int flags)
 	if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
 		error = -EACCES;
 	else
-		error = generic_permission(inode, mask, flags, gfs2_check_acl);
+		error = generic_permission(inode, mask, flags);
 	if (unlock)
 		gfs2_glock_dq_uninit(&i_gh);
 
@@ -1854,6 +1854,7 @@ const struct inode_operations gfs2_file_iops = {
 	.listxattr = gfs2_listxattr,
 	.removexattr = gfs2_removexattr,
 	.fiemap = gfs2_fiemap,
+	.check_acl = gfs2_check_acl,
 };
 
 const struct inode_operations gfs2_dir_iops = {
@@ -1874,6 +1875,7 @@ const struct inode_operations gfs2_dir_iops = {
 	.listxattr = gfs2_listxattr,
 	.removexattr = gfs2_removexattr,
 	.fiemap = gfs2_fiemap,
+	.check_acl = gfs2_check_acl,
 };
 
 const struct inode_operations gfs2_symlink_iops = {
@@ -1888,5 +1890,6 @@ const struct inode_operations gfs2_symlink_iops = {
 	.listxattr = gfs2_listxattr,
 	.removexattr = gfs2_removexattr,
 	.fiemap = gfs2_fiemap,
+	.check_acl = gfs2_check_acl,
 };
 
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 2638c834ed28..a98d0d1aef65 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -770,7 +770,7 @@ int hostfs_permission(struct inode *ino, int desired, unsigned int flags)
 		err = access_file(name, r, w, x);
 	__putname(name);
 	if (!err)
-		err = generic_permission(ino, desired, flags, NULL);
+		err = generic_permission(ino, desired, flags);
 	return err;
 }
 
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index acf95dab2aac..bd2ce7dd8df3 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -398,7 +398,7 @@ again:
 			hpfs_unlock(dir->i_sb);
 			return -ENOSPC;
 		}
-		if (generic_permission(inode, MAY_WRITE, 0, NULL) ||
+		if (generic_permission(inode, MAY_WRITE, 0) ||
 		    !S_ISREG(inode->i_mode) ||
 		    get_write_access(inode)) {
 			d_rehash(dentry);
diff --git a/fs/namei.c b/fs/namei.c
index d286cbc3f3e5..c5af0f37e67d 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -176,9 +176,9 @@ EXPORT_SYMBOL(putname);
 /*
  * This does basic POSIX ACL permission checking
  */
-static int acl_permission_check(struct inode *inode, int mask, unsigned int flags,
-		int (*check_acl)(struct inode *inode, int mask, unsigned int flags))
+static int acl_permission_check(struct inode *inode, int mask, unsigned int flags)
 {
+	int (*check_acl)(struct inode *inode, int mask, unsigned int flags);
 	unsigned int mode = inode->i_mode;
 
 	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
@@ -189,6 +189,7 @@ static int acl_permission_check(struct inode *inode, int mask, unsigned int flag
 	if (current_fsuid() == inode->i_uid)
 		mode >>= 6;
 	else {
+		check_acl = inode->i_op->check_acl;
 		if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) {
 			int error = check_acl(inode, mask, flags);
 			if (error != -EAGAIN)
@@ -212,7 +213,6 @@ other_perms:
  * generic_permission -  check for access rights on a Posix-like filesystem
  * @inode:	inode to check access rights for
  * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
- * @check_acl:	optional callback to check for Posix ACLs
  * @flags:	IPERM_FLAG_ flags.
  *
  * Used to check for read/write/execute permissions on a file.
@@ -224,15 +224,14 @@ other_perms:
  * request cannot be satisfied (eg. requires blocking or too much complexity).
  * It would then be called again in ref-walk mode.
  */
-int generic_permission(struct inode *inode, int mask, unsigned int flags,
-	int (*check_acl)(struct inode *inode, int mask, unsigned int flags))
+int generic_permission(struct inode *inode, int mask, unsigned int flags)
 {
 	int ret;
 
 	/*
 	 * Do the basic POSIX ACL permission checks.
 	 */
-	ret = acl_permission_check(inode, mask, flags, check_acl);
+	ret = acl_permission_check(inode, mask, flags);
 	if (ret != -EACCES)
 		return ret;
 
@@ -290,8 +289,7 @@ int inode_permission(struct inode *inode, int mask)
 	if (inode->i_op->permission)
 		retval = inode->i_op->permission(inode, mask, 0);
 	else
-		retval = generic_permission(inode, mask, 0,
-				inode->i_op->check_acl);
+		retval = generic_permission(inode, mask, 0);
 
 	if (retval)
 		return retval;
@@ -326,8 +324,7 @@ static inline int exec_permission(struct inode *inode, unsigned int flags)
 		if (likely(!ret))
 			goto ok;
 	} else {
-		ret = acl_permission_check(inode, MAY_EXEC, flags,
-				inode->i_op->check_acl);
+		ret = acl_permission_check(inode, MAY_EXEC, flags);
 		if (likely(!ret))
 			goto ok;
 		if (ret != -EACCES)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index ededdbd0db38..0485dca34fb1 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2328,7 +2328,7 @@ out:
 out_notsup:
 	res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
 	if (res == 0)
-		res = generic_permission(inode, mask, flags, NULL);
+		res = generic_permission(inode, mask, flags);
 	goto out;
 }
 
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index b9b45fc2903e..650aa7755003 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -806,7 +806,7 @@ int nilfs_permission(struct inode *inode, int mask, unsigned int flags)
 	    root->cno != NILFS_CPTREE_CURRENT_CNO)
 		return -EROFS; /* snapshot is not writable */
 
-	return generic_permission(inode, mask, flags, NULL);
+	return generic_permission(inode, mask, flags);
 }
 
 int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh)
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index b1e35a392ca5..d058cb7e12d4 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1293,7 +1293,7 @@ int ocfs2_permission(struct inode *inode, int mask, unsigned int flags)
 		goto out;
 	}
 
-	ret = generic_permission(inode, mask, flags, ocfs2_check_acl);
+	ret = generic_permission(inode, mask, flags);
 
 	ocfs2_inode_unlock(inode, 0);
 out:
@@ -2593,12 +2593,14 @@ const struct inode_operations ocfs2_file_iops = {
 	.listxattr	= ocfs2_listxattr,
 	.removexattr	= generic_removexattr,
 	.fiemap		= ocfs2_fiemap,
+	.check_acl	= ocfs2_check_acl,
 };
 
 const struct inode_operations ocfs2_special_file_iops = {
 	.setattr	= ocfs2_setattr,
 	.getattr	= ocfs2_getattr,
 	.permission	= ocfs2_permission,
+	.check_acl	= ocfs2_check_acl,
 };
 
 /*
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index e5d738cd9cc0..33889dc52dd7 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -2498,4 +2498,5 @@ const struct inode_operations ocfs2_dir_iops = {
 	.listxattr	= ocfs2_listxattr,
 	.removexattr	= generic_removexattr,
 	.fiemap         = ocfs2_fiemap,
+	.check_acl	= ocfs2_check_acl,
 };
diff --git a/fs/proc/base.c b/fs/proc/base.c
index fc5bc2767692..8b8470113576 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2169,7 +2169,7 @@ static const struct file_operations proc_fd_operations = {
  */
 static int proc_fd_permission(struct inode *inode, int mask, unsigned int flags)
 {
-	int rv = generic_permission(inode, mask, flags, NULL);
+	int rv = generic_permission(inode, mask, flags);
 	if (rv == 0)
 		return 0;
 	if (task_pid(current) == proc_pid(inode))
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 91f080cc76c8..bbf31003d308 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -312,4 +312,5 @@ const struct inode_operations reiserfs_file_inode_operations = {
 	.listxattr = reiserfs_listxattr,
 	.removexattr = reiserfs_removexattr,
 	.permission = reiserfs_permission,
+	.check_acl = reiserfs_check_acl,
 };
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 118662690cdf..551f1b79dbc4 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -1529,6 +1529,7 @@ const struct inode_operations reiserfs_dir_inode_operations = {
 	.listxattr = reiserfs_listxattr,
 	.removexattr = reiserfs_removexattr,
 	.permission = reiserfs_permission,
+	.check_acl = reiserfs_check_acl,
 };
 
 /*
@@ -1545,6 +1546,7 @@ const struct inode_operations reiserfs_symlink_inode_operations = {
 	.listxattr = reiserfs_listxattr,
 	.removexattr = reiserfs_removexattr,
 	.permission = reiserfs_permission,
+	.check_acl = reiserfs_check_acl,
 
 };
 
@@ -1558,5 +1560,5 @@ const struct inode_operations reiserfs_special_inode_operations = {
 	.listxattr = reiserfs_listxattr,
 	.removexattr = reiserfs_removexattr,
 	.permission = reiserfs_permission,
-
+	.check_acl = reiserfs_check_acl,
 };
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index d78089690965..ddc5301d2986 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -868,11 +868,17 @@ out:
 	return err;
 }
 
-static int reiserfs_check_acl(struct inode *inode, int mask, unsigned int flags)
+int reiserfs_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
 	struct posix_acl *acl;
 	int error = -EAGAIN; /* do regular unix permission checks by default */
 
+	/*
+	 * Stat data v1 doesn't support ACLs.
+	 */
+	if (get_inode_sd_version(inode) == STAT_DATA_V1)
+		return -EAGAIN;
+
 	if (flags & IPERM_FLAG_RCU)
 		return -ECHILD;
 
@@ -961,15 +967,7 @@ int reiserfs_permission(struct inode *inode, int mask, unsigned int flags)
 	if (IS_PRIVATE(inode))
 		return 0;
 
-#ifdef CONFIG_REISERFS_FS_XATTR
-	/*
-	 * Stat data v1 doesn't support ACLs.
-	 */
-	if (get_inode_sd_version(inode) != STAT_DATA_V1)
-		return generic_permission(inode, mask, flags,
-					reiserfs_check_acl);
-#endif
-	return generic_permission(inode, mask, flags, NULL);
+	return generic_permission(inode, mask, flags);
 }
 
 static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd)
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 0a12eb89cd32..a37165c64757 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -362,5 +362,5 @@ int sysfs_permission(struct inode *inode, int mask, unsigned int flags)
 	sysfs_refresh_inode(sd, inode);
 	mutex_unlock(&sysfs_mutex);
 
-	return generic_permission(inode, mask, flags, NULL);
+	return generic_permission(inode, mask, flags);
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 8c84ed930389..0c15d5e459d5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2187,8 +2187,7 @@ extern sector_t bmap(struct inode *, sector_t);
 #endif
 extern int notify_change(struct dentry *, struct iattr *);
 extern int inode_permission(struct inode *, int);
-extern int generic_permission(struct inode *, int, unsigned int,
-		int (*check_acl)(struct inode *, int, unsigned int));
+extern int generic_permission(struct inode *, int, unsigned int);
 
 static inline bool execute_ok(struct inode *inode)
 {
diff --git a/include/linux/reiserfs_xattr.h b/include/linux/reiserfs_xattr.h
index 6deef5dc95fb..1a3ca8f80200 100644
--- a/include/linux/reiserfs_xattr.h
+++ b/include/linux/reiserfs_xattr.h
@@ -45,6 +45,7 @@ int reiserfs_permission(struct inode *inode, int mask, unsigned int flags);
 
 #ifdef CONFIG_REISERFS_FS_XATTR
 #define has_xattr_dir(inode) (REISERFS_I(inode)->i_flags & i_has_xattr_dir)
+int reiserfs_check_acl(struct inode *inode, int mask, unsigned int flags);
 ssize_t reiserfs_getxattr(struct dentry *dentry, const char *name,
 			  void *buffer, size_t size);
 int reiserfs_setxattr(struct dentry *dentry, const char *name,
@@ -122,6 +123,7 @@ static inline void reiserfs_init_xattr_rwsem(struct inode *inode)
 #define reiserfs_setxattr NULL
 #define reiserfs_listxattr NULL
 #define reiserfs_removexattr NULL
+#define reiserfs_check_acl NULL
 
 static inline void reiserfs_init_xattr_rwsem(struct inode *inode)
 {
-- 
cgit v1.2.3


From 1fc0f78ca9f311c6277e2f1b7655bb4d43ceb311 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 20 Jun 2011 18:59:02 -0400
Subject: ->permission() sanitizing: MAY_NOT_BLOCK

Duplicate the flags argument into mask bitmap.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c            | 7 +++++--
 fs/proc/proc_sysctl.c | 2 +-
 include/linux/fs.h    | 1 +
 3 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index c5af0f37e67d..723a3fe4bc40 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -318,13 +318,16 @@ static inline int exec_permission(struct inode *inode, unsigned int flags)
 {
 	int ret;
 	struct user_namespace *ns = inode_userns(inode);
+	int mask = MAY_EXEC;
+	if (flags & IPERM_FLAG_RCU)
+		mask |= MAY_NOT_BLOCK;
 
 	if (inode->i_op->permission) {
-		ret = inode->i_op->permission(inode, MAY_EXEC, flags);
+		ret = inode->i_op->permission(inode, mask, flags);
 		if (likely(!ret))
 			goto ok;
 	} else {
-		ret = acl_permission_check(inode, MAY_EXEC, flags);
+		ret = acl_permission_check(inode, mask, flags);
 		if (likely(!ret))
 			goto ok;
 		if (ret != -EACCES)
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index d167de365a8d..349b22f434d8 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -316,7 +316,7 @@ static int proc_sys_permission(struct inode *inode, int mask,unsigned int flags)
 	if (!table) /* global root - r-xr-xr-x */
 		error = mask & MAY_WRITE ? -EACCES : 0;
 	else /* Use the permissions on the sysctl table entry */
-		error = sysctl_perm(head->root, table, mask);
+		error = sysctl_perm(head->root, table, mask & ~MAY_NOT_BLOCK);
 
 	sysctl_head_finish(head);
 	return error;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 0c15d5e459d5..60c1fe65bb2d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -63,6 +63,7 @@ struct inodes_stat_t {
 #define MAY_ACCESS 16
 #define MAY_OPEN 32
 #define MAY_CHDIR 64
+#define MAY_NOT_BLOCK 128	/* called from RCU mode, don't block */
 
 /*
  * flags in file.f_mode.  Note that FMODE_READ and FMODE_WRITE must correspond
-- 
cgit v1.2.3


From 7e40145eb111a5192e6d819f764db9d6828d1abb Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 20 Jun 2011 19:12:17 -0400
Subject: ->permission() sanitizing: don't pass flags to ->check_acl()

not used in the instances anymore.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/Locking |  2 +-
 Documentation/filesystems/vfs.txt |  2 +-
 fs/9p/acl.c                       |  2 +-
 fs/9p/acl.h                       |  2 +-
 fs/btrfs/acl.c                    |  2 +-
 fs/btrfs/ctree.h                  |  2 +-
 fs/ext2/acl.c                     |  2 +-
 fs/ext2/acl.h                     |  2 +-
 fs/ext3/acl.c                     |  2 +-
 fs/ext3/acl.h                     |  2 +-
 fs/ext4/acl.c                     |  2 +-
 fs/ext4/acl.h                     |  2 +-
 fs/generic_acl.c                  |  2 +-
 fs/gfs2/acl.c                     |  2 +-
 fs/gfs2/acl.h                     |  2 +-
 fs/jffs2/acl.c                    |  2 +-
 fs/jffs2/acl.h                    |  2 +-
 fs/jfs/acl.c                      |  2 +-
 fs/jfs/jfs_acl.h                  |  2 +-
 fs/namei.c                        | 10 +++++-----
 fs/ocfs2/acl.c                    |  2 +-
 fs/ocfs2/acl.h                    |  2 +-
 fs/reiserfs/xattr.c               |  2 +-
 fs/xfs/linux-2.6/xfs_acl.c        |  2 +-
 fs/xfs/xfs_acl.h                  |  2 +-
 include/linux/fs.h                |  2 +-
 include/linux/generic_acl.h       |  2 +-
 include/linux/reiserfs_xattr.h    |  2 +-
 28 files changed, 32 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 57d827d6071d..9b6ed7c9f34f 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -52,7 +52,7 @@ ata *);
 	void (*put_link) (struct dentry *, struct nameidata *, void *);
 	void (*truncate) (struct inode *);
 	int (*permission) (struct inode *, int, unsigned int);
-	int (*check_acl)(struct inode *, int, unsigned int);
+	int (*check_acl)(struct inode *, int);
 	int (*setattr) (struct dentry *, struct iattr *);
 	int (*getattr) (struct vfsmount *, struct dentry *, struct kstat *);
 	int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 88b9f5519af9..8b4c8e04d879 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -334,7 +334,7 @@ struct inode_operations {
         void (*put_link) (struct dentry *, struct nameidata *, void *);
 	void (*truncate) (struct inode *);
 	int (*permission) (struct inode *, int, unsigned int);
-	int (*check_acl)(struct inode *, int, unsigned int);
+	int (*check_acl)(struct inode *, int);
 	int (*setattr) (struct dentry *, struct iattr *);
 	int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *);
 	int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 94af68b092af..e98f56d3787d 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -96,7 +96,7 @@ static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type)
 	return acl;
 }
 
-int v9fs_check_acl(struct inode *inode, int mask, unsigned int flags)
+int v9fs_check_acl(struct inode *inode, int mask)
 {
 	struct posix_acl *acl;
 	struct v9fs_session_info *v9ses;
diff --git a/fs/9p/acl.h b/fs/9p/acl.h
index 7ef3ac9f6d95..59e18c2e8c7e 100644
--- a/fs/9p/acl.h
+++ b/fs/9p/acl.h
@@ -16,7 +16,7 @@
 
 #ifdef CONFIG_9P_FS_POSIX_ACL
 extern int v9fs_get_acl(struct inode *, struct p9_fid *);
-extern int v9fs_check_acl(struct inode *inode, int mask, unsigned int flags);
+extern int v9fs_check_acl(struct inode *inode, int mask);
 extern int v9fs_acl_chmod(struct dentry *);
 extern int v9fs_set_create_acl(struct dentry *,
 			       struct posix_acl *, struct posix_acl *);
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index a25a4a2e0df2..9f62ab2a7282 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -195,7 +195,7 @@ out:
 	return ret;
 }
 
-int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags)
+int btrfs_check_acl(struct inode *inode, int mask)
 {
 	int error = -EAGAIN;
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 3b859a3e6a0e..9864cec801ed 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2642,7 +2642,7 @@ do {								\
 
 /* acl.c */
 #ifdef CONFIG_BTRFS_FS_POSIX_ACL
-int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags);
+int btrfs_check_acl(struct inode *inode, int mask);
 #else
 #define btrfs_check_acl NULL
 #endif
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 6b9442d1be52..bfe651f9ae16 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -232,7 +232,7 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 }
 
 int
-ext2_check_acl(struct inode *inode, int mask, unsigned int flags)
+ext2_check_acl(struct inode *inode, int mask)
 {
 	struct posix_acl *acl;
 
diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h
index c939b7b12099..3ff6cbb9ac44 100644
--- a/fs/ext2/acl.h
+++ b/fs/ext2/acl.h
@@ -54,7 +54,7 @@ static inline int ext2_acl_count(size_t size)
 #ifdef CONFIG_EXT2_FS_POSIX_ACL
 
 /* acl.c */
-extern int ext2_check_acl (struct inode *, int, unsigned int);
+extern int ext2_check_acl (struct inode *, int);
 extern int ext2_acl_chmod (struct inode *);
 extern int ext2_init_acl (struct inode *, struct inode *);
 
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index 0a6940d6c30c..edfeb293d4cb 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -240,7 +240,7 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
 }
 
 int
-ext3_check_acl(struct inode *inode, int mask, unsigned int flags)
+ext3_check_acl(struct inode *inode, int mask)
 {
 	struct posix_acl *acl;
 
diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h
index 5faf8048e906..597334626de9 100644
--- a/fs/ext3/acl.h
+++ b/fs/ext3/acl.h
@@ -54,7 +54,7 @@ static inline int ext3_acl_count(size_t size)
 #ifdef CONFIG_EXT3_FS_POSIX_ACL
 
 /* acl.c */
-extern int ext3_check_acl (struct inode *, int, unsigned int);
+extern int ext3_check_acl (struct inode *, int);
 extern int ext3_acl_chmod (struct inode *);
 extern int ext3_init_acl (handle_t *, struct inode *, struct inode *);
 
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 4f54252e439e..60d900fcc3db 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -238,7 +238,7 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
 }
 
 int
-ext4_check_acl(struct inode *inode, int mask, unsigned int flags)
+ext4_check_acl(struct inode *inode, int mask)
 {
 	struct posix_acl *acl;
 
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index dec821168fd4..9d843d5deac4 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -54,7 +54,7 @@ static inline int ext4_acl_count(size_t size)
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
 
 /* acl.c */
-extern int ext4_check_acl(struct inode *, int, unsigned int);
+extern int ext4_check_acl(struct inode *, int);
 extern int ext4_acl_chmod(struct inode *);
 extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
 
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index 5976bb1fa4ca..70e90b4974ce 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -190,7 +190,7 @@ generic_acl_chmod(struct inode *inode)
 }
 
 int
-generic_check_acl(struct inode *inode, int mask, unsigned int flags)
+generic_check_acl(struct inode *inode, int mask)
 {
 	if (mask & MAY_NOT_BLOCK) {
 		if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 4d97352d39a1..8ef1079f1665 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -75,7 +75,7 @@ static struct posix_acl *gfs2_acl_get(struct gfs2_inode *ip, int type)
  * Returns: errno
  */
 
-int gfs2_check_acl(struct inode *inode, int mask, unsigned int flags)
+int gfs2_check_acl(struct inode *inode, int mask)
 {
 	struct posix_acl *acl;
 	int error;
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index a93907c8159b..b522b0cb39ea 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -16,7 +16,7 @@
 #define GFS2_POSIX_ACL_DEFAULT		"posix_acl_default"
 #define GFS2_ACL_MAX_ENTRIES		25
 
-extern int gfs2_check_acl(struct inode *inode, int mask, unsigned int);
+extern int gfs2_check_acl(struct inode *inode, int mask);
 extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode);
 extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
 extern const struct xattr_handler gfs2_xattr_system_handler;
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 952afb59e6f1..3675b3cdee89 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -259,7 +259,7 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 	return rc;
 }
 
-int jffs2_check_acl(struct inode *inode, int mask, unsigned int flags)
+int jffs2_check_acl(struct inode *inode, int mask)
 {
 	struct posix_acl *acl;
 	int rc;
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index 3119f59253d3..5e42de8d9541 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -26,7 +26,7 @@ struct jffs2_acl_header {
 
 #ifdef CONFIG_JFFS2_FS_POSIX_ACL
 
-extern int jffs2_check_acl(struct inode *, int, unsigned int);
+extern int jffs2_check_acl(struct inode *, int);
 extern int jffs2_acl_chmod(struct inode *);
 extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *);
 extern int jffs2_init_acl_post(struct inode *);
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index 859ae5a92166..8a0a0666d5a6 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -114,7 +114,7 @@ out:
 	return rc;
 }
 
-int jfs_check_acl(struct inode *inode, int mask, unsigned int flags)
+int jfs_check_acl(struct inode *inode, int mask)
 {
 	struct posix_acl *acl;
 
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
index f9285c4900fa..54e07559878d 100644
--- a/fs/jfs/jfs_acl.h
+++ b/fs/jfs/jfs_acl.h
@@ -20,7 +20,7 @@
 
 #ifdef CONFIG_JFS_POSIX_ACL
 
-int jfs_check_acl(struct inode *, int, unsigned int flags);
+int jfs_check_acl(struct inode *, int);
 int jfs_init_acl(tid_t, struct inode *, struct inode *);
 int jfs_acl_chmod(struct inode *inode);
 
diff --git a/fs/namei.c b/fs/namei.c
index e0624e2f0bba..560fd1dff1d0 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -176,9 +176,9 @@ EXPORT_SYMBOL(putname);
 /*
  * This does basic POSIX ACL permission checking
  */
-static int acl_permission_check(struct inode *inode, int mask, unsigned int flags)
+static int acl_permission_check(struct inode *inode, int mask)
 {
-	int (*check_acl)(struct inode *inode, int mask, unsigned int flags);
+	int (*check_acl)(struct inode *inode, int mask);
 	unsigned int mode = inode->i_mode;
 
 	mask &= MAY_READ | MAY_WRITE | MAY_EXEC | MAY_NOT_BLOCK;
@@ -191,7 +191,7 @@ static int acl_permission_check(struct inode *inode, int mask, unsigned int flag
 	else {
 		check_acl = inode->i_op->check_acl;
 		if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) {
-			int error = check_acl(inode, mask, flags);
+			int error = check_acl(inode, mask);
 			if (error != -EAGAIN)
 				return error;
 		}
@@ -231,7 +231,7 @@ int generic_permission(struct inode *inode, int mask, unsigned int flags)
 	/*
 	 * Do the basic POSIX ACL permission checks.
 	 */
-	ret = acl_permission_check(inode, mask, flags);
+	ret = acl_permission_check(inode, mask);
 	if (ret != -EACCES)
 		return ret;
 
@@ -327,7 +327,7 @@ static inline int exec_permission(struct inode *inode, unsigned int flags)
 		if (likely(!ret))
 			goto ok;
 	} else {
-		ret = acl_permission_check(inode, mask, flags);
+		ret = acl_permission_check(inode, mask);
 		if (likely(!ret))
 			goto ok;
 		if (ret != -EACCES)
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 4b683ccc4506..1cee970eb55a 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -290,7 +290,7 @@ static int ocfs2_set_acl(handle_t *handle,
 	return ret;
 }
 
-int ocfs2_check_acl(struct inode *inode, int mask, unsigned int flags)
+int ocfs2_check_acl(struct inode *inode, int mask)
 {
 	struct ocfs2_super *osb;
 	struct buffer_head *di_bh = NULL;
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
index 4fe7c9cf4bfb..5c5d31f05853 100644
--- a/fs/ocfs2/acl.h
+++ b/fs/ocfs2/acl.h
@@ -26,7 +26,7 @@ struct ocfs2_acl_entry {
 	__le32 e_id;
 };
 
-extern int ocfs2_check_acl(struct inode *, int, unsigned int);
+extern int ocfs2_check_acl(struct inode *, int);
 extern int ocfs2_acl_chmod(struct inode *);
 extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
 			  struct buffer_head *, struct buffer_head *,
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 6747470ec103..6ee3c11aa8d9 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -868,7 +868,7 @@ out:
 	return err;
 }
 
-int reiserfs_check_acl(struct inode *inode, int mask, unsigned int flags)
+int reiserfs_check_acl(struct inode *inode, int mask)
 {
 	struct posix_acl *acl;
 	int error = -EAGAIN; /* do regular unix permission checks by default */
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index 278e6736135a..a5dcd6a0f1b5 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -219,7 +219,7 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 }
 
 int
-xfs_check_acl(struct inode *inode, int mask, unsigned int flags)
+xfs_check_acl(struct inode *inode, int mask)
 {
 	struct xfs_inode *ip;
 	struct posix_acl *acl;
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 11dd72070cbb..0135e2a669d7 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -42,7 +42,7 @@ struct xfs_acl {
 #define SGI_ACL_DEFAULT_SIZE	(sizeof(SGI_ACL_DEFAULT)-1)
 
 #ifdef CONFIG_XFS_POSIX_ACL
-extern int xfs_check_acl(struct inode *inode, int mask, unsigned int flags);
+extern int xfs_check_acl(struct inode *inode, int mask);
 extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
 extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl);
 extern int xfs_acl_chmod(struct inode *inode);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 60c1fe65bb2d..f218b42718aa 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1579,7 +1579,7 @@ struct inode_operations {
 	struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *);
 	void * (*follow_link) (struct dentry *, struct nameidata *);
 	int (*permission) (struct inode *, int, unsigned int);
-	int (*check_acl)(struct inode *, int, unsigned int);
+	int (*check_acl)(struct inode *, int);
 
 	int (*readlink) (struct dentry *, char __user *,int);
 	void (*put_link) (struct dentry *, struct nameidata *, void *);
diff --git a/include/linux/generic_acl.h b/include/linux/generic_acl.h
index 0437e377b555..574bea4013b6 100644
--- a/include/linux/generic_acl.h
+++ b/include/linux/generic_acl.h
@@ -10,6 +10,6 @@ extern const struct xattr_handler generic_acl_default_handler;
 
 int generic_acl_init(struct inode *, struct inode *);
 int generic_acl_chmod(struct inode *);
-int generic_check_acl(struct inode *inode, int mask, unsigned int flags);
+int generic_check_acl(struct inode *inode, int mask);
 
 #endif /* LINUX_GENERIC_ACL_H */
diff --git a/include/linux/reiserfs_xattr.h b/include/linux/reiserfs_xattr.h
index 1a3ca8f80200..424ba6416e7c 100644
--- a/include/linux/reiserfs_xattr.h
+++ b/include/linux/reiserfs_xattr.h
@@ -45,7 +45,7 @@ int reiserfs_permission(struct inode *inode, int mask, unsigned int flags);
 
 #ifdef CONFIG_REISERFS_FS_XATTR
 #define has_xattr_dir(inode) (REISERFS_I(inode)->i_flags & i_has_xattr_dir)
-int reiserfs_check_acl(struct inode *inode, int mask, unsigned int flags);
+int reiserfs_check_acl(struct inode *inode, int mask);
 ssize_t reiserfs_getxattr(struct dentry *dentry, const char *name,
 			  void *buffer, size_t size);
 int reiserfs_setxattr(struct dentry *dentry, const char *name,
-- 
cgit v1.2.3


From 2830ba7f34ebb27c4e5b8b6ef408cd6d74860890 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 20 Jun 2011 19:16:29 -0400
Subject: ->permission() sanitizing: don't pass flags to generic_permission()

redundant; all callers get it duplicated in mask & MAY_NOT_BLOCK and none of
them removes that bit.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/afs/security.c       | 2 +-
 fs/btrfs/inode.c        | 2 +-
 fs/ceph/inode.c         | 2 +-
 fs/cifs/cifsfs.c        | 2 +-
 fs/fuse/dir.c           | 5 ++---
 fs/gfs2/inode.c         | 2 +-
 fs/hostfs/hostfs_kern.c | 2 +-
 fs/hpfs/namei.c         | 2 +-
 fs/namei.c              | 4 ++--
 fs/nfs/dir.c            | 2 +-
 fs/nilfs2/inode.c       | 2 +-
 fs/ocfs2/file.c         | 2 +-
 fs/proc/base.c          | 2 +-
 fs/reiserfs/xattr.c     | 2 +-
 fs/sysfs/inode.c        | 2 +-
 include/linux/fs.h      | 2 +-
 16 files changed, 18 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/fs/afs/security.c b/fs/afs/security.c
index 745ee654165f..ab6b3147f05f 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -350,7 +350,7 @@ int afs_permission(struct inode *inode, int mask, unsigned int flags)
 	}
 
 	key_put(key);
-	ret = generic_permission(inode, mask, flags);
+	ret = generic_permission(inode, mask);
 	_leave(" = %d", ret);
 	return ret;
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f0bd87371566..b61b0477a8ac 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7339,7 +7339,7 @@ static int btrfs_permission(struct inode *inode, int mask, unsigned int flags)
 		return -EROFS;
 	if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE))
 		return -EACCES;
-	return generic_permission(inode, mask, flags);
+	return generic_permission(inode, mask);
 }
 
 static const struct inode_operations btrfs_dir_inode_operations = {
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index beb5d55d6fd2..9c169741b416 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1805,7 +1805,7 @@ int ceph_permission(struct inode *inode, int mask, unsigned int flags)
 	err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED);
 
 	if (!err)
-		err = generic_permission(inode, mask, flags);
+		err = generic_permission(inode, mask);
 	return err;
 }
 
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index b79804fa410f..b285b5bcf711 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -239,7 +239,7 @@ static int cifs_permission(struct inode *inode, int mask, unsigned int flags)
 		on the client (above and beyond ACL on servers) for
 		servers which do not support setting and viewing mode bits,
 		so allowing client to check permissions is useful */
-		return generic_permission(inode, mask, flags);
+		return generic_permission(inode, mask);
 }
 
 static struct kmem_cache *cifs_inode_cachep;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 0a2fcd860ad6..0df56b6c26e2 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1018,7 +1018,7 @@ static int fuse_permission(struct inode *inode, int mask, unsigned int flags)
 	}
 
 	if (fc->flags & FUSE_DEFAULT_PERMISSIONS) {
-		err = generic_permission(inode, mask, flags);
+		err = generic_permission(inode, mask);
 
 		/* If permission is denied, try to refresh file
 		   attributes.  This is also needed, because the root
@@ -1026,8 +1026,7 @@ static int fuse_permission(struct inode *inode, int mask, unsigned int flags)
 		if (err == -EACCES && !refreshed) {
 			err = fuse_perm_getattr(inode, flags);
 			if (!err)
-				err = generic_permission(inode, mask,
-							flags);
+				err = generic_permission(inode, mask);
 		}
 
 		/* Note: the opposite of the above test does not
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index d5f0f4ea25dc..b776ec8f9c19 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1564,7 +1564,7 @@ int gfs2_permission(struct inode *inode, int mask, unsigned int flags)
 	if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
 		error = -EACCES;
 	else
-		error = generic_permission(inode, mask, flags);
+		error = generic_permission(inode, mask);
 	if (unlock)
 		gfs2_glock_dq_uninit(&i_gh);
 
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index a98d0d1aef65..b1bc31bde833 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -770,7 +770,7 @@ int hostfs_permission(struct inode *ino, int desired, unsigned int flags)
 		err = access_file(name, r, w, x);
 	__putname(name);
 	if (!err)
-		err = generic_permission(ino, desired, flags);
+		err = generic_permission(ino, desired);
 	return err;
 }
 
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index bd2ce7dd8df3..2df69e2f07cf 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -398,7 +398,7 @@ again:
 			hpfs_unlock(dir->i_sb);
 			return -ENOSPC;
 		}
-		if (generic_permission(inode, MAY_WRITE, 0) ||
+		if (generic_permission(inode, MAY_WRITE) ||
 		    !S_ISREG(inode->i_mode) ||
 		    get_write_access(inode)) {
 			d_rehash(dentry);
diff --git a/fs/namei.c b/fs/namei.c
index 560fd1dff1d0..684e0f30cf4c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -224,7 +224,7 @@ other_perms:
  * request cannot be satisfied (eg. requires blocking or too much complexity).
  * It would then be called again in ref-walk mode.
  */
-int generic_permission(struct inode *inode, int mask, unsigned int flags)
+int generic_permission(struct inode *inode, int mask)
 {
 	int ret;
 
@@ -289,7 +289,7 @@ int inode_permission(struct inode *inode, int mask)
 	if (inode->i_op->permission)
 		retval = inode->i_op->permission(inode, mask, 0);
 	else
-		retval = generic_permission(inode, mask, 0);
+		retval = generic_permission(inode, mask);
 
 	if (retval)
 		return retval;
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 0485dca34fb1..16cf84b4afb9 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2328,7 +2328,7 @@ out:
 out_notsup:
 	res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
 	if (res == 0)
-		res = generic_permission(inode, mask, flags);
+		res = generic_permission(inode, mask);
 	goto out;
 }
 
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 650aa7755003..0df6de58bbcb 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -806,7 +806,7 @@ int nilfs_permission(struct inode *inode, int mask, unsigned int flags)
 	    root->cno != NILFS_CPTREE_CURRENT_CNO)
 		return -EROFS; /* snapshot is not writable */
 
-	return generic_permission(inode, mask, flags);
+	return generic_permission(inode, mask);
 }
 
 int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh)
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index d058cb7e12d4..ecb52b028964 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1293,7 +1293,7 @@ int ocfs2_permission(struct inode *inode, int mask, unsigned int flags)
 		goto out;
 	}
 
-	ret = generic_permission(inode, mask, flags);
+	ret = generic_permission(inode, mask);
 
 	ocfs2_inode_unlock(inode, 0);
 out:
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 8b8470113576..53a1a961b25f 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2169,7 +2169,7 @@ static const struct file_operations proc_fd_operations = {
  */
 static int proc_fd_permission(struct inode *inode, int mask, unsigned int flags)
 {
-	int rv = generic_permission(inode, mask, flags);
+	int rv = generic_permission(inode, mask);
 	if (rv == 0)
 		return 0;
 	if (task_pid(current) == proc_pid(inode))
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 6ee3c11aa8d9..f17319613a85 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -967,7 +967,7 @@ int reiserfs_permission(struct inode *inode, int mask, unsigned int flags)
 	if (IS_PRIVATE(inode))
 		return 0;
 
-	return generic_permission(inode, mask, flags);
+	return generic_permission(inode, mask);
 }
 
 static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd)
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index a37165c64757..04c81e5ba6f7 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -362,5 +362,5 @@ int sysfs_permission(struct inode *inode, int mask, unsigned int flags)
 	sysfs_refresh_inode(sd, inode);
 	mutex_unlock(&sysfs_mutex);
 
-	return generic_permission(inode, mask, flags);
+	return generic_permission(inode, mask);
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f218b42718aa..a1689c13eb77 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2188,7 +2188,7 @@ extern sector_t bmap(struct inode *, sector_t);
 #endif
 extern int notify_change(struct dentry *, struct iattr *);
 extern int inode_permission(struct inode *, int);
-extern int generic_permission(struct inode *, int, unsigned int);
+extern int generic_permission(struct inode *, int);
 
 static inline bool execute_ok(struct inode *inode)
 {
-- 
cgit v1.2.3


From 10556cb21a0d0b24d95f00ea6df16f599a3345b2 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 20 Jun 2011 19:28:19 -0400
Subject: ->permission() sanitizing: don't pass flags to ->permission()

not used by the instances anymore.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/vfs.txt |  4 ++--
 fs/afs/internal.h                 |  2 +-
 fs/afs/security.c                 |  4 ++--
 fs/bad_inode.c                    |  2 +-
 fs/btrfs/inode.c                  |  2 +-
 fs/ceph/inode.c                   |  4 ++--
 fs/ceph/super.h                   |  2 +-
 fs/cifs/cifsfs.c                  |  2 +-
 fs/coda/coda_linux.h              |  2 +-
 fs/coda/dir.c                     |  4 ++--
 fs/coda/pioctl.c                  |  4 ++--
 fs/ecryptfs/inode.c               |  4 ++--
 fs/fuse/dir.c                     | 14 +++++++-------
 fs/gfs2/file.c                    |  2 +-
 fs/gfs2/inode.c                   | 16 ++++++++--------
 fs/gfs2/inode.h                   |  2 +-
 fs/hostfs/hostfs_kern.c           |  4 ++--
 fs/namei.c                        |  4 ++--
 fs/nfs/dir.c                      |  4 ++--
 fs/nilfs2/inode.c                 |  2 +-
 fs/nilfs2/nilfs.h                 |  2 +-
 fs/ocfs2/file.c                   |  4 ++--
 fs/ocfs2/file.h                   |  2 +-
 fs/proc/base.c                    |  2 +-
 fs/proc/proc_sysctl.c             |  2 +-
 fs/reiserfs/xattr.c               |  2 +-
 fs/sysfs/inode.c                  |  4 ++--
 fs/sysfs/sysfs.h                  |  2 +-
 include/linux/fs.h                |  2 +-
 include/linux/nfs_fs.h            |  2 +-
 include/linux/reiserfs_xattr.h    |  2 +-
 31 files changed, 55 insertions(+), 55 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 8b4c8e04d879..d56151fe77dc 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -333,7 +333,7 @@ struct inode_operations {
         void * (*follow_link) (struct dentry *, struct nameidata *);
         void (*put_link) (struct dentry *, struct nameidata *, void *);
 	void (*truncate) (struct inode *);
-	int (*permission) (struct inode *, int, unsigned int);
+	int (*permission) (struct inode *, int);
 	int (*check_acl)(struct inode *, int);
 	int (*setattr) (struct dentry *, struct iattr *);
 	int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *);
@@ -423,7 +423,7 @@ otherwise noted.
   permission: called by the VFS to check for access rights on a POSIX-like
   	filesystem.
 
-	May be called in rcu-walk mode (flags & IPERM_FLAG_RCU). If in rcu-walk
+	May be called in rcu-walk mode (mask & MAY_NOT_BLOCK). If in rcu-walk
         mode, the filesystem must check the permission without blocking or
 	storing to the inode.
 
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 5a9b6843bac1..f396d337b817 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -627,7 +627,7 @@ extern void afs_clear_permits(struct afs_vnode *);
 extern void afs_cache_permit(struct afs_vnode *, struct key *, long);
 extern void afs_zap_permits(struct rcu_head *);
 extern struct key *afs_request_key(struct afs_cell *);
-extern int afs_permission(struct inode *, int, unsigned int);
+extern int afs_permission(struct inode *, int);
 
 /*
  * server.c
diff --git a/fs/afs/security.c b/fs/afs/security.c
index ab6b3147f05f..8d010422dc89 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -285,14 +285,14 @@ static int afs_check_permit(struct afs_vnode *vnode, struct key *key,
  * - AFS ACLs are attached to directories only, and a file is controlled by its
  *   parent directory's ACL
  */
-int afs_permission(struct inode *inode, int mask, unsigned int flags)
+int afs_permission(struct inode *inode, int mask)
 {
 	struct afs_vnode *vnode = AFS_FS_I(inode);
 	afs_access_t uninitialized_var(access);
 	struct key *key;
 	int ret;
 
-	if (flags & IPERM_FLAG_RCU)
+	if (mask & MAY_NOT_BLOCK)
 		return -ECHILD;
 
 	_enter("{{%x:%u},%lx},%x,",
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index bfcb18feb1df..f024d8aaddef 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -229,7 +229,7 @@ static int bad_inode_readlink(struct dentry *dentry, char __user *buffer,
 	return -EIO;
 }
 
-static int bad_inode_permission(struct inode *inode, int mask, unsigned int flags)
+static int bad_inode_permission(struct inode *inode, int mask)
 {
 	return -EIO;
 }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b61b0477a8ac..cb170ca51c64 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7331,7 +7331,7 @@ static int btrfs_set_page_dirty(struct page *page)
 	return __set_page_dirty_nobuffers(page);
 }
 
-static int btrfs_permission(struct inode *inode, int mask, unsigned int flags)
+static int btrfs_permission(struct inode *inode, int mask)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 9c169741b416..dfb2831d8d85 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1795,11 +1795,11 @@ int ceph_do_getattr(struct inode *inode, int mask)
  * Check inode permissions.  We verify we have a valid value for
  * the AUTH cap, then call the generic handler.
  */
-int ceph_permission(struct inode *inode, int mask, unsigned int flags)
+int ceph_permission(struct inode *inode, int mask)
 {
 	int err;
 
-	if (flags & IPERM_FLAG_RCU)
+	if (mask & MAY_NOT_BLOCK)
 		return -ECHILD;
 
 	err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index f5cabefa98dc..56c41ef47cad 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -692,7 +692,7 @@ extern void ceph_queue_invalidate(struct inode *inode);
 extern void ceph_queue_writeback(struct inode *inode);
 
 extern int ceph_do_getattr(struct inode *inode, int mask);
-extern int ceph_permission(struct inode *inode, int mask, unsigned int flags);
+extern int ceph_permission(struct inode *inode, int mask);
 extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
 extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
 			struct kstat *stat);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index b285b5bcf711..cbbb55e781d3 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -224,7 +224,7 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	return 0;
 }
 
-static int cifs_permission(struct inode *inode, int mask, unsigned int flags)
+static int cifs_permission(struct inode *inode, int mask)
 {
 	struct cifs_sb_info *cifs_sb;
 
diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h
index 9b0c5323890b..44e17e9c21ae 100644
--- a/fs/coda/coda_linux.h
+++ b/fs/coda/coda_linux.h
@@ -39,7 +39,7 @@ extern const struct file_operations coda_ioctl_operations;
 /* operations shared over more than one file */
 int coda_open(struct inode *i, struct file *f);
 int coda_release(struct inode *i, struct file *f);
-int coda_permission(struct inode *inode, int mask, unsigned int flags);
+int coda_permission(struct inode *inode, int mask);
 int coda_revalidate_inode(struct dentry *);
 int coda_getattr(struct vfsmount *, struct dentry *, struct kstat *);
 int coda_setattr(struct dentry *, struct iattr *);
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 2b8dae4d121e..cd5532398c15 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -132,11 +132,11 @@ exit:
 }
 
 
-int coda_permission(struct inode *inode, int mask, unsigned int flags)
+int coda_permission(struct inode *inode, int mask)
 {
 	int error;
 
-	if (flags & IPERM_FLAG_RCU)
+	if (mask & MAY_NOT_BLOCK)
 		return -ECHILD;
 
 	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index cb140ef293e4..ee0981f1375b 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -24,7 +24,7 @@
 #include "coda_linux.h"
 
 /* pioctl ops */
-static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags);
+static int coda_ioctl_permission(struct inode *inode, int mask);
 static long coda_pioctl(struct file *filp, unsigned int cmd,
 			unsigned long user_data);
 
@@ -41,7 +41,7 @@ const struct file_operations coda_ioctl_operations = {
 };
 
 /* the coda pioctl inode ops */
-static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags)
+static int coda_ioctl_permission(struct inode *inode, int mask)
 {
 	return (mask & MAY_EXEC) ? -EACCES : 0;
 }
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 7349ade17de6..bec75f8e91ac 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -942,9 +942,9 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
 }
 
 static int
-ecryptfs_permission(struct inode *inode, int mask, unsigned int flags)
+ecryptfs_permission(struct inode *inode, int mask)
 {
-	if (flags & IPERM_FLAG_RCU)
+	if (mask & MAY_NOT_BLOCK)
 		return -ECHILD;
 	return inode_permission(ecryptfs_inode_to_lower(inode), mask);
 }
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 0df56b6c26e2..e2b14001cea5 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -971,9 +971,9 @@ static int fuse_access(struct inode *inode, int mask)
 	return err;
 }
 
-static int fuse_perm_getattr(struct inode *inode, int flags)
+static int fuse_perm_getattr(struct inode *inode, int mask)
 {
-	if (flags & IPERM_FLAG_RCU)
+	if (mask & MAY_NOT_BLOCK)
 		return -ECHILD;
 
 	return fuse_do_getattr(inode, NULL, NULL);
@@ -992,7 +992,7 @@ static int fuse_perm_getattr(struct inode *inode, int flags)
  * access request is sent.  Execute permission is still checked
  * locally based on file mode.
  */
-static int fuse_permission(struct inode *inode, int mask, unsigned int flags)
+static int fuse_permission(struct inode *inode, int mask)
 {
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	bool refreshed = false;
@@ -1011,7 +1011,7 @@ static int fuse_permission(struct inode *inode, int mask, unsigned int flags)
 		if (fi->i_time < get_jiffies_64()) {
 			refreshed = true;
 
-			err = fuse_perm_getattr(inode, flags);
+			err = fuse_perm_getattr(inode, mask);
 			if (err)
 				return err;
 		}
@@ -1024,7 +1024,7 @@ static int fuse_permission(struct inode *inode, int mask, unsigned int flags)
 		   attributes.  This is also needed, because the root
 		   node will at first have no permissions */
 		if (err == -EACCES && !refreshed) {
-			err = fuse_perm_getattr(inode, flags);
+			err = fuse_perm_getattr(inode, mask);
 			if (!err)
 				err = generic_permission(inode, mask);
 		}
@@ -1034,7 +1034,7 @@ static int fuse_permission(struct inode *inode, int mask, unsigned int flags)
 		   noticed immediately, only after the attribute
 		   timeout has expired */
 	} else if (mask & (MAY_ACCESS | MAY_CHDIR)) {
-		if (flags & IPERM_FLAG_RCU)
+		if (mask & MAY_NOT_BLOCK)
 			return -ECHILD;
 
 		err = fuse_access(inode, mask);
@@ -1043,7 +1043,7 @@ static int fuse_permission(struct inode *inode, int mask, unsigned int flags)
 			if (refreshed)
 				return -EACCES;
 
-			err = fuse_perm_getattr(inode, flags);
+			err = fuse_perm_getattr(inode, mask);
 			if (!err && !(inode->i_mode & S_IXUGO))
 				return -EACCES;
 		}
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index a9f5cbe45cd9..89c39e53760d 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -243,7 +243,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
 	    !capable(CAP_LINUX_IMMUTABLE))
 		goto out;
 	if (!IS_IMMUTABLE(inode)) {
-		error = gfs2_permission(inode, MAY_WRITE, 0);
+		error = gfs2_permission(inode, MAY_WRITE);
 		if (error)
 			goto out;
 	}
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index b776ec8f9c19..b1090d66a6fd 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -307,7 +307,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
 	}
 
 	if (!is_root) {
-		error = gfs2_permission(dir, MAY_EXEC, 0);
+		error = gfs2_permission(dir, MAY_EXEC);
 		if (error)
 			goto out;
 	}
@@ -337,7 +337,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
 {
 	int error;
 
-	error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, 0);
+	error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
 	if (error)
 		return error;
 
@@ -857,7 +857,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
 	if (inode->i_nlink == 0)
 		goto out_gunlock;
 
-	error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC, 0);
+	error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC);
 	if (error)
 		goto out_gunlock;
 
@@ -990,7 +990,7 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
 	if (IS_APPEND(&dip->i_inode))
 		return -EPERM;
 
-	error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, 0);
+	error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
 	if (error)
 		return error;
 
@@ -1336,7 +1336,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 			}
 		}
 	} else {
-		error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC, 0);
+		error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC);
 		if (error)
 			goto out_gunlock;
 
@@ -1371,7 +1371,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 	/* Check out the dir to be renamed */
 
 	if (dir_rename) {
-		error = gfs2_permission(odentry->d_inode, MAY_WRITE, 0);
+		error = gfs2_permission(odentry->d_inode, MAY_WRITE);
 		if (error)
 			goto out_gunlock;
 	}
@@ -1543,7 +1543,7 @@ static void gfs2_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
  * Returns: errno
  */
 
-int gfs2_permission(struct inode *inode, int mask, unsigned int flags)
+int gfs2_permission(struct inode *inode, int mask)
 {
 	struct gfs2_inode *ip;
 	struct gfs2_holder i_gh;
@@ -1553,7 +1553,7 @@ int gfs2_permission(struct inode *inode, int mask, unsigned int flags)
 
 	ip = GFS2_I(inode);
 	if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
-		if (flags & IPERM_FLAG_RCU)
+		if (mask & MAY_NOT_BLOCK)
 			return -ECHILD;
 		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
 		if (error)
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 31606076f701..8d90e0c07672 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -108,7 +108,7 @@ extern int gfs2_inode_refresh(struct gfs2_inode *ip);
 
 extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
 				  int is_root);
-extern int gfs2_permission(struct inode *inode, int mask, unsigned int flags);
+extern int gfs2_permission(struct inode *inode, int mask);
 extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
 extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
 extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index b1bc31bde833..6e449c599b9d 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -748,12 +748,12 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from,
 	return err;
 }
 
-int hostfs_permission(struct inode *ino, int desired, unsigned int flags)
+int hostfs_permission(struct inode *ino, int desired)
 {
 	char *name;
 	int r = 0, w = 0, x = 0, err;
 
-	if (flags & IPERM_FLAG_RCU)
+	if (desired & MAY_NOT_BLOCK)
 		return -ECHILD;
 
 	if (desired & MAY_READ) r = 1;
diff --git a/fs/namei.c b/fs/namei.c
index 684e0f30cf4c..c5c382620a86 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -287,7 +287,7 @@ int inode_permission(struct inode *inode, int mask)
 	}
 
 	if (inode->i_op->permission)
-		retval = inode->i_op->permission(inode, mask, 0);
+		retval = inode->i_op->permission(inode, mask);
 	else
 		retval = generic_permission(inode, mask);
 
@@ -323,7 +323,7 @@ static inline int exec_permission(struct inode *inode, unsigned int flags)
 		mask |= MAY_NOT_BLOCK;
 
 	if (inode->i_op->permission) {
-		ret = inode->i_op->permission(inode, mask, flags);
+		ret = inode->i_op->permission(inode, mask);
 		if (likely(!ret))
 			goto ok;
 	} else {
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 16cf84b4afb9..a86acd6d0cb4 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2273,12 +2273,12 @@ int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags)
 	return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags));
 }
 
-int nfs_permission(struct inode *inode, int mask, unsigned int flags)
+int nfs_permission(struct inode *inode, int mask)
 {
 	struct rpc_cred *cred;
 	int res = 0;
 
-	if (flags & IPERM_FLAG_RCU)
+	if (mask & MAY_NOT_BLOCK)
 		return -ECHILD;
 
 	nfs_inc_stats(inode, NFSIOS_VFSACCESS);
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 0df6de58bbcb..be8664c6a825 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -799,7 +799,7 @@ out_err:
 	return err;
 }
 
-int nilfs_permission(struct inode *inode, int mask, unsigned int flags)
+int nilfs_permission(struct inode *inode, int mask)
 {
 	struct nilfs_root *root = NILFS_I(inode)->i_root;
 	if ((mask & MAY_WRITE) && root &&
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index f02b9ad43a21..6fb7511c0328 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -264,7 +264,7 @@ extern void nilfs_update_inode(struct inode *, struct buffer_head *);
 extern void nilfs_truncate(struct inode *);
 extern void nilfs_evict_inode(struct inode *);
 extern int nilfs_setattr(struct dentry *, struct iattr *);
-int nilfs_permission(struct inode *inode, int mask, unsigned int flags);
+int nilfs_permission(struct inode *inode, int mask);
 int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh);
 extern int nilfs_inode_dirty(struct inode *);
 int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index ecb52b028964..1406c37a5722 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1279,11 +1279,11 @@ bail:
 	return err;
 }
 
-int ocfs2_permission(struct inode *inode, int mask, unsigned int flags)
+int ocfs2_permission(struct inode *inode, int mask)
 {
 	int ret;
 
-	if (flags & IPERM_FLAG_RCU)
+	if (mask & MAY_NOT_BLOCK)
 		return -ECHILD;
 
 	ret = ocfs2_inode_lock(inode, NULL, 0);
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index f5afbbef6703..97bf761c9e7c 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -61,7 +61,7 @@ int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
 int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		  struct kstat *stat);
-int ocfs2_permission(struct inode *inode, int mask, unsigned int flags);
+int ocfs2_permission(struct inode *inode, int mask);
 
 int ocfs2_should_update_atime(struct inode *inode,
 			      struct vfsmount *vfsmnt);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 53a1a961b25f..be1ff932033b 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2167,7 +2167,7 @@ static const struct file_operations proc_fd_operations = {
  * /proc/pid/fd needs a special permission handler so that a process can still
  * access /proc/self/fd after it has executed a setuid().
  */
-static int proc_fd_permission(struct inode *inode, int mask, unsigned int flags)
+static int proc_fd_permission(struct inode *inode, int mask)
 {
 	int rv = generic_permission(inode, mask);
 	if (rv == 0)
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 349b22f434d8..1a77dbef226f 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -294,7 +294,7 @@ out:
 	return ret;
 }
 
-static int proc_sys_permission(struct inode *inode, int mask,unsigned int flags)
+static int proc_sys_permission(struct inode *inode, int mask)
 {
 	/*
 	 * sysctl entries that are not writeable,
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index f17319613a85..4ea2ab41fdee 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -958,7 +958,7 @@ static int xattr_mount_check(struct super_block *s)
 	return 0;
 }
 
-int reiserfs_permission(struct inode *inode, int mask, unsigned int flags)
+int reiserfs_permission(struct inode *inode, int mask)
 {
 	/*
 	 * We don't do permission checks on the internal objects.
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 04c81e5ba6f7..e3f091a81c72 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -349,11 +349,11 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const cha
 		return -ENOENT;
 }
 
-int sysfs_permission(struct inode *inode, int mask, unsigned int flags)
+int sysfs_permission(struct inode *inode, int mask)
 {
 	struct sysfs_dirent *sd;
 
-	if (flags & IPERM_FLAG_RCU)
+	if (mask & MAY_NOT_BLOCK)
 		return -ECHILD;
 
 	sd = inode->i_private;
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 2ed2404f3113..845ab3ad229d 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -201,7 +201,7 @@ static inline void __sysfs_put(struct sysfs_dirent *sd)
 struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd);
 void sysfs_evict_inode(struct inode *inode);
 int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr);
-int sysfs_permission(struct inode *inode, int mask, unsigned int flags);
+int sysfs_permission(struct inode *inode, int mask);
 int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
 int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
 int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a1689c13eb77..1fdefe3995fd 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1578,7 +1578,7 @@ struct file_operations {
 struct inode_operations {
 	struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *);
 	void * (*follow_link) (struct dentry *, struct nameidata *);
-	int (*permission) (struct inode *, int, unsigned int);
+	int (*permission) (struct inode *, int);
 	int (*check_acl)(struct inode *, int);
 
 	int (*readlink) (struct dentry *, char __user *,int);
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 1b93b9c60e55..86014811a0fb 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -360,7 +360,7 @@ extern int nfs_refresh_inode(struct inode *, struct nfs_fattr *);
 extern int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr);
 extern int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr);
 extern int nfs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
-extern int nfs_permission(struct inode *, int, unsigned int);
+extern int nfs_permission(struct inode *, int);
 extern int nfs_open(struct inode *, struct file *);
 extern int nfs_release(struct inode *, struct file *);
 extern int nfs_attribute_timeout(struct inode *inode);
diff --git a/include/linux/reiserfs_xattr.h b/include/linux/reiserfs_xattr.h
index 424ba6416e7c..57958c0e1d38 100644
--- a/include/linux/reiserfs_xattr.h
+++ b/include/linux/reiserfs_xattr.h
@@ -41,7 +41,7 @@ int reiserfs_xattr_init(struct super_block *sb, int mount_flags);
 int reiserfs_lookup_privroot(struct super_block *sb);
 int reiserfs_delete_xattrs(struct inode *inode);
 int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs);
-int reiserfs_permission(struct inode *inode, int mask, unsigned int flags);
+int reiserfs_permission(struct inode *inode, int mask);
 
 #ifdef CONFIG_REISERFS_FS_XATTR
 #define has_xattr_dir(inode) (REISERFS_I(inode)->i_flags & i_has_xattr_dir)
-- 
cgit v1.2.3


From e74f71eb78a4a8b9eaf1bc65f20f761648e85f76 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 20 Jun 2011 19:38:15 -0400
Subject: ->permission() sanitizing: don't pass flags to ->inode_permission()

pass that via mask instead.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/security.h   | 2 +-
 security/capability.c      | 2 +-
 security/security.c        | 7 +++++--
 security/selinux/hooks.c   | 5 +++--
 security/smack/smack_lsm.c | 5 +++--
 5 files changed, 13 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/security.h b/include/linux/security.h
index 8ce59ef3e5af..ca02f1716736 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1456,7 +1456,7 @@ struct security_operations {
 			     struct inode *new_dir, struct dentry *new_dentry);
 	int (*inode_readlink) (struct dentry *dentry);
 	int (*inode_follow_link) (struct dentry *dentry, struct nameidata *nd);
-	int (*inode_permission) (struct inode *inode, int mask, unsigned flags);
+	int (*inode_permission) (struct inode *inode, int mask);
 	int (*inode_setattr)	(struct dentry *dentry, struct iattr *attr);
 	int (*inode_getattr) (struct vfsmount *mnt, struct dentry *dentry);
 	int (*inode_setxattr) (struct dentry *dentry, const char *name,
diff --git a/security/capability.c b/security/capability.c
index bbb51156261b..2984ea4f776f 100644
--- a/security/capability.c
+++ b/security/capability.c
@@ -181,7 +181,7 @@ static int cap_inode_follow_link(struct dentry *dentry,
 	return 0;
 }
 
-static int cap_inode_permission(struct inode *inode, int mask, unsigned flags)
+static int cap_inode_permission(struct inode *inode, int mask)
 {
 	return 0;
 }
diff --git a/security/security.c b/security/security.c
index 4ba6d4cc061f..db3b750da353 100644
--- a/security/security.c
+++ b/security/security.c
@@ -518,14 +518,17 @@ int security_inode_permission(struct inode *inode, int mask)
 {
 	if (unlikely(IS_PRIVATE(inode)))
 		return 0;
-	return security_ops->inode_permission(inode, mask, 0);
+	return security_ops->inode_permission(inode, mask);
 }
 
 int security_inode_exec_permission(struct inode *inode, unsigned int flags)
 {
+	int mask = MAY_EXEC;
 	if (unlikely(IS_PRIVATE(inode)))
 		return 0;
-	return security_ops->inode_permission(inode, MAY_EXEC, flags);
+	if (flags)
+		mask |= MAY_NOT_BLOCK;
+	return security_ops->inode_permission(inode, mask);
 }
 
 int security_inode_setattr(struct dentry *dentry, struct iattr *attr)
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 20219ef5439a..47a059fff344 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2659,12 +2659,13 @@ static int selinux_inode_follow_link(struct dentry *dentry, struct nameidata *na
 	return dentry_has_perm(cred, dentry, FILE__READ);
 }
 
-static int selinux_inode_permission(struct inode *inode, int mask, unsigned flags)
+static int selinux_inode_permission(struct inode *inode, int mask)
 {
 	const struct cred *cred = current_cred();
 	struct common_audit_data ad;
 	u32 perms;
 	bool from_access;
+	unsigned __flags = mask & MAY_NOT_BLOCK ? IPERM_FLAG_RCU : 0;
 
 	from_access = mask & MAY_ACCESS;
 	mask &= (MAY_READ|MAY_WRITE|MAY_EXEC|MAY_APPEND);
@@ -2681,7 +2682,7 @@ static int selinux_inode_permission(struct inode *inode, int mask, unsigned flag
 
 	perms = file_mask_to_av(inode->i_mode, mask);
 
-	return inode_has_perm(cred, inode, perms, &ad, flags);
+	return inode_has_perm(cred, inode, perms, &ad, __flags);
 }
 
 static int selinux_inode_setattr(struct dentry *dentry, struct iattr *iattr)
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 9831a39c11f6..f375eb2e1957 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -689,9 +689,10 @@ static int smack_inode_rename(struct inode *old_inode,
  *
  * Returns 0 if access is permitted, -EACCES otherwise
  */
-static int smack_inode_permission(struct inode *inode, int mask, unsigned flags)
+static int smack_inode_permission(struct inode *inode, int mask)
 {
 	struct smk_audit_info ad;
+	int no_block = mask & MAY_NOT_BLOCK;
 
 	mask &= (MAY_READ|MAY_WRITE|MAY_EXEC|MAY_APPEND);
 	/*
@@ -701,7 +702,7 @@ static int smack_inode_permission(struct inode *inode, int mask, unsigned flags)
 		return 0;
 
 	/* May be droppable after audit */
-	if (flags & IPERM_FLAG_RCU)
+	if (no_block)
 		return -ECHILD;
 	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_INODE);
 	smk_ad_setfield_u_fs_inode(&ad, inode);
-- 
cgit v1.2.3


From eecdd358b467405a084d400d5ec571bbdbfe97a3 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 20 Jun 2011 19:48:41 -0400
Subject: ->permission() sanitizing: don't pass flags to exec_permission()

pass mask instead; kill security_inode_exec_permission() since we can use
security_inode_permission() instead.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c               | 17 +++++++----------
 include/linux/security.h |  7 -------
 security/security.c      | 10 ----------
 3 files changed, 7 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index c5c382620a86..21eba95368f2 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -304,7 +304,7 @@ int inode_permission(struct inode *inode, int mask)
 /**
  * exec_permission  -  check for right to do lookups in a given directory
  * @inode:	inode to check permission on
- * @flags:	IPERM_FLAG_ flags.
+ * @mask:	MAY_EXEC and possibly MAY_NOT_BLOCK flags.
  *
  * Short-cut version of inode_permission(), for calling on directories
  * during pathname resolution.  Combines parts of inode_permission()
@@ -314,13 +314,10 @@ int inode_permission(struct inode *inode, int mask)
  * short-cut DAC fails, then call ->permission() to do more
  * complete permission check.
  */
-static inline int exec_permission(struct inode *inode, unsigned int flags)
+static inline int exec_permission(struct inode *inode, int mask)
 {
 	int ret;
 	struct user_namespace *ns = inode_userns(inode);
-	int mask = MAY_EXEC;
-	if (flags & IPERM_FLAG_RCU)
-		mask |= MAY_NOT_BLOCK;
 
 	if (inode->i_op->permission) {
 		ret = inode->i_op->permission(inode, mask);
@@ -338,7 +335,7 @@ static inline int exec_permission(struct inode *inode, unsigned int flags)
 	}
 	return ret;
 ok:
-	return security_inode_exec_permission(inode, flags);
+	return security_inode_permission(inode, mask);
 }
 
 /**
@@ -1214,13 +1211,13 @@ retry:
 static inline int may_lookup(struct nameidata *nd)
 {
 	if (nd->flags & LOOKUP_RCU) {
-		int err = exec_permission(nd->inode, IPERM_FLAG_RCU);
+		int err = exec_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
 		if (err != -ECHILD)
 			return err;
 		if (unlazy_walk(nd, NULL))
 			return -ECHILD;
 	}
-	return exec_permission(nd->inode, 0);
+	return exec_permission(nd->inode, MAY_EXEC);
 }
 
 static inline int handle_dots(struct nameidata *nd, int type)
@@ -1495,7 +1492,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
 			if (!S_ISDIR(dentry->d_inode->i_mode))
 				goto fput_fail;
 
-			retval = exec_permission(dentry->d_inode, 0);
+			retval = exec_permission(dentry->d_inode, MAY_EXEC);
 			if (retval)
 				goto fput_fail;
 		}
@@ -1652,7 +1649,7 @@ static struct dentry *__lookup_hash(struct qstr *name,
 	struct dentry *dentry;
 	int err;
 
-	err = exec_permission(inode, 0);
+	err = exec_permission(inode, MAY_EXEC);
 	if (err)
 		return ERR_PTR(err);
 
diff --git a/include/linux/security.h b/include/linux/security.h
index ca02f1716736..ebd2a53a3d07 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1720,7 +1720,6 @@ int security_inode_rename(struct inode *old_dir, struct dentry *old_dentry,
 int security_inode_readlink(struct dentry *dentry);
 int security_inode_follow_link(struct dentry *dentry, struct nameidata *nd);
 int security_inode_permission(struct inode *inode, int mask);
-int security_inode_exec_permission(struct inode *inode, unsigned int flags);
 int security_inode_setattr(struct dentry *dentry, struct iattr *attr);
 int security_inode_getattr(struct vfsmount *mnt, struct dentry *dentry);
 int security_inode_setxattr(struct dentry *dentry, const char *name,
@@ -2113,12 +2112,6 @@ static inline int security_inode_permission(struct inode *inode, int mask)
 	return 0;
 }
 
-static inline int security_inode_exec_permission(struct inode *inode,
-						  unsigned int flags)
-{
-	return 0;
-}
-
 static inline int security_inode_setattr(struct dentry *dentry,
 					  struct iattr *attr)
 {
diff --git a/security/security.c b/security/security.c
index db3b750da353..0e4fccfef12c 100644
--- a/security/security.c
+++ b/security/security.c
@@ -521,16 +521,6 @@ int security_inode_permission(struct inode *inode, int mask)
 	return security_ops->inode_permission(inode, mask);
 }
 
-int security_inode_exec_permission(struct inode *inode, unsigned int flags)
-{
-	int mask = MAY_EXEC;
-	if (unlikely(IS_PRIVATE(inode)))
-		return 0;
-	if (flags)
-		mask |= MAY_NOT_BLOCK;
-	return security_ops->inode_permission(inode, mask);
-}
-
 int security_inode_setattr(struct dentry *dentry, struct iattr *attr)
 {
 	if (unlikely(IS_PRIVATE(dentry->d_inode)))
-- 
cgit v1.2.3


From 729cdb3a1ee03a4363f9c7e66ddd979727e99e1f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 21 Jun 2011 01:01:22 -0400
Subject: kill IPERM_FLAG_RCU

not used anymore

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/fs.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 1fdefe3995fd..8494aac189f0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1573,8 +1573,6 @@ struct file_operations {
 			  loff_t len);
 };
 
-#define IPERM_FLAG_RCU	0x0001
-
 struct inode_operations {
 	struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *);
 	void * (*follow_link) (struct dentry *, struct nameidata *);
-- 
cgit v1.2.3


From 3d4ff43d895c50319af45eb4bf04a4618eccdf76 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 22 Jun 2011 18:40:12 -0400
Subject: nfs_open_context doesn't need struct path either

just dentry, please...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfs/dir.c           | 14 +++++---------
 fs/nfs/direct.c        |  4 ++--
 fs/nfs/inode.c         | 20 +++++++++++---------
 fs/nfs/nfs4proc.c      | 10 +++++-----
 fs/nfs/pagelist.c      |  4 ++--
 fs/nfs/read.c          |  8 ++++----
 fs/nfs/write.c         | 22 +++++++++++-----------
 include/linux/nfs_fs.h |  4 ++--
 8 files changed, 42 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index a86acd6d0cb4..a0693f3f11c0 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1345,10 +1345,6 @@ static int is_atomic_open(struct nameidata *nd)
 
 static struct nfs_open_context *nameidata_to_nfs_open_context(struct dentry *dentry, struct nameidata *nd)
 {
-	struct path path = {
-		.mnt = nd->path.mnt,
-		.dentry = dentry,
-	};
 	struct nfs_open_context *ctx;
 	struct rpc_cred *cred;
 	fmode_t fmode = nd->intent.open.flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
@@ -1356,7 +1352,7 @@ static struct nfs_open_context *nameidata_to_nfs_open_context(struct dentry *den
 	cred = rpc_lookup_cred();
 	if (IS_ERR(cred))
 		return ERR_CAST(cred);
-	ctx = alloc_nfs_open_context(&path, cred, fmode);
+	ctx = alloc_nfs_open_context(dentry, cred, fmode);
 	put_rpccred(cred);
 	if (ctx == NULL)
 		return ERR_PTR(-ENOMEM);
@@ -1376,13 +1372,13 @@ static int nfs_intent_set_file(struct nameidata *nd, struct nfs_open_context *ct
 
 	/* If the open_intent is for execute, we have an extra check to make */
 	if (ctx->mode & FMODE_EXEC) {
-		ret = nfs_may_open(ctx->path.dentry->d_inode,
+		ret = nfs_may_open(ctx->dentry->d_inode,
 				ctx->cred,
 				nd->intent.open.flags);
 		if (ret < 0)
 			goto out;
 	}
-	filp = lookup_instantiate_filp(nd, ctx->path.dentry, do_open);
+	filp = lookup_instantiate_filp(nd, ctx->dentry, do_open);
 	if (IS_ERR(filp))
 		ret = PTR_ERR(filp);
 	else
@@ -1463,8 +1459,8 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
 	res = d_add_unique(dentry, inode);
 	nfs_unblock_sillyrename(dentry->d_parent);
 	if (res != NULL) {
-		dput(ctx->path.dentry);
-		ctx->path.dentry = dget(res);
+		dput(ctx->dentry);
+		ctx->dentry = dget(res);
 		dentry = res;
 	}
 	err = nfs_intent_set_file(nd, ctx);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 8eea25366717..b35d25b98da6 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -284,7 +284,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
 						loff_t pos)
 {
 	struct nfs_open_context *ctx = dreq->ctx;
-	struct inode *inode = ctx->path.dentry->d_inode;
+	struct inode *inode = ctx->dentry->d_inode;
 	unsigned long user_addr = (unsigned long)iov->iov_base;
 	size_t count = iov->iov_len;
 	size_t rsize = NFS_SERVER(inode)->rsize;
@@ -715,7 +715,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
 						 loff_t pos, int sync)
 {
 	struct nfs_open_context *ctx = dreq->ctx;
-	struct inode *inode = ctx->path.dentry->d_inode;
+	struct inode *inode = ctx->dentry->d_inode;
 	unsigned long user_addr = (unsigned long)iov->iov_base;
 	size_t count = iov->iov_len;
 	struct rpc_task *task;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 6f4850deb272..fe1203797b2b 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -567,7 +567,7 @@ static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context
 struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx)
 {
 	struct nfs_lock_context *res, *new = NULL;
-	struct inode *inode = ctx->path.dentry->d_inode;
+	struct inode *inode = ctx->dentry->d_inode;
 
 	spin_lock(&inode->i_lock);
 	res = __nfs_find_lock_context(ctx);
@@ -594,7 +594,7 @@ struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx)
 void nfs_put_lock_context(struct nfs_lock_context *l_ctx)
 {
 	struct nfs_open_context *ctx = l_ctx->open_context;
-	struct inode *inode = ctx->path.dentry->d_inode;
+	struct inode *inode = ctx->dentry->d_inode;
 
 	if (!atomic_dec_and_lock(&l_ctx->count, &inode->i_lock))
 		return;
@@ -620,7 +620,7 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
 		return;
 	if (!is_sync)
 		return;
-	inode = ctx->path.dentry->d_inode;
+	inode = ctx->dentry->d_inode;
 	if (!list_empty(&NFS_I(inode)->open_files))
 		return;
 	server = NFS_SERVER(inode);
@@ -629,14 +629,14 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
 	nfs_revalidate_inode(server, inode);
 }
 
-struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cred *cred, fmode_t f_mode)
+struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, struct rpc_cred *cred, fmode_t f_mode)
 {
 	struct nfs_open_context *ctx;
 
 	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
 	if (ctx != NULL) {
-		ctx->path = *path;
-		path_get(&ctx->path);
+		nfs_sb_active(dentry->d_sb);
+		ctx->dentry = dget(dentry);
 		ctx->cred = get_rpccred(cred);
 		ctx->state = NULL;
 		ctx->mode = f_mode;
@@ -658,7 +658,8 @@ struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx)
 
 static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
 {
-	struct inode *inode = ctx->path.dentry->d_inode;
+	struct inode *inode = ctx->dentry->d_inode;
+	struct super_block *sb = ctx->dentry->d_sb;
 
 	if (!list_empty(&ctx->list)) {
 		if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock))
@@ -671,7 +672,8 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
 		NFS_PROTO(inode)->close_context(ctx, is_sync);
 	if (ctx->cred != NULL)
 		put_rpccred(ctx->cred);
-	path_put(&ctx->path);
+	dput(ctx->dentry);
+	nfs_sb_deactive(sb);
 	kfree(ctx);
 }
 
@@ -741,7 +743,7 @@ int nfs_open(struct inode *inode, struct file *filp)
 	cred = rpc_lookup_cred();
 	if (IS_ERR(cred))
 		return PTR_ERR(cred);
-	ctx = alloc_nfs_open_context(&filp->f_path, cred, filp->f_mode);
+	ctx = alloc_nfs_open_context(filp->f_path.dentry, cred, filp->f_mode);
 	put_rpccred(cred);
 	if (ctx == NULL)
 		return -ENOMEM;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index aef22fa0a1c0..26bece8f3083 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1132,7 +1132,7 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context
 {
 	struct nfs4_opendata *opendata;
 
-	opendata = nfs4_opendata_alloc(ctx->path.dentry, state->owner, 0, 0, NULL, GFP_NOFS);
+	opendata = nfs4_opendata_alloc(ctx->dentry, state->owner, 0, 0, NULL, GFP_NOFS);
 	if (opendata == NULL)
 		return ERR_PTR(-ENOMEM);
 	opendata->state = state;
@@ -1650,7 +1650,7 @@ static int _nfs4_open_expired(struct nfs_open_context *ctx, struct nfs4_state *s
 		return PTR_ERR(opendata);
 	ret = nfs4_open_recover(opendata, state);
 	if (ret == -ESTALE)
-		d_drop(ctx->path.dentry);
+		d_drop(ctx->dentry);
 	nfs4_opendata_put(opendata);
 	return ret;
 }
@@ -2081,7 +2081,7 @@ nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags
 	struct nfs4_state *state;
 
 	/* Protect against concurrent sillydeletes */
-	state = nfs4_do_open(dir, ctx->path.dentry, ctx->mode, open_flags, attr, ctx->cred);
+	state = nfs4_do_open(dir, ctx->dentry, ctx->mode, open_flags, attr, ctx->cred);
 	if (IS_ERR(state))
 		return ERR_CAST(state);
 	ctx->state = state;
@@ -2625,7 +2625,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 
 	if (ctx != NULL) {
 		cred = ctx->cred;
-		de = ctx->path.dentry;
+		de = ctx->dentry;
 		fmode = ctx->mode;
 	}
 	sattr->ia_mode &= ~current_umask();
@@ -4292,7 +4292,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
 		memcpy(data->lsp->ls_stateid.data, data->res.stateid.data,
 					sizeof(data->lsp->ls_stateid.data));
 		data->lsp->ls_flags |= NFS_LOCK_INITIALIZED;
-		renew_lease(NFS_SERVER(data->ctx->path.dentry->d_inode), data->timestamp);
+		renew_lease(NFS_SERVER(data->ctx->dentry->d_inode), data->timestamp);
 	}
 out:
 	dprintk("%s: done, ret = %d!\n", __func__, data->rpc_status);
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 009855716286..18449f43c568 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -114,7 +114,7 @@ int nfs_set_page_tag_locked(struct nfs_page *req)
 	if (!nfs_lock_request_dontget(req))
 		return 0;
 	if (test_bit(PG_MAPPED, &req->wb_flags))
-		radix_tree_tag_set(&NFS_I(req->wb_context->path.dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
+		radix_tree_tag_set(&NFS_I(req->wb_context->dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
 	return 1;
 }
 
@@ -124,7 +124,7 @@ int nfs_set_page_tag_locked(struct nfs_page *req)
 void nfs_clear_page_tag_locked(struct nfs_page *req)
 {
 	if (test_bit(PG_MAPPED, &req->wb_flags)) {
-		struct inode *inode = req->wb_context->path.dentry->d_inode;
+		struct inode *inode = req->wb_context->dentry->d_inode;
 		struct nfs_inode *nfsi = NFS_I(inode);
 
 		spin_lock(&inode->i_lock);
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 20a7f952e244..a68679f538fc 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -144,7 +144,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
 
 static void nfs_readpage_release(struct nfs_page *req)
 {
-	struct inode *d_inode = req->wb_context->path.dentry->d_inode;
+	struct inode *d_inode = req->wb_context->dentry->d_inode;
 
 	if (PageUptodate(req->wb_page))
 		nfs_readpage_to_fscache(d_inode, req->wb_page, 0);
@@ -152,8 +152,8 @@ static void nfs_readpage_release(struct nfs_page *req)
 	unlock_page(req->wb_page);
 
 	dprintk("NFS: read done (%s/%Ld %d@%Ld)\n",
-			req->wb_context->path.dentry->d_inode->i_sb->s_id,
-			(long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
+			req->wb_context->dentry->d_inode->i_sb->s_id,
+			(long long)NFS_FILEID(req->wb_context->dentry->d_inode),
 			req->wb_bytes,
 			(long long)req_offset(req));
 	nfs_release_request(req);
@@ -207,7 +207,7 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
 		unsigned int count, unsigned int offset,
 		struct pnfs_layout_segment *lseg)
 {
-	struct inode *inode = req->wb_context->path.dentry->d_inode;
+	struct inode *inode = req->wb_context->dentry->d_inode;
 
 	data->req	  = req;
 	data->inode	  = inode;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 727168059684..08579312c57b 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -409,7 +409,7 @@ out:
  */
 static void nfs_inode_remove_request(struct nfs_page *req)
 {
-	struct inode *inode = req->wb_context->path.dentry->d_inode;
+	struct inode *inode = req->wb_context->dentry->d_inode;
 	struct nfs_inode *nfsi = NFS_I(inode);
 
 	BUG_ON (!NFS_WBACK_BUSY(req));
@@ -438,7 +438,7 @@ nfs_mark_request_dirty(struct nfs_page *req)
 static void
 nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
 {
-	struct inode *inode = req->wb_context->path.dentry->d_inode;
+	struct inode *inode = req->wb_context->dentry->d_inode;
 	struct nfs_inode *nfsi = NFS_I(inode);
 
 	spin_lock(&inode->i_lock);
@@ -852,13 +852,13 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
 		struct pnfs_layout_segment *lseg,
 		int how)
 {
-	struct inode *inode = req->wb_context->path.dentry->d_inode;
+	struct inode *inode = req->wb_context->dentry->d_inode;
 
 	/* Set up the RPC argument and reply structs
 	 * NB: take care not to mess about with data->commit et al. */
 
 	data->req = req;
-	data->inode = inode = req->wb_context->path.dentry->d_inode;
+	data->inode = inode = req->wb_context->dentry->d_inode;
 	data->cred = req->wb_context->cred;
 	data->lseg = get_lseg(lseg);
 
@@ -1053,9 +1053,9 @@ static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata)
 
 	dprintk("NFS: %5u write(%s/%lld %d@%lld)",
 		task->tk_pid,
-		data->req->wb_context->path.dentry->d_inode->i_sb->s_id,
+		data->req->wb_context->dentry->d_inode->i_sb->s_id,
 		(long long)
-		  NFS_FILEID(data->req->wb_context->path.dentry->d_inode),
+		  NFS_FILEID(data->req->wb_context->dentry->d_inode),
 		data->req->wb_bytes, (long long)req_offset(data->req));
 
 	nfs_writeback_done(task, data);
@@ -1148,8 +1148,8 @@ static void nfs_writeback_release_full(void *calldata)
 
 		dprintk("NFS: %5u write (%s/%lld %d@%lld)",
 			data->task.tk_pid,
-			req->wb_context->path.dentry->d_inode->i_sb->s_id,
-			(long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
+			req->wb_context->dentry->d_inode->i_sb->s_id,
+			(long long)NFS_FILEID(req->wb_context->dentry->d_inode),
 			req->wb_bytes,
 			(long long)req_offset(req));
 
@@ -1347,7 +1347,7 @@ void nfs_init_commit(struct nfs_write_data *data,
 			    struct pnfs_layout_segment *lseg)
 {
 	struct nfs_page *first = nfs_list_entry(head->next);
-	struct inode *inode = first->wb_context->path.dentry->d_inode;
+	struct inode *inode = first->wb_context->dentry->d_inode;
 
 	/* Set up the RPC argument and reply structs
 	 * NB: take care not to mess about with data->commit et al. */
@@ -1435,8 +1435,8 @@ void nfs_commit_release_pages(struct nfs_write_data *data)
 		nfs_clear_request_commit(req);
 
 		dprintk("NFS:       commit (%s/%lld %d@%lld)",
-			req->wb_context->path.dentry->d_inode->i_sb->s_id,
-			(long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
+			req->wb_context->dentry->d_sb->s_id,
+			(long long)NFS_FILEID(req->wb_context->dentry->d_inode),
 			req->wb_bytes,
 			(long long)req_offset(req));
 		if (status < 0) {
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 86014811a0fb..8b579beb6358 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -85,7 +85,7 @@ struct nfs_lock_context {
 struct nfs4_state;
 struct nfs_open_context {
 	struct nfs_lock_context lock_context;
-	struct path path;
+	struct dentry *dentry;
 	struct rpc_cred *cred;
 	struct nfs4_state *state;
 	fmode_t mode;
@@ -372,7 +372,7 @@ extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr);
 extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx);
 extern void put_nfs_open_context(struct nfs_open_context *ctx);
 extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, fmode_t mode);
-extern struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cred *cred, fmode_t f_mode);
+extern struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, struct rpc_cred *cred, fmode_t f_mode);
 extern void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx);
 extern struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx);
 extern void nfs_put_lock_context(struct nfs_lock_context *l_ctx);
-- 
cgit v1.2.3


From 49084c3bb2055c401f3493c13edae14d49128ca0 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 25 Jun 2011 21:59:52 -0400
Subject: kill LOOKUP_CONTINUE

LOOKUP_PARENT is equivalent to it now

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c            | 11 +++--------
 include/linux/namei.h |  1 -
 2 files changed, 3 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index 5e65f67ee926..f49d6abfa799 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -664,7 +664,7 @@ static int follow_automount(struct path *path, unsigned flags,
 	/* We don't want to mount if someone supplied AT_NO_AUTOMOUNT
 	 * and this is the terminal part of the path.
 	 */
-	if ((flags & LOOKUP_NO_AUTOMOUNT) && !(flags & LOOKUP_CONTINUE))
+	if ((flags & LOOKUP_NO_AUTOMOUNT) && !(flags & LOOKUP_PARENT))
 		return -EISDIR; /* we actually want to stop here */
 
 	/* We want to mount if someone is trying to open/create a file of any
@@ -676,7 +676,7 @@ static int follow_automount(struct path *path, unsigned flags,
 	 * appended a '/' to the name.
 	 */
 	if (!(flags & LOOKUP_FOLLOW) &&
-	    !(flags & (LOOKUP_CONTINUE | LOOKUP_DIRECTORY |
+	    !(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
 		       LOOKUP_OPEN | LOOKUP_CREATE)))
 		return -EISDIR;
 
@@ -695,7 +695,7 @@ static int follow_automount(struct path *path, unsigned flags,
 		 * the path being looked up; if it wasn't then the remainder of
 		 * the path is inaccessible and we should say so.
 		 */
-		if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_CONTINUE))
+		if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_PARENT))
 			return -EREMOTE;
 		return PTR_ERR(mnt);
 	}
@@ -1281,7 +1281,6 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 {
 	struct path next;
 	int err;
-	unsigned int lookup_flags = nd->flags;
 	
 	while (*name=='/')
 		name++;
@@ -1295,8 +1294,6 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		unsigned int c;
 		int type;
 
-		nd->flags |= LOOKUP_CONTINUE;
-
 		err = may_lookup(nd);
  		if (err)
 			break;
@@ -1358,8 +1355,6 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		/* here ends the main loop */
 
 last_component:
-		/* Clear LOOKUP_CONTINUE iff it was previously unset */
-		nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
 		nd->last = this;
 		nd->last_type = type;
 		return 0;
diff --git a/include/linux/namei.h b/include/linux/namei.h
index eba45ea10298..3439ab862e1d 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -48,7 +48,6 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
  */
 #define LOOKUP_FOLLOW		0x0001
 #define LOOKUP_DIRECTORY	0x0002
-#define LOOKUP_CONTINUE		0x0004
 
 #define LOOKUP_PARENT		0x0010
 #define LOOKUP_REVAL		0x0020
-- 
cgit v1.2.3


From dae6ad8f37529963ae7df52baaccf056b38f210e Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 26 Jun 2011 11:50:15 -0400
Subject: new helpers: kern_path_create/user_path_create

combination of kern_path_parent() and lookup_create().  Does *not*
expose struct nameidata to caller.  Syscalls converted to that...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c              | 154 ++++++++++++++++++++++++------------------------
 fs/ocfs2/refcounttree.c |  49 ++++-----------
 include/linux/namei.h   |   2 +
 net/unix/af_unix.c      |  38 ++++++------
 4 files changed, 106 insertions(+), 137 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index f49d6abfa799..b292eb03d9d2 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2311,6 +2311,35 @@ fail:
 }
 EXPORT_SYMBOL_GPL(lookup_create);
 
+struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path, int is_dir)
+{
+	struct nameidata nd;
+	struct dentry *res;
+	int error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd);
+	if (error)
+		return ERR_PTR(error);
+	res = lookup_create(&nd, is_dir);
+	if (IS_ERR(res)) {
+		mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
+		path_put(&nd.path);
+	}
+	*path = nd.path;
+	return res;
+}
+EXPORT_SYMBOL(kern_path_create);
+
+struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, int is_dir)
+{
+	char *tmp = getname(pathname);
+	struct dentry *res;
+	if (IS_ERR(tmp))
+		return ERR_CAST(tmp);
+	res = kern_path_create(dfd, tmp, path, is_dir);
+	putname(tmp);
+	return res;
+}
+EXPORT_SYMBOL(user_path_create);
+
 int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
 {
 	int error = may_create(dir, dentry);
@@ -2359,54 +2388,46 @@ static int may_mknod(mode_t mode)
 SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, int, mode,
 		unsigned, dev)
 {
-	int error;
-	char *tmp;
 	struct dentry *dentry;
-	struct nameidata nd;
+	struct path path;
+	int error;
 
 	if (S_ISDIR(mode))
 		return -EPERM;
 
-	error = user_path_parent(dfd, filename, &nd, &tmp);
-	if (error)
-		return error;
+	dentry = user_path_create(dfd, filename, &path, 0);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
 
-	dentry = lookup_create(&nd, 0);
-	if (IS_ERR(dentry)) {
-		error = PTR_ERR(dentry);
-		goto out_unlock;
-	}
-	if (!IS_POSIXACL(nd.path.dentry->d_inode))
+	if (!IS_POSIXACL(path.dentry->d_inode))
 		mode &= ~current_umask();
 	error = may_mknod(mode);
 	if (error)
 		goto out_dput;
-	error = mnt_want_write(nd.path.mnt);
+	error = mnt_want_write(path.mnt);
 	if (error)
 		goto out_dput;
-	error = security_path_mknod(&nd.path, dentry, mode, dev);
+	error = security_path_mknod(&path, dentry, mode, dev);
 	if (error)
 		goto out_drop_write;
 	switch (mode & S_IFMT) {
 		case 0: case S_IFREG:
-			error = vfs_create(nd.path.dentry->d_inode,dentry,mode,NULL);
+			error = vfs_create(path.dentry->d_inode,dentry,mode,NULL);
 			break;
 		case S_IFCHR: case S_IFBLK:
-			error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,
+			error = vfs_mknod(path.dentry->d_inode,dentry,mode,
 					new_decode_dev(dev));
 			break;
 		case S_IFIFO: case S_IFSOCK:
-			error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,0);
+			error = vfs_mknod(path.dentry->d_inode,dentry,mode,0);
 			break;
 	}
 out_drop_write:
-	mnt_drop_write(nd.path.mnt);
+	mnt_drop_write(path.mnt);
 out_dput:
 	dput(dentry);
-out_unlock:
-	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
-	path_put(&nd.path);
-	putname(tmp);
+	mutex_unlock(&path.dentry->d_inode->i_mutex);
+	path_put(&path);
 
 	return error;
 }
@@ -2439,38 +2460,29 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 
 SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, int, mode)
 {
-	int error = 0;
-	char * tmp;
 	struct dentry *dentry;
-	struct nameidata nd;
-
-	error = user_path_parent(dfd, pathname, &nd, &tmp);
-	if (error)
-		goto out_err;
+	struct path path;
+	int error;
 
-	dentry = lookup_create(&nd, 1);
-	error = PTR_ERR(dentry);
+	dentry = user_path_create(dfd, pathname, &path, 1);
 	if (IS_ERR(dentry))
-		goto out_unlock;
+		return PTR_ERR(dentry);
 
-	if (!IS_POSIXACL(nd.path.dentry->d_inode))
+	if (!IS_POSIXACL(path.dentry->d_inode))
 		mode &= ~current_umask();
-	error = mnt_want_write(nd.path.mnt);
+	error = mnt_want_write(path.mnt);
 	if (error)
 		goto out_dput;
-	error = security_path_mkdir(&nd.path, dentry, mode);
+	error = security_path_mkdir(&path, dentry, mode);
 	if (error)
 		goto out_drop_write;
-	error = vfs_mkdir(nd.path.dentry->d_inode, dentry, mode);
+	error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
 out_drop_write:
-	mnt_drop_write(nd.path.mnt);
+	mnt_drop_write(path.mnt);
 out_dput:
 	dput(dentry);
-out_unlock:
-	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
-	path_put(&nd.path);
-	putname(tmp);
-out_err:
+	mutex_unlock(&path.dentry->d_inode->i_mutex);
+	path_put(&path);
 	return error;
 }
 
@@ -2730,38 +2742,31 @@ SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
 {
 	int error;
 	char *from;
-	char *to;
 	struct dentry *dentry;
-	struct nameidata nd;
+	struct path path;
 
 	from = getname(oldname);
 	if (IS_ERR(from))
 		return PTR_ERR(from);
 
-	error = user_path_parent(newdfd, newname, &nd, &to);
-	if (error)
-		goto out_putname;
-
-	dentry = lookup_create(&nd, 0);
+	dentry = user_path_create(newdfd, newname, &path, 0);
 	error = PTR_ERR(dentry);
 	if (IS_ERR(dentry))
-		goto out_unlock;
+		goto out_putname;
 
-	error = mnt_want_write(nd.path.mnt);
+	error = mnt_want_write(path.mnt);
 	if (error)
 		goto out_dput;
-	error = security_path_symlink(&nd.path, dentry, from);
+	error = security_path_symlink(&path, dentry, from);
 	if (error)
 		goto out_drop_write;
-	error = vfs_symlink(nd.path.dentry->d_inode, dentry, from);
+	error = vfs_symlink(path.dentry->d_inode, dentry, from);
 out_drop_write:
-	mnt_drop_write(nd.path.mnt);
+	mnt_drop_write(path.mnt);
 out_dput:
 	dput(dentry);
-out_unlock:
-	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
-	path_put(&nd.path);
-	putname(to);
+	mutex_unlock(&path.dentry->d_inode->i_mutex);
+	path_put(&path);
 out_putname:
 	putname(from);
 	return error;
@@ -2826,11 +2831,9 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
 		int, newdfd, const char __user *, newname, int, flags)
 {
 	struct dentry *new_dentry;
-	struct nameidata nd;
-	struct path old_path;
+	struct path old_path, new_path;
 	int how = 0;
 	int error;
-	char *to;
 
 	if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
 		return -EINVAL;
@@ -2852,32 +2855,27 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
 	if (error)
 		return error;
 
-	error = user_path_parent(newdfd, newname, &nd, &to);
-	if (error)
-		goto out;
-	error = -EXDEV;
-	if (old_path.mnt != nd.path.mnt)
-		goto out_release;
-	new_dentry = lookup_create(&nd, 0);
+	new_dentry = user_path_create(newdfd, newname, &new_path, 0);
 	error = PTR_ERR(new_dentry);
 	if (IS_ERR(new_dentry))
-		goto out_unlock;
-	error = mnt_want_write(nd.path.mnt);
+		goto out;
+
+	error = -EXDEV;
+	if (old_path.mnt != new_path.mnt)
+		goto out_dput;
+	error = mnt_want_write(new_path.mnt);
 	if (error)
 		goto out_dput;
-	error = security_path_link(old_path.dentry, &nd.path, new_dentry);
+	error = security_path_link(old_path.dentry, &new_path, new_dentry);
 	if (error)
 		goto out_drop_write;
-	error = vfs_link(old_path.dentry, nd.path.dentry->d_inode, new_dentry);
+	error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry);
 out_drop_write:
-	mnt_drop_write(nd.path.mnt);
+	mnt_drop_write(new_path.mnt);
 out_dput:
 	dput(new_dentry);
-out_unlock:
-	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
-out_release:
-	path_put(&nd.path);
-	putname(to);
+	mutex_unlock(&new_path.dentry->d_inode->i_mutex);
+	path_put(&new_path);
 out:
 	path_put(&old_path);
 
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index ebfd3825f12a..cf7823382664 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4368,25 +4368,6 @@ static inline int ocfs2_may_create(struct inode *dir, struct dentry *child)
 	return inode_permission(dir, MAY_WRITE | MAY_EXEC);
 }
 
-/* copied from user_path_parent. */
-static int ocfs2_user_path_parent(const char __user *path,
-				  struct nameidata *nd, char **name)
-{
-	char *s = getname(path);
-	int error;
-
-	if (IS_ERR(s))
-		return PTR_ERR(s);
-
-	error = kern_path_parent(s, nd);
-	if (error)
-		putname(s);
-	else
-		*name = s;
-
-	return error;
-}
-
 /**
  * ocfs2_vfs_reflink - Create a reference-counted link
  *
@@ -4460,10 +4441,8 @@ int ocfs2_reflink_ioctl(struct inode *inode,
 			bool preserve)
 {
 	struct dentry *new_dentry;
-	struct nameidata nd;
-	struct path old_path;
+	struct path old_path, new_path;
 	int error;
-	char *to = NULL;
 
 	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
 		return -EOPNOTSUPP;
@@ -4474,39 +4453,33 @@ int ocfs2_reflink_ioctl(struct inode *inode,
 		return error;
 	}
 
-	error = ocfs2_user_path_parent(newname, &nd, &to);
-	if (error) {
+	new_dentry = user_path_create(AT_FDCWD, newname, &new_path, 0);
+	error = PTR_ERR(new_dentry);
+	if (IS_ERR(new_dentry)) {
 		mlog_errno(error);
 		goto out;
 	}
 
 	error = -EXDEV;
-	if (old_path.mnt != nd.path.mnt)
-		goto out_release;
-	new_dentry = lookup_create(&nd, 0);
-	error = PTR_ERR(new_dentry);
-	if (IS_ERR(new_dentry)) {
+	if (old_path.mnt != new_path.mnt) {
 		mlog_errno(error);
-		goto out_unlock;
+		goto out_dput;
 	}
 
-	error = mnt_want_write(nd.path.mnt);
+	error = mnt_want_write(new_path.mnt);
 	if (error) {
 		mlog_errno(error);
 		goto out_dput;
 	}
 
 	error = ocfs2_vfs_reflink(old_path.dentry,
-				  nd.path.dentry->d_inode,
+				  new_path.dentry->d_inode,
 				  new_dentry, preserve);
-	mnt_drop_write(nd.path.mnt);
+	mnt_drop_write(new_path.mnt);
 out_dput:
 	dput(new_dentry);
-out_unlock:
-	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
-out_release:
-	path_put(&nd.path);
-	putname(to);
+	mutex_unlock(&new_path.dentry->d_inode->i_mutex);
+	path_put(&new_path);
 out:
 	path_put(&old_path);
 
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 3439ab862e1d..b8cea804d31a 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -74,6 +74,8 @@ extern int user_path_at(int, const char __user *, unsigned, struct path *);
 
 extern int kern_path(const char *, unsigned, struct path *);
 
+extern struct dentry *kern_path_create(int, const char *, struct path *, int);
+extern struct dentry *user_path_create(int, const char __user *, struct path *, int);
 extern int kern_path_parent(const char *, struct nameidata *);
 extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
 			   const char *, unsigned int, struct nameidata *);
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 0722a25a3a33..ec68e1c05b85 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -808,8 +808,9 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	struct net *net = sock_net(sk);
 	struct unix_sock *u = unix_sk(sk);
 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
+	char *sun_path = sunaddr->sun_path;
 	struct dentry *dentry = NULL;
-	struct nameidata nd;
+	struct path path;
 	int err;
 	unsigned hash;
 	struct unix_address *addr;
@@ -845,48 +846,44 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	addr->hash = hash ^ sk->sk_type;
 	atomic_set(&addr->refcnt, 1);
 
-	if (sunaddr->sun_path[0]) {
+	if (sun_path[0]) {
 		unsigned int mode;
 		err = 0;
 		/*
 		 * Get the parent directory, calculate the hash for last
 		 * component.
 		 */
-		err = kern_path_parent(sunaddr->sun_path, &nd);
-		if (err)
-			goto out_mknod_parent;
-
-		dentry = lookup_create(&nd, 0);
+		dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
 		err = PTR_ERR(dentry);
 		if (IS_ERR(dentry))
-			goto out_mknod_unlock;
+			goto out_mknod_parent;
 
 		/*
 		 * All right, let's create it.
 		 */
 		mode = S_IFSOCK |
 		       (SOCK_INODE(sock)->i_mode & ~current_umask());
-		err = mnt_want_write(nd.path.mnt);
+		err = mnt_want_write(path.mnt);
 		if (err)
 			goto out_mknod_dput;
-		err = security_path_mknod(&nd.path, dentry, mode, 0);
+		err = security_path_mknod(&path, dentry, mode, 0);
 		if (err)
 			goto out_mknod_drop_write;
-		err = vfs_mknod(nd.path.dentry->d_inode, dentry, mode, 0);
+		err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0);
 out_mknod_drop_write:
-		mnt_drop_write(nd.path.mnt);
+		mnt_drop_write(path.mnt);
 		if (err)
 			goto out_mknod_dput;
-		mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
-		dput(nd.path.dentry);
-		nd.path.dentry = dentry;
+		mutex_unlock(&path.dentry->d_inode->i_mutex);
+		dput(path.dentry);
+		path.dentry = dentry;
 
 		addr->hash = UNIX_HASH_SIZE;
 	}
 
 	spin_lock(&unix_table_lock);
 
-	if (!sunaddr->sun_path[0]) {
+	if (!sun_path[0]) {
 		err = -EADDRINUSE;
 		if (__unix_find_socket_byname(net, sunaddr, addr_len,
 					      sk->sk_type, hash)) {
@@ -897,8 +894,8 @@ out_mknod_drop_write:
 		list = &unix_socket_table[addr->hash];
 	} else {
 		list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)];
-		u->dentry = nd.path.dentry;
-		u->mnt    = nd.path.mnt;
+		u->dentry = path.dentry;
+		u->mnt    = path.mnt;
 	}
 
 	err = 0;
@@ -915,9 +912,8 @@ out:
 
 out_mknod_dput:
 	dput(dentry);
-out_mknod_unlock:
-	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
-	path_put(&nd.path);
+	mutex_unlock(&path.dentry->d_inode->i_mutex);
+	path_put(&path);
 out_mknod_parent:
 	if (err == -EEXIST)
 		err = -EADDRINUSE;
-- 
cgit v1.2.3


From 6657719390cd05be45f4e3b501d8bb46889c0a19 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 28 Jun 2011 15:41:10 -0400
Subject: make sure that nsproxy_cache is initialized early enough

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/nsproxy.h | 1 +
 kernel/fork.c           | 1 +
 kernel/nsproxy.c        | 4 +---
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index 50d20aba57d3..cc37a55ad004 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -68,6 +68,7 @@ void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new);
 void free_nsproxy(struct nsproxy *ns);
 int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **,
 	struct fs_struct *);
+int __init nsproxy_cache_init(void);
 
 static inline void put_nsproxy(struct nsproxy *ns)
 {
diff --git a/kernel/fork.c b/kernel/fork.c
index 0276c30401a0..31fa13e63b70 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1574,6 +1574,7 @@ void __init proc_caches_init(void)
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
 	vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
 	mmap_init();
+	nsproxy_cache_init();
 }
 
 /*
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index d6a00f3de15d..9aeab4b98c64 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -271,10 +271,8 @@ out:
 	return err;
 }
 
-static int __init nsproxy_cache_init(void)
+int __init nsproxy_cache_init(void)
 {
 	nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC);
 	return 0;
 }
-
-module_init(nsproxy_cache_init);
-- 
cgit v1.2.3


From ed75e95de574c99575e5f3e1d9ca59ea8c12a9cb Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 27 Jun 2011 16:53:43 -0400
Subject: kill lookup_create()

folded into the only caller (kern_path_create())

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c             | 54 +++++++++++++++++---------------------------------
 include/linux/dcache.h |  1 -
 2 files changed, 18 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index b292eb03d9d2..b45a039216c7 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2258,35 +2258,29 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
 	return file;
 }
 
-/**
- * lookup_create - lookup a dentry, creating it if it doesn't exist
- * @nd: nameidata info
- * @is_dir: directory flag
- *
- * Simple function to lookup and return a dentry and create it
- * if it doesn't exist.  Is SMP-safe.
- *
- * Returns with nd->path.dentry->d_inode->i_mutex locked.
- */
-struct dentry *lookup_create(struct nameidata *nd, int is_dir)
+struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path, int is_dir)
 {
 	struct dentry *dentry = ERR_PTR(-EEXIST);
+	struct nameidata nd;
+	int error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd);
+	if (error)
+		return ERR_PTR(error);
 
-	mutex_lock_nested(&nd->path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
 	/*
 	 * Yucky last component or no last component at all?
 	 * (foo/., foo/.., /////)
 	 */
-	if (nd->last_type != LAST_NORM)
-		goto fail;
-	nd->flags &= ~LOOKUP_PARENT;
-	nd->flags |= LOOKUP_CREATE | LOOKUP_EXCL;
-	nd->intent.open.flags = O_EXCL;
+	if (nd.last_type != LAST_NORM)
+		goto out;
+	nd.flags &= ~LOOKUP_PARENT;
+	nd.flags |= LOOKUP_CREATE | LOOKUP_EXCL;
+	nd.intent.open.flags = O_EXCL;
 
 	/*
 	 * Do the final lookup.
 	 */
-	dentry = lookup_hash(nd);
+	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+	dentry = lookup_hash(&nd);
 	if (IS_ERR(dentry))
 		goto fail;
 
@@ -2298,34 +2292,22 @@ struct dentry *lookup_create(struct nameidata *nd, int is_dir)
 	 * all is fine. Let's be bastards - you had / on the end, you've
 	 * been asking for (non-existent) directory. -ENOENT for you.
 	 */
-	if (unlikely(!is_dir && nd->last.name[nd->last.len])) {
+	if (unlikely(!is_dir && nd.last.name[nd.last.len])) {
 		dput(dentry);
 		dentry = ERR_PTR(-ENOENT);
+		goto fail;
 	}
+	*path = nd.path;
 	return dentry;
 eexist:
 	dput(dentry);
 	dentry = ERR_PTR(-EEXIST);
 fail:
+	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
+out:
+	path_put(&nd.path);
 	return dentry;
 }
-EXPORT_SYMBOL_GPL(lookup_create);
-
-struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path, int is_dir)
-{
-	struct nameidata nd;
-	struct dentry *res;
-	int error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd);
-	if (error)
-		return ERR_PTR(error);
-	res = lookup_create(&nd, is_dir);
-	if (IS_ERR(res)) {
-		mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
-		path_put(&nd.path);
-	}
-	*path = nd.path;
-	return res;
-}
 EXPORT_SYMBOL(kern_path_create);
 
 struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, int is_dir)
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 5fa5bd33b979..3f22d8d6d8a3 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -423,7 +423,6 @@ static inline bool d_need_lookup(struct dentry *dentry)
 }
 
 extern void d_clear_need_lookup(struct dentry *dentry);
-extern struct dentry *lookup_create(struct nameidata *nd, int is_dir);
 
 extern int sysctl_vfs_cache_pressure;
 
-- 
cgit v1.2.3


From e0a0124936171af6156b80fe8ac8799f039e767f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 27 Jun 2011 17:00:37 -0400
Subject: switch vfs_path_lookup() to struct path

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c            | 16 +++++++++++-----
 fs/nfs/cache_lib.c    |  9 ++++-----
 fs/nfs/super.c        | 16 +++++-----------
 include/linux/namei.h |  2 +-
 net/sunrpc/clnt.c     | 11 +++++------
 5 files changed, 26 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index b45a039216c7..7e6ba8c80e77 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1575,16 +1575,22 @@ int kern_path(const char *name, unsigned int flags, struct path *path)
  * @mnt: pointer to vfs mount of the base directory
  * @name: pointer to file name
  * @flags: lookup flags
- * @nd: pointer to nameidata
+ * @path: pointer to struct path to fill
  */
 int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 		    const char *name, unsigned int flags,
-		    struct nameidata *nd)
+		    struct path *path)
 {
-	nd->root.dentry = dentry;
-	nd->root.mnt = mnt;
+	struct nameidata nd;
+	int err;
+	nd.root.dentry = dentry;
+	nd.root.mnt = mnt;
+	BUG_ON(flags & LOOKUP_PARENT);
 	/* the first argument of do_path_lookup() is ignored with LOOKUP_ROOT */
-	return do_path_lookup(AT_FDCWD, name, flags | LOOKUP_ROOT, nd);
+	err = do_path_lookup(AT_FDCWD, name, flags | LOOKUP_ROOT, &nd);
+	if (!err)
+		*path = nd.path;
+	return err;
 }
 
 static struct dentry *__lookup_hash(struct qstr *name,
diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c
index 84690319e625..c98b439332fc 100644
--- a/fs/nfs/cache_lib.c
+++ b/fs/nfs/cache_lib.c
@@ -113,19 +113,18 @@ int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq)
 
 int nfs_cache_register(struct cache_detail *cd)
 {
-	struct nameidata nd;
 	struct vfsmount *mnt;
+	struct path path;
 	int ret;
 
 	mnt = rpc_get_mount();
 	if (IS_ERR(mnt))
 		return PTR_ERR(mnt);
-	ret = vfs_path_lookup(mnt->mnt_root, mnt, "/cache", 0, &nd);
+	ret = vfs_path_lookup(mnt->mnt_root, mnt, "/cache", 0, &path);
 	if (ret)
 		goto err;
-	ret = sunrpc_cache_register_pipefs(nd.path.dentry,
-			cd->name, 0600, cd);
-	path_put(&nd.path);
+	ret = sunrpc_cache_register_pipefs(path.dentry, cd->name, 0600, cd);
+	path_put(&path);
 	if (!ret)
 		return ret;
 err:
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index ce40e5c568ba..b961ceac66b4 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2773,16 +2773,12 @@ static void nfs_referral_loop_unprotect(void)
 static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt,
 		const char *export_path)
 {
-	struct nameidata *nd = NULL;
 	struct mnt_namespace *ns_private;
 	struct super_block *s;
 	struct dentry *dentry;
+	struct path path;
 	int ret;
 
-	nd = kmalloc(sizeof(*nd), GFP_KERNEL);
-	if (nd == NULL)
-		return ERR_PTR(-ENOMEM);
-
 	ns_private = create_mnt_ns(root_mnt);
 	ret = PTR_ERR(ns_private);
 	if (IS_ERR(ns_private))
@@ -2793,7 +2789,7 @@ static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt,
 		goto out_put_mnt_ns;
 
 	ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt,
-			export_path, LOOKUP_FOLLOW, nd);
+			export_path, LOOKUP_FOLLOW, &path);
 
 	nfs_referral_loop_unprotect();
 	put_mnt_ns(ns_private);
@@ -2801,12 +2797,11 @@ static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt,
 	if (ret != 0)
 		goto out_err;
 
-	s = nd->path.mnt->mnt_sb;
+	s = path.mnt->mnt_sb;
 	atomic_inc(&s->s_active);
-	dentry = dget(nd->path.dentry);
+	dentry = dget(path.dentry);
 
-	path_put(&nd->path);
-	kfree(nd);
+	path_put(&path);
 	down_write(&s->s_umount);
 	return dentry;
 out_put_mnt_ns:
@@ -2814,7 +2809,6 @@ out_put_mnt_ns:
 out_mntput:
 	mntput(root_mnt);
 out_err:
-	kfree(nd);
 	return ERR_PTR(ret);
 }
 
diff --git a/include/linux/namei.h b/include/linux/namei.h
index b8cea804d31a..76fe2c62ae71 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -78,7 +78,7 @@ extern struct dentry *kern_path_create(int, const char *, struct path *, int);
 extern struct dentry *user_path_create(int, const char __user *, struct path *, int);
 extern int kern_path_parent(const char *, struct nameidata *);
 extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
-			   const char *, unsigned int, struct nameidata *);
+			   const char *, unsigned int, struct path *);
 
 extern struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry *dentry,
 		int (*open)(struct inode *, struct file *));
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 8c9141583d6f..304f403a0411 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -97,8 +97,7 @@ static int
 rpc_setup_pipedir(struct rpc_clnt *clnt, char *dir_name)
 {
 	static uint32_t clntid;
-	struct nameidata nd;
-	struct path path;
+	struct path path, dir;
 	char name[15];
 	struct qstr q = {
 		.name = name,
@@ -113,7 +112,7 @@ rpc_setup_pipedir(struct rpc_clnt *clnt, char *dir_name)
 	path.mnt = rpc_get_mount();
 	if (IS_ERR(path.mnt))
 		return PTR_ERR(path.mnt);
-	error = vfs_path_lookup(path.mnt->mnt_root, path.mnt, dir_name, 0, &nd);
+	error = vfs_path_lookup(path.mnt->mnt_root, path.mnt, dir_name, 0, &dir);
 	if (error)
 		goto err;
 
@@ -121,7 +120,7 @@ rpc_setup_pipedir(struct rpc_clnt *clnt, char *dir_name)
 		q.len = snprintf(name, sizeof(name), "clnt%x", (unsigned int)clntid++);
 		name[sizeof(name) - 1] = '\0';
 		q.hash = full_name_hash(q.name, q.len);
-		path.dentry = rpc_create_client_dir(nd.path.dentry, &q, clnt);
+		path.dentry = rpc_create_client_dir(dir.dentry, &q, clnt);
 		if (!IS_ERR(path.dentry))
 			break;
 		error = PTR_ERR(path.dentry);
@@ -132,11 +131,11 @@ rpc_setup_pipedir(struct rpc_clnt *clnt, char *dir_name)
 			goto err_path_put;
 		}
 	}
-	path_put(&nd.path);
+	path_put(&dir);
 	clnt->cl_path = path;
 	return 0;
 err_path_put:
-	path_put(&nd.path);
+	path_put(&dir);
 err:
 	rpc_put_mount();
 	return error;
-- 
cgit v1.2.3


From 0ee5dc676a5f8fadede608c7281dfedb1ae714ea Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 7 Jul 2011 15:44:25 -0400
Subject: btrfs: kill magical embedded struct superblock

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/ctree.h   |  2 +-
 fs/btrfs/disk-io.c | 15 ++++-----------
 fs/btrfs/inode.c   |  2 +-
 fs/super.c         | 32 +++++++++++++++++++++++---------
 include/linux/fs.h |  2 ++
 5 files changed, 31 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9864cec801ed..9552afc24ef7 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1219,7 +1219,7 @@ struct btrfs_root {
 	 * right now this just gets used so that a root has its own devid
 	 * for stat.  It may be used for more later
 	 */
-	struct super_block anon_super;
+	dev_t anon_dev;
 };
 
 struct btrfs_ioctl_defrag_range_args {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1ac8db5dc0a3..b231ae13b269 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1077,12 +1077,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	init_completion(&root->kobj_unregister);
 	root->defrag_running = 0;
 	root->root_key.objectid = objectid;
-	root->anon_super.s_root = NULL;
-	root->anon_super.s_dev = 0;
-	INIT_LIST_HEAD(&root->anon_super.s_list);
-	INIT_LIST_HEAD(&root->anon_super.s_instances);
-	init_rwsem(&root->anon_super.s_umount);
-
+	root->anon_dev = 0;
 	return 0;
 }
 
@@ -1311,7 +1306,7 @@ again:
 	spin_lock_init(&root->cache_lock);
 	init_waitqueue_head(&root->cache_wait);
 
-	ret = set_anon_super(&root->anon_super, NULL);
+	ret = get_anon_bdev(&root->anon_dev);
 	if (ret)
 		goto fail;
 
@@ -2393,10 +2388,8 @@ static void free_fs_root(struct btrfs_root *root)
 {
 	iput(root->cache_inode);
 	WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
-	if (root->anon_super.s_dev) {
-		down_write(&root->anon_super.s_umount);
-		kill_anon_super(&root->anon_super);
-	}
+	if (root->anon_dev)
+		free_anon_bdev(root->anon_dev);
 	free_extent_buffer(root->node);
 	free_extent_buffer(root->commit_root);
 	kfree(root->free_ino_ctl);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index cb170ca51c64..ecf0fac712d6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6900,7 +6900,7 @@ static int btrfs_getattr(struct vfsmount *mnt,
 {
 	struct inode *inode = dentry->d_inode;
 	generic_fillattr(inode, stat);
-	stat->dev = BTRFS_I(inode)->root->anon_super.s_dev;
+	stat->dev = BTRFS_I(inode)->root->anon_dev;
 	stat->blksize = PAGE_CACHE_SIZE;
 	stat->blocks = (inode_get_bytes(inode) +
 			BTRFS_I(inode)->delalloc_bytes) >> 9;
diff --git a/fs/super.c b/fs/super.c
index 444da9579068..263edeb9f0e9 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -693,7 +693,7 @@ static DEFINE_IDA(unnamed_dev_ida);
 static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */
 static int unnamed_dev_start = 0; /* don't bother trying below it */
 
-int set_anon_super(struct super_block *s, void *data)
+int get_anon_bdev(dev_t *p)
 {
 	int dev;
 	int error;
@@ -720,24 +720,38 @@ int set_anon_super(struct super_block *s, void *data)
 		spin_unlock(&unnamed_dev_lock);
 		return -EMFILE;
 	}
-	s->s_dev = MKDEV(0, dev & MINORMASK);
-	s->s_bdi = &noop_backing_dev_info;
+	*p = MKDEV(0, dev & MINORMASK);
 	return 0;
 }
+EXPORT_SYMBOL(get_anon_bdev);
 
-EXPORT_SYMBOL(set_anon_super);
-
-void kill_anon_super(struct super_block *sb)
+void free_anon_bdev(dev_t dev)
 {
-	int slot = MINOR(sb->s_dev);
-
-	generic_shutdown_super(sb);
+	int slot = MINOR(dev);
 	spin_lock(&unnamed_dev_lock);
 	ida_remove(&unnamed_dev_ida, slot);
 	if (slot < unnamed_dev_start)
 		unnamed_dev_start = slot;
 	spin_unlock(&unnamed_dev_lock);
 }
+EXPORT_SYMBOL(free_anon_bdev);
+
+int set_anon_super(struct super_block *s, void *data)
+{
+	int error = get_anon_bdev(&s->s_dev);
+	if (!error)
+		s->s_bdi = &noop_backing_dev_info;
+	return error;
+}
+
+EXPORT_SYMBOL(set_anon_super);
+
+void kill_anon_super(struct super_block *sb)
+{
+	dev_t dev = sb->s_dev;
+	generic_shutdown_super(sb);
+	free_anon_bdev(dev);
+}
 
 EXPORT_SYMBOL(kill_anon_super);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 8494aac189f0..a0011aef4338 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1835,6 +1835,8 @@ void kill_litter_super(struct super_block *sb);
 void deactivate_super(struct super_block *sb);
 void deactivate_locked_super(struct super_block *sb);
 int set_anon_super(struct super_block *s, void *data);
+int get_anon_bdev(dev_t *);
+void free_anon_bdev(dev_t);
 struct super_block *sget(struct file_system_type *type,
 			int (*test)(struct super_block *,void *),
 			int (*set)(struct super_block *,void *),
-- 
cgit v1.2.3


From e9299f5058595a655c3b207cda9635e28b9197e6 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 8 Jul 2011 14:14:37 +1000
Subject: vmscan: add customisable shrinker batch size

For shrinkers that have their own cond_resched* calls, having
shrink_slab break the work down into small batches is not
paticularly efficient. Add a custom batchsize field to the struct
shrinker so that shrinkers can use a larger batch size if they
desire.

A value of zero (uninitialised) means "use the default", so
behaviour is unchanged by this patch.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/mm.h |  1 +
 mm/vmscan.c        | 11 ++++++-----
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9670f71d7be9..9b9777ac726d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1150,6 +1150,7 @@ struct shrink_control {
 struct shrinker {
 	int (*shrink)(struct shrinker *, struct shrink_control *sc);
 	int seeks;	/* seeks to recreate an obj */
+	long batch;	/* reclaim batch size, 0 = default */
 
 	/* These are for internal use */
 	struct list_head list;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 387422470c95..febbc044e792 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -253,6 +253,8 @@ unsigned long shrink_slab(struct shrink_control *shrink,
 		int shrink_ret = 0;
 		long nr;
 		long new_nr;
+		long batch_size = shrinker->batch ? shrinker->batch
+						  : SHRINK_BATCH;
 
 		/*
 		 * copy the current shrinker scan count into a local variable
@@ -303,19 +305,18 @@ unsigned long shrink_slab(struct shrink_control *shrink,
 					nr_pages_scanned, lru_pages,
 					max_pass, delta, total_scan);
 
-		while (total_scan >= SHRINK_BATCH) {
-			long this_scan = SHRINK_BATCH;
+		while (total_scan >= batch_size) {
 			int nr_before;
 
 			nr_before = do_shrinker_shrink(shrinker, shrink, 0);
 			shrink_ret = do_shrinker_shrink(shrinker, shrink,
-							this_scan);
+							batch_size);
 			if (shrink_ret == -1)
 				break;
 			if (shrink_ret < nr_before)
 				ret += nr_before - shrink_ret;
-			count_vm_events(SLABS_SCANNED, this_scan);
-			total_scan -= this_scan;
+			count_vm_events(SLABS_SCANNED, batch_size);
+			total_scan -= batch_size;
 
 			cond_resched();
 		}
-- 
cgit v1.2.3


From 98b745c647a5a90c3c21ea43cbfad9a47b0dfad7 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 8 Jul 2011 14:14:39 +1000
Subject: inode: Make unused inode LRU per superblock

The inode unused list is currently a global LRU. This does not match
the other global filesystem cache - the dentry cache - which uses
per-superblock LRU lists. Hence we have related filesystem object
types using different LRU reclaimation schemes.

To enable a per-superblock filesystem cache shrinker, both of these
caches need to have per-sb unused object LRU lists. Hence this patch
converts the global inode LRU to per-sb LRUs.

The patch only does rudimentary per-sb propotioning in the shrinker
infrastructure, as this gets removed when the per-sb shrinker
callouts are introduced later on.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c         | 91 +++++++++++++++++++++++++++++++++++++++++++++++-------
 fs/super.c         |  1 +
 include/linux/fs.h |  4 +++
 3 files changed, 85 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index 9a0361121712..8c3491302e0c 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -34,7 +34,7 @@
  * inode->i_lock protects:
  *   inode->i_state, inode->i_hash, __iget()
  * inode_lru_lock protects:
- *   inode_lru, inode->i_lru
+ *   inode->i_sb->s_inode_lru, inode->i_lru
  * inode_sb_list_lock protects:
  *   sb->s_inodes, inode->i_sb_list
  * inode_wb_list_lock protects:
@@ -64,7 +64,6 @@ static unsigned int i_hash_shift __read_mostly;
 static struct hlist_head *inode_hashtable __read_mostly;
 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
 
-static LIST_HEAD(inode_lru);
 static DEFINE_SPINLOCK(inode_lru_lock);
 
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
@@ -345,7 +344,8 @@ static void inode_lru_list_add(struct inode *inode)
 {
 	spin_lock(&inode_lru_lock);
 	if (list_empty(&inode->i_lru)) {
-		list_add(&inode->i_lru, &inode_lru);
+		list_add(&inode->i_lru, &inode->i_sb->s_inode_lru);
+		inode->i_sb->s_nr_inodes_unused++;
 		this_cpu_inc(nr_unused);
 	}
 	spin_unlock(&inode_lru_lock);
@@ -356,6 +356,7 @@ static void inode_lru_list_del(struct inode *inode)
 	spin_lock(&inode_lru_lock);
 	if (!list_empty(&inode->i_lru)) {
 		list_del_init(&inode->i_lru);
+		inode->i_sb->s_nr_inodes_unused--;
 		this_cpu_dec(nr_unused);
 	}
 	spin_unlock(&inode_lru_lock);
@@ -628,21 +629,20 @@ static int can_unuse(struct inode *inode)
  * LRU does not have strict ordering. Hence we don't want to reclaim inodes
  * with this flag set because they are the inodes that are out of order.
  */
-static void prune_icache(int nr_to_scan)
+static void shrink_icache_sb(struct super_block *sb, int *nr_to_scan)
 {
 	LIST_HEAD(freeable);
 	int nr_scanned;
 	unsigned long reap = 0;
 
-	down_read(&iprune_sem);
 	spin_lock(&inode_lru_lock);
-	for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
+	for (nr_scanned = *nr_to_scan; nr_scanned >= 0; nr_scanned--) {
 		struct inode *inode;
 
-		if (list_empty(&inode_lru))
+		if (list_empty(&sb->s_inode_lru))
 			break;
 
-		inode = list_entry(inode_lru.prev, struct inode, i_lru);
+		inode = list_entry(sb->s_inode_lru.prev, struct inode, i_lru);
 
 		/*
 		 * we are inverting the inode_lru_lock/inode->i_lock here,
@@ -650,7 +650,7 @@ static void prune_icache(int nr_to_scan)
 		 * inode to the back of the list so we don't spin on it.
 		 */
 		if (!spin_trylock(&inode->i_lock)) {
-			list_move(&inode->i_lru, &inode_lru);
+			list_move(&inode->i_lru, &sb->s_inode_lru);
 			continue;
 		}
 
@@ -662,6 +662,7 @@ static void prune_icache(int nr_to_scan)
 		    (inode->i_state & ~I_REFERENCED)) {
 			list_del_init(&inode->i_lru);
 			spin_unlock(&inode->i_lock);
+			sb->s_nr_inodes_unused--;
 			this_cpu_dec(nr_unused);
 			continue;
 		}
@@ -669,7 +670,7 @@ static void prune_icache(int nr_to_scan)
 		/* recently referenced inodes get one more pass */
 		if (inode->i_state & I_REFERENCED) {
 			inode->i_state &= ~I_REFERENCED;
-			list_move(&inode->i_lru, &inode_lru);
+			list_move(&inode->i_lru, &sb->s_inode_lru);
 			spin_unlock(&inode->i_lock);
 			continue;
 		}
@@ -683,7 +684,7 @@ static void prune_icache(int nr_to_scan)
 			iput(inode);
 			spin_lock(&inode_lru_lock);
 
-			if (inode != list_entry(inode_lru.next,
+			if (inode != list_entry(sb->s_inode_lru.next,
 						struct inode, i_lru))
 				continue;	/* wrong inode or list_empty */
 			/* avoid lock inversions with trylock */
@@ -699,6 +700,7 @@ static void prune_icache(int nr_to_scan)
 		spin_unlock(&inode->i_lock);
 
 		list_move(&inode->i_lru, &freeable);
+		sb->s_nr_inodes_unused--;
 		this_cpu_dec(nr_unused);
 	}
 	if (current_is_kswapd())
@@ -706,8 +708,75 @@ static void prune_icache(int nr_to_scan)
 	else
 		__count_vm_events(PGINODESTEAL, reap);
 	spin_unlock(&inode_lru_lock);
+	*nr_to_scan = nr_scanned;
 
 	dispose_list(&freeable);
+}
+
+static void prune_icache(int count)
+{
+	struct super_block *sb, *p = NULL;
+	int w_count;
+	int unused = inodes_stat.nr_unused;
+	int prune_ratio;
+	int pruned;
+
+	if (unused == 0 || count == 0)
+		return;
+	down_read(&iprune_sem);
+	if (count >= unused)
+		prune_ratio = 1;
+	else
+		prune_ratio = unused / count;
+	spin_lock(&sb_lock);
+	list_for_each_entry(sb, &super_blocks, s_list) {
+		if (list_empty(&sb->s_instances))
+			continue;
+		if (sb->s_nr_inodes_unused == 0)
+			continue;
+		sb->s_count++;
+		/* Now, we reclaim unused dentrins with fairness.
+		 * We reclaim them same percentage from each superblock.
+		 * We calculate number of dentries to scan on this sb
+		 * as follows, but the implementation is arranged to avoid
+		 * overflows:
+		 * number of dentries to scan on this sb =
+		 * count * (number of dentries on this sb /
+		 * number of dentries in the machine)
+		 */
+		spin_unlock(&sb_lock);
+		if (prune_ratio != 1)
+			w_count = (sb->s_nr_inodes_unused / prune_ratio) + 1;
+		else
+			w_count = sb->s_nr_inodes_unused;
+		pruned = w_count;
+		/*
+		 * We need to be sure this filesystem isn't being unmounted,
+		 * otherwise we could race with generic_shutdown_super(), and
+		 * end up holding a reference to an inode while the filesystem
+		 * is unmounted.  So we try to get s_umount, and make sure
+		 * s_root isn't NULL.
+		 */
+		if (down_read_trylock(&sb->s_umount)) {
+			if ((sb->s_root != NULL) &&
+			    (!list_empty(&sb->s_dentry_lru))) {
+				shrink_icache_sb(sb, &w_count);
+				pruned -= w_count;
+			}
+			up_read(&sb->s_umount);
+		}
+		spin_lock(&sb_lock);
+		if (p)
+			__put_super(p);
+		count -= pruned;
+		p = sb;
+		/* more work left to do? */
+		if (count <= 0)
+			break;
+	}
+	if (p)
+		__put_super(p);
+	spin_unlock(&sb_lock);
 	up_read(&iprune_sem);
 }
 
diff --git a/fs/super.c b/fs/super.c
index 263edeb9f0e9..e8e6dbfefe8c 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -77,6 +77,7 @@ static struct super_block *alloc_super(struct file_system_type *type)
 		INIT_HLIST_BL_HEAD(&s->s_anon);
 		INIT_LIST_HEAD(&s->s_inodes);
 		INIT_LIST_HEAD(&s->s_dentry_lru);
+		INIT_LIST_HEAD(&s->s_inode_lru);
 		init_rwsem(&s->s_umount);
 		mutex_init(&s->s_lock);
 		lockdep_set_class(&s->s_umount, &type->s_umount_key);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a0011aef4338..9724f0a48742 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1397,6 +1397,10 @@ struct super_block {
 	struct list_head	s_dentry_lru;	/* unused dentry lru */
 	int			s_nr_dentry_unused;	/* # of dentry on lru */
 
+	/* inode_lru_lock protects s_inode_lru and s_nr_inodes_unused */
+	struct list_head	s_inode_lru;		/* unused inode lru */
+	int			s_nr_inodes_unused;	/* # of inodes on lru */
+
 	struct block_device	*s_bdev;
 	struct backing_dev_info *s_bdi;
 	struct mtd_info		*s_mtd;
-- 
cgit v1.2.3


From 09cc9fc7a7c3d872065426d7fb0f0ad6d3eb90fc Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 8 Jul 2011 14:14:40 +1000
Subject: inode: move to per-sb LRU locks

With the inode LRUs moving to per-sb structures, there is no longer
a need for a global inode_lru_lock. The locking can be made more
fine-grained by moving to a per-sb LRU lock, isolating the LRU
operations of different filesytsems completely from each other.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c         | 27 +++++++++++++--------------
 fs/super.c         |  1 +
 include/linux/fs.h |  3 ++-
 3 files changed, 16 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index 8c3491302e0c..0450e25aeda0 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -33,7 +33,7 @@
  *
  * inode->i_lock protects:
  *   inode->i_state, inode->i_hash, __iget()
- * inode_lru_lock protects:
+ * inode->i_sb->s_inode_lru_lock protects:
  *   inode->i_sb->s_inode_lru, inode->i_lru
  * inode_sb_list_lock protects:
  *   sb->s_inodes, inode->i_sb_list
@@ -46,7 +46,7 @@
  *
  * inode_sb_list_lock
  *   inode->i_lock
- *     inode_lru_lock
+ *     inode->i_sb->s_inode_lru_lock
  *
  * inode_wb_list_lock
  *   inode->i_lock
@@ -64,8 +64,6 @@ static unsigned int i_hash_shift __read_mostly;
 static struct hlist_head *inode_hashtable __read_mostly;
 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
 
-static DEFINE_SPINLOCK(inode_lru_lock);
-
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock);
 
@@ -342,24 +340,24 @@ EXPORT_SYMBOL(ihold);
 
 static void inode_lru_list_add(struct inode *inode)
 {
-	spin_lock(&inode_lru_lock);
+	spin_lock(&inode->i_sb->s_inode_lru_lock);
 	if (list_empty(&inode->i_lru)) {
 		list_add(&inode->i_lru, &inode->i_sb->s_inode_lru);
 		inode->i_sb->s_nr_inodes_unused++;
 		this_cpu_inc(nr_unused);
 	}
-	spin_unlock(&inode_lru_lock);
+	spin_unlock(&inode->i_sb->s_inode_lru_lock);
 }
 
 static void inode_lru_list_del(struct inode *inode)
 {
-	spin_lock(&inode_lru_lock);
+	spin_lock(&inode->i_sb->s_inode_lru_lock);
 	if (!list_empty(&inode->i_lru)) {
 		list_del_init(&inode->i_lru);
 		inode->i_sb->s_nr_inodes_unused--;
 		this_cpu_dec(nr_unused);
 	}
-	spin_unlock(&inode_lru_lock);
+	spin_unlock(&inode->i_sb->s_inode_lru_lock);
 }
 
 /**
@@ -615,7 +613,8 @@ static int can_unuse(struct inode *inode)
 
 /*
  * Scan `goal' inodes on the unused list for freeable ones. They are moved to a
- * temporary list and then are freed outside inode_lru_lock by dispose_list().
+ * temporary list and then are freed outside sb->s_inode_lru_lock by
+ * dispose_list().
  *
  * Any inodes which are pinned purely because of attached pagecache have their
  * pagecache removed.  If the inode has metadata buffers attached to
@@ -635,7 +634,7 @@ static void shrink_icache_sb(struct super_block *sb, int *nr_to_scan)
 	int nr_scanned;
 	unsigned long reap = 0;
 
-	spin_lock(&inode_lru_lock);
+	spin_lock(&sb->s_inode_lru_lock);
 	for (nr_scanned = *nr_to_scan; nr_scanned >= 0; nr_scanned--) {
 		struct inode *inode;
 
@@ -645,7 +644,7 @@ static void shrink_icache_sb(struct super_block *sb, int *nr_to_scan)
 		inode = list_entry(sb->s_inode_lru.prev, struct inode, i_lru);
 
 		/*
-		 * we are inverting the inode_lru_lock/inode->i_lock here,
+		 * we are inverting the sb->s_inode_lru_lock/inode->i_lock here,
 		 * so use a trylock. If we fail to get the lock, just move the
 		 * inode to the back of the list so we don't spin on it.
 		 */
@@ -677,12 +676,12 @@ static void shrink_icache_sb(struct super_block *sb, int *nr_to_scan)
 		if (inode_has_buffers(inode) || inode->i_data.nrpages) {
 			__iget(inode);
 			spin_unlock(&inode->i_lock);
-			spin_unlock(&inode_lru_lock);
+			spin_unlock(&sb->s_inode_lru_lock);
 			if (remove_inode_buffers(inode))
 				reap += invalidate_mapping_pages(&inode->i_data,
 								0, -1);
 			iput(inode);
-			spin_lock(&inode_lru_lock);
+			spin_lock(&sb->s_inode_lru_lock);
 
 			if (inode != list_entry(sb->s_inode_lru.next,
 						struct inode, i_lru))
@@ -707,7 +706,7 @@ static void shrink_icache_sb(struct super_block *sb, int *nr_to_scan)
 		__count_vm_events(KSWAPD_INODESTEAL, reap);
 	else
 		__count_vm_events(PGINODESTEAL, reap);
-	spin_unlock(&inode_lru_lock);
+	spin_unlock(&sb->s_inode_lru_lock);
 	*nr_to_scan = nr_scanned;
 
 	dispose_list(&freeable);
diff --git a/fs/super.c b/fs/super.c
index e8e6dbfefe8c..73ab9f9b3571 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -78,6 +78,7 @@ static struct super_block *alloc_super(struct file_system_type *type)
 		INIT_LIST_HEAD(&s->s_inodes);
 		INIT_LIST_HEAD(&s->s_dentry_lru);
 		INIT_LIST_HEAD(&s->s_inode_lru);
+		spin_lock_init(&s->s_inode_lru_lock);
 		init_rwsem(&s->s_umount);
 		mutex_init(&s->s_lock);
 		lockdep_set_class(&s->s_umount, &type->s_umount_key);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9724f0a48742..460d2cc21ec6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1397,7 +1397,8 @@ struct super_block {
 	struct list_head	s_dentry_lru;	/* unused dentry lru */
 	int			s_nr_dentry_unused;	/* # of dentry on lru */
 
-	/* inode_lru_lock protects s_inode_lru and s_nr_inodes_unused */
+	/* s_inode_lru_lock protects s_inode_lru and s_nr_inodes_unused */
+	spinlock_t		s_inode_lru_lock ____cacheline_aligned_in_smp;
 	struct list_head	s_inode_lru;		/* unused inode lru */
 	int			s_nr_inodes_unused;	/* # of inodes on lru */
 
-- 
cgit v1.2.3


From 9c3f75cbd144014bea6af866a154cc2e73ab2287 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 14 Jul 2011 13:00:06 +0200
Subject: sched: Break out cpu_power from the sched_group structure

In order to prepare for non-unique sched_groups per domain, we need to
carry the cpu_power elsewhere, so put a level of indirection in.

Reported-and-tested-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/n/tip-qkho2byuhe4482fuknss40ad@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 14 +++++++++-----
 kernel/sched.c        | 32 ++++++++++++++++++++++++++------
 kernel/sched_fair.c   | 46 +++++++++++++++++++++++-----------------------
 3 files changed, 58 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 496770a96487..2e5b3c8e2d3e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -893,16 +893,20 @@ static inline int sd_power_saving_flags(void)
 	return 0;
 }
 
-struct sched_group {
-	struct sched_group *next;	/* Must be a circular list */
-	atomic_t ref;
-
+struct sched_group_power {
 	/*
 	 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
 	 * single CPU.
 	 */
-	unsigned int cpu_power, cpu_power_orig;
+	unsigned int power, power_orig;
+};
+
+struct sched_group {
+	struct sched_group *next;	/* Must be a circular list */
+	atomic_t ref;
+
 	unsigned int group_weight;
+	struct sched_group_power *sgp;
 
 	/*
 	 * The CPUs this group covers.
diff --git a/kernel/sched.c b/kernel/sched.c
index 3dc716f6d8ad..36c10d25d4cd 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6557,7 +6557,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 			break;
 		}
 
-		if (!group->cpu_power) {
+		if (!group->sgp->power) {
 			printk(KERN_CONT "\n");
 			printk(KERN_ERR "ERROR: domain->cpu_power not "
 					"set\n");
@@ -6581,9 +6581,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 		cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
 
 		printk(KERN_CONT " %s", str);
-		if (group->cpu_power != SCHED_POWER_SCALE) {
+		if (group->sgp->power != SCHED_POWER_SCALE) {
 			printk(KERN_CONT " (cpu_power = %d)",
-				group->cpu_power);
+				group->sgp->power);
 		}
 
 		group = group->next;
@@ -6777,8 +6777,10 @@ static struct root_domain *alloc_rootdomain(void)
 static void free_sched_domain(struct rcu_head *rcu)
 {
 	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
-	if (atomic_dec_and_test(&sd->groups->ref))
+	if (atomic_dec_and_test(&sd->groups->ref)) {
+		kfree(sd->groups->sgp);
 		kfree(sd->groups);
+	}
 	kfree(sd);
 }
 
@@ -6945,6 +6947,7 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 struct sd_data {
 	struct sched_domain **__percpu sd;
 	struct sched_group **__percpu sg;
+	struct sched_group_power **__percpu sgp;
 };
 
 struct s_data {
@@ -6981,8 +6984,10 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
 	if (child)
 		cpu = cpumask_first(sched_domain_span(child));
 
-	if (sg)
+	if (sg) {
 		*sg = *per_cpu_ptr(sdd->sg, cpu);
+		(*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
+	}
 
 	return cpu;
 }
@@ -7020,7 +7025,7 @@ build_sched_groups(struct sched_domain *sd)
 			continue;
 
 		cpumask_clear(sched_group_cpus(sg));
-		sg->cpu_power = 0;
+		sg->sgp->power = 0;
 
 		for_each_cpu(j, span) {
 			if (get_group(j, sdd, NULL) != group)
@@ -7185,6 +7190,7 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
 	if (cpu == cpumask_first(sched_group_cpus(sg))) {
 		WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg);
 		*per_cpu_ptr(sdd->sg, cpu) = NULL;
+		*per_cpu_ptr(sdd->sgp, cpu) = NULL;
 	}
 }
 
@@ -7234,9 +7240,14 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 		if (!sdd->sg)
 			return -ENOMEM;
 
+		sdd->sgp = alloc_percpu(struct sched_group_power *);
+		if (!sdd->sgp)
+			return -ENOMEM;
+
 		for_each_cpu(j, cpu_map) {
 			struct sched_domain *sd;
 			struct sched_group *sg;
+			struct sched_group_power *sgp;
 
 		       	sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
 					GFP_KERNEL, cpu_to_node(j));
@@ -7251,6 +7262,13 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 				return -ENOMEM;
 
 			*per_cpu_ptr(sdd->sg, j) = sg;
+
+			sgp = kzalloc_node(sizeof(struct sched_group_power),
+					GFP_KERNEL, cpu_to_node(j));
+			if (!sgp)
+				return -ENOMEM;
+
+			*per_cpu_ptr(sdd->sgp, j) = sgp;
 		}
 	}
 
@@ -7268,9 +7286,11 @@ static void __sdt_free(const struct cpumask *cpu_map)
 		for_each_cpu(j, cpu_map) {
 			kfree(*per_cpu_ptr(sdd->sd, j));
 			kfree(*per_cpu_ptr(sdd->sg, j));
+			kfree(*per_cpu_ptr(sdd->sgp, j));
 		}
 		free_percpu(sdd->sd);
 		free_percpu(sdd->sg);
+		free_percpu(sdd->sgp);
 	}
 }
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 433491c2dc8f..c768588e180b 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1585,7 +1585,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 		}
 
 		/* Adjust by relative CPU power of the group */
-		avg_load = (avg_load * SCHED_POWER_SCALE) / group->cpu_power;
+		avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
 
 		if (local_group) {
 			this_load = avg_load;
@@ -2631,7 +2631,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
 		power >>= SCHED_POWER_SHIFT;
 	}
 
-	sdg->cpu_power_orig = power;
+	sdg->sgp->power_orig = power;
 
 	if (sched_feat(ARCH_POWER))
 		power *= arch_scale_freq_power(sd, cpu);
@@ -2647,7 +2647,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
 		power = 1;
 
 	cpu_rq(cpu)->cpu_power = power;
-	sdg->cpu_power = power;
+	sdg->sgp->power = power;
 }
 
 static void update_group_power(struct sched_domain *sd, int cpu)
@@ -2665,11 +2665,11 @@ static void update_group_power(struct sched_domain *sd, int cpu)
 
 	group = child->groups;
 	do {
-		power += group->cpu_power;
+		power += group->sgp->power;
 		group = group->next;
 	} while (group != child->groups);
 
-	sdg->cpu_power = power;
+	sdg->sgp->power = power;
 }
 
 /*
@@ -2691,7 +2691,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
 	/*
 	 * If ~90% of the cpu_power is still there, we're good.
 	 */
-	if (group->cpu_power * 32 > group->cpu_power_orig * 29)
+	if (group->sgp->power * 32 > group->sgp->power_orig * 29)
 		return 1;
 
 	return 0;
@@ -2771,7 +2771,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	}
 
 	/* Adjust by relative CPU power of the group */
-	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->cpu_power;
+	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
 
 	/*
 	 * Consider the group unbalanced when the imbalance is larger
@@ -2788,7 +2788,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
 		sgs->group_imb = 1;
 
-	sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power,
+	sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
 						SCHED_POWER_SCALE);
 	if (!sgs->group_capacity)
 		sgs->group_capacity = fix_small_capacity(sd, group);
@@ -2877,7 +2877,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 			return;
 
 		sds->total_load += sgs.group_load;
-		sds->total_pwr += sg->cpu_power;
+		sds->total_pwr += sg->sgp->power;
 
 		/*
 		 * In case the child domain prefers tasks go to siblings
@@ -2962,7 +2962,7 @@ static int check_asym_packing(struct sched_domain *sd,
 	if (this_cpu > busiest_cpu)
 		return 0;
 
-	*imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
+	*imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power,
 				       SCHED_POWER_SCALE);
 	return 1;
 }
@@ -2993,7 +2993,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
 
 	scaled_busy_load_per_task = sds->busiest_load_per_task
 					 * SCHED_POWER_SCALE;
-	scaled_busy_load_per_task /= sds->busiest->cpu_power;
+	scaled_busy_load_per_task /= sds->busiest->sgp->power;
 
 	if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
 			(scaled_busy_load_per_task * imbn)) {
@@ -3007,28 +3007,28 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
 	 * moving them.
 	 */
 
-	pwr_now += sds->busiest->cpu_power *
+	pwr_now += sds->busiest->sgp->power *
 			min(sds->busiest_load_per_task, sds->max_load);
-	pwr_now += sds->this->cpu_power *
+	pwr_now += sds->this->sgp->power *
 			min(sds->this_load_per_task, sds->this_load);
 	pwr_now /= SCHED_POWER_SCALE;
 
 	/* Amount of load we'd subtract */
 	tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
-		sds->busiest->cpu_power;
+		sds->busiest->sgp->power;
 	if (sds->max_load > tmp)
-		pwr_move += sds->busiest->cpu_power *
+		pwr_move += sds->busiest->sgp->power *
 			min(sds->busiest_load_per_task, sds->max_load - tmp);
 
 	/* Amount of load we'd add */
-	if (sds->max_load * sds->busiest->cpu_power <
+	if (sds->max_load * sds->busiest->sgp->power <
 		sds->busiest_load_per_task * SCHED_POWER_SCALE)
-		tmp = (sds->max_load * sds->busiest->cpu_power) /
-			sds->this->cpu_power;
+		tmp = (sds->max_load * sds->busiest->sgp->power) /
+			sds->this->sgp->power;
 	else
 		tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
-			sds->this->cpu_power;
-	pwr_move += sds->this->cpu_power *
+			sds->this->sgp->power;
+	pwr_move += sds->this->sgp->power *
 			min(sds->this_load_per_task, sds->this_load + tmp);
 	pwr_move /= SCHED_POWER_SCALE;
 
@@ -3074,7 +3074,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
 
 		load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
 
-		load_above_capacity /= sds->busiest->cpu_power;
+		load_above_capacity /= sds->busiest->sgp->power;
 	}
 
 	/*
@@ -3090,8 +3090,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
 	max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
 
 	/* How much load to actually move to equalise the imbalance */
-	*imbalance = min(max_pull * sds->busiest->cpu_power,
-		(sds->avg_load - sds->this_load) * sds->this->cpu_power)
+	*imbalance = min(max_pull * sds->busiest->sgp->power,
+		(sds->avg_load - sds->this_load) * sds->this->sgp->power)
 			/ SCHED_POWER_SCALE;
 
 	/*
-- 
cgit v1.2.3


From e3589f6c81e4764d32a25d2a2a0afe54fa344f5c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 15 Jul 2011 10:35:52 +0200
Subject: sched: Allow for overlapping sched_domain spans

Allow for sched_domain spans that overlap by giving such domains their
own sched_group list instead of sharing the sched_groups amongst
each-other.

This is needed for machines with more than 16 nodes, because
sched_domain_node_span() will generate a node mask from the
16 nearest nodes without regard if these masks have any overlap.

Currently sched_domains have a sched_group that maps to their child
sched_domain span, and since there is no overlap we share the
sched_group between the sched_domains of the various CPUs. If however
there is overlap, we would need to link the sched_group list in
different ways for each cpu, and hence sharing isn't possible.

In order to solve this, allocate private sched_groups for each CPU's
sched_domain but have the sched_groups share a sched_group_power
structure such that we can uniquely track the power.

Reported-and-tested-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/n/tip-08bxqw9wis3qti9u5inifh3y@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h   |   2 +
 kernel/sched.c          | 157 +++++++++++++++++++++++++++++++++++++++---------
 kernel/sched_features.h |   2 +
 3 files changed, 132 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2e5b3c8e2d3e..bde99d5358dc 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -844,6 +844,7 @@ enum cpu_idle_type {
 #define SD_SERIALIZE		0x0400	/* Only a single load balancing instance */
 #define SD_ASYM_PACKING		0x0800  /* Place busy groups earlier in the domain */
 #define SD_PREFER_SIBLING	0x1000	/* Prefer to place tasks in a sibling domain */
+#define SD_OVERLAP		0x2000	/* sched_domains of this level overlap */
 
 enum powersavings_balance_level {
 	POWERSAVINGS_BALANCE_NONE = 0,  /* No power saving load balance */
@@ -894,6 +895,7 @@ static inline int sd_power_saving_flags(void)
 }
 
 struct sched_group_power {
+	atomic_t ref;
 	/*
 	 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
 	 * single CPU.
diff --git a/kernel/sched.c b/kernel/sched.c
index 36c10d25d4cd..921adf6f6fad 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6774,10 +6774,36 @@ static struct root_domain *alloc_rootdomain(void)
 	return rd;
 }
 
+static void free_sched_groups(struct sched_group *sg, int free_sgp)
+{
+	struct sched_group *tmp, *first;
+
+	if (!sg)
+		return;
+
+	first = sg;
+	do {
+		tmp = sg->next;
+
+		if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
+			kfree(sg->sgp);
+
+		kfree(sg);
+		sg = tmp;
+	} while (sg != first);
+}
+
 static void free_sched_domain(struct rcu_head *rcu)
 {
 	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
-	if (atomic_dec_and_test(&sd->groups->ref)) {
+
+	/*
+	 * If its an overlapping domain it has private groups, iterate and
+	 * nuke them all.
+	 */
+	if (sd->flags & SD_OVERLAP) {
+		free_sched_groups(sd->groups, 1);
+	} else if (atomic_dec_and_test(&sd->groups->ref)) {
 		kfree(sd->groups->sgp);
 		kfree(sd->groups);
 	}
@@ -6967,15 +6993,73 @@ struct sched_domain_topology_level;
 typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
 
+#define SDTL_OVERLAP	0x01
+
 struct sched_domain_topology_level {
 	sched_domain_init_f init;
 	sched_domain_mask_f mask;
+	int		    flags;
 	struct sd_data      data;
 };
 
-/*
- * Assumes the sched_domain tree is fully constructed
- */
+static int
+build_overlap_sched_groups(struct sched_domain *sd, int cpu)
+{
+	struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
+	const struct cpumask *span = sched_domain_span(sd);
+	struct cpumask *covered = sched_domains_tmpmask;
+	struct sd_data *sdd = sd->private;
+	struct sched_domain *child;
+	int i;
+
+	cpumask_clear(covered);
+
+	for_each_cpu(i, span) {
+		struct cpumask *sg_span;
+
+		if (cpumask_test_cpu(i, covered))
+			continue;
+
+		sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
+				GFP_KERNEL, cpu_to_node(i));
+
+		if (!sg)
+			goto fail;
+
+		sg_span = sched_group_cpus(sg);
+
+		child = *per_cpu_ptr(sdd->sd, i);
+		if (child->child) {
+			child = child->child;
+			cpumask_copy(sg_span, sched_domain_span(child));
+		} else
+			cpumask_set_cpu(i, sg_span);
+
+		cpumask_or(covered, covered, sg_span);
+
+		sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
+		atomic_inc(&sg->sgp->ref);
+
+		if (cpumask_test_cpu(cpu, sg_span))
+			groups = sg;
+
+		if (!first)
+			first = sg;
+		if (last)
+			last->next = sg;
+		last = sg;
+		last->next = first;
+	}
+	sd->groups = groups;
+
+	return 0;
+
+fail:
+	free_sched_groups(first, 0);
+
+	return -ENOMEM;
+}
+
 static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
 {
 	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
@@ -6987,23 +7071,21 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
 	if (sg) {
 		*sg = *per_cpu_ptr(sdd->sg, cpu);
 		(*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
+		atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
 	}
 
 	return cpu;
 }
 
 /*
- * build_sched_groups takes the cpumask we wish to span, and a pointer
- * to a function which identifies what group(along with sched group) a CPU
- * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
- * (due to the fact that we keep track of groups covered with a struct cpumask).
- *
  * build_sched_groups will build a circular linked list of the groups
  * covered by the given span, and will set each group's ->cpumask correctly,
  * and ->cpu_power to 0.
+ *
+ * Assumes the sched_domain tree is fully constructed
  */
-static void
-build_sched_groups(struct sched_domain *sd)
+static int
+build_sched_groups(struct sched_domain *sd, int cpu)
 {
 	struct sched_group *first = NULL, *last = NULL;
 	struct sd_data *sdd = sd->private;
@@ -7011,6 +7093,12 @@ build_sched_groups(struct sched_domain *sd)
 	struct cpumask *covered;
 	int i;
 
+	get_group(cpu, sdd, &sd->groups);
+	atomic_inc(&sd->groups->ref);
+
+	if (cpu != cpumask_first(sched_domain_span(sd)))
+		return 0;
+
 	lockdep_assert_held(&sched_domains_mutex);
 	covered = sched_domains_tmpmask;
 
@@ -7042,6 +7130,8 @@ build_sched_groups(struct sched_domain *sd)
 		last = sg;
 	}
 	last->next = first;
+
+	return 0;
 }
 
 /*
@@ -7056,12 +7146,17 @@ build_sched_groups(struct sched_domain *sd)
  */
 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 {
-	WARN_ON(!sd || !sd->groups);
+	struct sched_group *sg = sd->groups;
 
-	if (cpu != group_first_cpu(sd->groups))
-		return;
+	WARN_ON(!sd || !sg);
+
+	do {
+		sg->group_weight = cpumask_weight(sched_group_cpus(sg));
+		sg = sg->next;
+	} while (sg != sd->groups);
 
-	sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
+	if (cpu != group_first_cpu(sg))
+		return;
 
 	update_group_power(sd, cpu);
 }
@@ -7182,16 +7277,15 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
 static void claim_allocations(int cpu, struct sched_domain *sd)
 {
 	struct sd_data *sdd = sd->private;
-	struct sched_group *sg = sd->groups;
 
 	WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
 	*per_cpu_ptr(sdd->sd, cpu) = NULL;
 
-	if (cpu == cpumask_first(sched_group_cpus(sg))) {
-		WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg);
+	if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
 		*per_cpu_ptr(sdd->sg, cpu) = NULL;
+
+	if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
 		*per_cpu_ptr(sdd->sgp, cpu) = NULL;
-	}
 }
 
 #ifdef CONFIG_SCHED_SMT
@@ -7216,7 +7310,7 @@ static struct sched_domain_topology_level default_topology[] = {
 #endif
 	{ sd_init_CPU, cpu_cpu_mask, },
 #ifdef CONFIG_NUMA
-	{ sd_init_NODE, cpu_node_mask, },
+	{ sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
 	{ sd_init_ALLNODES, cpu_allnodes_mask, },
 #endif
 	{ NULL, },
@@ -7284,7 +7378,9 @@ static void __sdt_free(const struct cpumask *cpu_map)
 		struct sd_data *sdd = &tl->data;
 
 		for_each_cpu(j, cpu_map) {
-			kfree(*per_cpu_ptr(sdd->sd, j));
+			struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
+			if (sd && (sd->flags & SD_OVERLAP))
+				free_sched_groups(sd->groups, 0);
 			kfree(*per_cpu_ptr(sdd->sg, j));
 			kfree(*per_cpu_ptr(sdd->sgp, j));
 		}
@@ -7336,8 +7432,11 @@ static int build_sched_domains(const struct cpumask *cpu_map,
 		struct sched_domain_topology_level *tl;
 
 		sd = NULL;
-		for (tl = sched_domain_topology; tl->init; tl++)
+		for (tl = sched_domain_topology; tl->init; tl++) {
 			sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
+			if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
+				sd->flags |= SD_OVERLAP;
+		}
 
 		while (sd->child)
 			sd = sd->child;
@@ -7349,13 +7448,13 @@ static int build_sched_domains(const struct cpumask *cpu_map,
 	for_each_cpu(i, cpu_map) {
 		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
 			sd->span_weight = cpumask_weight(sched_domain_span(sd));
-			get_group(i, sd->private, &sd->groups);
-			atomic_inc(&sd->groups->ref);
-
-			if (i != cpumask_first(sched_domain_span(sd)))
-				continue;
-
-			build_sched_groups(sd);
+			if (sd->flags & SD_OVERLAP) {
+				if (build_overlap_sched_groups(sd, i))
+					goto error;
+			} else {
+				if (build_sched_groups(sd, i))
+					goto error;
+			}
 		}
 	}
 
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index be40f7371ee1..1e7066d76c26 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -70,3 +70,5 @@ SCHED_FEAT(NONIRQ_POWER, 1)
  * using the scheduler IPI. Reduces rq->lock contention/bounces.
  */
 SCHED_FEAT(TTWU_QUEUE, 1)
+
+SCHED_FEAT(FORCE_SD_OVERLAP, 0)
-- 
cgit v1.2.3


From b56efcf0a45aa7fc32de90d5f9838541082fbc19 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 20 Jul 2011 19:04:23 +0200
Subject: slab: shrink sizeof(struct kmem_cache)

Reduce high order allocations for some setups.
(NR_CPUS=4096 -> we need 64KB per kmem_cache struct)

We now allocate exact needed size (using nr_cpu_ids and nr_node_ids)

This also makes code a bit smaller on x86_64, since some field offsets
are less than the 127 limit :

Before patch :
# size mm/slab.o
   text    data     bss     dec     hex filename
  22605  361665      32  384302   5dd2e mm/slab.o

After patch :
# size mm/slab.o
   text    data     bss     dec     hex filename
  22349	 353473	   8224	 384046	  5dc2e	mm/slab.o

CC: Andrew Morton <akpm@linux-foundation.org>
Reported-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Acked-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 include/linux/slab_def.h | 26 +++++++++++++-------------
 mm/slab.c                | 10 ++++++----
 2 files changed, 19 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index d7f63112f63c..d00e0bacda93 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -24,21 +24,19 @@
  */
 
 struct kmem_cache {
-/* 1) per-cpu data, touched during every alloc/free */
-	struct array_cache *array[NR_CPUS];
-/* 2) Cache tunables. Protected by cache_chain_mutex */
+/* 1) Cache tunables. Protected by cache_chain_mutex */
 	unsigned int batchcount;
 	unsigned int limit;
 	unsigned int shared;
 
 	unsigned int buffer_size;
 	u32 reciprocal_buffer_size;
-/* 3) touched by every alloc & free from the backend */
+/* 2) touched by every alloc & free from the backend */
 
 	unsigned int flags;		/* constant flags */
 	unsigned int num;		/* # of objs per slab */
 
-/* 4) cache_grow/shrink */
+/* 3) cache_grow/shrink */
 	/* order of pgs per slab (2^n) */
 	unsigned int gfporder;
 
@@ -54,11 +52,11 @@ struct kmem_cache {
 	/* constructor func */
 	void (*ctor)(void *obj);
 
-/* 5) cache creation/removal */
+/* 4) cache creation/removal */
 	const char *name;
 	struct list_head next;
 
-/* 6) statistics */
+/* 5) statistics */
 #ifdef CONFIG_DEBUG_SLAB
 	unsigned long num_active;
 	unsigned long num_allocations;
@@ -85,16 +83,18 @@ struct kmem_cache {
 	int obj_size;
 #endif /* CONFIG_DEBUG_SLAB */
 
+/* 6) per-cpu/per-node data, touched during every alloc/free */
 	/*
-	 * We put nodelists[] at the end of kmem_cache, because we want to size
-	 * this array to nr_node_ids slots instead of MAX_NUMNODES
+	 * We put array[] at the end of kmem_cache, because we want to size
+	 * this array to nr_cpu_ids slots instead of NR_CPUS
 	 * (see kmem_cache_init())
-	 * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache
-	 * is statically defined, so we reserve the max number of nodes.
+	 * We still use [NR_CPUS] and not [1] or [0] because cache_cache
+	 * is statically defined, so we reserve the max number of cpus.
 	 */
-	struct kmem_list3 *nodelists[MAX_NUMNODES];
+	struct kmem_list3 **nodelists;
+	struct array_cache *array[NR_CPUS];
 	/*
-	 * Do not add fields after nodelists[]
+	 * Do not add fields after array[]
 	 */
 };
 
diff --git a/mm/slab.c b/mm/slab.c
index ef8ceb726e71..c3cb3598555a 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -574,7 +574,9 @@ static struct arraycache_init initarray_generic =
     { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
 
 /* internal cache of cache description objs */
+static struct kmem_list3 *cache_cache_nodelists[MAX_NUMNODES];
 static struct kmem_cache cache_cache = {
+	.nodelists = cache_cache_nodelists,
 	.batchcount = 1,
 	.limit = BOOT_CPUCACHE_ENTRIES,
 	.shared = 1,
@@ -1492,11 +1494,10 @@ void __init kmem_cache_init(void)
 	cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
 
 	/*
-	 * struct kmem_cache size depends on nr_node_ids, which
-	 * can be less than MAX_NUMNODES.
+	 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
 	 */
-	cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) +
-				 nr_node_ids * sizeof(struct kmem_list3 *);
+	cache_cache.buffer_size = offsetof(struct kmem_cache, array[nr_cpu_ids]) +
+				  nr_node_ids * sizeof(struct kmem_list3 *);
 #if DEBUG
 	cache_cache.obj_size = cache_cache.buffer_size;
 #endif
@@ -2308,6 +2309,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	if (!cachep)
 		goto oops;
 
+	cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
 #if DEBUG
 	cachep->obj_size = size;
 
-- 
cgit v1.2.3


From 7f70893173b056df691b2ee7546bb44fd9abae6a Mon Sep 17 00:00:00 2001
From: "Jan H. Schönherr" <schnhrr@cs.tu-berlin.de>
Date: Tue, 19 Jul 2011 21:10:26 +0200
Subject: rcu: Fix wrong check in list_splice_init_rcu()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If the list to be spliced is empty, then list_splice_init_rcu() has
nothing to do.  Unfortunately, list_splice_init_rcu() does not check
the list to be spliced; it instead checks the list to be spliced into.
This results in memory leaks given current usage.  This commit
therefore fixes the empty-list check.

Signed-off-by: Jan H. Schönherr <schnhrr@cs.tu-berlin.de>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/rculist.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index e3beb315517a..d079290843a9 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -183,7 +183,7 @@ static inline void list_splice_init_rcu(struct list_head *list,
 	struct list_head *last = list->prev;
 	struct list_head *at = head->next;
 
-	if (list_empty(head))
+	if (list_empty(list))
 		return;
 
 	/* "first" and "last" tracking list, so initialize it. */
-- 
cgit v1.2.3


From 85d6509dc8ca24b2b652863ef7a75622ddca17d6 Mon Sep 17 00:00:00 2001
From: Shawn Guo <shawn.guo@linaro.org>
Date: Fri, 27 May 2011 23:48:12 +0800
Subject: mmc: sdhci: make sdhci-pltfm device drivers self registered

The patch turns the common stuff in sdhci-pltfm.c into functions, and
add device drivers their own .probe and .remove which in turn call
into the common functions, so that those sdhci-pltfm device drivers
register itself and keep all device specific things away from common
sdhci-pltfm file.

Signed-off-by: Shawn Guo <shawn.guo@linaro.org>
Reviewed-by: Grant Likely <grant.likely@secretlab.ca>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Anton Vorontsov <cbouatmailru@gmail.com>
Signed-off-by: Chris Ball <cjb@laptop.org>
---
 drivers/mmc/host/Kconfig           |  29 +++----
 drivers/mmc/host/Makefile          |  14 ++--
 drivers/mmc/host/sdhci-cns3xxx.c   |  42 +++++++++-
 drivers/mmc/host/sdhci-dove.c      |  42 +++++++++-
 drivers/mmc/host/sdhci-esdhc-imx.c | 113 ++++++++++++++++++--------
 drivers/mmc/host/sdhci-pltfm.c     | 157 +++++++++----------------------------
 drivers/mmc/host/sdhci-pltfm.h     |  17 +++-
 drivers/mmc/host/sdhci-tegra.c     | 116 ++++++++++++++++++---------
 include/linux/mmc/sdhci-pltfm.h    |   6 --
 9 files changed, 313 insertions(+), 223 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig
index 56dbf3f6ad08..d9ca2623038d 100644
--- a/drivers/mmc/host/Kconfig
+++ b/drivers/mmc/host/Kconfig
@@ -112,29 +112,19 @@ config MMC_SDHCI_OF_HLWD
 
 	  If unsure, say N.
 
-config MMC_SDHCI_PLTFM
-	tristate "SDHCI support on the platform specific bus"
-	depends on MMC_SDHCI
-	help
-	  This selects the platform specific bus support for Secure Digital Host
-	  Controller Interface.
-
-	  If you have a controller with this interface, say Y or M here.
-
-	  If unsure, say N.
-
 config MMC_SDHCI_CNS3XXX
-	bool "SDHCI support on the Cavium Networks CNS3xxx SoC"
+	tristate "SDHCI support on the Cavium Networks CNS3xxx SoC"
 	depends on ARCH_CNS3XXX
-	depends on MMC_SDHCI_PLTFM
+	depends on MMC_SDHCI
 	help
 	  This selects the SDHCI support for CNS3xxx System-on-Chip devices.
 
 	  If unsure, say N.
 
 config MMC_SDHCI_ESDHC_IMX
-	bool "SDHCI platform support for the Freescale eSDHC i.MX controller"
-	depends on MMC_SDHCI_PLTFM && (ARCH_MX25 || ARCH_MX35 || ARCH_MX5)
+	tristate "SDHCI platform support for the Freescale eSDHC i.MX controller"
+	depends on ARCH_MX25 || ARCH_MX35 || ARCH_MX5
+	depends on MMC_SDHCI
 	select MMC_SDHCI_IO_ACCESSORS
 	help
 	  This selects the Freescale eSDHC controller support on the platform
@@ -143,9 +133,9 @@ config MMC_SDHCI_ESDHC_IMX
 	  If unsure, say N.
 
 config MMC_SDHCI_DOVE
-	bool "SDHCI support on Marvell's Dove SoC"
+	tristate "SDHCI support on Marvell's Dove SoC"
 	depends on ARCH_DOVE
-	depends on MMC_SDHCI_PLTFM
+	depends on MMC_SDHCI
 	select MMC_SDHCI_IO_ACCESSORS
 	help
 	  This selects the Secure Digital Host Controller Interface in
@@ -154,8 +144,9 @@ config MMC_SDHCI_DOVE
 	  If unsure, say N.
 
 config MMC_SDHCI_TEGRA
-	bool "SDHCI platform support for the Tegra SD/MMC Controller"
-	depends on MMC_SDHCI_PLTFM && ARCH_TEGRA
+	tristate "SDHCI platform support for the Tegra SD/MMC Controller"
+	depends on ARCH_TEGRA
+	depends on MMC_SDHCI
 	select MMC_SDHCI_IO_ACCESSORS
 	help
 	  This selects the Tegra SD/MMC controller. If you have a Tegra
diff --git a/drivers/mmc/host/Makefile b/drivers/mmc/host/Makefile
index 58a5cf73d6e9..732ec1e2a3d0 100644
--- a/drivers/mmc/host/Makefile
+++ b/drivers/mmc/host/Makefile
@@ -44,12 +44,14 @@ obj-$(CONFIG_MMC_JZ4740)	+= jz4740_mmc.o
 obj-$(CONFIG_MMC_VUB300)	+= vub300.o
 obj-$(CONFIG_MMC_USHC)		+= ushc.o
 
-obj-$(CONFIG_MMC_SDHCI_PLTFM)			+= sdhci-platform.o
-sdhci-platform-y				:= sdhci-pltfm.o
-sdhci-platform-$(CONFIG_MMC_SDHCI_CNS3XXX)	+= sdhci-cns3xxx.o
-sdhci-platform-$(CONFIG_MMC_SDHCI_ESDHC_IMX)	+= sdhci-esdhc-imx.o
-sdhci-platform-$(CONFIG_MMC_SDHCI_DOVE)		+= sdhci-dove.o
-sdhci-platform-$(CONFIG_MMC_SDHCI_TEGRA)	+= sdhci-tegra.o
+obj-$(CONFIG_MMC_SDHCI_CNS3XXX)		+= sdhci-cns3xxx.o
+sdhci-cns3xxx-objs			:= sdhci-pltfm.o
+obj-$(CONFIG_MMC_SDHCI_ESDHC_IMX)	+= sdhci-esdhc-imx.o
+sdhci-esdhc-imx-objs			:= sdhci-pltfm.o
+obj-$(CONFIG_MMC_SDHCI_DOVE)		+= sdhci-dove.o
+sdhci-dove-objs				:= sdhci-pltfm.o
+obj-$(CONFIG_MMC_SDHCI_TEGRA)		+= sdhci-tegra.o
+sdhci-tegra-objs			:= sdhci-pltfm.o
 
 obj-$(CONFIG_MMC_SDHCI_OF)	+= sdhci-of.o
 sdhci-of-y				:= sdhci-of-core.o
diff --git a/drivers/mmc/host/sdhci-cns3xxx.c b/drivers/mmc/host/sdhci-cns3xxx.c
index 9ebd1d7759dc..ac4b26f555e6 100644
--- a/drivers/mmc/host/sdhci-cns3xxx.c
+++ b/drivers/mmc/host/sdhci-cns3xxx.c
@@ -86,7 +86,7 @@ static struct sdhci_ops sdhci_cns3xxx_ops = {
 	.set_clock	= sdhci_cns3xxx_set_clock,
 };
 
-struct sdhci_pltfm_data sdhci_cns3xxx_pdata = {
+static struct sdhci_pltfm_data sdhci_cns3xxx_pdata = {
 	.ops = &sdhci_cns3xxx_ops,
 	.quirks = SDHCI_QUIRK_BROKEN_DMA |
 		  SDHCI_QUIRK_DATA_TIMEOUT_USES_SDCLK |
@@ -95,3 +95,43 @@ struct sdhci_pltfm_data sdhci_cns3xxx_pdata = {
 		  SDHCI_QUIRK_BROKEN_TIMEOUT_VAL |
 		  SDHCI_QUIRK_NONSTANDARD_CLOCK,
 };
+
+static int __devinit sdhci_cns3xxx_probe(struct platform_device *pdev)
+{
+	return sdhci_pltfm_register(pdev, &sdhci_cns3xxx_pdata);
+}
+
+static int __devexit sdhci_cns3xxx_remove(struct platform_device *pdev)
+{
+	return sdhci_pltfm_unregister(pdev);
+}
+
+static struct platform_driver sdhci_cns3xxx_driver = {
+	.driver		= {
+		.name	= "sdhci-cns3xxx",
+		.owner	= THIS_MODULE,
+	},
+	.probe		= sdhci_cns3xxx_probe,
+	.remove		= __devexit_p(sdhci_cns3xxx_remove),
+#ifdef CONFIG_PM
+	.suspend	= sdhci_pltfm_suspend,
+	.resume		= sdhci_pltfm_resume,
+#endif
+};
+
+static int __init sdhci_cns3xxx_init(void)
+{
+	return platform_driver_register(&sdhci_cns3xxx_driver);
+}
+module_init(sdhci_cns3xxx_init);
+
+static void __exit sdhci_cns3xxx_exit(void)
+{
+	platform_driver_unregister(&sdhci_cns3xxx_driver);
+}
+module_exit(sdhci_cns3xxx_exit);
+
+MODULE_DESCRIPTION("SDHCI driver for CNS3xxx");
+MODULE_AUTHOR("Scott Shu, "
+	      "Anton Vorontsov <avorontsov@mvista.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/mmc/host/sdhci-dove.c b/drivers/mmc/host/sdhci-dove.c
index 2aeef4ffed8c..49aa533f23d4 100644
--- a/drivers/mmc/host/sdhci-dove.c
+++ b/drivers/mmc/host/sdhci-dove.c
@@ -61,10 +61,50 @@ static struct sdhci_ops sdhci_dove_ops = {
 	.read_l	= sdhci_dove_readl,
 };
 
-struct sdhci_pltfm_data sdhci_dove_pdata = {
+static struct sdhci_pltfm_data sdhci_dove_pdata = {
 	.ops	= &sdhci_dove_ops,
 	.quirks	= SDHCI_QUIRK_NO_SIMULT_VDD_AND_POWER |
 		  SDHCI_QUIRK_NO_BUSY_IRQ |
 		  SDHCI_QUIRK_BROKEN_TIMEOUT_VAL |
 		  SDHCI_QUIRK_FORCE_DMA,
 };
+
+static int __devinit sdhci_dove_probe(struct platform_device *pdev)
+{
+	return sdhci_pltfm_register(pdev, &sdhci_dove_pdata);
+}
+
+static int __devexit sdhci_dove_remove(struct platform_device *pdev)
+{
+	return sdhci_pltfm_unregister(pdev);
+}
+
+static struct platform_driver sdhci_dove_driver = {
+	.driver		= {
+		.name	= "sdhci-dove",
+		.owner	= THIS_MODULE,
+	},
+	.probe		= sdhci_dove_probe,
+	.remove		= __devexit_p(sdhci_dove_remove),
+#ifdef CONFIG_PM
+	.suspend	= sdhci_pltfm_suspend,
+	.resume		= sdhci_pltfm_resume,
+#endif
+};
+
+static int __init sdhci_dove_init(void)
+{
+	return platform_driver_register(&sdhci_dove_driver);
+}
+module_init(sdhci_dove_init);
+
+static void __exit sdhci_dove_exit(void)
+{
+	platform_driver_unregister(&sdhci_dove_driver);
+}
+module_exit(sdhci_dove_exit);
+
+MODULE_DESCRIPTION("SDHCI driver for Dove");
+MODULE_AUTHOR("Saeed Bishara <saeed@marvell.com>, "
+	      "Mike Rapoport <mike@compulab.co.il>");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/mmc/host/sdhci-esdhc-imx.c b/drivers/mmc/host/sdhci-esdhc-imx.c
index a19967d0bfc4..e27ccbb5285b 100644
--- a/drivers/mmc/host/sdhci-esdhc-imx.c
+++ b/drivers/mmc/host/sdhci-esdhc-imx.c
@@ -191,16 +191,6 @@ static unsigned int esdhc_pltfm_get_min_clock(struct sdhci_host *host)
 	return clk_get_rate(pltfm_host->clk) / 256 / 16;
 }
 
-static unsigned int esdhc_pltfm_get_ro(struct sdhci_host *host)
-{
-	struct esdhc_platform_data *boarddata = host->mmc->parent->platform_data;
-
-	if (boarddata && gpio_is_valid(boarddata->wp_gpio))
-		return gpio_get_value(boarddata->wp_gpio);
-	else
-		return -ENOSYS;
-}
-
 static struct sdhci_ops sdhci_esdhc_ops = {
 	.read_l = esdhc_readl_le,
 	.read_w = esdhc_readw_le,
@@ -212,6 +202,24 @@ static struct sdhci_ops sdhci_esdhc_ops = {
 	.get_min_clock = esdhc_pltfm_get_min_clock,
 };
 
+static struct sdhci_pltfm_data sdhci_esdhc_imx_pdata = {
+	.quirks = ESDHC_DEFAULT_QUIRKS | SDHCI_QUIRK_BROKEN_ADMA
+			| SDHCI_QUIRK_BROKEN_CARD_DETECTION,
+	/* ADMA has issues. Might be fixable */
+	.ops = &sdhci_esdhc_ops,
+};
+
+static unsigned int esdhc_pltfm_get_ro(struct sdhci_host *host)
+{
+	struct esdhc_platform_data *boarddata =
+			host->mmc->parent->platform_data;
+
+	if (boarddata && gpio_is_valid(boarddata->wp_gpio))
+		return gpio_get_value(boarddata->wp_gpio);
+	else
+		return -ENOSYS;
+}
+
 static irqreturn_t cd_irq(int irq, void *data)
 {
 	struct sdhci_host *sdhost = (struct sdhci_host *)data;
@@ -220,30 +228,35 @@ static irqreturn_t cd_irq(int irq, void *data)
 	return IRQ_HANDLED;
 };
 
-static int esdhc_pltfm_init(struct sdhci_host *host, struct sdhci_pltfm_data *pdata)
+static int __devinit sdhci_esdhc_imx_probe(struct platform_device *pdev)
 {
-	struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
-	struct esdhc_platform_data *boarddata = host->mmc->parent->platform_data;
+	struct sdhci_pltfm_host *pltfm_host;
+	struct sdhci_host *host;
+	struct esdhc_platform_data *boarddata;
 	struct clk *clk;
 	int err;
 	struct pltfm_imx_data *imx_data;
 
+	host = sdhci_pltfm_init(pdev, &sdhci_esdhc_imx_pdata);
+	if (IS_ERR(host))
+		return PTR_ERR(host);
+
+	pltfm_host = sdhci_priv(host);
+
+	imx_data = kzalloc(sizeof(struct pltfm_imx_data), GFP_KERNEL);
+	if (!imx_data)
+		return -ENOMEM;
+	pltfm_host->priv = imx_data;
+
 	clk = clk_get(mmc_dev(host->mmc), NULL);
 	if (IS_ERR(clk)) {
 		dev_err(mmc_dev(host->mmc), "clk err\n");
-		return PTR_ERR(clk);
+		err = PTR_ERR(clk);
+		goto err_clk_get;
 	}
 	clk_enable(clk);
 	pltfm_host->clk = clk;
 
-	imx_data = kzalloc(sizeof(struct pltfm_imx_data), GFP_KERNEL);
-	if (!imx_data) {
-		clk_disable(pltfm_host->clk);
-		clk_put(pltfm_host->clk);
-		return -ENOMEM;
-	}
-	pltfm_host->priv = imx_data;
-
 	if (!cpu_is_mx25())
 		host->quirks |= SDHCI_QUIRK_BROKEN_TIMEOUT_VAL;
 
@@ -257,6 +270,7 @@ static int esdhc_pltfm_init(struct sdhci_host *host, struct sdhci_pltfm_data *pd
 	if (!(cpu_is_mx25() || cpu_is_mx35() || cpu_is_mx51()))
 		imx_data->flags |= ESDHC_FLAG_MULTIBLK_NO_INT;
 
+	boarddata = host->mmc->parent->platform_data;
 	if (boarddata) {
 		err = gpio_request_one(boarddata->wp_gpio, GPIOF_IN, "ESDHC_WP");
 		if (err) {
@@ -289,6 +303,10 @@ static int esdhc_pltfm_init(struct sdhci_host *host, struct sdhci_pltfm_data *pd
 		host->quirks &= ~SDHCI_QUIRK_BROKEN_CARD_DETECTION;
 	}
 
+	err = sdhci_add_host(host);
+	if (err)
+		goto err_add_host;
+
 	return 0;
 
  no_card_detect_irq:
@@ -297,14 +315,23 @@ static int esdhc_pltfm_init(struct sdhci_host *host, struct sdhci_pltfm_data *pd
 	boarddata->cd_gpio = err;
  not_supported:
 	kfree(imx_data);
-	return 0;
+ err_add_host:
+	clk_disable(pltfm_host->clk);
+	clk_put(pltfm_host->clk);
+ err_clk_get:
+	sdhci_pltfm_free(pdev);
+	return err;
 }
 
-static void esdhc_pltfm_exit(struct sdhci_host *host)
+static int __devexit sdhci_esdhc_imx_remove(struct platform_device *pdev)
 {
+	struct sdhci_host *host = platform_get_drvdata(pdev);
 	struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
 	struct esdhc_platform_data *boarddata = host->mmc->parent->platform_data;
 	struct pltfm_imx_data *imx_data = pltfm_host->priv;
+	int dead = (readl(host->ioaddr + SDHCI_INT_STATUS) == 0xffffffff);
+
+	sdhci_remove_host(host, dead);
 
 	if (boarddata && gpio_is_valid(boarddata->wp_gpio))
 		gpio_free(boarddata->wp_gpio);
@@ -319,13 +346,37 @@ static void esdhc_pltfm_exit(struct sdhci_host *host)
 	clk_disable(pltfm_host->clk);
 	clk_put(pltfm_host->clk);
 	kfree(imx_data);
+
+	sdhci_pltfm_free(pdev);
+
+	return 0;
 }
 
-struct sdhci_pltfm_data sdhci_esdhc_imx_pdata = {
-	.quirks = ESDHC_DEFAULT_QUIRKS | SDHCI_QUIRK_BROKEN_ADMA
-			| SDHCI_QUIRK_BROKEN_CARD_DETECTION,
-	/* ADMA has issues. Might be fixable */
-	.ops = &sdhci_esdhc_ops,
-	.init = esdhc_pltfm_init,
-	.exit = esdhc_pltfm_exit,
+static struct platform_driver sdhci_esdhc_imx_driver = {
+	.driver		= {
+		.name	= "sdhci-esdhc-imx",
+		.owner	= THIS_MODULE,
+	},
+	.probe		= sdhci_esdhc_imx_probe,
+	.remove		= __devexit_p(sdhci_esdhc_imx_remove),
+#ifdef CONFIG_PM
+	.suspend	= sdhci_pltfm_suspend,
+	.resume		= sdhci_pltfm_resume,
+#endif
 };
+
+static int __init sdhci_esdhc_imx_init(void)
+{
+	return platform_driver_register(&sdhci_esdhc_imx_driver);
+}
+module_init(sdhci_esdhc_imx_init);
+
+static void __exit sdhci_esdhc_imx_exit(void)
+{
+	platform_driver_unregister(&sdhci_esdhc_imx_driver);
+}
+module_exit(sdhci_esdhc_imx_exit);
+
+MODULE_DESCRIPTION("SDHCI driver for Freescale i.MX eSDHC");
+MODULE_AUTHOR("Wolfram Sang <w.sang@pengutronix.de>");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/mmc/host/sdhci-pltfm.c b/drivers/mmc/host/sdhci-pltfm.c
index dbab0407f4b6..8ccf25666201 100644
--- a/drivers/mmc/host/sdhci-pltfm.c
+++ b/drivers/mmc/host/sdhci-pltfm.c
@@ -22,48 +22,22 @@
  * Inspired by sdhci-pci.c, by Pierre Ossman
  */
 
-#include <linux/delay.h>
-#include <linux/highmem.h>
-#include <linux/mod_devicetable.h>
-#include <linux/platform_device.h>
-
-#include <linux/mmc/host.h>
-
-#include <linux/io.h>
-#include <linux/mmc/sdhci-pltfm.h>
+#include <linux/err.h>
 
 #include "sdhci.h"
 #include "sdhci-pltfm.h"
 
-/*****************************************************************************\
- *                                                                           *
- * SDHCI core callbacks                                                      *
- *                                                                           *
-\*****************************************************************************/
-
 static struct sdhci_ops sdhci_pltfm_ops = {
 };
 
-/*****************************************************************************\
- *                                                                           *
- * Device probing/removal                                                    *
- *                                                                           *
-\*****************************************************************************/
-
-static int __devinit sdhci_pltfm_probe(struct platform_device *pdev)
+struct sdhci_host *sdhci_pltfm_init(struct platform_device *pdev,
+				    struct sdhci_pltfm_data *pdata)
 {
-	const struct platform_device_id *platid = platform_get_device_id(pdev);
-	struct sdhci_pltfm_data *pdata;
 	struct sdhci_host *host;
 	struct sdhci_pltfm_host *pltfm_host;
 	struct resource *iomem;
 	int ret;
 
-	if (platid && platid->driver_data)
-		pdata = (void *)platid->driver_data;
-	else
-		pdata = pdev->dev.platform_data;
-
 	iomem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	if (!iomem) {
 		ret = -ENOMEM;
@@ -71,8 +45,7 @@ static int __devinit sdhci_pltfm_probe(struct platform_device *pdev)
 	}
 
 	if (resource_size(iomem) < 0x100)
-		dev_err(&pdev->dev, "Invalid iomem size. You may "
-			"experience problems.\n");
+		dev_err(&pdev->dev, "Invalid iomem size!\n");
 
 	/* Some PCI-based MFD need the parent here */
 	if (pdev->dev.parent != &platform_bus)
@@ -87,7 +60,7 @@ static int __devinit sdhci_pltfm_probe(struct platform_device *pdev)
 
 	pltfm_host = sdhci_priv(host);
 
-	host->hw_name = "platform";
+	host->hw_name = dev_name(&pdev->dev);
 	if (pdata && pdata->ops)
 		host->ops = pdata->ops;
 	else
@@ -110,126 +83,70 @@ static int __devinit sdhci_pltfm_probe(struct platform_device *pdev)
 		goto err_remap;
 	}
 
-	if (pdata && pdata->init) {
-		ret = pdata->init(host, pdata);
-		if (ret)
-			goto err_plat_init;
-	}
-
-	ret = sdhci_add_host(host);
-	if (ret)
-		goto err_add_host;
-
 	platform_set_drvdata(pdev, host);
 
-	return 0;
+	return host;
 
-err_add_host:
-	if (pdata && pdata->exit)
-		pdata->exit(host);
-err_plat_init:
-	iounmap(host->ioaddr);
 err_remap:
 	release_mem_region(iomem->start, resource_size(iomem));
 err_request:
 	sdhci_free_host(host);
 err:
-	printk(KERN_ERR"Probing of sdhci-pltfm failed: %d\n", ret);
-	return ret;
+	dev_err(&pdev->dev, "%s failed %d\n", __func__, ret);
+	return ERR_PTR(ret);
 }
 
-static int __devexit sdhci_pltfm_remove(struct platform_device *pdev)
+void sdhci_pltfm_free(struct platform_device *pdev)
 {
-	struct sdhci_pltfm_data *pdata = pdev->dev.platform_data;
 	struct sdhci_host *host = platform_get_drvdata(pdev);
 	struct resource *iomem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	int dead;
-	u32 scratch;
-
-	dead = 0;
-	scratch = readl(host->ioaddr + SDHCI_INT_STATUS);
-	if (scratch == (u32)-1)
-		dead = 1;
 
-	sdhci_remove_host(host, dead);
-	if (pdata && pdata->exit)
-		pdata->exit(host);
 	iounmap(host->ioaddr);
 	release_mem_region(iomem->start, resource_size(iomem));
 	sdhci_free_host(host);
 	platform_set_drvdata(pdev, NULL);
+}
 
-	return 0;
+int sdhci_pltfm_register(struct platform_device *pdev,
+			 struct sdhci_pltfm_data *pdata)
+{
+	struct sdhci_host *host;
+	int ret = 0;
+
+	host = sdhci_pltfm_init(pdev, pdata);
+	if (IS_ERR(host))
+		return PTR_ERR(host);
+
+	ret = sdhci_add_host(host);
+	if (ret)
+		sdhci_pltfm_free(pdev);
+
+	return ret;
 }
 
-static const struct platform_device_id sdhci_pltfm_ids[] = {
-	{ "sdhci", },
-#ifdef CONFIG_MMC_SDHCI_CNS3XXX
-	{ "sdhci-cns3xxx", (kernel_ulong_t)&sdhci_cns3xxx_pdata },
-#endif
-#ifdef CONFIG_MMC_SDHCI_ESDHC_IMX
-	{ "sdhci-esdhc-imx", (kernel_ulong_t)&sdhci_esdhc_imx_pdata },
-#endif
-#ifdef CONFIG_MMC_SDHCI_DOVE
-	{ "sdhci-dove", (kernel_ulong_t)&sdhci_dove_pdata },
-#endif
-#ifdef CONFIG_MMC_SDHCI_TEGRA
-	{ "sdhci-tegra", (kernel_ulong_t)&sdhci_tegra_pdata },
-#endif
-	{ },
-};
-MODULE_DEVICE_TABLE(platform, sdhci_pltfm_ids);
+int sdhci_pltfm_unregister(struct platform_device *pdev)
+{
+	struct sdhci_host *host = platform_get_drvdata(pdev);
+	int dead = (readl(host->ioaddr + SDHCI_INT_STATUS) == 0xffffffff);
+
+	sdhci_remove_host(host, dead);
+	sdhci_pltfm_free(pdev);
+
+	return 0;
+}
 
 #ifdef CONFIG_PM
-static int sdhci_pltfm_suspend(struct platform_device *dev, pm_message_t state)
+int sdhci_pltfm_suspend(struct platform_device *dev, pm_message_t state)
 {
 	struct sdhci_host *host = platform_get_drvdata(dev);
 
 	return sdhci_suspend_host(host, state);
 }
 
-static int sdhci_pltfm_resume(struct platform_device *dev)
+int sdhci_pltfm_resume(struct platform_device *dev)
 {
 	struct sdhci_host *host = platform_get_drvdata(dev);
 
 	return sdhci_resume_host(host);
 }
-#else
-#define sdhci_pltfm_suspend	NULL
-#define sdhci_pltfm_resume	NULL
 #endif	/* CONFIG_PM */
-
-static struct platform_driver sdhci_pltfm_driver = {
-	.driver = {
-		.name	= "sdhci",
-		.owner	= THIS_MODULE,
-	},
-	.probe		= sdhci_pltfm_probe,
-	.remove		= __devexit_p(sdhci_pltfm_remove),
-	.id_table	= sdhci_pltfm_ids,
-	.suspend	= sdhci_pltfm_suspend,
-	.resume		= sdhci_pltfm_resume,
-};
-
-/*****************************************************************************\
- *                                                                           *
- * Driver init/exit                                                          *
- *                                                                           *
-\*****************************************************************************/
-
-static int __init sdhci_drv_init(void)
-{
-	return platform_driver_register(&sdhci_pltfm_driver);
-}
-
-static void __exit sdhci_drv_exit(void)
-{
-	platform_driver_unregister(&sdhci_pltfm_driver);
-}
-
-module_init(sdhci_drv_init);
-module_exit(sdhci_drv_exit);
-
-MODULE_DESCRIPTION("Secure Digital Host Controller Interface platform driver");
-MODULE_AUTHOR("Mocean Laboratories <info@mocean-labs.com>");
-MODULE_LICENSE("GPL v2");
diff --git a/drivers/mmc/host/sdhci-pltfm.h b/drivers/mmc/host/sdhci-pltfm.h
index 2b37016ad0ac..ff4b7eb326fb 100644
--- a/drivers/mmc/host/sdhci-pltfm.h
+++ b/drivers/mmc/host/sdhci-pltfm.h
@@ -13,6 +13,7 @@
 
 #include <linux/clk.h>
 #include <linux/types.h>
+#include <linux/platform_device.h>
 #include <linux/mmc/sdhci-pltfm.h>
 
 struct sdhci_pltfm_host {
@@ -20,9 +21,17 @@ struct sdhci_pltfm_host {
 	void *priv; /* to handle quirks across io-accessor calls */
 };
 
-extern struct sdhci_pltfm_data sdhci_cns3xxx_pdata;
-extern struct sdhci_pltfm_data sdhci_esdhc_imx_pdata;
-extern struct sdhci_pltfm_data sdhci_dove_pdata;
-extern struct sdhci_pltfm_data sdhci_tegra_pdata;
+extern struct sdhci_host *sdhci_pltfm_init(struct platform_device *pdev,
+					   struct sdhci_pltfm_data *pdata);
+extern void sdhci_pltfm_free(struct platform_device *pdev);
+
+extern int sdhci_pltfm_register(struct platform_device *pdev,
+				struct sdhci_pltfm_data *pdata);
+extern int sdhci_pltfm_unregister(struct platform_device *pdev);
+
+#ifdef CONFIG_PM
+extern int sdhci_pltfm_suspend(struct platform_device *dev, pm_message_t state);
+extern int sdhci_pltfm_resume(struct platform_device *dev);
+#endif
 
 #endif /* _DRIVERS_MMC_SDHCI_PLTFM_H */
diff --git a/drivers/mmc/host/sdhci-tegra.c b/drivers/mmc/host/sdhci-tegra.c
index 343c97edba32..1f66aca5f506 100644
--- a/drivers/mmc/host/sdhci-tegra.c
+++ b/drivers/mmc/host/sdhci-tegra.c
@@ -116,20 +116,42 @@ static int tegra_sdhci_8bit(struct sdhci_host *host, int bus_width)
 	return 0;
 }
 
+static struct sdhci_ops tegra_sdhci_ops = {
+	.get_ro     = tegra_sdhci_get_ro,
+	.read_l     = tegra_sdhci_readl,
+	.read_w     = tegra_sdhci_readw,
+	.write_l    = tegra_sdhci_writel,
+	.platform_8bit_width = tegra_sdhci_8bit,
+};
+
+static struct sdhci_pltfm_data sdhci_tegra_pdata = {
+	.quirks = SDHCI_QUIRK_BROKEN_TIMEOUT_VAL |
+		  SDHCI_QUIRK_SINGLE_POWER_WRITE |
+		  SDHCI_QUIRK_NO_HISPD_BIT |
+		  SDHCI_QUIRK_BROKEN_ADMA_ZEROLEN_DESC,
+	.ops  = &tegra_sdhci_ops,
+};
 
-static int tegra_sdhci_pltfm_init(struct sdhci_host *host,
-				  struct sdhci_pltfm_data *pdata)
+static int __devinit sdhci_tegra_probe(struct platform_device *pdev)
 {
-	struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
-	struct platform_device *pdev = to_platform_device(mmc_dev(host->mmc));
+	struct sdhci_pltfm_host *pltfm_host;
 	struct tegra_sdhci_platform_data *plat;
+	struct sdhci_host *host;
 	struct clk *clk;
 	int rc;
 
+	host = sdhci_pltfm_init(pdev, &sdhci_tegra_pdata);
+	if (IS_ERR(host))
+		return PTR_ERR(host);
+
+	pltfm_host = sdhci_priv(host);
+
 	plat = pdev->dev.platform_data;
+
 	if (plat == NULL) {
 		dev_err(mmc_dev(host->mmc), "missing platform data\n");
-		return -ENXIO;
+		rc = -ENXIO;
+		goto err_no_plat;
 	}
 
 	if (gpio_is_valid(plat->power_gpio)) {
@@ -137,7 +159,7 @@ static int tegra_sdhci_pltfm_init(struct sdhci_host *host,
 		if (rc) {
 			dev_err(mmc_dev(host->mmc),
 				"failed to allocate power gpio\n");
-			goto out;
+			goto err_power_req;
 		}
 		tegra_gpio_enable(plat->power_gpio);
 		gpio_direction_output(plat->power_gpio, 1);
@@ -148,7 +170,7 @@ static int tegra_sdhci_pltfm_init(struct sdhci_host *host,
 		if (rc) {
 			dev_err(mmc_dev(host->mmc),
 				"failed to allocate cd gpio\n");
-			goto out_power;
+			goto err_cd_req;
 		}
 		tegra_gpio_enable(plat->cd_gpio);
 		gpio_direction_input(plat->cd_gpio);
@@ -159,7 +181,7 @@ static int tegra_sdhci_pltfm_init(struct sdhci_host *host,
 
 		if (rc)	{
 			dev_err(mmc_dev(host->mmc), "request irq error\n");
-			goto out_cd;
+			goto err_cd_irq_req;
 		}
 
 	}
@@ -169,7 +191,7 @@ static int tegra_sdhci_pltfm_init(struct sdhci_host *host,
 		if (rc) {
 			dev_err(mmc_dev(host->mmc),
 				"failed to allocate wp gpio\n");
-			goto out_irq;
+			goto err_wp_req;
 		}
 		tegra_gpio_enable(plat->wp_gpio);
 		gpio_direction_input(plat->wp_gpio);
@@ -179,7 +201,7 @@ static int tegra_sdhci_pltfm_init(struct sdhci_host *host,
 	if (IS_ERR(clk)) {
 		dev_err(mmc_dev(host->mmc), "clk err\n");
 		rc = PTR_ERR(clk);
-		goto out_wp;
+		goto err_clk_get;
 	}
 	clk_enable(clk);
 	pltfm_host->clk = clk;
@@ -189,38 +211,47 @@ static int tegra_sdhci_pltfm_init(struct sdhci_host *host,
 	if (plat->is_8bit)
 		host->mmc->caps |= MMC_CAP_8_BIT_DATA;
 
+	rc = sdhci_add_host(host);
+	if (rc)
+		goto err_add_host;
+
 	return 0;
 
-out_wp:
+err_add_host:
+	clk_disable(pltfm_host->clk);
+	clk_put(pltfm_host->clk);
+err_clk_get:
 	if (gpio_is_valid(plat->wp_gpio)) {
 		tegra_gpio_disable(plat->wp_gpio);
 		gpio_free(plat->wp_gpio);
 	}
-
-out_irq:
+err_wp_req:
 	if (gpio_is_valid(plat->cd_gpio))
 		free_irq(gpio_to_irq(plat->cd_gpio), host);
-out_cd:
+err_cd_irq_req:
 	if (gpio_is_valid(plat->cd_gpio)) {
 		tegra_gpio_disable(plat->cd_gpio);
 		gpio_free(plat->cd_gpio);
 	}
-
-out_power:
+err_cd_req:
 	if (gpio_is_valid(plat->power_gpio)) {
 		tegra_gpio_disable(plat->power_gpio);
 		gpio_free(plat->power_gpio);
 	}
-
-out:
+err_power_req:
+err_no_plat:
+	sdhci_pltfm_free(pdev);
 	return rc;
 }
 
-static void tegra_sdhci_pltfm_exit(struct sdhci_host *host)
+static int __devexit sdhci_tegra_remove(struct platform_device *pdev)
 {
+	struct sdhci_host *host = platform_get_drvdata(pdev);
 	struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
-	struct platform_device *pdev = to_platform_device(mmc_dev(host->mmc));
 	struct tegra_sdhci_platform_data *plat;
+	int dead = (readl(host->ioaddr + SDHCI_INT_STATUS) == 0xffffffff);
+
+	sdhci_remove_host(host, dead);
 
 	plat = pdev->dev.platform_data;
 
@@ -242,22 +273,37 @@ static void tegra_sdhci_pltfm_exit(struct sdhci_host *host)
 
 	clk_disable(pltfm_host->clk);
 	clk_put(pltfm_host->clk);
+
+	sdhci_pltfm_free(pdev);
+
+	return 0;
 }
 
-static struct sdhci_ops tegra_sdhci_ops = {
-	.get_ro     = tegra_sdhci_get_ro,
-	.read_l     = tegra_sdhci_readl,
-	.read_w     = tegra_sdhci_readw,
-	.write_l    = tegra_sdhci_writel,
-	.platform_8bit_width = tegra_sdhci_8bit,
+static struct platform_driver sdhci_tegra_driver = {
+	.driver		= {
+		.name	= "sdhci-tegra",
+		.owner	= THIS_MODULE,
+	},
+	.probe		= sdhci_tegra_probe,
+	.remove		= __devexit_p(sdhci_tegra_remove),
+#ifdef CONFIG_PM
+	.suspend	= sdhci_pltfm_suspend,
+	.resume		= sdhci_pltfm_resume,
+#endif
 };
 
-struct sdhci_pltfm_data sdhci_tegra_pdata = {
-	.quirks = SDHCI_QUIRK_BROKEN_TIMEOUT_VAL |
-		  SDHCI_QUIRK_SINGLE_POWER_WRITE |
-		  SDHCI_QUIRK_NO_HISPD_BIT |
-		  SDHCI_QUIRK_BROKEN_ADMA_ZEROLEN_DESC,
-	.ops  = &tegra_sdhci_ops,
-	.init = tegra_sdhci_pltfm_init,
-	.exit = tegra_sdhci_pltfm_exit,
-};
+static int __init sdhci_tegra_init(void)
+{
+	return platform_driver_register(&sdhci_tegra_driver);
+}
+module_init(sdhci_tegra_init);
+
+static void __exit sdhci_tegra_exit(void)
+{
+	platform_driver_unregister(&sdhci_tegra_driver);
+}
+module_exit(sdhci_tegra_exit);
+
+MODULE_DESCRIPTION("SDHCI driver for Tegra");
+MODULE_AUTHOR(" Google, Inc.");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/mmc/sdhci-pltfm.h b/include/linux/mmc/sdhci-pltfm.h
index 548d59d404cb..f1c2ac3fb300 100644
--- a/include/linux/mmc/sdhci-pltfm.h
+++ b/include/linux/mmc/sdhci-pltfm.h
@@ -15,21 +15,15 @@
 #define _SDHCI_PLTFM_H
 
 struct sdhci_ops;
-struct sdhci_host;
 
 /**
  * struct sdhci_pltfm_data - SDHCI platform-specific information & hooks
  * @ops: optional pointer to the platform-provided SDHCI ops
  * @quirks: optional SDHCI quirks
- * @init: optional hook that is called during device probe, before the
- *        driver tries to access any SDHCI registers
- * @exit: optional hook that is called during device removal
  */
 struct sdhci_pltfm_data {
 	struct sdhci_ops *ops;
 	unsigned int quirks;
-	int (*init)(struct sdhci_host *host, struct sdhci_pltfm_data *pdata);
-	void (*exit)(struct sdhci_host *host);
 };
 
 #endif /* _SDHCI_PLTFM_H */
-- 
cgit v1.2.3


From 94cc6a86567cb3c2234807081a46ce5400c36b31 Mon Sep 17 00:00:00 2001
From: Shawn Guo <shawn.guo@linaro.org>
Date: Fri, 27 May 2011 23:48:15 +0800
Subject: mmc: sdhci: merge two sdhci-pltfm.h into one

The structure sdhci_pltfm_data is not necessarily to be in a public
header like include/linux/mmc/sdhci-pltfm.h, so the patch moves it
into drivers/mmc/host/sdhci-pltfm.h and eliminates the former one.

Signed-off-by: Shawn Guo <shawn.guo@linaro.org>
Reviewed-by: Grant Likely <grant.likely@secretlab.ca>
Reviewed-by: Wolfram Sang <w.sang@pengutronix.de>
Signed-off-by: Chris Ball <cjb@laptop.org>
---
 drivers/mmc/host/sdhci-cns3xxx.c   |  1 -
 drivers/mmc/host/sdhci-esdhc-imx.c |  1 -
 drivers/mmc/host/sdhci-pltfm.h     |  6 +++++-
 include/linux/mmc/sdhci-pltfm.h    | 29 -----------------------------
 4 files changed, 5 insertions(+), 32 deletions(-)
 delete mode 100644 include/linux/mmc/sdhci-pltfm.h

(limited to 'include/linux')

diff --git a/drivers/mmc/host/sdhci-cns3xxx.c b/drivers/mmc/host/sdhci-cns3xxx.c
index ac4b26f555e6..025d1a5ab964 100644
--- a/drivers/mmc/host/sdhci-cns3xxx.c
+++ b/drivers/mmc/host/sdhci-cns3xxx.c
@@ -15,7 +15,6 @@
 #include <linux/delay.h>
 #include <linux/device.h>
 #include <linux/mmc/host.h>
-#include <linux/mmc/sdhci-pltfm.h>
 #include <mach/cns3xxx.h>
 #include "sdhci.h"
 #include "sdhci-pltfm.h"
diff --git a/drivers/mmc/host/sdhci-esdhc-imx.c b/drivers/mmc/host/sdhci-esdhc-imx.c
index e27ccbb5285b..977f1421ec27 100644
--- a/drivers/mmc/host/sdhci-esdhc-imx.c
+++ b/drivers/mmc/host/sdhci-esdhc-imx.c
@@ -18,7 +18,6 @@
 #include <linux/gpio.h>
 #include <linux/slab.h>
 #include <linux/mmc/host.h>
-#include <linux/mmc/sdhci-pltfm.h>
 #include <linux/mmc/mmc.h>
 #include <linux/mmc/sdio.h>
 #include <mach/hardware.h>
diff --git a/drivers/mmc/host/sdhci-pltfm.h b/drivers/mmc/host/sdhci-pltfm.h
index fe27b83f5bb1..fd726948641a 100644
--- a/drivers/mmc/host/sdhci-pltfm.h
+++ b/drivers/mmc/host/sdhci-pltfm.h
@@ -14,9 +14,13 @@
 #include <linux/clk.h>
 #include <linux/types.h>
 #include <linux/platform_device.h>
-#include <linux/mmc/sdhci-pltfm.h>
 #include <linux/mmc/sdhci.h>
 
+struct sdhci_pltfm_data {
+	struct sdhci_ops *ops;
+	unsigned int quirks;
+};
+
 struct sdhci_pltfm_host {
 	struct clk *clk;
 	void *priv; /* to handle quirks across io-accessor calls */
diff --git a/include/linux/mmc/sdhci-pltfm.h b/include/linux/mmc/sdhci-pltfm.h
deleted file mode 100644
index f1c2ac3fb300..000000000000
--- a/include/linux/mmc/sdhci-pltfm.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Platform data declarations for the sdhci-pltfm driver.
- *
- * Copyright (c) 2010 MontaVista Software, LLC.
- *
- * Author: Anton Vorontsov <avorontsov@ru.mvista.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or (at
- * your option) any later version.
- */
-
-#ifndef _SDHCI_PLTFM_H
-#define _SDHCI_PLTFM_H
-
-struct sdhci_ops;
-
-/**
- * struct sdhci_pltfm_data - SDHCI platform-specific information & hooks
- * @ops: optional pointer to the platform-provided SDHCI ops
- * @quirks: optional SDHCI quirks
- */
-struct sdhci_pltfm_data {
-	struct sdhci_ops *ops;
-	unsigned int quirks;
-};
-
-#endif /* _SDHCI_PLTFM_H */
-- 
cgit v1.2.3


From 100e918610b7487fa18db97b3879cd8d1fdd5974 Mon Sep 17 00:00:00 2001
From: "Robert P. J. Day" <rpjday@crashcourse.ca>
Date: Fri, 27 May 2011 16:04:03 -0400
Subject: mmc: Standardize header file inclusion checks.

Standardize the checks for multiple MMC header file inclusion,
including adding comments to terminating #endif's, and fixing
one incorrect comment.

Signed-off-by: Robert P. J. Day <rpjday@crashcourse.ca>
Signed-off-by: Chris Ball <cjb@laptop.org>
---
 include/linux/mmc/boot.h           | 6 +++---
 include/linux/mmc/card.h           | 2 +-
 include/linux/mmc/core.h           | 2 +-
 include/linux/mmc/dw_mmc.h         | 6 +++---
 include/linux/mmc/host.h           | 3 +--
 include/linux/mmc/ioctl.h          | 2 +-
 include/linux/mmc/mmc.h            | 7 +++----
 include/linux/mmc/pm.h             | 2 +-
 include/linux/mmc/sd.h             | 7 +++----
 include/linux/mmc/sdhci-spear.h    | 6 +++---
 include/linux/mmc/sdhci.h          | 6 +++---
 include/linux/mmc/sdio.h           | 7 +++----
 include/linux/mmc/sdio_func.h      | 7 +++----
 include/linux/mmc/sdio_ids.h       | 6 +++---
 include/linux/mmc/sh_mmcif.h       | 6 +++---
 include/linux/mmc/sh_mobile_sdhi.h | 6 +++---
 include/linux/mmc/tmio.h           | 6 +++---
 17 files changed, 41 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmc/boot.h b/include/linux/mmc/boot.h
index 39d787c229cb..23acc3baa07d 100644
--- a/include/linux/mmc/boot.h
+++ b/include/linux/mmc/boot.h
@@ -1,7 +1,7 @@
-#ifndef MMC_BOOT_H
-#define MMC_BOOT_H
+#ifndef LINUX_MMC_BOOT_H
+#define LINUX_MMC_BOOT_H
 
 enum { MMC_PROGRESS_ENTER, MMC_PROGRESS_INIT,
        MMC_PROGRESS_LOAD, MMC_PROGRESS_DONE };
 
-#endif
+#endif /* LINUX_MMC_BOOT_H */
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index 6ad43554ac05..b460fc2af8a1 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -403,4 +403,4 @@ extern void mmc_unregister_driver(struct mmc_driver *);
 extern void mmc_fixup_device(struct mmc_card *card,
 			     const struct mmc_fixup *table);
 
-#endif
+#endif /* LINUX_MMC_CARD_H */
diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h
index b6718e549a51..791f060a6f1d 100644
--- a/include/linux/mmc/core.h
+++ b/include/linux/mmc/core.h
@@ -179,4 +179,4 @@ static inline void mmc_claim_host(struct mmc_host *host)
 
 extern u32 mmc_vddrange_to_ocrmask(int vdd_min, int vdd_max);
 
-#endif
+#endif /* LINUX_MMC_CORE_H */
diff --git a/include/linux/mmc/dw_mmc.h b/include/linux/mmc/dw_mmc.h
index bdd7ceeb99e4..55d9909c7abd 100644
--- a/include/linux/mmc/dw_mmc.h
+++ b/include/linux/mmc/dw_mmc.h
@@ -11,8 +11,8 @@
  * (at your option) any later version.
  */
 
-#ifndef _LINUX_MMC_DW_MMC_H_
-#define _LINUX_MMC_DW_MMC_H_
+#ifndef LINUX_MMC_DW_MMC_H
+#define LINUX_MMC_DW_MMC_H
 
 #define MAX_MCI_SLOTS	2
 
@@ -219,4 +219,4 @@ struct dw_mci_board {
 	struct block_settings *blk_settings;
 };
 
-#endif /* _LINUX_MMC_DW_MMC_H_ */
+#endif /* LINUX_MMC_DW_MMC_H */
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 1ee4424462eb..fc24b38b48cb 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -373,5 +373,4 @@ static inline int mmc_host_cmd23(struct mmc_host *host)
 {
 	return host->caps & MMC_CAP_CMD23;
 }
-#endif
-
+#endif /* LINUX_MMC_HOST_H */
diff --git a/include/linux/mmc/ioctl.h b/include/linux/mmc/ioctl.h
index 5baf2983a12f..8fa5bc5f8059 100644
--- a/include/linux/mmc/ioctl.h
+++ b/include/linux/mmc/ioctl.h
@@ -51,4 +51,4 @@ struct mmc_ioc_cmd {
  * block device operations.
  */
 #define MMC_IOC_MAX_BYTES  (512L * 256)
-#endif  /* LINUX_MMC_IOCTL_H */
+#endif /* LINUX_MMC_IOCTL_H */
diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h
index ac26a685cca8..5bade0605641 100644
--- a/include/linux/mmc/mmc.h
+++ b/include/linux/mmc/mmc.h
@@ -21,8 +21,8 @@
  *          15 May 2002
  */
 
-#ifndef MMC_MMC_H
-#define MMC_MMC_H
+#ifndef LINUX_MMC_MMC_H
+#define LINUX_MMC_MMC_H
 
 /* Standard MMC commands (4.1)           type  argument     response */
    /* class 1 */
@@ -327,5 +327,4 @@ struct _mmc_csd {
 #define MMC_SWITCH_MODE_CLEAR_BITS	0x02	/* Clear bits which are 1 in value */
 #define MMC_SWITCH_MODE_WRITE_BYTE	0x03	/* Set target to value */
 
-#endif  /* MMC_MMC_PROTOCOL_H */
-
+#endif /* LINUX_MMC_MMC_H */
diff --git a/include/linux/mmc/pm.h b/include/linux/mmc/pm.h
index d37aac49cf9a..4a139204c20c 100644
--- a/include/linux/mmc/pm.h
+++ b/include/linux/mmc/pm.h
@@ -27,4 +27,4 @@ typedef unsigned int mmc_pm_flag_t;
 #define MMC_PM_KEEP_POWER	(1 << 0)	/* preserve card power during suspend */
 #define MMC_PM_WAKE_SDIO_IRQ	(1 << 1)	/* wake up host system on SDIO IRQ assertion */
 
-#endif
+#endif /* LINUX_MMC_PM_H */
diff --git a/include/linux/mmc/sd.h b/include/linux/mmc/sd.h
index 7d35d52c3df3..1ebcf9ba1256 100644
--- a/include/linux/mmc/sd.h
+++ b/include/linux/mmc/sd.h
@@ -9,8 +9,8 @@
  * your option) any later version.
  */
 
-#ifndef MMC_SD_H
-#define MMC_SD_H
+#ifndef LINUX_MMC_SD_H
+#define LINUX_MMC_SD_H
 
 /* SD commands                           type  argument     response */
   /* class 0 */
@@ -91,5 +91,4 @@
 #define SD_SWITCH_ACCESS_DEF	0
 #define SD_SWITCH_ACCESS_HS	1
 
-#endif
-
+#endif /* LINUX_MMC_SD_H */
diff --git a/include/linux/mmc/sdhci-spear.h b/include/linux/mmc/sdhci-spear.h
index 9188c973f3e1..5cdc96da9dd5 100644
--- a/include/linux/mmc/sdhci-spear.h
+++ b/include/linux/mmc/sdhci-spear.h
@@ -11,8 +11,8 @@
  * warranty of any kind, whether express or implied.
  */
 
-#ifndef MMC_SDHCI_SPEAR_H
-#define MMC_SDHCI_SPEAR_H
+#ifndef LINUX_MMC_SDHCI_SPEAR_H
+#define LINUX_MMC_SDHCI_SPEAR_H
 
 #include <linux/platform_device.h>
 /*
@@ -39,4 +39,4 @@ sdhci_set_plat_data(struct platform_device *pdev, struct sdhci_plat_data *data)
 	pdev->dev.platform_data = data;
 }
 
-#endif /* MMC_SDHCI_SPEAR_H */
+#endif /* LINUX_MMC_SDHCI_SPEAR_H */
diff --git a/include/linux/mmc/sdhci.h b/include/linux/mmc/sdhci.h
index 6a68c4eb4e44..5666f3abfab7 100644
--- a/include/linux/mmc/sdhci.h
+++ b/include/linux/mmc/sdhci.h
@@ -8,8 +8,8 @@
  * the Free Software Foundation; either version 2 of the License, or (at
  * your option) any later version.
  */
-#ifndef __SDHCI_H
-#define __SDHCI_H
+#ifndef LINUX_MMC_SDHCI_H
+#define LINUX_MMC_SDHCI_H
 
 #include <linux/scatterlist.h>
 #include <linux/compiler.h>
@@ -162,4 +162,4 @@ struct sdhci_host {
 
 	unsigned long private[0] ____cacheline_aligned;
 };
-#endif /* __SDHCI_H */
+#endif /* LINUX_MMC_SDHCI_H */
diff --git a/include/linux/mmc/sdio.h b/include/linux/mmc/sdio.h
index 245cdacee544..2a2e9905a247 100644
--- a/include/linux/mmc/sdio.h
+++ b/include/linux/mmc/sdio.h
@@ -9,8 +9,8 @@
  * your option) any later version.
  */
 
-#ifndef MMC_SDIO_H
-#define MMC_SDIO_H
+#ifndef LINUX_MMC_SDIO_H
+#define LINUX_MMC_SDIO_H
 
 /* SDIO commands                         type  argument     response */
 #define SD_IO_SEND_OP_COND          5 /* bcr  [23:0] OCR         R4  */
@@ -161,5 +161,4 @@
 
 #define SDIO_FBR_BLKSIZE	0x10	/* block size (2 bytes) */
 
-#endif
-
+#endif /* LINUX_MMC_SDIO_H */
diff --git a/include/linux/mmc/sdio_func.h b/include/linux/mmc/sdio_func.h
index 31baaf82f458..50f0bc952328 100644
--- a/include/linux/mmc/sdio_func.h
+++ b/include/linux/mmc/sdio_func.h
@@ -9,8 +9,8 @@
  * your option) any later version.
  */
 
-#ifndef MMC_SDIO_FUNC_H
-#define MMC_SDIO_FUNC_H
+#ifndef LINUX_MMC_SDIO_FUNC_H
+#define LINUX_MMC_SDIO_FUNC_H
 
 #include <linux/device.h>
 #include <linux/mod_devicetable.h>
@@ -161,5 +161,4 @@ extern void sdio_f0_writeb(struct sdio_func *func, unsigned char b,
 extern mmc_pm_flag_t sdio_get_host_pm_caps(struct sdio_func *func);
 extern int sdio_set_host_pm_flags(struct sdio_func *func, mmc_pm_flag_t flags);
 
-#endif
-
+#endif /* LINUX_MMC_SDIO_FUNC_H */
diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h
index a36ab3bc7b03..9f03feedc8e7 100644
--- a/include/linux/mmc/sdio_ids.h
+++ b/include/linux/mmc/sdio_ids.h
@@ -2,8 +2,8 @@
  * SDIO Classes, Interface Types, Manufacturer IDs, etc.
  */
 
-#ifndef MMC_SDIO_IDS_H
-#define MMC_SDIO_IDS_H
+#ifndef LINUX_MMC_SDIO_IDS_H
+#define LINUX_MMC_SDIO_IDS_H
 
 /*
  * Standard SDIO Function Interfaces
@@ -44,4 +44,4 @@
 #define SDIO_DEVICE_ID_SIANO_NOVA_A0		0x1100
 #define SDIO_DEVICE_ID_SIANO_STELLAR 		0x5347
 
-#endif
+#endif /* LINUX_MMC_SDIO_IDS_H */
diff --git a/include/linux/mmc/sh_mmcif.h b/include/linux/mmc/sh_mmcif.h
index 9eb9b4b96f55..0222cd8ebe76 100644
--- a/include/linux/mmc/sh_mmcif.h
+++ b/include/linux/mmc/sh_mmcif.h
@@ -11,8 +11,8 @@
  *
  */
 
-#ifndef __SH_MMCIF_H__
-#define __SH_MMCIF_H__
+#ifndef LINUX_MMC_SH_MMCIF_H
+#define LINUX_MMC_SH_MMCIF_H
 
 #include <linux/io.h>
 #include <linux/platform_device.h>
@@ -220,4 +220,4 @@ static inline void sh_mmcif_boot_init(void __iomem *base)
 	sh_mmcif_boot_cmd(base, 0x03400040, 0x00010000);
 }
 
-#endif /* __SH_MMCIF_H__ */
+#endif /* LINUX_MMC_SH_MMCIF_H */
diff --git a/include/linux/mmc/sh_mobile_sdhi.h b/include/linux/mmc/sh_mobile_sdhi.h
index faf32b6ec185..bd50b365167f 100644
--- a/include/linux/mmc/sh_mobile_sdhi.h
+++ b/include/linux/mmc/sh_mobile_sdhi.h
@@ -1,5 +1,5 @@
-#ifndef __SH_MOBILE_SDHI_H__
-#define __SH_MOBILE_SDHI_H__
+#ifndef LINUX_MMC_SH_MOBILE_SDHI_H
+#define LINUX_MMC_SH_MOBILE_SDHI_H
 
 #include <linux/types.h>
 
@@ -17,4 +17,4 @@ struct sh_mobile_sdhi_info {
 	int (*get_cd)(struct platform_device *pdev);
 };
 
-#endif /* __SH_MOBILE_SDHI_H__ */
+#endif /* LINUX_MMC_SH_MOBILE_SDHI_H */
diff --git a/include/linux/mmc/tmio.h b/include/linux/mmc/tmio.h
index 19490b942db0..551490f0613d 100644
--- a/include/linux/mmc/tmio.h
+++ b/include/linux/mmc/tmio.h
@@ -12,8 +12,8 @@
  *
  * TC6393XB TC6391XB TC6387XB T7L66XB ASIC3
  */
-#ifndef _LINUX_MMC_TMIO_H_
-#define _LINUX_MMC_TMIO_H_
+#ifndef LINUX_MMC_TMIO_H
+#define LINUX_MMC_TMIO_H
 
 #define CTL_SD_CMD 0x00
 #define CTL_ARG_REG 0x04
@@ -60,4 +60,4 @@
 
 #define TMIO_BBS		512		/* Boot block size */
 
-#endif /* _LINUX_MMC_TMIO_H_ */
+#endif /* LINUX_MMC_TMIO_H */
-- 
cgit v1.2.3


From bfed345edfe05b291f7e5d396d4b447b6e8e66fa Mon Sep 17 00:00:00 2001
From: Zhangfei Gao <zhangfei.gao@marvell.com>
Date: Mon, 20 Jun 2011 22:11:52 +0800
Subject: mmc: sdhci-pxa: move platform data to include/linux/platform_data

As suggested by Arnd, move platform data to include/linux/platform_data
in order to improve build coverage for the driver.

Signed-off-by: Zhangfei Gao <zhangfei.gao@marvell.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Chris Ball <cjb@laptop.org>
---
 arch/arm/mach-mmp/include/mach/mmp2.h   |  2 +-
 arch/arm/plat-pxa/include/plat/sdhci.h  | 60 ---------------------------------
 drivers/mmc/host/sdhci-pxav2.c          |  2 +-
 drivers/mmc/host/sdhci-pxav3.c          |  2 +-
 include/linux/platform_data/pxa_sdhci.h | 60 +++++++++++++++++++++++++++++++++
 5 files changed, 63 insertions(+), 63 deletions(-)
 delete mode 100644 arch/arm/plat-pxa/include/plat/sdhci.h
 create mode 100644 include/linux/platform_data/pxa_sdhci.h

(limited to 'include/linux')

diff --git a/arch/arm/mach-mmp/include/mach/mmp2.h b/arch/arm/mach-mmp/include/mach/mmp2.h
index 2cbf6df09b82..de7b88826ad7 100644
--- a/arch/arm/mach-mmp/include/mach/mmp2.h
+++ b/arch/arm/mach-mmp/include/mach/mmp2.h
@@ -1,7 +1,7 @@
 #ifndef __ASM_MACH_MMP2_H
 #define __ASM_MACH_MMP2_H
 
-#include <plat/sdhci.h>
+#include <linux/platform_data/pxa_sdhci.h>
 
 struct sys_timer;
 
diff --git a/arch/arm/plat-pxa/include/plat/sdhci.h b/arch/arm/plat-pxa/include/plat/sdhci.h
deleted file mode 100644
index 800ebc149fc1..000000000000
--- a/arch/arm/plat-pxa/include/plat/sdhci.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/* linux/arch/arm/plat-pxa/include/plat/sdhci.h
- *
- * Copyright 2010 Marvell
- *	Zhangfei Gao <zhangfei.gao@marvell.com>
- *
- * PXA Platform - SDHCI platform data definitions
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#ifndef __PLAT_PXA_SDHCI_H
-#define __PLAT_PXA_SDHCI_H
-
-/* pxa specific flag */
-/* Require clock free running */
-#define PXA_FLAG_ENABLE_CLOCK_GATING (1<<0)
-/* card always wired to host, like on-chip emmc */
-#define PXA_FLAG_CARD_PERMANENT	(1<<1)
-/* Board design supports 8-bit data on SD/SDIO BUS */
-#define PXA_FLAG_SD_8_BIT_CAPABLE_SLOT (1<<2)
-
-/*
- * struct pxa_sdhci_platdata() - Platform device data for PXA SDHCI
- * @flags: flags for platform requirement
- * @clk_delay_cycles:
- *	mmp2: each step is roughly 100ps, 5bits width
- *	pxa910: each step is 1ns, 4bits width
- * @clk_delay_sel: select clk_delay, used on pxa910
- *	0: choose feedback clk
- *	1: choose feedback clk + delay value
- *	2: choose internal clk
- * @clk_delay_enable: enable clk_delay or not, used on pxa910
- * @ext_cd_gpio: gpio pin used for external CD line
- * @ext_cd_gpio_invert: invert values for external CD gpio line
- * @max_speed: the maximum speed supported
- * @host_caps: Standard MMC host capabilities bit field.
- * @quirks: quirks of platfrom
- * @pm_caps: pm_caps of platfrom
- */
-struct sdhci_pxa_platdata {
-	unsigned int	flags;
-	unsigned int	clk_delay_cycles;
-	unsigned int	clk_delay_sel;
-	bool		clk_delay_enable;
-	unsigned int	ext_cd_gpio;
-	bool		ext_cd_gpio_invert;
-	unsigned int	max_speed;
-	unsigned int	host_caps;
-	unsigned int	quirks;
-	unsigned int	pm_caps;
-};
-
-struct sdhci_pxa {
-	u8	clk_enable;
-	u8	power_mode;
-};
-
-#endif /* __PLAT_PXA_SDHCI_H */
diff --git a/drivers/mmc/host/sdhci-pxav2.c b/drivers/mmc/host/sdhci-pxav2.c
index 7a6fa8c08363..38f58994f79a 100644
--- a/drivers/mmc/host/sdhci-pxav2.c
+++ b/drivers/mmc/host/sdhci-pxav2.c
@@ -25,7 +25,7 @@
 #include <linux/gpio.h>
 #include <linux/mmc/card.h>
 #include <linux/mmc/host.h>
-#include <plat/sdhci.h>
+#include <linux/platform_data/pxa_sdhci.h>
 #include <linux/slab.h>
 #include "sdhci.h"
 #include "sdhci-pltfm.h"
diff --git a/drivers/mmc/host/sdhci-pxav3.c b/drivers/mmc/host/sdhci-pxav3.c
index 901f00fb252b..4198dbbc5c20 100644
--- a/drivers/mmc/host/sdhci-pxav3.c
+++ b/drivers/mmc/host/sdhci-pxav3.c
@@ -24,7 +24,7 @@
 #include <linux/gpio.h>
 #include <linux/mmc/card.h>
 #include <linux/mmc/host.h>
-#include <plat/sdhci.h>
+#include <linux/platform_data/pxa_sdhci.h>
 #include <linux/slab.h>
 #include <linux/delay.h>
 #include "sdhci.h"
diff --git a/include/linux/platform_data/pxa_sdhci.h b/include/linux/platform_data/pxa_sdhci.h
new file mode 100644
index 000000000000..51ad0995abac
--- /dev/null
+++ b/include/linux/platform_data/pxa_sdhci.h
@@ -0,0 +1,60 @@
+/*
+ * include/linux/platform_data/pxa_sdhci.h
+ *
+ * Copyright 2010 Marvell
+ *	Zhangfei Gao <zhangfei.gao@marvell.com>
+ *
+ * PXA Platform - SDHCI platform data definitions
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _PXA_SDHCI_H_
+#define _PXA_SDHCI_H_
+
+/* pxa specific flag */
+/* Require clock free running */
+#define PXA_FLAG_ENABLE_CLOCK_GATING (1<<0)
+/* card always wired to host, like on-chip emmc */
+#define PXA_FLAG_CARD_PERMANENT	(1<<1)
+/* Board design supports 8-bit data on SD/SDIO BUS */
+#define PXA_FLAG_SD_8_BIT_CAPABLE_SLOT (1<<2)
+
+/*
+ * struct pxa_sdhci_platdata() - Platform device data for PXA SDHCI
+ * @flags: flags for platform requirement
+ * @clk_delay_cycles:
+ *	mmp2: each step is roughly 100ps, 5bits width
+ *	pxa910: each step is 1ns, 4bits width
+ * @clk_delay_sel: select clk_delay, used on pxa910
+ *	0: choose feedback clk
+ *	1: choose feedback clk + delay value
+ *	2: choose internal clk
+ * @clk_delay_enable: enable clk_delay or not, used on pxa910
+ * @ext_cd_gpio: gpio pin used for external CD line
+ * @ext_cd_gpio_invert: invert values for external CD gpio line
+ * @max_speed: the maximum speed supported
+ * @host_caps: Standard MMC host capabilities bit field.
+ * @quirks: quirks of platfrom
+ * @pm_caps: pm_caps of platfrom
+ */
+struct sdhci_pxa_platdata {
+	unsigned int	flags;
+	unsigned int	clk_delay_cycles;
+	unsigned int	clk_delay_sel;
+	bool		clk_delay_enable;
+	unsigned int	ext_cd_gpio;
+	bool		ext_cd_gpio_invert;
+	unsigned int	max_speed;
+	unsigned int	host_caps;
+	unsigned int	quirks;
+	unsigned int	pm_caps;
+};
+
+struct sdhci_pxa {
+	u8	clk_enable;
+	u8	power_mode;
+};
+#endif /* _PXA_SDHCI_H_ */
-- 
cgit v1.2.3


From 0a2d4048a22079d7e79d6654bbacbef57bd5728a Mon Sep 17 00:00:00 2001
From: Russell King - ARM Linux <linux@arm.linux.org.uk>
Date: Mon, 20 Jun 2011 20:10:08 +0100
Subject: mmc: block: allow get_card_status() to return error status

If the MMC_SEND_STATUS command is not successful, we should not return
a zero status word, but instead allow the caller to know positively
that an error occurred.

Convert the open-coded get_card_status() to use the helper function,
and provide definitions for the card state field.

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
Tested-by: Pawel Moll <pawel.moll@arm.com>
Signed-off-by: Chris Ball <cjb@laptop.org>
---
 drivers/mmc/card/block.c | 33 +++++++++------------------------
 include/linux/mmc/mmc.h  | 10 ++++++++++
 2 files changed, 19 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c
index f85e42224559..3200e2ca6a9d 100644
--- a/drivers/mmc/card/block.c
+++ b/drivers/mmc/card/block.c
@@ -525,7 +525,7 @@ static u32 mmc_sd_num_wr_blocks(struct mmc_card *card)
 	return result;
 }
 
-static u32 get_card_status(struct mmc_card *card, struct request *req)
+static int get_card_status(struct mmc_card *card, u32 *status, int retries)
 {
 	struct mmc_command cmd = {0};
 	int err;
@@ -534,11 +534,10 @@ static u32 get_card_status(struct mmc_card *card, struct request *req)
 	if (!mmc_host_is_spi(card->host))
 		cmd.arg = card->rca << 16;
 	cmd.flags = MMC_RSP_SPI_R2 | MMC_RSP_R1 | MMC_CMD_AC;
-	err = mmc_wait_for_cmd(card->host, &cmd, 0);
-	if (err)
-		printk(KERN_ERR "%s: error %d sending status command",
-		       req->rq_disk->disk_name, err);
-	return cmd.resp[0];
+	err = mmc_wait_for_cmd(card->host, &cmd, retries);
+	if (err == 0)
+		*status = cmd.resp[0];
+	return err;
 }
 
 static int mmc_blk_issue_discard_rq(struct mmc_queue *mq, struct request *req)
@@ -686,7 +685,6 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *req)
 		(md->flags & MMC_BLK_REL_WR);
 
 	do {
-		struct mmc_command cmd = {0};
 		u32 readcmd, writecmd, status = 0;
 
 		memset(&brq, 0, sizeof(struct mmc_blk_request));
@@ -817,7 +815,7 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *req)
 				disable_multi = 1;
 				continue;
 			}
-			status = get_card_status(card, req);
+			get_card_status(card, &status, 0);
 		}
 
 		if (brq.sbc.error) {
@@ -854,12 +852,7 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *req)
 
 		if (!mmc_host_is_spi(card->host) && rq_data_dir(req) != READ) {
 			do {
-				int err;
-
-				cmd.opcode = MMC_SEND_STATUS;
-				cmd.arg = card->rca << 16;
-				cmd.flags = MMC_RSP_R1 | MMC_CMD_AC;
-				err = mmc_wait_for_cmd(card->host, &cmd, 5);
+				int err = get_card_status(card, &status, 5);
 				if (err) {
 					printk(KERN_ERR "%s: error %d requesting status\n",
 					       req->rq_disk->disk_name, err);
@@ -870,16 +863,8 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *req)
 				 * so make sure to check both the busy
 				 * indication and the card state.
 				 */
-			} while (!(cmd.resp[0] & R1_READY_FOR_DATA) ||
-				(R1_CURRENT_STATE(cmd.resp[0]) == 7));
-
-#if 0
-			if (cmd.resp[0] & ~0x00000900)
-				printk(KERN_ERR "%s: status = %08x\n",
-				       req->rq_disk->disk_name, cmd.resp[0]);
-			if (mmc_decode_status(cmd.resp))
-				goto cmd_err;
-#endif
+			} while (!(status & R1_READY_FOR_DATA) ||
+				 (R1_CURRENT_STATE(status) == R1_STATE_PRG));
 		}
 
 		if (brq.cmd.error || brq.stop.error || brq.data.error) {
diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h
index 5bade0605641..5a794cb503ea 100644
--- a/include/linux/mmc/mmc.h
+++ b/include/linux/mmc/mmc.h
@@ -140,6 +140,16 @@ static inline bool mmc_op_multi(u32 opcode)
 #define R1_SWITCH_ERROR		(1 << 7)	/* sx, c */
 #define R1_APP_CMD		(1 << 5)	/* sr, c */
 
+#define R1_STATE_IDLE	0
+#define R1_STATE_READY	1
+#define R1_STATE_IDENT	2
+#define R1_STATE_STBY	3
+#define R1_STATE_TRAN	4
+#define R1_STATE_DATA	5
+#define R1_STATE_RCV	6
+#define R1_STATE_PRG	7
+#define R1_STATE_DIS	8
+
 /*
  * MMC/SD in SPI mode reports R1 status always, and R2 for SEND_STATUS
  * R1 is the low order byte; R2 is the next highest byte, when present.
-- 
cgit v1.2.3


From 95c7348d948dc4832434ddfaeba804ac14732f02 Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@verge.net.au>
Date: Tue, 21 Jun 2011 08:00:08 +0900
Subject: mmc: tmio: name 0xd8 as CTL_DMA_ENABLE

This reflects at least the current usage of this register
and I think it improves the readability of the code ever so slightly.

Cc: Guennadi Liakhovetski <g.liakhovetski@gmx.de>
Cc: Magnus Damm <magnus.damm@gmail.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Chris Ball <cjb@laptop.org>
---
 drivers/mmc/host/tmio_mmc_dma.c | 2 +-
 include/linux/mmc/tmio.h        | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/host/tmio_mmc_dma.c b/drivers/mmc/host/tmio_mmc_dma.c
index 25f1ad6cbe09..9c4da6637503 100644
--- a/drivers/mmc/host/tmio_mmc_dma.c
+++ b/drivers/mmc/host/tmio_mmc_dma.c
@@ -26,7 +26,7 @@ static void tmio_mmc_enable_dma(struct tmio_mmc_host *host, bool enable)
 {
 #if defined(CONFIG_SUPERH) || defined(CONFIG_ARCH_SHMOBILE)
 	/* Switch DMA mode on or off - SuperH specific? */
-	writew(enable ? 2 : 0, host->ctl + (0xd8 << host->bus_shift));
+	writew(enable ? 2 : 0, host->ctl + (CTL_DMA_ENABLE << host->bus_shift));
 #endif
 }
 
diff --git a/include/linux/mmc/tmio.h b/include/linux/mmc/tmio.h
index 551490f0613d..3ae377623db0 100644
--- a/include/linux/mmc/tmio.h
+++ b/include/linux/mmc/tmio.h
@@ -30,6 +30,7 @@
 #define CTL_TRANSACTION_CTL 0x34
 #define CTL_SDIO_STATUS 0x36
 #define CTL_SDIO_IRQ_MASK 0x38
+#define CTL_DMA_ENABLE 0xd8
 #define CTL_RESET_SD 0xe0
 #define CTL_SDIO_REGS 0x100
 #define CTL_CLK_AND_WAIT_CTL 0x138
-- 
cgit v1.2.3


From 973ed3af1a570612771ed10dec6506c757767668 Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@verge.net.au>
Date: Tue, 21 Jun 2011 08:00:10 +0900
Subject: mmc: sdhi: Add write16_hook

Some controllers require waiting for the bus to become idle
before writing to some registers. I have implemented this
by adding a hook to sd_ctrl_write16() and implementing
a hook for SDHI which waits for the bus to become idle.

Cc: Guennadi Liakhovetski <g.liakhovetski@gmx.de>
Cc: Magnus Damm <magnus.damm@gmail.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Chris Ball <cjb@laptop.org>
---
 drivers/mmc/host/sh_mobile_sdhi.c | 36 ++++++++++++++++++++++++++++++++++++
 drivers/mmc/host/tmio_mmc.h       |  5 +++++
 include/linux/mfd/tmio.h          |  8 ++++++++
 include/linux/mmc/tmio.h          |  1 +
 4 files changed, 50 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mmc/host/sh_mobile_sdhi.c b/drivers/mmc/host/sh_mobile_sdhi.c
index ce500f03df85..774f6439d7ce 100644
--- a/drivers/mmc/host/sh_mobile_sdhi.c
+++ b/drivers/mmc/host/sh_mobile_sdhi.c
@@ -26,6 +26,7 @@
 #include <linux/mmc/sh_mobile_sdhi.h>
 #include <linux/mfd/tmio.h>
 #include <linux/sh_dma.h>
+#include <linux/delay.h>
 
 #include "tmio_mmc.h"
 
@@ -55,6 +56,39 @@ static int sh_mobile_sdhi_get_cd(struct platform_device *pdev)
 		return -ENOSYS;
 }
 
+static int sh_mobile_sdhi_wait_idle(struct tmio_mmc_host *host)
+{
+	int timeout = 1000;
+
+	while (--timeout && !(sd_ctrl_read16(host, CTL_STATUS2) & (1 << 13)))
+		udelay(1);
+
+	if (!timeout) {
+		dev_warn(host->pdata->dev, "timeout waiting for SD bus idle\n");
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+static int sh_mobile_sdhi_write16_hook(struct tmio_mmc_host *host, int addr)
+{
+	switch (addr)
+	{
+	case CTL_SD_CMD:
+	case CTL_STOP_INTERNAL_ACTION:
+	case CTL_XFER_BLK_COUNT:
+	case CTL_SD_CARD_CLK_CTL:
+	case CTL_SD_XFER_LEN:
+	case CTL_SD_MEM_CARD_OPT:
+	case CTL_TRANSACTION_CTL:
+	case CTL_DMA_ENABLE:
+		return sh_mobile_sdhi_wait_idle(host);
+	}
+
+	return 0;
+}
+
 static int __devinit sh_mobile_sdhi_probe(struct platform_device *pdev)
 {
 	struct sh_mobile_sdhi *priv;
@@ -86,6 +120,8 @@ static int __devinit sh_mobile_sdhi_probe(struct platform_device *pdev)
 	mmc_data->hclk = clk_get_rate(priv->clk);
 	mmc_data->set_pwr = sh_mobile_sdhi_set_pwr;
 	mmc_data->get_cd = sh_mobile_sdhi_get_cd;
+	if (mmc_data->flags & TMIO_MMC_HAS_IDLE_WAIT)
+		mmc_data->write16_hook = sh_mobile_sdhi_write16_hook;
 	mmc_data->capabilities = MMC_CAP_MMC_HIGHSPEED;
 	if (p) {
 		mmc_data->flags = p->tmio_flags;
diff --git a/drivers/mmc/host/tmio_mmc.h b/drivers/mmc/host/tmio_mmc.h
index 0c22df0f954d..211ef6e7a820 100644
--- a/drivers/mmc/host/tmio_mmc.h
+++ b/drivers/mmc/host/tmio_mmc.h
@@ -153,6 +153,11 @@ static inline u32 sd_ctrl_read32(struct tmio_mmc_host *host, int addr)
 
 static inline void sd_ctrl_write16(struct tmio_mmc_host *host, int addr, u16 val)
 {
+	/* If there is a hook and it returns non-zero then there
+	 * is an error and the write should be skipped
+	 */
+	if (host->pdata->write16_hook && host->pdata->write16_hook(host, addr))
+		return;
 	writew(val, host->ctl + (addr << host->bus_shift));
 }
 
diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h
index 5a90266c3a5a..0dc98044d8b7 100644
--- a/include/linux/mfd/tmio.h
+++ b/include/linux/mfd/tmio.h
@@ -68,6 +68,11 @@
  * controller and report the event to the driver.
  */
 #define TMIO_MMC_HAS_COLD_CD		(1 << 3)
+/*
+ * Some controllers require waiting for the SD bus to become
+ * idle before writing to some registers.
+ */
+#define TMIO_MMC_HAS_IDLE_WAIT		(1 << 4)
 
 int tmio_core_mmc_enable(void __iomem *cnf, int shift, unsigned long base);
 int tmio_core_mmc_resume(void __iomem *cnf, int shift, unsigned long base);
@@ -80,6 +85,8 @@ struct tmio_mmc_dma {
 	int alignment_shift;
 };
 
+struct tmio_mmc_host;
+
 /*
  * data for the MMC controller
  */
@@ -94,6 +101,7 @@ struct tmio_mmc_data {
 	void (*set_pwr)(struct platform_device *host, int state);
 	void (*set_clk_div)(struct platform_device *host, int state);
 	int (*get_cd)(struct platform_device *host);
+	int (*write16_hook)(struct tmio_mmc_host *host, int addr);
 };
 
 static inline void tmio_mmc_cd_wakeup(struct tmio_mmc_data *pdata)
diff --git a/include/linux/mmc/tmio.h b/include/linux/mmc/tmio.h
index 3ae377623db0..a1c1f321e519 100644
--- a/include/linux/mmc/tmio.h
+++ b/include/linux/mmc/tmio.h
@@ -21,6 +21,7 @@
 #define CTL_XFER_BLK_COUNT 0xa
 #define CTL_RESPONSE 0x0c
 #define CTL_STATUS 0x1c
+#define CTL_STATUS2 0x1e
 #define CTL_IRQ_MASK 0x20
 #define CTL_SD_CARD_CLK_CTL 0x24
 #define CTL_SD_XFER_LEN 0x26
-- 
cgit v1.2.3


From 1791b13ea4d97a6a7c162edd54485e932ad92f1b Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Fri, 24 Jun 2011 13:55:55 +0100
Subject: mmc: dw_mmc: convert card tasklet to workqueue

Convert the card insert/remove tasklet to a workqueue, and call the
setpower platform specific callback without the spinlock held. This
means neither of the setpower or get_cd callbacks are called from atomic
context which allows them to sleep.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Acked-by: Will Newton <will.newton@imgtec.com>
Signed-off-by: Chris Ball <cjb@laptop.org>
---
 drivers/mmc/host/dw_mmc.c  | 50 ++++++++++++++++++++++++++++------------------
 include/linux/mmc/dw_mmc.h |  2 +-
 2 files changed, 32 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c
index c4bddf6a5f1f..3f610d55f38e 100644
--- a/drivers/mmc/host/dw_mmc.c
+++ b/drivers/mmc/host/dw_mmc.c
@@ -33,6 +33,7 @@
 #include <linux/mmc/dw_mmc.h>
 #include <linux/bitops.h>
 #include <linux/regulator/consumer.h>
+#include <linux/workqueue.h>
 
 #include "dw_mmc.h"
 
@@ -100,6 +101,8 @@ struct dw_mci_slot {
 	int			last_detect_state;
 };
 
+static struct workqueue_struct *dw_mci_card_workqueue;
+
 #if defined(CONFIG_DEBUG_FS)
 static int dw_mci_req_show(struct seq_file *s, void *v)
 {
@@ -1255,7 +1258,7 @@ static irqreturn_t dw_mci_interrupt(int irq, void *dev_id)
 
 		if (pending & SDMMC_INT_CD) {
 			mci_writel(host, RINTSTS, SDMMC_INT_CD);
-			tasklet_schedule(&host->card_tasklet);
+			queue_work(dw_mci_card_workqueue, &host->card_work);
 		}
 
 	} while (pass_count++ < 5);
@@ -1274,9 +1277,9 @@ static irqreturn_t dw_mci_interrupt(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 
-static void dw_mci_tasklet_card(unsigned long data)
+static void dw_mci_work_routine_card(struct work_struct *work)
 {
-	struct dw_mci *host = (struct dw_mci *)data;
+	struct dw_mci *host = container_of(work, struct dw_mci, card_work);
 	int i;
 
 	for (i = 0; i < host->num_slots; i++) {
@@ -1288,22 +1291,21 @@ static void dw_mci_tasklet_card(unsigned long data)
 
 		present = dw_mci_get_cd(mmc);
 		while (present != slot->last_detect_state) {
-			spin_lock(&host->lock);
-
 			dev_dbg(&slot->mmc->class_dev, "card %s\n",
 				present ? "inserted" : "removed");
 
+			/* Power up slot (before spin_lock, may sleep) */
+			if (present != 0 && host->pdata->setpower)
+				host->pdata->setpower(slot->id, mmc->ocr_avail);
+
+			spin_lock_bh(&host->lock);
+
 			/* Card change detected */
 			slot->last_detect_state = present;
 
-			/* Power up slot */
-			if (present != 0) {
-				if (host->pdata->setpower)
-					host->pdata->setpower(slot->id,
-							      mmc->ocr_avail);
-
+			/* Mark card as present if applicable */
+			if (present != 0)
 				set_bit(DW_MMC_CARD_PRESENT, &slot->flags);
-			}
 
 			/* Clean up queue if present */
 			mrq = slot->mrq;
@@ -1353,8 +1355,6 @@ static void dw_mci_tasklet_card(unsigned long data)
 
 			/* Power down slot */
 			if (present == 0) {
-				if (host->pdata->setpower)
-					host->pdata->setpower(slot->id, 0);
 				clear_bit(DW_MMC_CARD_PRESENT, &slot->flags);
 
 				/*
@@ -1376,7 +1376,12 @@ static void dw_mci_tasklet_card(unsigned long data)
 
 			}
 
-			spin_unlock(&host->lock);
+			spin_unlock_bh(&host->lock);
+
+			/* Power down slot (after spin_unlock, may sleep) */
+			if (present == 0 && host->pdata->setpower)
+				host->pdata->setpower(slot->id, 0);
+
 			present = dw_mci_get_cd(mmc);
 		}
 
@@ -1476,7 +1481,7 @@ static int __init dw_mci_init_slot(struct dw_mci *host, unsigned int id)
 	 * Card may have been plugged in prior to boot so we
 	 * need to run the detect tasklet
 	 */
-	tasklet_schedule(&host->card_tasklet);
+	queue_work(dw_mci_card_workqueue, &host->card_work);
 
 	return 0;
 }
@@ -1665,12 +1670,15 @@ static int dw_mci_probe(struct platform_device *pdev)
 	mci_writel(host, CLKSRC, 0);
 
 	tasklet_init(&host->tasklet, dw_mci_tasklet_func, (unsigned long)host);
-	tasklet_init(&host->card_tasklet,
-		     dw_mci_tasklet_card, (unsigned long)host);
+	dw_mci_card_workqueue = alloc_workqueue("dw-mci-card",
+			WQ_MEM_RECLAIM | WQ_NON_REENTRANT, 1);
+	if (!dw_mci_card_workqueue)
+		goto err_dmaunmap;
+	INIT_WORK(&host->card_work, dw_mci_work_routine_card);
 
 	ret = request_irq(irq, dw_mci_interrupt, 0, "dw-mci", host);
 	if (ret)
-		goto err_dmaunmap;
+		goto err_workqueue;
 
 	platform_set_drvdata(pdev, host);
 
@@ -1714,6 +1722,9 @@ err_init_slot:
 	}
 	free_irq(irq, host);
 
+err_workqueue:
+	destroy_workqueue(dw_mci_card_workqueue);
+
 err_dmaunmap:
 	if (host->use_dma && host->dma_ops->exit)
 		host->dma_ops->exit(host);
@@ -1753,6 +1764,7 @@ static int __exit dw_mci_remove(struct platform_device *pdev)
 	mci_writel(host, CLKSRC, 0);
 
 	free_irq(platform_get_irq(pdev, 0), host);
+	destroy_workqueue(dw_mci_card_workqueue);
 	dma_free_coherent(&pdev->dev, PAGE_SIZE, host->sg_cpu, host->sg_dma);
 
 	if (host->use_dma && host->dma_ops->exit)
diff --git a/include/linux/mmc/dw_mmc.h b/include/linux/mmc/dw_mmc.h
index 55d9909c7abd..dea48475c436 100644
--- a/include/linux/mmc/dw_mmc.h
+++ b/include/linux/mmc/dw_mmc.h
@@ -131,7 +131,7 @@ struct dw_mci {
 	u32			stop_cmdr;
 	u32			dir_status;
 	struct tasklet_struct	tasklet;
-	struct tasklet_struct	card_tasklet;
+	struct work_struct	card_work;
 	unsigned long		pending_events;
 	unsigned long		completed_events;
 	enum dw_mci_state	state;
-- 
cgit v1.2.3


From b86d825323b4c5d0c406e5b1a85af614acf0cf5a Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Fri, 24 Jun 2011 13:57:18 +0100
Subject: mmc: dw_mmc: don't hard code fifo depth, fix usage

The FIFO_DEPTH hardware configuration parameter can be found from the
power-on value of RX_WMark in the FIFOTH register. This is used to
initialise the watermarks, but when calculating the number of free fifo
spaces a preprocessor definition is used which is hard coded to 32.

Fix reading the value out of FIFOTH (the default value in the RX_WMark
field is FIFO_DEPTH-1 not FIFO_DEPTH). Allow the fifo depth to be
overriden by platform data (since a bootloader may have changed FIFOTH
making auto-detection unreliable). Store the fifo_depth for later use.
Also fix the calculation to find the number of free bytes in the fifo to
include the fifo depth in the left shift by the data shift, since the
fifo depth is measured in fifo items not bytes.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Acked-by: Will Newton <will.newton@imgtec.com>
Signed-off-by: Chris Ball <cjb@laptop.org>
---
 drivers/mmc/host/dw_mmc.c  | 23 ++++++++++++++++++-----
 drivers/mmc/host/dw_mmc.h  |  1 -
 include/linux/mmc/dw_mmc.h |  8 ++++++++
 3 files changed, 26 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c
index 3f610d55f38e..3832312ce7cb 100644
--- a/drivers/mmc/host/dw_mmc.c
+++ b/drivers/mmc/host/dw_mmc.c
@@ -1116,8 +1116,8 @@ static void dw_mci_write_data_pio(struct dw_mci *host)
 	unsigned int nbytes = 0, len;
 
 	do {
-		len = SDMMC_FIFO_SZ -
-			(SDMMC_GET_FCNT(mci_readl(host, STATUS)) << shift);
+		len = (host->fifo_depth -
+			SDMMC_GET_FCNT(mci_readl(host, STATUS))) << shift;
 		if (offset + len <= sg->length) {
 			host->push_data(host, (void *)(buf + offset), len);
 
@@ -1659,8 +1659,19 @@ static int dw_mci_probe(struct platform_device *pdev)
 	 * FIFO threshold settings  RxMark  = fifo_size / 2 - 1,
 	 *                          Tx Mark = fifo_size / 2 DMA Size = 8
 	 */
-	fifo_size = mci_readl(host, FIFOTH);
-	fifo_size = (fifo_size >> 16) & 0x7ff;
+	if (!host->pdata->fifo_depth) {
+		/*
+		 * Power-on value of RX_WMark is FIFO_DEPTH-1, but this may
+		 * have been overwritten by the bootloader, just like we're
+		 * about to do, so if you know the value for your hardware, you
+		 * should put it in the platform data.
+		 */
+		fifo_size = mci_readl(host, FIFOTH);
+		fifo_size = 1 + ((fifo_size >> 16) & 0x7ff);
+	} else {
+		fifo_size = host->pdata->fifo_depth;
+	}
+	host->fifo_depth = fifo_size;
 	host->fifoth_val = ((0x2 << 28) | ((fifo_size/2 - 1) << 16) |
 			((fifo_size/2) << 0));
 	mci_writel(host, FIFOTH, host->fifoth_val);
@@ -1707,7 +1718,9 @@ static int dw_mci_probe(struct platform_device *pdev)
 	mci_writel(host, CTRL, SDMMC_CTRL_INT_ENABLE); /* Enable mci interrupt */
 
 	dev_info(&pdev->dev, "DW MMC controller at irq %d, "
-		 "%d bit host data width\n", irq, width);
+		 "%d bit host data width, "
+		 "%u deep fifo\n",
+		 irq, width, fifo_size);
 	if (host->quirks & DW_MCI_QUIRK_IDMAC_DTO)
 		dev_info(&pdev->dev, "Internal DMAC interrupt fix enabled.\n");
 
diff --git a/drivers/mmc/host/dw_mmc.h b/drivers/mmc/host/dw_mmc.h
index bb9e5de76ae9..027d37735394 100644
--- a/drivers/mmc/host/dw_mmc.h
+++ b/drivers/mmc/host/dw_mmc.h
@@ -118,7 +118,6 @@
 #define SDMMC_CMD_INDX(n)		((n) & 0x1F)
 /* Status register defines */
 #define SDMMC_GET_FCNT(x)		(((x)>>17) & 0x1FF)
-#define SDMMC_FIFO_SZ			32
 /* Internal DMAC interrupt defines */
 #define SDMMC_IDMAC_INT_AI		BIT(9)
 #define SDMMC_IDMAC_INT_NI		BIT(8)
diff --git a/include/linux/mmc/dw_mmc.h b/include/linux/mmc/dw_mmc.h
index dea48475c436..01db1a0b3dd1 100644
--- a/include/linux/mmc/dw_mmc.h
+++ b/include/linux/mmc/dw_mmc.h
@@ -74,6 +74,7 @@ struct mmc_data;
  * @pdev: Platform device associated with the MMC controller.
  * @pdata: Platform data associated with the MMC controller.
  * @slot: Slots sharing this MMC controller.
+ * @fifo_depth: depth of FIFO.
  * @data_shift: log2 of FIFO item size.
  * @push_data: Pointer to FIFO push function.
  * @pull_data: Pointer to FIFO pull function.
@@ -146,6 +147,7 @@ struct dw_mci {
 	struct dw_mci_slot	*slot[MAX_MCI_SLOTS];
 
 	/* FIFO push and pull */
+	int			fifo_depth;
 	int			data_shift;
 	void (*push_data)(struct dw_mci *host, void *buf, int cnt);
 	void (*pull_data)(struct dw_mci *host, void *buf, int cnt);
@@ -196,6 +198,12 @@ struct dw_mci_board {
 	unsigned int bus_hz; /* Bus speed */
 
 	unsigned int caps;	/* Capabilities */
+	/*
+	 * Override fifo depth. If 0, autodetect it from the FIFOTH register,
+	 * but note that this may not be reliable after a bootloader has used
+	 * it.
+	 */
+	unsigned int fifo_depth;
 
 	/* delay in mS before detecting cards after interrupt */
 	u32 detect_delay_ms;
-- 
cgit v1.2.3


From 34b664a20e2664de0d0d7990ca60276b96c08c75 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Fri, 24 Jun 2011 13:57:56 +0100
Subject: mmc: dw_mmc: handle unaligned buffers and sizes

Update functions for PIO pushing and pulling data to and from the FIFO
so that they can handle unaligned output buffers and unaligned buffer
lengths. This makes more of the tests in mmc_test pass.

Unaligned lengths in pulls are handled by reading the full FIFO item,
and storing the remaining bytes in a small internal buffer (part_buf).
The next data pull will copy data out of this buffer first before
accessing the FIFO again. Similarly, for pushes the final bytes that
don't fill a FIFO item are stored in the part_buf (or sent anyway if
it's the last transfer), and then the part_buf is included at the
beginning of the next buffer pushed.

Unaligned buffers in pulls are handled specially if the architecture
cannot do efficient unaligned accesses, by reading FIFO items into a
aligned local buffer, and memcpy'ing them into the output buffer, again
storing any remaining bytes in the internal buffer. Similarly for pushes
the buffer is memcpy'd into an aligned local buffer then written to the
FIFO.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Acked-by: Will Newton <will.newton@imgtec.com>
Signed-off-by: Chris Ball <cjb@laptop.org>
---
 drivers/mmc/host/dw_mmc.c  | 309 ++++++++++++++++++++++++++++++++++++---------
 include/linux/mmc/dw_mmc.h |  10 ++
 2 files changed, 262 insertions(+), 57 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c
index 3832312ce7cb..10b697986283 100644
--- a/drivers/mmc/host/dw_mmc.c
+++ b/drivers/mmc/host/dw_mmc.c
@@ -495,6 +495,8 @@ static void dw_mci_submit_data(struct dw_mci *host, struct mmc_data *data)
 	if (dw_mci_submit_data_dma(host, data)) {
 		host->sg = data->sg;
 		host->pio_offset = 0;
+		host->part_buf_start = 0;
+		host->part_buf_count = 0;
 		if (data->flags & MMC_DATA_READ)
 			host->dir_status = DW_MCI_RECV_STATUS;
 		else
@@ -957,84 +959,278 @@ unlock:
 
 }
 
-static void dw_mci_push_data16(struct dw_mci *host, void *buf, int cnt)
+/* push final bytes to part_buf, only use during push */
+static void dw_mci_set_part_bytes(struct dw_mci *host, void *buf, int cnt)
 {
-	u16 *pdata = (u16 *)buf;
+	memcpy((void *)&host->part_buf, buf, cnt);
+	host->part_buf_count = cnt;
+}
 
-	WARN_ON(cnt % 2 != 0);
+/* append bytes to part_buf, only use during push */
+static int dw_mci_push_part_bytes(struct dw_mci *host, void *buf, int cnt)
+{
+	cnt = min(cnt, (1 << host->data_shift) - host->part_buf_count);
+	memcpy((void *)&host->part_buf + host->part_buf_count, buf, cnt);
+	host->part_buf_count += cnt;
+	return cnt;
+}
 
-	cnt = cnt >> 1;
-	while (cnt > 0) {
-		mci_writew(host, DATA, *pdata++);
-		cnt--;
+/* pull first bytes from part_buf, only use during pull */
+static int dw_mci_pull_part_bytes(struct dw_mci *host, void *buf, int cnt)
+{
+	cnt = min(cnt, (int)host->part_buf_count);
+	if (cnt) {
+		memcpy(buf, (void *)&host->part_buf + host->part_buf_start,
+		       cnt);
+		host->part_buf_count -= cnt;
+		host->part_buf_start += cnt;
 	}
+	return cnt;
 }
 
-static void dw_mci_pull_data16(struct dw_mci *host, void *buf, int cnt)
+/* pull final bytes from the part_buf, assuming it's just been filled */
+static void dw_mci_pull_final_bytes(struct dw_mci *host, void *buf, int cnt)
 {
-	u16 *pdata = (u16 *)buf;
+	memcpy(buf, &host->part_buf, cnt);
+	host->part_buf_start = cnt;
+	host->part_buf_count = (1 << host->data_shift) - cnt;
+}
 
-	WARN_ON(cnt % 2 != 0);
+static void dw_mci_push_data16(struct dw_mci *host, void *buf, int cnt)
+{
+	/* try and push anything in the part_buf */
+	if (unlikely(host->part_buf_count)) {
+		int len = dw_mci_push_part_bytes(host, buf, cnt);
+		buf += len;
+		cnt -= len;
+		if (!sg_next(host->sg) || host->part_buf_count == 2) {
+			mci_writew(host, DATA, host->part_buf16);
+			host->part_buf_count = 0;
+		}
+	}
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	if (unlikely((unsigned long)buf & 0x1)) {
+		while (cnt >= 2) {
+			u16 aligned_buf[64];
+			int len = min(cnt & -2, (int)sizeof(aligned_buf));
+			int items = len >> 1;
+			int i;
+			/* memcpy from input buffer into aligned buffer */
+			memcpy(aligned_buf, buf, len);
+			buf += len;
+			cnt -= len;
+			/* push data from aligned buffer into fifo */
+			for (i = 0; i < items; ++i)
+				mci_writew(host, DATA, aligned_buf[i]);
+		}
+	} else
+#endif
+	{
+		u16 *pdata = buf;
+		for (; cnt >= 2; cnt -= 2)
+			mci_writew(host, DATA, *pdata++);
+		buf = pdata;
+	}
+	/* put anything remaining in the part_buf */
+	if (cnt) {
+		dw_mci_set_part_bytes(host, buf, cnt);
+		if (!sg_next(host->sg))
+			mci_writew(host, DATA, host->part_buf16);
+	}
+}
 
-	cnt = cnt >> 1;
-	while (cnt > 0) {
-		*pdata++ = mci_readw(host, DATA);
-		cnt--;
+static void dw_mci_pull_data16(struct dw_mci *host, void *buf, int cnt)
+{
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	if (unlikely((unsigned long)buf & 0x1)) {
+		while (cnt >= 2) {
+			/* pull data from fifo into aligned buffer */
+			u16 aligned_buf[64];
+			int len = min(cnt & -2, (int)sizeof(aligned_buf));
+			int items = len >> 1;
+			int i;
+			for (i = 0; i < items; ++i)
+				aligned_buf[i] = mci_readw(host, DATA);
+			/* memcpy from aligned buffer into output buffer */
+			memcpy(buf, aligned_buf, len);
+			buf += len;
+			cnt -= len;
+		}
+	} else
+#endif
+	{
+		u16 *pdata = buf;
+		for (; cnt >= 2; cnt -= 2)
+			*pdata++ = mci_readw(host, DATA);
+		buf = pdata;
+	}
+	if (cnt) {
+		host->part_buf16 = mci_readw(host, DATA);
+		dw_mci_pull_final_bytes(host, buf, cnt);
 	}
 }
 
 static void dw_mci_push_data32(struct dw_mci *host, void *buf, int cnt)
 {
-	u32 *pdata = (u32 *)buf;
-
-	WARN_ON(cnt % 4 != 0);
-	WARN_ON((unsigned long)pdata & 0x3);
-
-	cnt = cnt >> 2;
-	while (cnt > 0) {
-		mci_writel(host, DATA, *pdata++);
-		cnt--;
+	/* try and push anything in the part_buf */
+	if (unlikely(host->part_buf_count)) {
+		int len = dw_mci_push_part_bytes(host, buf, cnt);
+		buf += len;
+		cnt -= len;
+		if (!sg_next(host->sg) || host->part_buf_count == 4) {
+			mci_writel(host, DATA, host->part_buf32);
+			host->part_buf_count = 0;
+		}
+	}
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	if (unlikely((unsigned long)buf & 0x3)) {
+		while (cnt >= 4) {
+			u32 aligned_buf[32];
+			int len = min(cnt & -4, (int)sizeof(aligned_buf));
+			int items = len >> 2;
+			int i;
+			/* memcpy from input buffer into aligned buffer */
+			memcpy(aligned_buf, buf, len);
+			buf += len;
+			cnt -= len;
+			/* push data from aligned buffer into fifo */
+			for (i = 0; i < items; ++i)
+				mci_writel(host, DATA, aligned_buf[i]);
+		}
+	} else
+#endif
+	{
+		u32 *pdata = buf;
+		for (; cnt >= 4; cnt -= 4)
+			mci_writel(host, DATA, *pdata++);
+		buf = pdata;
+	}
+	/* put anything remaining in the part_buf */
+	if (cnt) {
+		dw_mci_set_part_bytes(host, buf, cnt);
+		if (!sg_next(host->sg))
+			mci_writel(host, DATA, host->part_buf32);
 	}
 }
 
 static void dw_mci_pull_data32(struct dw_mci *host, void *buf, int cnt)
 {
-	u32 *pdata = (u32 *)buf;
-
-	WARN_ON(cnt % 4 != 0);
-	WARN_ON((unsigned long)pdata & 0x3);
-
-	cnt = cnt >> 2;
-	while (cnt > 0) {
-		*pdata++ = mci_readl(host, DATA);
-		cnt--;
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	if (unlikely((unsigned long)buf & 0x3)) {
+		while (cnt >= 4) {
+			/* pull data from fifo into aligned buffer */
+			u32 aligned_buf[32];
+			int len = min(cnt & -4, (int)sizeof(aligned_buf));
+			int items = len >> 2;
+			int i;
+			for (i = 0; i < items; ++i)
+				aligned_buf[i] = mci_readl(host, DATA);
+			/* memcpy from aligned buffer into output buffer */
+			memcpy(buf, aligned_buf, len);
+			buf += len;
+			cnt -= len;
+		}
+	} else
+#endif
+	{
+		u32 *pdata = buf;
+		for (; cnt >= 4; cnt -= 4)
+			*pdata++ = mci_readl(host, DATA);
+		buf = pdata;
+	}
+	if (cnt) {
+		host->part_buf32 = mci_readl(host, DATA);
+		dw_mci_pull_final_bytes(host, buf, cnt);
 	}
 }
 
 static void dw_mci_push_data64(struct dw_mci *host, void *buf, int cnt)
 {
-	u64 *pdata = (u64 *)buf;
-
-	WARN_ON(cnt % 8 != 0);
-
-	cnt = cnt >> 3;
-	while (cnt > 0) {
-		mci_writeq(host, DATA, *pdata++);
-		cnt--;
+	/* try and push anything in the part_buf */
+	if (unlikely(host->part_buf_count)) {
+		int len = dw_mci_push_part_bytes(host, buf, cnt);
+		buf += len;
+		cnt -= len;
+		if (!sg_next(host->sg) || host->part_buf_count == 8) {
+			mci_writew(host, DATA, host->part_buf);
+			host->part_buf_count = 0;
+		}
+	}
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	if (unlikely((unsigned long)buf & 0x7)) {
+		while (cnt >= 8) {
+			u64 aligned_buf[16];
+			int len = min(cnt & -8, (int)sizeof(aligned_buf));
+			int items = len >> 3;
+			int i;
+			/* memcpy from input buffer into aligned buffer */
+			memcpy(aligned_buf, buf, len);
+			buf += len;
+			cnt -= len;
+			/* push data from aligned buffer into fifo */
+			for (i = 0; i < items; ++i)
+				mci_writeq(host, DATA, aligned_buf[i]);
+		}
+	} else
+#endif
+	{
+		u64 *pdata = buf;
+		for (; cnt >= 8; cnt -= 8)
+			mci_writeq(host, DATA, *pdata++);
+		buf = pdata;
+	}
+	/* put anything remaining in the part_buf */
+	if (cnt) {
+		dw_mci_set_part_bytes(host, buf, cnt);
+		if (!sg_next(host->sg))
+			mci_writeq(host, DATA, host->part_buf);
 	}
 }
 
 static void dw_mci_pull_data64(struct dw_mci *host, void *buf, int cnt)
 {
-	u64 *pdata = (u64 *)buf;
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	if (unlikely((unsigned long)buf & 0x7)) {
+		while (cnt >= 8) {
+			/* pull data from fifo into aligned buffer */
+			u64 aligned_buf[16];
+			int len = min(cnt & -8, (int)sizeof(aligned_buf));
+			int items = len >> 3;
+			int i;
+			for (i = 0; i < items; ++i)
+				aligned_buf[i] = mci_readq(host, DATA);
+			/* memcpy from aligned buffer into output buffer */
+			memcpy(buf, aligned_buf, len);
+			buf += len;
+			cnt -= len;
+		}
+	} else
+#endif
+	{
+		u64 *pdata = buf;
+		for (; cnt >= 8; cnt -= 8)
+			*pdata++ = mci_readq(host, DATA);
+		buf = pdata;
+	}
+	if (cnt) {
+		host->part_buf = mci_readq(host, DATA);
+		dw_mci_pull_final_bytes(host, buf, cnt);
+	}
+}
+
+static void dw_mci_pull_data(struct dw_mci *host, void *buf, int cnt)
+{
+	int len;
 
-	WARN_ON(cnt % 8 != 0);
+	/* get remaining partial bytes */
+	len = dw_mci_pull_part_bytes(host, buf, cnt);
+	if (unlikely(len == cnt))
+		return;
+	buf += len;
+	cnt -= len;
 
-	cnt = cnt >> 3;
-	while (cnt > 0) {
-		*pdata++ = mci_readq(host, DATA);
-		cnt--;
-	}
+	/* get the rest of the data */
+	host->pull_data(host, buf, cnt);
 }
 
 static void dw_mci_read_data_pio(struct dw_mci *host)
@@ -1048,9 +1244,10 @@ static void dw_mci_read_data_pio(struct dw_mci *host)
 	unsigned int nbytes = 0, len;
 
 	do {
-		len = SDMMC_GET_FCNT(mci_readl(host, STATUS)) << shift;
+		len = host->part_buf_count +
+			(SDMMC_GET_FCNT(mci_readl(host, STATUS)) << shift);
 		if (offset + len <= sg->length) {
-			host->pull_data(host, (void *)(buf + offset), len);
+			dw_mci_pull_data(host, (void *)(buf + offset), len);
 
 			offset += len;
 			nbytes += len;
@@ -1066,8 +1263,8 @@ static void dw_mci_read_data_pio(struct dw_mci *host)
 			}
 		} else {
 			unsigned int remaining = sg->length - offset;
-			host->pull_data(host, (void *)(buf + offset),
-					remaining);
+			dw_mci_pull_data(host, (void *)(buf + offset),
+					 remaining);
 			nbytes += remaining;
 
 			flush_dcache_page(sg_page(sg));
@@ -1077,7 +1274,7 @@ static void dw_mci_read_data_pio(struct dw_mci *host)
 
 			offset = len - remaining;
 			buf = sg_virt(sg);
-			host->pull_data(host, buf, offset);
+			dw_mci_pull_data(host, buf, offset);
 			nbytes += offset;
 		}
 
@@ -1094,7 +1291,6 @@ static void dw_mci_read_data_pio(struct dw_mci *host)
 			return;
 		}
 	} while (status & SDMMC_INT_RXDR); /*if the RXDR is ready read again*/
-	len = SDMMC_GET_FCNT(mci_readl(host, STATUS));
 	host->pio_offset = offset;
 	data->bytes_xfered += nbytes;
 	return;
@@ -1116,8 +1312,9 @@ static void dw_mci_write_data_pio(struct dw_mci *host)
 	unsigned int nbytes = 0, len;
 
 	do {
-		len = (host->fifo_depth -
-			SDMMC_GET_FCNT(mci_readl(host, STATUS))) << shift;
+		len = ((host->fifo_depth -
+			SDMMC_GET_FCNT(mci_readl(host, STATUS))) << shift)
+			- host->part_buf_count;
 		if (offset + len <= sg->length) {
 			host->push_data(host, (void *)(buf + offset), len);
 
@@ -1162,10 +1359,8 @@ static void dw_mci_write_data_pio(struct dw_mci *host)
 			return;
 		}
 	} while (status & SDMMC_INT_TXDR); /* if TXDR write again */
-
 	host->pio_offset = offset;
 	data->bytes_xfered += nbytes;
-
 	return;
 
 done:
diff --git a/include/linux/mmc/dw_mmc.h b/include/linux/mmc/dw_mmc.h
index 01db1a0b3dd1..f3f68ee08a1e 100644
--- a/include/linux/mmc/dw_mmc.h
+++ b/include/linux/mmc/dw_mmc.h
@@ -76,6 +76,9 @@ struct mmc_data;
  * @slot: Slots sharing this MMC controller.
  * @fifo_depth: depth of FIFO.
  * @data_shift: log2 of FIFO item size.
+ * @part_buf_start: Start index in part_buf.
+ * @part_buf_count: Bytes of partial data in part_buf.
+ * @part_buf: Simple buffer for partial fifo reads/writes.
  * @push_data: Pointer to FIFO push function.
  * @pull_data: Pointer to FIFO pull function.
  * @quirks: Set of quirks that apply to specific versions of the IP.
@@ -149,6 +152,13 @@ struct dw_mci {
 	/* FIFO push and pull */
 	int			fifo_depth;
 	int			data_shift;
+	u8			part_buf_start;
+	u8			part_buf_count;
+	union {
+		u16		part_buf16;
+		u32		part_buf32;
+		u64		part_buf;
+	};
 	void (*push_data)(struct dw_mci *host, void *buf, int cnt);
 	void (*pull_data)(struct dw_mci *host, void *buf, int cnt);
 
-- 
cgit v1.2.3


From e056a1b5b67b4e4bfad00bf143ab14f634777705 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Tue, 28 Jun 2011 17:16:02 +0300
Subject: mmc: queue: let host controllers specify maximum discard timeout

Some host controllers will not operate without a hardware
timeout that is limited in value.  However large discards
require large timeouts, so there needs to be a way to
specify the maximum discard size.

A host controller driver may now specify the maximum discard
timeout possible so that max_discard_sectors can be calculated.

However, for eMMC when the High Capacity Erase Group Size
is not in use, the timeout calculation depends on clock
rate which may change.  For that case Preferred Erase Size
is used instead.

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Chris Ball <cjb@laptop.org>
---
 drivers/mmc/card/queue.c | 33 ++++++++++++++-------
 drivers/mmc/core/core.c  | 76 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mmc/core.h |  1 +
 include/linux/mmc/host.h |  1 +
 4 files changed, 101 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/card/queue.c b/drivers/mmc/card/queue.c
index 6413afa318d2..defc11b4572c 100644
--- a/drivers/mmc/card/queue.c
+++ b/drivers/mmc/card/queue.c
@@ -101,6 +101,27 @@ static void mmc_request(struct request_queue *q)
 		wake_up_process(mq->thread);
 }
 
+static void mmc_queue_setup_discard(struct request_queue *q,
+				    struct mmc_card *card)
+{
+	unsigned max_discard;
+
+	max_discard = mmc_calc_max_discard(card);
+	if (!max_discard)
+		return;
+
+	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
+	q->limits.max_discard_sectors = max_discard;
+	if (card->erased_byte == 0)
+		q->limits.discard_zeroes_data = 1;
+	q->limits.discard_granularity = card->pref_erase << 9;
+	/* granularity must not be greater than max. discard */
+	if (card->pref_erase > max_discard)
+		q->limits.discard_granularity = 0;
+	if (mmc_can_secure_erase_trim(card))
+		queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, q);
+}
+
 /**
  * mmc_init_queue - initialise a queue structure.
  * @mq: mmc queue
@@ -130,16 +151,8 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card,
 
 	blk_queue_prep_rq(mq->queue, mmc_prep_request);
 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, mq->queue);
-	if (mmc_can_erase(card)) {
-		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mq->queue);
-		mq->queue->limits.max_discard_sectors = UINT_MAX;
-		if (card->erased_byte == 0)
-			mq->queue->limits.discard_zeroes_data = 1;
-		mq->queue->limits.discard_granularity = card->pref_erase << 9;
-		if (mmc_can_secure_erase_trim(card))
-			queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD,
-						mq->queue);
-	}
+	if (mmc_can_erase(card))
+		mmc_queue_setup_discard(mq->queue, card);
 
 #ifdef CONFIG_MMC_BLOCK_BOUNCE
 	if (host->max_segs == 1) {
diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c
index 7843efe22359..ac82865b8c2f 100644
--- a/drivers/mmc/core/core.c
+++ b/drivers/mmc/core/core.c
@@ -1516,6 +1516,82 @@ int mmc_erase_group_aligned(struct mmc_card *card, unsigned int from,
 }
 EXPORT_SYMBOL(mmc_erase_group_aligned);
 
+static unsigned int mmc_do_calc_max_discard(struct mmc_card *card,
+					    unsigned int arg)
+{
+	struct mmc_host *host = card->host;
+	unsigned int max_discard, x, y, qty = 0, max_qty, timeout;
+	unsigned int last_timeout = 0;
+
+	if (card->erase_shift)
+		max_qty = UINT_MAX >> card->erase_shift;
+	else if (mmc_card_sd(card))
+		max_qty = UINT_MAX;
+	else
+		max_qty = UINT_MAX / card->erase_size;
+
+	/* Find the largest qty with an OK timeout */
+	do {
+		y = 0;
+		for (x = 1; x && x <= max_qty && max_qty - x >= qty; x <<= 1) {
+			timeout = mmc_erase_timeout(card, arg, qty + x);
+			if (timeout > host->max_discard_to)
+				break;
+			if (timeout < last_timeout)
+				break;
+			last_timeout = timeout;
+			y = x;
+		}
+		qty += y;
+	} while (y);
+
+	if (!qty)
+		return 0;
+
+	if (qty == 1)
+		return 1;
+
+	/* Convert qty to sectors */
+	if (card->erase_shift)
+		max_discard = --qty << card->erase_shift;
+	else if (mmc_card_sd(card))
+		max_discard = qty;
+	else
+		max_discard = --qty * card->erase_size;
+
+	return max_discard;
+}
+
+unsigned int mmc_calc_max_discard(struct mmc_card *card)
+{
+	struct mmc_host *host = card->host;
+	unsigned int max_discard, max_trim;
+
+	if (!host->max_discard_to)
+		return UINT_MAX;
+
+	/*
+	 * Without erase_group_def set, MMC erase timeout depends on clock
+	 * frequence which can change.  In that case, the best choice is
+	 * just the preferred erase size.
+	 */
+	if (mmc_card_mmc(card) && !(card->ext_csd.erase_group_def & 1))
+		return card->pref_erase;
+
+	max_discard = mmc_do_calc_max_discard(card, MMC_ERASE_ARG);
+	if (mmc_can_trim(card)) {
+		max_trim = mmc_do_calc_max_discard(card, MMC_TRIM_ARG);
+		if (max_trim < max_discard)
+			max_discard = max_trim;
+	} else if (max_discard < card->erase_size) {
+		max_discard = 0;
+	}
+	pr_debug("%s: calculated max. discard sectors %u for timeout %u ms\n",
+		 mmc_hostname(host), max_discard, host->max_discard_to);
+	return max_discard;
+}
+EXPORT_SYMBOL(mmc_calc_max_discard);
+
 int mmc_set_blocklen(struct mmc_card *card, unsigned int blocklen)
 {
 	struct mmc_command cmd = {0};
diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h
index 791f060a6f1d..86d81cf75b70 100644
--- a/include/linux/mmc/core.h
+++ b/include/linux/mmc/core.h
@@ -155,6 +155,7 @@ extern int mmc_can_trim(struct mmc_card *card);
 extern int mmc_can_secure_erase_trim(struct mmc_card *card);
 extern int mmc_erase_group_aligned(struct mmc_card *card, unsigned int from,
 				   unsigned int nr);
+extern unsigned int mmc_calc_max_discard(struct mmc_card *card);
 
 extern int mmc_set_blocklen(struct mmc_card *card, unsigned int blocklen);
 
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index fc24b38b48cb..c67d19b5542a 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -231,6 +231,7 @@ struct mmc_host {
 	unsigned int		max_req_size;	/* maximum number of bytes in one req */
 	unsigned int		max_blk_size;	/* maximum size of one mmc block */
 	unsigned int		max_blk_count;	/* maximum number of blocks in one req */
+	unsigned int		max_discard_to;	/* max. discard timeout in ms */
 
 	/* private data */
 	spinlock_t		lock;		/* lock for claim and bus ops */
-- 
cgit v1.2.3


From 03e8cb534e7cc3f71a07528a44da7ce68e5b5708 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Wed, 29 Jun 2011 09:28:43 +0100
Subject: mmc: dw_mmc: fix stop when fallen back to PIO

There are several situations when dw_mci_submit_data_dma() decides to
fall back to PIO mode instead of using DMA, due to a short (to avoid
overhead) or "complex" (e.g. with unaligned buffers) transaction, even
though host->use_dma is set. However dw_mci_stop_dma() decides whether
to stop DMA or set the EVENT_XFER_COMPLETE event based on host->use_dma.
When falling back to PIO mode this results in data timeout errors
getting missed and the driver locking up.

Therefore add host->using_dma to indicate whether the current
transaction is using dma or not, and adjust dw_mci_stop_dma() to use
that instead.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Acked-by: Will Newton <will.newton@imgtec.com>
Tested-by: Jaehoon Chung <jh80.chung@samsung.com>
Signed-off-by: Chris Ball <cjb@laptop.org>
---
 drivers/mmc/host/dw_mmc.c  | 6 +++++-
 include/linux/mmc/dw_mmc.h | 2 ++
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c
index 10b697986283..fcff3c042f69 100644
--- a/drivers/mmc/host/dw_mmc.c
+++ b/drivers/mmc/host/dw_mmc.c
@@ -287,7 +287,7 @@ static void send_stop_cmd(struct dw_mci *host, struct mmc_data *data)
 /* DMA interface functions */
 static void dw_mci_stop_dma(struct dw_mci *host)
 {
-	if (host->use_dma) {
+	if (host->using_dma) {
 		host->dma_ops->stop(host);
 		host->dma_ops->cleanup(host);
 	} else {
@@ -435,6 +435,8 @@ static int dw_mci_submit_data_dma(struct dw_mci *host, struct mmc_data *data)
 	unsigned int i, direction, sg_len;
 	u32 temp;
 
+	host->using_dma = 0;
+
 	/* If we don't have a channel, we can't do DMA */
 	if (!host->use_dma)
 		return -ENODEV;
@@ -454,6 +456,8 @@ static int dw_mci_submit_data_dma(struct dw_mci *host, struct mmc_data *data)
 			return -EINVAL;
 	}
 
+	host->using_dma = 1;
+
 	if (data->flags & MMC_DATA_READ)
 		direction = DMA_FROM_DEVICE;
 	else
diff --git a/include/linux/mmc/dw_mmc.h b/include/linux/mmc/dw_mmc.h
index f3f68ee08a1e..6b46819705d1 100644
--- a/include/linux/mmc/dw_mmc.h
+++ b/include/linux/mmc/dw_mmc.h
@@ -48,6 +48,7 @@ struct mmc_data;
  * @data: The data currently being transferred, or NULL if no data
  *	transfer is in progress.
  * @use_dma: Whether DMA channel is initialized or not.
+ * @using_dma: Whether DMA is in use for the current transfer.
  * @sg_dma: Bus address of DMA buffer.
  * @sg_cpu: Virtual address of DMA buffer.
  * @dma_ops: Pointer to platform-specific DMA callbacks.
@@ -121,6 +122,7 @@ struct dw_mci {
 
 	/* DMA interface members*/
 	int			use_dma;
+	int			using_dma;
 
 	dma_addr_t		sg_dma;
 	void			*sg_cpu;
-- 
cgit v1.2.3


From aa8b683a7d392271ed349c6ab9f36b8c313794b7 Mon Sep 17 00:00:00 2001
From: Per Forlin <per.forlin@linaro.org>
Date: Fri, 1 Jul 2011 18:55:22 +0200
Subject: mmc: core: add non-blocking mmc request function

Previously there has only been one function mmc_wait_for_req()
to start and wait for a request. This patch adds:

 * mmc_start_req() - starts a request wihtout waiting
   If there is on ongoing request wait for completion
   of that request and start the new one and return.
   Does not wait for the new command to complete.

This patch also adds new function members in struct mmc_host_ops
only called from core.c:

 * pre_req - asks the host driver to prepare for the next job
 * post_req - asks the host driver to clean up after a completed job

The intention is to use pre_req() and post_req() to do cache maintenance
while a request is active. pre_req() can be called while a request is
active to minimize latency to start next job. post_req() can be used after
the next job is started to clean up the request. This will minimize the
host driver request end latency. post_req() is typically used before
ending the block request and handing over the buffer to the block layer.

Add a host-private member in mmc_data to be used by pre_req to mark the
data. The host driver will then check this mark to see if the data is
prepared or not.

Signed-off-by: Per Forlin <per.forlin@linaro.org>
Acked-by: Kyungmin Park <kyungmin.park@samsung.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Venkatraman S <svenkatr@ti.com>
Tested-by: Sourav Poddar <sourav.poddar@ti.com>
Tested-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Chris Ball <cjb@laptop.org>
---
 drivers/mmc/core/core.c  | 113 ++++++++++++++++++++++++++++++++++++++++++-----
 include/linux/mmc/core.h |   6 ++-
 include/linux/mmc/host.h |  21 +++++++++
 3 files changed, 129 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c
index ac82865b8c2f..ab36c7b491f3 100644
--- a/drivers/mmc/core/core.c
+++ b/drivers/mmc/core/core.c
@@ -198,9 +198,109 @@ mmc_start_request(struct mmc_host *host, struct mmc_request *mrq)
 
 static void mmc_wait_done(struct mmc_request *mrq)
 {
-	complete(mrq->done_data);
+	complete(&mrq->completion);
 }
 
+static void __mmc_start_req(struct mmc_host *host, struct mmc_request *mrq)
+{
+	init_completion(&mrq->completion);
+	mrq->done = mmc_wait_done;
+	mmc_start_request(host, mrq);
+}
+
+static void mmc_wait_for_req_done(struct mmc_host *host,
+				  struct mmc_request *mrq)
+{
+	wait_for_completion(&mrq->completion);
+}
+
+/**
+ *	mmc_pre_req - Prepare for a new request
+ *	@host: MMC host to prepare command
+ *	@mrq: MMC request to prepare for
+ *	@is_first_req: true if there is no previous started request
+ *                     that may run in parellel to this call, otherwise false
+ *
+ *	mmc_pre_req() is called in prior to mmc_start_req() to let
+ *	host prepare for the new request. Preparation of a request may be
+ *	performed while another request is running on the host.
+ */
+static void mmc_pre_req(struct mmc_host *host, struct mmc_request *mrq,
+		 bool is_first_req)
+{
+	if (host->ops->pre_req)
+		host->ops->pre_req(host, mrq, is_first_req);
+}
+
+/**
+ *	mmc_post_req - Post process a completed request
+ *	@host: MMC host to post process command
+ *	@mrq: MMC request to post process for
+ *	@err: Error, if non zero, clean up any resources made in pre_req
+ *
+ *	Let the host post process a completed request. Post processing of
+ *	a request may be performed while another reuqest is running.
+ */
+static void mmc_post_req(struct mmc_host *host, struct mmc_request *mrq,
+			 int err)
+{
+	if (host->ops->post_req)
+		host->ops->post_req(host, mrq, err);
+}
+
+/**
+ *	mmc_start_req - start a non-blocking request
+ *	@host: MMC host to start command
+ *	@areq: async request to start
+ *	@error: out parameter returns 0 for success, otherwise non zero
+ *
+ *	Start a new MMC custom command request for a host.
+ *	If there is on ongoing async request wait for completion
+ *	of that request and start the new one and return.
+ *	Does not wait for the new request to complete.
+ *
+ *      Returns the completed request, NULL in case of none completed.
+ *	Wait for the an ongoing request (previoulsy started) to complete and
+ *	return the completed request. If there is no ongoing request, NULL
+ *	is returned without waiting. NULL is not an error condition.
+ */
+struct mmc_async_req *mmc_start_req(struct mmc_host *host,
+				    struct mmc_async_req *areq, int *error)
+{
+	int err = 0;
+	struct mmc_async_req *data = host->areq;
+
+	/* Prepare a new request */
+	if (areq)
+		mmc_pre_req(host, areq->mrq, !host->areq);
+
+	if (host->areq) {
+		mmc_wait_for_req_done(host, host->areq->mrq);
+		err = host->areq->err_check(host->card, host->areq);
+		if (err) {
+			mmc_post_req(host, host->areq->mrq, 0);
+			if (areq)
+				mmc_post_req(host, areq->mrq, -EINVAL);
+
+			host->areq = NULL;
+			goto out;
+		}
+	}
+
+	if (areq)
+		__mmc_start_req(host, areq->mrq);
+
+	if (host->areq)
+		mmc_post_req(host, host->areq->mrq, 0);
+
+	host->areq = areq;
+ out:
+	if (error)
+		*error = err;
+	return data;
+}
+EXPORT_SYMBOL(mmc_start_req);
+
 /**
  *	mmc_wait_for_req - start a request and wait for completion
  *	@host: MMC host to start command
@@ -212,16 +312,9 @@ static void mmc_wait_done(struct mmc_request *mrq)
  */
 void mmc_wait_for_req(struct mmc_host *host, struct mmc_request *mrq)
 {
-	DECLARE_COMPLETION_ONSTACK(complete);
-
-	mrq->done_data = &complete;
-	mrq->done = mmc_wait_done;
-
-	mmc_start_request(host, mrq);
-
-	wait_for_completion(&complete);
+	__mmc_start_req(host, mrq);
+	mmc_wait_for_req_done(host, mrq);
 }
-
 EXPORT_SYMBOL(mmc_wait_for_req);
 
 /**
diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h
index 86d81cf75b70..b8b1b7a311f1 100644
--- a/include/linux/mmc/core.h
+++ b/include/linux/mmc/core.h
@@ -117,6 +117,7 @@ struct mmc_data {
 
 	unsigned int		sg_len;		/* size of scatter list */
 	struct scatterlist	*sg;		/* I/O scatter list */
+	s32			host_cookie;	/* host private data */
 };
 
 struct mmc_request {
@@ -125,13 +126,16 @@ struct mmc_request {
 	struct mmc_data		*data;
 	struct mmc_command	*stop;
 
-	void			*done_data;	/* completion data */
+	struct completion	completion;
 	void			(*done)(struct mmc_request *);/* completion function */
 };
 
 struct mmc_host;
 struct mmc_card;
+struct mmc_async_req;
 
+extern struct mmc_async_req *mmc_start_req(struct mmc_host *,
+					   struct mmc_async_req *, int *);
 extern void mmc_wait_for_req(struct mmc_host *, struct mmc_request *);
 extern int mmc_wait_for_cmd(struct mmc_host *, struct mmc_command *, int);
 extern int mmc_app_cmd(struct mmc_host *, struct mmc_card *);
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index c67d19b5542a..9f9a4c64cde7 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -106,6 +106,15 @@ struct mmc_host_ops {
 	 */
 	int (*enable)(struct mmc_host *host);
 	int (*disable)(struct mmc_host *host, int lazy);
+	/*
+	 * It is optional for the host to implement pre_req and post_req in
+	 * order to support double buffering of requests (prepare one
+	 * request while another request is active).
+	 */
+	void	(*post_req)(struct mmc_host *host, struct mmc_request *req,
+			    int err);
+	void	(*pre_req)(struct mmc_host *host, struct mmc_request *req,
+			   bool is_first_req);
 	void	(*request)(struct mmc_host *host, struct mmc_request *req);
 	/*
 	 * Avoid calling these three functions too often or in a "fast path",
@@ -144,6 +153,16 @@ struct mmc_host_ops {
 struct mmc_card;
 struct device;
 
+struct mmc_async_req {
+	/* active mmc request */
+	struct mmc_request	*mrq;
+	/*
+	 * Check error status of completed mmc request.
+	 * Returns 0 if success otherwise non zero.
+	 */
+	int (*err_check) (struct mmc_card *, struct mmc_async_req *);
+};
+
 struct mmc_host {
 	struct device		*parent;
 	struct device		class_dev;
@@ -282,6 +301,8 @@ struct mmc_host {
 
 	struct dentry		*debugfs_root;
 
+	struct mmc_async_req	*areq;		/* active async req */
+
 	unsigned long		private[0] ____cacheline_aligned;
 };
 
-- 
cgit v1.2.3


From ca8e99b32e3863c98ac958617cc157a00bf445b8 Mon Sep 17 00:00:00 2001
From: Philip Rakity <prakity@marvell.com>
Date: Wed, 6 Jul 2011 08:51:32 -0700
Subject: mmc: core: Set non-default Drive Strength via platform hook

Non default Drive Strength cannot be set automatically.  It is a function
of the board design and only if there is a specific platform handler can
it be set.  The platform handler needs to take into account the board
design.  Pass to the platform code the necessary information.

For example:  The card and host controller may indicate they support HIGH
and LOW drive strength.  There is no way to know what should be chosen
without specific board knowledge.  Setting HIGH may lead to reflections
and setting LOW may not suffice.  There is no mechanism (like ethernet
duplex or speed pulses) to determine what should be done automatically.

If no platform handler is defined -- use the default value.

Signed-off-by: Philip Rakity <prakity@marvell.com>
Reviewed-by: Arindam Nath <arindam.nath@amd.com>
Signed-off-by: Chris Ball <cjb@laptop.org>
---
 drivers/mmc/core/sd.c    | 68 +++++++++++++++++++++++++++---------------------
 include/linux/mmc/host.h |  1 +
 2 files changed, 40 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c
index ff2774128aa9..633975ff2bb3 100644
--- a/drivers/mmc/core/sd.c
+++ b/drivers/mmc/core/sd.c
@@ -409,52 +409,62 @@ out:
 
 static int sd_select_driver_type(struct mmc_card *card, u8 *status)
 {
-	int host_drv_type = 0, card_drv_type = 0;
+	int host_drv_type = SD_DRIVER_TYPE_B;
+	int card_drv_type = SD_DRIVER_TYPE_B;
+	int drive_strength;
 	int err;
 
 	/*
 	 * If the host doesn't support any of the Driver Types A,C or D,
-	 * default Driver Type B is used.
+	 * or there is no board specific handler then default Driver
+	 * Type B is used.
 	 */
 	if (!(card->host->caps & (MMC_CAP_DRIVER_TYPE_A | MMC_CAP_DRIVER_TYPE_C
 	    | MMC_CAP_DRIVER_TYPE_D)))
 		return 0;
 
-	if (card->host->caps & MMC_CAP_DRIVER_TYPE_A) {
-		host_drv_type = MMC_SET_DRIVER_TYPE_A;
-		if (card->sw_caps.sd3_drv_type & SD_DRIVER_TYPE_A)
-			card_drv_type = MMC_SET_DRIVER_TYPE_A;
-		else if (card->sw_caps.sd3_drv_type & SD_DRIVER_TYPE_B)
-			card_drv_type = MMC_SET_DRIVER_TYPE_B;
-		else if (card->sw_caps.sd3_drv_type & SD_DRIVER_TYPE_C)
-			card_drv_type = MMC_SET_DRIVER_TYPE_C;
-	} else if (card->host->caps & MMC_CAP_DRIVER_TYPE_C) {
-		host_drv_type = MMC_SET_DRIVER_TYPE_C;
-		if (card->sw_caps.sd3_drv_type & SD_DRIVER_TYPE_C)
-			card_drv_type = MMC_SET_DRIVER_TYPE_C;
-	} else if (!(card->host->caps & MMC_CAP_DRIVER_TYPE_D)) {
-		/*
-		 * If we are here, that means only the default driver type
-		 * B is supported by the host.
-		 */
-		host_drv_type = MMC_SET_DRIVER_TYPE_B;
-		if (card->sw_caps.sd3_drv_type & SD_DRIVER_TYPE_B)
-			card_drv_type = MMC_SET_DRIVER_TYPE_B;
-		else if (card->sw_caps.sd3_drv_type & SD_DRIVER_TYPE_C)
-			card_drv_type = MMC_SET_DRIVER_TYPE_C;
-	}
+	if (!card->host->ops->select_drive_strength)
+		return 0;
+
+	if (card->host->caps & MMC_CAP_DRIVER_TYPE_A)
+		host_drv_type |= SD_DRIVER_TYPE_A;
+
+	if (card->host->caps & MMC_CAP_DRIVER_TYPE_C)
+		host_drv_type |= SD_DRIVER_TYPE_C;
+
+	if (card->host->caps & MMC_CAP_DRIVER_TYPE_D)
+		host_drv_type |= SD_DRIVER_TYPE_D;
+
+	if (card->sw_caps.sd3_drv_type & SD_DRIVER_TYPE_A)
+		card_drv_type |= SD_DRIVER_TYPE_A;
+
+	if (card->sw_caps.sd3_drv_type & SD_DRIVER_TYPE_C)
+		card_drv_type |= SD_DRIVER_TYPE_C;
+
+	if (card->sw_caps.sd3_drv_type & SD_DRIVER_TYPE_D)
+		card_drv_type |= SD_DRIVER_TYPE_D;
+
+	/*
+	 * The drive strength that the hardware can support
+	 * depends on the board design.  Pass the appropriate
+	 * information and let the hardware specific code
+	 * return what is possible given the options
+	 */
+	drive_strength = card->host->ops->select_drive_strength(
+		card->sw_caps.uhs_max_dtr,
+		host_drv_type, card_drv_type);
 
-	err = mmc_sd_switch(card, 1, 2, card_drv_type, status);
+	err = mmc_sd_switch(card, 1, 2, drive_strength, status);
 	if (err)
 		return err;
 
-	if ((status[15] & 0xF) != card_drv_type) {
-		printk(KERN_WARNING "%s: Problem setting driver strength!\n",
+	if ((status[15] & 0xF) != drive_strength) {
+		printk(KERN_WARNING "%s: Problem setting drive strength!\n",
 			mmc_hostname(card->host));
 		return 0;
 	}
 
-	mmc_set_driver_type(card->host, host_drv_type);
+	mmc_set_driver_type(card->host, drive_strength);
 
 	return 0;
 }
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 9f9a4c64cde7..0f83858147a6 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -148,6 +148,7 @@ struct mmc_host_ops {
 	int	(*start_signal_voltage_switch)(struct mmc_host *host, struct mmc_ios *ios);
 	int	(*execute_tuning)(struct mmc_host *host);
 	void	(*enable_preset_value)(struct mmc_host *host, bool enable);
+	int	(*select_drive_strength)(unsigned int max_dtr, int host_drv, int card_drv);
 };
 
 struct mmc_card;
-- 
cgit v1.2.3


From b0d40c92adafde7c2d81203ce7c1c69275f41140 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 8 Jul 2011 14:14:42 +1000
Subject: superblock: introduce per-sb cache shrinker infrastructure

With context based shrinkers, we can implement a per-superblock
shrinker that shrinks the caches attached to the superblock. We
currently have global shrinkers for the inode and dentry caches that
split up into per-superblock operations via a coarse proportioning
method that does not batch very well.  The global shrinkers also
have a dependency - dentries pin inodes - so we have to be very
careful about how we register the global shrinkers so that the
implicit call order is always correct.

With a per-sb shrinker callout, we can encode this dependency
directly into the per-sb shrinker, hence avoiding the need for
strictly ordering shrinker registrations. We also have no need for
any proportioning code for the shrinker subsystem already provides
this functionality across all shrinkers. Allowing the shrinker to
operate on a single superblock at a time means that we do less
superblock list traversals and locking and reclaim should batch more
effectively. This should result in less CPU overhead for reclaim and
potentially faster reclaim of items from each filesystem.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c              | 121 +++++------------------------------------------
 fs/inode.c               | 117 ++++-----------------------------------------
 fs/super.c               |  51 +++++++++++++++++++-
 include/linux/fs.h       |   7 +++
 include/linux/mm.h       |  40 +---------------
 include/linux/shrinker.h |  42 ++++++++++++++++
 6 files changed, 121 insertions(+), 257 deletions(-)
 create mode 100644 include/linux/shrinker.h

(limited to 'include/linux')

diff --git a/fs/dcache.c b/fs/dcache.c
index 41e2085d430b..2762804a140d 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -743,13 +743,11 @@ static void shrink_dentry_list(struct list_head *list)
  *
  * If flags contains DCACHE_REFERENCED reference dentries will not be pruned.
  */
-static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
+static void __shrink_dcache_sb(struct super_block *sb, int count, int flags)
 {
-	/* called from prune_dcache() and shrink_dcache_parent() */
 	struct dentry *dentry;
 	LIST_HEAD(referenced);
 	LIST_HEAD(tmp);
-	int cnt = *count;
 
 relock:
 	spin_lock(&dcache_lru_lock);
@@ -777,7 +775,7 @@ relock:
 		} else {
 			list_move_tail(&dentry->d_lru, &tmp);
 			spin_unlock(&dentry->d_lock);
-			if (!--cnt)
+			if (!--count)
 				break;
 		}
 		cond_resched_lock(&dcache_lru_lock);
@@ -787,83 +785,22 @@ relock:
 	spin_unlock(&dcache_lru_lock);
 
 	shrink_dentry_list(&tmp);
-
-	*count = cnt;
 }
 
 /**
- * prune_dcache - shrink the dcache
- * @count: number of entries to try to free
+ * prune_dcache_sb - shrink the dcache
+ * @nr_to_scan: number of entries to try to free
  *
- * Shrink the dcache. This is done when we need more memory, or simply when we
- * need to unmount something (at which point we need to unuse all dentries).
+ * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is
+ * done when we need more memory an called from the superblock shrinker
+ * function.
  *
- * This function may fail to free any resources if all the dentries are in use.
+ * This function may fail to free any resources if all the dentries are in
+ * use.
  */
-static void prune_dcache(int count)
+void prune_dcache_sb(struct super_block *sb, int nr_to_scan)
 {
-	struct super_block *sb, *p = NULL;
-	int w_count;
-	int unused = dentry_stat.nr_unused;
-	int prune_ratio;
-	int pruned;
-
-	if (unused == 0 || count == 0)
-		return;
-	if (count >= unused)
-		prune_ratio = 1;
-	else
-		prune_ratio = unused / count;
-	spin_lock(&sb_lock);
-	list_for_each_entry(sb, &super_blocks, s_list) {
-		if (list_empty(&sb->s_instances))
-			continue;
-		if (sb->s_nr_dentry_unused == 0)
-			continue;
-		sb->s_count++;
-		/* Now, we reclaim unused dentrins with fairness.
-		 * We reclaim them same percentage from each superblock.
-		 * We calculate number of dentries to scan on this sb
-		 * as follows, but the implementation is arranged to avoid
-		 * overflows:
-		 * number of dentries to scan on this sb =
-		 * count * (number of dentries on this sb /
-		 * number of dentries in the machine)
-		 */
-		spin_unlock(&sb_lock);
-		if (prune_ratio != 1)
-			w_count = (sb->s_nr_dentry_unused / prune_ratio) + 1;
-		else
-			w_count = sb->s_nr_dentry_unused;
-		pruned = w_count;
-		/*
-		 * We need to be sure this filesystem isn't being unmounted,
-		 * otherwise we could race with generic_shutdown_super(), and
-		 * end up holding a reference to an inode while the filesystem
-		 * is unmounted.  So we try to get s_umount, and make sure
-		 * s_root isn't NULL.
-		 */
-		if (down_read_trylock(&sb->s_umount)) {
-			if ((sb->s_root != NULL) &&
-			    (!list_empty(&sb->s_dentry_lru))) {
-				__shrink_dcache_sb(sb, &w_count,
-						DCACHE_REFERENCED);
-				pruned -= w_count;
-			}
-			up_read(&sb->s_umount);
-		}
-		spin_lock(&sb_lock);
-		if (p)
-			__put_super(p);
-		count -= pruned;
-		p = sb;
-		/* more work left to do? */
-		if (count <= 0)
-			break;
-	}
-	if (p)
-		__put_super(p);
-	spin_unlock(&sb_lock);
+	__shrink_dcache_sb(sb, nr_to_scan, DCACHE_REFERENCED);
 }
 
 /**
@@ -1238,42 +1175,10 @@ void shrink_dcache_parent(struct dentry * parent)
 	int found;
 
 	while ((found = select_parent(parent)) != 0)
-		__shrink_dcache_sb(sb, &found, 0);
+		__shrink_dcache_sb(sb, found, 0);
 }
 EXPORT_SYMBOL(shrink_dcache_parent);
 
-/*
- * Scan `sc->nr_slab_to_reclaim' dentries and return the number which remain.
- *
- * We need to avoid reentering the filesystem if the caller is performing a
- * GFP_NOFS allocation attempt.  One example deadlock is:
- *
- * ext2_new_block->getblk->GFP->shrink_dcache_memory->prune_dcache->
- * prune_one_dentry->dput->dentry_iput->iput->inode->i_sb->s_op->put_inode->
- * ext2_discard_prealloc->ext2_free_blocks->lock_super->DEADLOCK.
- *
- * In this case we return -1 to tell the caller that we baled.
- */
-static int shrink_dcache_memory(struct shrinker *shrink,
-				struct shrink_control *sc)
-{
-	int nr = sc->nr_to_scan;
-	gfp_t gfp_mask = sc->gfp_mask;
-
-	if (nr) {
-		if (!(gfp_mask & __GFP_FS))
-			return -1;
-		prune_dcache(nr);
-	}
-
-	return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
-}
-
-static struct shrinker dcache_shrinker = {
-	.shrink = shrink_dcache_memory,
-	.seeks = DEFAULT_SEEKS,
-};
-
 /**
  * __d_alloc	-	allocate a dcache entry
  * @sb: filesystem it will belong to
@@ -3083,8 +2988,6 @@ static void __init dcache_init(void)
 	 */
 	dentry_cache = KMEM_CACHE(dentry,
 		SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD);
-	
-	register_shrinker(&dcache_shrinker);
 
 	/* Hash may have been set up in dcache_init_early */
 	if (!hashdist)
diff --git a/fs/inode.c b/fs/inode.c
index 0450e25aeda0..1fdbb64a952f 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -73,7 +73,7 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock);
  *
  * We don't actually need it to protect anything in the umount path,
  * but only need to cycle through it to make sure any inode that
- * prune_icache took off the LRU list has been fully torn down by the
+ * prune_icache_sb took off the LRU list has been fully torn down by the
  * time we are past evict_inodes.
  */
 static DECLARE_RWSEM(iprune_sem);
@@ -544,7 +544,7 @@ void evict_inodes(struct super_block *sb)
 	dispose_list(&dispose);
 
 	/*
-	 * Cycle through iprune_sem to make sure any inode that prune_icache
+	 * Cycle through iprune_sem to make sure any inode that prune_icache_sb
 	 * moved off the list before we took the lock has been fully torn
 	 * down.
 	 */
@@ -612,9 +612,10 @@ static int can_unuse(struct inode *inode)
 }
 
 /*
- * Scan `goal' inodes on the unused list for freeable ones. They are moved to a
- * temporary list and then are freed outside sb->s_inode_lru_lock by
- * dispose_list().
+ * Walk the superblock inode LRU for freeable inodes and attempt to free them.
+ * This is called from the superblock shrinker function with a number of inodes
+ * to trim from the LRU. Inodes to be freed are moved to a temporary list and
+ * then are freed outside inode_lock by dispose_list().
  *
  * Any inodes which are pinned purely because of attached pagecache have their
  * pagecache removed.  If the inode has metadata buffers attached to
@@ -628,14 +629,15 @@ static int can_unuse(struct inode *inode)
  * LRU does not have strict ordering. Hence we don't want to reclaim inodes
  * with this flag set because they are the inodes that are out of order.
  */
-static void shrink_icache_sb(struct super_block *sb, int *nr_to_scan)
+void prune_icache_sb(struct super_block *sb, int nr_to_scan)
 {
 	LIST_HEAD(freeable);
 	int nr_scanned;
 	unsigned long reap = 0;
 
+	down_read(&iprune_sem);
 	spin_lock(&sb->s_inode_lru_lock);
-	for (nr_scanned = *nr_to_scan; nr_scanned >= 0; nr_scanned--) {
+	for (nr_scanned = nr_to_scan; nr_scanned >= 0; nr_scanned--) {
 		struct inode *inode;
 
 		if (list_empty(&sb->s_inode_lru))
@@ -707,111 +709,11 @@ static void shrink_icache_sb(struct super_block *sb, int *nr_to_scan)
 	else
 		__count_vm_events(PGINODESTEAL, reap);
 	spin_unlock(&sb->s_inode_lru_lock);
-	*nr_to_scan = nr_scanned;
 
 	dispose_list(&freeable);
-}
-
-static void prune_icache(int count)
-{
-	struct super_block *sb, *p = NULL;
-	int w_count;
-	int unused = inodes_stat.nr_unused;
-	int prune_ratio;
-	int pruned;
-
-	if (unused == 0 || count == 0)
-		return;
-	down_read(&iprune_sem);
-	if (count >= unused)
-		prune_ratio = 1;
-	else
-		prune_ratio = unused / count;
-	spin_lock(&sb_lock);
-	list_for_each_entry(sb, &super_blocks, s_list) {
-		if (list_empty(&sb->s_instances))
-			continue;
-		if (sb->s_nr_inodes_unused == 0)
-			continue;
-		sb->s_count++;
-		/* Now, we reclaim unused dentrins with fairness.
-		 * We reclaim them same percentage from each superblock.
-		 * We calculate number of dentries to scan on this sb
-		 * as follows, but the implementation is arranged to avoid
-		 * overflows:
-		 * number of dentries to scan on this sb =
-		 * count * (number of dentries on this sb /
-		 * number of dentries in the machine)
-		 */
-		spin_unlock(&sb_lock);
-		if (prune_ratio != 1)
-			w_count = (sb->s_nr_inodes_unused / prune_ratio) + 1;
-		else
-			w_count = sb->s_nr_inodes_unused;
-		pruned = w_count;
-		/*
-		 * We need to be sure this filesystem isn't being unmounted,
-		 * otherwise we could race with generic_shutdown_super(), and
-		 * end up holding a reference to an inode while the filesystem
-		 * is unmounted.  So we try to get s_umount, and make sure
-		 * s_root isn't NULL.
-		 */
-		if (down_read_trylock(&sb->s_umount)) {
-			if ((sb->s_root != NULL) &&
-			    (!list_empty(&sb->s_dentry_lru))) {
-				shrink_icache_sb(sb, &w_count);
-				pruned -= w_count;
-			}
-			up_read(&sb->s_umount);
-		}
-		spin_lock(&sb_lock);
-		if (p)
-			__put_super(p);
-		count -= pruned;
-		p = sb;
-		/* more work left to do? */
-		if (count <= 0)
-			break;
-	}
-	if (p)
-		__put_super(p);
-	spin_unlock(&sb_lock);
 	up_read(&iprune_sem);
 }
 
-/*
- * shrink_icache_memory() will attempt to reclaim some unused inodes.  Here,
- * "unused" means that no dentries are referring to the inodes: the files are
- * not open and the dcache references to those inodes have already been
- * reclaimed.
- *
- * This function is passed the number of inodes to scan, and it returns the
- * total number of remaining possibly-reclaimable inodes.
- */
-static int shrink_icache_memory(struct shrinker *shrink,
-				struct shrink_control *sc)
-{
-	int nr = sc->nr_to_scan;
-	gfp_t gfp_mask = sc->gfp_mask;
-
-	if (nr) {
-		/*
-		 * Nasty deadlock avoidance.  We may hold various FS locks,
-		 * and we don't want to recurse into the FS that called us
-		 * in clear_inode() and friends..
-		 */
-		if (!(gfp_mask & __GFP_FS))
-			return -1;
-		prune_icache(nr);
-	}
-	return (get_nr_inodes_unused() / 100) * sysctl_vfs_cache_pressure;
-}
-
-static struct shrinker icache_shrinker = {
-	.shrink = shrink_icache_memory,
-	.seeks = DEFAULT_SEEKS,
-};
-
 static void __wait_on_freeing_inode(struct inode *inode);
 /*
  * Called with the inode lock held.
@@ -1691,7 +1593,6 @@ void __init inode_init(void)
 					 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
 					 SLAB_MEM_SPREAD),
 					 init_once);
-	register_shrinker(&icache_shrinker);
 
 	/* Hash may have been set up in inode_init_early */
 	if (!hashdist)
diff --git a/fs/super.c b/fs/super.c
index e63c754447ce..37a75410079e 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -38,6 +38,48 @@
 LIST_HEAD(super_blocks);
 DEFINE_SPINLOCK(sb_lock);
 
+/*
+ * One thing we have to be careful of with a per-sb shrinker is that we don't
+ * drop the last active reference to the superblock from within the shrinker.
+ * If that happens we could trigger unregistering the shrinker from within the
+ * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we
+ * take a passive reference to the superblock to avoid this from occurring.
+ */
+static int prune_super(struct shrinker *shrink, struct shrink_control *sc)
+{
+	struct super_block *sb;
+	int count;
+
+	sb = container_of(shrink, struct super_block, s_shrink);
+
+	/*
+	 * Deadlock avoidance.  We may hold various FS locks, and we don't want
+	 * to recurse into the FS that called us in clear_inode() and friends..
+	 */
+	if (sc->nr_to_scan && !(sc->gfp_mask & __GFP_FS))
+		return -1;
+
+	if (!grab_super_passive(sb))
+		return -1;
+
+	if (sc->nr_to_scan) {
+		/* proportion the scan between the two caches */
+		int total;
+
+		total = sb->s_nr_dentry_unused + sb->s_nr_inodes_unused + 1;
+		count = (sc->nr_to_scan * sb->s_nr_dentry_unused) / total;
+
+		/* prune dcache first as icache is pinned by it */
+		prune_dcache_sb(sb, count);
+		prune_icache_sb(sb, sc->nr_to_scan - count);
+	}
+
+	count = ((sb->s_nr_dentry_unused + sb->s_nr_inodes_unused) / 100)
+						* sysctl_vfs_cache_pressure;
+	drop_super(sb);
+	return count;
+}
+
 /**
  *	alloc_super	-	create new superblock
  *	@type:	filesystem type superblock should belong to
@@ -116,6 +158,9 @@ static struct super_block *alloc_super(struct file_system_type *type)
 		s->s_op = &default_op;
 		s->s_time_gran = 1000000000;
 		s->cleancache_poolid = -1;
+
+		s->s_shrink.seeks = DEFAULT_SEEKS;
+		s->s_shrink.shrink = prune_super;
 	}
 out:
 	return s;
@@ -183,6 +228,10 @@ void deactivate_locked_super(struct super_block *s)
 	if (atomic_dec_and_test(&s->s_active)) {
 		cleancache_flush_fs(s);
 		fs->kill_sb(s);
+
+		/* caches are now gone, we can safely kill the shrinker now */
+		unregister_shrinker(&s->s_shrink);
+
 		/*
 		 * We need to call rcu_barrier so all the delayed rcu free
 		 * inodes are flushed before we release the fs module.
@@ -311,7 +360,6 @@ void generic_shutdown_super(struct super_block *sb)
 {
 	const struct super_operations *sop = sb->s_op;
 
-
 	if (sb->s_root) {
 		shrink_dcache_for_umount(sb);
 		sync_filesystem(sb);
@@ -399,6 +447,7 @@ retry:
 	list_add(&s->s_instances, &type->fs_supers);
 	spin_unlock(&sb_lock);
 	get_filesystem(type);
+	register_shrinker(&s->s_shrink);
 	return s;
 }
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 460d2cc21ec6..d7f35e90b84a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -393,6 +393,7 @@ struct inodes_stat_t {
 #include <linux/semaphore.h>
 #include <linux/fiemap.h>
 #include <linux/rculist_bl.h>
+#include <linux/shrinker.h>
 #include <linux/atomic.h>
 
 #include <asm/byteorder.h>
@@ -1444,8 +1445,14 @@ struct super_block {
 	 * Saved pool identifier for cleancache (-1 means none)
 	 */
 	int cleancache_poolid;
+
+	struct shrinker s_shrink;	/* per-sb shrinker handle */
 };
 
+/* superblock cache pruning functions */
+extern void prune_icache_sb(struct super_block *sb, int nr_to_scan);
+extern void prune_dcache_sb(struct super_block *sb, int nr_to_scan);
+
 extern struct timespec current_fs_time(struct super_block *sb);
 
 /*
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9b9777ac726d..e3a1a9eec0b1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -15,6 +15,7 @@
 #include <linux/range.h>
 #include <linux/pfn.h>
 #include <linux/bit_spinlock.h>
+#include <linux/shrinker.h>
 
 struct mempolicy;
 struct anon_vma;
@@ -1121,45 +1122,6 @@ static inline void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
 }
 #endif
 
-/*
- * This struct is used to pass information from page reclaim to the shrinkers.
- * We consolidate the values for easier extention later.
- */
-struct shrink_control {
-	gfp_t gfp_mask;
-
-	/* How many slab objects shrinker() should scan and try to reclaim */
-	unsigned long nr_to_scan;
-};
-
-/*
- * A callback you can register to apply pressure to ageable caches.
- *
- * 'sc' is passed shrink_control which includes a count 'nr_to_scan'
- * and a 'gfpmask'.  It should look through the least-recently-used
- * 'nr_to_scan' entries and attempt to free them up.  It should return
- * the number of objects which remain in the cache.  If it returns -1, it means
- * it cannot do any scanning at this time (eg. there is a risk of deadlock).
- *
- * The 'gfpmask' refers to the allocation we are currently trying to
- * fulfil.
- *
- * Note that 'shrink' will be passed nr_to_scan == 0 when the VM is
- * querying the cache size, so a fastpath for that case is appropriate.
- */
-struct shrinker {
-	int (*shrink)(struct shrinker *, struct shrink_control *sc);
-	int seeks;	/* seeks to recreate an obj */
-	long batch;	/* reclaim batch size, 0 = default */
-
-	/* These are for internal use */
-	struct list_head list;
-	long nr;	/* objs pending delete */
-};
-#define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
-extern void register_shrinker(struct shrinker *);
-extern void unregister_shrinker(struct shrinker *);
-
 int vma_wants_writenotify(struct vm_area_struct *vma);
 
 extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
new file mode 100644
index 000000000000..790651b4e5ba
--- /dev/null
+++ b/include/linux/shrinker.h
@@ -0,0 +1,42 @@
+#ifndef _LINUX_SHRINKER_H
+#define _LINUX_SHRINKER_H
+
+/*
+ * This struct is used to pass information from page reclaim to the shrinkers.
+ * We consolidate the values for easier extention later.
+ */
+struct shrink_control {
+	gfp_t gfp_mask;
+
+	/* How many slab objects shrinker() should scan and try to reclaim */
+	unsigned long nr_to_scan;
+};
+
+/*
+ * A callback you can register to apply pressure to ageable caches.
+ *
+ * 'sc' is passed shrink_control which includes a count 'nr_to_scan'
+ * and a 'gfpmask'.  It should look through the least-recently-used
+ * 'nr_to_scan' entries and attempt to free them up.  It should return
+ * the number of objects which remain in the cache.  If it returns -1, it means
+ * it cannot do any scanning at this time (eg. there is a risk of deadlock).
+ *
+ * The 'gfpmask' refers to the allocation we are currently trying to
+ * fulfil.
+ *
+ * Note that 'shrink' will be passed nr_to_scan == 0 when the VM is
+ * querying the cache size, so a fastpath for that case is appropriate.
+ */
+struct shrinker {
+	int (*shrink)(struct shrinker *, struct shrink_control *sc);
+	int seeks;	/* seeks to recreate an obj */
+	long batch;	/* reclaim batch size, 0 = default */
+
+	/* These are for internal use */
+	struct list_head list;
+	long nr;	/* objs pending delete */
+};
+#define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
+extern void register_shrinker(struct shrinker *);
+extern void unregister_shrinker(struct shrinker *);
+#endif
-- 
cgit v1.2.3


From 0e1fdafd93980eac62e778798549ce0f6073905c Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 8 Jul 2011 14:14:44 +1000
Subject: superblock: add filesystem shrinker operations

Now we have a per-superblock shrinker implementation, we can add a
filesystem specific callout to it to allow filesystem internal
caches to be shrunk by the superblock shrinker.

Rather than perpetuate the multipurpose shrinker callback API (i.e.
nr_to_scan == 0 meaning "tell me how many objects freeable in the
cache), two operations will be added. The first will return the
number of objects that are freeable, the second is the actual
shrinker call.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/vfs.txt | 16 ++++++++++++++
 fs/super.c                        | 45 ++++++++++++++++++++++++++++-----------
 include/linux/fs.h                |  2 ++
 3 files changed, 51 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index d56151fe77dc..fd24f34f120f 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -229,6 +229,8 @@ struct super_operations {
 
         ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
         ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
+	int (*nr_cached_objects)(struct super_block *);
+	void (*free_cached_objects)(struct super_block *, int);
 };
 
 All methods are called without any locks being held, unless otherwise
@@ -301,6 +303,20 @@ or bottom half).
 
   quota_write: called by the VFS to write to filesystem quota file.
 
+  nr_cached_objects: called by the sb cache shrinking function for the
+	filesystem to return the number of freeable cached objects it contains.
+	Optional.
+
+  free_cache_objects: called by the sb cache shrinking function for the
+	filesystem to scan the number of objects indicated to try to free them.
+	Optional, but any filesystem implementing this method needs to also
+	implement ->nr_cached_objects for it to be called correctly.
+
+	We can't do anything with any errors that the filesystem might
+	encountered, hence the void return type. This will never be called if
+	the VM is trying to reclaim under GFP_NOFS conditions, hence this
+	method does not need to handle that situation itself.
+
 Whoever sets up the inode is responsible for filling in the "i_op" field. This
 is a pointer to a "struct inode_operations" which describes the methods that
 can be performed on individual inodes.
diff --git a/fs/super.c b/fs/super.c
index 37a75410079e..5101f0544960 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -48,7 +48,8 @@ DEFINE_SPINLOCK(sb_lock);
 static int prune_super(struct shrinker *shrink, struct shrink_control *sc)
 {
 	struct super_block *sb;
-	int count;
+	int	fs_objects = 0;
+	int	total_objects;
 
 	sb = container_of(shrink, struct super_block, s_shrink);
 
@@ -62,22 +63,42 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc)
 	if (!grab_super_passive(sb))
 		return -1;
 
-	if (sc->nr_to_scan) {
-		/* proportion the scan between the two caches */
-		int total;
+	if (sb->s_op && sb->s_op->nr_cached_objects)
+		fs_objects = sb->s_op->nr_cached_objects(sb);
+
+	total_objects = sb->s_nr_dentry_unused +
+			sb->s_nr_inodes_unused + fs_objects + 1;
 
-		total = sb->s_nr_dentry_unused + sb->s_nr_inodes_unused + 1;
-		count = (sc->nr_to_scan * sb->s_nr_dentry_unused) / total;
+	if (sc->nr_to_scan) {
+		int	dentries;
+		int	inodes;
+
+		/* proportion the scan between the caches */
+		dentries = (sc->nr_to_scan * sb->s_nr_dentry_unused) /
+							total_objects;
+		inodes = (sc->nr_to_scan * sb->s_nr_inodes_unused) /
+							total_objects;
+		if (fs_objects)
+			fs_objects = (sc->nr_to_scan * fs_objects) /
+							total_objects;
+		/*
+		 * prune the dcache first as the icache is pinned by it, then
+		 * prune the icache, followed by the filesystem specific caches
+		 */
+		prune_dcache_sb(sb, dentries);
+		prune_icache_sb(sb, inodes);
 
-		/* prune dcache first as icache is pinned by it */
-		prune_dcache_sb(sb, count);
-		prune_icache_sb(sb, sc->nr_to_scan - count);
+		if (fs_objects && sb->s_op->free_cached_objects) {
+			sb->s_op->free_cached_objects(sb, fs_objects);
+			fs_objects = sb->s_op->nr_cached_objects(sb);
+		}
+		total_objects = sb->s_nr_dentry_unused +
+				sb->s_nr_inodes_unused + fs_objects;
 	}
 
-	count = ((sb->s_nr_dentry_unused + sb->s_nr_inodes_unused) / 100)
-						* sysctl_vfs_cache_pressure;
+	total_objects = (total_objects / 100) * sysctl_vfs_cache_pressure;
 	drop_super(sb);
-	return count;
+	return total_objects;
 }
 
 /**
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d7f35e90b84a..1393742bba9b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1655,6 +1655,8 @@ struct super_operations {
 	ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
 #endif
 	int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
+	int (*nr_cached_objects)(struct super_block *);
+	void (*free_cached_objects)(struct super_block *, int);
 };
 
 /*
-- 
cgit v1.2.3


From e46ebd27823b27273da780376d54c080250ff1a2 Mon Sep 17 00:00:00 2001
From: Tomasz Stanislawski <t.stanislaws@samsung.com>
Date: Tue, 12 Jul 2011 11:27:20 +0200
Subject: anonfd: fix missing declaration

The forward declaration of struct file_operations is
added to avoid compilation warnings.

Signed-off-by: Tomasz Stanislawski <t.stanislaws@samsung.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/anon_inodes.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/anon_inodes.h b/include/linux/anon_inodes.h
index 69a21e0ebd33..8013a45242fe 100644
--- a/include/linux/anon_inodes.h
+++ b/include/linux/anon_inodes.h
@@ -8,6 +8,8 @@
 #ifndef _LINUX_ANON_INODES_H
 #define _LINUX_ANON_INODES_H
 
+struct file_operations;
+
 struct file *anon_inode_getfile(const char *name,
 				const struct file_operations *fops,
 				void *priv, int flags);
-- 
cgit v1.2.3


From bd5fe6c5eb9c548d7f07fe8f89a150bb6705e8e3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Fri, 24 Jun 2011 14:29:43 -0400
Subject: fs: kill i_alloc_sem

i_alloc_sem is a rather special rw_semaphore.  It's the last one that may
be released by a non-owner, and it's write side is always mirrored by
real exclusion.  It's intended use it to wait for all pending direct I/O
requests to finish before starting a truncate.

Replace it with a hand-grown construct:

 - exclusion for truncates is already guaranteed by i_mutex, so it can
   simply fall way
 - the reader side is replaced by an i_dio_count member in struct inode
   that counts the number of pending direct I/O requests.  Truncate can't
   proceed as long as it's non-zero
 - when i_dio_count reaches non-zero we wake up a pending truncate using
   wake_up_bit on a new bit in i_flags
 - new references to i_dio_count can't appear while we are waiting for
   it to read zero because the direct I/O count always needs i_mutex
   (or an equivalent like XFS's i_iolock) for starting a new operation.

This scheme is much simpler, and saves the space of a spinlock_t and a
struct list_head in struct inode (typically 160 bits on a non-debug 64-bit
system).

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/attr.c           |  5 +----
 fs/direct-io.c      | 65 +++++++++++++++++++++++++++++++++++++++++------------
 fs/inode.c          |  3 +--
 fs/ntfs/file.c      |  3 +--
 fs/ntfs/inode.c     | 10 ++-------
 fs/ocfs2/aops.c     |  7 +++---
 fs/ocfs2/file.c     | 15 ++++++-------
 fs/reiserfs/xattr.c |  3 +--
 include/linux/fs.h  | 11 +++++++--
 mm/filemap.c        |  3 ---
 mm/madvise.c        |  2 +-
 mm/rmap.c           |  1 -
 mm/truncate.c       |  3 +--
 13 files changed, 78 insertions(+), 53 deletions(-)

(limited to 'include/linux')

diff --git a/fs/attr.c b/fs/attr.c
index caf2aa521e2b..f177ac86fa48 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -233,16 +233,13 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
 		return error;
 
 	if (ia_valid & ATTR_SIZE)
-		down_write(&dentry->d_inode->i_alloc_sem);
+		inode_dio_wait(inode);
 
 	if (inode->i_op->setattr)
 		error = inode->i_op->setattr(dentry, attr);
 	else
 		error = simple_setattr(dentry, attr);
 
-	if (ia_valid & ATTR_SIZE)
-		up_write(&dentry->d_inode->i_alloc_sem);
-
 	if (!error)
 		fsnotify_change(dentry, ia_valid);
 
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 98ce3ac0d94b..354cbdbc14bd 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -135,6 +135,50 @@ struct dio {
 	struct page *pages[DIO_PAGES];	/* page buffer */
 };
 
+static void __inode_dio_wait(struct inode *inode)
+{
+	wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP);
+	DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);
+
+	do {
+		prepare_to_wait(wq, &q.wait, TASK_UNINTERRUPTIBLE);
+		if (atomic_read(&inode->i_dio_count))
+			schedule();
+	} while (atomic_read(&inode->i_dio_count));
+	finish_wait(wq, &q.wait);
+}
+
+/**
+ * inode_dio_wait - wait for outstanding DIO requests to finish
+ * @inode: inode to wait for
+ *
+ * Waits for all pending direct I/O requests to finish so that we can
+ * proceed with a truncate or equivalent operation.
+ *
+ * Must be called under a lock that serializes taking new references
+ * to i_dio_count, usually by inode->i_mutex.
+ */
+void inode_dio_wait(struct inode *inode)
+{
+	if (atomic_read(&inode->i_dio_count))
+		__inode_dio_wait(inode);
+}
+EXPORT_SYMBOL_GPL(inode_dio_wait);
+
+/*
+ * inode_dio_done - signal finish of a direct I/O requests
+ * @inode: inode the direct I/O happens on
+ *
+ * This is called once we've finished processing a direct I/O request,
+ * and is used to wake up callers waiting for direct I/O to be quiesced.
+ */
+void inode_dio_done(struct inode *inode)
+{
+	if (atomic_dec_and_test(&inode->i_dio_count))
+		wake_up_bit(&inode->i_state, __I_DIO_WAKEUP);
+}
+EXPORT_SYMBOL_GPL(inode_dio_done);
+
 /*
  * How many pages are in the queue?
  */
@@ -254,9 +298,7 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is
 	}
 
 	if (dio->flags & DIO_LOCKING)
-		/* lockdep: non-owner release */
-		up_read_non_owner(&dio->inode->i_alloc_sem);
-
+		inode_dio_done(dio->inode);
 	return ret;
 }
 
@@ -980,9 +1022,6 @@ out:
 	return ret;
 }
 
-/*
- * Releases both i_mutex and i_alloc_sem
- */
 static ssize_t
 direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, 
 	const struct iovec *iov, loff_t offset, unsigned long nr_segs, 
@@ -1146,15 +1185,14 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
  *    For writes this function is called under i_mutex and returns with
  *    i_mutex held, for reads, i_mutex is not held on entry, but it is
  *    taken and dropped again before returning.
- *    For reads and writes i_alloc_sem is taken in shared mode and released
- *    on I/O completion (which may happen asynchronously after returning to
- *    the caller).
+ *    The i_dio_count counter keeps track of the number of outstanding
+ *    direct I/O requests, and truncate waits for it to reach zero.
+ *    New references to i_dio_count must only be grabbed with i_mutex
+ *    held.
  *
  *  - if the flags value does NOT contain DIO_LOCKING we don't use any
  *    internal locking but rather rely on the filesystem to synchronize
  *    direct I/O reads/writes versus each other and truncate.
- *    For reads and writes both i_mutex and i_alloc_sem are not held on
- *    entry and are never taken.
  */
 ssize_t
 __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
@@ -1234,10 +1272,9 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 		}
 
 		/*
-		 * Will be released at I/O completion, possibly in a
-		 * different thread.
+		 * Will be decremented at I/O completion time.
 		 */
-		down_read_non_owner(&inode->i_alloc_sem);
+		atomic_inc(&inode->i_dio_count);
 	}
 
 	/*
diff --git a/fs/inode.c b/fs/inode.c
index cf81baf1898a..96c77b81167c 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -168,8 +168,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
 	mutex_init(&inode->i_mutex);
 	lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key);
 
-	init_rwsem(&inode->i_alloc_sem);
-	lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key);
+	atomic_set(&inode->i_dio_count, 0);
 
 	mapping->a_ops = &empty_aops;
 	mapping->host = inode;
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index f4b1057abdd2..b59f5ac26bef 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1832,9 +1832,8 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
 	 * fails again.
 	 */
 	if (unlikely(NInoTruncateFailed(ni))) {
-		down_write(&vi->i_alloc_sem);
+		inode_dio_wait(vi);
 		err = ntfs_truncate(vi);
-		up_write(&vi->i_alloc_sem);
 		if (err || NInoTruncateFailed(ni)) {
 			if (!err)
 				err = -EIO;
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index c05d6dcf77a4..1371487da955 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -2357,12 +2357,7 @@ static const char *es = "  Leaving inconsistent metadata.  Unmount and run "
  *
  * Returns 0 on success or -errno on error.
  *
- * Called with ->i_mutex held.  In all but one case ->i_alloc_sem is held for
- * writing.  The only case in the kernel where ->i_alloc_sem is not held is
- * mm/filemap.c::generic_file_buffered_write() where vmtruncate() is called
- * with the current i_size as the offset.  The analogous place in NTFS is in
- * fs/ntfs/file.c::ntfs_file_buffered_write() where we call vmtruncate() again
- * without holding ->i_alloc_sem.
+ * Called with ->i_mutex held.
  */
 int ntfs_truncate(struct inode *vi)
 {
@@ -2887,8 +2882,7 @@ void ntfs_truncate_vfs(struct inode *vi) {
  * We also abort all changes of user, group, and mode as we do not implement
  * the NTFS ACLs yet.
  *
- * Called with ->i_mutex held.  For the ATTR_SIZE (i.e. ->truncate) case, also
- * called with ->i_alloc_sem held for writing.
+ * Called with ->i_mutex held.
  */
 int ntfs_setattr(struct dentry *dentry, struct iattr *attr)
 {
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index ac97bca282d2..de1d3953599d 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -551,9 +551,8 @@ bail:
 
 /*
  * ocfs2_dio_end_io is called by the dio core when a dio is finished.  We're
- * particularly interested in the aio/dio case.  Like the core uses
- * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from
- * truncation on another.
+ * particularly interested in the aio/dio case.  We use the rw_lock DLM lock
+ * to protect io on one node from truncation on another.
  */
 static void ocfs2_dio_end_io(struct kiocb *iocb,
 			     loff_t offset,
@@ -569,7 +568,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
 	BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
 
 	if (ocfs2_iocb_is_sem_locked(iocb)) {
-		up_read(&inode->i_alloc_sem);
+		inode_dio_done(inode);
 		ocfs2_iocb_clear_sem_locked(iocb);
 	}
 
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 1406c37a5722..2c3a465514a2 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2236,9 +2236,9 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
 	ocfs2_iocb_clear_sem_locked(iocb);
 
 relock:
-	/* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
+	/* to match setattr's i_mutex -> rw_lock ordering */
 	if (direct_io) {
-		down_read(&inode->i_alloc_sem);
+		atomic_inc(&inode->i_dio_count);
 		have_alloc_sem = 1;
 		/* communicate with ocfs2_dio_end_io */
 		ocfs2_iocb_set_sem_locked(iocb);
@@ -2290,7 +2290,7 @@ relock:
 	 */
 	if (direct_io && !can_do_direct) {
 		ocfs2_rw_unlock(inode, rw_level);
-		up_read(&inode->i_alloc_sem);
+		inode_dio_done(inode);
 
 		have_alloc_sem = 0;
 		rw_level = -1;
@@ -2361,8 +2361,7 @@ out_dio:
 	/*
 	 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
 	 * function pointer which is called when o_direct io completes so that
-	 * it can unlock our rw lock.  (it's the clustered equivalent of
-	 * i_alloc_sem; protects truncate from racing with pending ios).
+	 * it can unlock our rw lock.
 	 * Unfortunately there are error cases which call end_io and others
 	 * that don't.  so we don't have to unlock the rw_lock if either an
 	 * async dio is going to do it in the future or an end_io after an
@@ -2379,7 +2378,7 @@ out:
 
 out_sems:
 	if (have_alloc_sem) {
-		up_read(&inode->i_alloc_sem);
+		inode_dio_done(inode);
 		ocfs2_iocb_clear_sem_locked(iocb);
 	}
 
@@ -2531,8 +2530,8 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
 	 * need locks to protect pending reads from racing with truncate.
 	 */
 	if (filp->f_flags & O_DIRECT) {
-		down_read(&inode->i_alloc_sem);
 		have_alloc_sem = 1;
+		atomic_inc(&inode->i_dio_count);
 		ocfs2_iocb_set_sem_locked(iocb);
 
 		ret = ocfs2_rw_lock(inode, 0);
@@ -2575,7 +2574,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
 
 bail:
 	if (have_alloc_sem) {
-		up_read(&inode->i_alloc_sem);
+		inode_dio_done(inode);
 		ocfs2_iocb_clear_sem_locked(iocb);
 	}
 	if (rw_level != -1)
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 4ea2ab41fdee..6938d8c68d6e 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -555,11 +555,10 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
 
 		reiserfs_write_unlock(inode->i_sb);
 		mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_XATTR);
-		down_write(&dentry->d_inode->i_alloc_sem);
+		inode_dio_wait(dentry->d_inode);
 		reiserfs_write_lock(inode->i_sb);
 
 		err = reiserfs_setattr(dentry, &newattrs);
-		up_write(&dentry->d_inode->i_alloc_sem);
 		mutex_unlock(&dentry->d_inode->i_mutex);
 	} else
 		update_ctime(inode);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 1393742bba9b..2fe920774abf 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -779,7 +779,7 @@ struct inode {
 	struct timespec		i_ctime;
 	blkcnt_t		i_blocks;
 	unsigned short          i_bytes;
-	struct rw_semaphore	i_alloc_sem;
+	atomic_t		i_dio_count;
 	const struct file_operations	*i_fop;	/* former ->i_op->default_file_ops */
 	struct file_lock	*i_flock;
 	struct address_space	*i_mapping;
@@ -1705,6 +1705,10 @@ struct super_operations {
  *			set during data writeback, and cleared with a wakeup
  *			on the bit address once it is done.
  *
+ * I_REFERENCED		Marks the inode as recently references on the LRU list.
+ *
+ * I_DIO_WAKEUP		Never set.  Only used as a key for wait_on_bit().
+ *
  * Q: What is the difference between I_WILL_FREE and I_FREEING?
  */
 #define I_DIRTY_SYNC		(1 << 0)
@@ -1718,6 +1722,8 @@ struct super_operations {
 #define __I_SYNC		7
 #define I_SYNC			(1 << __I_SYNC)
 #define I_REFERENCED		(1 << 8)
+#define __I_DIO_WAKEUP		9
+#define I_DIO_WAKEUP		(1 << I_DIO_WAKEUP)
 
 #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
 
@@ -1828,7 +1834,6 @@ struct file_system_type {
 	struct lock_class_key i_lock_key;
 	struct lock_class_key i_mutex_key;
 	struct lock_class_key i_mutex_dir_key;
-	struct lock_class_key i_alloc_sem_key;
 };
 
 extern struct dentry *mount_ns(struct file_system_type *fs_type, int flags,
@@ -2404,6 +2409,8 @@ enum {
 };
 
 void dio_end_io(struct bio *bio, int error);
+void inode_dio_wait(struct inode *inode);
+void inode_dio_done(struct inode *inode);
 
 ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	struct block_device *bdev, const struct iovec *iov, loff_t offset,
diff --git a/mm/filemap.c b/mm/filemap.c
index a8251a8d3457..f820e600f1ad 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -78,9 +78,6 @@
  *  ->i_mutex			(generic_file_buffered_write)
  *    ->mmap_sem		(fault_in_pages_readable->do_page_fault)
  *
- *  ->i_mutex
- *    ->i_alloc_sem             (various)
- *
  *  inode_wb_list_lock
  *    sb_lock			(fs/fs-writeback.c)
  *    ->mapping->tree_lock	(__sync_single_inode)
diff --git a/mm/madvise.c b/mm/madvise.c
index 2221491ed503..74bf193eff04 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -218,7 +218,7 @@ static long madvise_remove(struct vm_area_struct *vma,
 	endoff = (loff_t)(end - vma->vm_start - 1)
 			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
 
-	/* vmtruncate_range needs to take i_mutex and i_alloc_sem */
+	/* vmtruncate_range needs to take i_mutex */
 	up_read(&current->mm->mmap_sem);
 	error = vmtruncate_range(mapping->host, offset, endoff);
 	down_read(&current->mm->mmap_sem);
diff --git a/mm/rmap.c b/mm/rmap.c
index 23295f65ae43..2540a39eea4a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -21,7 +21,6 @@
  * Lock ordering in mm:
  *
  * inode->i_mutex	(while writing or truncating, not reading or faulting)
- *   inode->i_alloc_sem (vmtruncate_range)
  *   mm->mmap_sem
  *     page->flags PG_locked (lock_page)
  *       mapping->i_mmap_mutex
diff --git a/mm/truncate.c b/mm/truncate.c
index e13f22efaad7..003c6c685fc8 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -622,12 +622,11 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
 		return -ENOSYS;
 
 	mutex_lock(&inode->i_mutex);
-	down_write(&inode->i_alloc_sem);
+	inode_dio_wait(inode);
 	unmap_mapping_range(mapping, offset, (end - offset), 1);
 	inode->i_op->truncate_range(inode, offset, end);
 	/* unmap again to remove racily COWed private pages */
 	unmap_mapping_range(mapping, offset, (end - offset), 1);
-	up_write(&inode->i_alloc_sem);
 	mutex_unlock(&inode->i_mutex);
 
 	return 0;
-- 
cgit v1.2.3


From 11b80f459adaf91a712f95e7734a17655a36bf30 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Fri, 24 Jun 2011 14:29:44 -0400
Subject: rw_semaphore: remove up/down_read_non_owner

Now that the last users is gone these can be removed.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/rwsem.h | 10 ----------
 kernel/rwsem.c        | 16 ----------------
 2 files changed, 26 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index a8afe9cd000c..77950dfa0a9e 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -124,19 +124,9 @@ extern void downgrade_write(struct rw_semaphore *sem);
  */
 extern void down_read_nested(struct rw_semaphore *sem, int subclass);
 extern void down_write_nested(struct rw_semaphore *sem, int subclass);
-/*
- * Take/release a lock when not the owner will release it.
- *
- * [ This API should be avoided as much as possible - the
- *   proper abstraction for this case is completions. ]
- */
-extern void down_read_non_owner(struct rw_semaphore *sem);
-extern void up_read_non_owner(struct rw_semaphore *sem);
 #else
 # define down_read_nested(sem, subclass)		down_read(sem)
 # define down_write_nested(sem, subclass)	down_write(sem)
-# define down_read_non_owner(sem)		down_read(sem)
-# define up_read_non_owner(sem)			up_read(sem)
 #endif
 
 #endif /* _LINUX_RWSEM_H */
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index cae050b05f5e..176e5e56ffab 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -117,15 +117,6 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
 
 EXPORT_SYMBOL(down_read_nested);
 
-void down_read_non_owner(struct rw_semaphore *sem)
-{
-	might_sleep();
-
-	__down_read(sem);
-}
-
-EXPORT_SYMBOL(down_read_non_owner);
-
 void down_write_nested(struct rw_semaphore *sem, int subclass)
 {
 	might_sleep();
@@ -136,13 +127,6 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
 
 EXPORT_SYMBOL(down_write_nested);
 
-void up_read_non_owner(struct rw_semaphore *sem)
-{
-	__up_read(sem);
-}
-
-EXPORT_SYMBOL(up_read_non_owner);
-
 #endif
 
 
-- 
cgit v1.2.3


From aacfc19c626ebd3daa675652457d71019a1f583f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Fri, 24 Jun 2011 14:29:47 -0400
Subject: fs: simplify the blockdev_direct_IO prototype

Simple filesystems always pass inode->i_sb_bdev as the block device
argument, and never need a end_io handler.  Let's simply things for
them and for my grepping activity by dropping these arguments.  The
only thing not falling into that scheme is ext4, which passes and
end_io handler without needing special flags (yet), but given how
messy the direct I/O code there is use of __blockdev_direct_IO
in one instead of two out of three cases isn't going to make a large
difference anyway.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ext2/inode.c     |  4 ++--
 fs/ext3/inode.c     |  5 ++---
 fs/ext4/inode.c     | 12 ++++++------
 fs/fat/inode.c      |  4 ++--
 fs/hfs/inode.c      |  4 ++--
 fs/hfsplus/inode.c  |  4 ++--
 fs/jfs/inode.c      |  4 ++--
 fs/nilfs2/inode.c   |  4 ++--
 fs/reiserfs/inode.c |  5 ++---
 include/linux/fs.h  |  9 ++++-----
 10 files changed, 26 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 06e7c767ab35..a8a58f63f07c 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -843,8 +843,8 @@ ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	struct inode *inode = mapping->host;
 	ssize_t ret;
 
-	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
-				iov, offset, nr_segs, ext2_get_block, NULL);
+	ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
+				 ext2_get_block);
 	if (ret < 0 && (rw & WRITE))
 		ext2_write_failed(mapping, offset + iov_length(iov, nr_segs));
 	return ret;
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 99c28b246b89..2978a2a17a59 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1816,9 +1816,8 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
 	}
 
 retry:
-	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
-				 offset, nr_segs,
-				 ext3_get_block, NULL);
+	ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
+				 ext3_get_block);
 	/*
 	 * In case of error extending write may have instantiated a few
 	 * blocks outside i_size. Trim these off again.
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9ec0a2ba2502..1f35573a34e1 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3501,10 +3501,8 @@ retry:
 				 offset, nr_segs,
 				 ext4_get_block, NULL, NULL, 0);
 	else {
-		ret = blockdev_direct_IO(rw, iocb, inode,
-				 inode->i_sb->s_bdev, iov,
-				 offset, nr_segs,
-				 ext4_get_block, NULL);
+		ret = blockdev_direct_IO(rw, iocb, inode, iov,
+				 offset, nr_segs, ext4_get_block);
 
 		if (unlikely((rw & WRITE) && ret < 0)) {
 			loff_t isize = i_size_read(inode);
@@ -3748,11 +3746,13 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 			EXT4_I(inode)->cur_aio_dio = iocb->private;
 		}
 
-		ret = blockdev_direct_IO(rw, iocb, inode,
+		ret = __blockdev_direct_IO(rw, iocb, inode,
 					 inode->i_sb->s_bdev, iov,
 					 offset, nr_segs,
 					 ext4_get_block_write,
-					 ext4_end_io_dio);
+					 ext4_end_io_dio,
+					 NULL,
+					 DIO_LOCKING | DIO_SKIP_HOLES);
 		if (iocb->private)
 			EXT4_I(inode)->cur_aio_dio = NULL;
 		/*
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 3decce46c38f..5942fec22c65 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -211,8 +211,8 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
 	 * FAT need to use the DIO_LOCKING for avoiding the race
 	 * condition of fat_get_block() and ->truncate().
 	 */
-	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
-				 iov, offset, nr_segs, fat_get_block, NULL);
+	ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
+				 fat_get_block);
 	if (ret < 0 && (rw & WRITE))
 		fat_write_failed(mapping, offset + iov_length(iov, nr_segs));
 
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 48d567cc203e..5e7c3f309617 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -123,8 +123,8 @@ static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb,
 	struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
 	ssize_t ret;
 
-	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
-				  offset, nr_segs, hfs_get_block, NULL);
+	ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
+				 hfs_get_block);
 
 	/*
 	 * In case of error extending write may have instantiated a few
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index b0a0a4b621eb..5b1cb98741cc 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -119,8 +119,8 @@ static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb,
 	struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
 	ssize_t ret;
 
-	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
-				  offset, nr_segs, hfsplus_get_block, NULL);
+	ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
+				 hfsplus_get_block);
 
 	/*
 	 * In case of error extending write may have instantiated a few
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 109655904bbc..77b69b27f825 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -329,8 +329,8 @@ static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb,
 	struct inode *inode = file->f_mapping->host;
 	ssize_t ret;
 
-	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
-				offset, nr_segs, jfs_get_block, NULL);
+	ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
+				 jfs_get_block);
 
 	/*
 	 * In case of error extending write may have instantiated a few
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 13f113154a9f..666628b395f1 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -259,8 +259,8 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 		return 0;
 
 	/* Needs synchronization with the cleaner */
-	size = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
-				  offset, nr_segs, nilfs_get_block, NULL);
+	size = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
+				  nilfs_get_block);
 
 	/*
 	 * In case of error extending write may have instantiated a few
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index dcf543d8caf1..2922b90ceac1 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -3068,9 +3068,8 @@ static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,
 	struct inode *inode = file->f_mapping->host;
 	ssize_t ret;
 
-	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
-				  offset, nr_segs,
-				  reiserfs_get_blocks_direct_io, NULL);
+	ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
+				  reiserfs_get_blocks_direct_io);
 
 	/*
 	 * In case of error extending write may have instantiated a few
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2fe920774abf..824453be9fee 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2418,12 +2418,11 @@ ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	dio_submit_t submit_io,	int flags);
 
 static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
-	struct inode *inode, struct block_device *bdev, const struct iovec *iov,
-	loff_t offset, unsigned long nr_segs, get_block_t get_block,
-	dio_iodone_t end_io)
+		struct inode *inode, const struct iovec *iov, loff_t offset,
+		unsigned long nr_segs, get_block_t get_block)
 {
-	return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
-				    nr_segs, get_block, end_io, NULL,
+	return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
+				    offset, nr_segs, get_block, NULL, NULL,
 				    DIO_LOCKING | DIO_SKIP_HOLES);
 }
 #endif
-- 
cgit v1.2.3


From f15146380d28b746df3c8b81b392812eb982382a Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Tue, 12 Jul 2011 20:48:39 +0200
Subject: fs: seq_file - add event counter to simplify poll() support

Moving the event counter into the dynamically allocated 'struc seq_file'
allows poll() support without the need to allocate its own tracking
structure.

All current users are switched over to use the new counter.

Requested-by: Andrew Morton akpm@linux-foundation.org
Acked-by: NeilBrown <neilb@suse.de>
Tested-by: Lucas De Marchi lucas.demarchi@profusion.mobi
Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/md/md.c               | 26 ++++++++------------------
 fs/namespace.c                |  4 ++--
 fs/proc/base.c                |  2 +-
 include/linux/mnt_namespace.h |  1 -
 include/linux/seq_file.h      |  1 +
 mm/swapfile.c                 | 29 ++++++++---------------------
 6 files changed, 20 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 91e31e260b4a..dfc9425db70b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6394,16 +6394,11 @@ static void md_seq_stop(struct seq_file *seq, void *v)
 		mddev_put(mddev);
 }
 
-struct mdstat_info {
-	int event;
-};
-
 static int md_seq_show(struct seq_file *seq, void *v)
 {
 	mddev_t *mddev = v;
 	sector_t sectors;
 	mdk_rdev_t *rdev;
-	struct mdstat_info *mi = seq->private;
 	struct bitmap *bitmap;
 
 	if (v == (void*)1) {
@@ -6415,7 +6410,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
 
 		spin_unlock(&pers_lock);
 		seq_printf(seq, "\n");
-		mi->event = atomic_read(&md_event_count);
+		seq->poll_event = atomic_read(&md_event_count);
 		return 0;
 	}
 	if (v == (void*)2) {
@@ -6527,26 +6522,21 @@ static const struct seq_operations md_seq_ops = {
 
 static int md_seq_open(struct inode *inode, struct file *file)
 {
+	struct seq_file *seq;
 	int error;
-	struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL);
-	if (mi == NULL)
-		return -ENOMEM;
 
 	error = seq_open(file, &md_seq_ops);
 	if (error)
-		kfree(mi);
-	else {
-		struct seq_file *p = file->private_data;
-		p->private = mi;
-		mi->event = atomic_read(&md_event_count);
-	}
+		return error;
+
+	seq = file->private_data;
+	seq->poll_event = atomic_read(&md_event_count);
 	return error;
 }
 
 static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
 {
-	struct seq_file *m = filp->private_data;
-	struct mdstat_info *mi = m->private;
+	struct seq_file *seq = filp->private_data;
 	int mask;
 
 	poll_wait(filp, &md_event_waiters, wait);
@@ -6554,7 +6544,7 @@ static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
 	/* always allow read */
 	mask = POLLIN | POLLRDNORM;
 
-	if (mi->event != atomic_read(&md_event_count))
+	if (seq->poll_event != atomic_read(&md_event_count))
 		mask |= POLLERR | POLLPRI;
 	return mask;
 }
diff --git a/fs/namespace.c b/fs/namespace.c
index fe59bd145d21..cda50fe9250a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -934,8 +934,8 @@ int mnt_had_events(struct proc_mounts *p)
 	int res = 0;
 
 	br_read_lock(vfsmount_lock);
-	if (p->event != ns->event) {
-		p->event = ns->event;
+	if (p->m.poll_event != ns->event) {
+		p->m.poll_event = ns->event;
 		res = 1;
 	}
 	br_read_unlock(vfsmount_lock);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index be1ff932033b..3dc5e2a5cc38 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -673,7 +673,7 @@ static int mounts_open_common(struct inode *inode, struct file *file,
 	p->m.private = p;
 	p->ns = ns;
 	p->root = root;
-	p->event = ns->event;
+	p->m.poll_event = ns->event;
 
 	return 0;
 
diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h
index 0b89efc6f215..29304855652d 100644
--- a/include/linux/mnt_namespace.h
+++ b/include/linux/mnt_namespace.h
@@ -18,7 +18,6 @@ struct proc_mounts {
 	struct seq_file m; /* must be the first element */
 	struct mnt_namespace *ns;
 	struct path root;
-	int event;
 };
 
 struct fs_struct;
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index 03c0232b4169..be720cd2038d 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -23,6 +23,7 @@ struct seq_file {
 	u64 version;
 	struct mutex lock;
 	const struct seq_operations *op;
+	int poll_event;
 	void *private;
 };
 
diff --git a/mm/swapfile.c b/mm/swapfile.c
index ff8dc1a18cb4..1b8c33907242 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1681,19 +1681,14 @@ out:
 }
 
 #ifdef CONFIG_PROC_FS
-struct proc_swaps {
-	struct seq_file seq;
-	int event;
-};
-
 static unsigned swaps_poll(struct file *file, poll_table *wait)
 {
-	struct proc_swaps *s = file->private_data;
+	struct seq_file *seq = file->private_data;
 
 	poll_wait(file, &proc_poll_wait, wait);
 
-	if (s->event != atomic_read(&proc_poll_event)) {
-		s->event = atomic_read(&proc_poll_event);
+	if (seq->poll_event != atomic_read(&proc_poll_event)) {
+		seq->poll_event = atomic_read(&proc_poll_event);
 		return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
 	}
 
@@ -1783,24 +1778,16 @@ static const struct seq_operations swaps_op = {
 
 static int swaps_open(struct inode *inode, struct file *file)
 {
-	struct proc_swaps *s;
+	struct seq_file *seq;
 	int ret;
 
-	s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL);
-	if (!s)
-		return -ENOMEM;
-
-	file->private_data = s;
-
 	ret = seq_open(file, &swaps_op);
-	if (ret) {
-		kfree(s);
+	if (ret)
 		return ret;
-	}
 
-	s->seq.private = s;
-	s->event = atomic_read(&proc_poll_event);
-	return ret;
+	seq = file->private_data;
+	seq->poll_event = atomic_read(&proc_poll_event);
+	return 0;
 }
 
 static const struct file_operations proc_swaps_operations = {
-- 
cgit v1.2.3


From 982d816581eeeacfe5b2b7c6d47d13a157616eff Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Mon, 18 Jul 2011 13:21:35 -0400
Subject: fs: add SEEK_HOLE and SEEK_DATA flags

This just gets us ready to support the SEEK_HOLE and SEEK_DATA flags.  Turns out
using fiemap in things like cp cause more problems than it solves, so lets try
and give userspace an interface that doesn't suck.  We need to match solaris
here, and the definitions are

*o* If /whence/ is SEEK_HOLE, the offset of the start of the
next hole greater than or equal to the supplied offset
is returned. The definition of a hole is provided near
the end of the DESCRIPTION.

*o* If /whence/ is SEEK_DATA, the file pointer is set to the
start of the next non-hole file region greater than or
equal to the supplied offset.

So in the generic case the entire file is data and there is a virtual hole at
the end.  That means we will just return i_size for SEEK_HOLE and will return
the same offset for SEEK_DATA.  This is how Solaris does it so we have to do it
the same way.

Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/porting | 10 +++++++++
 fs/read_write.c                   | 44 ++++++++++++++++++++++++++++++++++++---
 include/linux/fs.h                |  4 +++-
 3 files changed, 54 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index 0eeb3954dea3..6b96773e27cb 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -411,3 +411,13 @@ to some pointer to returning that pointer.  On errors return ERR_PTR(...).
 argument; instead of passing IPERM_FLAG_RCU we add MAY_NOT_BLOCK into mask.
 	generic_permission() has also lost the check_acl argument; if you want
 non-NULL to be used for that inode, put it into ->i_op->check_acl.
+
+--
+[mandatory]
+	If you implement your own ->llseek() you must handle SEEK_HOLE and
+SEEK_DATA.  You can hanle this by returning -EINVAL, but it would be nicer to
+support it in some way.  The generic handler assumes that the entire file is
+data and there is a virtual hole at the end of the file.  So if the provided
+offset is less than i_size and SEEK_DATA is specified, return the same offset.
+If the above is true for the offset and you are given SEEK_HOLE, return the end
+of the file.  If the offset is i_size or greater return -ENXIO in either case.
diff --git a/fs/read_write.c b/fs/read_write.c
index 5520f8ad5504..5907b49e4d7e 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -64,6 +64,23 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
 			return file->f_pos;
 		offset += file->f_pos;
 		break;
+	case SEEK_DATA:
+		/*
+		 * In the generic case the entire file is data, so as long as
+		 * offset isn't at the end of the file then the offset is data.
+		 */
+		if (offset >= inode->i_size)
+			return -ENXIO;
+		break;
+	case SEEK_HOLE:
+		/*
+		 * There is a virtual hole at the end of the file, so as long as
+		 * offset isn't i_size or larger, return i_size.
+		 */
+		if (offset >= inode->i_size)
+			return -ENXIO;
+		offset = inode->i_size;
+		break;
 	}
 
 	if (offset < 0 && !unsigned_offsets(file))
@@ -128,12 +145,13 @@ EXPORT_SYMBOL(no_llseek);
 
 loff_t default_llseek(struct file *file, loff_t offset, int origin)
 {
+	struct inode *inode = file->f_path.dentry->d_inode;
 	loff_t retval;
 
-	mutex_lock(&file->f_dentry->d_inode->i_mutex);
+	mutex_lock(&inode->i_mutex);
 	switch (origin) {
 		case SEEK_END:
-			offset += i_size_read(file->f_path.dentry->d_inode);
+			offset += i_size_read(inode);
 			break;
 		case SEEK_CUR:
 			if (offset == 0) {
@@ -141,6 +159,26 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
 				goto out;
 			}
 			offset += file->f_pos;
+			break;
+		case SEEK_DATA:
+			/*
+			 * In the generic case the entire file is data, so as
+			 * long as offset isn't at the end of the file then the
+			 * offset is data.
+			 */
+			if (offset >= inode->i_size)
+				return -ENXIO;
+			break;
+		case SEEK_HOLE:
+			/*
+			 * There is a virtual hole at the end of the file, so
+			 * as long as offset isn't i_size or larger, return
+			 * i_size.
+			 */
+			if (offset >= inode->i_size)
+				return -ENXIO;
+			offset = inode->i_size;
+			break;
 	}
 	retval = -EINVAL;
 	if (offset >= 0 || unsigned_offsets(file)) {
@@ -151,7 +189,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
 		retval = offset;
 	}
 out:
-	mutex_unlock(&file->f_dentry->d_inode->i_mutex);
+	mutex_unlock(&inode->i_mutex);
 	return retval;
 }
 EXPORT_SYMBOL(default_llseek);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 824453be9fee..4a61f98823a6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -32,7 +32,9 @@
 #define SEEK_SET	0	/* seek relative to beginning of file */
 #define SEEK_CUR	1	/* seek relative to current file position */
 #define SEEK_END	2	/* seek relative to end of file */
-#define SEEK_MAX	SEEK_END
+#define SEEK_DATA	3	/* seek to the next data */
+#define SEEK_HOLE	4	/* seek to the next hole */
+#define SEEK_MAX	SEEK_HOLE
 
 struct fstrim_range {
 	__u64 start;
-- 
cgit v1.2.3


From 02c24a82187d5a628c68edfe71ae60dc135cd178 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Sat, 16 Jul 2011 20:44:56 -0400
Subject: fs: push i_mutex and filemap_write_and_wait down into ->fsync()
 handlers

Btrfs needs to be able to control how filemap_write_and_wait_range() is called
in fsync to make it less of a painful operation, so push down taking i_mutex and
the calling of filemap_write_and_wait() down into the ->fsync() handlers.  Some
file systems can drop taking the i_mutex altogether it seems, like ext3 and
ocfs2.  For correctness sake I just pushed everything down in all cases to make
sure that we keep the current behavior the same for everybody, and then each
individual fs maintainer can make up their mind about what to do from there.
Thanks,

Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/Locking        |  6 ++---
 Documentation/filesystems/porting        |  7 ++++++
 Documentation/filesystems/vfs.txt        |  2 +-
 arch/powerpc/platforms/cell/spufs/file.c | 11 +++++++--
 drivers/char/ps3flash.c                  |  9 ++++++--
 drivers/mtd/ubi/cdev.c                   | 10 ++++++---
 drivers/staging/pohmelfs/inode.c         | 11 ++++++---
 drivers/usb/gadget/printer.c             |  5 ++++-
 drivers/video/fb_defio.c                 | 11 +++++++--
 fs/9p/v9fs_vfs.h                         |  3 ++-
 fs/9p/vfs_file.c                         | 22 ++++++++++++++++--
 fs/affs/affs.h                           |  2 +-
 fs/affs/file.c                           |  8 ++++++-
 fs/afs/internal.h                        |  2 +-
 fs/afs/write.c                           | 18 +++++++++++----
 fs/bad_inode.c                           |  3 ++-
 fs/block_dev.c                           |  6 +----
 fs/btrfs/ctree.h                         |  2 +-
 fs/btrfs/file.c                          | 21 +++++++++++++-----
 fs/ceph/caps.c                           |  6 +++--
 fs/ceph/dir.c                            | 10 ++++++++-
 fs/ceph/super.h                          |  3 ++-
 fs/cifs/cifsfs.h                         |  4 ++--
 fs/cifs/file.c                           | 18 +++++++++++++--
 fs/coda/coda_int.h                       |  2 +-
 fs/coda/file.c                           |  8 ++++++-
 fs/ecryptfs/file.c                       |  7 +++---
 fs/exofs/file.c                          | 10 ++++++++-
 fs/ext2/ext2.h                           |  3 ++-
 fs/ext2/file.c                           |  4 ++--
 fs/ext3/fsync.c                          | 18 +++++++++++++--
 fs/ext4/ext4.h                           |  2 +-
 fs/ext4/fsync.c                          | 38 +++++++++++++++++++++++++++++---
 fs/fat/fat.h                             |  3 ++-
 fs/fat/file.c                            |  4 ++--
 fs/fuse/dir.c                            |  5 +++--
 fs/fuse/file.c                           | 24 +++++++++++++++-----
 fs/fuse/fuse_i.h                         |  3 ++-
 fs/gfs2/file.c                           | 17 +++++++++++---
 fs/hfs/inode.c                           |  9 +++++++-
 fs/hfsplus/hfsplus_fs.h                  |  3 ++-
 fs/hfsplus/inode.c                       | 10 ++++++++-
 fs/hostfs/hostfs_kern.c                  | 15 +++++++++++--
 fs/hpfs/file.c                           |  7 +++++-
 fs/hpfs/hpfs_fn.h                        |  2 +-
 fs/hppfs/hppfs.c                         |  5 +++--
 fs/jffs2/file.c                          |  9 +++++++-
 fs/jffs2/os-linux.h                      |  2 +-
 fs/jfs/file.c                            |  9 +++++++-
 fs/jfs/jfs_inode.h                       |  2 +-
 fs/libfs.c                               | 16 ++++++++++----
 fs/logfs/file.c                          | 11 ++++++++-
 fs/logfs/logfs.h                         |  2 +-
 fs/ncpfs/file.c                          |  4 ++--
 fs/nfs/dir.c                             |  8 +++++--
 fs/nfs/file.c                            | 11 ++++++---
 fs/nilfs2/file.c                         | 12 ++++++++--
 fs/nilfs2/nilfs.h                        |  2 +-
 fs/ntfs/dir.c                            | 10 ++++++++-
 fs/ntfs/file.c                           | 10 ++++++++-
 fs/ocfs2/file.c                          | 14 +++++++++++-
 fs/reiserfs/dir.c                        | 13 +++++++++--
 fs/reiserfs/file.c                       |  9 +++++++-
 fs/sync.c                                | 25 +++------------------
 fs/ubifs/file.c                          | 21 +++++++++---------
 fs/ubifs/ubifs.h                         |  2 +-
 fs/xfs/linux-2.6/xfs_file.c              | 17 +++++++-------
 include/linux/ext3_fs.h                  |  2 +-
 include/linux/fb.h                       |  3 ++-
 include/linux/fs.h                       |  9 ++++----
 ipc/shm.c                                |  4 ++--
 71 files changed, 462 insertions(+), 164 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 9b6ed7c9f34f..ca7e25292542 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -412,7 +412,7 @@ prototypes:
 	int (*open) (struct inode *, struct file *);
 	int (*flush) (struct file *);
 	int (*release) (struct inode *, struct file *);
-	int (*fsync) (struct file *, int datasync);
+	int (*fsync) (struct file *, loff_t start, loff_t end, int datasync);
 	int (*aio_fsync) (struct kiocb *, int datasync);
 	int (*fasync) (int, struct file *, int);
 	int (*lock) (struct file *, int, struct file_lock *);
@@ -438,9 +438,7 @@ prototypes:
 
 locking rules:
 	All may block except for ->setlease.
-	No VFS locks held on entry except for ->fsync and ->setlease.
-
-->fsync() has i_mutex on inode.
+	No VFS locks held on entry except for ->setlease.
 
 ->setlease has the file_list_lock held and must not sleep.
 
diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index 6b96773e27cb..7f8861d341ea 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -421,3 +421,10 @@ data and there is a virtual hole at the end of the file.  So if the provided
 offset is less than i_size and SEEK_DATA is specified, return the same offset.
 If the above is true for the offset and you are given SEEK_HOLE, return the end
 of the file.  If the offset is i_size or greater return -ENXIO in either case.
+
+[mandatory]
+	If you have your own ->fsync() you must make sure to call
+filemap_write_and_wait_range() so that all dirty pages are synced out properly.
+You must also keep in mind that ->fsync() is not called with i_mutex held
+anymore, so if you require i_mutex locking you must make sure to take it and
+release it yourself.
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 6bf85b78cfea..eff6617c9a0f 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -777,7 +777,7 @@ struct file_operations {
 	int (*open) (struct inode *, struct file *);
 	int (*flush) (struct file *);
 	int (*release) (struct inode *, struct file *);
-	int (*fsync) (struct file *, int datasync);
+	int (*fsync) (struct file *, loff_t, loff_t, int datasync);
 	int (*aio_fsync) (struct kiocb *, int datasync);
 	int (*fasync) (int, struct file *, int);
 	int (*lock) (struct file *, int, struct file_lock *);
diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c
index 3c7c3f82d842..fb59c46e9e9e 100644
--- a/arch/powerpc/platforms/cell/spufs/file.c
+++ b/arch/powerpc/platforms/cell/spufs/file.c
@@ -1850,9 +1850,16 @@ out:
 	return ret;
 }
 
-static int spufs_mfc_fsync(struct file *file, int datasync)
+static int spufs_mfc_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 {
-	return spufs_mfc_flush(file, NULL);
+	struct inode *inode = file->f_path.dentry->d_inode;
+	int err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (!err) {
+		mutex_lock(&inode->i_mutex);
+		err = spufs_mfc_flush(file, NULL);
+		mutex_unlock(&inode->i_mutex);
+	}
+	return err;
 }
 
 static int spufs_mfc_fasync(int fd, struct file *file, int on)
diff --git a/drivers/char/ps3flash.c b/drivers/char/ps3flash.c
index 5a06787e5be3..d0c57c2e2909 100644
--- a/drivers/char/ps3flash.c
+++ b/drivers/char/ps3flash.c
@@ -309,9 +309,14 @@ static int ps3flash_flush(struct file *file, fl_owner_t id)
 	return ps3flash_writeback(ps3flash_dev);
 }
 
-static int ps3flash_fsync(struct file *file, int datasync)
+static int ps3flash_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 {
-	return ps3flash_writeback(ps3flash_dev);
+	struct inode *inode = file->f_path.dentry->d_inode;
+	int err;
+	mutex_lock(&inode->i_mutex);
+	err = ps3flash_writeback(ps3flash_dev);
+	mutex_unlock(&inode->i_mutex);
+	return err;
 }
 
 static irqreturn_t ps3flash_interrupt(int irq, void *data)
diff --git a/drivers/mtd/ubi/cdev.c b/drivers/mtd/ubi/cdev.c
index 191f3bb3c41a..3320a50ba4f0 100644
--- a/drivers/mtd/ubi/cdev.c
+++ b/drivers/mtd/ubi/cdev.c
@@ -189,12 +189,16 @@ static loff_t vol_cdev_llseek(struct file *file, loff_t offset, int origin)
 	return new_offset;
 }
 
-static int vol_cdev_fsync(struct file *file, int datasync)
+static int vol_cdev_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 {
 	struct ubi_volume_desc *desc = file->private_data;
 	struct ubi_device *ubi = desc->vol->ubi;
-
-	return ubi_sync(ubi->ubi_num);
+	struct inode *inode = file->f_path.dentry->d_inode;
+	int err;
+	mutex_lock(&inode->i_mutex);
+	err = ubi_sync(ubi->ubi_num);
+	mutex_unlock(&inode->i_mutex);
+	return err;
 }
 
 
diff --git a/drivers/staging/pohmelfs/inode.c b/drivers/staging/pohmelfs/inode.c
index c0f0ac7c1cdb..f3c6060c96b8 100644
--- a/drivers/staging/pohmelfs/inode.c
+++ b/drivers/staging/pohmelfs/inode.c
@@ -887,11 +887,16 @@ static struct inode *pohmelfs_alloc_inode(struct super_block *sb)
 /*
  * We want fsync() to work on POHMELFS.
  */
-static int pohmelfs_fsync(struct file *file, int datasync)
+static int pohmelfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 {
 	struct inode *inode = file->f_mapping->host;
-
-	return sync_inode_metadata(inode, 1);
+	int err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (!err) {
+		mutex_lock(&inode->i_mutex);
+		err = sync_inode_metadata(inode, 1);
+		mutex_unlock(&inode->i_mutex);
+	}
+	return err;
 }
 
 ssize_t pohmelfs_write(struct file *file, const char __user *buf,
diff --git a/drivers/usb/gadget/printer.c b/drivers/usb/gadget/printer.c
index 271ef94668e7..978e6a101bf2 100644
--- a/drivers/usb/gadget/printer.c
+++ b/drivers/usb/gadget/printer.c
@@ -795,12 +795,14 @@ printer_write(struct file *fd, const char __user *buf, size_t len, loff_t *ptr)
 }
 
 static int
-printer_fsync(struct file *fd, int datasync)
+printer_fsync(struct file *fd, loff_t start, loff_t end, int datasync)
 {
 	struct printer_dev	*dev = fd->private_data;
+	struct inode *inode = fd->f_path.dentry->d_inode;
 	unsigned long		flags;
 	int			tx_list_empty;
 
+	mutex_lock(&inode->i_mutex);
 	spin_lock_irqsave(&dev->lock, flags);
 	tx_list_empty = (likely(list_empty(&dev->tx_reqs)));
 	spin_unlock_irqrestore(&dev->lock, flags);
@@ -810,6 +812,7 @@ printer_fsync(struct file *fd, int datasync)
 		wait_event_interruptible(dev->tx_flush_wait,
 				(likely(list_empty(&dev->tx_reqs_active))));
 	}
+	mutex_unlock(&inode->i_mutex);
 
 	return 0;
 }
diff --git a/drivers/video/fb_defio.c b/drivers/video/fb_defio.c
index 804000183c5e..32814e8800e0 100644
--- a/drivers/video/fb_defio.c
+++ b/drivers/video/fb_defio.c
@@ -66,19 +66,26 @@ static int fb_deferred_io_fault(struct vm_area_struct *vma,
 	return 0;
 }
 
-int fb_deferred_io_fsync(struct file *file, int datasync)
+int fb_deferred_io_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 {
 	struct fb_info *info = file->private_data;
+	struct inode *inode = file->f_path.dentry->d_inode;
+	int err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (err)
+		return err;
 
 	/* Skip if deferred io is compiled-in but disabled on this fbdev */
 	if (!info->fbdefio)
 		return 0;
 
+	mutex_lock(&inode->i_mutex);
 	/* Kill off the delayed work */
 	cancel_delayed_work_sync(&info->deferred_work);
 
 	/* Run it immediately */
-	return schedule_delayed_work(&info->deferred_work, 0);
+	err = schedule_delayed_work(&info->deferred_work, 0);
+	mutex_unlock(&inode->i_mutex);
+	return err;
 }
 EXPORT_SYMBOL_GPL(fb_deferred_io_fsync);
 
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 4014160903a9..46ce357ca1ab 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -70,7 +70,8 @@ ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64);
 ssize_t v9fs_fid_readn(struct p9_fid *, char *, char __user *, u32, u64);
 void v9fs_blank_wstat(struct p9_wstat *wstat);
 int v9fs_vfs_setattr_dotl(struct dentry *, struct iattr *);
-int v9fs_file_fsync_dotl(struct file *filp, int datasync);
+int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end,
+			 int datasync);
 ssize_t v9fs_file_write_internal(struct inode *, struct p9_fid *,
 				 const char __user *, size_t, loff_t *, int);
 int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode);
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index ffed55817f0c..3c173fcc2c5a 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -519,32 +519,50 @@ out:
 }
 
 
-static int v9fs_file_fsync(struct file *filp, int datasync)
+static int v9fs_file_fsync(struct file *filp, loff_t start, loff_t end,
+			   int datasync)
 {
 	struct p9_fid *fid;
+	struct inode *inode = filp->f_mapping->host;
 	struct p9_wstat wstat;
 	int retval;
 
+	retval = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (retval)
+		return retval;
+
+	mutex_lock(&inode->i_mutex);
 	P9_DPRINTK(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync);
 
 	fid = filp->private_data;
 	v9fs_blank_wstat(&wstat);
 
 	retval = p9_client_wstat(fid, &wstat);
+	mutex_unlock(&inode->i_mutex);
+
 	return retval;
 }
 
-int v9fs_file_fsync_dotl(struct file *filp, int datasync)
+int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end,
+			 int datasync)
 {
 	struct p9_fid *fid;
+	struct inode *inode = filp->f_mapping->host;
 	int retval;
 
+	retval = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (retval)
+		return retval;
+
+	mutex_lock(&inode->i_mutex);
 	P9_DPRINTK(P9_DEBUG_VFS, "v9fs_file_fsync_dotl: filp %p datasync %x\n",
 			filp, datasync);
 
 	fid = filp->private_data;
 
 	retval = p9_client_fsync(fid, datasync);
+	mutex_unlock(&inode->i_mutex);
+
 	return retval;
 }
 
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 0e95f73a7023..c2b9c79eb64e 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -182,7 +182,7 @@ extern int			 affs_add_entry(struct inode *dir, struct inode *inode, struct dent
 
 void		affs_free_prealloc(struct inode *inode);
 extern void	affs_truncate(struct inode *);
-int		affs_file_fsync(struct file *, int);
+int		affs_file_fsync(struct file *, loff_t, loff_t, int);
 
 /* dir.c */
 
diff --git a/fs/affs/file.c b/fs/affs/file.c
index acf321b70fcd..2f4c935cb327 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -923,14 +923,20 @@ affs_truncate(struct inode *inode)
 	affs_free_prealloc(inode);
 }
 
-int affs_file_fsync(struct file *filp, int datasync)
+int affs_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
 {
 	struct inode *inode = filp->f_mapping->host;
 	int ret, err;
 
+	err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (err)
+		return err;
+
+	mutex_lock(&inode->i_mutex);
 	ret = write_inode_now(inode, 0);
 	err = sync_blockdev(inode->i_sb->s_bdev);
 	if (!ret)
 		ret = err;
+	mutex_unlock(&inode->i_mutex);
 	return ret;
 }
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index f396d337b817..d2b0888126d4 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -750,7 +750,7 @@ extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *);
 extern ssize_t afs_file_write(struct kiocb *, const struct iovec *,
 			      unsigned long, loff_t);
 extern int afs_writeback_all(struct afs_vnode *);
-extern int afs_fsync(struct file *, int);
+extern int afs_fsync(struct file *, loff_t, loff_t, int);
 
 
 /*****************************************************************************/
diff --git a/fs/afs/write.c b/fs/afs/write.c
index b806285ff853..9aa52d93c73c 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -681,9 +681,10 @@ int afs_writeback_all(struct afs_vnode *vnode)
  * - the return status from this call provides a reliable indication of
  *   whether any write errors occurred for this process.
  */
-int afs_fsync(struct file *file, int datasync)
+int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 {
 	struct dentry *dentry = file->f_path.dentry;
+	struct inode *inode = file->f_mapping->host;
 	struct afs_writeback *wb, *xwb;
 	struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode);
 	int ret;
@@ -692,12 +693,19 @@ int afs_fsync(struct file *file, int datasync)
 	       vnode->fid.vid, vnode->fid.vnode, dentry->d_name.name,
 	       datasync);
 
+	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (ret)
+		return ret;
+	mutex_lock(&inode->i_mutex);
+
 	/* use a writeback record as a marker in the queue - when this reaches
 	 * the front of the queue, all the outstanding writes are either
 	 * completed or rejected */
 	wb = kzalloc(sizeof(*wb), GFP_KERNEL);
-	if (!wb)
-		return -ENOMEM;
+	if (!wb) {
+		ret = -ENOMEM;
+		goto out;
+	}
 	wb->vnode = vnode;
 	wb->first = 0;
 	wb->last = -1;
@@ -720,7 +728,7 @@ int afs_fsync(struct file *file, int datasync)
 	if (ret < 0) {
 		afs_put_writeback(wb);
 		_leave(" = %d [wb]", ret);
-		return ret;
+		goto out;
 	}
 
 	/* wait for the preceding writes to actually complete */
@@ -729,6 +737,8 @@ int afs_fsync(struct file *file, int datasync)
 				       vnode->writebacks.next == &wb->link);
 	afs_put_writeback(wb);
 	_leave(" = %d", ret);
+out:
+	mutex_unlock(&inode->i_mutex);
 	return ret;
 }
 
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index f024d8aaddef..9205cf25f1c6 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -87,7 +87,8 @@ static int bad_file_release(struct inode *inode, struct file *filp)
 	return -EIO;
 }
 
-static int bad_file_fsync(struct file *file, int datasync)
+static int bad_file_fsync(struct file *file, loff_t start, loff_t end,
+			  int datasync)
 {
 	return -EIO;
 }
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 966617a422d9..9fb0b15331d3 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -378,7 +378,7 @@ out:
 	return retval;
 }
 	
-int blkdev_fsync(struct file *filp, int datasync)
+int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
 {
 	struct inode *bd_inode = filp->f_mapping->host;
 	struct block_device *bdev = I_BDEV(bd_inode);
@@ -389,14 +389,10 @@ int blkdev_fsync(struct file *filp, int datasync)
 	 * i_mutex and doing so causes performance issues with concurrent
 	 * O_SYNC writers to a block device.
 	 */
-	mutex_unlock(&bd_inode->i_mutex);
-
 	error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
 	if (error == -EOPNOTSUPP)
 		error = 0;
 
-	mutex_lock(&bd_inode->i_mutex);
-
 	return error;
 }
 EXPORT_SYMBOL(blkdev_fsync);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f1ff62bff1b3..82be74efbb26 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2605,7 +2605,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
 			   struct inode *inode);
 int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
-int btrfs_sync_file(struct file *file, int datasync);
+int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
 int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			    int skip_pinned);
 extern const struct file_operations btrfs_file_operations;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index bd4d061c6e4d..59cbdb120ad0 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1452,7 +1452,7 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
  * important optimization for directories because holding the mutex prevents
  * new operations on the dir while we write to disk.
  */
-int btrfs_sync_file(struct file *file, int datasync)
+int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 {
 	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
@@ -1462,9 +1462,13 @@ int btrfs_sync_file(struct file *file, int datasync)
 
 	trace_btrfs_sync_file(file, datasync);
 
+	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (ret)
+		return ret;
+	mutex_lock(&inode->i_mutex);
+
 	/* we wait first, since the writeback may change the inode */
 	root->log_batch++;
-	/* the VFS called filemap_fdatawrite for us */
 	btrfs_wait_ordered_range(inode, 0, (u64)-1);
 	root->log_batch++;
 
@@ -1472,8 +1476,10 @@ int btrfs_sync_file(struct file *file, int datasync)
 	 * check the transaction that last modified this inode
 	 * and see if its already been committed
 	 */
-	if (!BTRFS_I(inode)->last_trans)
+	if (!BTRFS_I(inode)->last_trans) {
+		mutex_unlock(&inode->i_mutex);
 		goto out;
+	}
 
 	/*
 	 * if the last transaction that changed this file was before
@@ -1484,6 +1490,7 @@ int btrfs_sync_file(struct file *file, int datasync)
 	if (BTRFS_I(inode)->last_trans <=
 	    root->fs_info->last_trans_committed) {
 		BTRFS_I(inode)->last_trans = 0;
+		mutex_unlock(&inode->i_mutex);
 		goto out;
 	}
 
@@ -1496,12 +1503,15 @@ int btrfs_sync_file(struct file *file, int datasync)
 	trans = btrfs_start_transaction(root, 0);
 	if (IS_ERR(trans)) {
 		ret = PTR_ERR(trans);
+		mutex_unlock(&inode->i_mutex);
 		goto out;
 	}
 
 	ret = btrfs_log_dentry_safe(trans, root, dentry);
-	if (ret < 0)
+	if (ret < 0) {
+		mutex_unlock(&inode->i_mutex);
 		goto out;
+	}
 
 	/* we've logged all the items and now have a consistent
 	 * version of the file in the log.  It is possible that
@@ -1513,7 +1523,7 @@ int btrfs_sync_file(struct file *file, int datasync)
 	 * file again, but that will end up using the synchronization
 	 * inside btrfs_sync_log to keep things safe.
 	 */
-	mutex_unlock(&dentry->d_inode->i_mutex);
+	mutex_unlock(&inode->i_mutex);
 
 	if (ret != BTRFS_NO_LOG_SYNC) {
 		if (ret > 0) {
@@ -1528,7 +1538,6 @@ int btrfs_sync_file(struct file *file, int datasync)
 	} else {
 		ret = btrfs_end_transaction(trans, root);
 	}
-	mutex_lock(&dentry->d_inode->i_mutex);
 out:
 	return ret > 0 ? -EIO : ret;
 }
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index f605753c8fe9..8d74ad7ba556 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1811,7 +1811,7 @@ out:
 	spin_unlock(&ci->i_unsafe_lock);
 }
 
-int ceph_fsync(struct file *file, int datasync)
+int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 {
 	struct inode *inode = file->f_mapping->host;
 	struct ceph_inode_info *ci = ceph_inode(inode);
@@ -1822,9 +1822,10 @@ int ceph_fsync(struct file *file, int datasync)
 	dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
 	sync_write_wait(inode);
 
-	ret = filemap_write_and_wait(inode->i_mapping);
+	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
 	if (ret < 0)
 		return ret;
+	mutex_lock(&inode->i_mutex);
 
 	dirty = try_flush_caps(inode, NULL, &flush_tid);
 	dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
@@ -1841,6 +1842,7 @@ int ceph_fsync(struct file *file, int datasync)
 	}
 
 	dout("fsync %p%s done\n", inode, datasync ? " datasync" : "");
+	mutex_unlock(&inode->i_mutex);
 	return ret;
 }
 
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 0972b457a03f..1065ac779840 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1118,7 +1118,8 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
  * an fsync() on a dir will wait for any uncommitted directory
  * operations to commit.
  */
-static int ceph_dir_fsync(struct file *file, int datasync)
+static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end,
+			  int datasync)
 {
 	struct inode *inode = file->f_path.dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
@@ -1128,6 +1129,11 @@ static int ceph_dir_fsync(struct file *file, int datasync)
 	int ret = 0;
 
 	dout("dir_fsync %p\n", inode);
+	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (ret)
+		return ret;
+	mutex_lock(&inode->i_mutex);
+
 	spin_lock(&ci->i_unsafe_lock);
 	if (list_empty(head))
 		goto out;
@@ -1161,6 +1167,8 @@ static int ceph_dir_fsync(struct file *file, int datasync)
 	} while (req->r_tid < last_tid);
 out:
 	spin_unlock(&ci->i_unsafe_lock);
+	mutex_unlock(&inode->i_mutex);
+
 	return ret;
 }
 
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 56c41ef47cad..30446b144e3d 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -728,7 +728,8 @@ extern void ceph_put_cap(struct ceph_mds_client *mdsc,
 
 extern void ceph_queue_caps_release(struct inode *inode);
 extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
-extern int ceph_fsync(struct file *file, int datasync);
+extern int ceph_fsync(struct file *file, loff_t start, loff_t end,
+		      int datasync);
 extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
 				    struct ceph_mds_session *session);
 extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci,
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 036ca83e5f46..fbd050c8d52a 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -91,8 +91,8 @@ extern ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
 extern ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
 				  unsigned long nr_segs, loff_t pos);
 extern int cifs_lock(struct file *, int, struct file_lock *);
-extern int cifs_fsync(struct file *, int);
-extern int cifs_strict_fsync(struct file *, int);
+extern int cifs_fsync(struct file *, loff_t, loff_t, int);
+extern int cifs_strict_fsync(struct file *, loff_t, loff_t, int);
 extern int cifs_flush(struct file *, fl_owner_t id);
 extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
 extern int cifs_file_strict_mmap(struct file * , struct vm_area_struct *);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index bb71471a4d9d..cef584451113 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1401,7 +1401,8 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
 	return rc;
 }
 
-int cifs_strict_fsync(struct file *file, int datasync)
+int cifs_strict_fsync(struct file *file, loff_t start, loff_t end,
+		      int datasync)
 {
 	int xid;
 	int rc = 0;
@@ -1410,6 +1411,11 @@ int cifs_strict_fsync(struct file *file, int datasync)
 	struct inode *inode = file->f_path.dentry->d_inode;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 
+	rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (rc)
+		return rc;
+	mutex_lock(&inode->i_mutex);
+
 	xid = GetXid();
 
 	cFYI(1, "Sync file - name: %s datasync: 0x%x",
@@ -1428,16 +1434,23 @@ int cifs_strict_fsync(struct file *file, int datasync)
 		rc = CIFSSMBFlush(xid, tcon, smbfile->netfid);
 
 	FreeXid(xid);
+	mutex_unlock(&inode->i_mutex);
 	return rc;
 }
 
-int cifs_fsync(struct file *file, int datasync)
+int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 {
 	int xid;
 	int rc = 0;
 	struct cifs_tcon *tcon;
 	struct cifsFileInfo *smbfile = file->private_data;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+	struct inode *inode = file->f_mapping->host;
+
+	rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (rc)
+		return rc;
+	mutex_lock(&inode->i_mutex);
 
 	xid = GetXid();
 
@@ -1449,6 +1462,7 @@ int cifs_fsync(struct file *file, int datasync)
 		rc = CIFSSMBFlush(xid, tcon, smbfile->netfid);
 
 	FreeXid(xid);
+	mutex_unlock(&inode->i_mutex);
 	return rc;
 }
 
diff --git a/fs/coda/coda_int.h b/fs/coda/coda_int.h
index 6b443ff43a19..b7143cf783ac 100644
--- a/fs/coda/coda_int.h
+++ b/fs/coda/coda_int.h
@@ -11,7 +11,7 @@ extern int coda_fake_statfs;
 
 void coda_destroy_inodecache(void);
 int coda_init_inodecache(void);
-int coda_fsync(struct file *coda_file, int datasync);
+int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync);
 void coda_sysctl_init(void);
 void coda_sysctl_clean(void);
 
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 0433057be330..8edd404e6419 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -199,7 +199,7 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
 	return 0;
 }
 
-int coda_fsync(struct file *coda_file, int datasync)
+int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync)
 {
 	struct file *host_file;
 	struct inode *coda_inode = coda_file->f_path.dentry->d_inode;
@@ -210,6 +210,11 @@ int coda_fsync(struct file *coda_file, int datasync)
 	      S_ISLNK(coda_inode->i_mode)))
 		return -EINVAL;
 
+	err = filemap_write_and_wait_range(coda_inode->i_mapping, start, end);
+	if (err)
+		return err;
+	mutex_lock(&coda_inode->i_mutex);
+
 	cfi = CODA_FTOC(coda_file);
 	BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
 	host_file = cfi->cfi_container;
@@ -217,6 +222,7 @@ int coda_fsync(struct file *coda_file, int datasync)
 	err = vfs_fsync(host_file, datasync);
 	if (!err && !datasync)
 		err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode));
+	mutex_unlock(&coda_inode->i_mutex);
 
 	return err;
 }
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 4ec9eb00a241..c6ac98cf9baa 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -270,14 +270,15 @@ static int ecryptfs_release(struct inode *inode, struct file *file)
 }
 
 static int
-ecryptfs_fsync(struct file *file, int datasync)
+ecryptfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 {
 	int rc = 0;
 
-	rc = generic_file_fsync(file, datasync);
+	rc = generic_file_fsync(file, start, end, datasync);
 	if (rc)
 		goto out;
-	rc = vfs_fsync(ecryptfs_file_to_lower(file), datasync);
+	rc = vfs_fsync_range(ecryptfs_file_to_lower(file), start, end,
+			     datasync);
 out:
 	return rc;
 }
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 45ca323d8363..491c6c078e7f 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -42,11 +42,19 @@ static int exofs_release_file(struct inode *inode, struct file *filp)
  *   Note, in exofs all metadata is written as part of inode, regardless.
  *   The writeout is synchronous
  */
-static int exofs_file_fsync(struct file *filp, int datasync)
+static int exofs_file_fsync(struct file *filp, loff_t start, loff_t end,
+			    int datasync)
 {
+	struct inode *inode = filp->f_mapping->host;
 	int ret;
 
+	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (ret)
+		return ret;
+
+	mutex_lock(&inode->i_mutex);
 	ret = sync_inode_metadata(filp->f_mapping->host, 1);
+	mutex_unlock(&inode->i_mutex);
 	return ret;
 }
 
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 645be9e7ee47..af9fc89b1b2d 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -150,7 +150,8 @@ extern void ext2_write_super (struct super_block *);
 extern const struct file_operations ext2_dir_operations;
 
 /* file.c */
-extern int ext2_fsync(struct file *file, int datasync);
+extern int ext2_fsync(struct file *file, loff_t start, loff_t end,
+		      int datasync);
 extern const struct inode_operations ext2_file_inode_operations;
 extern const struct file_operations ext2_file_operations;
 extern const struct file_operations ext2_xip_file_operations;
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 49eec9456c5b..82e06321de35 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -40,13 +40,13 @@ static int ext2_release_file (struct inode * inode, struct file * filp)
 	return 0;
 }
 
-int ext2_fsync(struct file *file, int datasync)
+int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 {
 	int ret;
 	struct super_block *sb = file->f_mapping->host->i_sb;
 	struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
 
-	ret = generic_file_fsync(file, datasync);
+	ret = generic_file_fsync(file, start, end, datasync);
 	if (ret == -EIO || test_and_clear_bit(AS_EIO, &mapping->flags)) {
 		/* We don't really know where the IO error happened... */
 		ext2_error(sb, __func__,
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index 09b13bb34c94..0bcf63adb80a 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -43,7 +43,7 @@
  * inode to disk.
  */
 
-int ext3_sync_file(struct file *file, int datasync)
+int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 {
 	struct inode *inode = file->f_mapping->host;
 	struct ext3_inode_info *ei = EXT3_I(inode);
@@ -54,6 +54,17 @@ int ext3_sync_file(struct file *file, int datasync)
 	if (inode->i_sb->s_flags & MS_RDONLY)
 		return 0;
 
+	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (ret)
+		return ret;
+
+	/*
+	 * Taking the mutex here just to keep consistent with how fsync was
+	 * called previously, however it looks like we don't need to take
+	 * i_mutex at all.
+	 */
+	mutex_lock(&inode->i_mutex);
+
 	J_ASSERT(ext3_journal_current_handle() == NULL);
 
 	/*
@@ -70,8 +81,10 @@ int ext3_sync_file(struct file *file, int datasync)
 	 *  (they were dirtied by commit).  But that's OK - the blocks are
 	 *  safe in-journal, which is all fsync() needs to ensure.
 	 */
-	if (ext3_should_journal_data(inode))
+	if (ext3_should_journal_data(inode)) {
+		mutex_unlock(&inode->i_mutex);
 		return ext3_force_commit(inode->i_sb);
+	}
 
 	if (datasync)
 		commit_tid = atomic_read(&ei->i_datasync_tid);
@@ -91,5 +104,6 @@ int ext3_sync_file(struct file *file, int datasync)
 	 */
 	if (needs_barrier)
 		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+	mutex_unlock(&inode->i_mutex);
 	return ret;
 }
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 1921392cd708..fa44df879711 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1758,7 +1758,7 @@ extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
 extern void ext4_htree_free_dir_info(struct dir_private_info *p);
 
 /* fsync.c */
-extern int ext4_sync_file(struct file *, int);
+extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
 extern int ext4_flush_completed_IO(struct inode *);
 
 /* hash.c */
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index ce66d2fe826c..da3bed3e0c29 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -151,6 +151,32 @@ static int ext4_sync_parent(struct inode *inode)
 	return ret;
 }
 
+/**
+ * __sync_file - generic_file_fsync without the locking and filemap_write
+ * @inode:	inode to sync
+ * @datasync:	only sync essential metadata if true
+ *
+ * This is just generic_file_fsync without the locking.  This is needed for
+ * nojournal mode to make sure this inodes data/metadata makes it to disk
+ * properly.  The i_mutex should be held already.
+ */
+static int __sync_inode(struct inode *inode, int datasync)
+{
+	int err;
+	int ret;
+
+	ret = sync_mapping_buffers(inode->i_mapping);
+	if (!(inode->i_state & I_DIRTY))
+		return ret;
+	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+		return ret;
+
+	err = sync_inode_metadata(inode, 1);
+	if (ret == 0)
+		ret = err;
+	return ret;
+}
+
 /*
  * akpm: A new design for ext4_sync_file().
  *
@@ -165,7 +191,7 @@ static int ext4_sync_parent(struct inode *inode)
  * i_mutex lock is held when entering and exiting this function
  */
 
-int ext4_sync_file(struct file *file, int datasync)
+int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 {
 	struct inode *inode = file->f_mapping->host;
 	struct ext4_inode_info *ei = EXT4_I(inode);
@@ -178,15 +204,20 @@ int ext4_sync_file(struct file *file, int datasync)
 
 	trace_ext4_sync_file_enter(file, datasync);
 
+	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (ret)
+		return ret;
+	mutex_lock(&inode->i_mutex);
+
 	if (inode->i_sb->s_flags & MS_RDONLY)
-		return 0;
+		goto out;
 
 	ret = ext4_flush_completed_IO(inode);
 	if (ret < 0)
 		goto out;
 
 	if (!journal) {
-		ret = generic_file_fsync(file, datasync);
+		ret = __sync_inode(inode, datasync);
 		if (!ret && !list_empty(&inode->i_dentry))
 			ret = ext4_sync_parent(inode);
 		goto out;
@@ -220,6 +251,7 @@ int ext4_sync_file(struct file *file, int datasync)
 	if (needs_barrier)
 		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
  out:
+	mutex_unlock(&inode->i_mutex);
 	trace_ext4_sync_file_exit(inode, ret);
 	return ret;
 }
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index a975b4147e91..a5d3853822e0 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -310,7 +310,8 @@ extern int fat_setattr(struct dentry * dentry, struct iattr * attr);
 extern void fat_truncate_blocks(struct inode *inode, loff_t offset);
 extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		       struct kstat *stat);
-extern int fat_file_fsync(struct file *file, int datasync);
+extern int fat_file_fsync(struct file *file, loff_t start, loff_t end,
+			  int datasync);
 
 /* fat/inode.c */
 extern void fat_attach(struct inode *inode, loff_t i_pos);
diff --git a/fs/fat/file.c b/fs/fat/file.c
index e1587c54d3c1..c118acf16e43 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -149,12 +149,12 @@ static int fat_file_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-int fat_file_fsync(struct file *filp, int datasync)
+int fat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
 {
 	struct inode *inode = filp->f_mapping->host;
 	int res, err;
 
-	res = generic_file_fsync(filp, datasync);
+	res = generic_file_fsync(filp, start, end, datasync);
 	err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping);
 
 	return res ? res : err;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 02063dde2728..9f63e493a9b6 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1176,9 +1176,10 @@ static int fuse_dir_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static int fuse_dir_fsync(struct file *file, int datasync)
+static int fuse_dir_fsync(struct file *file, loff_t start, loff_t end,
+			  int datasync)
 {
-	return fuse_fsync_common(file, datasync, 1);
+	return fuse_fsync_common(file, start, end, datasync, 1);
 }
 
 static bool update_mtime(unsigned ivalid)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 73b89df20851..7bb685cdd00c 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -400,7 +400,8 @@ static void fuse_sync_writes(struct inode *inode)
 	fuse_release_nowrite(inode);
 }
 
-int fuse_fsync_common(struct file *file, int datasync, int isdir)
+int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
+		      int datasync, int isdir)
 {
 	struct inode *inode = file->f_mapping->host;
 	struct fuse_conn *fc = get_fuse_conn(inode);
@@ -412,9 +413,15 @@ int fuse_fsync_common(struct file *file, int datasync, int isdir)
 	if (is_bad_inode(inode))
 		return -EIO;
 
+	err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (err)
+		return err;
+
 	if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir))
 		return 0;
 
+	mutex_lock(&inode->i_mutex);
+
 	/*
 	 * Start writeback against all dirty pages of the inode, then
 	 * wait for all outstanding writes, before sending the FSYNC
@@ -422,13 +429,15 @@ int fuse_fsync_common(struct file *file, int datasync, int isdir)
 	 */
 	err = write_inode_now(inode, 0);
 	if (err)
-		return err;
+		goto out;
 
 	fuse_sync_writes(inode);
 
 	req = fuse_get_req(fc);
-	if (IS_ERR(req))
-		return PTR_ERR(req);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto out;
+	}
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.fh = ff->fh;
@@ -448,12 +457,15 @@ int fuse_fsync_common(struct file *file, int datasync, int isdir)
 			fc->no_fsync = 1;
 		err = 0;
 	}
+out:
+	mutex_unlock(&inode->i_mutex);
 	return err;
 }
 
-static int fuse_fsync(struct file *file, int datasync)
+static int fuse_fsync(struct file *file, loff_t start, loff_t end,
+		      int datasync)
 {
-	return fuse_fsync_common(file, datasync, 0);
+	return fuse_fsync_common(file, start, end, datasync, 0);
 }
 
 void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos,
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index b788becada76..c6aa2d4b8517 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -589,7 +589,8 @@ void fuse_release_common(struct file *file, int opcode);
 /**
  * Send FSYNC or FSYNCDIR request
  */
-int fuse_fsync_common(struct file *file, int datasync, int isdir);
+int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
+		      int datasync, int isdir);
 
 /**
  * Notify poll wakeup
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 89c39e53760d..f82cb5e1cb6b 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -544,7 +544,9 @@ static int gfs2_close(struct inode *inode, struct file *file)
 
 /**
  * gfs2_fsync - sync the dirty data for a file (across the cluster)
- * @file: the file that points to the dentry (we ignore this)
+ * @file: the file that points to the dentry
+ * @start: the start position in the file to sync
+ * @end: the end position in the file to sync
  * @datasync: set if we can ignore timestamp changes
  *
  * The VFS will flush data for us. We only need to worry
@@ -553,23 +555,32 @@ static int gfs2_close(struct inode *inode, struct file *file)
  * Returns: errno
  */
 
-static int gfs2_fsync(struct file *file, int datasync)
+static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
+		      int datasync)
 {
 	struct inode *inode = file->f_mapping->host;
 	int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC);
 	struct gfs2_inode *ip = GFS2_I(inode);
 	int ret;
 
+	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (ret)
+		return ret;
+	mutex_lock(&inode->i_mutex);
+
 	if (datasync)
 		sync_state &= ~I_DIRTY_SYNC;
 
 	if (sync_state) {
 		ret = sync_inode_metadata(inode, 1);
-		if (ret)
+		if (ret) {
+			mutex_unlock(&inode->i_mutex);
 			return ret;
+		}
 		gfs2_ail_flush(ip->i_gl);
 	}
 
+	mutex_unlock(&inode->i_mutex);
 	return 0;
 }
 
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 5e7c3f309617..96a1b625fc74 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -627,12 +627,18 @@ int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr)
 	return 0;
 }
 
-static int hfs_file_fsync(struct file *filp, int datasync)
+static int hfs_file_fsync(struct file *filp, loff_t start, loff_t end,
+			  int datasync)
 {
 	struct inode *inode = filp->f_mapping->host;
 	struct super_block * sb;
 	int ret, err;
 
+	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (ret)
+		return ret;
+	mutex_lock(&inode->i_mutex);
+
 	/* sync the inode to buffers */
 	ret = write_inode_now(inode, 0);
 
@@ -649,6 +655,7 @@ static int hfs_file_fsync(struct file *filp, int datasync)
 	err = sync_blockdev(sb->s_bdev);
 	if (!ret)
 		ret = err;
+	mutex_unlock(&inode->i_mutex);
 	return ret;
 }
 
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index d6857523336d..38184e360932 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -392,7 +392,8 @@ int hfsplus_cat_read_inode(struct inode *, struct hfs_find_data *);
 int hfsplus_cat_write_inode(struct inode *);
 struct inode *hfsplus_new_inode(struct super_block *, int);
 void hfsplus_delete_inode(struct inode *);
-int hfsplus_file_fsync(struct file *file, int datasync);
+int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
+		       int datasync);
 
 /* ioctl.c */
 long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 5b1cb98741cc..30486e01d003 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -308,13 +308,19 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
 	return 0;
 }
 
-int hfsplus_file_fsync(struct file *file, int datasync)
+int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
+		       int datasync)
 {
 	struct inode *inode = file->f_mapping->host;
 	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 	struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
 	int error = 0, error2;
 
+	error = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (error)
+		return error;
+	mutex_lock(&inode->i_mutex);
+
 	/*
 	 * Sync inode metadata into the catalog and extent trees.
 	 */
@@ -342,6 +348,8 @@ int hfsplus_file_fsync(struct file *file, int datasync)
 	if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
 		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
 
+	mutex_unlock(&inode->i_mutex);
+
 	return error;
 }
 
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 6e449c599b9d..0d22afdd4611 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -362,9 +362,20 @@ retry:
 	return 0;
 }
 
-int hostfs_fsync(struct file *file, int datasync)
+int hostfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 {
-	return fsync_file(HOSTFS_I(file->f_mapping->host)->fd, datasync);
+	struct inode *inode = file->f_mapping->host;
+	int ret;
+
+	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (ret)
+		return ret;
+
+	mutex_lock(&inode->i_mutex);
+	ret = fsync_file(HOSTFS_I(inode)->fd, datasync);
+	mutex_unlock(&inode->i_mutex);
+
+	return ret;
 }
 
 static const struct file_operations hostfs_file_fops = {
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 89c500ee5213..89d2a5803ae3 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -18,9 +18,14 @@ static int hpfs_file_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
-int hpfs_file_fsync(struct file *file, int datasync)
+int hpfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 {
 	struct inode *inode = file->f_mapping->host;
+	int ret;
+
+	ret = filemap_write_and_wait_range(file->f_mapping, start, end);
+	if (ret)
+		return ret;
 	return sync_blockdev(inode->i_sb->s_bdev);
 }
 
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index dd552f862c8f..331b5e234ef3 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -258,7 +258,7 @@ void hpfs_set_ea(struct inode *, struct fnode *, const char *,
 
 /* file.c */
 
-int hpfs_file_fsync(struct file *, int);
+int hpfs_file_fsync(struct file *, loff_t, loff_t, int);
 extern const struct file_operations hpfs_file_ops;
 extern const struct inode_operations hpfs_file_iops;
 extern const struct address_space_operations hpfs_aops;
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 85c098a499f3..8635be5ffd97 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -573,9 +573,10 @@ static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir)
 	return err;
 }
 
-static int hppfs_fsync(struct file *file, int datasync)
+static int hppfs_fsync(struct file *file, loff_t start, loff_t end,
+		       int datasync)
 {
-	return 0;
+	return filemap_write_and_wait_range(file->f_mapping, start, end);
 }
 
 static const struct file_operations hppfs_dir_fops = {
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 1c0a08d711aa..3989f7e09f7f 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -27,13 +27,20 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
 			struct page **pagep, void **fsdata);
 static int jffs2_readpage (struct file *filp, struct page *pg);
 
-int jffs2_fsync(struct file *filp, int datasync)
+int jffs2_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
 {
 	struct inode *inode = filp->f_mapping->host;
 	struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
+	int ret;
+
+	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (ret)
+		return ret;
 
+	mutex_lock(&inode->i_mutex);
 	/* Trigger GC to flush any pending writes for this inode */
 	jffs2_flush_wbuf_gc(c, inode->i_ino);
+	mutex_unlock(&inode->i_mutex);
 
 	return 0;
 }
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 65c6c43ca482..9c252835e8e5 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -158,7 +158,7 @@ extern const struct inode_operations jffs2_dir_inode_operations;
 extern const struct file_operations jffs2_file_operations;
 extern const struct inode_operations jffs2_file_inode_operations;
 extern const struct address_space_operations jffs2_file_address_operations;
-int jffs2_fsync(struct file *, int);
+int jffs2_fsync(struct file *, loff_t, loff_t, int);
 int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg);
 
 /* ioctl.c */
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 9f32315acef1..7527855b5cc6 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -28,19 +28,26 @@
 #include "jfs_acl.h"
 #include "jfs_debug.h"
 
-int jfs_fsync(struct file *file, int datasync)
+int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 {
 	struct inode *inode = file->f_mapping->host;
 	int rc = 0;
 
+	rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (rc)
+		return rc;
+
+	mutex_lock(&inode->i_mutex);
 	if (!(inode->i_state & I_DIRTY) ||
 	    (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) {
 		/* Make sure committed changes hit the disk */
 		jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1);
+		mutex_unlock(&inode->i_mutex);
 		return rc;
 	}
 
 	rc |= jfs_commit_inode(inode, 1);
+	mutex_unlock(&inode->i_mutex);
 
 	return rc ? -EIO : 0;
 }
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index ec2fb8b945fc..9271cfe4a149 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -21,7 +21,7 @@
 struct fid;
 
 extern struct inode *ialloc(struct inode *, umode_t);
-extern int jfs_fsync(struct file *, int);
+extern int jfs_fsync(struct file *, loff_t, loff_t, int);
 extern long jfs_ioctl(struct file *, unsigned int, unsigned long);
 extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long);
 extern struct inode *jfs_iget(struct super_block *, unsigned long);
diff --git a/fs/libfs.c b/fs/libfs.c
index bd50b11f92da..8f2271a5df53 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -905,21 +905,29 @@ EXPORT_SYMBOL_GPL(generic_fh_to_parent);
  * filesystems which track all non-inode metadata in the buffers list
  * hanging off the address_space structure.
  */
-int generic_file_fsync(struct file *file, int datasync)
+int generic_file_fsync(struct file *file, loff_t start, loff_t end,
+		       int datasync)
 {
 	struct inode *inode = file->f_mapping->host;
 	int err;
 	int ret;
 
+	err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (err)
+		return err;
+
+	mutex_lock(&inode->i_mutex);
 	ret = sync_mapping_buffers(inode->i_mapping);
 	if (!(inode->i_state & I_DIRTY))
-		return ret;
+		goto out;
 	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-		return ret;
+		goto out;
 
 	err = sync_inode_metadata(inode, 1);
 	if (ret == 0)
 		ret = err;
+out:
+	mutex_unlock(&inode->i_mutex);
 	return ret;
 }
 EXPORT_SYMBOL(generic_file_fsync);
@@ -956,7 +964,7 @@ EXPORT_SYMBOL(generic_check_addressable);
 /*
  * No-op implementation of ->fsync for in-memory filesystems.
  */
-int noop_fsync(struct file *file, int datasync)
+int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 {
 	return 0;
 }
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index c2ad7028def4..b548c87a86f1 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -219,11 +219,20 @@ long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	}
 }
 
-int logfs_fsync(struct file *file, int datasync)
+int logfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 {
 	struct super_block *sb = file->f_mapping->host->i_sb;
+	struct inode *inode = file->f_mapping->host;
+	int ret;
+
+	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (ret)
+		return ret;
 
+	mutex_lock(&inode->i_mutex);
 	logfs_write_anchor(sb);
+	mutex_unlock(&inode->i_mutex);
+
 	return 0;
 }
 
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index 57afd4a6fabb..f22d108bfa5d 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -506,7 +506,7 @@ extern const struct file_operations logfs_reg_fops;
 extern const struct address_space_operations logfs_reg_aops;
 int logfs_readpage(struct file *file, struct page *page);
 long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
-int logfs_fsync(struct file *file, int datasync);
+int logfs_fsync(struct file *file, loff_t start, loff_t end, int datasync);
 
 /* gc.c */
 u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec);
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 0ed65e0c3dfe..64a326418aa2 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -20,9 +20,9 @@
 
 #include "ncp_fs.h"
 
-static int ncp_fsync(struct file *file, int datasync)
+static int ncp_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 {
-	return 0;
+	return filemap_write_and_wait_range(file->f_mapping, start, end);
 }
 
 /*
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 8a45e6d1f6a4..57f578e2560a 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -56,7 +56,7 @@ static int nfs_link(struct dentry *, struct inode *, struct dentry *);
 static int nfs_mknod(struct inode *, struct dentry *, int, dev_t);
 static int nfs_rename(struct inode *, struct dentry *,
 		      struct inode *, struct dentry *);
-static int nfs_fsync_dir(struct file *, int);
+static int nfs_fsync_dir(struct file *, loff_t, loff_t, int);
 static loff_t nfs_llseek_dir(struct file *, loff_t, int);
 static void nfs_readdir_clear_array(struct page*);
 
@@ -945,15 +945,19 @@ out:
  * All directory operations under NFS are synchronous, so fsync()
  * is a dummy operation.
  */
-static int nfs_fsync_dir(struct file *filp, int datasync)
+static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end,
+			 int datasync)
 {
 	struct dentry *dentry = filp->f_path.dentry;
+	struct inode *inode = dentry->d_inode;
 
 	dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n",
 			dentry->d_parent->d_name.name, dentry->d_name.name,
 			datasync);
 
+	mutex_lock(&inode->i_mutex);
 	nfs_inc_stats(dentry->d_inode, NFSIOS_VFSFSYNC);
+	mutex_unlock(&inode->i_mutex);
 	return 0;
 }
 
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 2c1705b6acd7..28b8c3f3cda3 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -55,7 +55,7 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
 static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov,
 				unsigned long nr_segs, loff_t pos);
 static int  nfs_file_flush(struct file *, fl_owner_t id);
-static int  nfs_file_fsync(struct file *, int datasync);
+static int  nfs_file_fsync(struct file *, loff_t, loff_t, int datasync);
 static int nfs_check_flags(int flags);
 static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl);
 static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl);
@@ -308,7 +308,7 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
  * fall back to doing a synchronous write.
  */
 static int
-nfs_file_fsync(struct file *file, int datasync)
+nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 {
 	struct dentry *dentry = file->f_path.dentry;
 	struct nfs_open_context *ctx = nfs_file_open_context(file);
@@ -316,11 +316,15 @@ nfs_file_fsync(struct file *file, int datasync)
 	int have_error, status;
 	int ret = 0;
 
-
 	dprintk("NFS: fsync file(%s/%s) datasync %d\n",
 			dentry->d_parent->d_name.name, dentry->d_name.name,
 			datasync);
 
+	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (ret)
+		return ret;
+	mutex_lock(&inode->i_mutex);
+
 	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
 	have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
 	status = nfs_commit_inode(inode, FLUSH_SYNC);
@@ -332,6 +336,7 @@ nfs_file_fsync(struct file *file, int datasync)
 	if (!ret && !datasync)
 		/* application has asked for meta-data sync */
 		ret = pnfs_layoutcommit_inode(inode, true);
+	mutex_unlock(&inode->i_mutex);
 	return ret;
 }
 
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index d7eeca62febd..26601529dc17 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -27,7 +27,7 @@
 #include "nilfs.h"
 #include "segment.h"
 
-int nilfs_sync_file(struct file *file, int datasync)
+int nilfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 {
 	/*
 	 * Called from fsync() system call
@@ -40,8 +40,15 @@ int nilfs_sync_file(struct file *file, int datasync)
 	struct inode *inode = file->f_mapping->host;
 	int err;
 
-	if (!nilfs_inode_dirty(inode))
+	err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (err)
+		return err;
+	mutex_lock(&inode->i_mutex);
+
+	if (!nilfs_inode_dirty(inode)) {
+		mutex_unlock(&inode->i_mutex);
 		return 0;
+	}
 
 	if (datasync)
 		err = nilfs_construct_dsync_segment(inode->i_sb, inode, 0,
@@ -49,6 +56,7 @@ int nilfs_sync_file(struct file *file, int datasync)
 	else
 		err = nilfs_construct_segment(inode->i_sb);
 
+	mutex_unlock(&inode->i_mutex);
 	return err;
 }
 
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 6fb7511c0328..255d5e1c03b7 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -235,7 +235,7 @@ extern void nilfs_set_link(struct inode *, struct nilfs_dir_entry *,
 			   struct page *, struct inode *);
 
 /* file.c */
-extern int nilfs_sync_file(struct file *, int);
+extern int nilfs_sync_file(struct file *, loff_t, loff_t, int);
 
 /* ioctl.c */
 long nilfs_ioctl(struct file *, unsigned int, unsigned long);
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 0f48e7c5d9e1..99e36107ff60 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -1527,13 +1527,20 @@ static int ntfs_dir_open(struct inode *vi, struct file *filp)
  * this problem for now.  We do write the $BITMAP attribute if it is present
  * which is the important one for a directory so things are not too bad.
  */
-static int ntfs_dir_fsync(struct file *filp, int datasync)
+static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
+			  int datasync)
 {
 	struct inode *bmp_vi, *vi = filp->f_mapping->host;
 	int err, ret;
 	ntfs_attr na;
 
 	ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
+
+	err = filemap_write_and_wait_range(vi->i_mapping, start, end);
+	if (err)
+		return err;
+	mutex_lock(&vi->i_mutex);
+
 	BUG_ON(!S_ISDIR(vi->i_mode));
 	/* If the bitmap attribute inode is in memory sync it, too. */
 	na.mft_no = vi->i_ino;
@@ -1555,6 +1562,7 @@ static int ntfs_dir_fsync(struct file *filp, int datasync)
 	else
 		ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx.  Error "
 				"%u.", datasync ? "data" : "", vi->i_ino, -ret);
+	mutex_unlock(&vi->i_mutex);
 	return ret;
 }
 
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index b59f5ac26bef..c587e2d27183 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2152,12 +2152,19 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
  * with this inode but since we have no simple way of getting to them we ignore
  * this problem for now.
  */
-static int ntfs_file_fsync(struct file *filp, int datasync)
+static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end,
+			   int datasync)
 {
 	struct inode *vi = filp->f_mapping->host;
 	int err, ret = 0;
 
 	ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
+
+	err = filemap_write_and_wait_range(vi->i_mapping, start, end);
+	if (err)
+		return err;
+	mutex_lock(&vi->i_mutex);
+
 	BUG_ON(S_ISDIR(vi->i_mode));
 	if (!datasync || !NInoNonResident(NTFS_I(vi)))
 		ret = __ntfs_write_inode(vi, 1);
@@ -2175,6 +2182,7 @@ static int ntfs_file_fsync(struct file *filp, int datasync)
 	else
 		ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx.  Error "
 				"%u.", datasync ? "data" : "", vi->i_ino, -ret);
+	mutex_unlock(&vi->i_mutex);
 	return ret;
 }
 
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 22d604601957..0fc2bd34039d 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -171,7 +171,8 @@ static int ocfs2_dir_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static int ocfs2_sync_file(struct file *file, int datasync)
+static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
+			   int datasync)
 {
 	int err = 0;
 	journal_t *journal;
@@ -184,6 +185,16 @@ static int ocfs2_sync_file(struct file *file, int datasync)
 			      file->f_path.dentry->d_name.name,
 			      (unsigned long long)datasync);
 
+	err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (err)
+		return err;
+
+	/*
+	 * Probably don't need the i_mutex at all in here, just putting it here
+	 * to be consistent with how fsync used to be called, someone more
+	 * familiar with the fs could possibly remove it.
+	 */
+	mutex_lock(&inode->i_mutex);
 	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
 		/*
 		 * We still have to flush drive's caches to get data to the
@@ -200,6 +211,7 @@ static int ocfs2_sync_file(struct file *file, int datasync)
 bail:
 	if (err)
 		mlog_errno(err);
+	mutex_unlock(&inode->i_mutex);
 
 	return (err < 0) ? -EIO : 0;
 }
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 198dabf1b2bb..133e9355dc6f 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -14,7 +14,8 @@
 extern const struct reiserfs_key MIN_KEY;
 
 static int reiserfs_readdir(struct file *, void *, filldir_t);
-static int reiserfs_dir_fsync(struct file *filp, int datasync);
+static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
+			      int datasync);
 
 const struct file_operations reiserfs_dir_operations = {
 	.llseek = generic_file_llseek,
@@ -27,13 +28,21 @@ const struct file_operations reiserfs_dir_operations = {
 #endif
 };
 
-static int reiserfs_dir_fsync(struct file *filp, int datasync)
+static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
+			      int datasync)
 {
 	struct inode *inode = filp->f_mapping->host;
 	int err;
+
+	err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (err)
+		return err;
+
+	mutex_lock(&inode->i_mutex);
 	reiserfs_write_lock(inode->i_sb);
 	err = reiserfs_commit_for_inode(inode);
 	reiserfs_write_unlock(inode->i_sb);
+	mutex_unlock(&inode->i_mutex);
 	if (err < 0)
 		return err;
 	return 0;
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index bbf31003d308..c7156dc39ce7 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -140,12 +140,18 @@ static void reiserfs_vfs_truncate_file(struct inode *inode)
  * be removed...
  */
 
-static int reiserfs_sync_file(struct file *filp, int datasync)
+static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end,
+			      int datasync)
 {
 	struct inode *inode = filp->f_mapping->host;
 	int err;
 	int barrier_done;
 
+	err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (err)
+		return err;
+
+	mutex_lock(&inode->i_mutex);
 	BUG_ON(!S_ISREG(inode->i_mode));
 	err = sync_mapping_buffers(inode->i_mapping);
 	reiserfs_write_lock(inode->i_sb);
@@ -153,6 +159,7 @@ static int reiserfs_sync_file(struct file *filp, int datasync)
 	reiserfs_write_unlock(inode->i_sb);
 	if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
 		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+	mutex_unlock(&inode->i_mutex);
 	if (barrier_done < 0)
 		return barrier_done;
 	return (err < 0) ? -EIO : 0;
diff --git a/fs/sync.c b/fs/sync.c
index c38ec163da6c..c98a7477edfd 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -165,28 +165,9 @@ SYSCALL_DEFINE1(syncfs, int, fd)
  */
 int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
 {
-	struct address_space *mapping = file->f_mapping;
-	int err, ret;
-
-	if (!file->f_op || !file->f_op->fsync) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	ret = filemap_write_and_wait_range(mapping, start, end);
-
-	/*
-	 * We need to protect against concurrent writers, which could cause
-	 * livelocks in fsync_buffers_list().
-	 */
-	mutex_lock(&mapping->host->i_mutex);
-	err = file->f_op->fsync(file, datasync);
-	if (!ret)
-		ret = err;
-	mutex_unlock(&mapping->host->i_mutex);
-
-out:
-	return ret;
+	if (!file->f_op || !file->f_op->fsync)
+		return -EINVAL;
+	return file->f_op->fsync(file, start, end, datasync);
 }
 EXPORT_SYMBOL(vfs_fsync_range);
 
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 5e7fccfc4b29..89ef9a2f7837 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1304,7 +1304,7 @@ static void *ubifs_follow_link(struct dentry *dentry, struct nameidata *nd)
 	return NULL;
 }
 
-int ubifs_fsync(struct file *file, int datasync)
+int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 {
 	struct inode *inode = file->f_mapping->host;
 	struct ubifs_info *c = inode->i_sb->s_fs_info;
@@ -1319,14 +1319,16 @@ int ubifs_fsync(struct file *file, int datasync)
 		 */
 		return 0;
 
-	/*
-	 * VFS has already synchronized dirty pages for this inode. Synchronize
-	 * the inode unless this is a 'datasync()' call.
-	 */
+	err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (err)
+		return err;
+	mutex_lock(&inode->i_mutex);
+
+	/* Synchronize the inode unless this is a 'datasync()' call. */
 	if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) {
 		err = inode->i_sb->s_op->write_inode(inode, NULL);
 		if (err)
-			return err;
+			goto out;
 	}
 
 	/*
@@ -1334,10 +1336,9 @@ int ubifs_fsync(struct file *file, int datasync)
 	 * them.
 	 */
 	err = ubifs_sync_wbufs_by_inode(c, inode);
-	if (err)
-		return err;
-
-	return 0;
+out:
+	mutex_unlock(&inode->i_mutex);
+	return err;
 }
 
 /**
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index f79983d6f860..4cd648501fa4 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1720,7 +1720,7 @@ const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c);
 int ubifs_calc_dark(const struct ubifs_info *c, int spc);
 
 /* file.c */
-int ubifs_fsync(struct file *file, int datasync);
+int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync);
 int ubifs_setattr(struct dentry *dentry, struct iattr *attr);
 
 /* dir.c */
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 7f782af286bf..fbbf657df0cd 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -127,6 +127,8 @@ xfs_iozero(
 STATIC int
 xfs_file_fsync(
 	struct file		*file,
+	loff_t			start,
+	loff_t			end,
 	int			datasync)
 {
 	struct inode		*inode = file->f_mapping->host;
@@ -138,6 +140,10 @@ xfs_file_fsync(
 
 	trace_xfs_file_fsync(ip);
 
+	error = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (error)
+		return error;
+
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -XFS_ERROR(EIO);
 
@@ -875,18 +881,11 @@ xfs_file_aio_write(
 	/* Handle various SYNC-type writes */
 	if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
 		loff_t end = pos + ret - 1;
-		int error, error2;
 
 		xfs_rw_iunlock(ip, iolock);
-		error = filemap_write_and_wait_range(mapping, pos, end);
+		ret = -xfs_file_fsync(file, pos, end,
+				      (file->f_flags & __O_SYNC) ? 0 : 1);
 		xfs_rw_ilock(ip, iolock);
-
-		error2 = -xfs_file_fsync(file,
-					 (file->f_flags & __O_SYNC) ? 0 : 1);
-		if (error)
-			ret = error;
-		else if (error2)
-			ret = error2;
 	}
 
 out_unlock:
diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
index 5e06acf95d0f..0c473fd79acb 100644
--- a/include/linux/ext3_fs.h
+++ b/include/linux/ext3_fs.h
@@ -877,7 +877,7 @@ extern int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
 extern void ext3_htree_free_dir_info(struct dir_private_info *p);
 
 /* fsync.c */
-extern int ext3_sync_file(struct file *, int);
+extern int ext3_sync_file(struct file *, loff_t, loff_t, int);
 
 /* hash.c */
 extern int ext3fs_dirhash(const char *name, int len, struct
diff --git a/include/linux/fb.h b/include/linux/fb.h
index 6a8274877171..1d6836c498dd 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -1043,7 +1043,8 @@ extern void fb_deferred_io_open(struct fb_info *info,
 				struct inode *inode,
 				struct file *file);
 extern void fb_deferred_io_cleanup(struct fb_info *info);
-extern int fb_deferred_io_fsync(struct file *file, int datasync);
+extern int fb_deferred_io_fsync(struct file *file, loff_t start,
+				loff_t end, int datasync);
 
 static inline bool fb_be_math(struct fb_info *info)
 {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4a61f98823a6..9cd2075c4a39 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1572,7 +1572,7 @@ struct file_operations {
 	int (*open) (struct inode *, struct file *);
 	int (*flush) (struct file *, fl_owner_t id);
 	int (*release) (struct inode *, struct file *);
-	int (*fsync) (struct file *, int datasync);
+	int (*fsync) (struct file *, loff_t, loff_t, int datasync);
 	int (*aio_fsync) (struct kiocb *, int datasync);
 	int (*fasync) (int, struct file *, int);
 	int (*lock) (struct file *, int, struct file_lock *);
@@ -2360,7 +2360,8 @@ extern int generic_segment_checks(const struct iovec *iov,
 /* fs/block_dev.c */
 extern ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
 				unsigned long nr_segs, loff_t pos);
-extern int blkdev_fsync(struct file *filp, int datasync);
+extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
+			int datasync);
 
 /* fs/splice.c */
 extern ssize_t generic_file_splice_read(struct file *, loff_t *,
@@ -2490,7 +2491,7 @@ extern int simple_link(struct dentry *, struct inode *, struct dentry *);
 extern int simple_unlink(struct inode *, struct dentry *);
 extern int simple_rmdir(struct inode *, struct dentry *);
 extern int simple_rename(struct inode *, struct dentry *, struct inode *, struct dentry *);
-extern int noop_fsync(struct file *, int);
+extern int noop_fsync(struct file *, loff_t, loff_t, int);
 extern int simple_empty(struct dentry *);
 extern int simple_readpage(struct file *file, struct page *page);
 extern int simple_write_begin(struct file *file, struct address_space *mapping,
@@ -2515,7 +2516,7 @@ extern ssize_t simple_read_from_buffer(void __user *to, size_t count,
 extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
 		const void __user *from, size_t count);
 
-extern int generic_file_fsync(struct file *, int);
+extern int generic_file_fsync(struct file *, loff_t, loff_t, int);
 
 extern int generic_check_addressable(unsigned, u64);
 
diff --git a/ipc/shm.c b/ipc/shm.c
index ab3385a21b27..27884adb1a90 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -277,13 +277,13 @@ static int shm_release(struct inode *ino, struct file *file)
 	return 0;
 }
 
-static int shm_fsync(struct file *file, int datasync)
+static int shm_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 {
 	struct shm_file_data *sfd = shm_file_data(file);
 
 	if (!sfd->file->f_op->fsync)
 		return -EINVAL;
-	return sfd->file->f_op->fsync(sfd->file, datasync);
+	return sfd->file->f_op->fsync(sfd->file, start, end, datasync);
 }
 
 static unsigned long shm_get_unmapped_area(struct file *file,
-- 
cgit v1.2.3


From 295cc522c2b2834b302424e41ebb5846df6d61bd Mon Sep 17 00:00:00 2001
From: Wanlong Gao <gaowanlong@cn.fujitsu.com>
Date: Tue, 19 Jul 2011 09:48:39 +0800
Subject: fs:update the NOTE of the file_operations structure

Big kernel lock had been removed and setlease now use the lock_flocks()
to hold a special spin lock file_lock_lock by Matthew.
So just remove the out-of-date NOTE.

Signed-off-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/fs.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9cd2075c4a39..e0569e4ab2d5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1552,11 +1552,6 @@ struct block_device_operations;
 #define HAVE_COMPAT_IOCTL 1
 #define HAVE_UNLOCKED_IOCTL 1
 
-/*
- * NOTE:
- * all file operations except setlease can be called without
- * the big kernel lock held in all filesystems.
- */
 struct file_operations {
 	struct module *owner;
 	loff_t (*llseek) (struct file *, loff_t, int);
-- 
cgit v1.2.3


From 4582c0a4866ea70c35aa9279e1f91834d3348a93 Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Sat, 16 Jul 2011 17:42:00 +0200
Subject: mutex: Make mutex_destroy() an inline function

The non-debug variant of mutex_destroy is a no-op, currently
implemented as a macro which does nothing. This approach fails
to check the type of the parameter, so an error would only show
when debugging gets enabled. Using an inline function instead,
offers type checking for earlier bug catching.

Signed-off-by: Jean Delvare <khali@linux-fr.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110716174200.41002352@endymion.delvare
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/mutex.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index a940fe435aca..7f87217e9d1f 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -92,7 +92,7 @@ do {							\
 							\
 	__mutex_init((mutex), #mutex, &__key);		\
 } while (0)
-# define mutex_destroy(mutex)				do { } while (0)
+static inline void mutex_destroy(struct mutex *lock) {}
 #endif
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
-- 
cgit v1.2.3


From a6a7b759ba62e62542308e091f7fc9cfac4f978e Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Thu, 21 Jul 2011 12:05:31 +0200
Subject: netfilter: ipset: make possible to hash some part of the data element
 only

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/ipset/ip_set_ahash.h | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/ipset/ip_set_ahash.h b/include/linux/netfilter/ipset/ip_set_ahash.h
index c5b06aaa205c..42b7d25a1b2e 100644
--- a/include/linux/netfilter/ipset/ip_set_ahash.h
+++ b/include/linux/netfilter/ipset/ip_set_ahash.h
@@ -211,12 +211,16 @@ ip_set_hash_destroy(struct ip_set *set)
 	set->data = NULL;
 }
 
-#define HKEY(data, initval, htable_bits)				 \
-(jhash2((u32 *)(data), sizeof(struct type_pf_elem)/sizeof(u32), initval) \
-	& jhash_mask(htable_bits))
-
 #endif /* _IP_SET_AHASH_H */
 
+#ifndef HKEY_DATALEN
+#define HKEY_DATALEN	sizeof(struct type_pf_elem)
+#endif
+
+#define HKEY(data, initval, htable_bits)			\
+(jhash2((u32 *)(data), HKEY_DATALEN/sizeof(u32), initval)	\
+	& jhash_mask(htable_bits))
+
 #define CONCAT(a, b, c)		a##b##c
 #define TOKEN(a, b, c)		CONCAT(a, b, c)
 
@@ -1054,6 +1058,8 @@ type_pf_gc_init(struct ip_set *set)
 		 IPSET_GC_PERIOD(h->timeout));
 }
 
+#undef HKEY_DATALEN
+#undef HKEY
 #undef type_pf_data_equal
 #undef type_pf_data_isnull
 #undef type_pf_data_copy
-- 
cgit v1.2.3


From 89dc79b787d20e4b6c4077dcee1c5b1be4ab55b8 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Thu, 21 Jul 2011 12:06:18 +0200
Subject: netfilter: ipset: hash:net,iface fixed to handle overlapping nets
 behind different interfaces

If overlapping networks with different interfaces was added to
the set, the type did not handle it properly. Example

    ipset create test hash:net,iface
    ipset add test 192.168.0.0/16,eth0
    ipset add test 192.168.0.0/24,eth1

Now, if a packet was sent from 192.168.0.0/24,eth0, the type returned
a match.

In the patch the algorithm is fixed in order to correctly handle
overlapping networks.

Limitation: the same network cannot be stored with more than 64 different
interfaces in a single set.

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/ipset/ip_set_ahash.h | 92 ++++++++++++++++++----------
 net/netfilter/ipset/ip_set_hash_ip.c         |  6 +-
 net/netfilter/ipset/ip_set_hash_ipport.c     |  6 +-
 net/netfilter/ipset/ip_set_hash_ipportip.c   |  6 +-
 net/netfilter/ipset/ip_set_hash_ipportnet.c  |  6 +-
 net/netfilter/ipset/ip_set_hash_net.c        |  6 +-
 net/netfilter/ipset/ip_set_hash_netiface.c   | 40 +++++++++---
 net/netfilter/ipset/ip_set_hash_netport.c    |  6 +-
 8 files changed, 117 insertions(+), 51 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/ipset/ip_set_ahash.h b/include/linux/netfilter/ipset/ip_set_ahash.h
index 42b7d25a1b2e..1e7f7594cd02 100644
--- a/include/linux/netfilter/ipset/ip_set_ahash.h
+++ b/include/linux/netfilter/ipset/ip_set_ahash.h
@@ -28,7 +28,32 @@
 /* Number of elements to store in an initial array block */
 #define AHASH_INIT_SIZE			4
 /* Max number of elements to store in an array block */
-#define AHASH_MAX_SIZE			(3*4)
+#define AHASH_MAX_SIZE			(3*AHASH_INIT_SIZE)
+
+/* Max number of elements can be tuned */
+#ifdef IP_SET_HASH_WITH_MULTI
+#define AHASH_MAX(h)			((h)->ahash_max)
+
+static inline u8
+tune_ahash_max(u8 curr, u32 multi)
+{
+	u32 n;
+
+	if (multi < curr)
+		return curr;
+
+	n = curr + AHASH_INIT_SIZE;
+	/* Currently, at listing one hash bucket must fit into a message.
+	 * Therefore we have a hard limit here.
+	 */
+	return n > curr && n <= 64 ? n : curr;
+}
+#define TUNE_AHASH_MAX(h, multi)	\
+	((h)->ahash_max = tune_ahash_max((h)->ahash_max, multi))
+#else
+#define AHASH_MAX(h)			AHASH_MAX_SIZE
+#define TUNE_AHASH_MAX(h, multi)
+#endif
 
 /* A hash bucket */
 struct hbucket {
@@ -60,6 +85,9 @@ struct ip_set_hash {
 	u32 timeout;		/* timeout value, if enabled */
 	struct timer_list gc;	/* garbage collection when timeout enabled */
 	struct type_pf_next next; /* temporary storage for uadd */
+#ifdef IP_SET_HASH_WITH_MULTI
+	u8 ahash_max;		/* max elements in an array block */
+#endif
 #ifdef IP_SET_HASH_WITH_NETMASK
 	u8 netmask;		/* netmask value for subnets to store */
 #endif
@@ -279,12 +307,13 @@ ip_set_hash_destroy(struct ip_set *set)
 /* Add an element to the hash table when resizing the set:
  * we spare the maintenance of the internal counters. */
 static int
-type_pf_elem_add(struct hbucket *n, const struct type_pf_elem *value)
+type_pf_elem_add(struct hbucket *n, const struct type_pf_elem *value,
+		 u8 ahash_max)
 {
 	if (n->pos >= n->size) {
 		void *tmp;
 
-		if (n->size >= AHASH_MAX_SIZE)
+		if (n->size >= ahash_max)
 			/* Trigger rehashing */
 			return -EAGAIN;
 
@@ -339,7 +368,7 @@ retry:
 		for (j = 0; j < n->pos; j++) {
 			data = ahash_data(n, j);
 			m = hbucket(t, HKEY(data, h->initval, htable_bits));
-			ret = type_pf_elem_add(m, data);
+			ret = type_pf_elem_add(m, data, AHASH_MAX(h));
 			if (ret < 0) {
 				read_unlock_bh(&set->lock);
 				ahash_destroy(t);
@@ -376,7 +405,7 @@ type_pf_add(struct ip_set *set, void *value, u32 timeout, u32 flags)
 	const struct type_pf_elem *d = value;
 	struct hbucket *n;
 	int i, ret = 0;
-	u32 key;
+	u32 key, multi = 0;
 
 	if (h->elements >= h->maxelem)
 		return -IPSET_ERR_HASH_FULL;
@@ -386,12 +415,12 @@ type_pf_add(struct ip_set *set, void *value, u32 timeout, u32 flags)
 	key = HKEY(value, h->initval, t->htable_bits);
 	n = hbucket(t, key);
 	for (i = 0; i < n->pos; i++)
-		if (type_pf_data_equal(ahash_data(n, i), d)) {
+		if (type_pf_data_equal(ahash_data(n, i), d, &multi)) {
 			ret = -IPSET_ERR_EXIST;
 			goto out;
 		}
-
-	ret = type_pf_elem_add(n, value);
+	TUNE_AHASH_MAX(h, multi);
+	ret = type_pf_elem_add(n, value, AHASH_MAX(h));
 	if (ret != 0) {
 		if (ret == -EAGAIN)
 			type_pf_data_next(h, d);
@@ -419,13 +448,13 @@ type_pf_del(struct ip_set *set, void *value, u32 timeout, u32 flags)
 	struct hbucket *n;
 	int i;
 	struct type_pf_elem *data;
-	u32 key;
+	u32 key, multi = 0;
 
 	key = HKEY(value, h->initval, t->htable_bits);
 	n = hbucket(t, key);
 	for (i = 0; i < n->pos; i++) {
 		data = ahash_data(n, i);
-		if (!type_pf_data_equal(data, d))
+		if (!type_pf_data_equal(data, d, &multi))
 			continue;
 		if (i != n->pos - 1)
 			/* Not last one */
@@ -466,17 +495,17 @@ type_pf_test_cidrs(struct ip_set *set, struct type_pf_elem *d, u32 timeout)
 	struct hbucket *n;
 	const struct type_pf_elem *data;
 	int i, j = 0;
-	u32 key;
+	u32 key, multi = 0;
 	u8 host_mask = SET_HOST_MASK(set->family);
 
 	pr_debug("test by nets\n");
-	for (; j < host_mask && h->nets[j].cidr; j++) {
+	for (; j < host_mask && h->nets[j].cidr && !multi; j++) {
 		type_pf_data_netmask(d, h->nets[j].cidr);
 		key = HKEY(d, h->initval, t->htable_bits);
 		n = hbucket(t, key);
 		for (i = 0; i < n->pos; i++) {
 			data = ahash_data(n, i);
-			if (type_pf_data_equal(data, d))
+			if (type_pf_data_equal(data, d, &multi))
 				return 1;
 		}
 	}
@@ -494,7 +523,7 @@ type_pf_test(struct ip_set *set, void *value, u32 timeout, u32 flags)
 	struct hbucket *n;
 	const struct type_pf_elem *data;
 	int i;
-	u32 key;
+	u32 key, multi = 0;
 
 #ifdef IP_SET_HASH_WITH_NETS
 	/* If we test an IP address and not a network address,
@@ -507,7 +536,7 @@ type_pf_test(struct ip_set *set, void *value, u32 timeout, u32 flags)
 	n = hbucket(t, key);
 	for (i = 0; i < n->pos; i++) {
 		data = ahash_data(n, i);
-		if (type_pf_data_equal(data, d))
+		if (type_pf_data_equal(data, d, &multi))
 			return 1;
 	}
 	return 0;
@@ -664,14 +693,14 @@ type_pf_data_timeout_set(struct type_pf_elem *data, u32 timeout)
 
 static int
 type_pf_elem_tadd(struct hbucket *n, const struct type_pf_elem *value,
-		  u32 timeout)
+		  u8 ahash_max, u32 timeout)
 {
 	struct type_pf_elem *data;
 
 	if (n->pos >= n->size) {
 		void *tmp;
 
-		if (n->size >= AHASH_MAX_SIZE)
+		if (n->size >= ahash_max)
 			/* Trigger rehashing */
 			return -EAGAIN;
 
@@ -776,7 +805,7 @@ retry:
 		for (j = 0; j < n->pos; j++) {
 			data = ahash_tdata(n, j);
 			m = hbucket(t, HKEY(data, h->initval, htable_bits));
-			ret = type_pf_elem_tadd(m, data,
+			ret = type_pf_elem_tadd(m, data, AHASH_MAX(h),
 						type_pf_data_timeout(data));
 			if (ret < 0) {
 				read_unlock_bh(&set->lock);
@@ -807,9 +836,9 @@ type_pf_tadd(struct ip_set *set, void *value, u32 timeout, u32 flags)
 	const struct type_pf_elem *d = value;
 	struct hbucket *n;
 	struct type_pf_elem *data;
-	int ret = 0, i, j = AHASH_MAX_SIZE + 1;
+	int ret = 0, i, j = AHASH_MAX(h) + 1;
 	bool flag_exist = flags & IPSET_FLAG_EXIST;
-	u32 key;
+	u32 key, multi = 0;
 
 	if (h->elements >= h->maxelem)
 		/* FIXME: when set is full, we slow down here */
@@ -823,18 +852,18 @@ type_pf_tadd(struct ip_set *set, void *value, u32 timeout, u32 flags)
 	n = hbucket(t, key);
 	for (i = 0; i < n->pos; i++) {
 		data = ahash_tdata(n, i);
-		if (type_pf_data_equal(data, d)) {
+		if (type_pf_data_equal(data, d, &multi)) {
 			if (type_pf_data_expired(data) || flag_exist)
 				j = i;
 			else {
 				ret = -IPSET_ERR_EXIST;
 				goto out;
 			}
-		} else if (j == AHASH_MAX_SIZE + 1 &&
+		} else if (j == AHASH_MAX(h) + 1 &&
 			   type_pf_data_expired(data))
 			j = i;
 	}
-	if (j != AHASH_MAX_SIZE + 1) {
+	if (j != AHASH_MAX(h) + 1) {
 		data = ahash_tdata(n, j);
 #ifdef IP_SET_HASH_WITH_NETS
 		del_cidr(h, data->cidr, HOST_MASK);
@@ -844,7 +873,8 @@ type_pf_tadd(struct ip_set *set, void *value, u32 timeout, u32 flags)
 		type_pf_data_timeout_set(data, timeout);
 		goto out;
 	}
-	ret = type_pf_elem_tadd(n, d, timeout);
+	TUNE_AHASH_MAX(h, multi);
+	ret = type_pf_elem_tadd(n, d, AHASH_MAX(h), timeout);
 	if (ret != 0) {
 		if (ret == -EAGAIN)
 			type_pf_data_next(h, d);
@@ -869,13 +899,13 @@ type_pf_tdel(struct ip_set *set, void *value, u32 timeout, u32 flags)
 	struct hbucket *n;
 	int i;
 	struct type_pf_elem *data;
-	u32 key;
+	u32 key, multi = 0;
 
 	key = HKEY(value, h->initval, t->htable_bits);
 	n = hbucket(t, key);
 	for (i = 0; i < n->pos; i++) {
 		data = ahash_tdata(n, i);
-		if (!type_pf_data_equal(data, d))
+		if (!type_pf_data_equal(data, d, &multi))
 			continue;
 		if (type_pf_data_expired(data))
 			return -IPSET_ERR_EXIST;
@@ -915,16 +945,16 @@ type_pf_ttest_cidrs(struct ip_set *set, struct type_pf_elem *d, u32 timeout)
 	struct type_pf_elem *data;
 	struct hbucket *n;
 	int i, j = 0;
-	u32 key;
+	u32 key, multi = 0;
 	u8 host_mask = SET_HOST_MASK(set->family);
 
-	for (; j < host_mask && h->nets[j].cidr; j++) {
+	for (; j < host_mask && h->nets[j].cidr && !multi; j++) {
 		type_pf_data_netmask(d, h->nets[j].cidr);
 		key = HKEY(d, h->initval, t->htable_bits);
 		n = hbucket(t, key);
 		for (i = 0; i < n->pos; i++) {
 			data = ahash_tdata(n, i);
-			if (type_pf_data_equal(data, d))
+			if (type_pf_data_equal(data, d, &multi))
 				return !type_pf_data_expired(data);
 		}
 	}
@@ -940,7 +970,7 @@ type_pf_ttest(struct ip_set *set, void *value, u32 timeout, u32 flags)
 	struct type_pf_elem *data, *d = value;
 	struct hbucket *n;
 	int i;
-	u32 key;
+	u32 key, multi = 0;
 
 #ifdef IP_SET_HASH_WITH_NETS
 	if (d->cidr == SET_HOST_MASK(set->family))
@@ -950,7 +980,7 @@ type_pf_ttest(struct ip_set *set, void *value, u32 timeout, u32 flags)
 	n = hbucket(t, key);
 	for (i = 0; i < n->pos; i++) {
 		data = ahash_tdata(n, i);
-		if (type_pf_data_equal(data, d))
+		if (type_pf_data_equal(data, d, &multi))
 			return !type_pf_data_expired(data);
 	}
 	return 0;
diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c
index fa80bb9b9c81..f2d576e6b769 100644
--- a/net/netfilter/ipset/ip_set_hash_ip.c
+++ b/net/netfilter/ipset/ip_set_hash_ip.c
@@ -53,7 +53,8 @@ struct hash_ip4_telem {
 
 static inline bool
 hash_ip4_data_equal(const struct hash_ip4_elem *ip1,
-		    const struct hash_ip4_elem *ip2)
+		    const struct hash_ip4_elem *ip2,
+		    u32 *multi)
 {
 	return ip1->ip == ip2->ip;
 }
@@ -225,7 +226,8 @@ struct hash_ip6_telem {
 
 static inline bool
 hash_ip6_data_equal(const struct hash_ip6_elem *ip1,
-		    const struct hash_ip6_elem *ip2)
+		    const struct hash_ip6_elem *ip2,
+		    u32 *multi)
 {
 	return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0;
 }
diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c
index bbf51b67b170..6ee10f5d59bd 100644
--- a/net/netfilter/ipset/ip_set_hash_ipport.c
+++ b/net/netfilter/ipset/ip_set_hash_ipport.c
@@ -60,7 +60,8 @@ struct hash_ipport4_telem {
 
 static inline bool
 hash_ipport4_data_equal(const struct hash_ipport4_elem *ip1,
-			const struct hash_ipport4_elem *ip2)
+			const struct hash_ipport4_elem *ip2,
+			u32 *multi)
 {
 	return ip1->ip == ip2->ip &&
 	       ip1->port == ip2->port &&
@@ -276,7 +277,8 @@ struct hash_ipport6_telem {
 
 static inline bool
 hash_ipport6_data_equal(const struct hash_ipport6_elem *ip1,
-			const struct hash_ipport6_elem *ip2)
+			const struct hash_ipport6_elem *ip2,
+			u32 *multi)
 {
 	return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0 &&
 	       ip1->port == ip2->port &&
diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c
index 96525f529a54..fb90e344e907 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportip.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportip.c
@@ -62,7 +62,8 @@ struct hash_ipportip4_telem {
 
 static inline bool
 hash_ipportip4_data_equal(const struct hash_ipportip4_elem *ip1,
-			  const struct hash_ipportip4_elem *ip2)
+			  const struct hash_ipportip4_elem *ip2,
+			  u32 *multi)
 {
 	return ip1->ip == ip2->ip &&
 	       ip1->ip2 == ip2->ip2 &&
@@ -286,7 +287,8 @@ struct hash_ipportip6_telem {
 
 static inline bool
 hash_ipportip6_data_equal(const struct hash_ipportip6_elem *ip1,
-			  const struct hash_ipportip6_elem *ip2)
+			  const struct hash_ipportip6_elem *ip2,
+			  u32 *multi)
 {
 	return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0 &&
 	       ipv6_addr_cmp(&ip1->ip2.in6, &ip2->ip2.in6) == 0 &&
diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c
index d2d6ab89f087..deb3e3dfa5fc 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c
@@ -62,7 +62,8 @@ struct hash_ipportnet4_telem {
 
 static inline bool
 hash_ipportnet4_data_equal(const struct hash_ipportnet4_elem *ip1,
-			   const struct hash_ipportnet4_elem *ip2)
+			   const struct hash_ipportnet4_elem *ip2,
+			   u32 *multi)
 {
 	return ip1->ip == ip2->ip &&
 	       ip1->ip2 == ip2->ip2 &&
@@ -335,7 +336,8 @@ struct hash_ipportnet6_telem {
 
 static inline bool
 hash_ipportnet6_data_equal(const struct hash_ipportnet6_elem *ip1,
-			   const struct hash_ipportnet6_elem *ip2)
+			   const struct hash_ipportnet6_elem *ip2,
+			   u32 *multi)
 {
 	return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0 &&
 	       ipv6_addr_cmp(&ip1->ip2.in6, &ip2->ip2.in6) == 0 &&
diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c
index 2d4b1f48e8c9..60d016541c58 100644
--- a/net/netfilter/ipset/ip_set_hash_net.c
+++ b/net/netfilter/ipset/ip_set_hash_net.c
@@ -58,7 +58,8 @@ struct hash_net4_telem {
 
 static inline bool
 hash_net4_data_equal(const struct hash_net4_elem *ip1,
-		    const struct hash_net4_elem *ip2)
+		     const struct hash_net4_elem *ip2,
+		     u32 *multi)
 {
 	return ip1->ip == ip2->ip && ip1->cidr == ip2->cidr;
 }
@@ -249,7 +250,8 @@ struct hash_net6_telem {
 
 static inline bool
 hash_net6_data_equal(const struct hash_net6_elem *ip1,
-		     const struct hash_net6_elem *ip2)
+		     const struct hash_net6_elem *ip2,
+		     u32 *multi)
 {
 	return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0 &&
 	       ip1->cidr == ip2->cidr;
diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c
index 3d6c53b6211a..e13095deb50d 100644
--- a/net/netfilter/ipset/ip_set_hash_netiface.c
+++ b/net/netfilter/ipset/ip_set_hash_netiface.c
@@ -99,7 +99,7 @@ iface_test(struct rb_root *root, const char **iface)
 
 	while (n) {
 		const char *d = iface_data(n);
-		int res = ifname_compare(*iface, d);
+		long res = ifname_compare(*iface, d);
 
 		if (res < 0)
 			n = n->rb_left;
@@ -121,7 +121,7 @@ iface_add(struct rb_root *root, const char **iface)
 
 	while (*n) {
 		char *ifname = iface_data(*n);
-		int res = ifname_compare(*iface, ifname);
+		long res = ifname_compare(*iface, ifname);
 
 		p = *n;
 		if (res < 0)
@@ -159,31 +159,42 @@ hash_netiface_same_set(const struct ip_set *a, const struct ip_set *b);
 
 /* The type variant functions: IPv4 */
 
+struct hash_netiface4_elem_hashed {
+	__be32 ip;
+	u8 physdev;
+	u8 cidr;
+	u16 padding;
+};
+
+#define HKEY_DATALEN	sizeof(struct hash_netiface4_elem_hashed)
+
 /* Member elements without timeout */
 struct hash_netiface4_elem {
 	__be32 ip;
-	const char *iface;
 	u8 physdev;
 	u8 cidr;
 	u16 padding;
+	const char *iface;
 };
 
 /* Member elements with timeout support */
 struct hash_netiface4_telem {
 	__be32 ip;
-	const char *iface;
 	u8 physdev;
 	u8 cidr;
 	u16 padding;
+	const char *iface;
 	unsigned long timeout;
 };
 
 static inline bool
 hash_netiface4_data_equal(const struct hash_netiface4_elem *ip1,
-			  const struct hash_netiface4_elem *ip2)
+			  const struct hash_netiface4_elem *ip2,
+			  u32 *multi)
 {
 	return ip1->ip == ip2->ip &&
 	       ip1->cidr == ip2->cidr &&
+	       (++*multi) &&
 	       ip1->physdev == ip2->physdev &&
 	       ip1->iface == ip2->iface;
 }
@@ -257,6 +268,7 @@ nla_put_failure:
 
 #define IP_SET_HASH_WITH_NETS
 #define IP_SET_HASH_WITH_RBTREE
+#define IP_SET_HASH_WITH_MULTI
 
 #define PF		4
 #define HOST_MASK	32
@@ -424,29 +436,40 @@ hash_netiface_same_set(const struct ip_set *a, const struct ip_set *b)
 
 /* The type variant functions: IPv6 */
 
+struct hash_netiface6_elem_hashed {
+	union nf_inet_addr ip;
+	u8 physdev;
+	u8 cidr;
+	u16 padding;
+};
+
+#define HKEY_DATALEN	sizeof(struct hash_netiface6_elem_hashed)
+
 struct hash_netiface6_elem {
 	union nf_inet_addr ip;
-	const char *iface;
 	u8 physdev;
 	u8 cidr;
 	u16 padding;
+	const char *iface;
 };
 
 struct hash_netiface6_telem {
 	union nf_inet_addr ip;
-	const char *iface;
 	u8 physdev;
 	u8 cidr;
 	u16 padding;
+	const char *iface;
 	unsigned long timeout;
 };
 
 static inline bool
 hash_netiface6_data_equal(const struct hash_netiface6_elem *ip1,
-			  const struct hash_netiface6_elem *ip2)
+			  const struct hash_netiface6_elem *ip2,
+			  u32 *multi)
 {
 	return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0 &&
 	       ip1->cidr == ip2->cidr &&
+	       (++*multi) &&
 	       ip1->physdev == ip2->physdev &&
 	       ip1->iface == ip2->iface;
 }
@@ -681,6 +704,7 @@ hash_netiface_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
 	h->maxelem = maxelem;
 	get_random_bytes(&h->initval, sizeof(h->initval));
 	h->timeout = IPSET_NO_TIMEOUT;
+	h->ahash_max = AHASH_MAX_SIZE;
 
 	hbits = htable_bits(hashsize);
 	h->table = ip_set_alloc(
diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c
index fe203d12f56b..8f9de7207ec9 100644
--- a/net/netfilter/ipset/ip_set_hash_netport.c
+++ b/net/netfilter/ipset/ip_set_hash_netport.c
@@ -59,7 +59,8 @@ struct hash_netport4_telem {
 
 static inline bool
 hash_netport4_data_equal(const struct hash_netport4_elem *ip1,
-			 const struct hash_netport4_elem *ip2)
+			 const struct hash_netport4_elem *ip2,
+			 u32 *multi)
 {
 	return ip1->ip == ip2->ip &&
 	       ip1->port == ip2->port &&
@@ -300,7 +301,8 @@ struct hash_netport6_telem {
 
 static inline bool
 hash_netport6_data_equal(const struct hash_netport6_elem *ip1,
-			 const struct hash_netport6_elem *ip2)
+			 const struct hash_netport6_elem *ip2,
+			 u32 *multi)
 {
 	return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0 &&
 	       ip1->port == ip2->port &&
-- 
cgit v1.2.3


From 0f598f0b4c3b2259366cfa8adc01bd8e714c82d0 Mon Sep 17 00:00:00 2001
From: Chris Friesen <chris.friesen@genband.com>
Date: Thu, 21 Jul 2011 12:07:10 +0200
Subject: netfilter: ipset: fix compiler warnings "'hash_ip4_data_next'
 declared inline after being called"

Some gcc versions warn about prototypes without "inline" when the declaration
includes the "inline" keyword. The fix generates a false error message
"marked inline, but without a definition" with sparse below 0.4.2.

Signed-off-by: Chris Friesen <chris.friesen@genband.com>
Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/ipset/ip_set_ahash.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/ipset/ip_set_ahash.h b/include/linux/netfilter/ipset/ip_set_ahash.h
index 1e7f7594cd02..b89fb79cb44f 100644
--- a/include/linux/netfilter/ipset/ip_set_ahash.h
+++ b/include/linux/netfilter/ipset/ip_set_ahash.h
@@ -392,7 +392,7 @@ retry:
 	return 0;
 }
 
-static void
+static inline void
 type_pf_data_next(struct ip_set_hash *h, const struct type_pf_elem *d);
 
 /* Add an element to a hash and update the internal counters when succeeded,
-- 
cgit v1.2.3


From ae7bd11b471931752e5609094ca0a49386590524 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Thu, 21 Jul 2011 13:34:05 -0700
Subject: clocksource: Change __ARCH_HAS_CLOCKSOURCE_DATA to a CONFIG option

The machinery for __ARCH_HAS_CLOCKSOURCE_DATA assumed a file in
asm-generic would be the default for architectures without their own
file in asm/, but that is not how it works.

Replace it with a Kconfig option instead.

Link: http://lkml.kernel.org/r/4E288AA6.7090804@zytor.com
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Andy Lutomirski <luto@mit.edu>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/Kconfig                   | 3 +++
 arch/ia64/include/asm/clocksource.h | 2 --
 arch/x86/Kconfig                    | 4 ++++
 arch/x86/include/asm/clocksource.h  | 2 --
 include/asm-generic/clocksource.h   | 4 ----
 include/linux/clocksource.h         | 4 +++-
 6 files changed, 10 insertions(+), 9 deletions(-)
 delete mode 100644 include/asm-generic/clocksource.h

(limited to 'include/linux')

diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 38280ef4a2af..0a9820a77825 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -101,6 +101,9 @@ config GENERIC_IOMAP
 	bool
 	default y
 
+config ARCH_CLOCKSOURCE_DATA
+	def_bool y
+
 config SCHED_OMIT_FRAME_POINTER
 	bool
 	default y
diff --git a/arch/ia64/include/asm/clocksource.h b/arch/ia64/include/asm/clocksource.h
index 00eb549a59b0..5c8596e4cb02 100644
--- a/arch/ia64/include/asm/clocksource.h
+++ b/arch/ia64/include/asm/clocksource.h
@@ -3,8 +3,6 @@
 #ifndef _ASM_IA64_CLOCKSOURCE_H
 #define _ASM_IA64_CLOCKSOURCE_H
 
-#define __ARCH_HAS_CLOCKSOURCE_DATA
-
 struct arch_clocksource_data {
 	void *fsys_mmio;        /* used by fsyscall asm code */
 };
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index da349723d411..c1e41bccdcb8 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -93,6 +93,10 @@ config CLOCKSOURCE_WATCHDOG
 config GENERIC_CLOCKEVENTS
 	def_bool y
 
+config ARCH_CLOCKSOURCE_DATA
+	def_bool y
+	depends on X86_64
+
 config GENERIC_CLOCKEVENTS_BROADCAST
 	def_bool y
 	depends on X86_64 || (X86_32 && X86_LOCAL_APIC)
diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h
index 3882c65dc19b..0bdbbb3b9ce7 100644
--- a/arch/x86/include/asm/clocksource.h
+++ b/arch/x86/include/asm/clocksource.h
@@ -5,8 +5,6 @@
 
 #ifdef CONFIG_X86_64
 
-#define __ARCH_HAS_CLOCKSOURCE_DATA
-
 #define VCLOCK_NONE 0  /* No vDSO clock available.	*/
 #define VCLOCK_TSC  1  /* vDSO should use vread_tsc.	*/
 #define VCLOCK_HPET 2  /* vDSO should use vread_hpet.	*/
diff --git a/include/asm-generic/clocksource.h b/include/asm-generic/clocksource.h
deleted file mode 100644
index 0a462d3fb05e..000000000000
--- a/include/asm-generic/clocksource.h
+++ /dev/null
@@ -1,4 +0,0 @@
-/*
- * Architectures should override this file to add private userspace
- * clock magic if needed.
- */
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index 6bb69702c4fa..59ee970cf89e 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -22,7 +22,9 @@
 typedef u64 cycle_t;
 struct clocksource;
 
+#ifdef CONFIG_ARCH_CLOCKSOURCE_DATA
 #include <asm/clocksource.h>
+#endif
 
 /**
  * struct cyclecounter - hardware abstraction for a free running counter
@@ -171,7 +173,7 @@ struct clocksource {
 	u32 shift;
 	u64 max_idle_ns;
 
-#ifdef __ARCH_HAS_CLOCKSOURCE_DATA
+#ifdef CONFIG_ARCH_CLOCKSOURCE_DATA
 	struct arch_clocksource_data archdata;
 #endif
 
-- 
cgit v1.2.3


From f605234066852b11096ab071124e3fe415e94420 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jpirko@redhat.com>
Date: Wed, 20 Jul 2011 04:54:04 +0000
Subject: vlan: finish removing vlan_find_dev from public header

else case remained forgotten.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_vlan.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index bc03e40fd7fd..cfb0cf2230a9 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -135,12 +135,6 @@ vlan_gro_frags(struct napi_struct *napi, struct vlan_group *grp,
 	       unsigned int vlan_tci);
 
 #else
-static inline struct net_device *vlan_find_dev(struct net_device *real_dev,
-					       u16 vlan_id)
-{
-	return NULL;
-}
-
 static inline struct net_device *vlan_dev_real_dev(const struct net_device *dev)
 {
 	BUG();
-- 
cgit v1.2.3


From cec9c133631039f82e4a5ff3bf47b3eba14ff1aa Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jpirko@redhat.com>
Date: Wed, 20 Jul 2011 04:54:05 +0000
Subject: vlan: introduce __vlan_find_dev_deep()

Since vlan_group_get_device and vlan_group is not going to be accessible
from device drivers, introduce function which substitutes it.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_vlan.h |  8 ++++++++
 net/8021q/vlan_core.c   | 21 +++++++++++++++++++++
 2 files changed, 29 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index cfb0cf2230a9..69391cc20f3d 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -120,6 +120,8 @@ static inline int is_vlan_dev(struct net_device *dev)
 
 #if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)
 
+extern struct net_device *__vlan_find_dev_deep(struct net_device *real_dev,
+					       u16 vlan_id);
 extern struct net_device *vlan_dev_real_dev(const struct net_device *dev);
 extern u16 vlan_dev_vlan_id(const struct net_device *dev);
 
@@ -135,6 +137,12 @@ vlan_gro_frags(struct napi_struct *napi, struct vlan_group *grp,
 	       unsigned int vlan_tci);
 
 #else
+static inline struct net_device *
+__vlan_find_dev_deep(struct net_device *real_dev, u16 vlan_id)
+{
+	return NULL;
+}
+
 static inline struct net_device *vlan_dev_real_dev(const struct net_device *dev)
 {
 	BUG();
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index fcc684678af6..5940366fac48 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -63,6 +63,27 @@ bool vlan_do_receive(struct sk_buff **skbp)
 	return true;
 }
 
+/* Must be invoked with rcu_read_lock or with RTNL. */
+struct net_device *__vlan_find_dev_deep(struct net_device *real_dev,
+					u16 vlan_id)
+{
+	struct vlan_group *grp = rcu_dereference_rtnl(real_dev->vlgrp);
+
+	if (grp) {
+		return vlan_group_get_device(grp, vlan_id);
+	} else {
+		/*
+		 * Bonding slaves do not have grp assigned to themselves.
+		 * Grp is assigned to bonding master instead.
+		 */
+		if (netif_is_bond_slave(real_dev))
+			return __vlan_find_dev_deep(real_dev->master, vlan_id);
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL(__vlan_find_dev_deep);
+
 struct net_device *vlan_dev_real_dev(const struct net_device *dev)
 {
 	return vlan_dev_info(dev)->real_dev;
-- 
cgit v1.2.3


From 7756a96e1925addedc87db21ca6646e03b3c674f Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jpirko@redhat.com>
Date: Wed, 20 Jul 2011 04:54:08 +0000
Subject: lro: kill lro_vlan_hwaccel_receive_skb

no longer used

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/inet_lro.h | 10 ----------
 net/ipv4/inet_lro.c      | 15 ---------------
 2 files changed, 25 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/inet_lro.h b/include/linux/inet_lro.h
index c4335faebb63..940bbde05a52 100644
--- a/include/linux/inet_lro.h
+++ b/include/linux/inet_lro.h
@@ -136,16 +136,6 @@ void lro_receive_skb(struct net_lro_mgr *lro_mgr,
 		     struct sk_buff *skb,
 		     void *priv);
 
-/*
- * Processes a SKB with VLAN HW acceleration support
- */
-
-void lro_vlan_hwaccel_receive_skb(struct net_lro_mgr *lro_mgr,
-				  struct sk_buff *skb,
-				  struct vlan_group *vgrp,
-				  u16 vlan_tag,
-				  void *priv);
-
 /*
  * Processes a fragment list
  *
diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c
index 85a0f75dae64..e54425d47467 100644
--- a/net/ipv4/inet_lro.c
+++ b/net/ipv4/inet_lro.c
@@ -523,21 +523,6 @@ void lro_receive_skb(struct net_lro_mgr *lro_mgr,
 }
 EXPORT_SYMBOL(lro_receive_skb);
 
-void lro_vlan_hwaccel_receive_skb(struct net_lro_mgr *lro_mgr,
-				  struct sk_buff *skb,
-				  struct vlan_group *vgrp,
-				  u16 vlan_tag,
-				  void *priv)
-{
-	if (__lro_proc_skb(lro_mgr, skb, vgrp, vlan_tag, priv)) {
-		if (lro_mgr->features & LRO_F_NAPI)
-			vlan_hwaccel_receive_skb(skb, vgrp, vlan_tag);
-		else
-			vlan_hwaccel_rx(skb, vgrp, vlan_tag);
-	}
-}
-EXPORT_SYMBOL(lro_vlan_hwaccel_receive_skb);
-
 void lro_receive_frags(struct net_lro_mgr *lro_mgr,
 		       struct skb_frag_struct *frags,
 		       int len, int true_size, void *priv, __wsum sum)
-- 
cgit v1.2.3


From 0f7257281dcbc9bcb46b882c23fa57b6738fce5e Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jpirko@redhat.com>
Date: Wed, 20 Jul 2011 04:54:09 +0000
Subject: lro: kill lro_vlan_hwaccel_receive_frags

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/inet_lro.h |  7 -------
 net/ipv4/inet_lro.c      | 20 --------------------
 2 files changed, 27 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/inet_lro.h b/include/linux/inet_lro.h
index 940bbde05a52..b27e773e7144 100644
--- a/include/linux/inet_lro.h
+++ b/include/linux/inet_lro.h
@@ -155,13 +155,6 @@ void lro_receive_frags(struct net_lro_mgr *lro_mgr,
 		       struct skb_frag_struct *frags,
 		       int len, int true_size, void *priv, __wsum sum);
 
-void lro_vlan_hwaccel_receive_frags(struct net_lro_mgr *lro_mgr,
-				    struct skb_frag_struct *frags,
-				    int len, int true_size,
-				    struct vlan_group *vgrp,
-				    u16 vlan_tag,
-				    void *priv, __wsum sum);
-
 /*
  * Forward all aggregated SKBs held by lro_mgr to network stack
  */
diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c
index e54425d47467..5cf07593282b 100644
--- a/net/ipv4/inet_lro.c
+++ b/net/ipv4/inet_lro.c
@@ -541,26 +541,6 @@ void lro_receive_frags(struct net_lro_mgr *lro_mgr,
 }
 EXPORT_SYMBOL(lro_receive_frags);
 
-void lro_vlan_hwaccel_receive_frags(struct net_lro_mgr *lro_mgr,
-				    struct skb_frag_struct *frags,
-				    int len, int true_size,
-				    struct vlan_group *vgrp,
-				    u16 vlan_tag, void *priv, __wsum sum)
-{
-	struct sk_buff *skb;
-
-	skb = __lro_proc_segment(lro_mgr, frags, len, true_size, vgrp,
-				 vlan_tag, priv, sum);
-	if (!skb)
-		return;
-
-	if (lro_mgr->features & LRO_F_NAPI)
-		vlan_hwaccel_receive_skb(skb, vgrp, vlan_tag);
-	else
-		vlan_hwaccel_rx(skb, vgrp, vlan_tag);
-}
-EXPORT_SYMBOL(lro_vlan_hwaccel_receive_frags);
-
 void lro_flush_all(struct net_lro_mgr *lro_mgr)
 {
 	int i;
-- 
cgit v1.2.3


From 9fea03302ada3291fb3066a3c276e3f43556548b Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jpirko@redhat.com>
Date: Wed, 20 Jul 2011 04:54:10 +0000
Subject: lro: do vlan cleanup

- remove useless vlan parameters and pointers

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/inet_lro.h |  2 --
 net/ipv4/inet_lro.c      | 39 +++++++++++----------------------------
 2 files changed, 11 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/inet_lro.h b/include/linux/inet_lro.h
index b27e773e7144..2cf55afbcd4e 100644
--- a/include/linux/inet_lro.h
+++ b/include/linux/inet_lro.h
@@ -50,7 +50,6 @@ struct net_lro_desc {
 	struct skb_frag_struct *next_frag;
 	struct iphdr *iph;
 	struct tcphdr *tcph;
-	struct vlan_group *vgrp;
 	__wsum  data_csum;
 	__be32 tcp_rcv_tsecr;
 	__be32 tcp_rcv_tsval;
@@ -60,7 +59,6 @@ struct net_lro_desc {
 	u16 ip_tot_len;
 	u16 tcp_saw_tstamp; 		/* timestamps enabled */
 	__be16 tcp_window;
-	u16 vlan_tag;
 	int pkt_aggr_cnt;		/* counts aggregated packets */
 	int vlan_packet;
 	int mss;
diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c
index 5cf07593282b..ef7ae6049a51 100644
--- a/net/ipv4/inet_lro.c
+++ b/net/ipv4/inet_lro.c
@@ -146,8 +146,7 @@ static __wsum lro_tcp_data_csum(struct iphdr *iph, struct tcphdr *tcph, int len)
 }
 
 static void lro_init_desc(struct net_lro_desc *lro_desc, struct sk_buff *skb,
-			  struct iphdr *iph, struct tcphdr *tcph,
-			  u16 vlan_tag, struct vlan_group *vgrp)
+			  struct iphdr *iph, struct tcphdr *tcph)
 {
 	int nr_frags;
 	__be32 *ptr;
@@ -173,8 +172,6 @@ static void lro_init_desc(struct net_lro_desc *lro_desc, struct sk_buff *skb,
 	}
 
 	lro_desc->mss = tcp_data_len;
-	lro_desc->vgrp = vgrp;
-	lro_desc->vlan_tag = vlan_tag;
 	lro_desc->active = 1;
 
 	lro_desc->data_csum = lro_tcp_data_csum(iph, tcph,
@@ -309,29 +306,17 @@ static void lro_flush(struct net_lro_mgr *lro_mgr,
 
 	skb_shinfo(lro_desc->parent)->gso_size = lro_desc->mss;
 
-	if (lro_desc->vgrp) {
-		if (lro_mgr->features & LRO_F_NAPI)
-			vlan_hwaccel_receive_skb(lro_desc->parent,
-						 lro_desc->vgrp,
-						 lro_desc->vlan_tag);
-		else
-			vlan_hwaccel_rx(lro_desc->parent,
-					lro_desc->vgrp,
-					lro_desc->vlan_tag);
-
-	} else {
-		if (lro_mgr->features & LRO_F_NAPI)
-			netif_receive_skb(lro_desc->parent);
-		else
-			netif_rx(lro_desc->parent);
-	}
+	if (lro_mgr->features & LRO_F_NAPI)
+		netif_receive_skb(lro_desc->parent);
+	else
+		netif_rx(lro_desc->parent);
 
 	LRO_INC_STATS(lro_mgr, flushed);
 	lro_clear_desc(lro_desc);
 }
 
 static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb,
-			  struct vlan_group *vgrp, u16 vlan_tag, void *priv)
+			  void *priv)
 {
 	struct net_lro_desc *lro_desc;
 	struct iphdr *iph;
@@ -360,7 +345,7 @@ static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb,
 			goto out;
 
 		skb->ip_summed = lro_mgr->ip_summed_aggr;
-		lro_init_desc(lro_desc, skb, iph, tcph, vlan_tag, vgrp);
+		lro_init_desc(lro_desc, skb, iph, tcph);
 		LRO_INC_STATS(lro_mgr, aggregated);
 		return 0;
 	}
@@ -433,8 +418,7 @@ static struct sk_buff *lro_gen_skb(struct net_lro_mgr *lro_mgr,
 static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr,
 					  struct skb_frag_struct *frags,
 					  int len, int true_size,
-					  struct vlan_group *vgrp,
-					  u16 vlan_tag, void *priv, __wsum sum)
+					  void *priv, __wsum sum)
 {
 	struct net_lro_desc *lro_desc;
 	struct iphdr *iph;
@@ -480,7 +464,7 @@ static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr,
 		tcph = (void *)((u8 *)skb->data + vlan_hdr_len
 				+ IP_HDR_LEN(iph));
 
-		lro_init_desc(lro_desc, skb, iph, tcph, 0, NULL);
+		lro_init_desc(lro_desc, skb, iph, tcph);
 		LRO_INC_STATS(lro_mgr, aggregated);
 		return NULL;
 	}
@@ -514,7 +498,7 @@ void lro_receive_skb(struct net_lro_mgr *lro_mgr,
 		     struct sk_buff *skb,
 		     void *priv)
 {
-	if (__lro_proc_skb(lro_mgr, skb, NULL, 0, priv)) {
+	if (__lro_proc_skb(lro_mgr, skb, priv)) {
 		if (lro_mgr->features & LRO_F_NAPI)
 			netif_receive_skb(skb);
 		else
@@ -529,8 +513,7 @@ void lro_receive_frags(struct net_lro_mgr *lro_mgr,
 {
 	struct sk_buff *skb;
 
-	skb = __lro_proc_segment(lro_mgr, frags, len, true_size, NULL, 0,
-				 priv, sum);
+	skb = __lro_proc_segment(lro_mgr, frags, len, true_size, priv, sum);
 	if (!skb)
 		return;
 
-- 
cgit v1.2.3


From 6dacaddd4850cbd6390d4f87548262b5ed79b4a5 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jpirko@redhat.com>
Date: Wed, 20 Jul 2011 04:54:27 +0000
Subject: vlan: kill vlan_hwaccel_receive_skb

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_vlan.h | 13 -------------
 1 file changed, 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 69391cc20f3d..3996713caaec 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -202,19 +202,6 @@ static inline int vlan_hwaccel_rx(struct sk_buff *skb,
 	return __vlan_hwaccel_rx(skb, grp, vlan_tci, 0);
 }
 
-/**
- * vlan_hwaccel_receive_skb - netif_receive_skb wrapper for VLAN RX acceleration
- * @skb: buffer
- * @grp: vlan group
- * @vlan_tci: VLAN TCI as received from the card
- */
-static inline int vlan_hwaccel_receive_skb(struct sk_buff *skb,
-					   struct vlan_group *grp,
-					   u16 vlan_tci)
-{
-	return __vlan_hwaccel_rx(skb, grp, vlan_tci, 1);
-}
-
 /**
  * vlan_insert_tag - regular VLAN tag inserting
  * @skb: skbuff to tag
-- 
cgit v1.2.3


From a4aeb26628b5184386f99cf202ac837b0e56c975 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jpirko@redhat.com>
Date: Wed, 20 Jul 2011 04:54:36 +0000
Subject: vlan: kill __vlan_hwaccel_rx and vlan_hwaccel_rx

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_vlan.h | 22 ----------------------
 net/8021q/vlan_core.c   |  9 ---------
 2 files changed, 31 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 3996713caaec..d81beced0dae 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -125,8 +125,6 @@ extern struct net_device *__vlan_find_dev_deep(struct net_device *real_dev,
 extern struct net_device *vlan_dev_real_dev(const struct net_device *dev);
 extern u16 vlan_dev_vlan_id(const struct net_device *dev);
 
-extern int __vlan_hwaccel_rx(struct sk_buff *skb, struct vlan_group *grp,
-			     u16 vlan_tci, int polling);
 extern bool vlan_do_receive(struct sk_buff **skb);
 extern struct sk_buff *vlan_untag(struct sk_buff *skb);
 extern gro_result_t
@@ -155,13 +153,6 @@ static inline u16 vlan_dev_vlan_id(const struct net_device *dev)
 	return 0;
 }
 
-static inline int __vlan_hwaccel_rx(struct sk_buff *skb, struct vlan_group *grp,
-				    u16 vlan_tci, int polling)
-{
-	BUG();
-	return NET_XMIT_SUCCESS;
-}
-
 static inline bool vlan_do_receive(struct sk_buff **skb)
 {
 	if ((*skb)->vlan_tci & VLAN_VID_MASK)
@@ -189,19 +180,6 @@ vlan_gro_frags(struct napi_struct *napi, struct vlan_group *grp,
 }
 #endif
 
-/**
- * vlan_hwaccel_rx - netif_rx wrapper for VLAN RX acceleration
- * @skb: buffer
- * @grp: vlan group
- * @vlan_tci: VLAN TCI as received from the card
- */
-static inline int vlan_hwaccel_rx(struct sk_buff *skb,
-				  struct vlan_group *grp,
-				  u16 vlan_tci)
-{
-	return __vlan_hwaccel_rx(skb, grp, vlan_tci, 0);
-}
-
 /**
  * vlan_insert_tag - regular VLAN tag inserting
  * @skb: skbuff to tag
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index 5940366fac48..68b04ea5fa48 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -96,15 +96,6 @@ u16 vlan_dev_vlan_id(const struct net_device *dev)
 }
 EXPORT_SYMBOL(vlan_dev_vlan_id);
 
-/* VLAN rx hw acceleration helper.  This acts like netif_{rx,receive_skb}(). */
-int __vlan_hwaccel_rx(struct sk_buff *skb, struct vlan_group *grp,
-		      u16 vlan_tci, int polling)
-{
-	__vlan_hwaccel_put_tag(skb, vlan_tci);
-	return polling ? netif_receive_skb(skb) : netif_rx(skb);
-}
-EXPORT_SYMBOL(__vlan_hwaccel_rx);
-
 gro_result_t vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp,
 			      unsigned int vlan_tci, struct sk_buff *skb)
 {
-- 
cgit v1.2.3


From ffcf9b767293ebc3e59e639cd4ec0dff5ca39798 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jpirko@redhat.com>
Date: Wed, 20 Jul 2011 04:54:42 +0000
Subject: vlan: kill vlan_gro_frags and vlan_gro_receive

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_vlan.h | 20 --------------------
 net/8021q/vlan_core.c   | 16 ----------------
 2 files changed, 36 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index d81beced0dae..f40369e09f5c 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -127,12 +127,6 @@ extern u16 vlan_dev_vlan_id(const struct net_device *dev);
 
 extern bool vlan_do_receive(struct sk_buff **skb);
 extern struct sk_buff *vlan_untag(struct sk_buff *skb);
-extern gro_result_t
-vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp,
-		 unsigned int vlan_tci, struct sk_buff *skb);
-extern gro_result_t
-vlan_gro_frags(struct napi_struct *napi, struct vlan_group *grp,
-	       unsigned int vlan_tci);
 
 #else
 static inline struct net_device *
@@ -164,20 +158,6 @@ static inline struct sk_buff *vlan_untag(struct sk_buff *skb)
 {
 	return skb;
 }
-
-static inline gro_result_t
-vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp,
-		 unsigned int vlan_tci, struct sk_buff *skb)
-{
-	return GRO_DROP;
-}
-
-static inline gro_result_t
-vlan_gro_frags(struct napi_struct *napi, struct vlan_group *grp,
-	       unsigned int vlan_tci)
-{
-	return GRO_DROP;
-}
 #endif
 
 /**
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index 68b04ea5fa48..5f27f8e30254 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -96,22 +96,6 @@ u16 vlan_dev_vlan_id(const struct net_device *dev)
 }
 EXPORT_SYMBOL(vlan_dev_vlan_id);
 
-gro_result_t vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp,
-			      unsigned int vlan_tci, struct sk_buff *skb)
-{
-	__vlan_hwaccel_put_tag(skb, vlan_tci);
-	return napi_gro_receive(napi, skb);
-}
-EXPORT_SYMBOL(vlan_gro_receive);
-
-gro_result_t vlan_gro_frags(struct napi_struct *napi, struct vlan_group *grp,
-			    unsigned int vlan_tci)
-{
-	__vlan_hwaccel_put_tag(napi->skb, vlan_tci);
-	return napi_gro_frags(napi);
-}
-EXPORT_SYMBOL(vlan_gro_frags);
-
 static struct sk_buff *vlan_reorder_header(struct sk_buff *skb)
 {
 	if (skb_cow(skb, skb_headroom(skb)) < 0)
-- 
cgit v1.2.3


From 7890a5b9cbfd159c81ffd7866b5b2d4425886e7f Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jpirko@redhat.com>
Date: Wed, 20 Jul 2011 04:54:48 +0000
Subject: vlan: kill ndo_vlan_rx_register

has no users so remove it

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 8 --------
 net/8021q/vlan.c          | 4 ----
 2 files changed, 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 25c805b2c9e5..44f96b24e869 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -768,12 +768,6 @@ struct netdev_tc_txq {
  *	3. Update dev->stats asynchronously and atomically, and define
  *	   neither operation.
  *
- * void (*ndo_vlan_rx_register)(struct net_device *dev, struct vlan_group *grp);
- *	If device support VLAN receive acceleration
- *	(ie. dev->features & NETIF_F_HW_VLAN_RX), then this function is called
- *	when vlan groups for the device changes.  Note: grp is NULL
- *	if no vlan's groups are being used.
- *
  * void (*ndo_vlan_rx_add_vid)(struct net_device *dev, unsigned short vid);
  *	If device support VLAN filtering (dev->features & NETIF_F_HW_VLAN_FILTER)
  *	this function is called when a VLAN id is registered.
@@ -892,8 +886,6 @@ struct net_device_ops {
 						     struct rtnl_link_stats64 *storage);
 	struct net_device_stats* (*ndo_get_stats)(struct net_device *dev);
 
-	void			(*ndo_vlan_rx_register)(struct net_device *dev,
-						        struct vlan_group *grp);
 	void			(*ndo_vlan_rx_add_vid)(struct net_device *dev,
 						       unsigned short vid);
 	void			(*ndo_vlan_rx_kill_vid)(struct net_device *dev,
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index d24c4644b930..8970ba139d73 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -134,8 +134,6 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head)
 		vlan_gvrp_uninit_applicant(real_dev);
 
 		rcu_assign_pointer(real_dev->vlgrp, NULL);
-		if (ops->ndo_vlan_rx_register)
-			ops->ndo_vlan_rx_register(real_dev, NULL);
 
 		/* Free the group, after all cpu's are done. */
 		call_rcu(&grp->rcu, vlan_rcu_free);
@@ -207,8 +205,6 @@ int register_vlan_dev(struct net_device *dev)
 	grp->nr_vlans++;
 
 	if (ngrp) {
-		if (ops->ndo_vlan_rx_register && (real_dev->features & NETIF_F_HW_VLAN_RX))
-			ops->ndo_vlan_rx_register(real_dev, ngrp);
 		rcu_assign_pointer(real_dev->vlgrp, ngrp);
 	}
 	if (real_dev->features & NETIF_F_HW_VLAN_FILTER)
-- 
cgit v1.2.3


From 536d1d4a076210f763b60d7c3823f2edbddf3a9c Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jpirko@redhat.com>
Date: Wed, 20 Jul 2011 04:54:49 +0000
Subject: vlan: move vlan_group_[gs]et_device to public header

there are no users outside vlan code

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_vlan.h | 19 -------------------
 net/8021q/vlan.h        | 19 +++++++++++++++++++
 2 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index f40369e09f5c..44da4822bcab 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -91,25 +91,6 @@ struct vlan_group {
 	struct rcu_head		rcu;
 };
 
-static inline struct net_device *vlan_group_get_device(struct vlan_group *vg,
-						       u16 vlan_id)
-{
-	struct net_device **array;
-	array = vg->vlan_devices_arrays[vlan_id / VLAN_GROUP_ARRAY_PART_LEN];
-	return array ? array[vlan_id % VLAN_GROUP_ARRAY_PART_LEN] : NULL;
-}
-
-static inline void vlan_group_set_device(struct vlan_group *vg,
-					 u16 vlan_id,
-					 struct net_device *dev)
-{
-	struct net_device **array;
-	if (!vg)
-		return;
-	array = vg->vlan_devices_arrays[vlan_id / VLAN_GROUP_ARRAY_PART_LEN];
-	array[vlan_id % VLAN_GROUP_ARRAY_PART_LEN] = dev;
-}
-
 static inline int is_vlan_dev(struct net_device *dev)
 {
         return dev->priv_flags & IFF_802_1Q_VLAN;
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index b132f542b44b..9fd45f3571f9 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -74,6 +74,25 @@ static inline struct vlan_dev_info *vlan_dev_info(const struct net_device *dev)
 	return netdev_priv(dev);
 }
 
+static inline struct net_device *vlan_group_get_device(struct vlan_group *vg,
+						       u16 vlan_id)
+{
+	struct net_device **array;
+	array = vg->vlan_devices_arrays[vlan_id / VLAN_GROUP_ARRAY_PART_LEN];
+	return array ? array[vlan_id % VLAN_GROUP_ARRAY_PART_LEN] : NULL;
+}
+
+static inline void vlan_group_set_device(struct vlan_group *vg,
+					 u16 vlan_id,
+					 struct net_device *dev)
+{
+	struct net_device **array;
+	if (!vg)
+		return;
+	array = vg->vlan_devices_arrays[vlan_id / VLAN_GROUP_ARRAY_PART_LEN];
+	array[vlan_id % VLAN_GROUP_ARRAY_PART_LEN] = dev;
+}
+
 /* Must be invoked with rcu_read_lock or with RTNL. */
 static inline struct net_device *vlan_find_dev(struct net_device *real_dev,
 					       u16 vlan_id)
-- 
cgit v1.2.3


From 36bcfe7d74782c07f601edc4831f6b1ef40e9e43 Mon Sep 17 00:00:00 2001
From: Giuseppe CAVALLARO <peppe.cavallaro@st.com>
Date: Wed, 20 Jul 2011 00:05:23 +0000
Subject: stmmac: unify MAC and PHY configuration parameters (V2)

Prior to this change, most PHY configuration parameters were passed
into the STMMAC device as a separate PHY device. As well as being
unusual, this made it difficult to make changes to the MAC/PHY
relationship.

This patch moves all the PHY parameters into the MAC configuration
structure, mainly as a separate structure. This allows us to completely
ignore the MDIO bus attached to a stmmac if desired, and not create
the PHY bus. It also allows the stmmac driver to use a different PHY
from the one it is connected to, for example a fixed PHY or bit banging
PHY.

Also derive the stmmac/PHY connection type (MII/RMII etc) from the
mode can be passed into <platf>_configure_ethernet.
STLinux kernel at git://git.stlinux.com/stm/linux-sh4-2.6.32.y.git
provides several examples how to use this new infrastructure (that
actually is easier to maintain and clearer).

Signed-off-by: Stuart Menefy <stuart.menefy@st.com>
Signed-off-by: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/stmmac/stmmac.h      |  6 +--
 drivers/net/stmmac/stmmac_main.c | 95 +++-------------------------------------
 drivers/net/stmmac/stmmac_mdio.c | 83 ++++++++++++++++++++++++-----------
 include/linux/stmmac.h           | 24 +++++-----
 4 files changed, 77 insertions(+), 131 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/stmmac/stmmac.h b/drivers/net/stmmac/stmmac.h
index 63979b75f085..de1929b2641b 100644
--- a/drivers/net/stmmac/stmmac.h
+++ b/drivers/net/stmmac/stmmac.h
@@ -56,14 +56,9 @@ struct stmmac_priv {
 	struct stmmac_extra_stats xstats;
 	struct napi_struct napi;
 
-	phy_interface_t phy_interface;
-	int phy_addr;
-	int phy_mask;
-	int (*phy_reset) (void *priv);
 	int rx_coe;
 	int no_csum_insertion;
 
-	int phy_irq;
 	struct phy_device *phydev;
 	int oldlink;
 	int speed;
@@ -71,6 +66,7 @@ struct stmmac_priv {
 	unsigned int flow_ctrl;
 	unsigned int pause;
 	struct mii_bus *mii;
+	int mii_irq[PHY_MAX_ADDR];
 
 	u32 msg_enable;
 	spinlock_t lock;
diff --git a/drivers/net/stmmac/stmmac_main.c b/drivers/net/stmmac/stmmac_main.c
index 40a6ae582800..c6e567e04eff 100644
--- a/drivers/net/stmmac/stmmac_main.c
+++ b/drivers/net/stmmac/stmmac_main.c
@@ -49,7 +49,6 @@
 #include "stmmac.h"
 
 #define STMMAC_RESOURCE_NAME	"stmmaceth"
-#define PHY_RESOURCE_NAME	"stmmacphy"
 
 #undef STMMAC_DEBUG
 /*#define STMMAC_DEBUG*/
@@ -305,18 +304,13 @@ static int stmmac_init_phy(struct net_device *dev)
 	priv->speed = 0;
 	priv->oldduplex = -1;
 
-	if (priv->phy_addr == -1) {
-		/* We don't have a PHY, so do nothing */
-		return 0;
-	}
-
 	snprintf(bus_id, MII_BUS_ID_SIZE, "%x", priv->plat->bus_id);
 	snprintf(phy_id, MII_BUS_ID_SIZE + 3, PHY_ID_FMT, bus_id,
-		 priv->phy_addr);
+		 priv->plat->phy_addr);
 	pr_debug("stmmac_init_phy:  trying to attach to %s\n", phy_id);
 
 	phydev = phy_connect(dev, phy_id, &stmmac_adjust_link, 0,
-			priv->phy_interface);
+			     priv->plat->interface);
 
 	if (IS_ERR(phydev)) {
 		pr_err("%s: Could not attach to PHY\n", dev->name);
@@ -335,7 +329,7 @@ static int stmmac_init_phy(struct net_device *dev)
 		return -ENODEV;
 	}
 	pr_debug("stmmac_init_phy:  %s: attached to PHY (UID 0x%x)"
-	       " Link = %d\n", dev->name, phydev->phy_id, phydev->link);
+		 " Link = %d\n", dev->name, phydev->phy_id, phydev->link);
 
 	priv->phydev = phydev;
 
@@ -1528,71 +1522,6 @@ static int stmmac_mac_device_setup(struct net_device *dev)
 	return 0;
 }
 
-static int stmmacphy_dvr_probe(struct platform_device *pdev)
-{
-	struct plat_stmmacphy_data *plat_dat = pdev->dev.platform_data;
-
-	pr_debug("stmmacphy_dvr_probe: added phy for bus %d\n",
-	       plat_dat->bus_id);
-
-	return 0;
-}
-
-static int stmmacphy_dvr_remove(struct platform_device *pdev)
-{
-	return 0;
-}
-
-static struct platform_driver stmmacphy_driver = {
-	.driver = {
-		   .name = PHY_RESOURCE_NAME,
-		   },
-	.probe = stmmacphy_dvr_probe,
-	.remove = stmmacphy_dvr_remove,
-};
-
-/**
- * stmmac_associate_phy
- * @dev: pointer to device structure
- * @data: points to the private structure.
- * Description: Scans through all the PHYs we have registered and checks if
- * any are associated with our MAC.  If so, then just fill in
- * the blanks in our local context structure
- */
-static int stmmac_associate_phy(struct device *dev, void *data)
-{
-	struct stmmac_priv *priv = (struct stmmac_priv *)data;
-	struct plat_stmmacphy_data *plat_dat = dev->platform_data;
-
-	DBG(probe, DEBUG, "%s: checking phy for bus %d\n", __func__,
-		plat_dat->bus_id);
-
-	/* Check that this phy is for the MAC being initialised */
-	if (priv->plat->bus_id != plat_dat->bus_id)
-		return 0;
-
-	/* OK, this PHY is connected to the MAC.
-	   Go ahead and get the parameters */
-	DBG(probe, DEBUG, "%s: OK. Found PHY config\n", __func__);
-	priv->phy_irq =
-	    platform_get_irq_byname(to_platform_device(dev), "phyirq");
-	DBG(probe, DEBUG, "%s: PHY irq on bus %d is %d\n", __func__,
-	    plat_dat->bus_id, priv->phy_irq);
-
-	/* Override with kernel parameters if supplied XXX CRS XXX
-	 * this needs to have multiple instances */
-	if ((phyaddr >= 0) && (phyaddr <= 31))
-		plat_dat->phy_addr = phyaddr;
-
-	priv->phy_addr = plat_dat->phy_addr;
-	priv->phy_mask = plat_dat->phy_mask;
-	priv->phy_interface = plat_dat->interface;
-	priv->phy_reset = plat_dat->phy_reset;
-
-	DBG(probe, DEBUG, "%s: exiting\n", __func__);
-	return 1;	/* forces exit of driver_for_each_device() */
-}
-
 /**
  * stmmac_dvr_probe
  * @pdev: platform device pointer
@@ -1683,14 +1612,10 @@ static int stmmac_dvr_probe(struct platform_device *pdev)
 	if (ret < 0)
 		goto out_plat_exit;
 
-	/* associate a PHY - it is provided by another platform bus */
-	if (!driver_for_each_device
-	    (&(stmmacphy_driver.driver), NULL, (void *)priv,
-	     stmmac_associate_phy)) {
-		pr_err("No PHY device is associated with this MAC!\n");
-		ret = -ENODEV;
-		goto out_unregister;
-	}
+	/* Override with kernel parameters if supplied XXX CRS XXX
+	 * this needs to have multiple instances */
+	if ((phyaddr >= 0) && (phyaddr <= 31))
+		priv->plat->phy_addr = phyaddr;
 
 	pr_info("\t%s - (dev. name: %s - id: %d, IRQ #%d\n"
 	       "\tIO base addr: 0x%p)\n", ndev->name, pdev->name,
@@ -1890,11 +1815,6 @@ static int __init stmmac_init_module(void)
 {
 	int ret;
 
-	if (platform_driver_register(&stmmacphy_driver)) {
-		pr_err("No PHY devices registered!\n");
-		return -ENODEV;
-	}
-
 	ret = platform_driver_register(&stmmac_driver);
 	return ret;
 }
@@ -1905,7 +1825,6 @@ static int __init stmmac_init_module(void)
  */
 static void __exit stmmac_cleanup_module(void)
 {
-	platform_driver_unregister(&stmmacphy_driver);
 	platform_driver_unregister(&stmmac_driver);
 }
 
diff --git a/drivers/net/stmmac/stmmac_mdio.c b/drivers/net/stmmac/stmmac_mdio.c
index 29a6bb6b8058..9c3b9d5c3411 100644
--- a/drivers/net/stmmac/stmmac_mdio.c
+++ b/drivers/net/stmmac/stmmac_mdio.c
@@ -113,9 +113,9 @@ static int stmmac_mdio_reset(struct mii_bus *bus)
 	struct stmmac_priv *priv = netdev_priv(ndev);
 	unsigned int mii_address = priv->hw->mii.addr;
 
-	if (priv->phy_reset) {
+	if (priv->plat->mdio_bus_data->phy_reset) {
 		pr_debug("stmmac_mdio_reset: calling phy_reset\n");
-		priv->phy_reset(priv->plat->bsp_priv);
+		priv->plat->mdio_bus_data->phy_reset(priv->plat->bsp_priv);
 	}
 
 	/* This is a workaround for problems with the STE101P PHY.
@@ -138,30 +138,29 @@ int stmmac_mdio_register(struct net_device *ndev)
 	struct mii_bus *new_bus;
 	int *irqlist;
 	struct stmmac_priv *priv = netdev_priv(ndev);
+	struct stmmac_mdio_bus_data *mdio_bus_data = priv->plat->mdio_bus_data;
 	int addr, found;
 
+	if (!mdio_bus_data)
+		return 0;
+
 	new_bus = mdiobus_alloc();
 	if (new_bus == NULL)
 		return -ENOMEM;
 
-	irqlist = kzalloc(sizeof(int) * PHY_MAX_ADDR, GFP_KERNEL);
-	if (irqlist == NULL) {
-		err = -ENOMEM;
-		goto irqlist_alloc_fail;
-	}
-
-	/* Assign IRQ to phy at address phy_addr */
-	if (priv->phy_addr != -1)
-		irqlist[priv->phy_addr] = priv->phy_irq;
+	if (mdio_bus_data->irqs)
+		irqlist = mdio_bus_data->irqs;
+	else
+		irqlist = priv->mii_irq;
 
 	new_bus->name = "STMMAC MII Bus";
 	new_bus->read = &stmmac_mdio_read;
 	new_bus->write = &stmmac_mdio_write;
 	new_bus->reset = &stmmac_mdio_reset;
-	snprintf(new_bus->id, MII_BUS_ID_SIZE, "%x", priv->plat->bus_id);
+	snprintf(new_bus->id, MII_BUS_ID_SIZE, "%x", mdio_bus_data->bus_id);
 	new_bus->priv = ndev;
 	new_bus->irq = irqlist;
-	new_bus->phy_mask = priv->phy_mask;
+	new_bus->phy_mask = mdio_bus_data->phy_mask;
 	new_bus->parent = priv->device;
 	err = mdiobus_register(new_bus);
 	if (err != 0) {
@@ -172,18 +171,50 @@ int stmmac_mdio_register(struct net_device *ndev)
 	priv->mii = new_bus;
 
 	found = 0;
-	for (addr = 0; addr < 32; addr++) {
+	for (addr = 0; addr < PHY_MAX_ADDR; addr++) {
 		struct phy_device *phydev = new_bus->phy_map[addr];
 		if (phydev) {
-			if (priv->phy_addr == -1) {
-				priv->phy_addr = addr;
-				phydev->irq = priv->phy_irq;
-				irqlist[addr] = priv->phy_irq;
+			int act = 0;
+			char irq_num[4];
+			char *irq_str;
+
+			/*
+			 * If an IRQ was provided to be assigned after
+			 * the bus probe, do it here.
+			 */
+			if ((mdio_bus_data->irqs == NULL) &&
+			    (mdio_bus_data->probed_phy_irq > 0)) {
+				irqlist[addr] = mdio_bus_data->probed_phy_irq;
+				phydev->irq = mdio_bus_data->probed_phy_irq;
 			}
-			pr_info("%s: PHY ID %08x at %d IRQ %d (%s)%s\n",
-			       ndev->name, phydev->phy_id, addr,
-			       phydev->irq, dev_name(&phydev->dev),
-			       (addr == priv->phy_addr) ? " active" : "");
+
+			/*
+			 * If we're  going to bind the MAC to this PHY bus,
+			 * and no PHY number was provided to the MAC,
+			 * use the one probed here.
+			 */
+			if ((priv->plat->bus_id == mdio_bus_data->bus_id) &&
+			    (priv->plat->phy_addr == -1))
+				priv->plat->phy_addr = addr;
+
+			act = (priv->plat->bus_id == mdio_bus_data->bus_id) &&
+				(priv->plat->phy_addr == addr);
+			switch (phydev->irq) {
+			case PHY_POLL:
+				irq_str = "POLL";
+				break;
+			case PHY_IGNORE_INTERRUPT:
+				irq_str = "IGNORE";
+				break;
+			default:
+				sprintf(irq_num, "%d", phydev->irq);
+				irq_str = irq_num;
+				break;
+			}
+			pr_info("%s: PHY ID %08x at %d IRQ %s (%s)%s\n",
+				ndev->name, phydev->phy_id, addr,
+				irq_str, dev_name(&phydev->dev),
+				act ? " active" : "");
 			found = 1;
 		}
 	}
@@ -192,10 +223,9 @@ int stmmac_mdio_register(struct net_device *ndev)
 		pr_warning("%s: No PHY found\n", ndev->name);
 
 	return 0;
+
 bus_register_fail:
-	kfree(irqlist);
-irqlist_alloc_fail:
-	kfree(new_bus);
+	mdiobus_free(new_bus);
 	return err;
 }
 
@@ -210,7 +240,8 @@ int stmmac_mdio_unregister(struct net_device *ndev)
 
 	mdiobus_unregister(priv->mii);
 	priv->mii->priv = NULL;
-	kfree(priv->mii);
+	mdiobus_free(priv->mii);
+	priv->mii = NULL;
 
 	return 0;
 }
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 05d775690b72..0dddc9e42b6b 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -28,11 +28,21 @@
 
 #include <linux/platform_device.h>
 
-/* platform data for platform device structure's platform_data field */
+/* Platfrom data for platform device structure's platform_data field */
+
+struct stmmac_mdio_bus_data {
+	int bus_id;
+	int (*phy_reset)(void *priv);
+	unsigned int phy_mask;
+	int *irqs;
+	int probed_phy_irq;
+};
 
-/* Private data for the STM on-board ethernet driver */
 struct plat_stmmacenet_data {
 	int bus_id;
+	int phy_addr;
+	int interface;
+	struct stmmac_mdio_bus_data *mdio_bus_data;
 	int pbl;
 	int clk_csr;
 	int has_gmac;
@@ -48,14 +58,4 @@ struct plat_stmmacenet_data {
 	void *custom_cfg;
 	void *bsp_priv;
 };
-
-struct plat_stmmacphy_data {
-	int bus_id;
-	int phy_addr;
-	unsigned int phy_mask;
-	int interface;
-	int (*phy_reset)(void *priv);
-	void *priv;
-};
 #endif
-
-- 
cgit v1.2.3


From e3dbc46c2a62688d1aaa1f7291a054fb2b502849 Mon Sep 17 00:00:00 2001
From: "Choi, Jong-Hwan" <jhbird.choi@samsung.com>
Date: Wed, 20 Jul 2011 20:33:03 +0000
Subject: net: Kobj and queues_kset should be used when CONFIG_XPS is enabled

Kobj and queues_kset are used with CONFIG_XPS=y.

Signed-off-by: Choi, Jong-Hwan <jhbird.choi@samsung.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 44f96b24e869..34f3abc6457a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -530,7 +530,7 @@ struct netdev_queue {
 	struct Qdisc		*qdisc;
 	unsigned long		state;
 	struct Qdisc		*qdisc_sleeping;
-#ifdef CONFIG_RPS
+#if defined(CONFIG_RPS) || defined(CONFIG_XPS)
 	struct kobject		kobj;
 #endif
 #if defined(CONFIG_XPS) && defined(CONFIG_NUMA)
@@ -1179,7 +1179,7 @@ struct net_device {
 
 	unsigned char		broadcast[MAX_ADDR_LEN];	/* hw bcast add	*/
 
-#ifdef CONFIG_RPS
+#if defined(CONFIG_RPS) || defined(CONFIG_XPS)
 	struct kset		*queues_kset;
 
 	struct netdev_rx_queue	*_rx;
-- 
cgit v1.2.3


From 87c48fa3b4630905f98268dde838ee43626a060c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 21 Jul 2011 21:25:58 -0700
Subject: ipv6: make fragment identifications less predictable

IPv6 fragment identification generation is way beyond what we use for
IPv4 : It uses a single generator. Its not scalable and allows DOS
attacks.

Now inetpeer is IPv6 aware, we can use it to provide a more secure and
scalable frag ident generator (per destination, instead of system wide)

This patch :
1) defines a new secure_ipv6_id() helper
2) extends inet_getid() to provide 32bit results
3) extends ipv6_select_ident() with a new dest parameter

Reported-by: Fernando Gont <fernando@gont.com.ar>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/char/random.c  | 15 +++++++++++++++
 include/linux/random.h |  1 +
 include/net/inetpeer.h | 13 ++++++++++---
 include/net/ipv6.h     | 12 +-----------
 net/ipv4/inetpeer.c    |  7 +++++--
 net/ipv6/ip6_output.c  | 36 +++++++++++++++++++++++++++++++-----
 net/ipv6/udp.c         |  2 +-
 7 files changed, 64 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/random.c b/drivers/char/random.c
index d4ddeba56682..729281961f22 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1523,6 +1523,21 @@ __u32 secure_ip_id(__be32 daddr)
 	return half_md4_transform(hash, keyptr->secret);
 }
 
+__u32 secure_ipv6_id(const __be32 daddr[4])
+{
+	const struct keydata *keyptr;
+	__u32 hash[4];
+
+	keyptr = get_keyptr();
+
+	hash[0] = (__force __u32)daddr[0];
+	hash[1] = (__force __u32)daddr[1];
+	hash[2] = (__force __u32)daddr[2];
+	hash[3] = (__force __u32)daddr[3];
+
+	return half_md4_transform(hash, keyptr->secret);
+}
+
 #ifdef CONFIG_INET
 
 __u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
diff --git a/include/linux/random.h b/include/linux/random.h
index fb7ab9de5f36..ce29a040c8dc 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -58,6 +58,7 @@ extern void get_random_bytes(void *buf, int nbytes);
 void generate_random_uuid(unsigned char uuid_out[16]);
 
 extern __u32 secure_ip_id(__be32 daddr);
+extern __u32 secure_ipv6_id(const __be32 daddr[4]);
 extern u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport);
 extern u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
 				      __be16 dport);
diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h
index 39d123081e7e..4233e6f9841d 100644
--- a/include/net/inetpeer.h
+++ b/include/net/inetpeer.h
@@ -71,7 +71,7 @@ static inline bool inet_metrics_new(const struct inet_peer *p)
 }
 
 /* can be called with or without local BH being disabled */
-struct inet_peer	*inet_getpeer(struct inetpeer_addr *daddr, int create);
+struct inet_peer	*inet_getpeer(const struct inetpeer_addr *daddr, int create);
 
 static inline struct inet_peer *inet_getpeer_v4(__be32 v4daddr, int create)
 {
@@ -106,11 +106,18 @@ static inline void inet_peer_refcheck(const struct inet_peer *p)
 
 
 /* can be called with or without local BH being disabled */
-static inline __u16	inet_getid(struct inet_peer *p, int more)
+static inline int inet_getid(struct inet_peer *p, int more)
 {
+	int old, new;
 	more++;
 	inet_peer_refcheck(p);
-	return atomic_add_return(more, &p->ip_id_count) - more;
+	do {
+		old = atomic_read(&p->ip_id_count);
+		new = old + more;
+		if (!new)
+			new = 1;
+	} while (atomic_cmpxchg(&p->ip_id_count, old, new) != old);
+	return new;
 }
 
 #endif /* _NET_INETPEER_H */
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index c033ed00df7d..3b5ac1fbff39 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -463,17 +463,7 @@ static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_add
 	return __ipv6_addr_diff(a1, a2, sizeof(struct in6_addr));
 }
 
-static __inline__ void ipv6_select_ident(struct frag_hdr *fhdr)
-{
-	static u32 ipv6_fragmentation_id = 1;
-	static DEFINE_SPINLOCK(ip6_id_lock);
-
-	spin_lock_bh(&ip6_id_lock);
-	fhdr->identification = htonl(ipv6_fragmentation_id);
-	if (++ipv6_fragmentation_id == 0)
-		ipv6_fragmentation_id = 1;
-	spin_unlock_bh(&ip6_id_lock);
-}
+extern void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt);
 
 /*
  *	Prototypes exported by ipv6
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 90c5f0d1bcf3..e38213817d0a 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -391,7 +391,7 @@ static int inet_peer_gc(struct inet_peer_base *base,
 	return cnt;
 }
 
-struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create)
+struct inet_peer *inet_getpeer(const struct inetpeer_addr *daddr, int create)
 {
 	struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
 	struct inet_peer_base *base = family_to_base(daddr->family);
@@ -436,7 +436,10 @@ relookup:
 		p->daddr = *daddr;
 		atomic_set(&p->refcnt, 1);
 		atomic_set(&p->rid, 0);
-		atomic_set(&p->ip_id_count, secure_ip_id(daddr->addr.a4));
+		atomic_set(&p->ip_id_count,
+				(daddr->family == AF_INET) ?
+					secure_ip_id(daddr->addr.a4) :
+					secure_ipv6_id(daddr->addr.a6));
 		p->tcp_ts_stamp = 0;
 		p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
 		p->rate_tokens = 0;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 8db0e4875ad8..32e5339db0c8 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -596,6 +596,31 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
 	return offset;
 }
 
+void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
+{
+	static atomic_t ipv6_fragmentation_id;
+	int old, new;
+
+	if (rt) {
+		struct inet_peer *peer;
+
+		if (!rt->rt6i_peer)
+			rt6_bind_peer(rt, 1);
+		peer = rt->rt6i_peer;
+		if (peer) {
+			fhdr->identification = htonl(inet_getid(peer, 0));
+			return;
+		}
+	}
+	do {
+		old = atomic_read(&ipv6_fragmentation_id);
+		new = old + 1;
+		if (!new)
+			new = 1;
+	} while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
+	fhdr->identification = htonl(new);
+}
+
 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 {
 	struct sk_buff *frag;
@@ -680,7 +705,7 @@ int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 		skb_reset_network_header(skb);
 		memcpy(skb_network_header(skb), tmp_hdr, hlen);
 
-		ipv6_select_ident(fh);
+		ipv6_select_ident(fh, rt);
 		fh->nexthdr = nexthdr;
 		fh->reserved = 0;
 		fh->frag_off = htons(IP6_MF);
@@ -826,7 +851,7 @@ slow_path:
 		fh->nexthdr = nexthdr;
 		fh->reserved = 0;
 		if (!frag_id) {
-			ipv6_select_ident(fh);
+			ipv6_select_ident(fh, rt);
 			frag_id = fh->identification;
 		} else
 			fh->identification = frag_id;
@@ -1076,7 +1101,8 @@ static inline int ip6_ufo_append_data(struct sock *sk,
 			int getfrag(void *from, char *to, int offset, int len,
 			int odd, struct sk_buff *skb),
 			void *from, int length, int hh_len, int fragheaderlen,
-			int transhdrlen, int mtu,unsigned int flags)
+			int transhdrlen, int mtu,unsigned int flags,
+			struct rt6_info *rt)
 
 {
 	struct sk_buff *skb;
@@ -1120,7 +1146,7 @@ static inline int ip6_ufo_append_data(struct sock *sk,
 		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
 					     sizeof(struct frag_hdr)) & ~7;
 		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
-		ipv6_select_ident(&fhdr);
+		ipv6_select_ident(&fhdr, rt);
 		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
 		__skb_queue_tail(&sk->sk_write_queue, skb);
 
@@ -1286,7 +1312,7 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
 
 			err = ip6_ufo_append_data(sk, getfrag, from, length,
 						  hh_len, fragheaderlen,
-						  transhdrlen, mtu, flags);
+						  transhdrlen, mtu, flags, rt);
 			if (err)
 				goto error;
 			return 0;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 328985c40883..29213b51c499 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1359,7 +1359,7 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, u32 features)
 	fptr = (struct frag_hdr *)(skb_network_header(skb) + unfrag_ip6hlen);
 	fptr->nexthdr = nexthdr;
 	fptr->reserved = 0;
-	ipv6_select_ident(fptr);
+	ipv6_select_ident(fptr, (struct rt6_info *)skb_dst(skb));
 
 	/* Fragment the skb. ipv6 header and the remaining fields of the
 	 * fragment header are updated in ipv6_gso_segment()
-- 
cgit v1.2.3


From 5dea1c88ed11a1221581c4b202f053c4fc138704 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 22 Jul 2011 14:39:48 +0930
Subject: lguest: use a special 1:1 linear pagetable mode until first switch.

The Host used to create some page tables for the Guest to use at the
top of Guest memory; it would then tell the Guest where this was.  In
particular, it created linear mappings for 0 and 0xC0000000 addresses
because lguest used to switch to its real page tables quite late in
boot.

However, since d50d8fe19 Linux initialized boot page tables in
head_32.S even before the "are we lguest?" boot jump.  So, now we can
simplify things: the Host pagetable code assumes 1:1 linear mapping
until it first calls the LHCALL_NEW_PGTABLE hypercall, which we now do
before we reach C code.

This also means that the Host doesn't need to know anything about the
Guest's PAGE_OFFSET.  (Non-Linux guests might not even have such a
thing).

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 arch/x86/kernel/asm-offsets_32.c |   1 -
 arch/x86/lguest/boot.c           |  11 +-
 arch/x86/lguest/i386_head.S      |   9 +-
 drivers/lguest/lg.h              |   2 +
 drivers/lguest/page_tables.c     | 278 ++++++++++++---------------------------
 include/linux/lguest.h           |   2 -
 6 files changed, 98 insertions(+), 205 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index c29d631af6fc..395a10e68067 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -63,7 +63,6 @@ void foo(void)
 	BLANK();
 	OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
 	OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
-	OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
 
 	BLANK();
 	OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index db832fd65ecb..719a32c60516 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -520,17 +520,16 @@ static unsigned long lguest_read_cr2(void)
 
 /* See lguest_set_pte() below. */
 static bool cr3_changed = false;
+static unsigned long current_cr3;
 
 /*
  * cr3 is the current toplevel pagetable page: the principle is the same as
- * cr0.  Keep a local copy, and tell the Host when it changes.  The only
- * difference is that our local copy is in lguest_data because the Host needs
- * to set it upon our initial hypercall.
+ * cr0.  Keep a local copy, and tell the Host when it changes.
  */
 static void lguest_write_cr3(unsigned long cr3)
 {
-	lguest_data.pgdir = cr3;
 	lazy_hcall1(LHCALL_NEW_PGTABLE, cr3);
+	current_cr3 = cr3;
 
 	/* These two page tables are simple, linear, and used during boot */
 	if (cr3 != __pa(swapper_pg_dir) && cr3 != __pa(initial_page_table))
@@ -539,7 +538,7 @@ static void lguest_write_cr3(unsigned long cr3)
 
 static unsigned long lguest_read_cr3(void)
 {
-	return lguest_data.pgdir;
+	return current_cr3;
 }
 
 /* cr4 is used to enable and disable PGE, but we don't care. */
@@ -758,7 +757,7 @@ static void lguest_pmd_clear(pmd_t *pmdp)
 static void lguest_flush_tlb_single(unsigned long addr)
 {
 	/* Simply set it to zero: if it was not, it will fault back in. */
-	lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0);
+	lazy_hcall3(LHCALL_SET_PTE, current_cr3, addr, 0);
 }
 
 /*
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index 4f420c2f2d55..863b03a9fbff 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -27,13 +27,18 @@
 .section .init.text, "ax", @progbits
 ENTRY(lguest_entry)
 	/*
-	 * We make the "initialization" hypercall now to tell the Host about
-	 * us, and also find out where it put our page tables.
+	 * We make the "initialization" hypercall now to tell the Host where
+	 * our lguest_data struct is.
 	 */
 	movl $LHCALL_LGUEST_INIT, %eax
 	movl $lguest_data - __PAGE_OFFSET, %ebx
 	int $LGUEST_TRAP_ENTRY
 
+	/* Now turn our pagetables on; setup by arch/x86/kernel/head_32.S. */
+	movl $LHCALL_NEW_PGTABLE, %eax
+	movl $(initial_page_table - __PAGE_OFFSET), %ebx
+	int $LGUEST_TRAP_ENTRY
+
 	/* Set up the initial stack so we can run C code. */
 	movl $(init_thread_union+THREAD_SIZE),%esp
 
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 9136411fadd5..295df06e6590 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -59,6 +59,8 @@ struct lg_cpu {
 
 	struct lguest_pages *last_pages;
 
+	/* Initialization mode: linear map everything. */
+	bool linear_pages;
 	int cpu_pgd; /* Which pgd this cpu is currently using */
 
 	/* If a hypercall was asked for, this points to the arguments. */
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index d21578ee95de..00026222bde8 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -17,7 +17,6 @@
 #include <linux/percpu.h>
 #include <asm/tlbflush.h>
 #include <asm/uaccess.h>
-#include <asm/bootparam.h>
 #include "lg.h"
 
 /*M:008
@@ -325,10 +324,15 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 #endif
 
 	/* First step: get the top-level Guest page table entry. */
-	gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
-	/* Toplevel not present?  We can't map it in. */
-	if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
-		return false;
+	if (unlikely(cpu->linear_pages)) {
+		/* Faking up a linear mapping. */
+		gpgd = __pgd(CHECK_GPGD_MASK);
+	} else {
+		gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
+		/* Toplevel not present?  We can't map it in. */
+		if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
+			return false;
+	}
 
 	/* Now look at the matching shadow entry. */
 	spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
@@ -353,10 +357,15 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 	}
 
 #ifdef CONFIG_X86_PAE
-	gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
-	/* Middle level not present?  We can't map it in. */
-	if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
-		return false;
+	if (unlikely(cpu->linear_pages)) {
+		/* Faking up a linear mapping. */
+		gpmd = __pmd(_PAGE_TABLE);
+	} else {
+		gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
+		/* Middle level not present?  We can't map it in. */
+		if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
+			return false;
+	}
 
 	/* Now look at the matching shadow entry. */
 	spmd = spmd_addr(cpu, *spgd, vaddr);
@@ -397,8 +406,13 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 	gpte_ptr = gpte_addr(cpu, gpgd, vaddr);
 #endif
 
-	/* Read the actual PTE value. */
-	gpte = lgread(cpu, gpte_ptr, pte_t);
+	if (unlikely(cpu->linear_pages)) {
+		/* Linear?  Make up a PTE which points to same page. */
+		gpte = __pte((vaddr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT);
+	} else {
+		/* Read the actual PTE value. */
+		gpte = lgread(cpu, gpte_ptr, pte_t);
+	}
 
 	/* If this page isn't in the Guest page tables, we can't page it in. */
 	if (!(pte_flags(gpte) & _PAGE_PRESENT))
@@ -454,7 +468,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 	 * Finally, we write the Guest PTE entry back: we've set the
 	 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags.
 	 */
-	lgwrite(cpu, gpte_ptr, pte_t, gpte);
+	if (likely(!cpu->linear_pages))
+		lgwrite(cpu, gpte_ptr, pte_t, gpte);
 
 	/*
 	 * The fault is fixed, the page table is populated, the mapping
@@ -612,6 +627,11 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
 #ifdef CONFIG_X86_PAE
 	pmd_t gpmd;
 #endif
+
+	/* Still not set up?  Just map 1:1. */
+	if (unlikely(cpu->linear_pages))
+		return vaddr;
+
 	/* First step: get the top-level Guest page table entry. */
 	gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
 	/* Toplevel not present?  We can't map it in. */
@@ -708,32 +728,6 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
 	return next;
 }
 
-/*H:430
- * (iv) Switching page tables
- *
- * Now we've seen all the page table setting and manipulation, let's see
- * what happens when the Guest changes page tables (ie. changes the top-level
- * pgdir).  This occurs on almost every context switch.
- */
-void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
-{
-	int newpgdir, repin = 0;
-
-	/* Look to see if we have this one already. */
-	newpgdir = find_pgdir(cpu->lg, pgtable);
-	/*
-	 * If not, we allocate or mug an existing one: if it's a fresh one,
-	 * repin gets set to 1.
-	 */
-	if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs))
-		newpgdir = new_pgdir(cpu, pgtable, &repin);
-	/* Change the current pgd index to the new one. */
-	cpu->cpu_pgd = newpgdir;
-	/* If it was completely blank, we map in the Guest kernel stack */
-	if (repin)
-		pin_stack_pages(cpu);
-}
-
 /*H:470
  * Finally, a routine which throws away everything: all PGD entries in all
  * the shadow page tables, including the Guest's kernel mappings.  This is used
@@ -780,6 +774,44 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu)
 	/* We need the Guest kernel stack mapped again. */
 	pin_stack_pages(cpu);
 }
+
+/*H:430
+ * (iv) Switching page tables
+ *
+ * Now we've seen all the page table setting and manipulation, let's see
+ * what happens when the Guest changes page tables (ie. changes the top-level
+ * pgdir).  This occurs on almost every context switch.
+ */
+void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
+{
+	int newpgdir, repin = 0;
+
+	/*
+	 * The very first time they call this, we're actually running without
+	 * any page tables; we've been making it up.  Throw them away now.
+	 */
+	if (unlikely(cpu->linear_pages)) {
+		release_all_pagetables(cpu->lg);
+		cpu->linear_pages = false;
+		/* Force allocation of a new pgdir. */
+		newpgdir = ARRAY_SIZE(cpu->lg->pgdirs);
+	} else {
+		/* Look to see if we have this one already. */
+		newpgdir = find_pgdir(cpu->lg, pgtable);
+	}
+
+	/*
+	 * If not, we allocate or mug an existing one: if it's a fresh one,
+	 * repin gets set to 1.
+	 */
+	if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs))
+		newpgdir = new_pgdir(cpu, pgtable, &repin);
+	/* Change the current pgd index to the new one. */
+	cpu->cpu_pgd = newpgdir;
+	/* If it was completely blank, we map in the Guest kernel stack */
+	if (repin)
+		pin_stack_pages(cpu);
+}
 /*:*/
 
 /*M:009
@@ -919,168 +951,26 @@ void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)
 }
 #endif
 
-/*H:505
- * To get through boot, we construct simple identity page mappings (which
- * set virtual == physical) and linear mappings which will get the Guest far
- * enough into the boot to create its own.  The linear mapping means we
- * simplify the Guest boot, but it makes assumptions about their PAGE_OFFSET,
- * as you'll see.
- *
- * We lay them out of the way, just below the initrd (which is why we need to
- * know its size here).
- */
-static unsigned long setup_pagetables(struct lguest *lg,
-				      unsigned long mem,
-				      unsigned long initrd_size)
-{
-	pgd_t __user *pgdir;
-	pte_t __user *linear;
-	unsigned long mem_base = (unsigned long)lg->mem_base;
-	unsigned int mapped_pages, i, linear_pages;
-#ifdef CONFIG_X86_PAE
-	pmd_t __user *pmds;
-	unsigned int j;
-	pgd_t pgd;
-	pmd_t pmd;
-#else
-	unsigned int phys_linear;
-#endif
-
-	/*
-	 * We have mapped_pages frames to map, so we need linear_pages page
-	 * tables to map them.
-	 */
-	mapped_pages = mem / PAGE_SIZE;
-	linear_pages = (mapped_pages + PTRS_PER_PTE - 1) / PTRS_PER_PTE;
-
-	/* We put the toplevel page directory page at the top of memory. */
-	pgdir = (pgd_t *)(mem + mem_base - initrd_size - PAGE_SIZE);
-
-	/* Now we use the next linear_pages pages as pte pages */
-	linear = (void *)pgdir - linear_pages * PAGE_SIZE;
-
-#ifdef CONFIG_X86_PAE
-	/*
-	 * And the single mid page goes below that.  We only use one, but
-	 * that's enough to map 1G, which definitely gets us through boot.
-	 */
-	pmds = (void *)linear - PAGE_SIZE;
-#endif
-	/*
-	 * Linear mapping is easy: put every page's address into the
-	 * mapping in order.
-	 */
-	for (i = 0; i < mapped_pages; i++) {
-		pte_t pte;
-		pte = pfn_pte(i, __pgprot(_PAGE_PRESENT|_PAGE_RW|_PAGE_USER));
-		if (copy_to_user(&linear[i], &pte, sizeof(pte)) != 0)
-			return -EFAULT;
-	}
-
-#ifdef CONFIG_X86_PAE
-	/*
-	 * Make the Guest PMD entries point to the corresponding place in the
-	 * linear mapping (up to one page worth of PMD).
-	 */
-	for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD;
-	     i += PTRS_PER_PTE, j++) {
-		pmd = pfn_pmd(((unsigned long)&linear[i] - mem_base)/PAGE_SIZE,
-			      __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER));
-
-		if (copy_to_user(&pmds[j], &pmd, sizeof(pmd)) != 0)
-			return -EFAULT;
-	}
-
-	/* One PGD entry, pointing to that PMD page. */
-	pgd = __pgd(((unsigned long)pmds - mem_base) | _PAGE_PRESENT);
-	/* Copy it in as the first PGD entry (ie. addresses 0-1G). */
-	if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0)
-		return -EFAULT;
-	/*
-	 * And the other PGD entry to make the linear mapping at PAGE_OFFSET
-	 */
-	if (copy_to_user(&pgdir[KERNEL_PGD_BOUNDARY], &pgd, sizeof(pgd)))
-		return -EFAULT;
-#else
-	/*
-	 * The top level points to the linear page table pages above.
-	 * We setup the identity and linear mappings here.
-	 */
-	phys_linear = (unsigned long)linear - mem_base;
-	for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) {
-		pgd_t pgd;
-		/*
-		 * Create a PGD entry which points to the right part of the
-		 * linear PTE pages.
-		 */
-		pgd = __pgd((phys_linear + i * sizeof(pte_t)) |
-			    (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER));
-
-		/*
-		 * Copy it into the PGD page at 0 and PAGE_OFFSET.
-		 */
-		if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd))
-		    || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET)
-					   + i / PTRS_PER_PTE],
-				    &pgd, sizeof(pgd)))
-			return -EFAULT;
-	}
-#endif
-
-	/*
-	 * We return the top level (guest-physical) address: we remember where
-	 * this is to write it into lguest_data when the Guest initializes.
-	 */
-	return (unsigned long)pgdir - mem_base;
-}
-
 /*H:500
  * (vii) Setting up the page tables initially.
  *
- * When a Guest is first created, the Launcher tells us where the toplevel of
- * its first page table is.  We set some things up here:
+ * When a Guest is first created, set initialize a shadow page table which
+ * we will populate on future faults.  The Guest doesn't have any actual
+ * pagetables yet, so we set linear_pages to tell demand_page() to fake it
+ * for the moment.
  */
 int init_guest_pagetable(struct lguest *lg)
 {
-	u64 mem;
-	u32 initrd_size;
-	struct boot_params __user *boot = (struct boot_params *)lg->mem_base;
-#ifdef CONFIG_X86_PAE
-	pgd_t *pgd;
-	pmd_t *pmd_table;
-#endif
-	/*
-	 * Get the Guest memory size and the ramdisk size from the boot header
-	 * located at lg->mem_base (Guest address 0).
-	 */
-	if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem))
-	    || get_user(initrd_size, &boot->hdr.ramdisk_size))
-		return -EFAULT;
+	struct lg_cpu *cpu = &lg->cpus[0];
+	int allocated = 0;
 
-	/*
-	 * We start on the first shadow page table, and give it a blank PGD
-	 * page.
-	 */
-	lg->pgdirs[0].gpgdir = setup_pagetables(lg, mem, initrd_size);
-	if (IS_ERR_VALUE(lg->pgdirs[0].gpgdir))
-		return lg->pgdirs[0].gpgdir;
-	lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
-	if (!lg->pgdirs[0].pgdir)
+	/* lg (and lg->cpus[]) starts zeroed: this allocates a new pgdir */
+	cpu->cpu_pgd = new_pgdir(cpu, 0, &allocated);
+	if (!allocated)
 		return -ENOMEM;
 
-#ifdef CONFIG_X86_PAE
-	/* For PAE, we also create the initial mid-level. */
-	pgd = lg->pgdirs[0].pgdir;
-	pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL);
-	if (!pmd_table)
-		return -ENOMEM;
-
-	set_pgd(pgd + SWITCHER_PGD_INDEX,
-		__pgd(__pa(pmd_table) | _PAGE_PRESENT));
-#endif
-
-	/* This is the current page table. */
-	lg->cpus[0].cpu_pgd = 0;
+	/* We start with a linear mapping until the initialize. */
+	cpu->linear_pages = true;
 	return 0;
 }
 
@@ -1095,10 +985,10 @@ void page_table_guest_data_init(struct lg_cpu *cpu)
 		 * of virtual addresses used by the Switcher.
 		 */
 		|| put_user(RESERVE_MEM * 1024 * 1024,
-			&cpu->lg->lguest_data->reserve_mem)
-		|| put_user(cpu->lg->pgdirs[0].gpgdir,
-			&cpu->lg->lguest_data->pgdir))
+			    &cpu->lg->lguest_data->reserve_mem)) {
 		kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
+		return;
+	}
 
 	/*
 	 * In flush_user_mappings() we loop from 0 to
diff --git a/include/linux/lguest.h b/include/linux/lguest.h
index 2fb1dcbcb5aa..9962c6bb1311 100644
--- a/include/linux/lguest.h
+++ b/include/linux/lguest.h
@@ -59,8 +59,6 @@ struct lguest_data {
 	unsigned long reserve_mem;
 	/* KHz for the TSC clock. */
 	u32 tsc_khz;
-	/* Page where the top-level pagetable is */
-	unsigned long pgdir;
 
 /* Fields initialized by the Guest at boot: */
 	/* Instruction range to suppress interrupts even if enabled */
-- 
cgit v1.2.3


From ed70afcd6e795e3de98df56f1cd0f898fbf641a7 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Thu, 21 Jul 2011 13:55:37 -0700
Subject: mm/truncate.c: fix build for CONFIG_BLOCK not enabled

Fix build error when CONFIG_BLOCK is not enabled by providing a stub
inode_dio_wait() function.

mm/truncate.c:612: error: implicit declaration of function 'inode_dio_wait'

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/fs.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index e0569e4ab2d5..b224dc468a23 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2423,6 +2423,10 @@ static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
 				    offset, nr_segs, get_block, NULL, NULL,
 				    DIO_LOCKING | DIO_SKIP_HOLES);
 }
+#else
+static inline void inode_dio_wait(struct inode *inode)
+{
+}
 #endif
 
 extern const struct file_operations generic_ro_fops;
-- 
cgit v1.2.3


From b83a313bf2520183641cf485d68cc273323597d2 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Wed, 11 May 2011 19:59:58 +0200
Subject: regmap: Add generic non-memory mapped register access API

There are many places in the tree where we implement register access for
devices on non-memory mapped buses, especially I2C and SPI. Since hardware
designers seem to have settled on a relatively consistent set of register
interfaces this can be effectively factored out into shared code.  There
are a standard set of formats for marshalling data for exchange with the
device, with the actual I/O mechanisms generally being simple byte
streams.

We create an abstraction for marshaling data into formats which can be
sent on the control interfaces, and create a standard method for
plugging in actual transport underneath that.

This is mostly a refactoring and renaming of the bottom level of the
existing code for sharing register I/O which we have in ASoC. A
subsequent patch in this series converts ASoC to use this.  The main
difference in interface is that reads return values by writing to a
location provided by a pointer rather than in the return value, ensuring
we can use the full range of the type for register data.  We also use
unsigned types rather than ints for the same reason.

As some of the devices can have very large register maps the existing
ASoC code also contains infrastructure for managing register caches.
This cache work will be moved over in a future stage to allow for
separate review, the current patch only deals with the physical I/O.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Acked-by: Liam Girdwood <lrg@ti.com>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
Acked-by: Wolfram Sang <w.sang@pengutronix.de>
Acked-by: Grant Likely <grant.likely@secretlab.ca>
---
 MAINTAINERS                  |   7 +
 drivers/base/Kconfig         |   2 +
 drivers/base/Makefile        |   1 +
 drivers/base/regmap/Kconfig  |   6 +
 drivers/base/regmap/Makefile |   1 +
 drivers/base/regmap/regmap.c | 455 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/regmap.h       |  74 +++++++
 7 files changed, 546 insertions(+)
 create mode 100644 drivers/base/regmap/Kconfig
 create mode 100644 drivers/base/regmap/Makefile
 create mode 100644 drivers/base/regmap/regmap.c
 create mode 100644 include/linux/regmap.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 187282da9213..b7eaf6393d36 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5313,6 +5313,13 @@ L:	reiserfs-devel@vger.kernel.org
 S:	Supported
 F:	fs/reiserfs/
 
+REGISTER MAP ABSTRACTION
+M:	Mark Brown <broonie@opensource.wolfsonmicro.com>
+T:	git git://git.kernel.org/pub/scm/linux/kernel/git/broonie/regmap.git
+S:	Supported
+F:	drivers/base/regmap/
+F:	include/linux/regmap.h
+
 RFKILL
 M:	Johannes Berg <johannes@sipsolutions.net>
 L:	linux-wireless@vger.kernel.org
diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
index d57e8d0fb823..b605d01f5d45 100644
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig
@@ -168,4 +168,6 @@ config SYS_HYPERVISOR
 	bool
 	default n
 
+source "drivers/base/regmap/Kconfig"
+
 endmenu
diff --git a/drivers/base/Makefile b/drivers/base/Makefile
index 4c5701c15f53..dd4f9b245fc1 100644
--- a/drivers/base/Makefile
+++ b/drivers/base/Makefile
@@ -18,6 +18,7 @@ ifeq ($(CONFIG_SYSFS),y)
 obj-$(CONFIG_MODULES)	+= module.o
 endif
 obj-$(CONFIG_SYS_HYPERVISOR) += hypervisor.o
+obj-$(CONFIG_REGMAP)	+= regmap/
 
 ccflags-$(CONFIG_DEBUG_DRIVER) := -DDEBUG
 
diff --git a/drivers/base/regmap/Kconfig b/drivers/base/regmap/Kconfig
new file mode 100644
index 000000000000..fc0eb1d484dd
--- /dev/null
+++ b/drivers/base/regmap/Kconfig
@@ -0,0 +1,6 @@
+# Generic register map support.  There are no user servicable options here,
+# this is an API intended to be used by other kernel subsystems.  These
+# subsystems should select the appropriate symbols.
+
+config REGMAP
+	bool
diff --git a/drivers/base/regmap/Makefile b/drivers/base/regmap/Makefile
new file mode 100644
index 000000000000..7532e13e0f00
--- /dev/null
+++ b/drivers/base/regmap/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_REGMAP) += regmap.o
diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c
new file mode 100644
index 000000000000..cf3565cae93d
--- /dev/null
+++ b/drivers/base/regmap/regmap.c
@@ -0,0 +1,455 @@
+/*
+ * Register map access API
+ *
+ * Copyright 2011 Wolfson Microelectronics plc
+ *
+ * Author: Mark Brown <broonie@opensource.wolfsonmicro.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/err.h>
+
+#include <linux/regmap.h>
+
+struct regmap;
+
+struct regmap_format {
+	size_t buf_size;
+	size_t reg_bytes;
+	size_t val_bytes;
+	void (*format_write)(struct regmap *map,
+			     unsigned int reg, unsigned int val);
+	void (*format_reg)(void *buf, unsigned int reg);
+	void (*format_val)(void *buf, unsigned int val);
+	unsigned int (*parse_val)(void *buf);
+};
+
+struct regmap {
+	struct mutex lock;
+
+	struct device *dev; /* Device we do I/O on */
+	void *work_buf;     /* Scratch buffer used to format I/O */
+	struct regmap_format format;  /* Buffer format */
+	const struct regmap_bus *bus;
+};
+
+static void regmap_format_4_12_write(struct regmap *map,
+				     unsigned int reg, unsigned int val)
+{
+	__be16 *out = map->work_buf;
+	*out = cpu_to_be16((reg << 12) | val);
+}
+
+static void regmap_format_7_9_write(struct regmap *map,
+				    unsigned int reg, unsigned int val)
+{
+	__be16 *out = map->work_buf;
+	*out = cpu_to_be16((reg << 9) | val);
+}
+
+static void regmap_format_8(void *buf, unsigned int val)
+{
+	u8 *b = buf;
+
+	b[0] = val;
+}
+
+static void regmap_format_16(void *buf, unsigned int val)
+{
+	__be16 *b = buf;
+
+	b[0] = cpu_to_be16(val);
+}
+
+static unsigned int regmap_parse_8(void *buf)
+{
+	u8 *b = buf;
+
+	return b[0];
+}
+
+static unsigned int regmap_parse_16(void *buf)
+{
+	__be16 *b = buf;
+
+	b[0] = be16_to_cpu(b[0]);
+
+	return b[0];
+}
+
+/**
+ * regmap_init(): Initialise register map
+ *
+ * @dev: Device that will be interacted with
+ * @bus: Bus-specific callbacks to use with device
+ * @config: Configuration for register map
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer to
+ * a struct regmap.  This function should generally not be called
+ * directly, it should be called by bus-specific init functions.
+ */
+struct regmap *regmap_init(struct device *dev,
+			   const struct regmap_bus *bus,
+			   const struct regmap_config *config)
+{
+	struct regmap *map;
+	int ret = -EINVAL;
+
+	if (!bus || !config)
+		return NULL;
+
+	map = kzalloc(sizeof(*map), GFP_KERNEL);
+	if (map == NULL) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	mutex_init(&map->lock);
+	map->format.buf_size = (config->reg_bits + config->val_bits) / 8;
+	map->format.reg_bytes = config->reg_bits / 8;
+	map->format.val_bytes = config->val_bits / 8;
+	map->dev = dev;
+	map->bus = bus;
+
+	switch (config->reg_bits) {
+	case 4:
+		switch (config->val_bits) {
+		case 12:
+			map->format.format_write = regmap_format_4_12_write;
+			break;
+		default:
+			goto err_map;
+		}
+		break;
+
+	case 7:
+		switch (config->val_bits) {
+		case 9:
+			map->format.format_write = regmap_format_7_9_write;
+			break;
+		default:
+			goto err_map;
+		}
+		break;
+
+	case 8:
+		map->format.format_reg = regmap_format_8;
+		break;
+
+	case 16:
+		map->format.format_reg = regmap_format_16;
+		break;
+
+	default:
+		goto err_map;
+	}
+
+	switch (config->val_bits) {
+	case 8:
+		map->format.format_val = regmap_format_8;
+		map->format.parse_val = regmap_parse_8;
+		break;
+	case 16:
+		map->format.format_val = regmap_format_16;
+		map->format.parse_val = regmap_parse_16;
+		break;
+	}
+
+	if (!map->format.format_write &&
+	    !(map->format.format_reg && map->format.format_val))
+		goto err_map;
+
+	map->work_buf = kmalloc(map->format.buf_size, GFP_KERNEL);
+	if (map->work_buf == NULL) {
+		ret = -ENOMEM;
+		goto err_bus;
+	}
+
+	return map;
+
+err_bus:
+	module_put(map->bus->owner);
+err_map:
+	kfree(map);
+err:
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_GPL(regmap_init);
+
+/**
+ * regmap_exit(): Free a previously allocated register map
+ */
+void regmap_exit(struct regmap *map)
+{
+	kfree(map->work_buf);
+	module_put(map->bus->owner);
+	kfree(map);
+}
+EXPORT_SYMBOL_GPL(regmap_exit);
+
+static int _regmap_raw_write(struct regmap *map, unsigned int reg,
+			     const void *val, size_t val_len)
+{
+	void *buf;
+	int ret = -ENOTSUPP;
+	size_t len;
+
+	map->format.format_reg(map->work_buf, reg);
+
+	/* Try to do a gather write if we can */
+	if (map->bus->gather_write)
+		ret = map->bus->gather_write(map->dev, map->work_buf,
+					     map->format.reg_bytes,
+					     val, val_len);
+
+	/* Otherwise fall back on linearising by hand. */
+	if (ret == -ENOTSUPP) {
+		len = map->format.reg_bytes + val_len;
+		buf = kmalloc(len, GFP_KERNEL);
+		if (!buf)
+			return -ENOMEM;
+
+		memcpy(buf, map->work_buf, map->format.reg_bytes);
+		memcpy(buf + map->format.reg_bytes, val, val_len);
+		ret = map->bus->write(map->dev, buf, len);
+
+		kfree(buf);
+	}
+
+	return ret;
+}
+
+static int _regmap_write(struct regmap *map, unsigned int reg,
+			 unsigned int val)
+{
+	BUG_ON(!map->format.format_write && !map->format.format_val);
+
+	if (map->format.format_write) {
+		map->format.format_write(map, reg, val);
+
+		return map->bus->write(map->dev, map->work_buf,
+				       map->format.buf_size);
+	} else {
+		map->format.format_val(map->work_buf + map->format.reg_bytes,
+				       val);
+		return _regmap_raw_write(map, reg,
+					 map->work_buf + map->format.reg_bytes,
+					 map->format.val_bytes);
+	}
+}
+
+/**
+ * regmap_write(): Write a value to a single register
+ *
+ * @map: Register map to write to
+ * @reg: Register to write to
+ * @val: Value to be written
+ *
+ * A value of zero will be returned on success, a negative errno will
+ * be returned in error cases.
+ */
+int regmap_write(struct regmap *map, unsigned int reg, unsigned int val)
+{
+	int ret;
+
+	mutex_lock(&map->lock);
+
+	ret = _regmap_write(map, reg, val);
+
+	mutex_unlock(&map->lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(regmap_write);
+
+/**
+ * regmap_raw_write(): Write raw values to one or more registers
+ *
+ * @map: Register map to write to
+ * @reg: Initial register to write to
+ * @val: Block of data to be written, laid out for direct transmission to the
+ *       device
+ * @val_len: Length of data pointed to by val.
+ *
+ * This function is intended to be used for things like firmware
+ * download where a large block of data needs to be transferred to the
+ * device.  No formatting will be done on the data provided.
+ *
+ * A value of zero will be returned on success, a negative errno will
+ * be returned in error cases.
+ */
+int regmap_raw_write(struct regmap *map, unsigned int reg,
+		     const void *val, size_t val_len)
+{
+	int ret;
+
+	mutex_lock(&map->lock);
+
+	ret = _regmap_raw_write(map, reg, val, val_len);
+
+	mutex_unlock(&map->lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(regmap_raw_write);
+
+static int _regmap_raw_read(struct regmap *map, unsigned int reg, void *val,
+			    unsigned int val_len)
+{
+	u8 *u8 = map->work_buf;
+	int ret;
+
+	map->format.format_reg(map->work_buf, reg);
+
+	/*
+	 * Some buses flag reads by setting the high bits in the
+	 * register addresss; since it's always the high bits for all
+	 * current formats we can do this here rather than in
+	 * formatting.  This may break if we get interesting formats.
+	 */
+	if (map->bus->read_flag_mask)
+		u8[0] |= map->bus->read_flag_mask;
+
+	ret = map->bus->read(map->dev, map->work_buf, map->format.reg_bytes,
+			     val, map->format.val_bytes);
+	if (ret != 0)
+		return ret;
+
+	return 0;
+}
+
+static int _regmap_read(struct regmap *map, unsigned int reg,
+			unsigned int *val)
+{
+	int ret;
+
+	if (!map->format.parse_val)
+		return -EINVAL;
+
+	ret = _regmap_raw_read(map, reg, map->work_buf, map->format.val_bytes);
+	if (ret == 0)
+		*val = map->format.parse_val(map->work_buf);
+
+	return ret;
+}
+
+/**
+ * regmap_read(): Read a value from a single register
+ *
+ * @map: Register map to write to
+ * @reg: Register to be read from
+ * @val: Pointer to store read value
+ *
+ * A value of zero will be returned on success, a negative errno will
+ * be returned in error cases.
+ */
+int regmap_read(struct regmap *map, unsigned int reg, unsigned int *val)
+{
+	int ret;
+
+	mutex_lock(&map->lock);
+
+	ret = _regmap_read(map, reg, val);
+
+	mutex_unlock(&map->lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(regmap_read);
+
+/**
+ * regmap_raw_read(): Read raw data from the device
+ *
+ * @map: Register map to write to
+ * @reg: First register to be read from
+ * @val: Pointer to store read value
+ * @val_len: Size of data to read
+ *
+ * A value of zero will be returned on success, a negative errno will
+ * be returned in error cases.
+ */
+int regmap_raw_read(struct regmap *map, unsigned int reg, void *val,
+		    size_t val_len)
+{
+	int ret;
+
+	mutex_lock(&map->lock);
+
+	ret = _regmap_raw_read(map, reg, val, val_len);
+
+	mutex_unlock(&map->lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(regmap_raw_read);
+
+/**
+ * regmap_bulk_read(): Read multiple registers from the device
+ *
+ * @map: Register map to write to
+ * @reg: First register to be read from
+ * @val: Pointer to store read value, in native register size for device
+ * @val_count: Number of registers to read
+ *
+ * A value of zero will be returned on success, a negative errno will
+ * be returned in error cases.
+ */
+int regmap_bulk_read(struct regmap *map, unsigned int reg, void *val,
+		     size_t val_count)
+{
+	int ret, i;
+	size_t val_bytes = map->format.val_bytes;
+
+	if (!map->format.parse_val)
+		return -EINVAL;
+
+	ret = regmap_raw_read(map, reg, val, val_bytes * val_count);
+	if (ret != 0)
+		return ret;
+
+	for (i = 0; i < val_count * val_bytes; i += val_bytes)
+		map->format.parse_val(val + i);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(regmap_bulk_read);
+
+/**
+ * remap_update_bits: Perform a read/modify/write cycle on the register map
+ *
+ * @map: Register map to update
+ * @reg: Register to update
+ * @mask: Bitmask to change
+ * @val: New value for bitmask
+ *
+ * Returns zero for success, a negative number on error.
+ */
+int regmap_update_bits(struct regmap *map, unsigned int reg,
+		       unsigned int mask, unsigned int val)
+{
+	int ret;
+	unsigned int tmp;
+
+	mutex_lock(&map->lock);
+
+	ret = _regmap_read(map, reg, &tmp);
+	if (ret != 0)
+		goto out;
+
+	tmp &= ~mask;
+	tmp |= val & mask;
+
+	ret = _regmap_write(map, reg, tmp);
+
+out:
+	mutex_unlock(&map->lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(regmap_update_bits);
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
new file mode 100644
index 000000000000..a26c1d001401
--- /dev/null
+++ b/include/linux/regmap.h
@@ -0,0 +1,74 @@
+#ifndef __LINUX_REGMAP_H
+#define __LINUX_REGMAP_H
+
+/*
+ * Register map access API
+ *
+ * Copyright 2011 Wolfson Microelectronics plc
+ *
+ * Author: Mark Brown <broonie@opensource.wolfsonmicro.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/device.h>
+#include <linux/list.h>
+#include <linux/module.h>
+
+struct regmap_config {
+	int reg_bits;
+	int val_bits;
+};
+
+typedef int (*regmap_hw_write)(struct device *dev, const void *data,
+			       size_t count);
+typedef int (*regmap_hw_gather_write)(struct device *dev,
+				      const void *reg, size_t reg_len,
+				      const void *val, size_t val_len);
+typedef int (*regmap_hw_read)(struct device *dev,
+			      const void *reg_buf, size_t reg_size,
+			      void *val_buf, size_t val_size);
+
+/**
+ * Description of a hardware bus for the register map infrastructure.
+ *
+ * @list: Internal use.
+ * @type: Bus type, used to identify bus to be used for a device.
+ * @write: Write operation.
+ * @gather_write: Write operation with split register/value, return -ENOTSUPP
+ *                if not implemented  on a given device.
+ * @read: Read operation.  Data is returned in the buffer used to transmit
+ *         data.
+ * @owner: Module with the bus implementation, used to pin the implementation
+ *         in memory.
+ * @read_flag_mask: Mask to be set in the top byte of the register when doing
+ *                  a read.
+ */
+struct regmap_bus {
+	struct list_head list;
+	struct bus_type *type;
+	regmap_hw_write write;
+	regmap_hw_gather_write gather_write;
+	regmap_hw_read read;
+	struct module *owner;
+	u8 read_flag_mask;
+};
+
+struct regmap *regmap_init(struct device *dev,
+			   const struct regmap_bus *bus,
+			   const struct regmap_config *config);
+void regmap_exit(struct regmap *map);
+int regmap_write(struct regmap *map, unsigned int reg, unsigned int val);
+int regmap_raw_write(struct regmap *map, unsigned int reg,
+		     const void *val, size_t val_len);
+int regmap_read(struct regmap *map, unsigned int reg, unsigned int *val);
+int regmap_raw_read(struct regmap *map, unsigned int reg,
+		    void *val, size_t val_len);
+int regmap_bulk_read(struct regmap *map, unsigned int reg, void *val,
+		     size_t val_count);
+int regmap_update_bits(struct regmap *map, unsigned int reg,
+		       unsigned int mask, unsigned int val);
+
+#endif
-- 
cgit v1.2.3


From 9943fa300a5dcd4536e9a4aa5c09cf94e5e7b28c Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Mon, 20 Jun 2011 19:02:29 +0100
Subject: regmap: Add I2C bus support

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Acked-by: Liam Girdwood <lrg@ti.com>
Acked-by: Wolfram Sang <w.sang@pengutronix.de>
Acked-by: Grant Likely <grant.likely@secretlab.ca>
---
 drivers/base/regmap/Kconfig      |   4 ++
 drivers/base/regmap/Makefile     |   1 +
 drivers/base/regmap/regmap-i2c.c | 115 +++++++++++++++++++++++++++++++++++++++
 include/linux/regmap.h           |   4 ++
 4 files changed, 124 insertions(+)
 create mode 100644 drivers/base/regmap/regmap-i2c.c

(limited to 'include/linux')

diff --git a/drivers/base/regmap/Kconfig b/drivers/base/regmap/Kconfig
index fc0eb1d484dd..03cf1abed401 100644
--- a/drivers/base/regmap/Kconfig
+++ b/drivers/base/regmap/Kconfig
@@ -3,4 +3,8 @@
 # subsystems should select the appropriate symbols.
 
 config REGMAP
+	default y if REGMAP_I2C
 	bool
+
+config REGMAP_I2C
+	tristate
diff --git a/drivers/base/regmap/Makefile b/drivers/base/regmap/Makefile
index 7532e13e0f00..ee5109d20f4e 100644
--- a/drivers/base/regmap/Makefile
+++ b/drivers/base/regmap/Makefile
@@ -1 +1,2 @@
 obj-$(CONFIG_REGMAP) += regmap.o
+obj-$(CONFIG_REGMAP_I2C) += regmap-i2c.o
diff --git a/drivers/base/regmap/regmap-i2c.c b/drivers/base/regmap/regmap-i2c.c
new file mode 100644
index 000000000000..c2231ff06cbc
--- /dev/null
+++ b/drivers/base/regmap/regmap-i2c.c
@@ -0,0 +1,115 @@
+/*
+ * Register map access API - I2C support
+ *
+ * Copyright 2011 Wolfson Microelectronics plc
+ *
+ * Author: Mark Brown <broonie@opensource.wolfsonmicro.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/regmap.h>
+#include <linux/i2c.h>
+#include <linux/module.h>
+#include <linux/init.h>
+
+static int regmap_i2c_write(struct device *dev, const void *data, size_t count)
+{
+	struct i2c_client *i2c = to_i2c_client(dev);
+	int ret;
+
+	ret = i2c_master_send(i2c, data, count);
+	if (ret == count)
+		return 0;
+	else if (ret < 0)
+		return ret;
+	else
+		return -EIO;
+}
+
+static int regmap_i2c_gather_write(struct device *dev,
+				   const void *reg, size_t reg_size,
+				   const void *val, size_t val_size)
+{
+	struct i2c_client *i2c = to_i2c_client(dev);
+	struct i2c_msg xfer[2];
+	int ret;
+
+	/* If the I2C controller can't do a gather tell the core, it
+	 * will substitute in a linear write for us.
+	 */
+	if (!i2c_check_functionality(i2c->adapter, I2C_FUNC_PROTOCOL_MANGLING))
+		return -ENOTSUPP;
+
+	xfer[0].addr = i2c->addr;
+	xfer[0].flags = 0;
+	xfer[0].len = reg_size;
+	xfer[0].buf = (void *)reg;
+
+	xfer[1].addr = i2c->addr;
+	xfer[1].flags = I2C_M_NOSTART;
+	xfer[1].len = val_size;
+	xfer[1].buf = (void *)val;
+
+	ret = i2c_transfer(i2c->adapter, xfer, 2);
+	if (ret == 2)
+		return 0;
+	if (ret < 0)
+		return ret;
+	else
+		return -EIO;
+}
+
+static int regmap_i2c_read(struct device *dev,
+			   const void *reg, size_t reg_size,
+			   void *val, size_t val_size)
+{
+	struct i2c_client *i2c = to_i2c_client(dev);
+	struct i2c_msg xfer[2];
+	int ret;
+
+	xfer[0].addr = i2c->addr;
+	xfer[0].flags = 0;
+	xfer[0].len = reg_size;
+	xfer[0].buf = (void *)reg;
+
+	xfer[1].addr = i2c->addr;
+	xfer[1].flags = I2C_M_RD;
+	xfer[1].len = val_size;
+	xfer[1].buf = val;
+
+	ret = i2c_transfer(i2c->adapter, xfer, 2);
+	if (ret == 2)
+		return 0;
+	else if (ret < 0)
+		return ret;
+	else
+		return -EIO;
+}
+
+static struct regmap_bus regmap_i2c = {
+	.type = &i2c_bus_type,
+	.write = regmap_i2c_write,
+	.gather_write = regmap_i2c_gather_write,
+	.read = regmap_i2c_read,
+	.owner = THIS_MODULE,
+};
+
+/**
+ * regmap_init_i2c(): Initialise register map
+ *
+ * @i2c: Device that will be interacted with
+ * @config: Configuration for register map
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer to
+ * a struct regmap.
+ */
+struct regmap *regmap_init_i2c(struct i2c_client *i2c,
+			       const struct regmap_config *config)
+{
+	return regmap_init(&i2c->dev, &regmap_i2c, config);
+}
+EXPORT_SYMBOL_GPL(regmap_init_i2c);
+
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index a26c1d001401..5ba61f22c703 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -17,6 +17,8 @@
 #include <linux/list.h>
 #include <linux/module.h>
 
+struct i2c_client;
+
 struct regmap_config {
 	int reg_bits;
 	int val_bits;
@@ -59,6 +61,8 @@ struct regmap_bus {
 struct regmap *regmap_init(struct device *dev,
 			   const struct regmap_bus *bus,
 			   const struct regmap_config *config);
+struct regmap *regmap_init_i2c(struct i2c_client *i2c,
+			       const struct regmap_config *config);
 void regmap_exit(struct regmap *map);
 int regmap_write(struct regmap *map, unsigned int reg, unsigned int val);
 int regmap_raw_write(struct regmap *map, unsigned int reg,
-- 
cgit v1.2.3


From a676f083068b08e676c557279effbd7f4d590812 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Thu, 12 May 2011 11:42:10 +0200
Subject: regmap: Add SPI bus support

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Acked-by: Liam Girdwood <lrg@ti.com>
Acked-by: Wolfram Sang <w.sang@pengutronix.de>
Acked-by: Grant Likely <grant.likely@secretlab.ca>
---
 drivers/base/regmap/Kconfig      |  5 ++-
 drivers/base/regmap/Makefile     |  1 +
 drivers/base/regmap/regmap-spi.c | 72 ++++++++++++++++++++++++++++++++++++++++
 include/linux/regmap.h           |  4 +++
 4 files changed, 81 insertions(+), 1 deletion(-)
 create mode 100644 drivers/base/regmap/regmap-spi.c

(limited to 'include/linux')

diff --git a/drivers/base/regmap/Kconfig b/drivers/base/regmap/Kconfig
index 03cf1abed401..fabbf6cc5367 100644
--- a/drivers/base/regmap/Kconfig
+++ b/drivers/base/regmap/Kconfig
@@ -3,8 +3,11 @@
 # subsystems should select the appropriate symbols.
 
 config REGMAP
-	default y if REGMAP_I2C
+	default y if (REGMAP_I2C || REGMAP_SPI)
 	bool
 
 config REGMAP_I2C
 	tristate
+
+config REGMAP_SPI
+	tristate
diff --git a/drivers/base/regmap/Makefile b/drivers/base/regmap/Makefile
index ee5109d20f4e..f476f4571295 100644
--- a/drivers/base/regmap/Makefile
+++ b/drivers/base/regmap/Makefile
@@ -1,2 +1,3 @@
 obj-$(CONFIG_REGMAP) += regmap.o
 obj-$(CONFIG_REGMAP_I2C) += regmap-i2c.o
+obj-$(CONFIG_REGMAP_SPI) += regmap-spi.o
diff --git a/drivers/base/regmap/regmap-spi.c b/drivers/base/regmap/regmap-spi.c
new file mode 100644
index 000000000000..4deba0621bc7
--- /dev/null
+++ b/drivers/base/regmap/regmap-spi.c
@@ -0,0 +1,72 @@
+/*
+ * Register map access API - SPI support
+ *
+ * Copyright 2011 Wolfson Microelectronics plc
+ *
+ * Author: Mark Brown <broonie@opensource.wolfsonmicro.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/regmap.h>
+#include <linux/spi/spi.h>
+#include <linux/init.h>
+
+static int regmap_spi_write(struct device *dev, const void *data, size_t count)
+{
+	struct spi_device *spi = to_spi_device(dev);
+
+	return spi_write(spi, data, count);
+}
+
+static int regmap_spi_gather_write(struct device *dev,
+				   const void *reg, size_t reg_len,
+				   const void *val, size_t val_len)
+{
+	struct spi_device *spi = to_spi_device(dev);
+	struct spi_message m;
+	struct spi_transfer t[2] = { { .tx_buf = reg, .len = reg_len, },
+				     { .tx_buf = val, .len = val_len, }, };
+
+	spi_message_init(&m);
+	spi_message_add_tail(&t[0], &m);
+	spi_message_add_tail(&t[1], &m);
+
+	return spi_sync(spi, &m);
+}
+
+static int regmap_spi_read(struct device *dev,
+			   const void *reg, size_t reg_size,
+			   void *val, size_t val_size)
+{
+	struct spi_device *spi = to_spi_device(dev);
+
+	return spi_write_then_read(spi, reg, reg_size, val, val_size);
+}
+
+static struct regmap_bus regmap_spi = {
+	.type = &spi_bus_type,
+	.write = regmap_spi_write,
+	.gather_write = regmap_spi_gather_write,
+	.read = regmap_spi_read,
+	.owner = THIS_MODULE,
+	.read_flag_mask = 0x80,
+};
+
+/**
+ * regmap_init_spi(): Initialise register map
+ *
+ * @spi: Device that will be interacted with
+ * @config: Configuration for register map
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer to
+ * a struct regmap.
+ */
+struct regmap *regmap_init_spi(struct spi_device *spi,
+			       const struct regmap_config *config)
+{
+	return regmap_init(&spi->dev, &regmap_spi, config);
+}
+EXPORT_SYMBOL_GPL(regmap_init_spi);
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index 5ba61f22c703..60a65cd7e1a0 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -18,6 +18,7 @@
 #include <linux/module.h>
 
 struct i2c_client;
+struct spi_device;
 
 struct regmap_config {
 	int reg_bits;
@@ -63,6 +64,9 @@ struct regmap *regmap_init(struct device *dev,
 			   const struct regmap_config *config);
 struct regmap *regmap_init_i2c(struct i2c_client *i2c,
 			       const struct regmap_config *config);
+struct regmap *regmap_init_spi(struct spi_device *dev,
+			       const struct regmap_config *config);
+
 void regmap_exit(struct regmap *map);
 int regmap_write(struct regmap *map, unsigned int reg, unsigned int val);
 int regmap_raw_write(struct regmap *map, unsigned int reg,
-- 
cgit v1.2.3


From fce92dce79dbf5fff39c7ac2fb149729d79b7a39 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Tue, 12 Jul 2011 03:28:54 +0800
Subject: KVM: MMU: filter out the mmio pfn from the fault pfn

If the page fault is caused by mmio, the gfn can not be found in memslots, and
'bad_pfn' is returned on gfn_to_hva path, so we can use 'bad_pfn' to identify
the mmio page fault.
And, to clarify the meaning of mmio pfn, we return fault page instead of bad
page when the gfn is not allowd to prefetch

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c       |  4 ++--
 include/linux/kvm_host.h |  5 +++++
 virt/kvm/kvm_main.c      | 16 ++++++++++++++--
 3 files changed, 21 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5334b4e9ecc7..96a7ed4e6837 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2085,8 +2085,8 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
 
 	slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
 	if (!slot) {
-		get_page(bad_page);
-		return page_to_pfn(bad_page);
+		get_page(fault_page);
+		return page_to_pfn(fault_page);
 	}
 
 	hva = gfn_to_hva_memslot(slot, gfn);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c8e023902f79..eabb21a30c34 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -327,12 +327,17 @@ static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm)
 static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
 
 extern struct page *bad_page;
+extern struct page *fault_page;
+
 extern pfn_t bad_pfn;
+extern pfn_t fault_pfn;
 
 int is_error_page(struct page *page);
 int is_error_pfn(pfn_t pfn);
 int is_hwpoison_pfn(pfn_t pfn);
 int is_fault_pfn(pfn_t pfn);
+int is_noslot_pfn(pfn_t pfn);
+int is_invalid_pfn(pfn_t pfn);
 int kvm_is_error_hva(unsigned long addr);
 int kvm_set_memory_region(struct kvm *kvm,
 			  struct kvm_userspace_memory_region *mem,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d5ef9ebcaff7..56f3c704fd74 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -101,8 +101,8 @@ static bool largepages_enabled = true;
 static struct page *hwpoison_page;
 static pfn_t hwpoison_pfn;
 
-static struct page *fault_page;
-static pfn_t fault_pfn;
+struct page *fault_page;
+pfn_t fault_pfn;
 
 inline int kvm_is_mmio_pfn(pfn_t pfn)
 {
@@ -931,6 +931,18 @@ int is_fault_pfn(pfn_t pfn)
 }
 EXPORT_SYMBOL_GPL(is_fault_pfn);
 
+int is_noslot_pfn(pfn_t pfn)
+{
+	return pfn == bad_pfn;
+}
+EXPORT_SYMBOL_GPL(is_noslot_pfn);
+
+int is_invalid_pfn(pfn_t pfn)
+{
+	return pfn == hwpoison_pfn || pfn == fault_pfn;
+}
+EXPORT_SYMBOL_GPL(is_invalid_pfn);
+
 static inline unsigned long bad_hva(void)
 {
 	return PAGE_OFFSET;
-- 
cgit v1.2.3


From 74e08fcf7bef973512a1f813700f802a93678670 Mon Sep 17 00:00:00 2001
From: Jonas Bonn <jonas@southpole.se>
Date: Thu, 30 Jun 2011 21:22:11 +0200
Subject: modules: add default loader hook implementations

The module loader code allows architectures to hook into the code by
providing a small number of entry points that each arch must implement.
This patch provides __weakly linked generic implementations of these
entry points for architectures that don't need to do anything special.

Signed-off-by: Jonas Bonn <jonas@southpole.se>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/moduleloader.h |  7 ++++++-
 kernel/module.c              | 49 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h
index c1f40c2f7ffb..b2be02ebf453 100644
--- a/include/linux/moduleloader.h
+++ b/include/linux/moduleloader.h
@@ -5,7 +5,12 @@
 #include <linux/module.h>
 #include <linux/elf.h>
 
-/* These must be implemented by the specific architecture */
+/* These may be implemented by architectures that need to hook into the
+ * module loader code.  Architectures that don't need to do anything special
+ * can just rely on the 'weak' default hooks defined in kernel/module.c.
+ * Note, however, that at least one of apply_relocate or apply_relocate_add
+ * must be implemented by each architecture.
+ */
 
 /* Adjust arch-specific sections.  Return 0 on success.  */
 int module_frob_arch_sections(Elf_Ehdr *hdr,
diff --git a/kernel/module.c b/kernel/module.c
index 795bdc7f5c3f..6301d6e173ca 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1697,6 +1697,15 @@ static void unset_module_core_ro_nx(struct module *mod) { }
 static void unset_module_init_ro_nx(struct module *mod) { }
 #endif
 
+void __weak module_free(struct module *mod, void *module_region)
+{
+	vfree(module_region);
+}
+
+void __weak module_arch_cleanup(struct module *mod)
+{
+}
+
 /* Free a module, remove from lists, etc. */
 static void free_module(struct module *mod)
 {
@@ -1851,6 +1860,26 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
 	return ret;
 }
 
+int __weak apply_relocate(Elf_Shdr *sechdrs,
+			  const char *strtab,
+			  unsigned int symindex,
+			  unsigned int relsec,
+			  struct module *me)
+{
+	pr_err("module %s: REL relocation unsupported\n", me->name);
+	return -ENOEXEC;
+}
+
+int __weak apply_relocate_add(Elf_Shdr *sechdrs,
+			      const char *strtab,
+			      unsigned int symindex,
+			      unsigned int relsec,
+			      struct module *me)
+{
+	pr_err("module %s: RELA relocation unsupported\n", me->name);
+	return -ENOEXEC;
+}
+
 static int apply_relocations(struct module *mod, const struct load_info *info)
 {
 	unsigned int i;
@@ -2235,6 +2264,11 @@ static void dynamic_debug_remove(struct _ddebug *debug)
 		ddebug_remove_module(debug->modname);
 }
 
+void * __weak module_alloc(unsigned long size)
+{
+	return size == 0 ? NULL : vmalloc_exec(size);
+}
+
 static void *module_alloc_update_bounds(unsigned long size)
 {
 	void *ret = module_alloc(size);
@@ -2645,6 +2679,14 @@ static void flush_module_icache(const struct module *mod)
 	set_fs(old_fs);
 }
 
+int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
+				     Elf_Shdr *sechdrs,
+				     char *secstrings,
+				     struct module *mod)
+{
+	return 0;
+}
+
 static struct module *layout_and_allocate(struct load_info *info)
 {
 	/* Module within temporary copy. */
@@ -2716,6 +2758,13 @@ static void module_deallocate(struct module *mod, struct load_info *info)
 	module_free(mod, mod->module_core);
 }
 
+int __weak module_finalize(const Elf_Ehdr *hdr,
+			   const Elf_Shdr *sechdrs,
+			   struct module *me)
+{
+	return 0;
+}
+
 static int post_relocation(struct module *mod, const struct load_info *info)
 {
 	/* Sort exception table now relocations are done. */
-- 
cgit v1.2.3


From 4befb026cf74b52fc7d382142bbfc0e9b6aab5e7 Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Sun, 24 Jul 2011 22:06:04 +0930
Subject: module: change attr callbacks to take struct module_kobject

This simplifies the next patch, where we have an attribute on a
builtin module (ie. module == NULL).

Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> (split into 2)
---
 include/linux/module.h | 23 ++++++++++++-----------
 kernel/module.c        | 14 +++++++-------
 kernel/params.c        | 10 +++++-----
 3 files changed, 24 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/module.h b/include/linux/module.h
index d9ca2d5dc6d0..b87e7625a98c 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -48,10 +48,18 @@ struct modversion_info
 
 struct module;
 
+struct module_kobject {
+	struct kobject kobj;
+	struct module *mod;
+	struct kobject *drivers_dir;
+	struct module_param_attrs *mp;
+};
+
 struct module_attribute {
-        struct attribute attr;
-        ssize_t (*show)(struct module_attribute *, struct module *, char *);
-        ssize_t (*store)(struct module_attribute *, struct module *,
+	struct attribute attr;
+	ssize_t (*show)(struct module_attribute *, struct module_kobject *,
+			char *);
+	ssize_t (*store)(struct module_attribute *, struct module_kobject *,
 			 const char *, size_t count);
 	void (*setup)(struct module *, const char *);
 	int (*test)(struct module *);
@@ -65,15 +73,8 @@ struct module_version_attribute {
 } __attribute__ ((__aligned__(sizeof(void *))));
 
 extern ssize_t __modver_version_show(struct module_attribute *,
-				     struct module *, char *);
+				     struct module_kobject *, char *);
 
-struct module_kobject
-{
-	struct kobject kobj;
-	struct module *mod;
-	struct kobject *drivers_dir;
-	struct module_param_attrs *mp;
-};
 
 /* These are either module local, or the kernel's dummy ones. */
 extern int init_module(void);
diff --git a/kernel/module.c b/kernel/module.c
index 6301d6e173ca..a4295e67dd83 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -545,9 +545,9 @@ static void setup_modinfo_##field(struct module *mod, const char *s)  \
 	mod->field = kstrdup(s, GFP_KERNEL);                          \
 }                                                                     \
 static ssize_t show_modinfo_##field(struct module_attribute *mattr,   \
-	                struct module *mod, char *buffer)             \
+			struct module_kobject *mk, char *buffer)      \
 {                                                                     \
-	return sprintf(buffer, "%s\n", mod->field);                   \
+	return sprintf(buffer, "%s\n", mk->mod->field);               \
 }                                                                     \
 static int modinfo_##field##_exists(struct module *mod)               \
 {                                                                     \
@@ -902,9 +902,9 @@ void symbol_put_addr(void *addr)
 EXPORT_SYMBOL_GPL(symbol_put_addr);
 
 static ssize_t show_refcnt(struct module_attribute *mattr,
-			   struct module *mod, char *buffer)
+			   struct module_kobject *mk, char *buffer)
 {
-	return sprintf(buffer, "%u\n", module_refcount(mod));
+	return sprintf(buffer, "%u\n", module_refcount(mk->mod));
 }
 
 static struct module_attribute refcnt = {
@@ -952,11 +952,11 @@ static inline int module_unload_init(struct module *mod)
 #endif /* CONFIG_MODULE_UNLOAD */
 
 static ssize_t show_initstate(struct module_attribute *mattr,
-			   struct module *mod, char *buffer)
+			      struct module_kobject *mk, char *buffer)
 {
 	const char *state = "unknown";
 
-	switch (mod->state) {
+	switch (mk->mod->state) {
 	case MODULE_STATE_LIVE:
 		state = "live";
 		break;
@@ -1187,7 +1187,7 @@ struct module_sect_attrs
 };
 
 static ssize_t module_sect_show(struct module_attribute *mattr,
-				struct module *mod, char *buf)
+				struct module_kobject *mk, char *buf)
 {
 	struct module_sect_attr *sattr =
 		container_of(mattr, struct module_sect_attr, mattr);
diff --git a/kernel/params.c b/kernel/params.c
index 2a4ba258f04f..37e9b20a718b 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -511,7 +511,7 @@ struct module_param_attrs
 #define to_param_attr(n) container_of(n, struct param_attribute, mattr)
 
 static ssize_t param_attr_show(struct module_attribute *mattr,
-			       struct module *mod, char *buf)
+			       struct module_kobject *mk, char *buf)
 {
 	int count;
 	struct param_attribute *attribute = to_param_attr(mattr);
@@ -531,7 +531,7 @@ static ssize_t param_attr_show(struct module_attribute *mattr,
 
 /* sysfs always hands a nul-terminated string in buf.  We rely on that. */
 static ssize_t param_attr_store(struct module_attribute *mattr,
-				struct module *owner,
+				struct module_kobject *km,
 				const char *buf, size_t len)
 {
  	int err;
@@ -807,7 +807,7 @@ static void __init param_sysfs_builtin(void)
 }
 
 ssize_t __modver_version_show(struct module_attribute *mattr,
-			      struct module *mod, char *buf)
+			      struct module_kobject *mk, char *buf)
 {
 	struct module_version_attribute *vattr =
 		container_of(mattr, struct module_version_attribute, mattr);
@@ -852,7 +852,7 @@ static ssize_t module_attr_show(struct kobject *kobj,
 	if (!attribute->show)
 		return -EIO;
 
-	ret = attribute->show(attribute, mk->mod, buf);
+	ret = attribute->show(attribute, mk, buf);
 
 	return ret;
 }
@@ -871,7 +871,7 @@ static ssize_t module_attr_store(struct kobject *kobj,
 	if (!attribute->store)
 		return -EIO;
 
-	ret = attribute->store(attribute, mk->mod, buf, len);
+	ret = attribute->store(attribute, mk, buf, len);
 
 	return ret;
 }
-- 
cgit v1.2.3


From 88bfa3247961fe5f3623f4d2cf1cd5dc72457598 Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Sun, 24 Jul 2011 22:06:04 +0930
Subject: module: add /sys/module/<name>/uevent files

Userspace wants to manage module parameters with udev rules.
This currently only works for loaded modules, but not for
built-in ones.

To allow access to the built-in modules we need to
re-trigger all module load events that happened before any
userspace was running. We already do the same thing for all
devices, subsystems(buses) and drivers.

This adds the currently missing /sys/module/<name>/uevent files
to all module entries.

Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> (split & trivial fix)
---
 include/linux/module.h |  1 +
 kernel/module.c        | 17 +++++++++++++++++
 kernel/params.c        |  4 ++++
 3 files changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/module.h b/include/linux/module.h
index b87e7625a98c..1c30087a2d81 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -75,6 +75,7 @@ struct module_version_attribute {
 extern ssize_t __modver_version_show(struct module_attribute *,
 				     struct module_kobject *, char *);
 
+extern struct module_attribute module_uevent;
 
 /* These are either module local, or the kernel's dummy ones. */
 extern int init_module(void);
diff --git a/kernel/module.c b/kernel/module.c
index a4295e67dd83..04379f92f843 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -975,10 +975,27 @@ static struct module_attribute initstate = {
 	.show = show_initstate,
 };
 
+static ssize_t store_uevent(struct module_attribute *mattr,
+			    struct module_kobject *mk,
+			    const char *buffer, size_t count)
+{
+	enum kobject_action action;
+
+	if (kobject_action_type(buffer, count, &action) == 0)
+		kobject_uevent(&mk->kobj, action);
+	return count;
+}
+
+struct module_attribute module_uevent = {
+	.attr = { .name = "uevent", .mode = 0200 },
+	.store = store_uevent,
+};
+
 static struct module_attribute *modinfo_attrs[] = {
 	&modinfo_version,
 	&modinfo_srcversion,
 	&initstate,
+	&module_uevent,
 #ifdef CONFIG_MODULE_UNLOAD
 	&refcnt,
 #endif
diff --git a/kernel/params.c b/kernel/params.c
index 37e9b20a718b..22df3e0d142a 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -730,6 +730,10 @@ static struct module_kobject * __init locate_module_kobject(const char *name)
 		mk->kobj.kset = module_kset;
 		err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL,
 					   "%s", name);
+#ifdef CONFIG_MODULES
+		if (!err)
+			err = sysfs_create_file(&mk->kobj, &module_uevent.attr);
+#endif
 		if (err) {
 			kobject_put(&mk->kobj);
 			printk(KERN_ERR
-- 
cgit v1.2.3